diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index aa066f935f3e062ffa7310725030968159db976b..87b97583e4e0f405fca0a43b51df8130ec508147 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -54,6 +54,7 @@ SET(libcorpus2_STAT_SRC tagsetmanager.cpp tagsetparser.cpp token.cpp + tokenmetadata.cpp io/cclreader.cpp io/orthwriter.cpp io/plainwriter.cpp diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp index 78a2e606a7578c24cb322d43aa5d1a73d99c3fa5..7efe33d2a6b6a8c16fee0e6f4d42d996d886e038 100644 --- a/libcorpus2/token.cpp +++ b/libcorpus2/token.cpp @@ -17,11 +17,12 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libcorpus2/token.h> #include <sstream> #include <libpwrutils/foreach.h> +#include <libcorpus2/tokenmetadata.h> namespace Corpus2 { Token::Token() - : orth_(), wa_(), lexemes_() + : orth_(), wa_(), lexemes_(), metadata_(NULL) { } @@ -32,7 +33,15 @@ Token::Token(const UnicodeString &orth, PwrNlp::Whitespace::Enum wa) Token* Token::clone() const { - Token* t = new Token(*this); + // Token is noncopyable, so the duplicate is assembled member by member; the + // auto_ptr frees it if any step below throws (e.g. std::bad_alloc) + std::auto_ptr<Token> t(new Token()); + t->orth_ = orth_; + t->wa_ = wa_; + t->lexemes_ = lexemes_; + if (metadata_.get()) { + t->set_metadata(metadata_->clone()); + } - return t; + return t.release(); } @@ -118,4 +127,9 @@ bool Token::orth_pos_match(mask_t pos, const UnicodeString &orth) const return true; } +void Token::create_metadata() +{ + metadata_.reset(new TokenMetaData); +} + } /* end ns Corpus2 */ diff --git a/libcorpus2/token.h b/libcorpus2/token.h index a6da809044ae0a83ae35c98254891e9847a279aa..7246fb2bfd5c7ae5d15ab294bcb95de8c872d4cb 100644 --- a/libcorpus2/token.h +++ b/libcorpus2/token.h @@ -24,11 +24,16 @@ or FITNESS FOR A PARTICULAR PURPOSE. #include <libpwrutils/whitespace.h> #include <unicode/unistr.h> +// Token owns its TokenMetaData through std::auto_ptr, whose reset() and +// destructor delete the pointee inline. Deleting an incomplete class type with +// a non-trivial destructor is undefined behaviour, so include the definition. +#include <libcorpus2/tokenmetadata.h> +#include <memory> #include <string> #include <vector> namespace Corpus2 { /** * A single token with morphological analyses. 
* @@ -37,7 +42,7 @@ namespace Corpus2 { * of possible interpretations stored as lexemes. */ class Token - : boost::equality_comparable<Token> + : boost::equality_comparable<Token>, boost::noncopyable { public: /// Creates an empty Token @@ -135,6 +140,19 @@ public: */ bool orth_pos_match(mask_t pos, const UnicodeString& orth) const; + /// Metadata setter: takes ownership of md (NULL clears); any previously held metadata is deleted + void set_metadata(TokenMetaData* md) { + metadata_.reset(md); + } + + /// Metadata getter: returns a non-owning pointer, NULL when no metadata is set + TokenMetaData* get_metadata() const { + return metadata_.get(); + } + + /// Creates an empty metadata object for this Token, replacing any existing one + void create_metadata(); + private: /// The orth (actual encountered form) //boost::flyweight<UnicodeString> orth_; @@ -145,6 +163,9 @@ private: /// The possible lexemes std::vector<Lexeme> lexemes_; + + /// Optional metadata owned by this Token (NULL when absent) + std::auto_ptr<TokenMetaData> metadata_; }; } /* end ns Corpus2 */ diff --git a/libcorpus2/tokenmetadata.cpp b/libcorpus2/tokenmetadata.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c41f769c65ac33e5af87f5ae4db1f4cbad95e9a1 --- /dev/null +++ b/libcorpus2/tokenmetadata.cpp @@ -0,0 +1,54 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/ + +#include <libcorpus2/tokenmetadata.h> +#include <libpwrutils/foreach.h> +#include <boost/make_shared.hpp> + +namespace Corpus2 { +/// Builds a metadata object holding no attributes +TokenMetaData::TokenMetaData() +{ +} +/// Returns a heap-allocated copy of this object; the caller must delete it +TokenMetaData* TokenMetaData::clone() const +{ + return new TokenMetaData(*this); +} +/// Tells whether an attribute called name has been stored +bool TokenMetaData::has_attribute(const std::string &name) const +{ + return attributes_.count(name) != 0; +} +/// Returns the value stored under name, or an empty string if there is none +std::string TokenMetaData::get_attribute(const std::string &name) const +{ + const attr_map_t::const_iterator found = attributes_.find(name); + if (found == attributes_.end()) { + // No such attribute: answer with an empty value + return ""; + } + // Attribute present: return a copy of its value + return found->second; +} +/// Stores value under name, replacing any value stored there before +void TokenMetaData::set_attribute(const std::string &name, + const std::string &value) +{ + attributes_[name] = value; +} + +} /* end ns Corpus2 */ diff --git a/libcorpus2/tokenmetadata.h b/libcorpus2/tokenmetadata.h new file mode 100644 index 0000000000000000000000000000000000000000..6d1c6444c23173ea28dc6f1530aefb93eaa41bbb --- /dev/null +++ b/libcorpus2/tokenmetadata.h @@ -0,0 +1,53 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/ + +#ifndef LIBCORPUS2_TOKENMETADATA_H +#define LIBCORPUS2_TOKENMETADATA_H + +#include <string> +#include <map> +#include <boost/shared_ptr.hpp> + +namespace Corpus2 { + +/// Optional per-token metadata: a map from string attribute names to string +/// values. Copyable; clone() produces a heap-allocated copy. +class TokenMetaData +{ +public: + TokenMetaData(); ///< Creates an object with no attributes + /// Returns a new heap-allocated copy of this object; the caller owns it + TokenMetaData* clone() const; + /// Attribute storage type: attribute name -> attribute value + typedef std::map<std::string, std::string> attr_map_t; + /// Tells whether an attribute with the given name is present + bool has_attribute(const std::string& name) const; + /// Returns the named attribute's value, or an empty string if it is absent + std::string get_attribute(const std::string& name) const; + /// Sets the named attribute, overwriting any previous value + void set_attribute(const std::string& name, const std::string& value); + /// Read-only access to all stored attributes + const attr_map_t& attributes() const { + return attributes_; + } + +private: + attr_map_t attributes_; +}; + +} /* end ns Corpus2 */ + +#endif // LIBCORPUS2_TOKENMETADATA_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e2b53b044182c393a80620bb129624158172586b..1fad3db39ffca8c9feae1ce93a84d901494086e1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,6 +12,7 @@ add_executable( tests ioann.cpp tag_split.cpp tagset_parse.cpp + tokenmetadata.cpp ) target_link_libraries ( tests corpus2 pwrutils ${Boost_LIBRARIES}) diff --git a/tests/basic.cpp b/tests/basic.cpp index 6c788dc4e5b07219209b414d72cc442d0f91cec7..153b80147623a81df6a161c0d91eeabb7e43c45c 100644 --- a/tests/basic.cpp +++ b/tests/basic.cpp @@ -47,14 +47,15 @@ BOOST_AUTO_TEST_CASE( token_dup_lexemes ) t.add_lexeme(l2); BOOST_CHECK(!t.check_duplicate_lexemes()); BOOST_CHECK(!t.remove_duplicate_lexemes()); - Corpus2::Token tt(t); + // auto_ptr (from <memory>, pulled in by token.h) owns the clone, so it is + std::auto_ptr<Corpus2::Token> tt(t.clone()); // freed even if a call below throws t.add_lexeme(l1); - BOOST_CHECK(t != tt); + BOOST_CHECK(t != *tt); BOOST_CHECK(t.check_duplicate_lexemes()); BOOST_CHECK(t.remove_duplicate_lexemes()); BOOST_CHECK(!t.check_duplicate_lexemes()); BOOST_CHECK(!t.remove_duplicate_lexemes()); - BOOST_CHECK(t == tt); + BOOST_CHECK(t == *tt); } BOOST_AUTO_TEST_CASE( is_icu_working ) diff --git a/tests/tokenmetadata.cpp b/tests/tokenmetadata.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..1601226a27918efb7be1ccff0604876d5ac213d6 --- /dev/null +++ b/tests/tokenmetadata.cpp @@ -0,0 +1,42 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. +*/ + +#include <boost/test/unit_test.hpp> +#include <libcorpus2/token.h> +#include <libcorpus2/tokenmetadata.h> +#include <memory> + +BOOST_AUTO_TEST_SUITE(token_metadata) + +// Metadata is absent by default, can be created, and is deep-copied by clone() +BOOST_AUTO_TEST_CASE(meta1) +{ + Corpus2::Token t1(UnicodeString::fromUTF8("t1"), PwrNlp::Whitespace::None); + BOOST_CHECK(!t1.get_metadata()); + t1.create_metadata(); + // REQUIRE aborts the test case instead of dereferencing NULL on the next line + BOOST_REQUIRE(t1.get_metadata()); + BOOST_CHECK(t1.get_metadata()->attributes().empty()); + // auto_ptr owns the clone, so it is freed even when a REQUIRE aborts the case + std::auto_ptr<Corpus2::Token> t2(t1.clone()); + BOOST_REQUIRE(t2->get_metadata()); + BOOST_CHECK(t2->get_metadata()->attributes().empty()); + t2->get_metadata()->set_attribute("A", "B"); + BOOST_CHECK(!t2->get_metadata()->attributes().empty()); + BOOST_CHECK(t1.get_metadata()->attributes().empty()); +} + +BOOST_AUTO_TEST_SUITE_END()