diff --git a/libwccl/lexicon/lexicon.cpp b/libwccl/lexicon/lexicon.cpp
index 1045aa579236e32d60f127033b5ec992c970be9c..2bc89bbca61e4a13a9dbe0bf412dc08996392a94 100644
--- a/libwccl/lexicon/lexicon.cpp
+++ b/libwccl/lexicon/lexicon.cpp
@@ -25,23 +25,30 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 namespace Wccl {
 
-const UnicodeString& Lexicon::translate(const UnicodeString &key) const
+// Returns every value stored under key; the set is empty (never null) when key is absent.
+boost::shared_ptr<StrSet> Lexicon::translate(const UnicodeString &key) const
 {
-	static UnicodeString empty;
+	boost::shared_ptr<StrSet> ret_set = boost::make_shared<StrSet>();
 	map_t::const_iterator i = map_.find(key);
 	if (i == map_.end()) {
-		return empty;
+		return ret_set;
+	}
+
+	BOOST_FOREACH (const UnicodeString& s, i->second){
+		ret_set->insert(s);
 	}
-	return i->second;
+
+	return ret_set;
 }
 
 boost::shared_ptr<StrSet> Lexicon::translate(const StrSet& set) const
 {
 	boost::shared_ptr<StrSet> ret_set = boost::make_shared<StrSet>();
 	BOOST_FOREACH (const UnicodeString& s, set.get_value()) {
-		const UnicodeString& v = translate(s);
-		if (!v.isEmpty()) {
-			ret_set->insert(v);
+		// translate(s) yields an empty set for an unknown key, so no emptiness check is needed.
+		boost::shared_ptr<StrSet> v = translate(s);
+		BOOST_FOREACH (const UnicodeString& translated, v->contents()) {
+			ret_set->insert(translated);
 		}
 	}
 	return ret_set;
@@ -49,16 +56,13 @@ boost::shared_ptr<StrSet> Lexicon::translate(const StrSet& set) const
 
 void Lexicon::insert(const UnicodeString& key, const UnicodeString& value)
 {
-	if (has_key(key)) {
-		throw InvalidArgument("key", "Duplicated key in lexicon: " + PwrNlp::to_utf8(key));
-	}
 	if (key.isEmpty()) {
 		throw InvalidArgument("key", "Empty key string in lexicon.");
 	}
 	if (value.isEmpty()) {
 		throw InvalidArgument("value", "Empty value string in lexicon.");
 	}
-	map_[key] = value;
+	map_[key].push_back(value);
 }
 
 } /* end ns Wccl */
diff --git a/libwccl/lexicon/lexicon.h b/libwccl/lexicon/lexicon.h
index de8300bfe91fc4f9f735f8667f92d2f092f5d775..34e56d2339b8fd00a943f3c0119500b91bde12cb 100644
--- a/libwccl/lexicon/lexicon.h
+++ b/libwccl/lexicon/lexicon.h
@@ -29,7 +29,7 @@ namespace Wccl {
 class Lexicon : boost::noncopyable
 {
 public:
-	typedef boost::unordered_map<UnicodeString, UnicodeString> map_t;
+	typedef boost::unordered_map<UnicodeString, std::vector<UnicodeString> > map_t;
 
 	Lexicon(const std::string& name, const std::string& file_name)
 		: name_(name),
@@ -43,7 +43,7 @@ public:
-	 * @returns Value assigned to the given key, if present.
-	 * Empty UnicodeString if the key was not present.
+	 * @returns Set of all values assigned to the given key.
+	 * Empty set if the key was not present.
 	 */
-	const UnicodeString& translate(const UnicodeString& key) const;
+	boost::shared_ptr<StrSet> translate(const UnicodeString& key) const;
 
 
 	/**
diff --git a/swig/lexicon.i b/swig/lexicon.i
index d32a41f4948d288af145bb14fb953102d5e162b3..b7c6eec86864326e7deafaa301da48c859ed716a 100644
--- a/swig/lexicon.i
+++ b/swig/lexicon.i
@@ -32,7 +32,7 @@ namespace Wccl {
 
     /* --------------------------------------------------------------------- */
 
-    const UnicodeString& translate(const UnicodeString& key) const;
+    boost::shared_ptr<StrSet> translate(const UnicodeString& key) const;
     // boost::shared_ptr<StrSet> translate(const StrSet& set) const; // TODO
 
 
diff --git a/tests/data/indecl.lex b/tests/data/indecl.lex
new file mode 100644
index 0000000000000000000000000000000000000000..97474c1014105692c029a72cbacc69b70aa6b483
--- /dev/null
+++ b/tests/data/indecl.lex
@@ -0,0 +1,10 @@
+by part
+och interj
+ach interj
+dla prep
+bez prep
+z prep
+dziś adv
+wczoraj adv
+by part2
+uważać verb1
diff --git a/tests/data/indecl.wccl b/tests/data/indecl.wccl
new file mode 100644
index 0000000000000000000000000000000000000000..49ca105d183b92efc5e965420227d47d5ff383e4
--- /dev/null
+++ b/tests/data/indecl.wccl
@@ -0,0 +1,12 @@
+import("indecl.lex", "indecl") // import file as "indecl"
+
+@"indecl" ( // gets the label from the lexicon
+	lex(base[0], "indecl");
+	lex("dziś", "indecl");
+	lex("by", "indecl");
+	lex("marchewka", "indecl");
+	lex(["by", "kot", "marchewka"], "indecl");
+	lex(["by", "dla"], "indecl");
+	lex(["by", "kot", "marchewka", "wczoraj"], "indecl")
+)
+
diff --git a/tests/data/lex_test.ccl b/tests/data/lex_test.ccl
new file mode 100644
index 0000000000000000000000000000000000000000..d1d6d5b383a4bd3eb8e9b77a6d2eba3459fc2984
--- /dev/null
+++ b/tests/data/lex_test.ccl
@@ -0,0 +1,32 @@
+wholeWccl=indecl.wccl
+sentence=t01.xml
+tagset=nkjp
+---
+[]
+
+["verb1"]
+---
+[]
+
+["adv"]
+---
+[]
+
+["part", "part2"]
+---
+[]
+
+[]
+---
+[]
+
+["part", "part2"]
+---
+[]
+
+["part", "part2", "prep"]
+---
+[]
+
+["part", "part2", "adv"]
+---
diff --git a/tests/datadriven.cpp b/tests/datadriven.cpp
index d94dbeb363580bf0f768c0ffef02489c5d6c45ff..377ab2765b6e7287977e1f557216a54ada617290 100644
--- a/tests/datadriven.cpp
+++ b/tests/datadriven.cpp
@@ -28,6 +28,8 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <libwccl/parser/Parser.h>
 #include <libwccl/ops/funexeccontext.h>
 
+#include <libwccl/wcclfile.h>
+
 #include <fstream>
 #include <boost/filesystem/fstream.hpp>
 #include <iostream>
@@ -66,6 +68,8 @@ void test_one_item_actual(const compare_test& c)
 	std::string separators = "=";
 	std::string tagset_name = "kipi";
 	std::string sentence_filename;
+	std::string wholeWccl;
+	boost::shared_ptr<Wccl::WcclFile> wcclFile;
 	std::string line;
 	int line_no = 0;
 	while (ifs_in.good() && line != "---") {
@@ -78,6 +82,8 @@ void test_one_item_actual(const compare_test& c)
 				tagset_name = fields[1];
 			} else if (fields[0] == "sentence") {
 				sentence_filename = fields[1];
+			} else if (fields[0] == "wholeWccl") {
+				wholeWccl = fields[1];
 			}
 		}
 	}
@@ -103,6 +109,7 @@ void test_one_item_actual(const compare_test& c)
 
 	std::string operator_string, expected_output;
 
+	int lexCounter = 0;
 	while (ifs_in.good()) {
 		++line_no;
 		std::getline(ifs_in, line);
@@ -113,7 +120,26 @@ void test_one_item_actual(const compare_test& c)
 			Wccl::Parser parser(tagset);
 			boost::shared_ptr<Wccl::FunctionalOperator> parsed;
 			try {
-				parsed = parser.parseAnyOperator(operator_string);
+				if(wholeWccl.size() == 0){
+					parsed = parser.parseAnyOperator(operator_string);
+				}else{
+					// The operator is fetched from the WCCL file instead of being
+					// parsed from operator_string.
+					path wholeWcclPath = c.search_path / wholeWccl;
+					// Describe the operator before parsing, so the catch block
+					// below reports it if parsing or lookup throws.
+					// snprintf is bounded by the buffer size, so it cannot overflow.
+					char lexCounterStr[16];
+					snprintf(lexCounterStr, sizeof(lexCounterStr), "%d", lexCounter);
+					operator_string = "operator number ";
+					operator_string += lexCounterStr;
+					operator_string += " defined in file: ";
+					operator_string += wholeWcclPath.string();
+					wcclFile = parser.parseWcclFileFromPath(wholeWcclPath.string(), c.search_path.string());
+					// NOTE(review): the operator name "indecl" is hard-coded here.
+					parsed = wcclFile->get_untyped_op_ptr("indecl", lexCounter);
+					lexCounter++;
+				}
 			} catch (Wccl::WcclError& e) {
 				std::cerr << e.info() << "\n---\n" << operator_string << "\n---\n";
 				throw;