From 66d3eb6d3b198f1b520ea1c2b875ca39dec11aa3 Mon Sep 17 00:00:00 2001
From: Bartosz Broda <bartosz.broda@gmail.com>
Date: Tue, 14 Jun 2011 11:44:11 +0200
Subject: [PATCH] first working version of mwereader (for sentences only)

---
 libmwereader/mwe.cpp       |  2 +-
 libmwereader/mwe.h         |  2 +-
 libmwereader/mweparser.cpp |  4 +-
 libmwereader/mwereader.cpp | 79 +++++++++++++++++++++++++++++++++-----
 libmwereader/mwereader.h   |  3 ++
 libmwereader/mwertest.cpp  |  2 +
 libmwereader/test_mwe.xml  |  9 ++++-
 7 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp
index 5799d2c..ec4bbe0 100644
--- a/libmwereader/mwe.cpp
+++ b/libmwereader/mwe.cpp
@@ -24,7 +24,7 @@ LexicalUnit::LexicalUnit(const std::string &base,
 }
 
 bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
-		std::set<size_t> &out_position, int &head_pos) const
+		std::set<int> &out_position, int &head_pos) const
 {
 	// set variables
 	for(variables_map::const_iterator ivars = variables_.begin();
diff --git a/libmwereader/mwe.h b/libmwereader/mwe.h
index 439323f..0fec3e0 100644
--- a/libmwereader/mwe.h
+++ b/libmwereader/mwe.h
@@ -35,7 +35,7 @@ public:
 	 * \returns true if this lexical unit was found here
 	 */
 	virtual bool IsHere(const Wccl::SentenceContext& sc,
-		std::set<size_t> &out_positions, int &head_pos) const;
+		std::set<int> &out_positions, int &head_pos) const;
 
 	const std::string & get_base() const{ return base_;}
 	const variables_map & get_variables() const{ return variables_;}
diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp
index cda5674..54e8b27 100644
--- a/libmwereader/mweparser.cpp
+++ b/libmwereader/mweparser.cpp
@@ -42,6 +42,8 @@ namespace Corpus2 {
 		return op;
 	}
 
+	//////////////////////////////////////////////////////////////////////
+
 	MWEParser::MWEParser(MWEIndex &index)
 		: BasicSaxParser(), state_(NONE), mwe_index_(index)
 	{
@@ -71,7 +73,7 @@ namespace Corpus2 {
 									  + group_type_);
 		}
 
-
+		variables_.clear();
 	}
 
 	std::string MWEParser::get_attribute(const AttributeList& attributes,
diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp
index e6328be..6b0c861 100644
--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -30,33 +30,92 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 	{
 		// TODO MWE stuff
 		Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
-
 		Wccl::SentenceContext sc(pSentence);
 
+		return process_sentence(sc);
+
+	}
+
+	/**
+	 * Collapses every multi-word expression found in the sentence behind
+	 * \a sc into a single token.
+	 *
+	 * The head token of a match gets the space-joined orths of all matched
+	 * tokens and, on its disamb lexemes, the base of the matched lexical
+	 * unit; the remaining matched tokens are deleted from the sentence.
+	 *
+	 * \param sc context wrapping the sentence; the sentence is modified in place
+	 * \returns the sentence held by \a sc
+	 */
+	Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
+	{
 		for(int i = 0; i < sc.size(); ++i){
 			sc.set_position(i);
 
-			Corpus2::Token *pToken = (*pSentence)[i];
-			std::cout << pToken->orth_utf8() << " ";
+			Corpus2::Token *pToken = sc.at(i);
+			// set once a merge has rewritten the sentence; pToken and its
+			// lexemes may be gone by then
+			bool merged = false;
 			std::vector<Lexeme>& lexemes = pToken->lexemes();
 			foreach(const Lexeme& lex, lexemes){
 				if(lex.is_disamb()){
 					std::string base = lex.lemma_utf8();
 					const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
-					if(potential.size())
-						std::cout << "# ";
 					foreach(LexicalUnit::Ptr pLU, potential){
-						std::set<size_t> positions;
+						std::set<int> positions;
 						int head;
 						bool is_here = pLU->IsHere(sc, positions, head);
-						if(is_here)
-							std::cout << "** " << pLU->get_base() << "** ";
+						if(is_here){
+							// i is adjusted below; remember where the scan stood
+							const int scan_pos = i;
+							std::string new_orth_utf8;
+							Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();
+
+							std::vector<Token*> &tokens = sent->tokens();
+							foreach(const int &pos, positions){
+								Token* tok = tokens[pos];
+								// join the orths with single spaces, no trailing one
+								if(!new_orth_utf8.empty())
+									new_orth_utf8 += " ";
+								new_orth_utf8 += tok->orth_utf8();
+								if(pos != head){
+									delete tok;
+									tokens[pos] = NULL;
+									// a token removed at or before the scan position
+									// shifts the next unvisited token one slot left
+									if(pos <= scan_pos)
+										--i;
+								}
+							}
+
+							Corpus2::Token *head_tok = (*sent)[head];
+							head_tok->set_orth_utf8(new_orth_utf8);
+							foreach(Lexeme& head_lex, head_tok->lexemes())
+								if(head_lex.is_disamb())
+									head_lex.set_lemma_utf8(pLU->get_base());
+
+							// drop the slots left by the deleted tokens
+							std::vector<Token*>::iterator del_iter = tokens.begin();
+							while(del_iter != tokens.end()){
+								if(*del_iter == NULL)
+									del_iter = tokens.erase(del_iter);
+								else
+									++del_iter;
+							}
+
+							// the sentence changed under the enclosing loops, so
+							// stop matching at this position
+							merged = true;
+							break;
+						}
 					}
+					if(merged)
+						break;
 				}
 			}
 		}
-		std::cout << "ENDL\n";
-		return pSentence;
+
+		return sc.get_sentence_ptr();
 	}
 
 	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h
index 4c65c2e..93d1719 100644
--- a/libmwereader/mwereader.h
+++ b/libmwereader/mwereader.h
@@ -46,6 +46,9 @@ public:
 
 	static bool registered;
 
+protected:
+	Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);
+
 private:
 	void load_mwes(const std::string& filename);
 
diff --git a/libmwereader/mwertest.cpp b/libmwereader/mwertest.cpp
index c5b9ceb..3265fad 100644
--- a/libmwereader/mwertest.cpp
+++ b/libmwereader/mwertest.cpp
@@ -11,4 +11,6 @@ int main(int ac, char**av)
 
 	MWEParser parser(temp_index);
 	parser.parse_file(av[1]);
+
+
 }
diff --git a/libmwereader/test_mwe.xml b/libmwereader/test_mwe.xml
index 020213f..85368ac 100644
--- a/libmwereader/test_mwe.xml
+++ b/libmwereader/test_mwe.xml
@@ -9,8 +9,8 @@
 
 			setvar($Pos2, 1),
 			inter(class[0],{subst,ger,depr}),
-			inter(class[1],{subst,ger,depr}),
-			inter(cas[0], cas[1])
+			inter(class[1],{subst,ger,depr})//,
+			//inter(cas[0], cas[1]) - overly strict constraint, e.g. "Debatowali nad ceną netto"
 			)
 		</condition>
 		<instances>
@@ -24,6 +24,11 @@
 				<var name="Subst2">netto</var>
 				<head>inter(base[0], "waga")</head>
 			</MWE>
+			<MWE base="ratyfikacja traktatu">
+				<var name="Subst1">ratyfikacja</var>
+				<var name="Subst2">traktat</var>
+				<head>inter(base[0], "traktat")</head>
+			</MWE>
 		</instances>
 	</mwegroup>
 	<mwegroup name="AdjSubstFix" type="fix" class="subst">
-- 
GitLab