From 20edcc90470b4e719da1d544dbe49b410efa5983 Mon Sep 17 00:00:00 2001 From: omekr <roman.kurc@pwr.wroc.pl> Date: Fri, 5 Aug 2011 13:10:39 +0200 Subject: [PATCH] test against converted multiword units --- libmwereader/mwe.cpp | 15 +++++++++++---- libmwereader/mweparser.cpp | 17 +++++++++++------ libmwereader/mwereader.cpp | 3 +++ libmwereader/tests/mwefunctional.cpp | 2 ++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp index 4fcc72b..9397a4f 100644 --- a/libmwereader/mwe.cpp +++ b/libmwereader/mwe.cpp @@ -1,6 +1,7 @@ #include "mwe.h" #include <boost/algorithm/string.hpp> #include <libwccl/values/strset.h> +#include <boost/algorithm/string/predicate.hpp> namespace Corpus2{ @@ -27,11 +28,14 @@ LexicalUnit::LexicalUnit(const std::string &base, bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, std::set<int> &out_position, int &head_pos) const { - // set variables + // set variables, skip vars with names starting with '!' for(variables_map::const_iterator ivars = variables_.begin(); - ivars != variables_.end(); ++ivars){ - condition_->set<Wccl::StrSet>(ivars->first, ivars->second); - } + ivars != variables_.end(); ++ivars){ + if(!boost::starts_with(ivars->first, "!")){ + std::cout << ivars->first << " " << std::endl; + condition_->set<Wccl::StrSet>(ivars->first, ivars->second); + } + } // fire up the operator boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc); @@ -102,6 +106,7 @@ void MWEIndex::add_lexicalunit(LexicalUnit::Ptr lu) { foreach(const std::string& base, lu->get_potential_bases()){ value_type::iterator find = index_.find(base); + std::cout << "b:"<<base<<std::endl; if(find == index_.end()){ // not found -> create new one luvec v; v.push_back(lu); @@ -113,6 +118,8 @@ void MWEIndex::add_lexicalunit(LexicalUnit::Ptr lu) } const MWEIndex::luvec& MWEIndex::get_potential_lu(const std::string &base){ + std::cout << "index " << index_.size()<< std::endl; + std::cout << "sb:"<<base<<std::endl; value_type::iterator find = index_.find(base); if(find == index_.end()){ // not found -> return empty return empty_; diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index 1edb9c6..5d7534f 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -34,9 +34,9 @@ namespace Corpus2 { if(search != where.end()) return search->second; - + //std::cout << " dddddddddddddd "<< cond << std::endl; BoolOpPtr op = parser_.parseBoolOperator(cond); - + //std::cout << " dddddddddddddd $$$$" << cond << std::endl; where[cond] = op; return op; @@ -57,24 +57,28 @@ namespace Corpus2 { void MWEParser::create_mwe() { print_current_mwe(true); + //std::cout << " kupa cond" << std::endl; MWEBuilder::BoolOpPtr main = mwe_builder_->get_mwe_condition( wccl_operator_); + //std::cout << " kupa head" << std::endl; MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition( head_cond_); - + //std::cout << " kupa " << std::endl; if(group_type_ == "fix"){ // group_name_ -> lower case - + //std::cout << " kupa fix" << std::endl; mwe_index_.add_lexicalunit( LexicalUnit::Ptr(new FixedLU(mwe_base_, main, head, variables_))); } else if(group_type_ == "flex"){ + //std::cout << " kupa flex" << std::endl; mwe_index_.add_lexicalunit(LexicalUnit::Ptr(new FlexLU(mwe_base_, main, head, variables_))); } else { throw Wccl::WcclError("Unknown type of lexical unit:" + group_type_); } - + //std::cout << " kupa clear" << std::endl; variables_.clear(); + //std::cout << "po kupie " << std::endl; } std::string MWEParser::get_attribute(const AttributeList& attributes, @@ -109,8 +113,9 @@ namespace Corpus2 { void MWEParser::on_start_element(const Glib::ustring &name, const AttributeList& attributes) { + std::cout << "about to check" << std::endl; std::cout << state_ << ": " << name << std::endl; - + std::cout << "done with check" << std::endl; if(state_ == NONE && name == "units_description"){ tagset_ = get_attribute(attributes, "tagset"); mwe_builder_ = boost::shared_ptr<MWEBuilder>(new MWEBuilder(Corpus2::get_named_tagset(tagset_))); diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index 83739ee..90b3bcd 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -47,10 +47,13 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( if(lex.is_disamb()){ std::string base = lex.lemma_utf8(); const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base); + std::cout << "potential " << potential.size() << std::endl; foreach(LexicalUnit::Ptr pLU, potential){ std::set<int> positions; int head; + //std::cout << " is " << std::endl; bool is_here = pLU->IsHere(sc, positions, head); + //std::cout << " is out" << std::endl; if(is_here){ std::string new_orth_utf8; Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr(); diff --git a/libmwereader/tests/mwefunctional.cpp b/libmwereader/tests/mwefunctional.cpp index 141d3fc..5e0b082 100644 --- a/libmwereader/tests/mwefunctional.cpp +++ b/libmwereader/tests/mwefunctional.cpp @@ -31,6 +31,8 @@ struct Fixture{ }; + + BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture) { BOOST_MESSAGE("test: finding preferred lexeme"); -- GitLab