From cc8c48adb0bea22be8d82344c8b8420a97054e40 Mon Sep 17 00:00:00 2001 From: Bartosz Broda <bartosz.broda@gmail.com> Date: Mon, 13 Jun 2011 23:27:58 +0200 Subject: [PATCH] add add_lexicalunit to index, small improvement in creation of mwe in sax parser --- libmwereader/mwe.cpp | 56 +++++++++++++++++++++++++++--------- libmwereader/mwe.h | 58 ++++++++++++++++++++++++++++++++------ libmwereader/mweparser.cpp | 16 ++++++++--- 3 files changed, 104 insertions(+), 26 deletions(-) diff --git a/libmwereader/mwe.cpp b/libmwereader/mwe.cpp index e76aaf4..897c80e 100644 --- a/libmwereader/mwe.cpp +++ b/libmwereader/mwe.cpp @@ -4,27 +4,21 @@ namespace Corpus2{ LexicalUnit::LexicalUnit(const std::string &base, - boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition, - boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, - std::map<std::string, std::string> variables) + LexicalUnit::BoolOpPtr condition, + LexicalUnit::BoolOpPtr head_cond, + LexicalUnit::strmap variables) : condition_(condition), head_cond_(head_cond), variables_(variables), base_(base), nowhere_(Wccl::Position()) { - // noop + for(strmap::iterator iter = variables_.begin(); + iter != variables_.end(); ++iter) + potential_bases_.insert(iter->second); } -FixedLU::FixedLU(const std::string &base, - boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition, - boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, - std::map<std::string, std::string> variables) - : LexicalUnit(base, condition, head_cond, variables) -{ -} - -bool FixedLU::IsHere(const Wccl::SentenceContext &sc, +bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, std::set<size_t> &out_position) { boost::shared_ptr<const Wccl::Bool> pResult = condition_->apply(sc); @@ -45,4 +39,40 @@ bool FixedLU::IsHere(const Wccl::SentenceContext &sc, return true; } + +FixedLU::FixedLU(const std::string &base, + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition, + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, + std::map<std::string, std::string> variables) + : LexicalUnit(base, condition, head_cond, variables) +{ +} + +FlexLU::FlexLU(const std::string &base, + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition, + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, + std::map<std::string, std::string> variables) + : LexicalUnit(base, condition, head_cond, variables) +{ +} + +MWEIndex::MWEIndex() +{ + +} + +void MWEIndex::add_lexicalunit(LexicalUnitPtr lu) +{ + foreach(const std::string& base, lu->get_potential_bases()){ + value_type::iterator find = index_.find(base); + if(find == index_.end()){ // not found -> create new one + luvec v; + v.push_back(lu); + index_.insert( std::make_pair(base, v)); + }else{// already exists -> add lu + (find->second).push_back(lu); + } + } +} + }//ns Corpus2 diff --git a/libmwereader/mwe.h b/libmwereader/mwe.h index f62f78f..0676ad0 100644 --- a/libmwereader/mwe.h +++ b/libmwereader/mwe.h @@ -1,6 +1,8 @@ #ifndef LIBMWEREADER_MWE_H #define LIBMWEREADER_MWE_H +#include <boost/unordered_map.hpp> + #include <libcorpus2/io/reader.h> #include <libwccl/ops/operator.h> @@ -11,24 +13,48 @@ namespace Corpus2 { class LexicalUnit { public: - LexicalUnit(const std::string &base, - boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition, - boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, - std::map<std::string, std::string> variables + typedef std::map<std::string, std::string> strmap; + typedef std::set<std::string> strset; + typedef boost::shared_ptr<Wccl::Operator<Wccl::Bool> > BoolOpPtr; + + LexicalUnit(const std::string &base, BoolOpPtr condition, + BoolOpPtr head_cond, strmap variables ); + /** + * \param sc SentenceContext with position set to value which + * will be checked + * \param out_positions will contain absolute position in + * SentenceContext (called with sc->get_abs_position) only if + * the main condition of this LexicalUnit will return true in current + * sentence context + * \returns true if this lexical unit was found here + */ virtual bool IsHere(const Wccl::SentenceContext& sc, - std::set<size_t> &out_position) = 0; + std::set<size_t> &out_positions) ; + + const std::string & get_base() const{ return base_;} + const strmap & get_variables() const{ return variables_;} + const strset& get_potential_bases() const{ return potential_bases_;} + + protected: + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition_; boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond_; - std::map<std::string, std::string> variables_; + strmap variables_; std::string base_; + strset potential_bases_; + const Wccl::Position nowhere_; }; +typedef boost::shared_ptr<LexicalUnit> LexicalUnitPtr; + + +// TODO: czy bedzie potrzebny podzial na fix/flex w kodzie? class FixedLU : public LexicalUnit { public: @@ -37,18 +63,32 @@ public: boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, std::map<std::string, std::string> variables ); - virtual bool IsHere(const Wccl::SentenceContext& sc, - std::set<size_t> &out_position); - }; class FlexLU : public LexicalUnit { +public: + FlexLU(const std::string &base, + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > condition, + boost::shared_ptr<Wccl::Operator<Wccl::Bool> > head_cond, + std::map<std::string, std::string> variables + ); }; class MWEIndex // lub base -> vector<LexicalUnit> { +public: + MWEIndex(); + + void add_lexicalunit(LexicalUnitPtr lu); + +protected: + typedef std::vector<LexicalUnitPtr> luvec; + typedef boost::unordered_map<std::string,luvec> value_type; + + + value_type index_; }; }// ns Corpus2 diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index 5f4e88d..c8fd91e 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -60,10 +60,17 @@ namespace Corpus2 { MWEBuilder::BoolOpPtr head = mwe_builder_->get_head_condition( head_cond_); - //foreach(const std::string&varname, main->valid_variable_names()) - //if(boost::algorithm::starts_with(varname, "Pos")) - //std::cout << "Pozycja: " << varname << std::endl; - + LexicalUnitPtr lu; + if(group_type_ == "fix"){ // group_name_ -> lower case + lu = LexicalUnitPtr(new FixedLU(mwe_base_, main, head, + variables_)); + } else if(group_type_ == "flex"){ + lu = LexicalUnitPtr(new FlexLU(mwe_base_, main, head, + variables_)); + } else { + throw Wccl::WcclError("Unknown type of lexical unit:" + + group_type_); + } } std::string MWEParser::get_attribute(const AttributeList& attributes, @@ -85,6 +92,7 @@ namespace Corpus2 { group_name_ = a.value; } else if(a.name == "type"){ group_type_ = a.value; + boost::algorithm::to_lower(group_type_); } else if(a.name == "class"){ group_class_ = a.value; } -- GitLab