From 44625c67025676b1d1d73b0490053aef930a9db9 Mon Sep 17 00:00:00 2001 From: Bartosz Broda <bartosz.broda@gmail.com> Date: Fri, 10 Jun 2011 21:00:23 +0200 Subject: [PATCH] some work on mwe parsing --- libmwereader/mweparser.cpp | 12 ++++++++++-- libmwereader/mweparser.h | 18 +++++++++++++++++- libmwereader/mwereader.cpp | 2 +- libmwereader/mwereader.h | 2 +- libmwereader/mwertest.cpp | 6 ++++-- 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index 6cedc52..add215e 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -1,6 +1,7 @@ #include "mweparser.h" #include <libpwrutils/foreach.h> +#include <libcorpus2/tagsetmanager.h> #include <libxml++/libxml++.h> #include <libxml2/libxml/parser.h> @@ -9,8 +10,14 @@ namespace Corpus2 { - MWEParser::MWEParser() - : BasicSaxParser(), state_(NONE) + MWEBuilder::MWEBuilder(const Tagset& tagset) + : tagset_(tagset) + { + + } + + MWEParser::MWEParser(MWEIndex &index) + : BasicSaxParser(), state_(NONE), mwe_index_(index) { } @@ -47,6 +54,7 @@ namespace Corpus2 { if(state_ == NONE && name == "units_description"){ tagset_ = get_attribute(attributes, "tagset"); + mwe_builder_ = boost::shared_ptr<MWEBuilder>(new MWEBuilder(Corpus2::get_named_tagset(tagset_))); state_ = UNITSDESC; } else if(state_ == UNITSDESC && name == "mwegroup"){ parse_mwegroup_attributes(attributes); diff --git a/libmwereader/mweparser.h b/libmwereader/mweparser.h index 81313e5..85cde5f 100644 --- a/libmwereader/mweparser.h +++ b/libmwereader/mweparser.h @@ -3,18 +3,31 @@ #include <libcorpus2/io/reader.h> #include <libcorpus2/io/sax.h> +#include <boost/unordered_map.hpp> + +#include "mwe.h" namespace Corpus2 { class MWEBuilder { +public: + MWEBuilder(const Tagset& tagset); + typedef boost::unordered_map<std::string, std::string> value_type; + +private: + const Tagset& tagset_; + /// str -> ptr to ccl operator + value_type main_conditions_; + /// str -> ptr to ccl operator + value_type head_conditions_; }; class MWEParser : public BasicSaxParser { public: - MWEParser(); + MWEParser(MWEIndex &index); protected: typedef std::map<std::string, std::string> str_map; @@ -56,6 +69,9 @@ protected: std::string group_type_; std::string group_class_; std::string head_cond_; + + MWEIndex &mwe_index_; + boost::shared_ptr<MWEBuilder> mwe_builder_; }; } // ns Corpus2 diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index b8e0909..d7b38c7 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -73,7 +73,7 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( void MWEReader::load_mwes(const std::string &filename) { - MWEParser parser; + MWEParser parser(mwe_index_); parser.parse_file(filename); } diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h index 7b8b4d9..4c65c2e 100644 --- a/libmwereader/mwereader.h +++ b/libmwereader/mwereader.h @@ -49,7 +49,7 @@ public: private: void load_mwes(const std::string& filename); - //MWEIndex mwe_index_; + MWEIndex mwe_index_; /// ptr to inner reader doing the real work of reading a corpus TokenReaderPtr inner_reader_; /// path for inner reader diff --git a/libmwereader/mwertest.cpp b/libmwereader/mwertest.cpp index cb78a83..c5b9ceb 100644 --- a/libmwereader/mwertest.cpp +++ b/libmwereader/mwertest.cpp @@ -5,8 +5,10 @@ int main(int ac, char**av) { using namespace Corpus2; - std::cout << "Starting tests" << std::endl; + std::cout << "Starting tests... " << ac<< std::endl; - MWEParser parser; + MWEIndex temp_index; + + MWEParser parser(temp_index); parser.parse_file(av[1]); } -- GitLab