diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index aab66be1e5649f69ea03aedeea53b0f233bf2736..ab85753b7ec6bf7bc3ae81aca891742354694e81 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -1,10 +1,11 @@ #include "mweparser.h" #include <libpwrutils/foreach.h> + #include <libxml++/libxml++.h> #include <libxml2/libxml/parser.h> #include <boost/make_shared.hpp> -#include <fstream> +#include <boost/algorithm/string.hpp> namespace Corpus2 { @@ -14,15 +15,30 @@ namespace Corpus2 { } - std::string MWEParser::get_tagset_from_attributes(const AttributeList& attributes) const + std::string MWEParser::get_attribute(const AttributeList& attributes, + const std::string &name) const + { + std::string value; + foreach (const Attribute& a, attributes) { + if (a.name == name) { + value = a.value; + } + } + return value; + } + + void MWEParser::parse_mwegroup_attributes(const AttributeList& attributes) { - std::string tagset; foreach (const Attribute& a, attributes) { - if (a.name == "tagset") { - tagset = a.value; + if (a.name == "name") { + group_name_ = a.value; + } else if(a.name == "type"){ + group_type_ = a.value; + } else if(a.name == "class"){ + group_class_ = a.value; } } - return tagset; + } void MWEParser::on_start_element(const Glib::ustring &name, @@ -31,11 +47,31 @@ namespace Corpus2 { std::cout << state_ << ": " << name << std::endl; if(state_ == NONE && name == "units_description"){ + tagset_ = get_attribute(attributes, "tagset"); state_ = UNITSDESC; - tagset_ = get_tagset_from_attributes(attributes); - } else if (state_ == UNITSDESC && name == "macros"){ - state_ = MACROS; + } else if(state_ == UNITSDESC && name == "mwegroup"){ + parse_mwegroup_attributes(attributes); + state_ = MWEGROUP; + } else if(state_ == MWEGROUP && name == "condition"){ + state_ = CONDITION; + grab_characters_ = true; + clear_buf(); + } else if(state_ == MWEGROUP && name == "instances"){ + state_ = INSTANCES; + } else if(state_ == INSTANCES && name == "MWE"){ + state_ = MWE; + mwe_base_ = get_attribute(attributes, "base"); + } else if(state_ == MWE && name == "var"){ + state_ = VAR; + var_name_ = get_attribute(attributes, "name"); + grab_characters_ = true; + clear_buf(); + } else if(state_ == MWE && name == "head"){ + state_ = HEAD; + grab_characters_ = true; + clear_buf(); } + } void MWEParser::on_end_element(const Glib::ustring &name) @@ -44,9 +80,41 @@ namespace Corpus2 { if(name == "units_description"){ state_ = NONE; - } else if(state_ == MACROS, name == "macros"){ + } else if(state_ == CONDITION && name == "condition"){ + wccl_operator_ = finish_get_text(); + std::cout << wccl_operator_ << std::endl; + state_ = MWEGROUP; + } else if(state_ == MWEGROUP && name == "mwegroup"){ state_ = UNITSDESC; + } else if(state_ == INSTANCES && name == "instances"){ + state_ = MWEGROUP; + } else if(state_ == MWE && name == "MWE"){ + state_ = INSTANCES; + // TODO: tworzenie jednostki + std::cout << "Tworzenie jednostki: " << mwe_base_ << " dla "; + foreach(str_map::value_type &i, variables_) + std::cout << i.first << ": " << i.second << ", "; + std::cout << "\nhead: " << head_cond_ << "\nop: " + << wccl_operator_ << std::endl; + std::cout << "MWE Group name: " << group_name_ << std::endl; + } else if(state_ == VAR && name == "var"){ + state_ = MWE; + variables_[var_name_] = finish_get_text(); + } else if(state_ == HEAD && name == "head"){ + state_ = MWE; + head_cond_ = finish_get_text(); + } else{ + std::cerr << "Wrong state_:" << state_ << " for name: " + << name << std::endl; } } + std::string MWEParser::finish_get_text() + { + std::string str = get_buf(); + boost::algorithm::trim(str); + grab_characters_ = false; + return str; + } + } // ns Corpus2 diff --git a/libmwereader/mweparser.h b/libmwereader/mweparser.h index e4adabc29e6ce472d483421cccab8fe12d72fe5c..442e1a81ccd79871d470c1825492e00b41f3af77 100644 --- a/libmwereader/mweparser.h +++ b/libmwereader/mweparser.h @@ -6,6 +6,8 @@ namespace Corpus2 { +typedef std::map<std::string, std::string> str_map; + class MWEParser : public BasicSaxParser { public: @@ -16,29 +18,37 @@ protected: const AttributeList& attributes); void on_end_element(const Glib::ustring &name); + std::string finish_get_text(); /// retrives tagset= attribute - std::string get_tagset_from_attributes(const AttributeList& attributes) const; + std::string get_attribute(const AttributeList& attributes, + const std::string &name) const; + void parse_mwegroup_attributes(const AttributeList& attributes); /// tagset name used in wccl operators std::string tagset_; enum States{NONE, // not started UNITSDESC, // in <units_description - MACROS, // in <macros> - MACROSINGLE, // in <m> - NAME, // <name> of a macro - VAR, // <var> of a macro - COND, // <con> of a macro - LU, // in <LU> - LUBASE, // <LUbase> of lexical unit - TYPE, // type (<t>) of lexical unit - PATTERN, // pattern(i.e., macro) of lexical unit (<pat>) - HEADCOND, // head condition (<h>) - CLASS, // (flex) class of lexical unit (class) + MWEGROUP, // in <mwegroup> + CONDITION, // in <condition> + INSTANCES, // <instances> + MWE, // start of MWE, <MWE> + VAR, // <var> of <MWE> + HEAD, // <head> condition of MWE }; + States state_; + + str_map variables_; // name -> val + std::string wccl_operator_; + std::string mwe_base_; + std::string var_name_; + std::string group_name_; + std::string group_type_; + std::string group_class_; + std::string head_cond_; }; } // ns Corpus2 diff --git a/libmwereader/test_mwe.xml b/libmwereader/test_mwe.xml index cd2073b0f96a71a3fafba234d610cf4f9367bb84..78c3d1a5a9678a48e1983ceb60017947c1ad89b6 100644 --- a/libmwereader/test_mwe.xml +++ b/libmwereader/test_mwe.xml @@ -19,4 +19,4 @@ </instances> </mwegroup> -</units_description> \ No newline at end of file +</units_description>