diff --git a/libmwereader/CMakeLists.txt b/libmwereader/CMakeLists.txt
index 80355dbcab621478dcc152527d6046f3b96d8df6..f1cd0acecf828c6365ccecd5d986c7324ba3cabf 100644
--- a/libmwereader/CMakeLists.txt
+++ b/libmwereader/CMakeLists.txt
@@ -3,11 +3,6 @@ cmake_minimum_required(VERSION 2.8.0)
 set(libmwereader_major 0)
 set(libmwereader_minor 1)
 
-add_library(corpus2_mwereader SHARED mwereader.cpp )
-
-set_target_properties(corpus2_mwereader PROPERTIES
-	VERSION "${libmwereader_major}.${libmwereader_minor}"
-	SOVERSION ${libmwereader_major})
 
 find_package(Corpus2 1.0.9 REQUIRED)
 set(LIBS ${LIBS} ${Corpus2_LIBRARIES})
@@ -25,12 +20,23 @@ set(LIBS ${LIBS} ${ICU_LIBRARIES} ${ICU_I18N_LIBRARIES})
 find_package(ANTLR REQUIRED)
 include_directories(${ANTLR_INCLUDE_DIR})
 
+find_package(LibXML++ REQUIRED QUIET)
+include_directories(${LibXML++_INCLUDE_DIRS})
+link_directories(${LibXML++_LIBRARY_DIRS})
+set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
+
+add_library(corpus2_mwereader SHARED mwereader.cpp )
+
 target_link_libraries(corpus2_mwereader corpus2)
 
-add_executable(mwertest mwertest.cpp)
+add_executable(mwertest mwertest.cpp mweparser.cpp)
 target_link_libraries(mwertest corpus2_mwereader ${LIBS})
 
 
+set_target_properties(corpus2_mwereader PROPERTIES
+	VERSION "${libmwereader_major}.${libmwereader_minor}"
+	SOVERSION ${libmwereader_major})
+
 if(UNIX)
 	install(TARGETS corpus2_mwereader LIBRARY DESTINATION lib)
 	#install(TARGETS c2pqtest RUNTIME DESTINATION bin)
diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab85753b7ec6bf7bc3ae81aca891742354694e81
--- /dev/null
+++ b/libmwereader/mweparser.cpp
@@ -0,0 +1,132 @@
+#include "mweparser.h"
+
+#include <iostream>
+
+#include <libpwrutils/foreach.h>
+
+#include <libxml++/libxml++.h>
+#include <libxml2/libxml/parser.h>
+#include <boost/make_shared.hpp>
+#include <boost/algorithm/string.hpp>
+
+namespace Corpus2 {
+
+	MWEParser::MWEParser()
+		: BasicSaxParser(), state_(NONE)
+	{
+
+	}
+
+	/// Returns the value of the attribute called name, or "" when absent.
+	std::string MWEParser::get_attribute(const AttributeList& attributes,
+			const std::string &name) const
+	{
+		std::string value;
+		foreach (const Attribute& a, attributes) {
+			if (a.name == name) {
+				value = a.value;
+			}
+		}
+		return value;
+	}
+
+	/// Stores the name/type/class attributes of a <mwegroup> element.
+	void MWEParser::parse_mwegroup_attributes(const AttributeList& attributes)
+	{
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "name") {
+				group_name_ = a.value;
+			} else if(a.name == "type"){
+				group_type_ = a.value;
+			} else if(a.name == "class"){
+				group_class_ = a.value;
+			}
+		}
+
+	}
+
+	/// Advances the state machine on an opening tag; tags that are not
+	/// expected in the current state are ignored.
+	void MWEParser::on_start_element(const Glib::ustring &name,
+			const AttributeList& attributes)
+	{
+		std::cout << state_ << ": " << name << std::endl;
+
+		if(state_ == NONE && name == "units_description"){
+			tagset_ = get_attribute(attributes, "tagset");
+			state_ = UNITSDESC;
+		} else if(state_ == UNITSDESC && name == "mwegroup"){
+			parse_mwegroup_attributes(attributes);
+			state_ = MWEGROUP;
+		} else if(state_ == MWEGROUP && name == "condition"){
+			state_ = CONDITION;
+			grab_characters_ = true;
+			clear_buf();
+		} else if(state_ == MWEGROUP && name == "instances"){
+			state_ = INSTANCES;
+		} else if(state_ == INSTANCES && name == "MWE"){
+			state_ = MWE;
+			mwe_base_ = get_attribute(attributes, "base");
+			// forget <var>/<head> data left over from the previous <MWE>
+			variables_.clear();
+			head_cond_.clear();
+		} else if(state_ == MWE && name == "var"){
+			state_ = VAR;
+			var_name_ = get_attribute(attributes, "name");
+			grab_characters_ = true;
+			clear_buf();
+		} else if(state_ == MWE && name == "head"){
+			state_ = HEAD;
+			grab_characters_ = true;
+			clear_buf();
+		}
+
+	}
+
+	/// Moves back to the enclosing state on a closing tag, storing the text
+	/// collected for <condition>, <var> and <head>; unexpected tags are logged.
+	void MWEParser::on_end_element(const Glib::ustring &name)
+	{
+		std::cout << "/" << state_ << ": " << name << std::endl;
+
+		if(name == "units_description"){
+			state_ = NONE;
+		} else if(state_ == CONDITION && name == "condition"){
+			wccl_operator_ = finish_get_text();
+			std::cout << wccl_operator_ << std::endl;
+			state_ = MWEGROUP;
+		} else if(state_ == MWEGROUP && name == "mwegroup"){
+			state_ = UNITSDESC;
+		} else if(state_ == INSTANCES && name == "instances"){
+			state_ = MWEGROUP;
+		} else if(state_ == MWE && name == "MWE"){
+			state_ = INSTANCES;
+			// TODO: create the unit
+			std::cout << "Tworzenie jednostki: " << mwe_base_ << " dla ";
+			foreach(str_map::value_type &i, variables_)
+				std::cout << i.first << ": " << i.second << ", ";
+			std::cout << "\nhead: " << head_cond_ << "\nop: "
+					<< wccl_operator_ << std::endl;
+			std::cout << "MWE Group name: " << group_name_ << std::endl;
+		} else if(state_ == VAR && name == "var"){
+			state_ = MWE;
+			variables_[var_name_] = finish_get_text();
+		} else if(state_ == HEAD && name == "head"){
+			state_ = MWE;
+			head_cond_ = finish_get_text();
+		} else{
+			std::cerr << "Wrong state_:" << state_ << " for name: "
+					<< name << std::endl;
+		}
+	}
+
+	/// Returns the trimmed buffered text and stops character grabbing.
+	std::string MWEParser::finish_get_text()
+	{
+		std::string str = get_buf();
+		boost::algorithm::trim(str);
+		grab_characters_ = false;
+		return str;
+	}
+
+} // ns Corpus2
diff --git a/libmwereader/mweparser.h b/libmwereader/mweparser.h
new file mode 100644
index 0000000000000000000000000000000000000000..442e1a81ccd79871d470c1825492e00b41f3af77
--- /dev/null
+++ b/libmwereader/mweparser.h
@@ -0,0 +1,68 @@
+#ifndef LIBMWEREADER_MWEPARSER_H
+#define LIBMWEREADER_MWEPARSER_H
+
+#include <map>
+#include <string>
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/sax.h>
+
+namespace Corpus2 {
+
+typedef std::map<std::string, std::string> str_map;
+
+/**
+ * SAX parser for MWE description files (see test_mwe.xml): a
+ * <units_description> with <mwegroup>s, each holding a <condition>
+ * and <instances> of <MWE> elements.
+ */
+class MWEParser : public BasicSaxParser
+{
+public:
+	MWEParser();
+
+protected:
+	/// state machine transition for an opening tag
+	void on_start_element(const Glib::ustring &name,
+			const AttributeList& attributes);
+
+	/// state machine transition for a closing tag
+	void on_end_element(const Glib::ustring &name);
+	/// returns trimmed text gathered so far and stops gathering
+	std::string finish_get_text();
+
+
+	/// retrieves the value of the named attribute ("" if not present)
+	std::string get_attribute(const AttributeList& attributes,
+			const std::string &name) const;
+	/// stores name=, type= and class= of a <mwegroup>
+	void parse_mwegroup_attributes(const AttributeList& attributes);
+
+	/// tagset name used in wccl operators
+	std::string tagset_;
+
+	enum States{NONE, // not started
+		UNITSDESC, // in <units_description>
+		MWEGROUP, // in <mwegroup>
+		CONDITION, // in <condition>
+		INSTANCES, // <instances>
+		MWE, // start of MWE, <MWE>
+		VAR, // <var> of <MWE>
+		HEAD // <head> condition of MWE
+	};
+
+	States state_;
+
+	str_map variables_; // name -> val
+	std::string wccl_operator_;
+	std::string mwe_base_;
+	std::string var_name_;
+	std::string group_name_;
+	std::string group_type_;
+	std::string group_class_;
+	std::string head_cond_;
+};
+
+} // ns Corpus2
+
+#endif // LIBMWEREADER_MWEPARSER_H
diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp
index 95d5ede61ab07f61fc289f1868291e151090c09a..f6bb63b4d826c138e8903ad76a815596c9f38e6c 100644
--- a/libmwereader/mwereader.cpp
+++ b/libmwereader/mwereader.cpp
@@ -1,15 +1,15 @@
 #include "mwereader.h"
+#include <boost/algorithm/string.hpp>
 
 namespace Corpus2{
 
 bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
-	"mwereader","token,chunk,sentence"); // TODO wiecej helpa
+	"mwereader","inner,mwepath"); // TODO more help?
 
 	MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
-		: TokenReader(tagset)
+		: TokenReader(tagset), inner_filename_(filename)
 	{
 		// TODO implementataion
-		std::cerr << "Jestem sobie MWE Readerkiem" << std::endl;
 	}
 
 	MWEReader::~MWEReader()
@@ -19,37 +19,54 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
 
 	Token* MWEReader::get_next_token()
 	{
-		// TODO implementation
-		return 0;
+		// TODO MWE stuff
+		// get whole sentence -> process it -> return token by token
+		return inner_reader_->get_next_token();
 	}
 
 	Sentence::Ptr MWEReader::get_next_sentence()
 	{
-		// TODO implementataion
-		return Sentence::Ptr();
+		// TODO MWE stuff
+		return inner_reader_->get_next_sentence();
 	}
 
 	boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
 	{
-		// TODO implementataion
-		return boost::shared_ptr<Chunk>();
+		// TODO MWE stuff
+		// get whole chunk -> process sentences -> return processed chunk
+		return inner_reader_->get_next_chunk();
 	}
 
 	void MWEReader::set_option(const std::string& option)
 	{
-		// TODO implementataion
+
+		if(boost::algorithm::starts_with(option, "inner:")) {
+			std::string inner = option.substr(6);
+			inner_reader_ = create_path_reader(inner, this->tagset(),
+					inner_filename_);
+		}
+
+
+		// TODO MWE stuff
 	}
 
 	void MWEReader::validate()
 	{
-		// TODO implementataion
+		if(inner_reader_ == NULL)
+			throw Corpus2Error("Inner reader not initialised.");
+		// TODO MWE stuff
 	}
 
 	std::string MWEReader::get_option(const std::string& option) const
 	{
-		// TODO implementataion
-		std::string s;
-		return s;
+		// Without an inner reader there is nothing to delegate to, so the
+		// option is reported as unset instead of dereferencing a null ptr.
+		if(!inner_reader_)
+			return std::string();
+		if(boost::algorithm::starts_with(option, "inner:"))
+			return option;
+		// TODO options for MWE
+		return inner_reader_->get_option(option);
 	}
 
 
diff --git a/libmwereader/mwereader.h b/libmwereader/mwereader.h
index 71466e51d925bafe4ca640a100f32dd38e849e84..e1e2c2b468293251cc0a2f8b72abb0b67d2ca38d 100644
--- a/libmwereader/mwereader.h
+++ b/libmwereader/mwereader.h
@@ -9,14 +9,23 @@ namespace Corpus2 {
 class MWEReader: public TokenReader
 {
 public:
+	/**
+	 * \param filename corpus filename (MWE file is given in options)
+	 */
 	MWEReader(const Tagset& tagset, const std::string& filename);
 
 	~MWEReader();
 
+	/// retrieves whole sentence, finds MWEs, and returns tokens
 	Token* get_next_token();
 
+	/// the preferred mode for this reader
 	Sentence::Ptr get_next_sentence();
 
+	/**
+	 * retrieves chunk with inner reader and then searches for MWEs within
+	 * sentences.
+	 */
 	boost::shared_ptr<Chunk> get_next_chunk();
 
 	void set_option(const std::string& option);
@@ -34,6 +43,12 @@ public:
 	virtual void validate();
 
 	static bool registered;
+private:
+	/// ptr to inner reader doing the real work of reading a corpus
+	TokenReaderPtr inner_reader_;
+	/// path for inner reader
+	std::string inner_filename_;
+	/// inner reader option
 };
 
 } // ns Corpus2
diff --git a/libmwereader/mwertest.cpp b/libmwereader/mwertest.cpp
index e6a6badf71cc8658d4984563f705a8770704ded8..cb78a83ebf5cabf6df32e1018e0603dac7fcd40d 100644
--- a/libmwereader/mwertest.cpp
+++ b/libmwereader/mwertest.cpp
@@ -1,7 +1,16 @@
 #include <iostream>
+#include "mweparser.h"
 
 int main(int ac, char**av)
 {
-	std::cout << "TEST" << std::endl;
+	using namespace Corpus2;
+	if (ac < 2) {
+		std::cerr << "Usage: mwertest MWE_FILE" << std::endl;
+		return 1;
+	}
+	std::cout << "Starting tests" << std::endl;
+
+	MWEParser parser;
+	parser.parse_file(av[1]);
 
 }
diff --git a/libmwereader/test_mwe.xml b/libmwereader/test_mwe.xml
new file mode 100644
index 0000000000000000000000000000000000000000..78c3d1a5a9678a48e1983ceb60017947c1ad89b6
--- /dev/null
+++ b/libmwereader/test_mwe.xml
@@ -0,0 +1,22 @@
+<?xml version='1.0' encoding='utf-8'?>
+<units_description tagset='kipi'>
+	<mwegroup name="SubstSubstFix" type="fix" class="subst">
+		<condition>
+			and(
+				inter(base[0],$s:Subst1),
+				inter(class[0],{subst,ger,depr}),
+				inter(base[1],$s:Subst2),
+				inter(class[1],{subst,ger,depr}),
+				inter(cas[0], cas[1])
+			)
+		</condition>
+		<instances>
+			<MWE base="Jan Paweł">
+				<var name="Subst1">jan</var>
+				<var name="Subst2">paweł</var>
+				<head>inter(cas[0], {nom})</head>
+			</MWE>
+		</instances>
+	</mwegroup>
+
+</units_description>