diff --git a/libmwereader/CMakeLists.txt b/libmwereader/CMakeLists.txt
index 80355dbcab621478dcc152527d6046f3b96d8df6..f1cd0acecf828c6365ccecd5d986c7324ba3cabf 100644
--- a/libmwereader/CMakeLists.txt
+++ b/libmwereader/CMakeLists.txt
@@ -3,11 +3,6 @@ cmake_minimum_required(VERSION 2.8.0)
 set(libmwereader_major 0)
 set(libmwereader_minor 1)
 
-add_library(corpus2_mwereader SHARED mwereader.cpp )
-
-set_target_properties(corpus2_mwereader PROPERTIES
-	VERSION "${libmwereader_major}.${libmwereader_minor}"
-	SOVERSION ${libmwereader_major})
 
 find_package(Corpus2 1.0.9 REQUIRED)
 set(LIBS ${LIBS} ${Corpus2_LIBRARIES})
@@ -25,12 +20,23 @@ set(LIBS ${LIBS} ${ICU_LIBRARIES} ${ICU_I18N_LIBRARIES})
 find_package(ANTLR REQUIRED)
 include_directories(${ANTLR_INCLUDE_DIR})
 
+find_package(LibXML++ REQUIRED QUIET)
+include_directories(${LibXML++_INCLUDE_DIRS})
+link_directories(${LibXML++_LIBRARY_DIRS})
+set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
+
+add_library(corpus2_mwereader SHARED mwereader.cpp )
+
 target_link_libraries(corpus2_mwereader corpus2)
 
-add_executable(mwertest mwertest.cpp)
+add_executable(mwertest mwertest.cpp mweparser.cpp)
 target_link_libraries(mwertest corpus2_mwereader ${LIBS})
 
 
+set_target_properties(corpus2_mwereader PROPERTIES
+	VERSION "${libmwereader_major}.${libmwereader_minor}"
+	SOVERSION ${libmwereader_major})
+
 if(UNIX)
 	install(TARGETS corpus2_mwereader LIBRARY DESTINATION lib)
 	#install(TARGETS c2pqtest RUNTIME DESTINATION bin)
diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aab66be1e5649f69ea03aedeea53b0f233bf2736
--- /dev/null
+++ b/libmwereader/mweparser.cpp
@@ -0,0 +1,59 @@
+#include "mweparser.h"
+
+#include <libpwrutils/foreach.h>
+#include <libxml++/libxml++.h>
+#include <libxml2/libxml/parser.h>
+#include <boost/make_shared.hpp>
+#include <fstream>
+#include <iostream>
+
+namespace Corpus2 {
+
+	MWEParser::MWEParser()
+		: BasicSaxParser(), state_(NONE)
+	{
+
+	}
+
+	/// Returns the value of the "tagset" attribute, or an empty string
+	/// when the element carries no such attribute.
+	std::string MWEParser::get_tagset_from_attributes(const AttributeList& attributes) const
+	{
+		std::string tagset;
+		foreach (const Attribute& a, attributes) {
+			if (a.name == "tagset") {
+				tagset = a.value;
+			}
+		}
+		return tagset;
+	}
+
+	/// Start-tag handler: enters UNITSDESC (storing the tagset) or MACROS.
+	void MWEParser::on_start_element(const Glib::ustring &name,
+			const AttributeList& attributes)
+	{
+		// debug trace of the state in which each start tag is seen
+		std::cout << state_ << ": " << name << std::endl;
+
+		if(state_ == NONE && name == "units_description"){
+			state_ = UNITSDESC;
+			tagset_ = get_tagset_from_attributes(attributes);
+		} else if (state_ == UNITSDESC && name == "macros"){
+			state_ = MACROS;
+		}
+	}
+
+	/// End-tag handler: returns to the state of the enclosing element.
+	void MWEParser::on_end_element(const Glib::ustring &name)
+	{
+		std::cout << "/" << state_ << ": " << name << std::endl;
+
+		if(name == "units_description"){
+			state_ = NONE;
+		} else if(state_ == MACROS && name == "macros"){
+			// only leave MACROS when the <macros> element itself closes
+			state_ = UNITSDESC;
+		}
+	}
+
+} // ns Corpus2
diff --git a/libmwereader/mweparser.h b/libmwereader/mweparser.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4adabc29e6ce472d483421cccab8fe12d72fe5c
--- /dev/null
+++ b/libmwereader/mweparser.h
@@ -0,0 +1,51 @@
+#ifndef LIBMWEREADER_MWEPARSER_H
+#define LIBMWEREADER_MWEPARSER_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/sax.h>
+
+namespace Corpus2 {
+
+/// SAX-style parser that tracks which element of a <units_description>
+/// document is currently open and remembers the document's tagset name.
+class MWEParser : public BasicSaxParser
+{
+public:
+	MWEParser();
+
+protected:
+	/// start-tag handler: advances state_ and stores the tagset attribute
+	void on_start_element(const Glib::ustring &name,
+			const AttributeList& attributes);
+
+	/// end-tag handler: moves state_ back to the enclosing element
+	void on_end_element(const Glib::ustring &name);
+
+
+	/// retrieves tagset= attribute (empty string when it is absent)
+	std::string get_tagset_from_attributes(const AttributeList& attributes) const;
+
+	/// tagset name used in wccl operators
+	std::string tagset_;
+
+	/// position of the parser within the document
+	enum States{NONE, // not started
+		UNITSDESC, // in <units_description>
+		MACROS, // in <macros>
+		MACROSINGLE, // in <m>
+		NAME, // <name> of a macro
+		VAR, // <var> of a macro
+		COND, // <con> of a macro
+		LU, // in <LU>
+		LUBASE, // <LUbase> of lexical unit
+		TYPE, // type (<t>) of lexical unit
+		PATTERN, // pattern(i.e., macro) of lexical unit (<pat>)
+		HEADCOND, // head condition (<h>)
+		CLASS, // (flex) class of lexical unit (class)
+	};
+	States state_;
+};
+
+} // ns Corpus2
+
+#endif // LIBMWEREADER_MWEPARSER_H
diff --git a/libmwereader/mwertest.cpp b/libmwereader/mwertest.cpp
index e6a6badf71cc8658d4984563f705a8770704ded8..cb78a83ebf5cabf6df32e1018e0603dac7fcd40d 100644
--- a/libmwereader/mwertest.cpp
+++ b/libmwereader/mwertest.cpp
@@ -1,7 +1,19 @@
 #include <iostream>
+#include "mweparser.h"
 
 
 int main(int ac, char**av)
 {
-	std::cout << "TEST" << std::endl;
+	using namespace Corpus2;
+
+	// av[1] is only a usable path when at least one argument was given
+	if (ac < 2) {
+		std::cerr << "Usage: mwertest <mwe-file>" << std::endl;
+		return 1;
+	}
+
+	std::cout << "Starting tests" << std::endl;
+
+	MWEParser parser;
+	parser.parse_file(av[1]);
 }