Skip to content
Snippets Groups Projects
Commit f7411288 authored by Bartosz Broda's avatar Bartosz Broda
Browse files

new mwe format parsing (strings only)

parent 37c0e455
Branches
No related merge requests found
#include "mweparser.h"
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <fstream>
#include <boost/algorithm/string.hpp>
namespace Corpus2 {
......@@ -14,15 +15,30 @@ namespace Corpus2 {
}
std::string MWEParser::get_tagset_from_attributes(const AttributeList& attributes) const
std::string MWEParser::get_attribute(const AttributeList& attributes,
const std::string &name) const
{
std::string value;
foreach (const Attribute& a, attributes) {
if (a.name == name) {
value = a.value;
}
}
return value;
}
/**
 * Store the attributes of a group element in the parser's members.
 *
 * Recognised attribute names and the members they fill:
 *  - "name"  -> group_name_
 *  - "type"  -> group_type_
 *  - "class" -> group_class_
 *
 * Attributes with any other name are ignored. Members whose attribute is
 * absent are left untouched.
 *
 * @param attributes attribute list of the element being processed
 */
void MWEParser::parse_mwegroup_attributes(const AttributeList& attributes)
{
	foreach (const Attribute& a, attributes) {
		if (a.name == "name") {
			group_name_ = a.value;
		} else if (a.name == "type") {
			group_type_ = a.value;
		} else if (a.name == "class") {
			group_class_ = a.value;
		}
	}
}
void MWEParser::on_start_element(const Glib::ustring &name,
......@@ -31,11 +47,31 @@ namespace Corpus2 {
std::cout << state_ << ": " << name << std::endl;
if(state_ == NONE && name == "units_description"){
tagset_ = get_attribute(attributes, "tagset");
state_ = UNITSDESC;
tagset_ = get_tagset_from_attributes(attributes);
} else if (state_ == UNITSDESC && name == "macros"){
state_ = MACROS;
} else if(state_ == UNITSDESC && name == "mwegroup"){
parse_mwegroup_attributes(attributes);
state_ = MWEGROUP;
} else if(state_ == MWEGROUP && name == "condition"){
state_ = CONDITION;
grab_characters_ = true;
clear_buf();
} else if(state_ == MWEGROUP && name == "instances"){
state_ = INSTANCES;
} else if(state_ == INSTANCES && name == "MWE"){
state_ = MWE;
mwe_base_ = get_attribute(attributes, "base");
} else if(state_ == MWE && name == "var"){
state_ = VAR;
var_name_ = get_attribute(attributes, "name");
grab_characters_ = true;
clear_buf();
} else if(state_ == MWE && name == "head"){
state_ = HEAD;
grab_characters_ = true;
clear_buf();
}
}
void MWEParser::on_end_element(const Glib::ustring &name)
......@@ -44,9 +80,41 @@ namespace Corpus2 {
if(name == "units_description"){
state_ = NONE;
} else if(state_ == MACROS, name == "macros"){
} else if(state_ == CONDITION && name == "condition"){
wccl_operator_ = finish_get_text();
std::cout << wccl_operator_ << std::endl;
state_ = MWEGROUP;
} else if(state_ == MWEGROUP && name == "mwegroup"){
state_ = UNITSDESC;
} else if(state_ == INSTANCES && name == "instances"){
state_ = MWEGROUP;
} else if(state_ == MWE && name == "MWE"){
state_ = INSTANCES;
// TODO: create the unit
std::cout << "Tworzenie jednostki: " << mwe_base_ << " dla ";
foreach(str_map::value_type &i, variables_)
std::cout << i.first << ": " << i.second << ", ";
std::cout << "\nhead: " << head_cond_ << "\nop: "
<< wccl_operator_ << std::endl;
std::cout << "MWE Group name: " << group_name_ << std::endl;
} else if(state_ == VAR && name == "var"){
state_ = MWE;
variables_[var_name_] = finish_get_text();
} else if(state_ == HEAD && name == "head"){
state_ = MWE;
head_cond_ = finish_get_text();
} else{
std::cerr << "Wrong state_:" << state_ << " for name: "
<< name << std::endl;
}
}
/**
 * Finish collecting character data for the current element.
 *
 * Takes the current buffer contents, strips leading and trailing
 * whitespace, and clears the grab_characters_ flag.
 *
 * @return the trimmed buffer contents
 */
std::string MWEParser::finish_get_text()
{
	std::string text(get_buf());
	boost::algorithm::trim(text);

	grab_characters_ = false;
	return text;
}
} // ns Corpus2
......@@ -6,6 +6,8 @@
namespace Corpus2 {
typedef std::map<std::string, std::string> str_map;
class MWEParser : public BasicSaxParser
{
public:
......@@ -16,29 +18,37 @@ protected:
const AttributeList& attributes);
void on_end_element(const Glib::ustring &name);
std::string finish_get_text();
/// retrieves tagset= attribute
std::string get_tagset_from_attributes(const AttributeList& attributes) const;
std::string get_attribute(const AttributeList& attributes,
const std::string &name) const;
void parse_mwegroup_attributes(const AttributeList& attributes);
/// tagset name used in wccl operators
std::string tagset_;
/// Parser states; each value names the XML element the parser is inside.
/// Every enumerator name appears exactly once (a repeated name is a
/// redefinition error).
enum States{NONE, // not started
	UNITSDESC, // in <units_description>
	MACROS, // in <macros>
	MACROSINGLE, // in <m>
	NAME, // <name> of a macro
	COND, // <con> of a macro
	LU, // in <LU>
	LUBASE, // <LUbase> of lexical unit
	TYPE, // type (<t>) of lexical unit
	PATTERN, // pattern(i.e., macro) of lexical unit (<pat>)
	HEADCOND, // head condition (<h>)
	CLASS, // (flex) class of lexical unit (class)
	MWEGROUP, // in <mwegroup>
	CONDITION, // in <condition>
	INSTANCES, // <instances>
	MWE, // start of MWE, <MWE>
	VAR, // <var> of <MWE>
	HEAD // <head> condition of MWE
};
States state_;
str_map variables_; // name -> val
std::string wccl_operator_;
std::string mwe_base_;
std::string var_name_;
std::string group_name_;
std::string group_type_;
std::string group_class_;
std::string head_cond_;
};
} // ns Corpus2
......
......@@ -19,4 +19,4 @@
</instances>
</mwegroup>
</units_description>
\ No newline at end of file
</units_description>
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment