Skip to content
Snippets Groups Projects
Commit 39d553cc authored by ilor's avatar ilor
Browse files

Merge branch 'master' of nlp.pwr.wroc.pl:wccl

parents 1a40b138 ad75559f
Branches
No related merge requests found
......@@ -3,11 +3,6 @@ cmake_minimum_required(VERSION 2.8.0)
set(libmwereader_major 0)
set(libmwereader_minor 1)
add_library(corpus2_mwereader SHARED mwereader.cpp )
set_target_properties(corpus2_mwereader PROPERTIES
VERSION "${libmwereader_major}.${libmwereader_minor}"
SOVERSION ${libmwereader_major})
find_package(Corpus2 1.0.9 REQUIRED)
set(LIBS ${LIBS} ${Corpus2_LIBRARIES})
......@@ -25,12 +20,23 @@ set(LIBS ${LIBS} ${ICU_LIBRARIES} ${ICU_I18N_LIBRARIES})
find_package(ANTLR REQUIRED)
include_directories(${ANTLR_INCLUDE_DIR})
find_package(LibXML++ REQUIRED QUIET)
include_directories(${LibXML++_INCLUDE_DIRS})
link_directories(${LibXML++_LIBRARY_DIRS})
set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
add_library(corpus2_mwereader SHARED mwereader.cpp )
target_link_libraries(corpus2_mwereader corpus2)
add_executable(mwertest mwertest.cpp)
add_executable(mwertest mwertest.cpp mweparser.cpp)
target_link_libraries(mwertest corpus2_mwereader ${LIBS})
set_target_properties(corpus2_mwereader PROPERTIES
VERSION "${libmwereader_major}.${libmwereader_minor}"
SOVERSION ${libmwereader_major})
if(UNIX)
install(TARGETS corpus2_mwereader LIBRARY DESTINATION lib)
#install(TARGETS c2pqtest RUNTIME DESTINATION bin)
......
#include "mweparser.h"
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <boost/algorithm/string.hpp>
namespace Corpus2 {
/// Builds a parser whose state machine starts in NONE, i.e. before
/// the <units_description> root element has been seen.
MWEParser::MWEParser()
    : BasicSaxParser()
    , state_(NONE)
{
}
std::string MWEParser::get_attribute(const AttributeList& attributes,
const std::string &name) const
{
std::string value;
foreach (const Attribute& a, attributes) {
if (a.name == name) {
value = a.value;
}
}
return value;
}
void MWEParser::parse_mwegroup_attributes(const AttributeList& attributes)
{
foreach (const Attribute& a, attributes) {
if (a.name == "name") {
group_name_ = a.value;
} else if(a.name == "type"){
group_type_ = a.value;
} else if(a.name == "class"){
group_class_ = a.value;
}
}
}
/**
 * Handler for an opening element. Advances the state machine and
 * captures the attributes needed later.
 *
 * Recognised transitions (state -> element -> new state):
 *   NONE      -> units_description -> UNITSDESC (reads "tagset")
 *   UNITSDESC -> mwegroup          -> MWEGROUP  (reads name/type/class)
 *   MWEGROUP  -> condition         -> CONDITION (starts text capture)
 *   MWEGROUP  -> instances         -> INSTANCES
 *   INSTANCES -> MWE               -> MWE       (reads "base")
 *   MWE       -> var               -> VAR       (reads "name", starts text capture)
 *   MWE       -> head              -> HEAD      (starts text capture)
 * Any other state/element combination is ignored.
 *
 * \param name       element name
 * \param attributes attributes of the element
 */
void MWEParser::on_start_element(const Glib::ustring &name,
        const AttributeList& attributes)
{
    // debug trace of the state machine
    std::cout << state_ << ": " << name << std::endl;

    if (state_ == NONE && name == "units_description") {
        tagset_ = get_attribute(attributes, "tagset");
        state_ = UNITSDESC;
    } else if (state_ == UNITSDESC && name == "mwegroup") {
        parse_mwegroup_attributes(attributes);
        // A new group starts with no condition; without this a group
        // lacking <condition> would reuse the previous group's operator.
        wccl_operator_.clear();
        state_ = MWEGROUP;
    } else if (state_ == MWEGROUP && name == "condition") {
        state_ = CONDITION;
        grab_characters_ = true;
        clear_buf();
    } else if (state_ == MWEGROUP && name == "instances") {
        state_ = INSTANCES;
    } else if (state_ == INSTANCES && name == "MWE") {
        state_ = MWE;
        mwe_base_ = get_attribute(attributes, "base");
        // Per-MWE data must start empty, otherwise variables and the head
        // condition of an earlier <MWE> leak into this one.
        variables_.clear();
        head_cond_.clear();
    } else if (state_ == MWE && name == "var") {
        state_ = VAR;
        var_name_ = get_attribute(attributes, "name");
        grab_characters_ = true;
        clear_buf();
    } else if (state_ == MWE && name == "head") {
        state_ = HEAD;
        grab_characters_ = true;
        clear_buf();
    }
}
/**
 * Handler for a closing element. Moves the state machine back to the
 * enclosing element's state and stores the text captured for
 * <condition>, <var> and <head>.
 *
 * Closing <units_description> resets the state to NONE regardless of the
 * current state. A closing element that does not match the current state
 * is reported on std::cerr and otherwise ignored.
 *
 * \param name element name
 */
void MWEParser::on_end_element(const Glib::ustring &name)
{
    // debug trace of the state machine
    std::cout << "/" << state_ << ": " << name << std::endl;

    if (name == "units_description") {
        state_ = NONE;
        return;
    }

    bool handled = false;
    switch (state_) {
    case CONDITION:
        if (name == "condition") {
            wccl_operator_ = finish_get_text();
            std::cout << wccl_operator_ << std::endl;
            state_ = MWEGROUP;
            handled = true;
        }
        break;
    case MWEGROUP:
        if (name == "mwegroup") {
            state_ = UNITSDESC;
            handled = true;
        }
        break;
    case INSTANCES:
        if (name == "instances") {
            state_ = MWEGROUP;
            handled = true;
        }
        break;
    case MWE:
        if (name == "MWE") {
            state_ = INSTANCES;
            // TODO: create the unit
            std::cout << "Tworzenie jednostki: " << mwe_base_ << " dla ";
            for (str_map::const_iterator var = variables_.begin();
                    var != variables_.end(); ++var) {
                std::cout << var->first << ": " << var->second << ", ";
            }
            std::cout << "\nhead: " << head_cond_ << "\nop: "
                    << wccl_operator_ << std::endl;
            std::cout << "MWE Group name: " << group_name_ << std::endl;
            handled = true;
        }
        break;
    case VAR:
        if (name == "var") {
            state_ = MWE;
            variables_[var_name_] = finish_get_text();
            handled = true;
        }
        break;
    case HEAD:
        if (name == "head") {
            state_ = MWE;
            head_cond_ = finish_get_text();
            handled = true;
        }
        break;
    default:
        break;
    }

    if (!handled) {
        std::cerr << "Wrong state_:" << state_ << " for name: "
                << name << std::endl;
    }
}
/**
 * Ends character capture for the current element.
 *
 * \return contents of the character buffer with leading and trailing
 *         whitespace removed
 */
std::string MWEParser::finish_get_text()
{
    const std::string raw = get_buf();
    grab_characters_ = false;
    return boost::algorithm::trim_copy(raw);
}
} // ns Corpus2
#ifndef LIBMWEREADER_MWEPARSER_H
#define LIBMWEREADER_MWEPARSER_H
#include <libcorpus2/io/reader.h>
#include <libcorpus2/io/sax.h>
namespace Corpus2 {
typedef std::map<std::string, std::string> str_map;
/**
 * SAX parser for MWE description files.
 *
 * The expected document layout is
 *   <units_description tagset="...">
 *     <mwegroup name="..." type="..." class="...">
 *       <condition>...</condition>
 *       <instances>
 *         <MWE base="..."> <var name="...">...</var> <head>...</head> </MWE>
 *       </instances>
 *     </mwegroup>
 *   </units_description>
 * and is tracked with a small state machine (see States).
 */
class MWEParser : public BasicSaxParser
{
public:
/// Creates a parser in the NONE state.
MWEParser();
protected:
/// Handler for an opening element: advances the state machine and
/// reads the attributes of the element.
void on_start_element(const Glib::ustring &name,
const AttributeList& attributes);
/// Handler for a closing element: returns to the enclosing state and
/// stores the text captured for <condition>, <var> and <head>.
void on_end_element(const Glib::ustring &name);
/// Stops character capture and returns the captured text with
/// surrounding whitespace trimmed.
std::string finish_get_text();
/// Returns the value of the attribute called \a name, or an empty
/// string when the list has no such attribute.
std::string get_attribute(const AttributeList& attributes,
const std::string &name) const;
/// Stores the name, type and class attributes of a <mwegroup> element.
void parse_mwegroup_attributes(const AttributeList& attributes);
/// tagset name used in wccl operators; read from the tagset attribute
/// of <units_description>
std::string tagset_;
/// Position of the parser within the document.
enum States{NONE, // not started
UNITSDESC, // in <units_description>
MWEGROUP, // in <mwegroup>
CONDITION, // in <condition>
INSTANCES, // in <instances>
MWE, // in <MWE>
VAR, // in <var> of <MWE>
HEAD, // in <head> (head condition) of <MWE>
};
/// current state of the state machine
States state_;
/// values of <var> elements, keyed by their name attribute
str_map variables_; // name -> val
/// trimmed text of the <condition> element
std::string wccl_operator_;
/// base attribute of the current <MWE>
std::string mwe_base_;
/// name attribute of the <var> currently being read
std::string var_name_;
/// name attribute of the current <mwegroup>
std::string group_name_;
/// type attribute of the current <mwegroup>
std::string group_type_;
/// class attribute of the current <mwegroup>
std::string group_class_;
/// trimmed text of the <head> element
std::string head_cond_;
};
} // ns Corpus2
#endif // LIBMWEREADER_MWEPARSER_H
#include "mwereader.h"
#include <boost/algorithm/string.hpp>
namespace Corpus2{
// Registers MWEReader with TokenReader under the "mwereader" id.
// NOTE(review): the second argument presumably lists the options the
// reader accepts ("inner", "mwepath") -- confirm against
// register_path_reader. A leftover second argument line from the previous
// revision made this statement ill-formed and has been dropped.
bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
    "mwereader", "inner,mwepath"); // TODO more help?
/**
 * \param tagset   tagset passed on to the TokenReader base
 * \param filename remembered in inner_filename_ and handed to the inner
 *                 reader when it is created in set_option()
 */
MWEReader::MWEReader(const Tagset &tagset, const std::string &filename)
    // A constructor may have only one member-initializer list; the stale
    // ": TokenReader(tagset)" line from the previous revision is removed.
    : TokenReader(tagset), inner_filename_(filename)
{
    // TODO implementation
    std::cerr << "Jestem sobie MWE Readerkiem" << std::endl;
}
MWEReader::~MWEReader()
......@@ -19,37 +19,51 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Returns the next token from the inner reader.
 *
 * The leftover stub "return 0;" preceded the delegation and made it
 * unreachable; it has been removed. inner_reader_ is only set by
 * set_option("inner:..."), and validate() throws while it is unset.
 */
Token* MWEReader::get_next_token()
{
    // TODO MWE stuff
    // get whole sentence -> process it -> return token by token
    return inner_reader_->get_next_token();
}
/**
 * Returns the next sentence from the inner reader.
 *
 * The leftover stub returning an empty Sentence::Ptr preceded the
 * delegation and made it unreachable; it has been removed.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
    // TODO MWE stuff
    return inner_reader_->get_next_sentence();
}
/**
 * Returns the next chunk from the inner reader.
 *
 * The leftover stub returning an empty pointer preceded the delegation
 * and made it unreachable; it has been removed.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
    // TODO MWE stuff
    // get whole chunk -> process sentences -> return processed chunk
    return inner_reader_->get_next_chunk();
}
/**
 * Handles reader options.
 *
 * An option of the form "inner:<reader>" creates the inner reader named
 * <reader> for inner_filename_ using this reader's tagset. Every other
 * option is currently ignored.
 */
void MWEReader::set_option(const std::string& option)
{
    // TODO implementation
    const std::string inner_prefix("inner:");
    if (boost::algorithm::starts_with(option, inner_prefix)) {
        // everything after the prefix names the inner reader
        std::string inner = option.substr(inner_prefix.size());
        inner_reader_ = create_path_reader(inner, this->tagset(),
                inner_filename_);
    }
    // TODO MWE stuff
}
/**
 * Checks that the reader is usable.
 *
 * \throws Corpus2Error when no inner reader has been created yet
 */
void MWEReader::validate()
{
    // TODO implementation
    if (!inner_reader_) {
        throw Corpus2Error("Inner reader not initialised.");
    }
    // TODO MWE stuff
}
/**
 * Reports the value of an option.
 *
 * \return an empty string while no inner reader exists; \a option itself
 *         for "inner:..." options; otherwise whatever the inner reader
 *         reports for \a option
 */
std::string MWEReader::get_option(const std::string& option) const
{
    // Without an inner reader there is nothing to ask. The previous code
    // fell through to inner_reader_->get_option() and dereferenced a null
    // pointer in that case.
    if (!inner_reader_) {
        return std::string();
    }
    if (boost::algorithm::starts_with(option, "inner:")) {
        return option;
    }
    // TODO options for MWE
    return inner_reader_->get_option(option);
}
......
......@@ -9,14 +9,23 @@ namespace Corpus2 {
class MWEReader: public TokenReader
{
public:
/**
* \param filename corpus filename (MWE file is given in options)
*/
MWEReader(const Tagset& tagset, const std::string& filename);
~MWEReader();
/// retrieves a whole sentence, finds MWEs, and returns tokens
Token* get_next_token();
/// the preferred mode for this reader
Sentence::Ptr get_next_sentence();
/**
* retrieves chunk with inner reader and then searches for MWEs within
* sentences.
*/
boost::shared_ptr<Chunk> get_next_chunk();
void set_option(const std::string& option);
......@@ -34,6 +43,12 @@ public:
virtual void validate();
static bool registered;
private:
/// ptr to inner reader doing the real work of reading a corpus
TokenReaderPtr inner_reader_;
/// path for inner reader
std::string inner_filename_;
/// inner reader option
};
} // ns Corpus2
......
#include <iostream>
#include "mweparser.h"
/**
 * Test driver: parses the MWE description file named by the first
 * command-line argument with MWEParser.
 *
 * \return 0 after parsing, 1 when no file name was given
 */
int main(int ac, char**av)
{
    std::cout << "TEST" << std::endl;
    using namespace Corpus2;
    std::cout << "Starting tests" << std::endl;

    // av[1] is only valid when an argument was actually passed
    if (ac < 2) {
        std::cerr << "Usage: " << (ac > 0 ? av[0] : "mwertest")
                << " MWE_FILE" << std::endl;
        return 1;
    }

    MWEParser parser;
    parser.parse_file(av[1]);
    return 0;
}
<?xml version='1.0' encoding='utf-8'?>
<units_description tagset='kipi'>
<mwegroup name="SubstSubstFix" type="fix" class="subst">
<condition>
and(
inter(base[0],$s:Subst1),
inter(class[0],{subst,ger,depr}),
inter(base[1],$s:Subst2),
inter(class[1],{subst,ger,depr}),
inter(cas[0], cas[1])
)
</condition>
<instances>
<MWE base="Jan Paweł">
<var name="Subst1">jan</var>
<var name="Subst2">paweł</var>
<head>inter(cas[0], {nom})</head>
</MWE>
</instances>
</mwegroup>
</units_description>
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment