Skip to content
Snippets Groups Projects
Commit 634aea36 authored by Radosław Warzocha's avatar Radosław Warzocha
Browse files

Merge branch 'new_morfeusz'

parents 8b9433d9 c0d84a91
No related branches found
No related tags found
No related merge requests found
# Locate the Morfeusz2 morphological analyser.
#
# Variables set by this module:
#   MORFEUSZ2_INCLUDE_DIR - directory containing morfeusz2.h
#   MORFEUSZ2_LIBRARY     - path to the morfeusz2 library
#   MORFEUSZ2_FOUND       - TRUE when both of the above were located
#
# Honours the standard Morfeusz2_FIND_QUIETLY / Morfeusz2_FIND_REQUIRED flags.
FIND_PATH(MORFEUSZ2_INCLUDE_DIR morfeusz2.h /usr/include /usr/local/include)
MARK_AS_ADVANCED(MORFEUSZ2_INCLUDE_DIR)

# The keyword introducing extra search directories is PATHS; with the former
# spelling "PATH" the keyword and both directories were treated as additional
# library names rather than as search locations.
FIND_LIBRARY(MORFEUSZ2_LIBRARY NAMES morfeusz2 PATHS /usr/lib /usr/local/lib)
MARK_AS_ADVANCED(MORFEUSZ2_LIBRARY)

IF (MORFEUSZ2_INCLUDE_DIR AND MORFEUSZ2_LIBRARY)
    SET(MORFEUSZ2_FOUND TRUE)
ENDIF (MORFEUSZ2_INCLUDE_DIR AND MORFEUSZ2_LIBRARY)

IF (MORFEUSZ2_FOUND)
    IF (NOT Morfeusz2_FIND_QUIETLY)
        MESSAGE(STATUS "Found Morfeusz2: ${MORFEUSZ2_LIBRARY}")
    ENDIF (NOT Morfeusz2_FIND_QUIETLY)
ELSE (MORFEUSZ2_FOUND)
    IF (Morfeusz2_FIND_REQUIRED)
        MESSAGE(FATAL_ERROR "Could not find Morfeusz2")
    ELSE (Morfeusz2_FIND_REQUIRED)
        IF (NOT Morfeusz2_FIND_QUIETLY)
            MESSAGE("Morfeusz2 NOT found")
        ENDIF (NOT Morfeusz2_FIND_QUIETLY)
    ENDIF (Morfeusz2_FIND_REQUIRED)
ENDIF (MORFEUSZ2_FOUND)
; Data from Morfeusz2 (SGJP), converted to the simplified NKJP tagset (possible genders: n, m1, m2, m3, f)
[general]
tagset=nkjp
; dot and hyphen sequences as separate tokens
toki-config=nkjp
; Analyser definitions: each [ma:NAME] section defines an analyser that the
; [rule] and [default] sections below refer to by NAME.
; "unknown": class=const with tag=ign (presumably attaches that fixed tag to any token it receives)
[ma:unknown]
class=const
tagset=nkjp
tag=ign
; "interp": constant analyser yielding the interp tag
[ma:interp]
class=const
tagset=nkjp
tag=interp
; "url": constant analyser yielding subst:sg:nom:m3
[ma:url]
class=const
tagset=nkjp
tag=subst:sg:nom:m3
; "morfeusz": the Morfeusz2-backed analyser (class name matches Morfeusz2Analyser::identifier);
; "converter" names the tagset conversion file loaded by its config constructor
[ma:morfeusz]
class=morfeusz2
tagset=nkjp
converter=sgjp2nkjp.conv
; Tokens whose toki type is "p" are handled by the "interp" analyser
[rule]
toki_type=p
ma=interp
; Tokens whose toki type is "tu" are handled by the "url" analyser
[rule]
toki_type=tu
ma=url
; All remaining tokens: "morfeusz" is listed first, then "unknown"
; (presumably tried in this order, "unknown" acting as the fallback -- confirm against the dispatcher)
[default]
ma=morfeusz
ma=unknown
......@@ -76,6 +76,7 @@ SET(libmaca_STAT_SRC
morph/dispatchanalyser.cpp
morph/mapanalyser.cpp
morph/morphanalyser.cpp
morph/morfeusz2analyser.cpp
util/confignode.cpp
util/debug.cpp
util/sentenceanalyser.cpp
......@@ -141,6 +142,11 @@ if (MORFEUSZ_FOUND)
endif(MORFEUSZ_PLUGIN)
endif (MORFEUSZ_FOUND)
# Morfeusz2 is mandatory here: morph/morfeusz2analyser.cpp is listed in the
# library sources and the results below are used unconditionally. REQUIRED
# makes a missing installation fail right away with the find module's
# "Could not find Morfeusz2" error instead of propagating a
# MORFEUSZ2_LIBRARY-NOTFOUND value into LIBS.
find_package(Morfeusz2 REQUIRED)
set(LIBS ${LIBS} ${MORFEUSZ2_LIBRARY})
include_directories(${MORFEUSZ2_INCLUDE_DIR})
# NOTE(review): the FindMorfeusz2 module only sets MORFEUSZ2_LIBRARY (a full
# path), not MORFEUSZ2_LIBRARY_DIRS, so this call looks like a no-op -- confirm.
link_directories(${MORFEUSZ2_LIBRARY_DIRS})
find_package(Corpus)
if(BUILD_GPL_PLUGINS)
if (CORPUS_FOUND)
......
/*
Copyright (C) 2014 Radosław Warzocha, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#include <fstream>
#include <libpwrutils/util.h>
#include <libmaca/util/settings.h>
#include "morfeusz2analyser.h"
namespace Maca {
// statics
// Class identifier string for this analyser ("morfeusz2").
const char* Morfeusz2Analyser::identifier = "morfeusz2";
// Holds the result of registering this class with MorphAnalyser; the
// registration call itself runs during static initialisation.
bool Morfeusz2Analyser::registered =
MorphAnalyser::register_analyser<Morfeusz2Analyser>();
// Charset set on the Morfeusz instance before each analysis (UTF-8).
const morfeusz::Charset Morfeusz2Analyser::charset = morfeusz::UTF8;
// construct, copy, destruct
/**
 * Build an analyser around an already constructed tagset converter.
 *
 * The analyser stores conv and deletes it in its destructor. The converter's
 * output tagset is checked against the given tagset via
 * require_matching_tagsets.
 *
 * All option members start from defined values: the ign tag is
 * default-constructed and both warning flags are off. Previously
 * warn_on_ign_ was left uninitialised by this constructor although
 * morfeusz_into_token reads it.
 *
 * @param tagset output tagset of the analyser
 * @param conv   converter to use; must not be NULL (it is dereferenced here)
 */
Morfeusz2Analyser::Morfeusz2Analyser(const Corpus2::Tagset* tagset,
		Conversion::TagsetConverter* conv)
	: MorphAnalyser(tagset), conv_(conv), ign_tag_(), warn_on_ign_(false),
	  warn_on_fold_failure_(false)
{
	require_matching_tagsets(conv_->tagset_to(), *tagset,
		"Morfeusz analyser creation");
}
/**
 * Build an analyser from a configuration node.
 *
 * Keys read: "converter" (converter file, opened through Path), "ign_tag"
 * (default "ign"), "warn_on_ign" (default false) and "warn_on_fold_failure"
 * (default false).
 *
 * The converter stays owned by a local smart pointer until every remaining
 * step has completed. If any of them throws (presumably parse_simple_tag can,
 * on an unknown tag name), the converter is freed; handing it to conv_
 * earlier would leak it, because the destructor of a partially constructed
 * object never runs.
 */
Morfeusz2Analyser::Morfeusz2Analyser(const Config::Node& cfg)
	: MorphAnalyser(cfg), conv_(NULL), ign_tag_(), warn_on_ign_(false),
	  warn_on_fold_failure_(false)
{
	std::string fn = cfg.get("converter", "");
	std::ifstream ifs;
	Path::Instance().open_stream_or_throw(fn, ifs, "converter");
	Config::Node conv_cfg = Config::from_stream(ifs);
	std::auto_ptr<Conversion::TagsetConverter> c(
		new Conversion::TagsetConverter(conv_cfg));
	require_matching_tagsets(c->tagset_to(), *this,
		"Morfeusz analyser creation");

	std::string ign_tag_string = cfg.get("ign_tag", "ign");
	ign_tag_ = c->tagset_from().parse_simple_tag(ign_tag_string);
	warn_on_ign_ = cfg.get("warn_on_ign", false);
	warn_on_fold_failure_ = cfg.get("warn_on_fold_failure", false);

	// Nothing below can fail, so ownership can now move to the member.
	conv_ = c.release();
}
/**
 * Create an independent copy of this analyser.
 *
 * The copy gets its own clone of the converter and the same ign tag and
 * warning settings. The returned object is allocated with new.
 */
Morfeusz2Analyser* Morfeusz2Analyser::clone() const
{
	Morfeusz2Analyser* duplicate =
		new Morfeusz2Analyser(&tagset(), conv_->clone());
	// Options are not constructor arguments, so carry them over by hand.
	duplicate->warn_on_fold_failure_ = warn_on_fold_failure_;
	duplicate->warn_on_ign_ = warn_on_ign_;
	duplicate->ign_tag_ = ign_tag_;
	return duplicate;
}
/// Destructor; frees the tagset converter held in conv_.
Morfeusz2Analyser::~Morfeusz2Analyser()
{
delete conv_;
}
// public methods
/**
 * Analyse one tokeniser token with Morfeusz2 and pass the results to sink.
 *
 * @param t    token whose orth is analysed (converted to UTF-8 first)
 * @param sink callback receiving the converted tokens
 * @return true when tokens were produced; false when Morfeusz returned no
 *         interpretation, or a single interpretation with an empty lemma
 *
 * The Morfeusz instance and the results iterator are heap objects handed to
 * the caller, so both are destroyed here on every path. They used to be
 * leaked on each call.
 *
 * NOTE(review): creating a Morfeusz instance per token is likely expensive;
 * caching one per analyser would need a new class member.
 */
bool Morfeusz2Analyser::process_functional(const Toki::Token &t,
		boost::function<void(Corpus2::Token *)> sink)
{
	using namespace morfeusz;
	std::string s = PwrNlp::to_utf8(t.orth());
	std::vector<details::Morfeusz2Edge> pmorf;

	Morfeusz *morf = Morfeusz::createInstance(ANALYSE_ONLY);
	ResultsIterator *res_iter = NULL;
	try {
		morf->setCharset(charset);
		res_iter = morf->analyse(s);
		while (res_iter->hasNext())
			pmorf.push_back(details::Morfeusz2Edge(res_iter->next(), morf));
	} catch (...) {
		// Iterator first: it was obtained from the instance.
		delete res_iter;
		delete morf;
		throw;
	}
	// Each edge copied the orth, lemma and tag string it needs, so the
	// Morfeusz objects are no longer required past this point.
	delete res_iter;
	delete morf;

	if (pmorf.size() == 1 && pmorf[0].lemma.length() > 0) { // only one analysis
		Corpus2::Token *tok = make_token(t, pmorf[0]);
		std::vector<Corpus2::Token*> vec(1, tok);
		flush_convert(vec, sink);
		return true;
	} else if (pmorf.size() > 1) {
		return process_complex_analysis(t, pmorf, sink);
	} else {
		return false;
	}
}
// private methods
/**
 * Emit tokens for an analysis consisting of more than one edge.
 *
 * Builds adjacency lists over pmorf and walks the graph nodes starting at 0.
 * Tokens from nodes with a single outgoing edge are gathered and converted
 * together. A node with several outgoing edges is treated as a segmentation
 * ambiguity: every alternative path is followed to a common merge node and
 * the set of paths goes through the ambiguous-path conversion.
 *
 * @param t     source token (used when creating tokens and in error reports)
 * @param pmorf edges obtained from Morfeusz; their token fields are filled in
 *              by build_adjacency_lists
 * @param sink  receiver of the converted tokens
 * @return always true; unsupported graph shapes are reported by throwing
 * @throws Morfeusz2Error when the graph does not have the expected shape
 */
bool Morfeusz2Analyser::process_complex_analysis(const Toki::Token &t,
std::vector<details::Morfeusz2Edge>& pmorf,
boost::function<void(Corpus2::Token *)>sink)
{
adjacency_lists alists = build_adjacency_lists(t, pmorf);
// succ[n]: indices (into pmorf) of edges leaving node n;
// prec[n]: indices of edges entering node n.
adj_list &succ = alists.first, &prec = alists.second;
// tokens gathered from the current run of non-ambiguous nodes
std::vector<Corpus2::Token*> unambiguous;
int current_node = 0, node_count = succ.size();
while (current_node < node_count) {
if (succ[current_node].size() > 1) { // complex case, segmentation ambiguity
// flush what was gathered so far so that output order is preserved
if (!unambiguous.empty()) {
flush_convert(unambiguous, sink);
unambiguous.clear();
}
int merge_node = -1;
std::vector< std::vector< Corpus2::Token* > > paths;
// follow all paths to the merge point
BOOST_FOREACH(int tse, succ[current_node]) {
paths.push_back(std::vector<Corpus2::Token*>());
paths.back().push_back(pmorf[tse].token);
int v = pmorf[tse].node_to;
// a node with exactly one incoming edge is still inside this path;
// the first node with a different in-degree ends the walk
while (prec[v].size() == 1) {
if (succ[v].size() != 1) {
throw Morfeusz2Error("path splits twice",
t.orth_utf8(), pmorf);
}
tse = *succ[v].begin();
paths.back().push_back(pmorf[tse].token);
v = pmorf[tse].node_to;
}
//assume this is the merge node, check for consistency
if (merge_node != -1 && merge_node != v) {
throw Morfeusz2Error("path merge node ambiguity",
t.orth_utf8(), pmorf);
}
merge_node = v;
}
flush_convert(paths, sink);
// continue the walk from the node where all alternatives met
current_node = merge_node;
} else if (!succ[current_node].empty()) { //simple case, only one interp
int edge = *succ[current_node].begin();
unambiguous.push_back(pmorf[edge].token);
if (pmorf[edge].node_to != current_node + 1)
throw Morfeusz2Error("simple path has non-consecutive nodes",
t.orth_utf8(), pmorf);
++current_node;
} else { //only the last node should have no successors
if (current_node != node_count - 1)
throw Morfeusz2Error("node without successors is not the last node",
t.orth_utf8(), pmorf);
++current_node;
}
}
// convert whatever is left after the last node
if (!unambiguous.empty()) {
flush_convert(unambiguous, sink);
}
return true;
}
/**
 * Turn the flat edge list into per-node adjacency lists and create tokens.
 *
 * For every distinct (node_from, node_to) pair a token is created on the
 * first edge seen and that edge's index is recorded in both lists. Later
 * edges between the same two nodes only add their interpretation to the
 * token created earlier.
 *
 * @param t     source token, forwarded to make_token
 * @param pmorf edges to process; the token field of each recorded edge is set
 * @return pair (outgoing edge indices per node, incoming edge indices per node)
 */
Morfeusz2Analyser::adjacency_lists
Morfeusz2Analyser::build_adjacency_lists(const Toki::Token &t,
		std::vector<details::Morfeusz2Edge>& pmorf)
{
	// Nodes are numbered from 0, so the count is the highest end node + 1.
	int highest_node = 0;
	for (unsigned int k = 0; k < pmorf.size(); ++k) {
		if (pmorf[k].node_to > highest_node) {
			highest_node = pmorf[k].node_to;
		}
	}
	const int total_nodes = highest_node + 1;

	adj_list outgoing(total_nodes), incoming(total_nodes);
	for (unsigned int idx = 0; idx < pmorf.size(); ++idx) {
		details::Morfeusz2Edge& current = pmorf[idx];

		// Look for an already recorded edge joining the same two nodes.
		const std::vector<int>& recorded = outgoing[current.node_from];
		int twin = -1;
		for (std::vector<int>::const_iterator it = recorded.begin();
				it != recorded.end(); ++it) {
			if (pmorf[*it].node_to == current.node_to) {
				twin = *it;
			}
		}

		if (twin < 0) {
			current.token = make_token(t, current);
			outgoing[current.node_from].push_back(idx);
			incoming[current.node_to].push_back(idx);
		} else {
			// duplicate edge -- simple lemma ambiguity
			morfeusz_into_token(pmorf[twin].token, current);
		}
	}
	return adjacency_lists(outgoing, incoming);
}
/// Pass a sequence of tokens to the converter's convert_simple, which
/// delivers its output to sink.
void Morfeusz2Analyser::flush_convert(std::vector<Corpus2::Token*>& vec,
boost::function<void(Corpus2::Token *)> sink)
{
conv_->convert_simple(vec, sink);
}
/// Pass alternative token paths (one inner vector per path) to the
/// converter's convert_ambiguous, forwarding the warn_on_fold_failure_ setting.
void Morfeusz2Analyser::flush_convert(std::vector< std::vector<Corpus2::Token*> >& vec,
boost::function<void(Corpus2::Token *)> sink)
{
conv_->convert_ambiguous(vec, sink, warn_on_fold_failure_);
}
/**
 * Allocate a new token for a Morfeusz edge.
 *
 * An edge starting at node 0 takes over the whitespace that preceded the
 * source token; every other edge gets no preceding whitespace. Orth and
 * lexemes are then filled in by morfeusz_into_token.
 *
 * The token is deleted again if filling it throws, so no allocation is lost
 * on failure; the exception is rethrown unchanged.
 *
 * @param t source token supplying the preceding whitespace
 * @param m edge describing the segment
 * @return newly allocated token
 */
Corpus2::Token* Morfeusz2Analyser::make_token(const Toki::Token& t,
		const details::Morfeusz2Edge& m) const
{
	Corpus2::Token* tt = new Corpus2::Token();
	try {
		if (m.node_from == 0) {
			tt->set_wa(t.preceeding_whitespace());
		} else {
			tt->set_wa(PwrNlp::Whitespace::None);
		}
		morfeusz_into_token(tt, m);
	} catch (...) {
		delete tt;
		throw;
	}
	return tt;
}
/**
 * Set the token's orth from the edge and add the edge's interpretation.
 *
 * When the edge has no tag string, a single lexeme is added with the orth as
 * lemma and the configured ign tag (optionally reported on stderr).
 * Otherwise the lemma and tag string are handed to the converter's input
 * tagset to create the lexemes.
 */
void Morfeusz2Analyser::morfeusz_into_token(Corpus2::Token* tt, const details::Morfeusz2Edge& m) const
{
	tt->set_orth(m.orth);

	if (m.tag_string.empty()) {
		// No tag from Morfeusz: fall back to the ign tag.
		Corpus2::Lexeme fallback(m.orth, ign_tag_);
		tt->add_lexeme(fallback);
		if (warn_on_ign_) {
			std::cerr << "Morfeusz: tagging as ign: "
				<< fallback.lemma_utf8() << "\n";
		}
		return;
	}

	conv_->tagset_from().lexemes_into_token(*tt, m.lemma, m.tag_string);
}
/**
 * Create an error carrying the message, the offending input and the edges.
 *
 * The base MacaError is given the message prefixed with "Morfeusz2 error: ";
 * the three arguments are copied into the members of the same names.
 */
Morfeusz2Error::Morfeusz2Error(const std::string& error,
const std::string input,
const std::vector<details::Morfeusz2Edge>& interp)
: MacaError("Morfeusz2 error: " + error), error(error), input(input)
, interp(interp)
{
}
/// Destructor; declared non-throwing and has nothing to release explicitly.
Morfeusz2Error::~Morfeusz2Error() throw()
{
}
/**
 * Human-readable description of the error.
 *
 * @return what(), followed by " for input '<input>'" when an input string
 *         was recorded
 */
std::string Morfeusz2Error::info() const
{
	std::string message(what());
	if (!input.empty()) {
		message += " for input '";
		message += input;
		message += "'";
	}
	return message;
}
namespace details {
/**
 * Copy the data of one Morfeusz interpretation into a self-contained edge.
 *
 * Start/end node numbers are taken as-is, orth and lemma are converted from
 * UTF-8 into UnicodeString, and the tag id is resolved into its string form
 * through the instance's id resolver. The token pointer starts as NULL.
 *
 * @param interp interpretation returned by Morfeusz
 * @param morf   instance that produced interp, used only to resolve the tag id
 */
Morfeusz2Edge::Morfeusz2Edge(const morfeusz::MorphInterpretation interp,
const morfeusz::Morfeusz * morf)
: node_from(interp.startNode), node_to(interp.endNode)
, orth(UnicodeString::fromUTF8(interp.orth))
, lemma(UnicodeString::fromUTF8(interp.lemma))
, tag_string(morf->getIdResolver().getTag(interp.tagId)), token(NULL)
{
}
}
}
/*
Copyright (C) 2014 Radosław Warzocha, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#ifndef LIBMACA_MORFEUSZ2ANALYSER_H
#define LIBMACA_MORFEUSZ2ANALYSER_H
#include <utility>
#include <libcorpus2/tagset.h>
#include <libcorpus2/token.h>
#include <morfeusz2.h>
#include <libmaca/conv/tagsetconverter.h>
#include "morphanalyser.h"
namespace Maca {
namespace details {
struct Morfeusz2Edge;
}
/**
 * Morphological analyser backed by the Morfeusz2 library.
 *
 * Morfeusz output is turned into Corpus2 tokens and passed through a tagset
 * converter, which the analyser stores as a raw pointer and deletes in its
 * destructor. Because of that ownership the class is not copyable; use
 * clone() to obtain an independent copy.
 */
class Morfeusz2Analyser : public MorphAnalyser
{
public:
	/**
	 * Constructor for a Morfeusz analyser with a given tagset and converter.
	 * The tagset should be the output tagset of the converter.
	 */
	Morfeusz2Analyser(const Corpus2::Tagset* tagset,
		Conversion::TagsetConverter* conv);

	/**
	 * Config node constructor. Recognized keys are:
	 * - converter - the converter to load (from standard paths)
	 * - ign_tag - the tag to use when Morfeusz returns no analysis,
	 *             defaults to "ign"
	 * - warn_on_ign - warn when using the ign tag, false by default
	 * - warn_on_fold_failure - issue a warning when folding ambiguous paths
	 *                          is unsuccessful after conversion (off by def.)
	 */
	Morfeusz2Analyser(const Config::Node& cfg);

	/// Cloning
	Morfeusz2Analyser* clone() const;

	/// Destructor
	~Morfeusz2Analyser();

	/// MorphAnalyser override
	bool process_functional(const Toki::Token &t,
		boost::function<void(Corpus2::Token *)> sink);

	/// Class identifier
	static const char* identifier;

	/// Registered flag
	static bool registered;

private:
	/// Copying is disabled: a member-wise copy would share conv_ and the
	/// converter would be deleted twice. Declared but deliberately not
	/// defined; use clone() instead.
	Morfeusz2Analyser(const Morfeusz2Analyser&);
	Morfeusz2Analyser& operator=(const Morfeusz2Analyser&);

	/// charset set on the Morfeusz instance before analysis
	static const morfeusz::Charset charset;

	/// handle an analysis consisting of more than one edge
	bool process_complex_analysis(const Toki::Token &t,
		std::vector<details::Morfeusz2Edge>& pmorf,
		boost::function<void(Corpus2::Token *)>sink);

	/// per-node lists of edge indices
	typedef std::vector< std::vector<int> > adj_list;
	/// (outgoing, incoming) adjacency lists
	typedef std::pair<adj_list, adj_list> adjacency_lists;

	/// build both adjacency lists and create tokens for the edges
	adjacency_lists build_adjacency_lists(const Toki::Token &t,
		std::vector<details::Morfeusz2Edge>& pmorf);

	/// convert gathered tokens and pass them to the sink
	void flush_convert(std::vector<Corpus2::Token*>& vec,
		boost::function<void(Corpus2::Token *)> sink);

	/// convert gathered tokens (ambiguously segmented), try folding and
	/// pass the resulting tokens to the sink
	void flush_convert(std::vector< std::vector<Corpus2::Token*> >& vec,
		boost::function<void(Corpus2::Token *)> sink);

	/// helper to create a token from a Morfeusz interpretation struct
	Corpus2::Token* make_token(const Toki::Token& t,
		const details::Morfeusz2Edge& m) const;

	/// helper to add lexemes from a Morfeusz interp struct into a token
	void morfeusz_into_token(Corpus2::Token* tt,
		const details::Morfeusz2Edge& m) const;

	/// the tagset converter (deleted in the destructor)
	Conversion::TagsetConverter* conv_;

	/// tag used for edges that come without a tag string
	Corpus2::Tag ign_tag_;

	/// whether to report on stderr each time the ign tag is used
	bool warn_on_ign_;

	/// forwarded to the converter when converting ambiguous paths
	bool warn_on_fold_failure_;
};
/**
 * Exception class for signalling Morfeusz-related analysis errors.
 *
 * Besides the message it keeps the analysed input and a copy of the edge
 * list that was being processed, so that handlers can inspect them.
 *
 * NOTE(review): details::Morfeusz2Edge is only forward-declared at this
 * point and defined further down in this header; a std::vector member of an
 * incomplete element type is not guaranteed to be valid before C++17 --
 * consider moving the struct definition above this class.
 */
class Morfeusz2Error : public MacaError
{
public:
/// Constructor
Morfeusz2Error(const std::string& error, const std::string input,
const std::vector<details::Morfeusz2Edge>& interp);
/// Destructor
~Morfeusz2Error() throw();
/// Info accessor: the message plus the input, when one was recorded
std::string info() const;
/// The error info and Morfeusz input during the error, if available
std::string error, input;
/// The structure returned by Morfeusz during the error, if available
std::vector<details::Morfeusz2Edge> interp;
};
namespace details {
/// Helper struct for holding preprocessed Morfeusz results: one edge of the
/// analysis graph, with all data copied out of the Morfeusz interpretation.
struct Morfeusz2Edge
{
/// Build an edge from an interpretation; morf resolves the tag id to a string
explicit Morfeusz2Edge(const morfeusz::MorphInterpretation interp,
const morfeusz::Morfeusz * morf);
/// start and end node of the edge in the analysis graph
int node_from, node_to;
/// surface form of the segment
UnicodeString orth;
/// lemma of this interpretation
UnicodeString lemma;
/// tag in string form; empty when the interpretation carries no tag
std::string tag_string;
/// token created for this edge, NULL until one is assigned
/// (presumably not owned by the edge -- tokens are passed on to the converter)
Corpus2::Token* token;
};
} /* end ns details */
} /* end ns Maca */
#endif // LIBMACA_MORFEUSZ2ANALYSER_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment