Skip to content
Snippets Groups Projects
Commit e9277e51 authored by ilor's avatar ilor
Browse files

add mark and unmark operators, tweak wcclrules to work with annotated data

parent 7df6865c
Branches
No related merge requests found
......@@ -67,9 +67,11 @@ SET(libwccl_STAT_SRC
ops/rulesequence.cpp
ops/tagaction.cpp
ops/tagactions/delete.cpp
ops/tagactions/mark.cpp
ops/tagactions/relabel.cpp
ops/tagactions/select.cpp
ops/tagactions/unify.cpp
ops/tagactions/unmark.cpp
ops/tagrule.cpp
parser/grammar.g
parser/Parser.cpp
......
#include <libwccl/ops/tagactions/mark.h>
#include <libpwrutils/foreach.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <sstream>
namespace Wccl {
/**
 * Marks a new annotation segment on the channel named chan_name_.
 *
 * The range is given by pos_begin_ and pos_end_; the head token is given by
 * pos_head_ when present, otherwise it is the first token of the range.
 * The channel is created when the sentence does not have it yet.
 *
 * @returns Bool(false), leaving existing segments untouched, when either
 *          range end or the head evaluates to Nowhere, when the sentence
 *          context rejects the range, when the head lies outside the range,
 *          or when any token of the range already has a segment in the
 *          channel. Bool(true) once the segment has been written.
 * @throws WcclError when the sentence is not a Corpus2::AnnotatedSentence.
 */
Bool Mark::execute(const ActionExecContext& context) const
{
    SentenceContext& sentence_ctx = context.sentence_context();

    // Resolve the range ends one at a time, bailing out on the first Nowhere.
    const boost::shared_ptr<const Position> first = pos_begin_->apply(context);
    if (first->get_value() == Position::Nowhere) {
        return Bool(false);
    }
    const boost::shared_ptr<const Position> last = pos_end_->apply(context);
    if (last->get_value() == Position::Nowhere) {
        return Bool(false);
    }

    // validate_range fills in the absolute indices of both range ends.
    int first_idx, last_idx;
    if (!sentence_ctx.validate_range(*first, *last, first_idx, last_idx)) {
        return Bool(false);
    }

    // Without a head operator the first token of the range becomes the head.
    int head_idx = first_idx;
    if (pos_head_) {
        const boost::shared_ptr<const Position> head = pos_head_->apply(context);
        if (head->get_value() == Position::Nowhere) {
            return Bool(false);
        }
        head_idx = sentence_ctx.get_abs_position(*head);
        if (!(first_idx <= head_idx && head_idx <= last_idx)) {
            return Bool(false);
        }
    }

    const boost::shared_ptr<Corpus2::AnnotatedSentence> annotated =
        boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(
            sentence_ctx.get_sentence_ptr());
    if (!annotated) {
        throw WcclError("Operator needs an annotated sentence");
    }
    if (!annotated->has_channel(chan_name_)) {
        annotated->create_channel(chan_name_);
    }
    Corpus2::AnnotationChannel& chan = annotated->get_channel(chan_name_);
    const int new_segment = chan.get_new_segment_index();

    // First pass: refuse to touch anything if the range overlaps an
    // annotation that is already present in the channel.
    int idx = first_idx;
    while (idx <= last_idx) {
        if (chan.get_segment_at(idx) > 0) {
            return Bool(false);
        }
        ++idx;
    }

    // Second pass: write the segment index and clear head flags, then flag
    // the single head token.
    for (idx = first_idx; idx <= last_idx; ++idx) {
        chan.set_segment_at(idx, new_segment);
        chan.set_head_at(idx, false);
    }
    chan.set_head_at(head_idx, true);
    return Bool(true);
}
/**
 * Builds the textual form of the action:
 * name(begin, end[, head], "channel").
 * The head argument is printed only when a head operator was supplied.
 */
std::string Mark::to_string(const Corpus2::Tagset& tagset) const
{
    std::ostringstream out;
    out << name() << "(";
    out << pos_begin_->to_string(tagset);
    out << ", ";
    out << pos_end_->to_string(tagset);
    if (pos_head_) {
        out << ", ";
        out << pos_head_->to_string(tagset);
    }
    out << ", \"" << chan_name_ << "\")";
    return out.str();
}
/**
 * Streams the action as name(begin, end[, head], "channel"), printing the
 * position operators through their own stream operators.
 * @returns The stream that was written to.
 */
std::ostream& Mark::write_to(std::ostream& os) const
{
    os << name() << "(";
    os << *pos_begin_;
    os << ", ";
    os << *pos_end_;
    if (pos_head_) {
        os << ", ";
        os << *pos_head_;
    }
    os << ", \"";
    os << chan_name_;
    os << "\")";
    return os;
}
} /* end ns Wccl */
#ifndef LIBWCCL_OPS_TAGACTIONS_MARK_H
#define LIBWCCL_OPS_TAGACTIONS_MARK_H
#include <libwccl/ops/tagaction.h>
#include <libwccl/values/position.h>
#include <libwccl/values/bool.h>
#include <libwccl/ops/function.h>
namespace Wccl {
/**
 * Tag action that marks an annotation fragment (a segment with one head
 * token) on a named annotation channel.
 */
class Mark : public TagAction
{
public:
    typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;

    /**
     * Creates a mark action with an explicit head position operator.
     * @param pos_begin operator yielding the first position of the range
     * @param pos_end operator yielding the last position of the range
     * @param pos_head operator yielding the head position; may be empty,
     *        in which case the action behaves like the three-argument form
     * @param chan_name name of the annotation channel, must not be empty
     */
    Mark(
        const PosFunctionPtr& pos_begin,
        const PosFunctionPtr& pos_end,
        const PosFunctionPtr& pos_head,
        const std::string& chan_name)
        : pos_begin_(pos_begin),
          pos_end_(pos_end),
          pos_head_(pos_head),
          chan_name_(chan_name)
    {
        BOOST_ASSERT(pos_begin_);
        BOOST_ASSERT(pos_end_);
        BOOST_ASSERT(!chan_name_.empty());
    }

    /**
     * Creates a mark action without a head position operator.
     * @param pos_begin operator yielding the first position of the range
     * @param pos_end operator yielding the last position of the range
     * @param chan_name name of the annotation channel, must not be empty
     */
    Mark(
        const PosFunctionPtr& pos_begin,
        const PosFunctionPtr& pos_end,
        const std::string& chan_name)
        : pos_begin_(pos_begin),
          pos_end_(pos_end),
          pos_head_(),
          chan_name_(chan_name)
    {
        BOOST_ASSERT(pos_begin_);
        BOOST_ASSERT(pos_end_);
        BOOST_ASSERT(!chan_name_.empty());
    }

    /**
     * @returns Name of the action, "mark".
     */
    std::string name() const {
        return "mark";
    }

    /**
     * @returns String representation of the Action
     */
    std::string to_string(const Corpus2::Tagset& tagset) const;

    /**
     * Executes the Action on the given context: marks an annotation within
     * the given channel on the range supplied.
     * Nothing is changed and False is returned when the range or the head
     * is invalid, or when any token of the range already carries an
     * annotation in the channel.
     * @returns True if there were any changes made; False otherwise
     */
    Bool execute(const ActionExecContext &context) const;

protected:
    /**
     * Writes string representation of the Action to
     * an output stream.
     * @returns Stream written to.
     * @note May be incomplete and/or contain internal info.
     */
    std::ostream& write_to(std::ostream& ostream) const;

private:
    /// Operator producing the first position of the range.
    const PosFunctionPtr pos_begin_;
    /// Operator producing the last position of the range.
    const PosFunctionPtr pos_end_;
    /// Optional operator producing the head position; may be empty.
    const PosFunctionPtr pos_head_;
    /// Name of the annotation channel to mark on.
    const std::string chan_name_;
};
} /* end ns Wccl */
#endif // LIBWCCL_OPS_TAGACTIONS_MARK_H
#include <libwccl/ops/tagactions/unmark.h>
#include <libpwrutils/foreach.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <sstream>
namespace Wccl {
/**
 * Removes the whole annotation segment that passes through the token at
 * the position produced by pos_, on the channel named chan_name_.
 *
 * @returns Bool(false) when the position is Nowhere or outside the
 *          sentence, when the channel does not exist, or when the token is
 *          not part of any segment. Bool(true) once the segment is removed.
 * @throws WcclError when the sentence is not a Corpus2::AnnotatedSentence.
 */
Bool Unmark::execute(const ActionExecContext& context) const
{
    SentenceContext& sc = context.sentence_context();

    const boost::shared_ptr<const Position>& position = pos_->apply(context);
    if (position->get_value() == Position::Nowhere) {
        return Bool(false);
    }
    int abs_pos = sc.get_abs_position(*position);
    if (!sc.is_inside(abs_pos)) {
        return Bool(false);
    }

    boost::shared_ptr<Corpus2::AnnotatedSentence> as;
    as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sc.get_sentence_ptr());
    if (!as) {
        throw WcclError("Operator needs an annotated sentence");
    }
    // Unlike Mark, a missing channel is never created: there is nothing
    // to remove from it.
    if (!as->has_channel(chan_name_)) {
        return Bool(false);
    }

    Corpus2::AnnotationChannel& channel = as->get_channel(chan_name_);
    int segment_idx = channel.get_segment_at(abs_pos);
    if (segment_idx == 0) {
        return Bool(false);
    }

    // Wipe every token of the segment, not just the one pointed at.
    // The head flag is cleared too, so that no token is left flagged as a
    // head while belonging to no segment.
    const int channel_size = static_cast<int>(channel.size());
    for (int i = 0; i < channel_size; ++i) {
        if (channel.get_segment_at(i) == segment_idx) {
            channel.set_segment_at(i, 0);
            channel.set_head_at(i, false);
        }
    }
    return Bool(true);
}
/**
 * Builds the textual form of the action: name(position, "channel").
 */
std::string Unmark::to_string(const Corpus2::Tagset& tagset) const
{
    std::ostringstream out;
    out << name() << "(";
    out << pos_->to_string(tagset);
    out << ", \"";
    out << chan_name_;
    out << "\")";
    return out.str();
}
/**
 * Streams the action as name(position, "channel"), printing the position
 * operator through its own stream operator.
 * @returns The stream that was written to.
 */
std::ostream& Unmark::write_to(std::ostream& os) const
{
    os << name() << "(";
    os << *pos_;
    os << ", \"";
    os << chan_name_;
    os << "\")";
    return os;
}
} /* end ns Wccl */
#ifndef LIBWCCL_OPS_TAGACTIONS_UNMARK_H
#define LIBWCCL_OPS_TAGACTIONS_UNMARK_H
#include <libwccl/ops/tagaction.h>
#include <libwccl/values/position.h>
#include <libwccl/values/bool.h>
#include <libwccl/ops/function.h>
namespace Wccl {
/**
 * Action to unmark (delete) an annotation passing through a token.
 */
class Unmark : public TagAction
{
public:
    typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;

    /**
     * @param pos operator yielding the position of a token that belongs to
     *        the annotation to remove; must not be empty
     * @param chan_name name of the annotation channel, must not be empty
     */
    Unmark(
        const PosFunctionPtr& pos,
        const std::string& chan_name)
        : pos_(pos),
          chan_name_(chan_name)
    {
        BOOST_ASSERT(pos_);
        BOOST_ASSERT(!chan_name.empty());
    }

    /**
     * @returns Name of the function.
     */
    std::string name() const {
        return "unmark";
    }

    /**
     * @returns String representation of the Action
     */
    std::string to_string(const Corpus2::Tagset& tagset) const;

    /**
     * Executes the Action on given context: removes, from the given
     * channel, the whole annotation that passes through the token at the
     * supplied position.
     * No action is made when the position is Nowhere or outside the
     * sentence, when the channel does not exist, or when there is no
     * annotation at that token.
     * Declared public, like Mark::execute, so the action can be run
     * directly on an Unmark object.
     * @returns True if there were any changes made; False otherwise
     */
    Bool execute(const ActionExecContext &context) const;

protected:
    /**
     * Writes string representation of the Action to
     * an output stream.
     * @returns Stream written to.
     * @note May be incomplete and/or contain internal info.
     */
    std::ostream& write_to(std::ostream& ostream) const;

private:
    /// Operator producing the position of the token to unmark at.
    const PosFunctionPtr pos_;
    /// Name of the annotation channel to unmark on.
    const std::string chan_name_;
};
} /* end ns Wccl */
#endif // LIBWCCL_OPS_TAGACTIONS_UNMARK_H
......@@ -66,6 +66,8 @@ header {
#include <libwccl/ops/tagactions/delete.h>
#include <libwccl/ops/tagactions/select.h>
#include <libwccl/ops/tagactions/relabel.h>
#include <libwccl/ops/tagactions/mark.h>
#include <libwccl/ops/tagactions/unmark.h>
// Match operators
#include <libwccl/values/tokenmatch.h>
......@@ -1485,6 +1487,8 @@ action
//
| act = action_unify [tagset, vars]
//
| act = action_mark [tagset, vars]
| act = action_unmark [tagset, vars]
;
// Action sequence - the actions are separated with commas:
......@@ -1697,6 +1701,40 @@ action_unify
}
;
// ----------------------------------------------------------------------------
// Mark action
// Syntax: mark(begin_position, end_position, [head_position,] "channel")
// Builds a Mark action from two mandatory position operators, an optional
// head position operator and the channel name given as a STRING token.
action_mark
[const Corpus2::Tagset& tagset, Variables& vars]
returns [boost::shared_ptr<Mark> action]
{
boost::shared_ptr<Function<Position> > pos_begin, pos_end, pos_head;
}
: "mark" LPAREN
pos_begin = position_operator [tagset, vars] COMMA
pos_end = position_operator [tagset, vars] COMMA
(pos_head = position_operator [tagset, vars] COMMA)?
chan_name: STRING
RPAREN {
// When the optional head was not matched, pos_head is still the empty
// pointer it was declared as, and is passed to Mark in that state.
action.reset(new Mark(pos_begin, pos_end, pos_head, ((antlr::Token*)chan_name)->getText()));
}
;
// ----------------------------------------------------------------------------
// Unmark action
// Syntax: unmark(position, "channel")
// Builds an Unmark action from one position operator and the channel name
// given as a STRING token.
action_unmark
[const Corpus2::Tagset& tagset, Variables& vars]
returns [boost::shared_ptr<Unmark> action]
{
boost::shared_ptr<Function<Position> > pos;
}
: "unmark" LPAREN
pos = position_operator [tagset, vars] COMMA
chan_name: STRING
RPAREN {
// The text of the STRING token is used as the channel name.
action.reset(new Unmark(pos, ((antlr::Token*)chan_name)->getText()));
}
;
// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Match rules
......
......@@ -27,6 +27,7 @@ add_executable(tests
getorth.cpp
logicalpredicates.cpp
main.cpp
mark.cpp
match.cpp
position.cpp
positionpredicates.cpp
......
#include <boost/test/unit_test.hpp>
#include <boost/bind.hpp>
#include <libcorpus2/ann/annotatedsentence.h>
#include <libwccl/ops/functions/constant.h>
#include <libwccl/ops/tagactions/mark.h>
using namespace Wccl;
BOOST_AUTO_TEST_SUITE(mark)
/**
 * Fixture shared by the mark tests: a five-token annotated sentence
 * (orths "t1".."t5") with a channel "ch1" that already holds segment 1 on
 * tokens 2 and 3 (head at token 3), plus constant position operators for
 * the positions used by the tests.
 */
struct MarkFix
{
    MarkFix()
        : as(boost::make_shared<Corpus2::AnnotatedSentence>()),
          sc(as),
          cx(sc, boost::make_shared<Variables>()),
          pos_zero(0),
          pos_one(1),
          pos_minus_one(-1),
          nowhere(Position::Nowhere),
          begin(Position::Begin),
          end(Position::End),
          pos_zero_constant(new Constant<Position>(pos_zero)),
          pos_one_constant(new Constant<Position>(pos_one)),
          pos_minus_one_constant(new Constant<Position>(pos_minus_one)),
          nowhere_constant(new Constant<Position>(nowhere)),
          begin_constant(new Constant<Position>(begin)),
          end_constant(new Constant<Position>(end))
    {
        // Five tokens, each preceded by a newline.
        static const char* const orths[] = { "t1", "t2", "t3", "t4", "t5" };
        const unsigned orth_count = sizeof(orths) / sizeof(orths[0]);
        for (unsigned k = 0; k < orth_count; ++k) {
            as->append(new Corpus2::Token(
                UnicodeString::fromUTF8(orths[k]), PwrNlp::Whitespace::Newline));
        }

        // Pre-existing annotation: segment 1 over tokens 2..3, head at 3.
        as->create_channel("ch1");
        Corpus2::AnnotationChannel& ch1 = as->get_channel("ch1");
        ch1.set_segment_at(2, 1);
        ch1.set_segment_at(3, 1);
        ch1.set_head_at(3, true);
    }

    // Sentence under test and the contexts wrapping it.
    boost::shared_ptr<Corpus2::AnnotatedSentence> as;
    SentenceContext sc;
    Corpus2::Tagset tagset;
    ActionExecContext cx;

    // Position values backing the constant operators below.
    Position pos_zero;
    Position pos_one;
    Position pos_minus_one;
    Position nowhere;
    Position begin;
    Position end;

    // Constant position operators handed to the Mark constructors.
    boost::shared_ptr<Function<Position> > pos_zero_constant;
    boost::shared_ptr<Function<Position> > pos_one_constant;
    boost::shared_ptr<Function<Position> > pos_minus_one_constant;
    boost::shared_ptr<Function<Position> > nowhere_constant;
    boost::shared_ptr<Function<Position> > begin_constant;
    boost::shared_ptr<Function<Position> > end_constant;
};
// Marking from -1 to -1 at the fixture's initial position must fail and
// leave channel "ch1" exactly as the fixture set it up.
BOOST_FIXTURE_TEST_CASE(mark_empty, MarkFix)
{
    // Snapshot kept for the whole-sentence comparison that is disabled below.
    boost::shared_ptr<Corpus2::AnnotatedSentence> as_clone =
        boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(as->clone_shared());
    Mark action(pos_minus_one_constant, pos_minus_one_constant, "ch1");
    BOOST_CHECK(!action.execute(cx));
    //BOOST_CHECK((*as) == (*as_clone));
    BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "__aA_");
}
// A range that reaches into the annotation already present on "ch1" must be
// rejected and must leave the channel unchanged.
BOOST_FIXTURE_TEST_CASE(mark_already_there, MarkFix)
{
    sc.set_position(1);
    // From position 1, "begin .. +1" covers tokens 0..2; token 2 is annotated.
    Mark mark(begin_constant, pos_one_constant, "ch1");
    BOOST_CHECK(!mark.execute(cx));
    BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "__aA_");
}
// Marking a single free token at the sentence start creates a new segment
// whose only token is also its head.
BOOST_FIXTURE_TEST_CASE(mark_begin, MarkFix)
{
    sc.set_position(1);
    // From position 1, "begin .. -1" covers token 0 only.
    Mark mark(begin_constant, pos_minus_one_constant, "ch1");
    BOOST_CHECK(mark.execute(cx));
    BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "B_aA_");
}
// Marking tokens 0..1 with an explicit head at the current position (0)
// puts the head on the first token of the new segment.
BOOST_FIXTURE_TEST_CASE(mark_begin_head0, MarkFix)
{
    sc.set_position(0);
    Mark mark(begin_constant, pos_one_constant, pos_zero_constant, "ch1");
    BOOST_CHECK(mark.execute(cx));
    BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "BbaA_");
}
// Marking tokens 0..1 with an explicit head one token to the right of the
// current position (0) puts the head on the second token of the segment.
BOOST_FIXTURE_TEST_CASE(mark_begin_head1, MarkFix)
{
    sc.set_position(0);
    Mark mark(begin_constant, pos_one_constant, pos_one_constant, "ch1");
    BOOST_CHECK(mark.execute(cx));
    BOOST_CHECK_EQUAL(as->get_channel("ch1").dump_alpha(), "bBaA_");
}
// Marking on a channel the sentence does not have yet ("ch2") succeeds;
// the new segment spans tokens 0..2 with its head at the sentence start.
BOOST_FIXTURE_TEST_CASE(mark_other, MarkFix)
{
    sc.set_position(1);
    Mark action(begin_constant, pos_one_constant, begin_constant, "ch2");
    const bool changed = action.execute(cx);
    BOOST_CHECK(changed);
    BOOST_CHECK_EQUAL(as->get_channel("ch2").dump_alpha(), "Aaa__");
}
//------ to_string test cases -------
// to_string of the head-less form prints: mark(begin, end, "channel").
BOOST_FIXTURE_TEST_CASE(mark_to_string, MarkFix)
{
    Mark whole_sentence(begin_constant, end_constant, "ch");
    BOOST_CHECK_EQUAL("mark(begin, end, \"ch\")", whole_sentence.to_string(tagset));

    Mark up_to_next(begin_constant, pos_one_constant, "ch2");
    BOOST_CHECK_EQUAL("mark(begin, 1, \"ch2\")", up_to_next.to_string(tagset));

    Mark from_previous(pos_minus_one_constant, end_constant, "ch3");
    BOOST_CHECK_EQUAL("mark(-1, end, \"ch3\")", from_previous.to_string(tagset));
}
// to_string of the form with a head prints:
// mark(begin, end, head, "channel").
BOOST_FIXTURE_TEST_CASE(mark_to_string_head, MarkFix)
{
    Mark head_at_one(begin_constant, end_constant, pos_one_constant, "ch");
    BOOST_CHECK_EQUAL("mark(begin, end, 1, \"ch\")", head_at_one.to_string(tagset));

    Mark head_at_zero(begin_constant, pos_one_constant, pos_zero_constant, "ch2");
    BOOST_CHECK_EQUAL("mark(begin, 1, 0, \"ch2\")", head_at_zero.to_string(tagset));

    Mark head_at_end(pos_minus_one_constant, end_constant, end_constant, "ch3");
    BOOST_CHECK_EQUAL("mark(-1, end, end, \"ch3\")", head_at_end.to_string(tagset));
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -72,25 +72,28 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru
return false;
}
void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules,
std::istream& is, const options& opts)
void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
boost::shared_ptr<Corpus2::TokenWriter> writer, Wccl::RuleSequence& rules,
const options& opts)
{
Corpus2::XcesReader reader(tagset, is);
Corpus2::TokenTimer& timer = Corpus2::global_timer();
while (boost::shared_ptr<Corpus2::Chunk> c = reader.get_next_chunk()) {
while (boost::shared_ptr<Corpus2::Chunk> c = reader->get_next_chunk()) {
foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) {
boost::shared_ptr<Corpus2::AnnotatedSentence> as;
as = Corpus2::AnnotatedSentence::wrap_sentence(s);
if (opts.until_done) {
rules.execute_until_done(s, opts.until_done_iterations);
rules.execute_until_done(as, opts.until_done_iterations);
} else {
rules.execute_once(s);
rules.execute_once(as);
}
timer.count_sentence(*s);
timer.count_sentence(*as);
if (progress) {
timer.check_slice();
}
if (opts.first) break;
writer->write_sentence(*as);
}
writer->write_chunk(*c);
//writer->write_chunk(*c);
if (opts.first) break;
}
if (progress) {
......@@ -102,6 +105,7 @@ void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Ta
int main(int argc, char** argv)
{
std::string tagset_load = "kipi";
std::string input_format;
std::string output_format;
options opts;
opts.first = false;
......@@ -111,8 +115,10 @@ int main(int argc, char** argv)
bool corpus_stdin = true;
using boost::program_options::value;
std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
std::string readers_help = "Input format, any of: " + readers + "\n";
std::string writers = boost::algorithm::join(Corpus2::TokenWriter::available_writer_types_help(), " ");
std::string writers_help = "Output format, any of: " + writers + "\n";
std::string writers_help = "Output format, any of: " + writers + "\n";;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
......@@ -124,6 +130,8 @@ int main(int argc, char** argv)
"CCL rule files\n")
("files,f", value(&files),
"Files to load, looking at the extension to determine type\n")
("input-format,i", value(&input_format)->default_value("xces"),
readers_help.c_str())
("output-format,o", value(&output_format)->default_value("xces"),
writers_help.c_str())
("progress,p", value(&progress)->zero_tokens(),
......@@ -189,16 +197,14 @@ int main(int argc, char** argv)
timer.register_signal_handler();
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset));
boost::shared_ptr<Corpus2::TokenReader> reader;
foreach (const std::string& f, corpora_files) {
std::ifstream ifs(f.c_str());
if (ifs.good()) {
do_stream(writer, tagset, rules, ifs, opts);
} else {
std::cerr << "Error reading corpus from " << f << "\n";
}
reader = Corpus2::TokenReader::create_path_reader(input_format, tagset, f);
apply_rules(reader, writer, rules, opts);
}
if (corpus_stdin) {
do_stream(writer, tagset, rules, std::cin, opts);
reader = Corpus2::TokenReader::create_stream_reader(input_format, tagset, std::cin);
apply_rules(reader, writer, rules, opts);
}
}
} catch (PwrNlp::PwrNlpError& e) {
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment