Skip to content
Snippets Groups Projects
Commit 0fad2421 authored by Adam Wardynski
Browse files

GetSymbolsInRange - range(symbol, p1, p2) operator.

parent 7e2814aa
No related merge requests found
......@@ -41,6 +41,7 @@ SET(libwccl_STAT_SRC
ops/functions/strset/tolower.cpp
ops/functions/strset/toupper.cpp
ops/functions/tset/getsymbols.cpp
ops/functions/tset/getsymbolsinrange.cpp
parser/grammar.g
parser/Parser.cpp
parser/ParserException.cpp
......
#include <libwccl/ops/functions/tset/getsymbolsinrange.h>
#include <libwccl/ops/functions/constant.h>
#include <sstream>
namespace Wccl {
/**
 * Renders the operator as "name(attribute, begin_expr, end_expr)".
 * The attribute name is looked up in the given tagset from the values of
 * the mask; both range expressions are rendered with the same tagset.
 */
std::string GetSymbolsInRange::to_string(const Corpus2::Tagset& tagset) const
{
	std::ostringstream out;
	out << name(tagset) << "(";
	out << tagset.get_attribute_name(mask_.get_values()) << ", ";
	out << rbegin_expr_->to_string(tagset) << ", ";
	out << rend_expr_->to_string(tagset) << ")";
	return out.str();
}
/**
 * Renders the operator as "raw_name(mask_dump, begin_expr, end_expr)"
 * without consulting any tagset: the mask is written as its raw dump and
 * both range expressions use their own raw string forms.
 */
std::string GetSymbolsInRange::to_raw_string() const
{
	std::ostringstream out;
	out << raw_name() << "(";
	out << mask_.raw_dump() << ", ";
	out << rbegin_expr_->to_raw_string() << ", ";
	out << rend_expr_->to_raw_string() << ")";
	return out.str();
}
/**
 * Evaluates both range-boundary expressions, translates them to absolute
 * positions in the current sentence, and collects the tags of every
 * lexeme of every token in the (inclusive) range. The combined tag is
 * then restricted to the mask.
 *
 * The default TSet function result is returned instead when either
 * boundary is Position::Nowhere or when the range, after being clamped
 * to the sentence boundaries, is empty.
 */
GetSymbolsInRange::BaseRetValPtr GetSymbolsInRange::apply_internal(const FunExecContext& context) const
{
	const boost::shared_ptr<const Position> begin_pos = rbegin_expr_->apply(context);
	const boost::shared_ptr<const Position> end_pos = rend_expr_->apply(context);
	const SentenceContext& sc = context.sentence_context();
	int first = sc.get_abs_position(*begin_pos);
	int last = sc.get_abs_position(*end_pos);

	// A boundary that points nowhere leaves no range to inspect.
	if (first == Position::Nowhere || last == Position::Nowhere) {
		return detail::DefaultFunction<TSet>()->apply(context);
	}

	// Clamp the range to the sentence boundaries.
	if (first < 0) {
		first = 0;
	}
	if (last >= sc.size()) {
		last = sc.size() - 1;
	}

	// After clamping, an inverted range means the requested range was
	// empty or had no overlap with the sentence (this also covers an
	// empty sentence, where last becomes -1).
	if (first > last) {
		return detail::DefaultFunction<TSet>()->apply(context);
	}

	boost::shared_ptr<TSet> symbols = boost::make_shared<TSet>();
	for (int idx = first; idx <= last; ++idx) {
		const Corpus2::Token* tok = sc.at(idx);
		foreach (const Corpus2::Lexeme& lex, tok->lexemes()) {
			symbols->combine_with(lex.tag());
		}
	}
	// Keep only the part of the combined tag selected by the mask.
	symbols->tag_ref().mask_with(mask_);
	return symbols;
}
} /* end ns Wccl */
#ifndef LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLSINRANGE_H
#define LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLSINRANGE_H
#include <libwccl/values/tset.h>
#include <libwccl/values/position.h>
#include <libwccl/ops/function.h>
namespace Wccl {
/**
* Operator that gets tagset symbols from tokens in given range.
*/
class GetSymbolsInRange : public Function<TSet>
{
public:
	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;

	/**
	 * @param mask tag mask selecting which symbols are kept in the result
	 * @param range_begin_expr expression giving the beginning of the range
	 * @param range_end_expr expression giving the end of the range
	 * Both expression pointers are asserted to be non-null.
	 */
	GetSymbolsInRange(
		const Corpus2::Tag& mask,
		const PosFunctionPtr& range_begin_expr,
		const PosFunctionPtr& range_end_expr)
		: mask_(mask),
		  rbegin_expr_(range_begin_expr),
		  rend_expr_(range_end_expr)
	{
		BOOST_ASSERT(rbegin_expr_);
		BOOST_ASSERT(rend_expr_);
	}

	/**
	 * @returns String representation of the function in the form of:
	 * "range(tagset_symbol, range_begin_expr, range_end_expr)"
	 */
	std::string to_string(const Corpus2::Tagset& tagset) const;

	/**
	 * @returns String representation of the function in the form of:
	 * "range(raw_tagset_symbol, range_begin_expr_raw, range_end_expr_raw)"
	 * @note This version does not require a tagset, but will be incomplete
	 * and/or contain internal info.
	 */
	std::string to_raw_string() const;

	/**
	 * @returns The operator name, "range"
	 */
	std::string raw_name() const
	{
		return "range";
	}

protected:
	Corpus2::Tag mask_;
	const PosFunctionPtr rbegin_expr_;
	const PosFunctionPtr rend_expr_;

	/**
	 * Evaluates the two argument expressions to obtain the beginning and
	 * the end of the range of interest, then trims that range to the
	 * boundaries of the sentence being processed.
	 *
	 * An empty set is returned when either position points nowhere, when
	 * the range does not overlap with the sentence, or when the supplied
	 * beginning lies after the supplied end.
	 *
	 * Otherwise the range is valid and the result is the sum of tagset
	 * symbols of the words within the range (inclusive), selected by the
	 * mask. Masks are primarily meant to correspond to a single selected
	 * attribute, but any valid mask is accepted (any combination of
	 * attributes and their values, including the part-of-speech part).
	 *
	 * @returns A tagset symbol set for the words within the range, given
	 * the mask, if the range is valid. An empty TSet otherwise.
	 */
	BaseRetValPtr apply_internal(const FunExecContext& context) const;
};
} /* end ns Wccl */
#endif // LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLSINRANGE_H
......@@ -10,6 +10,7 @@ add_executable(tests
context.cpp
getlemmas.cpp
getsymbols.cpp
getsymbolsinrange.cpp
getorth.cpp
logicalpredicates.cpp
main.cpp
......
#include <boost/test/unit_test.hpp>
#include <boost/bind.hpp>
#include <boost/shared_ptr.hpp>
#include <libcorpus2/sentence.h>
#include <libcorpus2/tagsetmanager.h>
#include <libwccl/ops/functions/constant.h>
#include <libwccl/ops/functions/tset/getsymbols.h>
#include <libwccl/ops/functions/tset/getsymbolsinrange.h>
using namespace Wccl;
BOOST_AUTO_TEST_SUITE(get_symbols_in_range_op)
/**
 * Fixture shared by the GetSymbolsInRange tests.
 *
 * Builds a two-token sentence over the "kipi" tagset:
 *  - "One" with lexemes subst:sg:nom:m1 and subst:sg:nom:m2,
 *  - "Two" with lexemes subst:pl:dat:f, prep:nom:wok and adja,
 * together with a sentence context, an execution context, a handful of
 * constant position expressions and the masks for the "gnd", "nmb" and
 * "vcl" symbols. The `pos` tag and the `empty` set are left
 * default-constructed.
 */
struct SymbolsInRangeFix
{
	SymbolsInRangeFix()
		: s(boost::make_shared<Corpus2::Sentence>()),
		  sc(s),
		  tagset(Corpus2::get_named_tagset("kipi")),
		  cx(sc, boost::make_shared<Variables>()),
		  pos_zero(0),
		  pos_one(1),
		  pos_minus_one(-1),
		  nowhere(Position::Nowhere),
		  pos_zero_constant(new Constant<Position>(pos_zero)),
		  pos_one_constant(new Constant<Position>(pos_one)),
		  pos_minus_one_constant(new Constant<Position>(pos_minus_one)),
		  nowhere_constant(new Constant<Position>(nowhere)),
		  pos_plus_2_constant(new Constant<Position>(Position(2))),
		  pos_minus_2_constant(new Constant<Position>(Position(-2)))
	{
		// First token: two masculine readings.
		Corpus2::Token* first_token = new Corpus2::Token(
			"One",
			PwrNlp::Whitespace::ManySpaces);
		Corpus2::Lexeme lex_m1("aaa", tagset.parse_simple_tag("subst:sg:nom:m1", false));
		Corpus2::Lexeme lex_m2("aaa", tagset.parse_simple_tag("subst:sg:nom:m2", false));
		first_token->add_lexeme(lex_m1);
		first_token->add_lexeme(lex_m2);
		s->append(first_token);

		// Second token: only the first of its readings carries a gender value.
		Corpus2::Token* second_token = new Corpus2::Token(
			"Two",
			PwrNlp::Whitespace::ManySpaces);
		Corpus2::Lexeme lex_f("aaa", tagset.parse_simple_tag("subst:pl:dat:f", false));
		Corpus2::Lexeme lex_prep("aaa", tagset.parse_simple_tag("prep:nom:wok", false));
		Corpus2::Lexeme lex_adja("aaa", tagset.parse_simple_tag("adja", false));
		second_token->add_lexeme(lex_f);
		second_token->add_lexeme(lex_prep);
		second_token->add_lexeme(lex_adja);
		s->append(second_token);

		// Masks used by the tests.
		gnd = tagset.parse_symbol("gnd");
		nmb = tagset.parse_symbol("nmb");
		vcl = tagset.parse_symbol("vcl");
	}

	// Sentence under test and the contexts built on top of it.
	boost::shared_ptr<Corpus2::Sentence> s;
	SentenceContext sc;
	const Corpus2::Tagset& tagset;
	FunExecContext cx;

	// Plain position values.
	Position pos_zero;
	Position pos_one;
	Position pos_minus_one;
	Position nowhere;

	// Constant expressions returning fixed positions.
	boost::shared_ptr<Function<Position> > pos_zero_constant;
	boost::shared_ptr<Function<Position> > pos_one_constant;
	boost::shared_ptr<Function<Position> > pos_minus_one_constant;
	boost::shared_ptr<Function<Position> > nowhere_constant;
	boost::shared_ptr<Function<Position> > pos_plus_2_constant;
	boost::shared_ptr<Function<Position> > pos_minus_2_constant;

	// Default-constructed set that the tests compare against as "empty".
	TSet empty;

	// Tag masks; `pos` is never assigned by the constructor.
	Corpus2::Tag gnd;
	Corpus2::Tag nmb;
	Corpus2::Tag vcl;
	Corpus2::Tag pos;
};
/**
 * A range with at least one boundary at Position::Nowhere is expected to
 * give an empty set. The checks are repeated after each of three
 * successive advances of the sentence context.
 */
BOOST_FIXTURE_TEST_CASE(range_nowhere, SymbolsInRangeFix)
{
	for (int step = 0; step < 3; ++step) {
		// Only the beginning points nowhere.
		GetSymbolsInRange begin_nowhere(gnd, nowhere_constant, pos_zero_constant);
		BOOST_CHECK(begin_nowhere.apply(cx)->equals(empty));

		// Only the end points nowhere.
		GetSymbolsInRange end_nowhere(gnd, pos_zero_constant, nowhere_constant);
		BOOST_CHECK(end_nowhere.apply(cx)->equals(empty));

		// Both boundaries point nowhere.
		GetSymbolsInRange both_nowhere(gnd, nowhere_constant, nowhere_constant);
		BOOST_CHECK(both_nowhere.apply(cx)->equals(empty));

		sc.advance();
	}
}
/**
 * Ranges that are expected to have no overlap with the two-token
 * sentence give an empty set.
 */
BOOST_FIXTURE_TEST_CASE(range_outside, SymbolsInRangeFix)
{
	// Relative range [-2, -1], checked before any advance.
	GetSymbolsInRange before_sentence(gnd, pos_minus_2_constant, pos_minus_one_constant);
	BOOST_CHECK(before_sentence.apply(cx)->equals(empty));

	sc.advance();

	// Relative range [1, 2], checked after one advance.
	GetSymbolsInRange after_sentence(gnd, pos_one_constant, pos_plus_2_constant);
	BOOST_CHECK(after_sentence.apply(cx)->equals(empty));
}
/**
 * Valid ranges, including ones that stick out of the sentence, checked
 * with the "gnd" mask before any advance and after each of two advances
 * of the sentence context.
 */
BOOST_FIXTURE_TEST_CASE(range_valid_including_trimmed, SymbolsInRangeFix)
{
	GetSymbolsInRange from_m2_to_0(gnd, pos_minus_2_constant, pos_zero_constant);
	GetSymbolsInRange from_m1_to_0(gnd, pos_minus_one_constant, pos_zero_constant);
	GetSymbolsInRange from_1_to_2(gnd, pos_one_constant, pos_plus_2_constant);
	GetSymbolsInRange from_0_to_2(gnd, pos_zero_constant, pos_plus_2_constant);

	// Before any advance.
	BOOST_CHECK_EQUAL("{m1,m2}", from_m2_to_0.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{m1,m2}", from_m1_to_0.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{f}", from_1_to_2.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{m1,m2,f}", from_0_to_2.apply(cx)->to_string(tagset));

	// After the first advance.
	sc.advance();
	BOOST_CHECK_EQUAL("{m1,m2,f}", from_m2_to_0.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{m1,m2,f}", from_m1_to_0.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{}", from_1_to_2.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{f}", from_0_to_2.apply(cx)->to_string(tagset));

	// After the second advance.
	sc.advance();
	BOOST_CHECK_EQUAL("{m1,m2,f}", from_m2_to_0.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{f}", from_m1_to_0.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{}", from_1_to_2.apply(cx)->to_string(tagset));
	BOOST_CHECK_EQUAL("{}", from_0_to_2.apply(cx)->to_string(tagset));
}
/**
 * A range whose beginning and end are both relative position 0, checked
 * before any advance and after each of two advances.
 */
BOOST_FIXTURE_TEST_CASE(range_same_token, SymbolsInRangeFix)
{
	GetSymbolsInRange current_only(gnd, pos_zero_constant, pos_zero_constant);

	BOOST_CHECK_EQUAL("{m1,m2}", current_only.apply(cx)->to_string(tagset));

	sc.advance();
	BOOST_CHECK_EQUAL("{f}", current_only.apply(cx)->to_string(tagset));

	sc.advance();
	BOOST_CHECK_EQUAL("{}", current_only.apply(cx)->to_string(tagset));
}
/**
 * The tagset-aware string form names the attribute and both positions.
 */
BOOST_FIXTURE_TEST_CASE(range_to_string, SymbolsInRangeFix)
{
	GetSymbolsInRange op(gnd, pos_zero_constant, pos_plus_2_constant);
	BOOST_CHECK_EQUAL("range(gnd, 0, 2)", op.to_string(tagset));
}
/**
 * The raw string form embeds the raw dump of the mask in place of the
 * attribute name.
 */
BOOST_FIXTURE_TEST_CASE(range_to_raw_string, SymbolsInRangeFix)
{
	GetSymbolsInRange op(gnd, pos_zero_constant, pos_plus_2_constant);
	std::string expected = "range(" + gnd.raw_dump() + ", 0, 2)";
	BOOST_CHECK_EQUAL(expected, op.to_raw_string());
}
BOOST_AUTO_TEST_SUITE_END()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment