diff --git a/doc/wccl-run.py b/doc/wccl-run.py
new file mode 100755
index 0000000000000000000000000000000000000000..e82398df40dd4d039e8cf1cccee335f0b258fb25
--- /dev/null
+++ b/doc/wccl-run.py
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+from optparse import OptionParser
+import ctypes
+# RTLD_GLOBAL must be set before the bindings below are imported, so that
+# the extension modules are loaded with globally visible symbols.
+sys.setdlopenflags(sys.getdlopenflags() | ctypes.RTLD_GLOBAL)
+import corpus2, wccl
+
+descr = """%prog [options] CORPUSFILE
+Mimics (simplified) functionality of wccl-run.
+This script is a demo of the Python API."""
+
+def chunks(rdr):
+    """Yields subsequent chunks from a reader until it is exhausted."""
+    while True:
+        chunk = rdr.get_next_chunk()
+        if not chunk:
+            break
+        yield chunk
+
+def iter_sent(sent):
+    """Iterates over a sentence, yielding the context with current_pos set
+    to the subsequent tokens. NOTE: the same context object is returned each
+    time, so tweaking with its state will affect iteration."""
+    con = wccl.SentenceContext(sent)
+    con.goto_start()
+    while con.is_current_inside():
+        yield con
+        con.advance()
+
+def go():
+    """Parses the command line and dumps operator values for each token.
+
+    Positional arguments are classified by suffix: '*.xml' is an input
+    corpus, '*.ccl' is a WCCL file whose operators are all used, and
+    anything else is parsed as a single operator given inline (its source
+    text serves as the column name). For every chunk a tab-separated header
+    of operator names is printed, followed by one tab-separated row of
+    values per token position. Nothing is printed unless at least one
+    operator and one input file were given."""
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+        dest='input_format', default='xces',
+        help='set the input format; default: xces')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+        dest='tagset', default='kipi',
+        help='set the tagset used in input; default: kipi')
+    (options, args) = parser.parse_args()
+
+    ts = corpus2.get_named_tagset(options.tagset)
+    p = wccl.Parser(ts)
+
+    ops = [] # (name, op) pairs
+    infiles = []
+    for arg in args:
+        if arg.endswith('.xml'):
+            infiles.append(arg)
+        elif arg.endswith('.ccl'):
+            f = p.parseWcclFileFromPath(arg)
+            ops.extend(f.gen_all_op_pairs())
+        else:
+            # parse arg as single op string; store the parsed operator
+            # (not the source string) so base_apply() can be called on it
+            op = p.parseAnyOperator(arg)
+            ops.append((arg, op))
+    if ops and infiles:
+        for fname in infiles:
+            rdr = corpus2.TokenReader.create_path_reader(options.input_format, ts, fname)
+            for chunk in chunks(rdr):
+                # dump op names
+                print('\t'.join(name for (name, _) in ops))
+                # iterate and dump values
+                for sent in chunk.sentences():
+                    for con in iter_sent(sent):
+                        print('\t'.join(op.base_apply(con).to_string(ts) for (_, op) in ops))
+
+
+
+if __name__ == '__main__':
+    go()
diff --git a/libmwereader/mweparser.cpp b/libmwereader/mweparser.cpp index 5d7534f8d17356360044bf5d7a33621945bf8373..c4453f140b011f2131fcb02dbff16a106ad1e46d 100644 --- a/libmwereader/mweparser.cpp +++ b/libmwereader/mweparser.cpp @@ -56,8 +56,6 @@ namespace Corpus2 { void MWEParser::create_mwe() { - print_current_mwe(true); - //std::cout << " kupa cond" << std::endl; MWEBuilder::BoolOpPtr main = mwe_builder_->get_mwe_condition( wccl_operator_); //std::cout << " kupa head" << std::endl; @@ -147,7 +145,7 @@ namespace Corpus2 { void MWEParser::on_end_element(const Glib::ustring &name) { - std::cout << "/" << state_ << ": " << name << std::endl; + //std::cout << "/" << state_ << ": " << name << std::endl; if(name == "units_description"){ state_ = NONE; diff --git a/libmwereader/mwereader.cpp b/libmwereader/mwereader.cpp index 90b3bcdcf25525b224c7bfa394a1f49e0f58b59a..e2399fa452c6c27d3974f6d1ce93705ff9ce8a7f 100644 --- a/libmwereader/mwereader.cpp +++ b/libmwereader/mwereader.cpp @@ -30,6 +30,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( { // TODO MWE stuff Sentence::Ptr pSentence = inner_reader_->get_next_sentence(); + if(pSentence == NULL) + return Sentence::Ptr(); Wccl::SentenceContext sc(pSentence); return process_sentence(sc); diff --git a/libmwereader/tests/CMakeLists.txt b/libmwereader/tests/CMakeLists.txt index d428203bbfeb1c3da093d2dae8013069779b0f7c..f7c36adc46e8a9dc1531b28d6d04286f8d2c175c 100644 --- a/libmwereader/tests/CMakeLists.txt +++ b/libmwereader/tests/CMakeLists.txt @@ -3,7 +3,7 @@ PROJECT( mwtest ) include_directories( ${CMAKE_SOURCE_DIR} ) add_definitions(-DMWE_READER_TEST_DATA_DIR="${CMAKE_CURRENT_SOURCE_DIR}") -MESSAGE(status ${CMAKE_CURRENT_SOURCE_DIR}) 
+#MESSAGE(status ${CMAKE_CURRENT_SOURCE_DIR}) add_executable( mwtests main.cpp diff --git a/libwccl/ops/functions/bool/iteration.cpp b/libwccl/ops/functions/bool/iteration.cpp index c91fc17ebf78efe9c2cf2ff1f387d438876cc6cb..9a6982a177dda80cdcd6c2d70f52702c6a4617ef 100644 --- a/libwccl/ops/functions/bool/iteration.cpp +++ b/libwccl/ops/functions/bool/iteration.cpp @@ -35,7 +35,8 @@ Iteration::BaseRetValPtr Iteration::apply_internal(const FunExecContext& context const boost::shared_ptr<const Position>& range_right = right_pos_expr_->apply(context); int left, right; - if (sc.validate_range(*range_left, *range_right, left, right)) { + if (sc.validate_range(*range_left, *range_right, + left, right, may_cross)) { // Change range from absolute to relative and iterate left -= sc.get_position(); right -= sc.get_position(); @@ -44,7 +45,7 @@ Iteration::BaseRetValPtr Iteration::apply_internal(const FunExecContext& context } } } - // In case of failure, set iteration variable to Nowhere and return False + // Failure. 
Set iteration variable to Nowhere and return False iter_var->set_value(Position::Nowhere); return Predicate::False(context); } diff --git a/libwccl/ops/functions/bool/iteration.h b/libwccl/ops/functions/bool/iteration.h index dee6c88676ed6e0ad8272f6094021922f435137c..cb96a2e59a8c8841a7670362fc2bc636a8967221 100644 --- a/libwccl/ops/functions/bool/iteration.h +++ b/libwccl/ops/functions/bool/iteration.h @@ -28,16 +28,19 @@ protected: const PosFunctionPtr right_pos_expr_; const VariableAccessor<Position> iter_var_acc_; const BoolFunctionPtr evaluating_expr_; + const bool may_cross; Iteration( const PosFunctionPtr& left_pos_expr, const PosFunctionPtr& right_pos_expr, const VariableAccessor<Position>& iter_var_acc, - const BoolFunctionPtr& evaluating_expr) + const BoolFunctionPtr& evaluating_expr, + bool may_cross) : left_pos_expr_(left_pos_expr), right_pos_expr_(right_pos_expr), iter_var_acc_(iter_var_acc), - evaluating_expr_(evaluating_expr) + evaluating_expr_(evaluating_expr), + may_cross(may_cross) { BOOST_ASSERT(left_pos_expr_); BOOST_ASSERT(right_pos_expr_); @@ -50,9 +53,10 @@ protected: * iteration variable, evaluating positions within the * range using supplied evaluation function. * Range is trimmed to sentence boundaries. - * In case of an invalid range (begin and end cross over or - * either of them points Nowhere), False is returned and iteration - * variable set to Nowhere. + * A range is invalid when begin or end point Nowhere. Depending on the + * value of may_cross, a range with begin following end is treated either + * as invalid (may_cross == False) or valid. In case of an invalid range, + * False is returned and iteration variable set to Nowhere. * If range is correct, return value depends on stopping condition * that describes how many positions within the range have to evaluate * to true. 
Exact details depend on type of iteration, and are diff --git a/libwccl/ops/functions/bool/iterations/atleast.h b/libwccl/ops/functions/bool/iterations/atleast.h index 21d55360cb88f6686897ad9e553cf49bfc311569..a2b756eee5034dcc7fea5666762293fc7b856a1b 100644 --- a/libwccl/ops/functions/bool/iterations/atleast.h +++ b/libwccl/ops/functions/bool/iterations/atleast.h @@ -19,7 +19,8 @@ public: const VariableAccessor<Position>& iter_var_acc, const BoolFunctionPtr& evaluating_expr, int min_matches) - : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, evaluating_expr), + : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, + evaluating_expr, false), // false==no crossing ranges min_matches_(min_matches) { BOOST_ASSERT(min_matches_ > 0); diff --git a/libwccl/ops/functions/bool/iterations/leftlook.h b/libwccl/ops/functions/bool/iterations/leftlook.h index 618aa3d11e63eefcfc9d3ed74abba08e5e9432a1..e0776c68c47a8f83e887795ab293f20f95c035ae 100644 --- a/libwccl/ops/functions/bool/iterations/leftlook.h +++ b/libwccl/ops/functions/bool/iterations/leftlook.h @@ -18,7 +18,8 @@ public: const PosFunctionPtr& right_pos_expr, const VariableAccessor<Position>& iter_var_acc, const BoolFunctionPtr& evaluating_expr) - : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, evaluating_expr) + : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, + evaluating_expr, false) // false==no crossing ranges { } diff --git a/libwccl/ops/functions/bool/iterations/only.h b/libwccl/ops/functions/bool/iterations/only.h index dce58cba7dcf366487b80057fa120c22bfc98651..f11d6f536836597834cf9c062fa124826e7e4d73 100644 --- a/libwccl/ops/functions/bool/iterations/only.h +++ b/libwccl/ops/functions/bool/iterations/only.h @@ -8,7 +8,8 @@ namespace Wccl { /** * Iterative operator "only", which mandates that * evaluating expression should evaluate to true - * on all positions in range. + * on all positions in range OR the range should + * be empty. 
*/ class Only : public Iteration { @@ -18,7 +19,8 @@ public: const PosFunctionPtr& right_pos_expr, const VariableAccessor<Position>& iter_var_acc, const BoolFunctionPtr& evaluating_expr) - : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, evaluating_expr) + : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, + evaluating_expr, true) { } diff --git a/libwccl/ops/functions/bool/iterations/rightlook.h b/libwccl/ops/functions/bool/iterations/rightlook.h index dcc5e5fa722ef007c0a14e3d583d165f40f32a85..579aea3bd594034668c7038c17530cedc314fae8 100644 --- a/libwccl/ops/functions/bool/iterations/rightlook.h +++ b/libwccl/ops/functions/bool/iterations/rightlook.h @@ -18,7 +18,8 @@ public: const PosFunctionPtr& right_pos_expr, const VariableAccessor<Position>& iter_var_acc, const BoolFunctionPtr& evaluating_expr) - : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, evaluating_expr) + : Iteration(left_pos_expr, right_pos_expr, iter_var_acc, + evaluating_expr, false) // false==no crossing ranges { } diff --git a/libwccl/sentencecontext.h b/libwccl/sentencecontext.h index a857a3fc43b5558bad1f2cf7115a89a451072f75..6775ef8d6b3a7a5e72879a0976af4148ecc4c257 100644 --- a/libwccl/sentencecontext.h +++ b/libwccl/sentencecontext.h @@ -88,6 +88,7 @@ public: * value for left position * @param abs_right reference to int value that will hold absolute * value for right position + * @param may_cross set to true if crossing begin and end is admissible * @returns true if range is valid; in this case abs_left and abs_right * are set to absolute positions values for left and right Position. 
* False is returned otherwise; in this case abs_left and abs_right @@ -97,7 +98,8 @@ public: const Position& left, const Position& right, int& abs_left, - int& abs_right) const; + int& abs_right, + bool may_cross = false) const; /// Position setter void set_position(int new_position) { @@ -196,7 +198,8 @@ bool SentenceContext::validate_range( const Position& left, const Position& right, int& abs_left, - int& abs_right) const + int& abs_right, + bool may_cross) const { abs_left = get_abs_position(left); @@ -216,9 +219,10 @@ bool SentenceContext::validate_range( if (abs_right >= size()) { abs_right = size() - 1; } + // is range valid? this covers "crossed" range, an empty sentence, // and range outside boundaries of sentence - if (abs_left > abs_right) { + if (!may_cross && (abs_left > abs_right)) { abs_left = Position::Nowhere; abs_right = Position::Nowhere; return false; diff --git a/libwccl/values/strset.cpp b/libwccl/values/strset.cpp index 9c869ef90978e00a0ee6c3c6455473f3e7fe3510..5eebd4042e5185f466b091e03739f998f5504b86 100644 --- a/libwccl/values/strset.cpp +++ b/libwccl/values/strset.cpp @@ -48,6 +48,58 @@ UnicodeString StrSet::to_raw_string_u() const return u; } +std::string StrSet::to_compact_string(const Corpus2::Tagset& /* tagset */) + const +{ + if (set_.empty()) { + return "-"; + } + + std::stringstream ss; + value_type::const_iterator it = set_.begin(); + while(it != set_.end()) { + ss << '\"'; + std::string item = PwrNlp::to_utf8(*it); + boost::algorithm::replace_all(item, "-", "\\u002d"); + boost::algorithm::replace_all(item, ".", "\\u002e"); + boost::algorithm::replace_all(item, " ", "\\u0020"); + boost::algorithm::replace_all(item, "\t", "\\u0009"); + ss << item; + ss << '\"'; + if(++it != set_.end()) { + ss << "-"; + } + } + return ss.str(); +} + +UnicodeString StrSet::to_compact_string_u(const Corpus2::Tagset& /* tagset */) + const +{ + UnicodeString u; + + if (set_.empty()) { + u.append(UNICODE_STRING("-", 1)); + return u; + } + + 
value_type::const_iterator it = set_.begin(); + while(it != set_.end()) { + u.append(UNICODE_STRING("\"", 1)); + UnicodeString item = *it; + item.findAndReplace(UNICODE_STRING("-", 1), UNICODE_STRING("\\u002d", 6)); + item.findAndReplace(UNICODE_STRING(".", 1), UNICODE_STRING("\\u002e", 6)); + item.findAndReplace(UNICODE_STRING(" ", 1), UNICODE_STRING("\\u0020", 6)); + item.findAndReplace(UNICODE_STRING("\t", 1), UNICODE_STRING("\\u0009", 6)); + u.append(item); + u.append(UNICODE_STRING("\"", 1)); + if(++it != set_.end()) { + u.append(UNICODE_STRING("-", 1)); + } + } + return u; +} + bool StrSet::intersects(const StrSet &other) const { if (empty() || other.empty()) { return false; diff --git a/libwccl/values/strset.h b/libwccl/values/strset.h index e1abb89004ea866bc38efb93956d8788f370ca63..0de7a5de17adc7aa12cb33b9f429cca6499e02ee 100644 --- a/libwccl/values/strset.h +++ b/libwccl/values/strset.h @@ -99,6 +99,12 @@ public: /// Value override UnicodeString to_raw_string_u() const; + /// Value override + std::string to_compact_string(const Corpus2::Tagset& tagset) const; + + /// Value override + UnicodeString to_compact_string_u(const Corpus2::Tagset& tagset) const; + private: value_type set_; }; diff --git a/libwccl/values/tset.cpp b/libwccl/values/tset.cpp index 499fb5ae9148f2b248045bcc0549f15b291271b5..27760fbf1dae7daa0d0d91beb65fcb1d855d2bec 100644 --- a/libwccl/values/tset.cpp +++ b/libwccl/values/tset.cpp @@ -1,6 +1,8 @@ #include <libwccl/values/tset.h> #include <libpwrutils/foreach.h> #include <libpwrutils/bitset.h> + +#include <boost/algorithm/string.hpp> #include <sstream> namespace Wccl { @@ -46,4 +48,16 @@ void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s) tag_.combine_with(tagset.parse_symbol(s)); } +std::string TSet::to_compact_string(const Corpus2::Tagset& tagset) + const +{ + if (tag_.is_null()) { + return "-"; + } + + std::string body = tagset.tag_to_symbol_string(tag_); + boost::algorithm::replace_all(body, ",", "-"); + 
return body; +} + } /* end ns Wccl */ diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h index 73ed1f3326d73633982f2a446b6709d70dd461f1..41ff34f88568a50fb370235cf408061bd79d2c8c 100644 --- a/libwccl/values/tset.h +++ b/libwccl/values/tset.h @@ -115,6 +115,9 @@ public: std::string to_raw_string() const; + /// Value override + std::string to_compact_string(const Corpus2::Tagset& tagset) const; + private: Corpus2::Tag tag_; }; diff --git a/libwccl/values/value.h b/libwccl/values/value.h index abf068d4144218bf0f131cd25222beda4da8d018..702175c757e675f0cbcb9530d0e5e563b6080f15 100644 --- a/libwccl/values/value.h +++ b/libwccl/values/value.h @@ -71,6 +71,22 @@ public: return UnicodeString::fromUTF8(to_raw_string()); } + /** + * Compact string representation: sets are represented as hyphen-separated + * strings (sorted) with no brackets. The representation is suitable for + * generating compact output where some degree of ambiguity is allowed + * (note that type can't be unambiguously inferred from such strings). 
+ */ + virtual std::string to_compact_string(const Corpus2::Tagset& /* tagset */) + const { + return to_raw_string(); + } + + virtual UnicodeString to_compact_string_u(const Corpus2::Tagset& tagset) + const { + return UnicodeString::fromUTF8(to_compact_string(tagset)); + } + protected: Value() {} }; diff --git a/swig/value.i b/swig/value.i index 30f8eefb203b114d7905733953ae326f53a33887..671c3c981ff1da336aa3b19064e6ab0cadb67f11 100644 --- a/swig/value.i +++ b/swig/value.i @@ -29,6 +29,7 @@ namespace Wccl { virtual std::string to_string(const Corpus2::Tagset& /*tagset*/) const; virtual std::string to_raw_string() const = 0; + virtual std::string to_compact_string(const Corpus2::Tagset& /*tagset*/) const; }; } diff --git a/tests/data/crossing.ccl b/tests/data/crossing.ccl new file mode 100644 index 0000000000000000000000000000000000000000..23dd82df465d8b57d48bc1fa9cdf377796f6cf72 --- /dev/null +++ b/tests/data/crossing.ccl @@ -0,0 +1,93 @@ +tagset=kipi +sentence=t01.xml +--- +rlook(0, 1, $It, True) + +True +It=0 +--- +llook(1, 0, $It, True) + +True +It=1 +--- +rlook(1, 0, $It, True) + +False +It=nowhere +--- +llook(0, 1, $It, True) + +False +It=nowhere +--- +only(0, 1, $It, True) + +True +It=1 +--- +only(1, 0, $It, True) + +True +It=nowhere +--- +only(1, 0, $It, False) + +True +It=nowhere +--- +and(setvar($It,2), rlook(0, 1, $It, True)) + +True +It=0 +--- +and(setvar($It,2), llook(1, 0, $It, True)) + +True +It=1 +--- +and(setvar($It,2), rlook(1, 0, $It, True)) + +False +It=nowhere +--- +and(setvar($It,2), llook(0, 1, $It, True)) + +False +It=nowhere +--- +and(setvar($It,2), only(0, 1, $It, True)) + +True +It=1 +--- +and(setvar($It,2), only(1, 0, $It, True)) + +True +It=2 +--- +and(setvar($It,2), only(1, 0, $It, False)) + +True +It=2 +--- +and(setvar($It,2), only(0, 1, $It, False)) + +False +It=nowhere +--- +and(setvar($It,6), atleast(0, 3, $It, inside($It), 2)) + +True +It=1 +--- +and(setvar($It,6), atleast(3, 0, $It, inside($It), 2)) + +False +It=nowhere +--- 
+and(setvar($It,6), atleast(0, 3, $It, inside($It), 7)) + +False +It=nowhere +---