From b7d34d6187985ea72d9acae38674b6cac8e53e1e Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Tue, 30 Nov 2010 14:38:53 +0100
Subject: [PATCH] Add the tagset-symbol-getter operator (GetSymbols), bump
 required Corpus2 version to 1.0.2 since new features are needed.

---
 libwccl/CMakeLists.txt                    |  3 +-
 libwccl/ops/functions/tset/getsymbols.cpp | 45 ++++++++++++++++
 libwccl/ops/functions/tset/getsymbols.h   | 68 +++++++++++++++++++++++
 libwccl/values/tset.h                     | 11 ++++
 4 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 libwccl/ops/functions/tset/getsymbols.cpp
 create mode 100644 libwccl/ops/functions/tset/getsymbols.h

diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt
index 70729be..c8159dd 100644
--- a/libwccl/CMakeLists.txt
+++ b/libwccl/CMakeLists.txt
@@ -4,7 +4,7 @@ PROJECT(wccl)
 
 include_directories( ${CMAKE_CURRENT_BINARY_DIR}/include/ )
 
-find_package(Corpus2 0.1.2 REQUIRED)
+find_package(Corpus2 1.0.2 REQUIRED)
 set(LIBS ${LIBS} ${Corpus2_LIBRARY})
 
 find_package(PwrUtils 0.0.3 REQUIRED)
@@ -39,6 +39,7 @@ SET(libwccl_STAT_SRC
 	ops/functions/strset/getorth.cpp
 	ops/functions/strset/tolower.cpp
 	ops/functions/strset/toupper.cpp
+	ops/functions/tset/getsymbols.cpp
 	parser/grammar.g
 	parser/Parser.cpp
 	parser/ParserException.cpp
diff --git a/libwccl/ops/functions/tset/getsymbols.cpp b/libwccl/ops/functions/tset/getsymbols.cpp
new file mode 100644
index 0000000..fe5d1fa
--- /dev/null
+++ b/libwccl/ops/functions/tset/getsymbols.cpp
@@ -0,0 +1,45 @@
+#include <libwccl/ops/functions/tset/getsymbols.h>
+#include <libwccl/ops/formatters.h>
+#include <libwccl/ops/functions/constant.h>
+
+namespace Wccl {
+
+std::string GetSymbols::to_string(const Corpus2::Tagset& tagset) const
+{
+	return UnaryFunctionFormatter::to_string(tagset, *this, *pos_expr_, "[", "]");
+}
+
+std::string GetSymbols::to_raw_string() const {
+	return UnaryFunctionFormatter::to_raw_string(*this, *pos_expr_, "[", "]");
+}
+
+std::string GetSymbols::name(const Corpus2::Tagset& tagset) const
+{
+	return tagset.get_attribute_name(mask_.get_values());
+}
+
+std::string GetSymbols::raw_name() const
+{
+	return mask_.raw_dump();
+}
+
+GetSymbols::BaseRetValPtr GetSymbols::apply_internal(const FunExecContext& context) const
+{
+	const boost::shared_ptr<const Position>& pos = pos_expr_->apply(context);
+	const SentenceContext& sc = context.sentence_context();
+	// Requested position is outside the sentence context (or the current
+	// position is not inside it): hand back the default TSet value.
+	if(pos->is_outside(sc) || !sc.is_current_inside()) {
+		return detail::DefaultFunction<TSet>()->apply(context);
+	}
+
+	boost::shared_ptr<TSet> tset = boost::make_shared<TSet>();
+	const Corpus2::Token* token = sc.at(*pos);
+	foreach (const Corpus2::Lexeme& lexeme, token->lexemes()) {
+		// Keep only the symbols selected by mask_; combining the whole tag
+		// would return every symbol of the lexeme and ignore the mask.
+		tset->combine_with(lexeme.tag().get_masked(mask_));
+	}
+	return tset;
+}
+
+} /* end ns Wccl */
diff --git a/libwccl/ops/functions/tset/getsymbols.h b/libwccl/ops/functions/tset/getsymbols.h
new file mode 100644
index 0000000..2c49201
--- /dev/null
+++ b/libwccl/ops/functions/tset/getsymbols.h
@@ -0,0 +1,68 @@
+#ifndef LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLS_H
+#define LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLS_H
+
+#include <libwccl/values/tset.h>
+#include <libwccl/values/position.h>
+#include <libwccl/ops/function.h>
+
+namespace Wccl {
+
+/**
+ * Operator that returns the tagset symbols, selected by a mask, of the
+ * token found at the position given by an argument expression.
+ */
+class GetSymbols : public Function<TSet> {
+public:
+	typedef boost::shared_ptr<Function<Position> > PosFunctionPtr;
+
+	/**
+	 * @param pos_expr expression yielding the position to inspect; asserted non-null
+	 * @param mask tag used as the mask selecting which symbols are returned
+	 */
+	GetSymbols(const PosFunctionPtr& pos_expr, const Corpus2::Tag& mask)
+		: pos_expr_(pos_expr), mask_(mask)
+	{
+		BOOST_ASSERT(pos_expr_);
+	}
+
+	/**
+	 * @returns String representation of the function in the form of:
+	 * "attribute_name[pos_expr_string]"
+	 */
+	std::string to_string(const Corpus2::Tagset& tagset) const;
+
+	/**
+	 * @returns String representation of the function in the form of:
+	 * "attribute_name[pos_expr_string]"
+	 * @note This version does not require tagset, but will be incomplete
+	 * and/or contain internal info.
+	 */
+	std::string to_raw_string() const;
+
+	/** @returns Raw dump of the mask; does not require a tagset. */
+	std::string raw_name() const;
+
+	/** @returns Attribute name the given tagset reports for the mask's values. */
+	std::string name(const Corpus2::Tagset& tagset) const;
+
+protected:
+	/** Expression evaluated to obtain the position of the token to inspect. */
+	const PosFunctionPtr pos_expr_;
+
+	/** Mask applied to each lexeme's tag before it is combined into the result. */
+	Corpus2::Tag mask_;
+
+	/**
+	 * Gets a position from the argument expression, then gets the
+	 * word at that position from the Sentence in the SentenceContext,
+	 * then gets the tagset symbols matching the mask of the word.
+	 *
+	 * @returns A tagset symbol set of the word if position pointed to
+	 * lies within boundaries of the Sentence. Empty Tset otherwise.
+	 */
+	BaseRetValPtr apply_internal(const FunExecContext& context) const;
+};
+
+} /* end ns Wccl */
+
+#endif // LIBWCCL_OPS_FUNCTIONS_TSET_GETSYMBOLS_H
diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h
index 0378628..47533f1 100644
--- a/libwccl/values/tset.h
+++ b/libwccl/values/tset.h
@@ -61,6 +61,17 @@ public:
 		return tag_ == other.tag_;
 	}
 
+	/** Combines the given tag into the tag held by this TSet. */
+	void combine_with(const Corpus2::Tag& other) {
+		tag_.combine_with(other);
+	}
+
+	/** Combines the tag held by another TSet into the tag held by this one. */
+	void combine_with(const TSet& other) {
+		tag_.combine_with(other.get_value());
+	}
+
+
 	std::string to_string(const Corpus2::Tagset &) const;
 
 	std::string to_raw_string() const;
-- 
GitLab