From 954a2063b6562e2e085dd8bedd6cb4f174cad95d Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 24 Mar 2011 17:11:44 +0100 Subject: [PATCH] Add parsing of '@pos' symbol in Tagset::parse_symbol, parses to a mask of all valid POSes. Add Tagset::parse_symbol_string for convenience parsing of comma-separated symbol strings --- libcorpus2/tagset.cpp | 17 ++++++++++++++++- libcorpus2/tagset.h | 11 +++++++++++ libcorpus2/tagsetparser.cpp | 1 + 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index 531ee28..53f0009 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -85,7 +85,7 @@ std::string TagsetMismatch::info() const tagset_idx_t Tagset::next_id_ = static_cast<tagset_idx_t>(0); Tagset::Tagset() - : id_(++next_id_) + : id_(++next_id_), valid_pos_mask_(0) { } @@ -125,9 +125,24 @@ Tag Tagset::parse_symbol(const std::string& s) const if (m.any()) { return Tag(0, m); } + if (s == "@pos") { + return Tag(valid_pos_mask_); + } throw TagParseError("Not a tagset symbol", s, "", id_string()); } +Tag Tagset::parse_symbol_string(const std::string &s) const +{ + Tag t; + std::vector<std::string> parts; + boost::algorithm::split(parts, s, boost::is_any_of(",")); + foreach (const std::string& ss, parts) { + t.combine_with(parse_symbol(ss)); + } + return t; + +} + void Tagset::parse_tag(const string_range &s, boost::function<void(const Tag &)> sink, ParseMode mode /* = ParseDefault*/) const diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index 4c628c5..75c98dd 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -147,6 +147,14 @@ public: */ Tag parse_symbol(const std::string& s) const; + /** + * Parse a comma-separated list of tagset symbols, calling parse_symbol + * repeatedly. No validation is performed other than the validity of + * each individual symbol. + */ + Tag parse_symbol_string(const std::string& s) const; + + /** * Tag parsing -- functional version, whole tag string. 
* @@ -632,6 +640,9 @@ private: /// Flags for attribute indices which are required for a given POS std::vector< std::vector<bool> > pos_required_attributes_; + + /// Valid POS mask + mask_t valid_pos_mask_; }; /* implementation */ diff --git a/libcorpus2/tagsetparser.cpp b/libcorpus2/tagsetparser.cpp index 93da97c..945cc00 100644 --- a/libcorpus2/tagsetparser.cpp +++ b/libcorpus2/tagsetparser.cpp @@ -187,6 +187,7 @@ Tagset TagsetParser::load_ini(std::istream &is) for (size_t i = 0; i < poses_plain.size(); ++i) { idx_t p = tagset.pos_dictionary().get_id(poses_plain[i]); tagset.original_pos_indices_.insert(std::make_pair(p,i)); + tagset.valid_pos_mask_ |= (mask_t(1) << i); } return tagset; -- GitLab