diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index 531ee28101a6fe8d1d2e31f90c80d23290ae26ba..53f0009226ead59ca531d4cd4443061ad340a6e4 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -85,7 +85,7 @@ std::string TagsetMismatch::info() const tagset_idx_t Tagset::next_id_ = static_cast<tagset_idx_t>(0); Tagset::Tagset() - : id_(++next_id_) + : id_(++next_id_), valid_pos_mask_(0) { } @@ -125,9 +125,24 @@ Tag Tagset::parse_symbol(const std::string& s) const if (m.any()) { return Tag(0, m); } + if (s == "@pos") { /* special symbol: tag built from valid_pos_mask_ */ + return Tag(valid_pos_mask_); + } throw TagParseError("Not a tagset symbol", s, "", id_string()); } +Tag Tagset::parse_symbol_string(const std::string &s) const +{ + Tag t; + std::vector<std::string> parts; + boost::algorithm::split(parts, s, boost::is_any_of(",")); /* parts are passed on verbatim, no trimming */ + foreach (const std::string& ss, parts) { + t.combine_with(parse_symbol(ss)); + } + return t; + +} + void Tagset::parse_tag(const string_range &s, boost::function<void(const Tag &)> sink, ParseMode mode /* = ParseDefault*/) const diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index 4c628c5fafbfd5a0508225c2f626cf740c644f0e..75c98dddff53d972042aee0e312a1093d1f0a666 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -147,6 +147,14 @@ public: */ Tag parse_symbol(const std::string& s) const; + /** + * Parse a comma-separated list of tagset symbols: parse_symbol is called + * on each element and the results are combined into the returned Tag. + * Nothing is validated beyond each element being a valid symbol. + */ + Tag parse_symbol_string(const std::string& s) const; + + /** * Tag parsing -- functional version, whole tag string. 
* @@ -632,6 +640,9 @@ private: /// Flags for attribute indices which are required for a given POS std::vector< std::vector<bool> > pos_required_attributes_; + + /// Mask with a bit set for each POS loaded by TagsetParser::load_ini + mask_t valid_pos_mask_; }; /* implementation */ diff --git a/libcorpus2/tagsetparser.cpp b/libcorpus2/tagsetparser.cpp index 93da97c7ca32e940ac41bebf6a53423aeb2eecdf..945cc00db9b0be1150a39cb363f7bb7358cabc49 100644 --- a/libcorpus2/tagsetparser.cpp +++ b/libcorpus2/tagsetparser.cpp @@ -187,6 +187,7 @@ Tagset TagsetParser::load_ini(std::istream &is) for (size_t i = 0; i < poses_plain.size(); ++i) { idx_t p = tagset.pos_dictionary().get_id(poses_plain[i]); tagset.original_pos_indices_.insert(std::make_pair(p,i)); + tagset.valid_pos_mask_ |= (mask_t(1) << i); /* flag the i-th listed POS as valid */ } return tagset;