diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index 1d82498197216db3f1b29aa77d4708d4f6b59a79..424fd0cecaafda2a340a8cf69c2e96eab3493ab0 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -89,7 +89,7 @@ Sentence::Ptr RftReader::actual_next_sentence() std::string orth = line.substr(0, tab); std::string tag_string = line.substr(tab + 1); boost::algorithm::replace_all(tag_string, ".", ":"); - Tag tag = tagset().parse_simple_tag(tag_string, false); + Tag tag = tagset().parse_simple_tag(tag_string); Token* t = new Token(); t->set_orth(UnicodeString::fromUTF8(orth)); t->set_wa(PwrNlp::Whitespace::Space); diff --git a/libcorpus2/io/xcesvalidate.cpp b/libcorpus2/io/xcesvalidate.cpp index 8cebeaa8caeabd758276f43654f580d944762c90..8c47bfde8359d4543cc9b6f09c38bed7959f2ec0 100644 --- a/libcorpus2/io/xcesvalidate.cpp +++ b/libcorpus2/io/xcesvalidate.cpp @@ -119,9 +119,9 @@ void XcesValidatorImpl::on_end_element(const Glib::ustring &name) state_ = XS_TOK; } else if (state_ == XS_TAG && name == "ctag") { try { - Tag tag = tagset_.parse_simple_tag(sbuf_, true); + Tag tag = tagset_.parse_simple_tag(sbuf_); std::stringstream ss; - if (!tagset_.validate_tag(tag, false, &ss)) { + if (!tagset_.validate_tag(tag, Tagset::ParseStrict, &ss)) { error_preamble(os_, last_orth_, sbuf_, token_idx_, tag_idx_); os_ << ss.str() << "\n"; } diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp index b732690e713d9138195fc069e13da361605377ca..531ee28101a6fe8d1d2e31f90c80d23290ae26ba 100644 --- a/libcorpus2/tagset.cpp +++ b/libcorpus2/tagset.cpp @@ -128,12 +128,13 @@ Tag Tagset::parse_symbol(const std::string& s) const throw TagParseError("Not a tagset symbol", s, "", id_string()); } -void Tagset::parse_tag(const string_range &s, bool allow_extra, - boost::function<void(const Tag &)> sink) const +void Tagset::parse_tag(const string_range &s, + boost::function<void(const Tag &)> sink, + ParseMode mode /* = ParseDefault*/) const { string_range_vector fields; boost::algorithm::split(fields, s, boost::is_any_of(":")); - parse_tag(fields, allow_extra, sink); + parse_tag(fields, sink, mode); } namespace { @@ -155,8 +156,9 @@ namespace { } } -void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, - boost::function<void(const Tag &)>sink) const +void Tagset::parse_tag(const string_range_vector &fields, + boost::function<void(const Tag &)>sink, + ParseMode mode /* = ParseDefault*/) const { if (fields.empty()) { throw TagParseError("No POS", "", "", id_string()); @@ -209,37 +211,38 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra, } // else empty, do nothing } foreach (mask_t variant, all_variants) { - sink(make_tag(pos_idx, variant, allow_extra)); + sink(make_tag(pos_idx, variant, mode)); } } std::vector<Tag> Tagset::parse_tag(const string_range& sr, - bool allow_extra) const + ParseMode mode /* = ParseDefault*/) const { string_range_vector fields; boost::algorithm::split(fields, sr, boost::is_any_of(":")); - return parse_tag(fields, allow_extra); + return parse_tag(fields, mode); } std::vector<Tag> Tagset::parse_tag(const string_range_vector &fields, - bool allow_extra) const + ParseMode mode /* = ParseDefault*/) const { std::vector<Tag> tags; - parse_tag(fields, allow_extra, - boost::bind(&std::vector<Tag>::push_back, boost::ref(tags), - _1)); + parse_tag(fields, + boost::bind(&std::vector<Tag>::push_back, boost::ref(tags),_1), + mode); return tags; } -Tag Tagset::parse_simple_tag(const string_range &s, bool allow_extra) const +Tag Tagset::parse_simple_tag(const string_range &s, + ParseMode mode /* = ParseDefault*/) const { string_range_vector fields; boost::algorithm::split(fields, s, boost::is_any_of(std::string(":"))); - return parse_simple_tag(fields, allow_extra); + return parse_simple_tag(fields, mode); } Tag Tagset::parse_simple_tag(const string_range_vector &ts, - bool allow_extra) const + ParseMode mode /* = ParseDefault*/) const { if (ts.empty()) { throw TagParseError("Empty POS+attribute list", "", "", @@ -269,11 +272,11 @@ Tag Tagset::parse_simple_tag(const string_range_vector &ts, } } } - - return make_tag(pos_idx, values, allow_extra); + return make_tag(pos_idx, values, mode); } -Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const +Tag Tagset::make_tag(idx_t pos_idx, mask_t values, + ParseMode mode /* = ParseDefault*/) const { mask_t required_values = get_pos_required_mask(pos_idx); //std::cerr << values << "\n"; @@ -281,32 +284,43 @@ Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const //std::cerr << (required_values & values) << "\n"; //std::cerr << PwrNlp::count_bits_set(required_values & values) // << " of " << pos_required_attributes_idx_[pos_idx].size() << "\n"; - size_t has_req = PwrNlp::count_bits_set(required_values & values); - if (has_req != pos_required_attributes_idx_[pos_idx].size()) { - foreach (idx_t a, get_pos_attributes(pos_idx)) { - if (pos_requires_attribute(pos_idx, a)) { - mask_t amask = get_attribute_mask(a); - if ((values & amask).none()) { - throw TagParseError("Required attribute missing", - tag_to_string(Tag(get_pos_mask(pos_idx), values)), - get_attribute_name(a), id_string()); + if (mode & ParseCheckRequired) { + size_t has_req = PwrNlp::count_bits_set(required_values & values); + if (has_req != pos_required_attributes_idx_[pos_idx].size()) { + foreach (idx_t a, get_pos_attributes(pos_idx)) { + if (pos_requires_attribute(pos_idx, a)) { + mask_t amask = get_attribute_mask(a); + if ((values & amask).none()) { + throw TagParseError("Required attribute missing", + tag_to_string(Tag(get_pos_mask(pos_idx), values)), + get_attribute_name(a), id_string()); + } } } + throw TagParseError("Required attribute missing", + tag_to_string(Tag(get_pos_mask(pos_idx), values)), + get_pos_name(pos_idx), id_string()); } - throw TagParseError("Required attribute missing", - tag_to_string(Tag(get_pos_mask(pos_idx), values)), - get_pos_name(pos_idx), id_string()); } - mask_t valid_values = get_pos_value_mask(pos_idx); - mask_t invalid = values & ~valid_values; - if (invalid.any() && !allow_extra) { - mask_t first_invalid = PwrNlp::lowest_bit(invalid); - throw TagParseError("Attribute not valid for this POS", - get_value_name(first_invalid), - get_pos_name(pos_idx), id_string()); + if (!(mode & ParseAllowExtra)) { + mask_t valid_values = get_pos_value_mask(pos_idx); + mask_t invalid = values & ~valid_values; + if (invalid.any()) { + mask_t first_invalid = PwrNlp::lowest_bit(invalid); + throw TagParseError("Attribute not valid for this POS", + get_value_name(first_invalid), + get_pos_name(pos_idx), id_string()); + } } - // check singularity? - return Tag(get_pos_mask(pos_idx), values); + Tag tag(get_pos_mask(pos_idx), values); + if (mode & ParseCheckSingular) { + if (!tag_is_singular(tag)) { + throw TagParseError("Parsed tag not singular", + tag_to_symbol_string(tag, false), + get_pos_name(pos_idx), id_string()); + } + } + return tag; } Tag Tagset::make_ign_tag() const @@ -318,23 +332,24 @@ Tag Tagset::make_ign_tag() const return Tag(ign_pos_mask); } -bool Tagset::validate_tag(const Tag &t, bool allow_extra, - std::ostream* os) const +bool Tagset::validate_tag(const Tag &t, ParseMode mode /* = ParseDefault*/, + std::ostream* os /* = NULL */) const { - if (t.pos_count() != 1) { - if (os) { - (*os) << " POS not singular : " << t.pos_count(); + if (mode & ParseCheckSingular) { + if (t.pos_count() != 1) { + if (os) { + (*os) << " POS not singular : " << t.pos_count(); + } + return false; } - return false; - } - size_t ts = tag_size(t); - if (ts != 1) { - if (os) { - (*os) << " Tag not singular : " << ts; + size_t ts = tag_size(t); + if (ts != 1) { + if (os) { + (*os) << " Tag not singular : " << ts; + } + return false; } - return false; } - idx_t pos_idx = t.get_pos_index(); if (!pos_dict_.is_id_valid(pos_idx)) { if (os) { @@ -342,13 +357,13 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, } return false; } - std::vector<bool> valid = get_pos_attributes_flag(pos_idx); - std::vector<bool> required = get_pos_required_attributes(pos_idx); + const std::vector<bool>& valid = get_pos_attributes_flag(pos_idx); + const std::vector<bool>& required = get_pos_required_attributes(pos_idx); for (idx_t i = 0; i < attribute_count(); ++i) { mask_t value = t.get_values_for(get_attribute_mask(i)); if (value.none()) { - if (required[i]) { + if ((mode & ParseCheckRequired) && required[i]) { if (os) { (*os) << " red attribuite " << get_attribute_name(i) @@ -357,7 +372,7 @@ bool Tagset::validate_tag(const Tag &t, bool allow_extra, return false; } } else { - if (!valid[i] && !allow_extra) { + if (!valid[i] && !(mode & ParseAllowExtra)) { if (os) { (*os) << " Extra attribute value: " << get_value_name(value) @@ -417,7 +432,7 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const } std::vector<std::string> Tagset::tag_to_symbol_string_vector(const Tag& tag, - bool compress_attributes) const + bool compress_attributes /* = true */) const { std::vector<std::string> ret; foreach (mask_t p, PwrNlp::set_bits(tag.get_pos())) { @@ -440,7 +455,7 @@ std::vector<std::string> Tagset::tag_to_symbol_string_vector(const Tag& tag, } std::string Tagset::tag_to_symbol_string(const Tag& tag, - bool compress_attributes) const + bool compress_attributes /* = true */) const { return boost::algorithm::join( tag_to_symbol_string_vector(tag, compress_attributes), ","); @@ -729,7 +744,7 @@ void Tagset::lexemes_into_token(Token& tok, const UnicodeString& lemma, boost::bind(lex, _1)); foreach (const string_range& o, options) { - parse_tag(o, true, func); + parse_tag(o, func); } } diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h index 4b199b6251f9b56f4a7c4670dfc89729082d1133..4c628c5fafbfd5a0508225c2f626cf740c644f0e 100644 --- a/libcorpus2/tagset.h +++ b/libcorpus2/tagset.h @@ -120,6 +120,20 @@ public: */ static Tagset from_data(const char*); + /** + * Mode enum for tag parsing + */ + enum ParseMode { + ParseCheckRequired = 1, /// Check for presence of required attributes + ParseAllowExtra = 2, /// Allow extra attributes + ParseCheckSingular = 4, /// Check tag singularity + + ParseDefault = ParseCheckRequired, /// Default mode + ParseRequiredWithExtra = ParseCheckRequired | ParseAllowExtra, + ParseStrict = ParseCheckRequired | ParseCheckSingular, + ParseLoose = ParseAllowExtra + }; + /** * Parse a single tagset symbol and return the correspondig (partial) tag. * @@ -139,8 +153,9 @@ public: * A simple wrapper for string split and a call to the split string * version. */ - void parse_tag(const string_range& s, bool allow_extra, - boost::function<void (const Tag&)> sink) const; + void parse_tag(const string_range& s, + boost::function<void (const Tag&)> sink, + ParseMode mode = ParseDefault) const; /** * Tag parsing -- functional version, whole tag string, char* overload. @@ -149,9 +164,10 @@ public: * A simple wrapper for string split and a call to the split string * version. */ - void parse_tag(const char* c, bool allow_extra, - boost::function<void (const Tag&)> sink) const { - parse_tag(std::string(c), allow_extra, sink); + void parse_tag(const char* c, + boost::function<void (const Tag&)> sink, + ParseMode mode = ParseDefault) const { + parse_tag(std::string(c), sink, mode); } /** @@ -171,8 +187,9 @@ public: * - an underscore (_) indicates that all values for the attribute at * the underscore's position should be taken. */ - void parse_tag(const string_range_vector& ts, bool allow_extra, - boost::function<void (const Tag&)> sink) const; + void parse_tag(const string_range_vector& ts, + boost::function<void (const Tag&)> sink, + ParseMode mode = ParseDefault) const; /** * Tag parsing -- plain version, whole string. @@ -181,7 +198,7 @@ public: * version. */ std::vector<Tag> parse_tag(const string_range& s, - bool allow_extra) const; + ParseMode mode = ParseDefault) const; /** * Tag parsing -- plain version, whole string, char* overload. @@ -189,8 +206,9 @@ public: * A simple wrapper for string split and a call to the split string * version. */ - std::vector<Tag> parse_tag(const char* c, bool allow_extra) const { - return parse_tag(std::string(c), allow_extra); + std::vector<Tag> parse_tag(const char* c, + ParseMode mode = ParseDefault) const { + return parse_tag(std::string(c), mode); } /** @@ -200,7 +218,7 @@ public: * the tags end up in a vector, which is then returned. */ std::vector<Tag> parse_tag(const string_range_vector& ts, - bool allow_extra) const; + ParseMode mode = ParseDefault) const; /** * Simple tag parsing -- whole string version. @@ -208,7 +226,8 @@ public: * A simple wrapper for string split and a call to the split string * version. */ - Tag parse_simple_tag(const string_range& s, bool allow_extra) const; + Tag parse_simple_tag(const string_range& s, + ParseMode mode = ParseDefault) const; /** * Simple tag parsing -- whole string version, char* overload. @@ -216,8 +235,9 @@ public: * A simple wrapper for string split and a call to the split string * version. */ - Tag parse_simple_tag(const char* c, bool allow_extra) const { - return parse_simple_tag(std::string(c), allow_extra); + Tag parse_simple_tag(const char* c, + ParseMode mode = ParseDefault) const { + return parse_simple_tag(std::string(c), mode); } /** @@ -228,17 +248,17 @@ public: * underscores or plus / pipe characters). */ Tag parse_simple_tag(const string_range_vector& ts, - bool allow_extra) const; - + ParseMode mode = ParseDefault) const; /** * Create a tag from the given POS and a (unsorted) vector of values. * * The POS is assumed to be valid in this tagset. * The values are assumed to be valid in this tagset, but are checked - * for correctness with regards to the POS. + * for correctness with regards to the POS according to the mode. */ - Tag make_tag(idx_t pos, mask_t values, bool allow_extra) const; + Tag make_tag(idx_t pos, mask_t values, + ParseMode mode = ParseDefault) const; /** * Convenience function for creating a 'ign' (ignored) tag within this @@ -255,7 +275,7 @@ public: * * no extra attrbutes are set, unless allow_extra is true * @return true if the tag is valid, false otherwise */ - bool validate_tag(const Tag& t, bool allow_extra, + bool validate_tag(const Tag& t, ParseMode mode = ParseDefault, std::ostream* os = NULL) const; /** diff --git a/tagset-tool/main.cpp b/tagset-tool/main.cpp index c51a991152e9dfe8358a8a9b1a26f4be1d5d3f9b..f3fa15f49f34ce0c376f0f69238c77468385edfb 100644 --- a/tagset-tool/main.cpp +++ b/tagset-tool/main.cpp @@ -182,7 +182,7 @@ void tag_parse_cb(const Corpus2::Tagset& tagset, bool validate, bool sort, std::stringstream ss; ss << tagset.tag_to_string(lex.tag()); if (validate) { - tagset.validate_tag(lex.tag(), false, &ss); + tagset.validate_tag(lex.tag(), Corpus2::Tagset::ParseStrict, &ss); } if (internals) { ss << "\n" << lex.tag().raw_dump() << ""; diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp index 4258b7ba032961c4e38405d89a9e8194d58a70c7..ddac048238cd0e55f1351973079541508b5929c7 100644 --- a/tests/tag_split.cpp +++ b/tests/tag_split.cpp @@ -48,7 +48,7 @@ struct F { foreach (const Corpus2::Lexeme& lex, t.lexemes()) { const Corpus2::Tag& tag = lex.tag(); - BOOST_WARN(tagset->validate_tag(tag, false, &std::cerr)); + BOOST_WARN(tagset->validate_tag(tag, Corpus2::Tagset::ParseStrict, &std::cerr)); actual.insert(tagset->tag_to_string(tag)); tags.push_back(tag); } @@ -185,9 +185,9 @@ BOOST_FIXTURE_TEST_CASE( underscore_dots, F ) BOOST_FIXTURE_TEST_CASE( tag_size, F ) { - Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false); - Corpus2::Tag t2 = tagset->parse_simple_tag("some:tog:data", false); - Corpus2::Tag t3 = tagset->parse_simple_tag("same:tag:data", false); + Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data"); + Corpus2::Tag t2 = tagset->parse_simple_tag("some:tog:data"); + Corpus2::Tag t3 = tagset->parse_simple_tag("same:tag:data"); BOOST_CHECK(tagset->tag_is_singular(t)); BOOST_CHECK_EQUAL(tagset->tag_size(t), 1); BOOST_CHECK(tagset->tag_is_singular(t2)); @@ -200,7 +200,7 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F ) t.add_pos(t3.get_pos()); BOOST_CHECK(!tagset->tag_is_singular(t)); BOOST_CHECK_EQUAL(tagset->tag_size(t), 4); - Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", true); + Corpus2::Tag t4 = tagset->parse_simple_tag("same:other:thang", Corpus2::Tagset::ParseLoose); t.add_values(t4.get_values() & tagset->get_attribute_mask(std::string("A"))); BOOST_CHECK_EQUAL(tagset->tag_size(t), 6); std::vector<Corpus2::Tag> tags = tagset->split_tag(t); @@ -214,8 +214,8 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F ) BOOST_FIXTURE_TEST_CASE( s, F ) { - Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false); - Corpus2::Tag t2 = tagset->parse_simple_tag("same:tog:data", false); + Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data"); + Corpus2::Tag t2 = tagset->parse_simple_tag("same:tog:data"); BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(t), "some,tag,data"); BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(t2), "same,tog,data"); Corpus2::Tag t3 = t.get_combined(t2);