From c9a46af06061e9da82fb203ec02336b549da05c6 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Thu, 4 Nov 2010 17:32:01 +0100
Subject: [PATCH] Add Tagset::parse_symbol and tag_to_symbol_string and related
 functions. Bumps version of corpus2.

---
 libcorpus2/CMakeLists.txt |  2 +-
 libcorpus2/tagset.cpp     | 49 +++++++++++++++++++++++++++++++++++++++
 libcorpus2/tagset.h       | 35 ++++++++++++++++++++++++++++
 tests/tag_split.cpp       | 25 ++++++++++++++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index e6873f9..ae39d3b 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -3,7 +3,7 @@ PROJECT(corpus2)
 
 set(corpus2_ver_major "0")
 set(corpus2_ver_minor "1")
-set(corpus2_ver_patch "0")
+set(corpus2_ver_patch "1")
 
 
 if(NOT LIBCORPUS2_SRC_DATA_DIR)
diff --git a/libcorpus2/tagset.cpp b/libcorpus2/tagset.cpp
index 538d2c2..8fbb296 100644
--- a/libcorpus2/tagset.cpp
+++ b/libcorpus2/tagset.cpp
@@ -95,6 +95,23 @@ std::string Tagset::id_string(const Tag& tag) const
 	return ss.str();
 }
 
+Tag Tagset::parse_symbol(const std::string& s) const
+{
+	// Try the symbol as a POS name first, then as an attribute name and
+	// finally as a value name; the first non-empty mask wins.
+	mask_t m = get_pos_mask(s);
+	if (m.any()) {
+		return Tag(m);
+	}
+	m = get_attribute_mask(s);
+	if (m.any()) {
+		return Tag(0, m);
+	}
+	// If no value matches either, the mask stays empty and a null tag results.
+	m = get_value_mask(s);
+	return Tag(0, m);
+}
+
 void Tagset::parse_tag(const string_range &s, bool allow_extra,
 		boost::function<void(const Tag &)> sink) const
 {
@@ -369,6 +386,38 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const
 	return ss.str();
 }
 
+std::vector<std::string> Tagset::tag_to_symbol_string_vector(const Tag& tag,
+		bool compress_attributes) const
+{
+	std::vector<std::string> ret;
+	foreach (mask_t p, PwrNlp::set_bits(tag.get_pos())) {
+		ret.push_back(get_pos_name(p));
+	}
+	mask_t vals = tag.get_values();
+	if (compress_attributes) {
+		for (idx_t ai = 0; ai < attribute_count(); ++ai) {
+			mask_t amask = get_attribute_mask(ai);
+			if ((vals & amask) == amask) {
+				// every value of this attribute is set: emit the attribute
+				// name and clear its bits so the values are not listed again
+				vals ^= amask;
+				ret.push_back(get_attribute_name(ai));
+			}
+		}
+	}
+	foreach (mask_t p, PwrNlp::set_bits(vals)) {
+		ret.push_back(get_value_name(p));
+	}
+	return ret;
+}
+
+std::string Tagset::tag_to_symbol_string(const Tag& tag,
+		bool compress_attributes) const
+{
+	return boost::algorithm::join(
+			tag_to_symbol_string_vector(tag, compress_attributes), ",");
+}
+
 size_t Tagset::tag_size(const Tag& tag) const
 {
 	size_t s = PwrNlp::count_bits_set(tag.get_pos());
diff --git a/libcorpus2/tagset.h b/libcorpus2/tagset.h
index 44cfbf2..572e5b4 100644
--- a/libcorpus2/tagset.h
+++ b/libcorpus2/tagset.h
@@ -104,6 +104,19 @@ public:
 	 */
 	static Tagset from_data(const char*);
 
+	/**
+	 * Parse a single tagset symbol and return the corresponding (partial) tag.
+	 *
+	 * Pos and value names result in a single-bit-set tag, attribute names
+	 * result in a tag with all values from that attribute set.
+	 *
+	 * The resulting tags will usually be invalid as standalone tags, so
+	 * there is no validation performed.
+	 *
+	 * An invalid string will result in a null tag being returned.
+	 */
+	Tag parse_symbol(const std::string& s) const;
+
 	/**
 	 * Tag parsing -- functional version, whole tag string.
 	 *
@@ -250,6 +263,28 @@ public:
 	 */
 	std::string tag_to_no_opt_string(const Tag &tag) const;
 
+	/**
+	 * Create and return a string representation of the symbols contained
+	 * within a tag when treated as separate tagset symbols.
+	 *
+	 * There will be one string for each POS set in the tag, and enough symbols
+	 * to cover all the values. If compress_attributes is false, there is one
+	 * value name per value set. If it is true and an attribute has all of its
+	 * values set in the tag, the name of that attribute is used instead of
+	 * the separate names of the attribute's values.
+	 */
+	std::vector<std::string> tag_to_symbol_string_vector(const Tag& tag,
+			bool compress_attributes = true) const;
+
+	/**
+	 * Return a comma-separated string representation of all symbols contained
+	 * within a tag.
+	 *
+	 * @see tag_to_symbol_string_vector.
+	 */
+	std::string tag_to_symbol_string(const Tag& tag,
+			bool compress_attributes = true) const;
+
 	/**
 	 * Compute the number of singular tags that can be represented by the given
 	 * tag, with the following restrictions:
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
index 7197653..b666049 100644
--- a/tests/tag_split.cpp
+++ b/tests/tag_split.cpp
@@ -196,6 +196,31 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F )
 	BOOST_CHECK(tt == t);
 }
 
+BOOST_FIXTURE_TEST_CASE( s, F )
+{
+	Corpus2::Tag t = tagset->parse_simple_tag("some:tag:data", false);
+	Corpus2::Tag t2 = tagset->parse_simple_tag("same:tog:data", false);
+	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(t), "some,tag,data");
+	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(t2), "same,tog,data");
+	Corpus2::Tag t3 = t.get_combined(t2);
+	std::vector<std::string> v = tagset->tag_to_symbol_string_vector(t3);
+	std::sort(v.begin(), v.end());
+	std::vector<std::string> v2;
+	v2.push_back("some");
+	v2.push_back("same");
+	v2.push_back("tog");
+	v2.push_back("tag");
+	v2.push_back("data");
+	std::sort(v2.begin(), v2.end());
+	BOOST_CHECK_EQUAL_COLLECTIONS(v.begin(), v.end(), v2.begin(), v2.end());
+
+	// parse_symbol must map a lone POS or value name back to that same symbol
+	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(
+			tagset->parse_symbol("some")), "some");
+	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(
+			tagset->parse_symbol("tag")), "tag");
+}
+
 BOOST_AUTO_TEST_SUITE_END()
 
 BOOST_AUTO_TEST_CASE(bs_split)
-- 
GitLab