Skip to content
Snippets Groups Projects
Commit c9a46af0 authored by ilor's avatar ilor
Browse files

Add Tagset::parse_symbol and tag_to_symbol_string and related functions. Bumps version of corpus2.

parent 011d10e6
Branches
No related tags found
No related merge requests found
......@@ -3,7 +3,7 @@ PROJECT(corpus2)
set(corpus2_ver_major "0")
set(corpus2_ver_minor "1")
set(corpus2_ver_patch "0")
set(corpus2_ver_patch "1")
if(NOT LIBCORPUS2_SRC_DATA_DIR)
......
......@@ -95,6 +95,20 @@ std::string Tagset::id_string(const Tag& tag) const
return ss.str();
}
/**
 * Parse a single tagset symbol into the corresponding (partial) tag.
 *
 * The symbol is looked up, in order, as a POS name, an attribute name and a
 * value name. The first lookup that yields a non-empty mask decides the
 * result. If no lookup matches, the tag is built from whatever mask
 * get_value_mask() returned for the string.
 *
 * @param s the symbol to parse
 * @return Tag(pos_mask) for a POS name, otherwise Tag(0, mask) where mask
 *         comes from the attribute or value lookup
 */
Tag Tagset::parse_symbol(const std::string& s) const
{
	mask_t m = get_pos_mask(s);
	// A non-empty POS mask means the symbol named a POS. This check was
	// previously inverted (m.none()), which made every non-POS symbol return
	// an empty tag before the attribute and value lookups were attempted.
	if (m.any()) {
		return Tag(m);
	}
	m = get_attribute_mask(s);
	if (m.any()) {
		return Tag(0, m);
	}
	m = get_value_mask(s);
	return Tag(0, m);
}
void Tagset::parse_tag(const string_range &s, bool allow_extra,
boost::function<void(const Tag &)> sink) const
{
......@@ -369,6 +383,36 @@ std::string Tagset::tag_to_no_opt_string(const Tag &tag) const
return ss.str();
}
std::vector<std::string> Tagset::tag_to_symbol_string_vector(const Tag& tag,
bool compress_attributes) const
{
std::vector<std::string> ret;
foreach (mask_t p, PwrNlp::set_bits(tag.get_pos())) {
ret.push_back(get_pos_name(p));
}
mask_t vals = tag.get_values();
if (compress_attributes) {
for (idx_t ai = 0; ai < attribute_count(); ++ai) {
mask_t amask = get_attribute_mask(ai);
if ((vals & amask) == amask) {
vals ^= amask;
ret.push_back(get_attribute_name(ai));
}
}
}
foreach (mask_t p, PwrNlp::set_bits(vals)) {
ret.push_back(get_value_name(p));
}
return ret;
}
/**
 * Join the symbol names of a tag into one string, separated by commas.
 *
 * @param tag the tag to describe
 * @param compress_attributes forwarded to tag_to_symbol_string_vector()
 * @return the names from tag_to_symbol_string_vector() joined with ","
 */
std::string Tagset::tag_to_symbol_string(const Tag& tag,
		bool compress_attributes) const
{
	const std::vector<std::string> symbols =
		tag_to_symbol_string_vector(tag, compress_attributes);
	return boost::algorithm::join(symbols, ",");
}
size_t Tagset::tag_size(const Tag& tag) const
{
size_t s = PwrNlp::count_bits_set(tag.get_pos());
......
......@@ -104,6 +104,19 @@ public:
*/
static Tagset from_data(const char*);
/**
 * Parse a single tagset symbol and return the corresponding (partial) tag.
 *
 * Pos and value names result in a single-bit-set tag, attribute names
 * result in a tag with all values from that attribute set.
 *
 * The resulting tags will usually be invalid as standalone tags, so
 * there is no validation performed.
 *
 * An invalid string will result in a null tag being returned.
 *
 * @param s the symbol (POS, attribute or value name) to parse
 * @return the (partial) tag for the symbol
 */
Tag parse_symbol(const std::string& s) const;
/**
* Tag parsing -- functional version, whole tag string.
*
......@@ -250,6 +263,28 @@ public:
*/
std::string tag_to_no_opt_string(const Tag &tag) const;
/**
 * Create and return string representations of the symbols contained
 * within a tag when treated as separate tagset symbols.
 *
 * There will be one string for each POS set in the tag, and enough symbols
 * to cover all the values. If compress_attributes is false, there will be
 * one value name per value set. If compress_attributes is true, in case
 * there are attributes with all values set in the tag, the name of the
 * attribute will be used instead of separate names of the attribute's
 * values.
 *
 * @param tag the tag to describe
 * @param compress_attributes use attribute names for fully-set attributes
 * @return the symbol names
 */
std::vector<std::string> tag_to_symbol_string_vector(const Tag& tag,
	bool compress_attributes = true) const;
/**
 * Return a comma-separated string representation of all symbols contained
 * within a tag.
 *
 * @param tag the tag to describe
 * @param compress_attributes use attribute names for fully-set attributes
 * @return the symbol names joined with commas
 *
 * @see tag_to_symbol_string_vector.
 */
std::string tag_to_symbol_string(const Tag& tag,
	bool compress_attributes = true) const;
/**
* Compute the number of singular tags that can be represented by the given
* tag, with the following restrictions:
......
......@@ -196,6 +196,26 @@ BOOST_FIXTURE_TEST_CASE( tag_size, F )
BOOST_CHECK(tt == t);
}
// Checks the symbol-string output for two simple tags and for their
// combination (compared order-independently).
BOOST_FIXTURE_TEST_CASE( s, F )
{
	Corpus2::Tag first = tagset->parse_simple_tag("some:tag:data", false);
	Corpus2::Tag second = tagset->parse_simple_tag("same:tog:data", false);

	// Each simple tag on its own maps to its three symbols, comma-joined.
	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(first), "some,tag,data");
	BOOST_CHECK_EQUAL(tagset->tag_to_symbol_string(second), "same,tog,data");

	// The combined tag should list the union of symbols; both sides are
	// sorted so the comparison does not depend on output order.
	const char* const expected_names[] = {
		"some", "same", "tog", "tag", "data"
	};
	std::vector<std::string> expected(expected_names,
		expected_names + sizeof(expected_names) / sizeof(expected_names[0]));
	std::sort(expected.begin(), expected.end());

	Corpus2::Tag combined = first.get_combined(second);
	std::vector<std::string> actual =
		tagset->tag_to_symbol_string_vector(combined);
	std::sort(actual.begin(), actual.end());

	BOOST_CHECK_EQUAL_COLLECTIONS(actual.begin(), actual.end(),
		expected.begin(), expected.end());
}
BOOST_AUTO_TEST_SUITE_END()
BOOST_AUTO_TEST_CASE(bs_split)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment