From 2702b4e1dc855ab2cce727caabf599245dd9919c Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Mon, 15 Nov 2010 10:43:31 +0100 Subject: [PATCH] Implement TSet::is_subset_of, equals and intersects Change StrSet::is_subset_of to correctly handle empty sets, add some comments Add TSet::insert_symbol Add tests for new functions Bump required corpus2 version to 0.1.2 due to a fixed bug there --- libwccl/CMakeLists.txt | 2 +- libwccl/values/strset.cpp | 19 +++++++------ libwccl/values/strset.h | 8 ++++++ libwccl/values/tset.cpp | 5 ++++ libwccl/values/tset.h | 25 ++++++++++------ tests/values.cpp | 60 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 101 insertions(+), 18 deletions(-) diff --git a/libwccl/CMakeLists.txt b/libwccl/CMakeLists.txt index b69bbb4..8c68a29 100644 --- a/libwccl/CMakeLists.txt +++ b/libwccl/CMakeLists.txt @@ -4,7 +4,7 @@ PROJECT(wccl) include_directories( ${CMAKE_CURRENT_BINARY_DIR}/include/ ) -find_package(Corpus2 0.1.1 REQUIRED) +find_package(Corpus2 0.1.2 REQUIRED) set(LIBS ${LIBS} ${Corpus2_LIBRARY}) find_package(PwrUtils 0.0.3 REQUIRED) diff --git a/libwccl/values/strset.cpp b/libwccl/values/strset.cpp index ed6b4bc..02489f4 100644 --- a/libwccl/values/strset.cpp +++ b/libwccl/values/strset.cpp @@ -24,29 +24,32 @@ std::string StrSet::to_raw_string() const } bool StrSet::intersects(const StrSet &other) const { - if(empty() || other.empty()) { + if (empty() || other.empty()) { return false; } - //we just want to check if there is an intersection, no + //We just want to check if there is an intersection, no //need to actually compute it to check if it's empty. - //doing it like below sounds faster than, say, sorting - //the sets and using set_intersection + //Doing it like below sounds faster than, say, sorting + //the sets and using set_intersection. 
+ //it's faster to iterate through the smaller set and check in + //the larger than it is to do the opposite, hence the &?: below const set_t& smaller = size() < other.size() ? set_ : other.set_; const set_t& bigger = size() < other.size() ? other.set_ : set_; foreach (const UnicodeString& u, smaller) { - if(bigger.find(u) != bigger.end()) { + if (bigger.find(u) != bigger.end()) { return true; } } return false; } -bool StrSet::is_subset_of(const StrSet &other) const { - if(empty() || size() > other.size()) { +bool StrSet::is_subset_of(const StrSet &other) const +{ + if (size() > other.size()) { return false; } foreach (const UnicodeString& u, set_) { - if(other.set_.find(u) == other.set_.end()) { + if (other.set_.find(u) == other.set_.end()) { return false; } } diff --git a/libwccl/values/strset.h b/libwccl/values/strset.h index e984bd0..3545e80 100644 --- a/libwccl/values/strset.h +++ b/libwccl/values/strset.h @@ -56,8 +56,16 @@ public: return set_.empty(); } + /** + * @return true if each string from this set exists in the other set + * (note that an empty set is a subset of anything) + */ bool is_subset_of(const StrSet& other) const; + /** + * @return true if there is at least one common string between this set and + * the other set (an empty set intersects with nothing) + */ bool intersects(const StrSet& other) const; bool equals(const StrSet& other) const { diff --git a/libwccl/values/tset.cpp b/libwccl/values/tset.cpp index 9573bcb..8b1090e 100644 --- a/libwccl/values/tset.cpp +++ b/libwccl/values/tset.cpp @@ -15,4 +15,9 @@ std::string TSet::to_string(const Corpus2::Tagset& tagset) const return tagset.tag_to_symbol_string(tag_); } +void TSet::insert_symbol(const Corpus2::Tagset& tagset, const std::string& s) +{ + tag_.combine_with(tagset.parse_symbol(s)); +} + } /* end ns Wccl */ diff --git a/libwccl/values/tset.h b/libwccl/values/tset.h index 3c7d125..6d6813b 100644 --- a/libwccl/values/tset.h +++ b/libwccl/values/tset.h @@ -33,23 +33,30 @@ public: return 
tag_; } + void insert_symbol(const Corpus2::Tagset& tagset, const std::string& s); + bool empty() const { return tag_.is_null(); } - bool is_subset_of(const TSet& /*tset*/) const { - //TODO: implement this - return false; + /** + * @return true if each tagset symbol from this set exists in the other set + * (note that an empty set is a subset of anything) + */ + bool is_subset_of(const TSet& other) const { + return tag_.get_masked(other.tag_) == tag_; } - bool intersects(const TSet& /*tset*/) const { - //TODO: implement this - return false; + /** + * @return true if there is at least one common symbol between this set and + * the other set (an empty set intersects with nothing) + */ + bool intersects(const TSet& other) const { + return !tag_.get_masked(other.tag_).is_null(); } - bool equals(const TSet& /*tset*/) const { - //TODO: implement this - return false; + bool equals(const TSet& other) const { + return tag_ == other.tag_; } std::string to_string(const Corpus2::Tagset &) const; diff --git a/tests/values.cpp b/tests/values.cpp index 6d2ac2d..78df7ec 100644 --- a/tests/values.cpp +++ b/tests/values.cpp @@ -1,5 +1,6 @@ #include <boost/test/unit_test.hpp> #include <boost/bind.hpp> +#include <libcorpus2/tagsetmanager.h> #include <libwccl/variables.h> @@ -25,4 +26,63 @@ BOOST_AUTO_TEST_CASE(tsetz) BOOST_CHECK_EQUAL(v.get_type_name(), TSet::type_name); } +BOOST_AUTO_TEST_CASE(strset_ops) +{ + StrSet s1, s2; + BOOST_CHECK(s1.equals(s2)); + BOOST_CHECK(s1.is_subset_of(s2)); + BOOST_CHECK(s2.is_subset_of(s1)); + BOOST_CHECK(!s1.intersects(s2)); + s1.insert_utf8("aaa"); + BOOST_CHECK(!s1.equals(s2)); + BOOST_CHECK(!s1.is_subset_of(s2)); + BOOST_CHECK(s2.is_subset_of(s1)); + BOOST_CHECK(!s1.intersects(s2)); + s2.insert_utf8("bbb"); + BOOST_CHECK(!s1.equals(s2)); + BOOST_CHECK(!s1.is_subset_of(s2)); + BOOST_CHECK(!s2.is_subset_of(s1)); + BOOST_CHECK(!s1.intersects(s2)); + s2.insert_utf8("aaa"); + BOOST_CHECK(!s1.equals(s2)); + BOOST_CHECK(s1.is_subset_of(s2)); + 
BOOST_CHECK(!s2.is_subset_of(s1)); + BOOST_CHECK(s1.intersects(s2)); + s1.insert_utf8("bbb"); + BOOST_CHECK(s1.equals(s2)); + BOOST_CHECK(s1.is_subset_of(s2)); + BOOST_CHECK(s2.is_subset_of(s1)); + BOOST_CHECK(s1.intersects(s2)); +} + +BOOST_AUTO_TEST_CASE(tset_ops) +{ + TSet s1, s2; + const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi"); + BOOST_CHECK(s1.equals(s2)); + BOOST_CHECK(s1.is_subset_of(s2)); + BOOST_CHECK(s2.is_subset_of(s1)); + BOOST_CHECK(!s1.intersects(s2)); + s1.insert_symbol(tagset, "subst"); + BOOST_CHECK(!s1.equals(s2)); + BOOST_CHECK(!s1.is_subset_of(s2)); + BOOST_CHECK(s2.is_subset_of(s1)); + BOOST_CHECK(!s1.intersects(s2)); + s2.insert_symbol(tagset, "pl"); + BOOST_CHECK(!s1.equals(s2)); + BOOST_CHECK(!s1.is_subset_of(s2)); + BOOST_CHECK(!s2.is_subset_of(s1)); + BOOST_CHECK(!s1.intersects(s2)); + s2.insert_symbol(tagset, "subst"); + BOOST_CHECK(!s1.equals(s2)); + BOOST_CHECK(s1.is_subset_of(s2)); + BOOST_CHECK(!s2.is_subset_of(s1)); + BOOST_CHECK(s1.intersects(s2)); + s1.insert_symbol(tagset, "pl"); + BOOST_CHECK(s1.equals(s2)); + BOOST_CHECK(s1.is_subset_of(s2)); + BOOST_CHECK(s2.is_subset_of(s1)); + BOOST_CHECK(s1.intersects(s2)); +} + BOOST_AUTO_TEST_SUITE_END() -- GitLab