Keep the disamb-marked lexeme when removing duplicate lexemes

Summary of the three hunks groups below:

  * Lexeme::operator< -- the final tie-break (lemma and tag equal) changes
    from `disamb_ < other.disamb_` to `disamb_ && !other.disamb_`, so a
    lexeme whose disamb_ flag is set now orders BEFORE an otherwise equal
    lexeme whose flag is not set.
  * Lexeme::DisamblessComparator -- new nested functor that returns true
    when two lexemes have equal lemma_ and tag_, ignoring disamb_.
    Despite the name it is an equality predicate, not an ordering.
  * Token::remove_duplicate_lexemes() -- std::unique is now called with
    DisamblessComparator instead of the default operator==. std::unique
    keeps the first element of every run of equivalent neighbours, and the
    preceding std::sort (using the operator< above) places the disamb-set
    lexeme first in such a run, so that is the copy that survives.

Review notes:
  * The definition names its parameters (l, other) while the declaration
    in lexeme.h uses (l1, l2); harmless, but worth unifying.
  * NOTE: the leading whitespace of the context lines was lost in the copy
    this patch was restored from and has been reconstructed with tabs.
    If the context does not match, apply with
    `git apply --ignore-whitespace` (or `patch -l`).

diff --git a/libcorpus2/lexeme.cpp b/libcorpus2/lexeme.cpp
index 9714860c18b80e4458905a1fc434790407f078db..cd5361328ed333c2f5b30770c86cc8004325203f 100644
--- a/libcorpus2/lexeme.cpp
+++ b/libcorpus2/lexeme.cpp
@@ -28,7 +28,7 @@ bool Lexeme::operator<(const Lexeme& other) const
 		|| (lemma_ == other.lemma_
 		&& (tag_ < other.tag_
 		|| (tag_ == other.tag_
-		&& disamb_ < other.disamb_)));
+		&& disamb_ && !other.disamb_)));
 }
 
 bool Lexeme::operator==(const Lexeme& other) const
@@ -37,4 +37,9 @@ bool Lexeme::operator==(const Lexeme& other) const
 		disamb_ == other.disamb_;
 }
 
+bool Lexeme::DisamblessComparator::operator()(const Lexeme& l, const Lexeme& other) const
+{
+	return l.lemma_ == other.lemma_ && l.tag_ == other.tag_;
+}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/lexeme.h b/libcorpus2/lexeme.h
index 1e70d17278cab81c7d5cf848ce7ac27d48a72e76..0a508b1b0e4f61f5f08ef312ec26173304232f93 100644
--- a/libcorpus2/lexeme.h
+++ b/libcorpus2/lexeme.h
@@ -88,6 +88,14 @@ public:
 	 */
 	bool operator==(const Lexeme& other) const;
 
+	/**
+	 * Disamb-ignoring lexeme comparison
+	 */
+	struct DisamblessComparator
+	{
+		bool operator()(const Lexeme& l1, const Lexeme& l2) const;
+	};
+
 private:
 	/// The lemma -- basic form
 	//boost::flyweight<UnicodeString> lemma_;
diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp
index f4317c51f404dece3566dc593c78cae2d7f34b09..ac86d5c6886992ca0f5fd80bc4bb3430e356babb 100644
--- a/libcorpus2/token.cpp
+++ b/libcorpus2/token.cpp
@@ -84,7 +84,7 @@ bool Token::remove_duplicate_lexemes()
 {
 	size_t old_size = lexemes_.size();
 	std::sort(lexemes_.begin(), lexemes_.end());
-	lexemes_.erase(std::unique(lexemes_.begin(), lexemes_.end()),
+	lexemes_.erase(std::unique(lexemes_.begin(), lexemes_.end(), Lexeme::DisamblessComparator()),
 		lexemes_.end());
 	return old_size != lexemes_.size();
 }