diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cbaa9dab3608c4b012b473b40beac084647aadd6..3b937f6d286640452a10a69774a2443155591412 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,7 @@
 PROJECT(corpus2)
 
 set(corpus2_ver_major "1")
-set(corpus2_ver_minor "8")
+set(corpus2_ver_minor "9")
 set(corpus2_ver_patch "0")
 
 set(CORPUS2_VERSION "${corpus2_ver_major}.${corpus2_ver_minor}.${corpus2_ver_patch}")
diff --git a/src/libcorpus2/lexeme.cpp b/src/libcorpus2/lexeme.cpp
index b6b6650956e74bdf439995cd6975f0982d4c41b3..2fcfea3334b75780faf6370ff9ee0660df7125be 100644
--- a/src/libcorpus2/lexeme.cpp
+++ b/src/libcorpus2/lexeme.cpp
@@ -64,4 +64,13 @@ bool Lexeme::DisamblessComparator::operator()(const Lexeme& l, const Lexeme& oth
 	return l.lemma_ == other.lemma_ && l.tag_ == other.tag_;
 }
 
+size_t hash_value(const Lexeme &lexeme)
+{
+	std::size_t seed = 0;
+	boost::hash_combine(seed, lexeme.lemma_utf8());
+	boost::hash_combine(seed, lexeme.tag());
+	boost::hash_combine(seed, lexeme.is_disamb());
+	return seed;
+}
+
 } /* end ns Corpus2 */
diff --git a/src/libcorpus2/lexeme.h b/src/libcorpus2/lexeme.h
index 8b9084c3aafbd97e38adbe5d2a8d63e97bfc327d..48189685606b02cdbc0627d73b3c9280b411385c 100644
--- a/src/libcorpus2/lexeme.h
+++ b/src/libcorpus2/lexeme.h
@@ -138,6 +138,8 @@ private:
 	bool disamb_;
 };
 
+size_t hash_value(const Lexeme &lexeme);
+
 } /* end ns Corpus2 */
 
 #endif // LIBCORPUS2_LEXEME_H
diff --git a/src/libcorpus2/token.cpp b/src/libcorpus2/token.cpp
index 857980e9500aa66b0c8ff8bfb1077f74a87f482d..2afaf81a3c601825ab0120c1b16642132f62c72a 100644
--- a/src/libcorpus2/token.cpp
+++ b/src/libcorpus2/token.cpp
@@ -157,4 +157,13 @@ void Token::create_metadata()
 	metadata_ = boost::make_shared<TokenMetaData>();
 }
 
+size_t hash_value(const Token &token)
+{
+	std::size_t seed = 0;
+	boost::hash_combine(seed, token.orth_utf8());
+	boost::hash_combine(seed, token.wa());
+	boost::hash_combine(seed, token.lexemes());
+	return seed;
+}
+
 } /* end ns Corpus2 */
diff --git a/src/libcorpus2/token.h b/src/libcorpus2/token.h
index ea2ed5bf99c4e9e289ef3d52283c325ce3c31f94..6be8ec2f0902d735755c70ea58de4cc6afac4a5f 100644
--- a/src/libcorpus2/token.h
+++ b/src/libcorpus2/token.h
@@ -233,6 +233,8 @@ private:
 	boost::shared_ptr<TokenMetaData> metadata_;
 };
 
+size_t hash_value(const Token &token);
+
 } /* end ns Corpus2 */
 
 #endif // LIBCORPUS2_TOKEN_H
diff --git a/src/swig/token.i b/src/swig/token.i
index 2cde15e0e74ee7f7fac370b6b591409491da51f9..0aa11fd9d96652fff105175ab4952157630904eb 100644
--- a/src/swig/token.i
+++ b/src/swig/token.i
@@ -74,7 +74,13 @@ namespace Corpus2 {
 		void create_metadata();
 	};
 
+	size_t hash_value(const Token &token);
+
 	%extend Token {
+		long __hash__() {
+			return (long) hash_value(*self);
+		}
+
 		// otherwise x != y wont trigger operator==
 		%pythoncode %{
 		def __ne__(self, other):
diff --git a/src/tests/python/data/ccl01.xml b/src/tests/python/data/ccl01.xml
new file mode 100644
index 0000000000000000000000000000000000000000..fd05dc0c5ae395673fec2b3e707ce6ff89dbce4f
--- /dev/null
+++ b/src/tests/python/data/ccl01.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chunkList SYSTEM "ccl.dtd">
+<chunkList>
+ <chunk id="ch1">
+  <sentence id="sent1">
+   <tok>
+    <orth>historii</orth>
+    <lex disamb="1">
+     <base>historia</base>
+     <ctag>subst:sg:gen:f</ctag>
+    </lex>
+   </tok>
+   <tok>
+    <orth>historii</orth>
+    <lex disamb="1">
+     <base>historia</base>
+     <ctag>subst:sg:gen:f</ctag>
+    </lex>
+   </tok>
+   <tok>
+    <orth>historiami</orth>
+    <lex disamb="1">
+     <base>historia</base>
+     <ctag>subst:sg:gen:f</ctag>
+    </lex>
+   </tok>
+   <tok>
+    <orth>historiami</orth>
+    <lex disamb="1">
+     <base>historia</base>
+     <ctag>subst:pl:inst:f</ctag>
+    </lex>
+   </tok>
+  </sentence>
+  <sentence id="sent2">
+   <tok>
+    <orth>historii</orth>
+    <lex disamb="1">
+     <base>historia</base>
+     <ctag>subst:sg:gen:f</ctag>
+    </lex>
+   </tok>
+   <tok>
+    <orth>historii</orth>
+    <lex disamb="1">
+     <base>Historia</base>
+     <ctag>subst:sg:gen:f</ctag>
+    </lex>
+   </tok>
+   <tok>
+    <orth>historiami</orth>
+    <lex disamb="1">
+     <base>historia</base>
+     <ctag>subst:sg:gen:f</ctag>
+    </lex>
+   </tok>
+   <ns/>
+  </sentence>
+ </chunk>
+</chunkList>
diff --git a/src/tests/python/requirements-test.txt b/src/tests/python/requirements-test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e823d75e1e01f5a36cd4ffbc10bd71e80e71280
--- /dev/null
+++ b/src/tests/python/requirements-test.txt
@@ -0,0 +1 @@
+cclutils
diff --git a/src/tests/python/test_token_hash.py b/src/tests/python/test_token_hash.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed8ad0eb8446058dceb2265cd4572e93217545b
--- /dev/null
+++ b/src/tests/python/test_token_hash.py
@@ -0,0 +1,49 @@
+from collections import defaultdict
+
+import cclutils as ccl
+
+
+test_doc = "data/ccl01.xml"
+
+doc = ccl.read(test_doc)
+
+toks = [
+    t for p in doc.paragraphs() for s in p.sentences() for t in s.tokens()
+]
+
+all_toks = set(toks)
+assert len(all_toks) == 5
+
+# check if results of hash function and `==` operator are consistent
+for t1 in toks:
+    for t2 in toks:
+        are_equal = t1 == t2
+        have_equal_hashes = hash(t1) == hash(t2)
+        assert (
+            not any((are_equal, have_equal_hashes))
+            or all((are_equal, have_equal_hashes))
+        )
+
+toks2 = [
+    t for p in doc.paragraphs() for s in p.sentences() for t in s.tokens()
+]
+for t1, t2 in zip(toks, toks2):
+    assert hash(t1) == hash(t2)
+
+assert hash(toks[1]) == hash(toks[4])  # same toks in different sents
+assert hash(toks[2]) == hash(toks[6])  # same toks in different sents
+assert hash(toks[0]) != hash(toks[1])  # same tokens except whitespace
+assert hash(toks[1]) != hash(toks[2])  # different orth
+assert hash(toks[2]) != hash(toks[3])  # different ctag
+assert hash(toks[4]) != hash(toks[5])  # different base
+
+tok_pos_idx = defaultdict(list)
+for i, t in enumerate(toks):
+    tok_pos_idx[t].append(i)
+assert tok_pos_idx == {
+    toks[0]: [0],
+    toks[1]: [1, 4],
+    toks[2]: [2, 6],
+    toks[3]: [3],
+    toks[5]: [5],
+}
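
Note (illustrative, not part of the patch): the free hash_value() overloads added above are what boost::hash picks up through argument-dependent lookup; this is also why the Token overload can hash the lexeme container returned by token.lexemes() once the Lexeme overload exists. A minimal C++ sketch of that mechanism follows. It assumes Lexeme is constructible from a lemma plus a default-constructed Tag and has operator==; both are assumptions for illustration, not confirmed by this patch.

// Illustrative sketch only. Assumes Lexeme(const UnicodeString&, const Tag&)
// and a default-constructible Tag; real code would obtain the Tag from a Tagset.
#include <cstddef>
#include <boost/functional/hash.hpp>
#include <boost/unordered_set.hpp>
#include <libcorpus2/lexeme.h>
#include <unicode/unistr.h>

int main()
{
	Corpus2::Tag tag;  // placeholder tag, for illustration only
	Corpus2::Lexeme lex(icu::UnicodeString::fromUTF8("historia"), tag);

	// boost::hash<T> finds Corpus2::hash_value() via ADL, so the type can
	// now serve as a key in Boost's unordered containers.
	boost::unordered_set<Corpus2::Lexeme> seen;
	seen.insert(lex);

	std::size_t direct = hash_value(lex);                     // ADL call
	std::size_t via_boost = boost::hash<Corpus2::Lexeme>()(lex);
	return direct == via_boost ? 0 : 1;                       // both paths agree
}

On the Python side the same values surface through the new Token.__hash__ binding in token.i, which is what test_token_hash.py exercises via cclutils.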