Skip to content
Snippets Groups Projects
Commit ce32d2fb authored by Arkadiusz Janz's avatar Arkadiusz Janz
Browse files

Merge branch 'token-hash' into 'master'

Make token hashable

See merge request !18
parents 92ecaac7 e0036bd4
Branches
1 merge request!18Make token hashable
Pipeline #4424 passed with stages
in 10 minutes and 8 seconds
PROJECT(corpus2)
set(corpus2_ver_major "1")
set(corpus2_ver_minor "8")
set(corpus2_ver_minor "9")
set(corpus2_ver_patch "0")
set(CORPUS2_VERSION "${corpus2_ver_major}.${corpus2_ver_minor}.${corpus2_ver_patch}")
......
......@@ -64,4 +64,13 @@ bool Lexeme::DisamblessComparator::operator()(const Lexeme& l, const Lexeme& oth
return l.lemma_ == other.lemma_ && l.tag_ == other.tag_;
}
/**
 * Hash a Lexeme.
 *
 * Starting from zero, the value is built by folding in, in this order:
 * the UTF-8 lemma, the tag and the disamb flag. The free-function name
 * hash_value is the extension point that boost::hash looks up.
 *
 * @param lexeme the lexeme to hash
 * @return the combined hash value
 */
size_t hash_value(const Lexeme &lexeme)
{
	std::size_t combined = 0;

	// Keep this order stable: hash_combine mixes each value into the
	// running result, so reordering the calls changes the hash.
	boost::hash_combine(combined, lexeme.lemma_utf8());
	boost::hash_combine(combined, lexeme.tag());
	boost::hash_combine(combined, lexeme.is_disamb());

	return combined;
}
} /* end ns Corpus2 */
......@@ -138,6 +138,8 @@ private:
bool disamb_;
};
size_t hash_value(const Lexeme &lexeme);
} /* end ns Corpus2 */
#endif // LIBCORPUS2_LEXEME_H
......@@ -157,4 +157,13 @@ void Token::create_metadata()
metadata_ = boost::make_shared<TokenMetaData>();
}
/**
 * Hash a Token.
 *
 * Starting from zero, the value is built by folding in, in this order:
 * the UTF-8 orth, the value returned by wa() and the value returned by
 * lexemes(). Nothing else from the token is read here.
 *
 * @param token the token to hash
 * @return the combined hash value
 */
size_t hash_value(const Token &token)
{
	std::size_t combined = 0;

	// Keep this order stable: hash_combine mixes each value into the
	// running result, so reordering the calls changes the hash.
	boost::hash_combine(combined, token.orth_utf8());
	boost::hash_combine(combined, token.wa());
	boost::hash_combine(combined, token.lexemes());

	return combined;
}
} /* end ns Corpus2 */
......@@ -233,6 +233,8 @@ private:
boost::shared_ptr<TokenMetaData> metadata_;
};
size_t hash_value(const Token &token);
} /* end ns Corpus2 */
#endif // LIBCORPUS2_TOKEN_H
......@@ -74,7 +74,13 @@ namespace Corpus2 {
void create_metadata();
};
size_t hash_value(const Token &token);
%extend Token {
// Python-side __hash__: forwards to the C++ hash_value() overload for Token
// and converts its size_t result to long.
long __hash__() {
    return static_cast<long>(hash_value(*self));
}
// otherwise x != y won't trigger operator==
%pythoncode %{
def __ne__(self, other):
......
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk id="ch1">
<sentence id="sent1">
<tok>
<orth>historii</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
</tok>
<tok>
<orth>historii</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
</tok>
<tok>
<orth>historiami</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
</tok>
<tok>
<orth>historiami</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:pl:inst:f</ctag>
</lex>
</tok>
</sentence>
<sentence id="sent2">
<tok>
<orth>historii</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
</tok>
<tok>
<orth>historii</orth>
<lex disamb="1">
<base>Historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
</tok>
<tok>
<orth>historiami</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
</tok>
<ns/>
</sentence>
</chunk>
</chunkList>
cclutils
from collections import defaultdict
import cclutils as ccl
test_doc = "data/ccl01.xml"


def _collect_tokens(document):
    """Return all tokens of `document`, walking paragraphs, then sentences."""
    collected = []
    for paragraph in document.paragraphs():
        for sentence in paragraph.sentences():
            for token in sentence.tokens():
                collected.append(token)
    return collected


doc = ccl.read(test_doc)
toks = _collect_tokens(doc)

# Tokens must be usable as set members; the fixture is expected to contain
# exactly five distinct tokens.
all_toks = set(toks)
assert len(all_toks) == 5

# check if results of hash function and `==` operator are consistent:
# for every pair, "equal" and "same hash" must be both true or both false
for left in toks:
    for right in toks:
        are_equal = left == right
        have_equal_hashes = hash(left) == hash(right)
        assert bool(are_equal) == have_equal_hashes

# Walking the document a second time must yield tokens with the same hashes.
toks2 = _collect_tokens(doc)
for first_pass, second_pass in zip(toks, toks2):
    assert hash(first_pass) == hash(second_pass)

assert hash(toks[1]) == hash(toks[4])  # same toks in different sents
assert hash(toks[2]) == hash(toks[6])  # same toks in different sents
assert hash(toks[0]) != hash(toks[1])  # same tokens except whitespaces
assert hash(toks[1]) != hash(toks[2])  # different orth
assert hash(toks[2]) != hash(toks[3])  # different ctag
assert hash(toks[4]) != hash(toks[5])  # different base

# Tokens must also work as dict keys: group positions by token.
tok_pos_idx = defaultdict(list)
for position, token in enumerate(toks):
    tok_pos_idx[token].append(position)

expected_positions = {
    toks[0]: [0],
    toks[1]: [1, 4],
    toks[2]: [2, 6],
    toks[3]: [3],
    toks[5]: [5],
}
assert tok_pos_idx == expected_positions
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment