# test_token_hash.py
from collections import defaultdict
import cclutils as ccl
# Fixture document. The assertions below expect it to yield seven tokens
# (indices 0..6), five of which are distinct.
test_doc = "data/ccl01.xml"
doc = ccl.read(test_doc)

# Flatten the document into one list of tokens, in iteration order.
toks = []
for paragraph in doc.paragraphs():
    for sentence in paragraph.sentences():
        toks.extend(sentence.tokens())

# Equal tokens must collapse into a single set member.
all_toks = set(toks)
assert len(all_toks) == 5

# `==` and `hash()` must agree for every ordered pair of tokens: either both
# say "same" or both say "different".
for lhs in toks:
    for rhs in toks:
        same_value = bool(lhs == rhs)
        same_hash = hash(lhs) == hash(rhs)
        assert same_value == same_hash

# Walking the document a second time must give the same hashes, position by
# position.
toks2 = []
for paragraph in doc.paragraphs():
    for sentence in paragraph.sentences():
        toks2.extend(sentence.tokens())

for first_pass, second_pass in zip(toks, toks2):
    assert hash(first_pass) == hash(second_pass)

# Index pairs whose tokens must share a hash.
for left, right in (
    (1, 4),  # same token in different sentences
    (2, 6),  # same token in different sentences
):
    assert hash(toks[left]) == hash(toks[right])

# Index pairs whose tokens must NOT share a hash.
for left, right in (
    (0, 1),  # same token except for whitespace
    (1, 2),  # different orth
    (2, 3),  # different ctag
    (4, 5),  # different base
):
    assert hash(toks[left]) != hash(toks[right])

# Tokens used as dict keys: equal tokens must end up under the same key, so
# each key collects every position at which that token occurs.
tok_pos_idx = defaultdict(list)
for position, token in enumerate(toks):
    tok_pos_idx[token].append(position)

expected_positions = {
    toks[0]: [0],
    toks[1]: [1, 4],
    toks[2]: [2, 6],
    toks[3]: [3],
    toks[5]: [5],
}
assert tok_pos_idx == expected_positions