# test_token_hash.py

from collections import defaultdict

import cclutils as ccl


# Fixture document; per the index-based checks below it yields seven tokens,
# five of which are distinct.
test_doc = "data/ccl01.xml"

doc = ccl.read(test_doc)


def _flatten_tokens(document):
    """Return all tokens of *document* as one flat list.

    Tokens are gathered by walking paragraphs, then sentences, then tokens,
    in the order those iterators yield them.
    """
    collected = []
    for paragraph in document.paragraphs():
        for sentence in paragraph.sentences():
            collected.extend(sentence.tokens())
    return collected


toks = _flatten_tokens(doc)

# Building a set relies on both __hash__ and __eq__; duplicates must collapse.
all_toks = set(toks)
assert len(all_toks) == 5

# For every ordered pair of tokens, `==` and hash equality must agree:
# either both hold or neither does.
for left in toks:
    for right in toks:
        tokens_match = bool(left == right)
        hashes_match = hash(left) == hash(right)
        assert tokens_match == hashes_match

# Collecting the tokens a second time must give the same hashes, position by
# position.
toks2 = _flatten_tokens(doc)
assert all(hash(first) == hash(second) for first, second in zip(toks, toks2))

# Index pairs whose tokens are expected to hash identically.
same_hash_pairs = (
    (1, 4),  # same token in different sentences
    (2, 6),  # same token in different sentences
)
for a, b in same_hash_pairs:
    assert hash(toks[a]) == hash(toks[b])

# Index pairs whose tokens are expected to hash differently.
distinct_hash_pairs = (
    (0, 1),  # same tokens except whitespace
    (1, 2),  # different orth
    (2, 3),  # different ctag
    (4, 5),  # different base
)
for a, b in distinct_hash_pairs:
    assert hash(toks[a]) != hash(toks[b])

# Tokens must be usable as dict keys: group positions by token and compare
# against the expected grouping.
tok_pos_idx = defaultdict(list)
for position, token in enumerate(toks):
    tok_pos_idx[token].append(position)

expected_positions = {
    toks[0]: [0],
    toks[1]: [1, 4],
    toks[2]: [2, 6],
    toks[3]: [3],
    toks[5]: [5],
}
assert tok_pos_idx == expected_positions