Skip to content
Snippets Groups Projects
Select Git revision
  • aa315d516f3379c67000466c2b9c84ad50e25e23
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

token.cpp

Blame
  • test_token_hash.py 1.28 KiB
    from collections import defaultdict
    
    import cclutils as ccl
    
    
    # Checks that token hashing agrees with token equality on a small CCL
    # fixture, and that tokens can therefore be used as set/dict keys.
    test_doc = "data/ccl01.xml"
    
    doc = ccl.read(test_doc)
    
    
    def _all_tokens(document):
        """Return every token of *document* as one flat list, in iteration order."""
        collected = []
        for paragraph in document.paragraphs():
            for sentence in paragraph.sentences():
                collected.extend(sentence.tokens())
        return collected
    
    
    toks = _all_tokens(doc)
    
    # The fixture is expected to contain exactly five distinct tokens.
    all_toks = set(toks)
    assert len(all_toks) == 5
    
    # For every ordered pair, `==` and hash() must give the same verdict:
    # either both report "equal" or both report "different".
    for left in toks:
        for right in toks:
            assert bool(left == right) == (hash(left) == hash(right))
    
    # Walking the same document a second time must yield the same hashes,
    # position by position.
    toks2 = _all_tokens(doc)
    assert all(hash(first) == hash(second) for first, second in zip(toks, toks2))
    
    # Index pairs whose tokens must share a hash.
    same_hash_pairs = [
        (1, 4),  # same toks in different sents
        (2, 6),  # same toks in different sents
    ]
    for a, b in same_hash_pairs:
        assert hash(toks[a]) == hash(toks[b])
    
    # Index pairs whose tokens must NOT share a hash.
    different_hash_pairs = [
        (0, 1),  # same tokens except whitespaces
        (1, 2),  # different orth
        (2, 3),  # different ctag
        (4, 5),  # different base
    ]
    for a, b in different_hash_pairs:
        assert hash(toks[a]) != hash(toks[b])
    
    # Using tokens as dict keys must group equal tokens under a single key.
    tok_pos_idx = defaultdict(list)
    for position, token in enumerate(toks):
        tok_pos_idx[token].append(position)
    
    expected_positions = {
        toks[0]: [0],
        toks[1]: [1, 4],
        toks[2]: [2, 6],
        toks[3]: [3],
        toks[5]: [5],
    }
    assert tok_pos_idx == expected_positions