Skip to content
Snippets Groups Projects
Select Git revision
  • add7a3159699b84864fdefe418c7851066000228
  • master default protected
  • vertical_relations
  • lu_without_semantic_frames
  • hierarchy
  • additional-unification-filters
  • v0.1.1
  • v0.1.0
  • v0.0.9
  • v0.0.8
  • v0.0.7
  • v0.0.6
  • v0.0.5
  • v0.0.4
  • v0.0.3
  • v0.0.2
  • v0.0.1
17 results

tests.py

Blame
  • test_token_hash.py 1.28 KiB
    from collections import defaultdict
    
    import cclutils as ccl
    
    
    # Fixture document read by this test (path is relative to the working dir).
    test_doc = "data/ccl01.xml"

    doc = ccl.read(test_doc)

    # Flatten the document into one token list, walking paragraphs and then
    # sentences in the order the document yields them.
    toks = []
    for paragraph in doc.paragraphs():
        for sentence in paragraph.sentences():
            toks.extend(sentence.tokens())

    # Deduplicating through a set relies on the tokens' __hash__/__eq__;
    # the fixture is expected to contain exactly 5 distinct tokens.
    all_toks = set(toks)
    assert len(all_toks) == 5
    
    # For every ordered pair of tokens, `==` and hash equality must give the
    # same verdict: both true or both false. This is stricter than the general
    # hash contract, because a hash collision between unequal tokens fails too.
    for lhs in toks:
        for rhs in toks:
            same_value = bool(lhs == rhs)
            same_hash = hash(lhs) == hash(rhs)
            assert same_value == same_hash
    
    # Traverse the same document a second time and check that tokens at
    # matching positions hash identically to those of the first traversal.
    toks2 = []
    for paragraph in doc.paragraphs():
        for sentence in paragraph.sentences():
            toks2.extend(sentence.tokens())
    assert all(
        hash(first) == hash(second) for first, second in zip(toks, toks2)
    )
    
    # Position pairs whose tokens must hash alike:
    # the same tokens occurring in different sentences.
    for left, right in ((1, 4), (2, 6)):
        assert hash(toks[left]) == hash(toks[right])

    # Position pairs whose tokens must hash differently.
    for left, right in (
        (0, 1),  # same tokens except whitespace
        (1, 2),  # different orth
        (2, 3),  # different ctag
        (4, 5),  # different base
    ):
        assert hash(toks[left]) != hash(toks[right])
    
    # Use tokens as dictionary keys: group the positions at which each
    # distinct token occurs in the flattened list.
    tok_pos_idx = defaultdict(list)
    for position, token in enumerate(toks):
        tok_pos_idx[token].append(position)

    # Repeated tokens (positions 1/4 and 2/6) must collapse onto a single key.
    expected_positions = {
        toks[0]: [0],
        toks[1]: [1, 4],
        toks[2]: [2, 6],
        toks[3]: [3],
        toks[5]: [5],
    }
    assert tok_pos_idx == expected_positions