Newer
Older
# from src.annotation_types_old import AnnotationTypes
from src.input_parsers.ccl import CCLInputParser
from tempfile import NamedTemporaryFile
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Sample CCL (XML) document used as the parser fixture below: one chunk holding
# one sentence of six tokens ("Marek", "Kowalski", "pojechał", "do",
# "Wrocławia", "."). Every token carries an <ann> entry for the "nam_liv" and
# "nam_loc" channels; a value of 0 appears on tokens outside any annotation,
# while "Marek"/"Kowalski" share value 1 on "nam_liv" and "Wrocławia" has
# value 1 on "nam_loc".
# NOTE(review): <ns/> presumably marks "no space before the next token" — the
# expected text in the test joins "Wrocławia" and "." without a space; confirm
# against the CCL format description.
example_ccl = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk type="p" id="ch1">
<sentence id="s1">
<tok>
<orth>Marek</orth>
<lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex>
<lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex>
<ann chan="nam_liv" head="1">1</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>Kowalski</orth>
<lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex>
<ann chan="nam_liv">1</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>pojechał</orth>
<lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>do</orth>
<lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>Wrocławia</orth>
<lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc" head="1">1</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc">0</ann>
</tok>
</sentence>
</chunk>
</chunkList>
"""
def test_ccl_input_parser():
    """Parse the sample CCL document from disk and check text and entity spans.

    The fixture is written to a named temporary file because the parser is
    called with a file path. The expected annotation tuples are
    ``(start, end, surface)``, where ``start``/``end`` line up with
    end-exclusive character offsets into the expected text.
    """
    ccl_bytes = example_ccl.encode("utf-8")
    ccl_parser = CCLInputParser()

    with NamedTemporaryFile() as tmp_file:
        tmp_file.write(ccl_bytes)
        # Flush so the parser, which reads the file by name, sees the content.
        tmp_file.flush()
        text, annotations = ccl_parser.parse(tmp_file.name)

    assert text == "Marek Kowalski pojechał do Wrocławia."

    expected_spans = {
        "nam_liv": [(0, 14, "Marek Kowalski")],
        "nam_loc": [(27, 36, "Wrocławia")],
    }
    for channel, spans in expected_spans.items():
        assert annotations[channel] == spans

    # Disabled checks: they rely on AnnotationTypes, whose import is commented
    # out at the top of this file. Kept for reference until they are restored.
    # assert set(annotations.keys()) == set(["nam_liv", "nam_loc", AnnotationTypes.MORPHOSYNTACTIC_TAG])
    # assert annotations[AnnotationTypes.MORPHOSYNTACTIC_TAG] == [
    #     (0, 5, "subst:sg:nom:m1"),
    #     (6, 14, "subst:sg:nom:m1"),
    #     (15, 23, "praet:sg:m1:perf"),
    #     (24, 26, "prep:gen"),
    #     (27, 36, "subst:sg:gen:m3"),
    #     (36, 37, "interp"),
    # ]