"""Module for parsing CCL files.""" from typing import List, Tuple from lxml import etree from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation from src.input_parsers.interface import InputParser class CCLInputParser(InputParser): """Parser for CCL files. Example CCL file: <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE chunkList SYSTEM "ccl.dtd"> <chunkList> <chunk type="p" id="ch1"> <sentence id="s1"> <tok> <orth>Tom</orth> <lex disamb="1"><base>Tom</base><ctag>subst:sg:nom:m1</ctag></lex> <ann chan="person_first_nam" head="1">1</ann> </tok> <tok> <orth>is</orth> <lex disamb="1"><base>be</base><ctag>fin:sg:ter:pres</ctag></lex> <ann chan="person_first_nam">0</ann> </tok> <tok> <orth>nice</orth> <lex disamb="1"><base>nice</base><ctag>adj:sg:nom:f:pos</ctag></lex> <ann chan="person_first_nam">0</ann> </tok> <tok> <orth>!</orth> <lex disamb="1"><base>!</base><ctag>interp</ctag></lex> <ann chan="person_first_nam">0</ann> </tok> </sentence> </chunk> </chunkList> """ def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]: """Parse CCL string into text and annotations. Annotations are returned as a dictionary with channel name as a key and list of tuples. Args: content (str): Content of ccl file. Returns: Tuple[str, Dict[str, List[Tuple[int, int, Annotation]]]]: Text and annotations. """ ccl_tree = etree.fromstring(content.strip().encode("utf-8")) results = [] text = "" ner_annotations = [] morphosyntactic_annotations = [] # First token is assumed to not have space before it last_was_ns = True tokens = ccl_tree.xpath("//ns | //tok") for token in tokens: if token.tag == "tok": if not last_was_ns: text += " " word = token.xpath("./orth")[0].text start = len(text) end = start + len(word) for lex in token.xpath("./lex"): if lex.attrib["disamb"] == "1": ctag = lex.xpath("./ctag")[0] morphosyntactic_annotations.append( (start, end, MorphosyntacticAnnotation(ctag.text)) ) break for ann in token.xpath("./ann"): is_present = int(ann.text) == 1 if not is_present: continue channel = ann.attrib["chan"] is_head = "head" in ann.attrib and ann.attrib["head"] == "1" if is_head: ner_annotations.append((start, end, NerAnnotation(channel))) else: old_start = ner_annotations[-1][0] ner_annotations[-1] = (old_start, end, ner_annotations[-1][2]) last_was_ns = False text += word elif token.tag == "ns": last_was_ns = True results = ner_annotations + morphosyntactic_annotations return text, results