from typing import Dict, List, Tuple
from collections import defaultdict

from lxml import etree

from src.input_parsers.interface import InputParser
from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation


class CCLInputParser(InputParser):
    """Input parser that reads a CCL (XML) document from disk."""

    def __init__(self) -> None:
        super().__init__()

    def parse(
        self, path_to_input: str
    ) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse a CCL file into plain text and character-offset annotations.

        The text is rebuilt from the ``<orth>`` of every ``<tok>`` element;
        tokens are separated by a single space unless an ``<ns/>`` element
        precedes them (or they are the first token).

        Args:
            path_to_input (str): Path to a UTF-8 encoded file containing CCL.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The reconstructed
            text and a flat list of ``(start, end, annotation)`` tuples with
            offsets into that text. NER annotations come first, followed by
            morphosyntactic annotations.
        """
        # Decode explicitly as UTF-8: the content is re-encoded as UTF-8 for
        # lxml below, so relying on the platform default would corrupt
        # non-ASCII text on systems with a different locale encoding.
        with open(path_to_input, 'r', encoding='utf-8') as f:
            ccl = f.read()

        ccl_tree = etree.fromstring(ccl.strip().encode('utf-8'))

        # Text is collected in pieces and joined once at the end; `offset`
        # always equals the length of the text built so far.
        parts: List[str] = []
        offset = 0

        ner_annotations: List[Tuple[int, int, Annotation]] = []
        morphosyntactic_annotations: List[Tuple[int, int, Annotation]] = []

        # Index (into ner_annotations) of the most recent annotation per
        # channel, so a continuation token extends the annotation of its own
        # channel rather than whichever annotation was appended last.
        last_in_channel: Dict[str, int] = {}

        # First token is assumed to not have space before it
        last_was_ns = True

        for token in ccl_tree.xpath("//ns | //tok"):
            if token.tag == 'ns':
                last_was_ns = True
                continue
            if token.tag != 'tok':
                continue

            if not last_was_ns:
                parts.append(" ")
                offset += 1

            word = token.xpath('./orth')[0].text
            start = offset
            end = start + len(word)

            # Only the first disambiguated interpretation is recorded.
            for lex in token.xpath('./lex'):
                if lex.get('disamb') == "1":
                    ctag = lex.xpath('./ctag')[0]
                    morphosyntactic_annotations.append(
                        (start, end, MorphosyntacticAnnotation(ctag.text))
                    )
                    break

            for ann in token.xpath('./ann'):
                # NOTE(review): only the value 1 is treated as "present";
                # confirm whether higher annotation numbers can occur in the
                # input and should be handled as well.
                if int(ann.text) != 1:
                    continue

                channel = ann.attrib['chan']
                is_head = ann.get('head') == "1"

                if is_head or channel not in last_in_channel:
                    # A head token opens a new annotation. A non-head token
                    # with no earlier annotation in its channel also opens
                    # one instead of failing on an empty list.
                    last_in_channel[channel] = len(ner_annotations)
                    ner_annotations.append(
                        (start, end, NerAnnotation(channel))
                    )
                else:
                    # Continuation: stretch this channel's latest annotation
                    # so that it ends at the current token.
                    index = last_in_channel[channel]
                    old_start, _, annotation = ner_annotations[index]
                    ner_annotations[index] = (old_start, end, annotation)

            parts.append(word)
            offset = end
            last_was_ns = False

        text = "".join(parts)
        results = ner_annotations + morphosyntactic_annotations

        return text, results