import json
from collections import defaultdict
from typing import Dict, List, Tuple

from lxml import etree

# from src.annotation_types_old import
from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation
from src.input_parsers.interface import InputParser


def _morphosyntactic_annotations(document) -> List[Tuple[int, int, Annotation]]:
    """Collect one MorphosyntacticAnnotation per disambiguated, tagged lexeme."""
    if "tokens" not in document:
        return []

    spans: List[Tuple[int, int, Annotation]] = []
    for token in document["tokens"]:
        # A token without a position cannot be anchored in the text: skip it.
        if "position" not in token:
            continue
        # Unpacked before the "lexemes" check so that a malformed position
        # still raises even for tokens that carry no lexemes.
        start, end = token["position"]
        if "lexemes" not in token:
            continue
        for lexeme in token["lexemes"]:
            # Equality (not identity or truthiness) is kept on purpose: it
            # accepts JSON `true` and a numeric 1, but not arbitrary truthy
            # values such as non-empty strings.
            is_disambiguated = (
                "disamb" in lexeme and lexeme["disamb"] == True  # noqa: E712
            )
            if is_disambiguated and "mstag" in lexeme:
                spans.append(
                    (start, end, MorphosyntacticAnnotation(lexeme["mstag"]))
                )
    return spans


def _ner_annotations(document) -> List[Tuple[int, int, Annotation]]:
    """Collect one NerAnnotation per entity that has both a position and a type."""
    if "entities" not in document:
        return []

    spans: List[Tuple[int, int, Annotation]] = []
    for entity in document["entities"]:
        # Note the key is "positions" (plural) for entities, unlike tokens.
        if "positions" not in entity:
            continue
        start, end = entity["positions"]
        if "type" in entity:
            spans.append((start, end, NerAnnotation(entity["type"])))
    return spans


class WiktorNERInputParser(InputParser):
    """Input parser for WiktorNER documents serialized as JSON."""

    def __init__(self) -> None:
        super().__init__()

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse a WiktorNER JSON document into its text and annotations.

        The document's optional "tokens" list yields a
        MorphosyntacticAnnotation for every lexeme flagged "disamb" that
        carries an "mstag"; its optional "entities" list yields a
        NerAnnotation for every entity that carries a "type". Tokens without
        "position" and entities without "positions" are skipped.

        Args:
            content (str): The WiktorNER document as a JSON string (the
                document itself, not a path to it).

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The value of the
                "text" key (empty string when absent) and a flat list of
                (start, end, annotation) tuples, morphosyntactic annotations
                first, then NER annotations. Start and end are taken verbatim
                from the document (presumably offsets into the text; confirm
                against the WiktorNER format).

        Raises:
            json.JSONDecodeError: If content is not valid JSON.
            ValueError: If a "position"/"positions" value does not unpack
                into exactly two items.
        """
        document = json.loads(content)
        text = document["text"] if "text" in document else ""

        annotations: List[Tuple[int, int, Annotation]] = []
        annotations.extend(_morphosyntactic_annotations(document))
        annotations.extend(_ner_annotations(document))
        return text, annotations