from typing import Dict, List, Tuple
from lxml import etree
import json
from collections import defaultdict

# from src.annotation_types_old import
from src.input_parsers.interface import InputParser
from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation


class WiktorNERInputParser(InputParser):
    """Input parser that turns a WiktorNER JSON document into text and annotations."""

    def __init__(self) -> None:
        super().__init__()

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Parse a WiktorNER JSON document into text and annotations.

        The document is decoded with ``json.loads``. Three optional top-level
        keys are read:

        * ``"text"`` - returned as the text (empty string when absent),
        * ``"tokens"`` - source of morphosyntactic annotations,
        * ``"entities"`` - source of NER annotations.

        Annotations are returned as one flat list: all morphosyntactic
        annotations first (in token order), followed by all NER annotations
        (in entity order).

        Args:
            content (str): JSON-serialized WiktorNER document (the document
                itself, not a path to it).

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The text and a list
            of ``(start, end, annotation)`` tuples.

        Raises:
            json.JSONDecodeError: If ``content`` is not valid JSON.
        """
        content_parsed = json.loads(content)

        text = content_parsed['text'] if "text" in content_parsed else ""

        annotations: List[Tuple[int, int, Annotation]] = []
        if "tokens" in content_parsed:
            annotations.extend(
                self._morphosyntactic_annotations(content_parsed['tokens'])
            )
        if "entities" in content_parsed:
            annotations.extend(self._ner_annotations(content_parsed['entities']))

        return text, annotations

    @staticmethod
    def _morphosyntactic_annotations(tokens) -> List[Tuple[int, int, Annotation]]:
        """Build one annotation per disambiguated lexeme that carries an ``mstag``."""
        annotations: List[Tuple[int, int, Annotation]] = []
        for token in tokens:
            # Without a position there is no span to attach a tag to; skipping
            # here also guarantees offsets never leak over from an earlier token.
            if "position" not in token:
                continue
            token_start, token_end = token['position']

            if "lexemes" not in token:
                continue
            for lexeme in token['lexemes']:
                # Equality (not identity/truthiness) is kept on purpose so the
                # set of accepted "disamb" values stays exactly as before.
                if "disamb" not in lexeme or not (lexeme['disamb'] == True):  # noqa: E712
                    continue
                if "mstag" not in lexeme:
                    continue
                annotations.append(
                    (
                        token_start,
                        token_end,
                        MorphosyntacticAnnotation(lexeme['mstag']),
                    )
                )
        return annotations

    @staticmethod
    def _ner_annotations(entities) -> List[Tuple[int, int, Annotation]]:
        """Build one NER annotation per entity that has both ``positions`` and ``type``."""
        annotations: List[Tuple[int, int, Annotation]] = []
        for entity in entities:
            # An entity without a span cannot be placed in the text.
            if "positions" not in entity:
                continue
            entity_start, entity_end = entity['positions']

            if "type" not in entity:
                continue
            annotations.append(
                (entity_start, entity_end, NerAnnotation(entity['type']))
            )
        return annotations