"""Module for parsing WiktorNER files.""" import json from typing import List, Tuple from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation from src.input_parsers.interface import InputParser class WiktorNERInputParser(InputParser): """Parser for WiktorNER files. Example WiktorNER file: { "filename": "greeting-5b1401", "text": "Hello Tom!", "tokens": [ { "index": 1, "position": [0,5], "orth": "Hello", "lex": [ { "lemma": "hello", "mstag": "interj" } ] }, { "index": 2, "position": [6,9], "orth": "Tom", "lex": [ { "lemma": "Tom", "mstag": "noun" } ] }, { "index": 3, "position": [9,10], "orth": "!", "lex": [ { "lemma": "!", "mstag": "interp" } ] } ], "entities": [ { "text": "Tom", "type": "nam_prs_human", "tokens": [2], "positions": [6,9] } ] } """ def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]: """Parse wiktorner file into text and annotations. Annotations are returned as a dictionary with channel name as a key and list of tuples. Args: co z (str): Path to file containing CCL. Returns: Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations. """ content_parsed = json.loads(content) if "text" in content_parsed: text = content_parsed["text"] else: text = "" annotations = [] # Morphosyntactic annotations if "tokens" in content_parsed: for token in content_parsed["tokens"]: if "position" in token: token_start, token_end = token["position"] if "lexemes" in token: for lexeme in token["lexemes"]: if "disamb" in lexeme and lexeme["disamb"] is True: if "mstag" in lexeme: lemma = lexeme.get("lemma", None) annotations.append( ( token_start, token_end, MorphosyntacticAnnotation( lexeme["mstag"], lemma ), ) ) # NER annotations if "entities" in content_parsed: for entity in content_parsed["entities"]: if "positions" in entity: entity_start, entity_end = entity["positions"] if "type" in entity: annotations.append( (entity_start, entity_end, NerAnnotation(entity["type"])) ) return text, annotations