"""Module for parsing WiktorNER files."""

from typing import List, Tuple

from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation
from src.input_parsers.interface import InputParser


class WiktorNERInputParser(InputParser):
    """Parser for WiktorNER files.

    Example WiktorNER file:
    {
        "filename": "greeting-5b1401",
        "text": "Hello Tom!",
        "tokens": [
            {
                "index": 1,
                "position": [0,5],
                "orth": "Hello",
                "lex": [
                    {
                        "lemma": "hello",
                        "mstag": "interj"
                    }
                ]
            },
            {
                "index": 2,
                "position": [6,9],
                "orth": "Tom",
                "lex": [
                    {
                        "lemma": "Tom",
                        "mstag": "noun"
                    }
                ]
            },
            {
                "index": 3,
                "position": [9,10],
                "orth": "!",
                "lex": [
                    {
                        "lemma": "!",
                        "mstag": "interp"
                    }
                ]
            }
        ],
        "entities": [
            {
                "text": "Tom",
                "type": "nam_prs_human",
                "tokens": [2],
                "positions": [6,9]
            }
        ]
    }
    """

    def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]:
        """Extract the text and the annotations from a WiktorNER document.

        Two kinds of annotations are collected into one flat list:

        * a ``MorphosyntacticAnnotation`` for every lexeme whose ``disamb``
          is exactly ``True`` and whose ``pos`` is non-empty,
        * a ``NerAnnotation`` for every ``'ner'`` span with a non-empty
          ``type``.

        Args:
            content (str): The WiktorNER document to read.
                NOTE(review): annotated as ``str``, but this method reads
                ``content.text`` and calls ``content.tokens()``,
                ``content.get_span_types()`` and ``content.spans('ner')``,
                so an already-parsed document object appears to be
                expected - confirm against the callers.

        Returns:
            Tuple[str, List[Tuple[int, int, Annotation]]]: The document text
            (an empty string when ``content.text`` is falsy) and a list of
            ``(start, stop, annotation)`` tuples. Morphosyntactic
            annotations come first, followed by NER annotations.
        """
        text = content.text or ""
        annotations = []

        # Morphosyntactic annotations.
        # ``or ()`` keeps the original behaviour of skipping the loop when
        # there are no tokens / lexemes, while evaluating each only once.
        for token in content.tokens() or ():
            for lexeme in token.lexemes or ():
                # Only lexemes explicitly flagged ``disamb is True`` that
                # also carry a ``pos`` value produce an annotation.
                if lexeme.disamb is not True or not lexeme.pos:
                    continue
                annotations.append(
                    (
                        token.start,
                        token.stop,
                        # A missing/empty lemma is normalised to None.
                        MorphosyntacticAnnotation(
                            lexeme.pos, lexeme.lemma or None
                        ),
                    )
                )

        # NER annotations
        if 'ner' in content.get_span_types():
            for entity in content.spans('ner'):
                # Spans without a type cannot be turned into an annotation.
                if not entity.type:
                    continue
                annotations.append(
                    (entity.start, entity.stop, NerAnnotation(entity.type))
                )

        return text, annotations