diff --git a/requirements.txt b/requirements.txt index fd294b06655c0e74737c266b35c0cdc6f12602b6..343ba800bada9b24dc341a34c3875170b65fe506 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --index-url https://pypi.clarin-pl.eu/simple/ +clarin-json nlp-ws regex==2020.10.28 Babel==2.8.0 @@ -6,4 +7,4 @@ bitarray==2.6.1 random-username==1.0.2 randominfo==2.0.2 hydra-core==1.3.1 -lxml==4.9.2 \ No newline at end of file +lxml==4.9.2 diff --git a/src/input_parsers/wiktor_ner.py b/src/input_parsers/wiktor_ner.py index 19a074b81c99e013291e6f6f350e5c6ec726cb66..051ffc5b53636c1aaadcff8464e92722e6e2a043 100644 --- a/src/input_parsers/wiktor_ner.py +++ b/src/input_parsers/wiktor_ner.py @@ -73,43 +73,33 @@ class WiktorNERInputParser(InputParser): Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations. """ - content_parsed = json.loads(content) - if "text" in content_parsed: - text = content_parsed["text"] + if content.text: + text = content.text else: text = "" annotations = [] - # Morphosyntactic annotations - if "tokens" in content_parsed: - for token in content_parsed["tokens"]: - if "position" in token: - token_start, token_end = token["position"] - if "lexemes" in token: - for lexeme in token["lexemes"]: - if "disamb" in lexeme and lexeme["disamb"] is True: - if "mstag" in lexeme: - lemma = lexeme.get("lemma", None) + if content.tokens: + for token in content.tokens(): + if token.start and token.stop: + if token.lexemes: + for lexeme in token.lexemes: + if lexeme.disamb and lexeme.disamb is True: + if lexeme.pos: + if lexeme.lemma: + lemma = lexeme.lemma + else: + lemma = None annotations.append( ( - token_start, - token_end, + token.start, + token.stop, MorphosyntacticAnnotation( - lexeme["mstag"], lemma + lexeme.pos, lemma ), ) ) - # NER annotations - if "entities" in content_parsed: - for entity in content_parsed["entities"]: - if "positions" in entity: - entity_start, entity_end = entity["positions"] - if "type" in entity: - annotations.append( - (entity_start, entity_end, NerAnnotation(entity["type"])) - ) - return text, annotations