From 77f1a90ed2c4ea44a5e854d7b3595b943d8fd573 Mon Sep 17 00:00:00 2001
From: Bartlomiej <bartlomiej.piotr.bojanowski@gmail.com>
Date: Wed, 18 Oct 2023 10:06:32 +0200
Subject: [PATCH] Switch input parsing to clarin-json

---
 requirements.txt                |  3 ++-
 src/input_parsers/wiktor_ner.py | 42 +++++++++++++--------------------
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fd294b0..343ba80 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 --index-url https://pypi.clarin-pl.eu/simple/
+clarin-json
 nlp-ws
 regex==2020.10.28
 Babel==2.8.0
@@ -6,4 +7,4 @@ bitarray==2.6.1
 random-username==1.0.2
 randominfo==2.0.2
 hydra-core==1.3.1
-lxml==4.9.2
\ No newline at end of file
+lxml==4.9.2
diff --git a/src/input_parsers/wiktor_ner.py b/src/input_parsers/wiktor_ner.py
index 19a074b..051ffc5 100644
--- a/src/input_parsers/wiktor_ner.py
+++ b/src/input_parsers/wiktor_ner.py
@@ -73,43 +73,33 @@ class WiktorNERInputParser(InputParser):
             Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations.
 
         """
-        content_parsed = json.loads(content)
 
-        if "text" in content_parsed:
-            text = content_parsed["text"]
+        if content.text:
+            text = content.text
         else:
             text = ""
 
         annotations = []
-
         # Morphosyntactic annotations
-        if "tokens" in content_parsed:
-            for token in content_parsed["tokens"]:
-                if "position" in token:
-                    token_start, token_end = token["position"]
-                    if "lexemes" in token:
-                        for lexeme in token["lexemes"]:
-                            if "disamb" in lexeme and lexeme["disamb"] is True:
-                                if "mstag" in lexeme:
-                                    lemma = lexeme.get("lemma", None)
+        if content.tokens:
+            for token in content.tokens():
+                if token.start and token.stop:
+                    if token.lexemes:
+                        for lexeme in token.lexemes:
+                            if lexeme.disamb and lexeme.disamb is True:
+                                if lexeme.pos:
+                                    if lexeme.lemma:
+                                        lemma = lexeme.lemma
+                                    else:
+                                        lemma = None
                                     annotations.append(
                                         (
-                                            token_start,
-                                            token_end,
+                                            token.start,
+                                            token.stop,
                                             MorphosyntacticAnnotation(
-                                                lexeme["mstag"], lemma
+                                                lexeme.pos, lemma
                                             ),
                                         )
                                     )
 
-        # NER annotations
-        if "entities" in content_parsed:
-            for entity in content_parsed["entities"]:
-                if "positions" in entity:
-                    entity_start, entity_end = entity["positions"]
-                    if "type" in entity:
-                        annotations.append(
-                            (entity_start, entity_end, NerAnnotation(entity["type"]))
-                        )
-
         return text, annotations
-- 
GitLab