Switch input file reading to the clarin-json library

77f1a90e · Bartlomiej · 0456a3f2 · 77f1a90e · 77f1a90e
Commit 77f1a90e authored 1 year ago by Bartlomiej
--- a/requirements.txt
+++ b/requirements.txt
 --index-url https://pypi.clarin-pl.eu/simple/
+clarin-json
 nlp-ws
 regex==2020.10.28
 Babel==2.8.0
@@ -6,4 +7,4 @@ bitarray==2.6.1
 random-username==1.0.2
 randominfo==2.0.2
 hydra-core==1.3.1
-lxml==4.9.2
\ No newline at end of file
+lxml==4.9.2
--- a/src/input_parsers/wiktor_ner.py
+++ b/src/input_parsers/wiktor_ner.py
@@ -73,43 +73,33 @@ class WiktorNERInputParser(InputParser):
            Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations.

        """
-        content_parsed = json.loads(content)

-        if "text" in content_parsed:
-            text = content_parsed["text"]
+        if content.text:
+            text = content.text
        else:
            text = ""

        annotations = []
-
        # Morphosyntactic annotations
-        if "tokens" in content_parsed:
-            for token in content_parsed["tokens"]:
-                if "position" in token:
-                    token_start, token_end = token["position"]
-                    if "lexemes" in token:
-                        for lexeme in token["lexemes"]:
-                            if "disamb" in lexeme and lexeme["disamb"] is True:
-                                if "mstag" in lexeme:
-                                    lemma = lexeme.get("lemma", None)
+        if content.tokens:
+            for token in content.tokens():
+                if token.start and token.stop:
+                    if token.lexemes:
+                        for lexeme in token.lexemes:
+                            if lexeme.disamb and lexeme.disamb is True:
+                                if lexeme.pos:
+                                    if lexeme.lemma:
+                                        lemma = lexeme.lemma
+                                    else:
+                                        lemma = None
                                    annotations.append(
                                        (
-                                            token_start,
-                                            token_end,
+                                            token.start,
+                                            token.stop,
                                            MorphosyntacticAnnotation(
-                                                lexeme["mstag"], lemma
+                                                lexeme.pos, lemma
                                            ),
                                        )
                                    )

-        # NER annotations
-        if "entities" in content_parsed:
-            for entity in content_parsed["entities"]:
-                if "positions" in entity:
-                    entity_start, entity_end = entity["positions"]
-                    if "type" in entity:
-                        annotations.append(
-                            (entity_start, entity_end, NerAnnotation(entity["type"]))
-                        )
-
        return text, annotations