Commit 16b95b53, authored by Paweł Walkowiak

Merge branch 'update-to-0.7' into 'master'

Update to 0.7

See merge request !8

Parents: 9f17e626 17aef862
Pipeline #14571 passed with stages in 14 minutes and 34 seconds
 --index-url https://pypi.clarin-pl.eu/simple/
 nlp_ws
-winer==0.6.2post7
+winer==0.7.4post1
 awscli==1.22.57
 PyYAML==5.3.1
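
The only change to the dependency pins is the winer bump from 0.6.2post7 to 0.7.4post1; the --index-url line keeps resolution on the CLARIN-PL package index. As a quick post-install sanity check, something like the following (stdlib only; the expected version string is just the pin above) confirms the right build landed:

    # Minimal sanity check after installing the pins above
    # (e.g. via pip install -r with this requirements file).
    from importlib.metadata import version

    assert version("winer") == "0.7.4post1", version("winer")
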
"""Winer worker implementation."""
from winer import Winer, allign_into_entities
from winer.io import (create_document_from_clarin_json,
create_entities_from_hf_outputs,
read_clarin_json, extend_clarin_json)
from winer import Winer, allign_into_entities, get_sentences_from_document
import clarin_json
from clarin_json.containers import Span
import nlp_ws
import logging
_log = logging.getLogger(__name__)
class WinerWorker(nlp_ws.NLPWorker):
@@ -24,7 +25,7 @@ class WinerWorker(nlp_ws.NLPWorker):
     def __init__(self):
         """Constructor."""
-        logging.info("Loading models...")
+        _log.info("Loading models...")
         self._model = Winer(self.models_location,
                             batch_size=self.batch_size)
         self._model._tokenizer.model_max_length = 512
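
The logging change in this hunk is the standard module-level-logger pattern: logging.info() writes through the anonymous root logger, while a logger named after the module lets the hosting service filter and route this worker's messages per component. In isolation (plain stdlib, independent of this repo; the module name in the comment is illustrative):

    import logging

    _log = logging.getLogger(__name__)  # named after the defining module

    def load_models():
        # Emitted as e.g. "INFO:winer_worker:Loading models..." rather than
        # an anonymous root-logger record, so handlers can target it.
        _log.info("Loading models...")

The model_max_length = 512 cap on the tokenizer, unchanged by this commit, bounds each input to the encoder's context window.
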
@@ -46,25 +47,30 @@ class WinerWorker(nlp_ws.NLPWorker):
             worker will store result.
         :type output_path: str
         """
-        # Read inputs
-        documents = [read_clarin_json(input_path)]
-        dto_documents = \
-            [create_document_from_clarin_json(document)
-             for document in documents]
-        tok_inputs = \
-            [document.get_pretokenized_text() for document in dto_documents]
-        plain_inputs = [document.get_text() for document in dto_documents]
+        # Read inputs and open output
+        F_ASCII = task_options.get("ensure_ascii", False)
+        with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
+            with clarin_json.open(input_path, "r") as fin:
+                for document in fin:
+                    # Pre-process document
+                    plain_inputs, tokenized_inputs, sent_starts = \
+                        get_sentences_from_document(document)

-        # Process data
-        aggregations = self._model.process(tok_inputs)
+                    # Process data
+                    aggregations = self._model.process(tokenized_inputs)

-        # Aggregate results to entities.
-        entities = allign_into_entities(tok_inputs, aggregations,
-                                        inputs_sentences=plain_inputs)
+                    # Aggregate results to entities
+                    entities = allign_into_entities(
+                        tokenized_inputs,
+                        aggregations,
+                        inputs_sentences=plain_inputs,
+                        sentences_range=sent_starts
+                    )
+                    document.set_spans(
+                        [Span(**entity)
+                         for sent_entities in entities
+                         for entity in sent_entities],
+                        "ner")

-        for document, doc_entities in zip(documents, entities):
-            extend_clarin_json(
-                output_path.replace('.json', '.clarin.json'),
-                document,
-                create_entities_from_hf_outputs(doc_entities)
-            )
+                    # Write processed document
+                    fout.write(document)
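
The net effect of this hunk: instead of reading the whole input into a list, running one batch over it, and writing results to a separate .clarin.json file, the worker now streams documents one by one from input to output, attaching the recognized entities to each document as a "ner" span layer before writing it out. One detail that is easy to misread is the double comprehension passed to set_spans: allign_into_entities returns entities grouped per sentence, while set_spans takes a single flat list. A minimal sketch of just that flattening, with invented placeholder dicts (the real keys are whatever Span accepts, which this diff does not show):

    # Entities arrive grouped per sentence; set_spans() wants one flat list.
    # The dict keys below are placeholders, not the real Span schema.
    entities = [
        [{"text": "Anna", "label": "PER"}],    # sentence 1: one entity
        [],                                    # sentence 2: no entities
        [{"text": "Warsaw", "label": "LOC"}],  # sentence 3: one entity
    ]
    flat = [entity for sent_entities in entities for entity in sent_entities]
    assert len(flat) == 2  # empty per-sentence lists simply disappear
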