Commit 16b95b53, authored by Paweł Walkowiak

Merge branch 'update-to-0.7' into 'master'

Update to 0.7

See merge request !8

Parents: 9f17e626 17aef862
Pipeline #14571 passed with stages in 14 minutes and 34 seconds
 --index-url https://pypi.clarin-pl.eu/simple/
 nlp_ws
-winer==0.6.2post7
+winer==0.7.4post1
 awscli==1.22.57
 PyYAML==5.3.1
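
The only change to the dependency pins is the winer bump from 0.6.2post7 to 0.7.4post1; the --index-url line keeps resolution on the CLARIN-PL package index. As a quick post-install sanity check, something like the following (stdlib only; the expected version string is just the pin above) confirms the right build landed:

    # Minimal sanity check after installing the pins above
    # (e.g. via pip install -r with this requirements file).
    from importlib.metadata import version

    assert version("winer") == "0.7.4post1", version("winer")
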
"""Winer worker implementation."""
from winer import Winer, allign_into_entities
from winer.io import (create_document_from_clarin_json,
create_entities_from_hf_outputs,
read_clarin_json, extend_clarin_json)
from winer import Winer, allign_into_entities, get_sentences_from_document
import clarin_json
from clarin_json.containers import Span
import nlp_ws
import logging
_log = logging.getLogger(__name__)
class WinerWorker(nlp_ws.NLPWorker):
@@ -24,7 +25,7 @@ class WinerWorker(nlp_ws.NLPWorker):
     def __init__(self):
         """Constructor."""
-        logging.info("Loading models...")
+        _log.info("Loading models...")
         self._model = Winer(self.models_location,
                             batch_size=self.batch_size)
         self._model._tokenizer.model_max_length = 512
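
The logging change in this hunk is the standard module-level-logger pattern: logging.info() writes through the anonymous root logger, while a logger named after the module lets the hosting service filter and route this worker's messages per component. In isolation (plain stdlib, independent of this repo; the module name in the comment is illustrative):

    import logging

    _log = logging.getLogger(__name__)  # named after the defining module

    def load_models():
        # Emitted as e.g. "INFO:winer_worker:Loading models..." rather than
        # an anonymous root-logger record, so handlers can target it.
        _log.info("Loading models...")

The model_max_length = 512 cap on the tokenizer, unchanged by this commit, bounds each input to the encoder's context window.
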
@@ -46,25 +47,30 @@ class WinerWorker(nlp_ws.NLPWorker):
             worker will store result.
         :type output_path: str
         """
-        # Read inputs
-        documents = [read_clarin_json(input_path)]
-        dto_documents = \
-            [create_document_from_clarin_json(document)
-             for document in documents]
-        tok_inputs = \
-            [document.get_pretokenized_text() for document in dto_documents]
-        plain_inputs = [document.get_text() for document in dto_documents]
+        # Read inputs and open output
+        F_ASCII = task_options.get("ensure_ascii", False)
+        with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
+            with clarin_json.open(input_path, "r") as fin:
+                for document in fin:
+                    # Pre-process document
+                    plain_inputs, tokenized_inputs, sent_starts = \
+                        get_sentences_from_document(document)

-        # Process data
-        aggregations = self._model.process(tok_inputs)
+                    # Process data
+                    aggregations = self._model.process(tokenized_inputs)

-        # Aggregate results to entities.
-        entities = allign_into_entities(tok_inputs, aggregations,
-                                        inputs_sentences=plain_inputs)
+                    # Aggregate results to entities
+                    entities = allign_into_entities(
+                        tokenized_inputs,
+                        aggregations,
+                        inputs_sentences=plain_inputs,
+                        sentences_range=sent_starts
+                    )
+                    document.set_spans(
+                        [Span(**entity)
+                         for sent_entities in entities
+                         for entity in sent_entities],
+                        "ner")

-        for document, doc_entities in zip(documents, entities):
-            extend_clarin_json(
-                output_path.replace('.json', '.clarin.json'),
-                document,
-                create_entities_from_hf_outputs(doc_entities)
-            )
+                    # Write processed document
+                    fout.write(document)
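
The net effect of this hunk: instead of reading the whole input into a list, running one batch over it, and writing results to a separate .clarin.json file, the worker now streams documents one by one from input to output, attaching the recognized entities to each document as a "ner" span layer before writing it out. One detail that is easy to misread is the double comprehension passed to set_spans: allign_into_entities returns entities grouped per sentence, while set_spans takes a single flat list. A minimal sketch of just that flattening, with invented placeholder dicts (the real keys are whatever Span accepts, which this diff does not show):

    # Entities arrive grouped per sentence; set_spans() wants one flat list.
    # The dict keys below are placeholders, not the real Span schema.
    entities = [
        [{"text": "Anna", "label": "PER"}],    # sentence 1: one entity
        [],                                    # sentence 2: no entities
        [{"text": "Warsaw", "label": "LOC"}],  # sentence 3: one entity
    ]
    flat = [entity for sent_entities in entities for entity in sent_entities]
    assert len(flat) == 2  # empty per-sentence lists simply disappear
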