Skip to content
Snippets Groups Projects
Commit c6b2a2ee authored by Paweł Walkowiak
Browse files

Add clarin-json.process

parent a8bb87d5
1 merge request: !8 "Update to 0.7"
Pipeline #14466 failed with stages
in 1 minute and 1 second
......@@ -3,3 +3,4 @@ nlp_ws
winer==0.7.4post1
awscli==1.22.57
PyYAML==5.3.1
clarin_json>=0.6
......@@ -30,6 +30,31 @@ class WinerWorker(nlp_ws.NLPWorker):
batch_size=self.batch_size)
self._model._tokenizer.model_max_length = 512
def _add_entities_to_document(self, documents):
    """Attach entity spans to every document and return the documents.

    Each document is run through ``get_sentences_from_document``, the
    tokenized output is passed to ``self._model.process``, and the result
    is turned into entities by ``allign_into_entities``. One ``Span`` is
    built per entity and stored with ``set_spans`` under the ``"ner"`` key.

    Args:
        documents: Iterable of documents; ``set_spans`` is called on each.

    Returns:
        A list holding the same document objects, in input order.
    """
    annotated = []
    for doc in documents:
        # Pre-process: plain sentences, tokenized sentences, sentence starts.
        sentences, tokens, starts = get_sentences_from_document(doc)

        # Run the model on the tokenized input.
        model_output = self._model.process(tokens)

        # Turn the model output into per-sentence entity descriptions.
        entities_per_sentence = allign_into_entities(
            tokens,
            model_output,
            inputs_sentences=sentences,
            sentences_range=starts,
        )

        # Flatten the per-sentence groups into one list of spans.
        spans = []
        for sentence_entities in entities_per_sentence:
            for entity in sentence_entities:
                spans.append(Span(**entity))

        doc.set_spans(spans, "ner")
        annotated.append(doc)
    return annotated
def process(
self,
input_path: str,
......@@ -49,28 +74,9 @@ class WinerWorker(nlp_ws.NLPWorker):
"""
# Read inputs and open output
F_ASCII = task_options.get("ensure_ascii", False)
with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
with clarin_json.open(input_path, "r") as fin:
for document in fin:
# Pre-process document
plain_inputs, tokenized_inputs, sent_starts = \
get_sentences_from_document(document)
# Process data
aggregations = self._model.process(tokenized_inputs)
# Aggregate results to entities
entities = allign_into_entities(
tokenized_inputs,
aggregations,
inputs_sentences=plain_inputs,
sentences_range=sent_starts
)
document.set_spans(
[Span(**entity)
for sent_entities in entities
for entity in sent_entities],
"ner")
# Write processed document
fout.write(document)
clarin_json.process(
input_path,
self._add_entities_to_document,
output_path,
ensure_ascii=F_ASCII
)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment