Commit 508ecbb0 authored by Konrad Wojtasik

Fix style

parent 0abbdb13
Pipeline #14441 failed
@@ -2,17 +2,14 @@
 import json
 import logging
-import os

 from io import UnsupportedOperation
 from pathlib import Path
-from tempfile import NamedTemporaryFile

 import nlp_ws

 import clarin_json
 from clarin_json import Span

 from easymatcher.models import MatrixMatcher
-from easymatcher.utils import load_data

 _log = logging.getLogger(__name__)
@@ -24,40 +21,6 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
     https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
     """

-    # @staticmethod
-    # def is_jsonl(
-    #     document_path: str | Path
-    # ) -> bool:
-    #     """Validates whether text file has json/jsonl structure."""
-    #     try:
-    #         with open(document_path, 'r', encoding="utf-8") as file:
-    #             for line in file:
-    #                 json_obj = json.loads(line)
-    #                 if "text" not in json_obj:
-    #                     return False
-    #         return True
-    #     except (json.JSONDecodeError, FileNotFoundError):
-    #         return False
-
-    # @staticmethod
-    # def prepare_and_append_document(
-    #     file_path: str | Path, document_path: str | Path
-    # ) -> None:
-    #     """Formats and appends plain texts into jsonl file."""
-    #     document = {}
-    #     if EasymatcherWorker.is_jsonl(document_path):
-    #         with open(file_path, "a", encoding="utf-8") as _f:
-    #             with open(document_path, "r", encoding="utf-8") as _df:
-    #                 for line in _df:
-    #                     line_data = json.loads(line)
-    #                     _f.write(json.dumps(line_data) + "\n")
-    #     else:
-    #         with open(document_path, "r", encoding="utf-8") as _df:
-    #             document["text"] = _df.read()
-    #         with open(file_path, "a", encoding="utf-8") as _f:
-    #             _f.write(json.dumps(document) + "\n")
-
     def process(
         self,
         input_path: str,
@@ -96,12 +59,13 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
         # Get labels filename
         labels_dict_name = Path(labels_path).name.split(".")[0]

-        matcher = MatrixMatcher(labels["labels"],
-                                multiple_labels=True,
-                                sim_threshold=0.8,
-                                tfidf_ngram_range=(2, 3),
-                                **task_options)
+        matcher = MatrixMatcher(
+            labels["labels"],
+            multiple_labels=True,
+            sim_threshold=0.8,
+            tfidf_ngram_range=(2, 3),
+            **task_options,
+        )

         with clarin_json.open(output_path, "w", ensure_ascii=False) as fout:
             with clarin_json.open(input_path, "r") as fin:
@@ -111,10 +75,20 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
                 for doc_in, doc_out in zip(documents_in, documents_out):
                     doc_in.set_spans(
-                        [Span.from_dict({"id": idx, "start": start_idx, "stop": end_idx, "class": class_name})
-                         for idx, (start_idx, end_idx, class_name)
-                         in enumerate(doc_out['label'])],
-                        "easymatcher-{}".format(labels_dict_name)
+                        [
+                            Span.from_dict(
+                                {
+                                    "id": idx,
+                                    "start": start_idx,
+                                    "stop": end_idx,
+                                    "class": class_name,
+                                }
+                            )
+                            for idx, (start_idx, end_idx, class_name) in enumerate(
+                                doc_out["label"]
+                            )
+                        ],
+                        "easymatcher-{}".format(labels_dict_name),
                     )
                     fout.write(doc_in)
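Aside: the comprehension in the hunk above builds one span dict per (start, stop, class) triple from the matcher output before passing each dict to clarin_json's Span.from_dict. A minimal self-contained sketch of that construction; the triples and the "CITY" class below are hypothetical example values, not taken from the commit:

    # Hypothetical matcher output: (start, stop, class) triples, shaped like
    # the doc_out["label"] entries consumed in the diff above.
    label_hits = [(0, 6, "CITY"), (11, 17, "CITY")]

    spans = [
        {"id": idx, "start": start_idx, "stop": end_idx, "class": class_name}
        for idx, (start_idx, end_idx, class_name) in enumerate(label_hits)
    ]

    print(spans)
    # [{'id': 0, 'start': 0, 'stop': 6, 'class': 'CITY'},
    #  {'id': 1, 'start': 11, 'stop': 17, 'class': 'CITY'}]

In the worker these dicts become Span objects attached to the document under the "easymatcher-{labels_dict_name}" layer.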
@@ -40,7 +40,7 @@ ignore = W504
 show-source = True
 exclude = .git,.venv,.tox,dist,doc,*egg,build,venv,tests,.vscode
 import-order-style = pep8
-max-line-length = 80
+max-line-length = 120

 [pydocstyle]
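Aside: the reformatted MatrixMatcher construction, shown in isolation. This is a sketch under assumptions: the labels file name is hypothetical, and loading it with json.load merely mirrors how labels["labels"] is indexed in the worker; it is not a documented easymatcher API.

    import json

    from easymatcher.models import MatrixMatcher

    # Hypothetical labels file; the worker expects a dict with a "labels" key.
    with open("labels.json", "r", encoding="utf-8") as f:
        labels = json.load(f)

    matcher = MatrixMatcher(
        labels["labels"],
        multiple_labels=True,
        sim_threshold=0.8,
        tfidf_ngram_range=(2, 3),
    )

The one-argument-per-line layout with a trailing comma is the style fix this commit applies, and it keeps every line well under the raised 120-character limit.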