Skip to content
Snippets Groups Projects

Resolve "Read not only .txt files"

Files

+ 36
11
@@ -21,23 +21,45 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
https://gitlab.clarin-pl.eu/knowledge-extraction/tools/easymatcher
"""
@staticmethod
def is_jsonl(
document_path: str | Path
) -> bool:
"""Validates whether text file has json/jsonl structure."""
try:
with open(document_path, 'r', encoding="utf-8") as file:
for line in file:
json_obj = json.loads(line)
if "text" not in json_obj:
return False
return True
except (json.JSONDecodeError, FileNotFoundError):
return False
@staticmethod
def prepare_and_append_document(
    file_path: str | Path, document_path: str | Path
) -> None:
    """Append the content of a document to a JSONL file.

    When EasymatcherWorker.is_jsonl accepts the document, each of its
    lines is parsed and re-serialized as one line of the target file.
    Otherwise the whole document is treated as plain text and appended
    as a single ``{"text": ...}`` record.

    Args:
        file_path: JSONL file to append to; opened in append mode.
        document_path: Input document; read as UTF-8.
    """
    if EasymatcherWorker.is_jsonl(document_path):
        with open(file_path, "a", encoding="utf-8") as _f:
            with open(document_path, "r", encoding="utf-8") as _df:
                for line in _df:
                    # Round-trip through json so every record is
                    # written in the same normalized one-line form.
                    line_data = json.loads(line)
                    _f.write(json.dumps(line_data) + "\n")
    else:
        # Plain text: read once, write once. The document is only
        # loaded on this branch so JSONL inputs are not read twice and
        # no extra record is appended after the branch.
        with open(document_path, "r", encoding="utf-8") as _df:
            document = {"text": _df.read()}
        with open(file_path, "a", encoding="utf-8") as _f:
            _f.write(json.dumps(document) + "\n")
def process(
self,
input_path: str,
task_options: dict[str, str | int | float],
output_path: str,
self,
input_path: str,
task_options: dict[str, str | int | float],
output_path: str,
) -> None:
"""Called for each request made to the worker.
@@ -83,5 +105,8 @@ class EasymatcherWorker(nlp_ws.NLPWorker):
os.unlink(tmpf.name)
with open(output_path, "w", encoding="utf-8") as _f:
for out_document in out_documents:
_f.write(json.dumps(out_document) + "\n")
for out_document, document in zip(out_documents, documents):
# We want to keep content of the original labeled documents
document['label'] = out_document['label']
document['text'] = out_document['text']
_f.write(json.dumps(document) + "\n")
Loading