Commit ce77fdf8 authored by Paweł Walkowiak

Add tag ds

parent d9116b98
@@ -97,7 +97,7 @@ def data_producer(queue_out, dataset_df, queue_recurse, queue_log, log_file):
     except Exception as e:
         queue_log.put(f"Error in data producer: {e}")
         with open(log_file, "a") as f:
-            f.write("Producer failed with {e}\n")
+            f.write(f"Producer failed with {e}\n")
     queue_out.put(None)
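
The one-character change above matters because, without the f prefix, the braces are written to the log verbatim instead of interpolating the caught exception; a minimal illustration:

e = ValueError("boom")
print("Producer failed with {e}")   # writes the literal text: Producer failed with {e}
print(f"Producer failed with {e}")  # writes: Producer failed with boom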
"""Script for running tagger on datasets."""
import click
import pandas as pd
from lpmn_client_biz import Connection, IOType, Task, download, upload
from lpmn_client_biz import Connection, run_json_lines
import json
import os
from tqdm import tqdm
@@ -21,72 +21,84 @@ ORTHS = "orths"
NER = "ner"
def tag_sentences(sentences, lang: str):
results = {}
connection = Connection(config_file="experiments/configs/config.yml")
lpmn = [[{"postagger": {"lang": lang}}], 'makezip']
input_dir = str(uuid.uuid4())
os.makedirs(input_dir)
for idx, sentence in enumerate(sentences):
with open(f'{input_dir}/file_{idx}',
'w', encoding='utf8') as fout:
fout.write(sentence)
uploaded = upload(connection, input_dir)
task = Task(lpmn, connection)
result = task.run(uploaded, IOType.FILE, verbose=True)
archive_path = download(
connection,
result,
IOType.FILE,
filename=f'{uuid.uuid4()}.zip'
def tag_sentences(input_path, output_path, lang: str):
run_json_lines(
Connection(config_file='experiments/configs/config.yml'),
[{'postagger': {'method': 'ner', 'lang': lang}}],
input_path,
output_path,
TEXT,
["tagset", "tokens", "spans"],
verbose=True
)
-    output_path = archive_path.replace('.zip', '')
-    shutil.unpack_archive(archive_path, output_path)
-    files = sorted(os.listdir(output_path), key=lambda x: int(x.split('_')[1]))
-    for j, filename in enumerate(files):
-        with open(f'{output_path}/{filename}', 'r') as file:
-            lines = [json.loads(line) for line in file.readlines()]
-            lemmas, tags, orths = [], [], []
-            if len(lines) > 0:
-                for idx, line in enumerate(lines):
-                    tokens = line[TOKENS]
-                    for token in tokens:
-                        lexeme = token[LEXEMES][0]
-                        lemmas.append(lexeme[LEMMA])
-                        tags.append(lexeme[MSTAG])
-                        orths.append(token[ORTH])
-            else:
-                tokens = lines[0][TOKENS]
-                for token in tokens:
-                    lexeme = token[LEXEMES][0]
-                    lemmas.append(lexeme[LEMMA])
-                    tags.append(lexeme[MSTAG])
-                    orths.append(token[ORTH])
-            results[int(filename.split('_')[1])] = {
-                LEMMAS: lemmas,
-                TAGS: tags,
-                ORTHS: orths
-            }
-    shutil.rmtree(input_dir)
-    os.remove(archive_path)
-    shutil.rmtree(output_path)
-    return results
-def process_file(dataset_df, lang):
-    test_with_tags = pd.DataFrame(dataset_df)
-    lemmas_col, tags_col, orth_col = [], [], []
-
-    tagged_sentences = tag_sentences(dataset_df[TEXT].tolist(), lang)
-    for idx, tokens in tagged_sentences.items():
-        lemmas_col.append(tokens[LEMMAS])
-        tags_col.append(tokens[TAGS])
-        orth_col.append(tokens[ORTHS])
-    test_with_tags[LEMMAS] = lemmas_col
-    test_with_tags[TAGS] = tags_col
-    test_with_tags[ORTHS] = orth_col
-    return test_with_tags
+# def tag_sentences(sentences, lang: str):
+#     results = {}
+#     connection = Connection(config_file="experiments/configs/config.yml")
+#     lpmn = [[{"postagger": {"lang": lang}}], 'makezip']
+#     input_dir = str(uuid.uuid4())
+#     os.makedirs(input_dir)
+#     for idx, sentence in enumerate(sentences):
+#         with open(f'{input_dir}/file_{idx}',
+#                   'w', encoding='utf8') as fout:
+#             fout.write(sentence)
+#
+#     uploaded = upload(connection, input_dir)
+#     task = Task(lpmn, connection)
+#     result = task.run(uploaded, IOType.FILE, verbose=True)
+#     archive_path = download(
+#         connection,
+#         result,
+#         IOType.FILE,
+#         filename=f'{uuid.uuid4()}.zip'
+#     )
+#     output_path = archive_path.replace('.zip', '')
+#     shutil.unpack_archive(archive_path, output_path)
+#     files = sorted(os.listdir(output_path), key=lambda x: int(x.split('_')[1]))
+#     for j, filename in enumerate(files):
+#         with open(f'{output_path}/{filename}', 'r') as file:
+#             lines = [json.loads(line) for line in file.readlines()]
+#             lemmas, tags, orths = [], [], []
+#             if len(lines) > 0:
+#                 for idx, line in enumerate(lines):
+#                     tokens = line[TOKENS]
+#                     for token in tokens:
+#                         lexeme = token[LEXEMES][0]
+#                         lemmas.append(lexeme[LEMMA])
+#                         tags.append(lexeme[MSTAG])
+#                         orths.append(token[ORTH])
+#             else:
+#                 tokens = lines[0][TOKENS]
+#                 for token in tokens:
+#                     lexeme = token[LEXEMES][0]
+#                     lemmas.append(lexeme[LEMMA])
+#                     tags.append(lexeme[MSTAG])
+#                     orths.append(token[ORTH])
+#             results[int(filename.split('_')[1])] = {
+#                 LEMMAS: lemmas,
+#                 TAGS: tags,
+#                 ORTHS: orths
+#             }
+#     shutil.rmtree(input_dir)
+#     os.remove(archive_path)
+#     shutil.rmtree(output_path)
+#     return results
+# def process_file(dataset_df, lang):
+#     test_with_tags = pd.DataFrame(dataset_df)
+#     lemmas_col, tags_col, orth_col = [], [], []
+#
+#     tagged_sentences = tag_sentences(dataset_df[TEXT].tolist(), lang)
+#     for idx, tokens in tagged_sentences.items():
+#         lemmas_col.append(tokens[LEMMAS])
+#         tags_col.append(tokens[TAGS])
+#         orth_col.append(tokens[ORTHS])
+#     test_with_tags[LEMMAS] = lemmas_col
+#     test_with_tags[TAGS] = tags_col
+#     test_with_tags[ORTHS] = orth_col
+#     return test_with_tags
 def add_ner(dataset_df, language):
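
For orientation, a minimal sketch of how the refactored tag_sentences could be driven end to end. tag_sentences and the TEXT field name come from the script above; the helper name tag_dataframe, the file paths, and the exact JSON-lines layout returned by run_json_lines (one record per input line, in input order, carrying the requested "tokens" and "spans" fields) are assumptions for illustration, not taken from the lpmn_client_biz documentation.

import json
import pandas as pd

def tag_dataframe(dataset_df: pd.DataFrame, lang: str,
                  input_path: str = "tagger_input.jsonl",
                  output_path: str = "tagger_output.jsonl") -> pd.DataFrame:
    # Write one JSON object per dataframe row; TEXT is the column/field name used in the script.
    with open(input_path, "w", encoding="utf8") as fout:
        for text in dataset_df[TEXT]:
            fout.write(json.dumps({TEXT: text}, ensure_ascii=False) + "\n")

    # Refactored function from this commit: runs the postagger via run_json_lines.
    tag_sentences(input_path, output_path, lang)

    # Assumed: one output record per input line, in order, with the requested fields present.
    with open(output_path, encoding="utf8") as fin:
        records = [json.loads(line) for line in fin]

    tagged = pd.DataFrame(dataset_df)
    tagged["tokens"] = [record.get("tokens") for record in records]
    tagged["spans"] = [record.get("spans") for record in records]
    return tagged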
@@ -20,6 +20,7 @@ torch==1.12.0+cu116
 --index-url https://pypi.clarin-pl.eu/simple/
 plwn-api==0.24
 lpmn_client_biz>=2.1.1
+lpmn_client_biz==2.1.5
 clarin_json
 datetime