Skip to content
Snippets Groups Projects
Commit 786bc9f2 authored by pwalkow's avatar pwalkow
Browse files

Add preprocessing

parent d3950c57
No related branches found
No related tags found
No related merge requests found
......@@ -153,3 +153,6 @@ dmypy.json
cython_debug/
.idea/
# Lpmn config
experiments/configs/config.yml
/enron_spam
/poleval
......@@ -68,3 +68,45 @@ stages:
md5: 376bd1619c08b4989564788e74de8e06.dir
size: 7870394
nfiles: 1
download_dataset@poleval:
cmd: PYTHONPATH=. python experiments/scripts/download_dataset.py --dataset_name
poleval --output_dir data/datasets/poleval
deps:
- path: experiments/scripts/download_dataset.py
md5: 9eb915fd5b9216965db519f686408a51
size: 887
outs:
- path: data/datasets/poleval/
md5: 826f974f794e24efcb5aedb054d1fd55.dir
size: 1688836
nfiles: 3
preprocess_dataset@poleval:
cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name poleval
deps:
- path: data/datasets/poleval/
md5: 826f974f794e24efcb5aedb054d1fd55.dir
size: 1688836
nfiles: 3
- path: experiments/scripts/tag_dataset.py
md5: 1d911edcd336cacaec482e6b7570eb1a
size: 2716
outs:
- path: data/preprocessed/poleval/
md5: 8daba6ad0597214499ac9b96e8e47c9f.dir
size: 501920
nfiles: 1
preprocess_dataset@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
deps:
- path: data/datasets/enron_spam/
md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069
nfiles: 3
- path: experiments/scripts/tag_dataset.py
md5: 1d911edcd336cacaec482e6b7570eb1a
size: 2716
outs:
- path: data/preprocessed/enron_spam/
md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir
size: 10639521
nfiles: 1
......@@ -13,6 +13,20 @@ stages:
- experiments/scripts/download_dataset.py
outs:
- data/datasets/${item}/
preprocess_dataset:
foreach:
- enron_spam
- poleval
do:
wdir: .
cmd: >-
PYTHONPATH=. python experiments/scripts/tag_dataset.py
--dataset_name ${item}
deps:
- experiments/scripts/tag_dataset.py
- data/datasets/${item}/
outs:
- data/preprocessed/${item}/
get_model:
foreach:
- enron_spam
......
"""Script for running tagger on datasets."""
import click
import pandas as pd
from lpmn_client_biz import Connection, IOType, Task, download
import json
import os
from tqdm import tqdm
from multiprocessing import cpu_count, Pool
# Keys of the JSON structure returned by the CLARIN postagger service.
TOKENS = 'tokens'
ORTH = 'orth'
LEXEMES = 'lexemes'
LEMMA = 'lemma'
MSTAG = 'mstag'
# Column names used when reading/extending the dataset frame.
TEXT = 'text'
LEMMAS = 'lemmas'
TAGS = 'tags'
def tag_sentence(connection: Connection, sentence: str, lang: str):
    """Run the LPMN postagger on one sentence and extract lemmas and tags.

    Args:
        connection: Open connection to the LPMN service.
        sentence: Raw text to tag.
        lang: Language code handed to the tagger (e.g. 'pl' or 'en').

    Returns:
        Tuple ``(lemmas, tags)`` — parallel lists with one entry per token,
        taken from each token's first lexeme.
    """
    task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}],
                connection=connection)
    output_file_id = task.run(sentence, IOType.TEXT)
    tokens = []
    try:
        clarin_json = json.loads(
            download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
        tokens = clarin_json[TOKENS]
    except json.decoder.JSONDecodeError:
        # Some outputs come back as a JSON-lines file rather than a single
        # JSON document; stream it line by line instead of readlines().
        downloaded = download(connection, output_file_id, IOType.FILE)
        try:
            with open(downloaded, 'r') as file:
                for line in file:
                    tokens.extend(json.loads(line)[TOKENS])
        finally:
            # Remove the temp file even if a line fails to parse
            # (the original leaked it on error).
            os.remove(downloaded)
    lemmas, tags = [], []
    for token in tokens:
        # Use the module-level key constants instead of repeating literals.
        lexeme = token[LEXEMES][0]
        lemmas.append(lexeme[LEMMA])
        tags.append(lexeme[MSTAG])
    return lemmas, tags
@click.command()
@click.option(
    "--dataset_name",
    help="Dataset name",
    type=str,
)
def main(dataset_name: str):
    """Tag the dataset's test split and write lemmas/tags to preprocessed data.

    Reads ``data/datasets/<name>/test.jsonl``, tags every text via the LPMN
    service in parallel, and writes ``data/preprocessed/<name>/test.jsonl``
    with added ``lemmas`` and ``tags`` columns.
    """
    # enron_spam is the only English dataset; everything else is Polish.
    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
    test_with_tags = pd.DataFrame(test)
    conn = Connection(config_file="experiments/configs/config.yml")
    lemmas_col, tags_col = [], []
    cpus = cpu_count()
    with Pool(processes=cpus) as pool:
        # Dispatch one batch of `cpus` sentences at a time so progress is
        # visible and the number of pending results stays bounded.
        for start in tqdm(range(0, len(test), cpus)):
            # Fixed off-by-one: was min(start + cpus, len(test) + 1); slicing
            # clamped it anyway, but the bound itself was wrong.
            end = min(start + cpus, len(test))
            results = [
                pool.apply_async(tag_sentence, args=[conn, sentence, lang])
                for sentence in test[TEXT][start:end]
            ]
            for res in results:
                lemmas, tags = res.get()
                lemmas_col.append(lemmas)
                tags_col.append(tags)
    test_with_tags[LEMMAS] = lemmas_col
    test_with_tags[TAGS] = tags_col
    output_dir = f"data/preprocessed/{dataset_name}"
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/test.jsonl", mode="wt") as fd:
        fd.write(test_with_tags.to_json(orient='records', lines=True))
if __name__ == "__main__":
    # click supplies --dataset_name from the command line.
    main()
\ No newline at end of file
......@@ -4,6 +4,7 @@ click
scikit-learn
dvc[s3]
shap
lpmn_client_biz
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.12.0+cu116
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment