From 786bc9f212daedbca563d89c5a9b8121d87007ed Mon Sep 17 00:00:00 2001
From: pwalkow <pwalkow@gpu-server.ws.clarin>
Date: Thu, 9 Mar 2023 14:43:51 +0100
Subject: [PATCH] Add preprocessing

---
 .gitignore                         |  3 ++
 data/datasets/.gitignore           |  1 +
 data/preprocessed/.gitignore       |  2 +
 dvc.lock                           | 42 ++++++++++++++++
 dvc.yaml                           | 14 ++++++
 experiments/scripts/tag_dataset.py | 80 ++++++++++++++++++++++++++++++
 requirements.txt                   |  1 +
 7 files changed, 143 insertions(+)
 create mode 100644 data/preprocessed/.gitignore
 create mode 100644 experiments/scripts/tag_dataset.py

diff --git a/.gitignore b/.gitignore
index 976a184..853a525 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,3 +153,6 @@ dmypy.json
 cython_debug/
 
 .idea/
+
+# Lpmn config
+experiments/configs/config.yml
diff --git a/data/datasets/.gitignore b/data/datasets/.gitignore
index 60ba700..af871df 100644
--- a/data/datasets/.gitignore
+++ b/data/datasets/.gitignore
@@ -1 +1,2 @@
 /enron_spam
+/poleval
diff --git a/data/preprocessed/.gitignore b/data/preprocessed/.gitignore
new file mode 100644
index 0000000..8cfcddc
--- /dev/null
+++ b/data/preprocessed/.gitignore
@@ -0,0 +1,2 @@
+/poleval
+/enron_spam
diff --git a/dvc.lock b/dvc.lock
index e0d6202..d20b0ea 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -68,3 +68,45 @@ stages:
       md5: 376bd1619c08b4989564788e74de8e06.dir
       size: 7870394
       nfiles: 1
+  download_dataset@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/download_dataset.py --dataset_name
+      poleval --output_dir data/datasets/poleval
+    deps:
+    - path: experiments/scripts/download_dataset.py
+      md5: 9eb915fd5b9216965db519f686408a51
+      size: 887
+    outs:
+    - path: data/datasets/poleval/
+      md5: 826f974f794e24efcb5aedb054d1fd55.dir
+      size: 1688836
+      nfiles: 3
+  preprocess_dataset@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name poleval
+    deps:
+    - path: data/datasets/poleval/
+      md5: 826f974f794e24efcb5aedb054d1fd55.dir
+      size: 1688836
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 1d911edcd336cacaec482e6b7570eb1a
+      size: 2716
+    outs:
+    - path: data/preprocessed/poleval/
+      md5: 8daba6ad0597214499ac9b96e8e47c9f.dir
+      size: 501920
+      nfiles: 1
+  preprocess_dataset@enron_spam:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
+    deps:
+    - path: data/datasets/enron_spam/
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
+      size: 53096069
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 1d911edcd336cacaec482e6b7570eb1a
+      size: 2716
+    outs:
+    - path: data/preprocessed/enron_spam/
+      md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir
+      size: 10639521
+      nfiles: 1
diff --git a/dvc.yaml b/dvc.yaml
index c035ccb..05c7ca8 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -13,6 +13,20 @@ stages:
         - experiments/scripts/download_dataset.py
       outs:
         - data/datasets/${item}/
+  preprocess_dataset:
+    foreach:
+      - enron_spam
+      - poleval
+    do:
+      wdir: .
+      cmd: >-
+        PYTHONPATH=. python experiments/scripts/tag_dataset.py
+        --dataset_name ${item}
+      deps:
+        - experiments/scripts/tag_dataset.py
+        - data/datasets/${item}/
+      outs:
+        - data/preprocessed/${item}/
   get_model:
     foreach:
       - enron_spam
diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
new file mode 100644
index 0000000..494bfe6
--- /dev/null
+++ b/experiments/scripts/tag_dataset.py
@@ -0,0 +1,80 @@
+"""Script for running a tagger on datasets."""
+import click
+import pandas as pd
+from lpmn_client_biz import Connection, IOType, Task, download
+import json
+import os
+from tqdm import tqdm
+from multiprocessing import cpu_count, Pool
+
+TOKENS = 'tokens'
+ORTH = 'orth'
+LEXEMES = 'lexemes'
+LEMMA = 'lemma'
+MSTAG = 'mstag'
+TEXT = 'text'
+LEMMAS = 'lemmas'
+TAGS = 'tags'
+
+
+def tag_sentence(connection: Connection, sentence: str, lang: str):
+    task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}],
+                connection=connection)
+    output_file_id = task.run(sentence, IOType.TEXT)
+    tokens = []
+    try:
+        clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
+        tokens = clarin_json[TOKENS]
+    except json.decoder.JSONDecodeError:
+        downloaded = download(connection, output_file_id, IOType.FILE)
+        with open(downloaded, 'r') as file:
+            lines = [json.loads(line) for line in file.readlines()]
+            for line in lines:
+                tokens.extend(line[TOKENS])
+        os.remove(downloaded)
+    lemmas, tags = [], []
+    for token in tokens:
+        lexeme = token[LEXEMES][0]
+        lemmas.append(lexeme[LEMMA])
+        tags.append(lexeme[MSTAG])
+    return lemmas, tags
+
+
+@click.command()
+@click.option(
+    "--dataset_name",
+    help="Dataset name",
+    type=str,
+)
+def main(dataset_name: str):
+    """Tags the dataset's test split and saves it to the preprocessed directory."""
+    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
+    test_with_tags = pd.DataFrame(test)
+    conn = Connection(config_file="experiments/configs/config.yml")
+    lemmas_col, tags_col = [], []
+    cpus = cpu_count()
+    with Pool(processes=cpus) as pool:
+        results = []
+        for idx in tqdm(range(0, len(test), cpus)):
+            end = min(idx + cpus, len(test))
+            for sentence in test[TEXT][idx:end]:
+                results.append(pool.apply_async(tag_sentence, args=[conn,
+                                                                    sentence,
+                                                                    lang]))
+            for res in results:
+                lemmas, tags = res.get()
+                lemmas_col.append(lemmas)
+                tags_col.append(tags)
+            results = []
+    test_with_tags[LEMMAS] = lemmas_col
+    test_with_tags[TAGS] = tags_col
+
+    output_dir = f"data/preprocessed/{dataset_name}"
+    os.makedirs(output_dir, exist_ok=True)
+    with open(f"{output_dir}/test.jsonl", mode="wt") as fd:
+        fd.write(test_with_tags.to_json(orient='records', lines=True))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index 83b9c69..62256fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ click
 scikit-learn
 dvc[s3]
 shap
+lpmn_client_biz
 --find-links https://download.pytorch.org/whl/torch_stable.html
 torch==1.12.0+cu116
--
GitLab