From 786bc9f212daedbca563d89c5a9b8121d87007ed Mon Sep 17 00:00:00 2001
From: pwalkow <pwalkow@gpu-server.ws.clarin>
Date: Thu, 9 Mar 2023 14:43:51 +0100
Subject: [PATCH] Add preprocessing

---
 .gitignore                         |  3 ++
 data/datasets/.gitignore           |  1 +
 data/preprocessed/.gitignore       |  2 +
 dvc.lock                           | 42 ++++++++++++++++
 dvc.yaml                           | 14 ++++++
 experiments/scripts/tag_dataset.py | 80 ++++++++++++++++++++++++++++++
 requirements.txt                   |  1 +
 7 files changed, 143 insertions(+)
 create mode 100644 data/preprocessed/.gitignore
 create mode 100644 experiments/scripts/tag_dataset.py

diff --git a/.gitignore b/.gitignore
index 976a184..853a525 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,3 +153,6 @@ dmypy.json
 cython_debug/
 
 .idea/
+
+# Lpmn config
+experiments/configs/config.yml
diff --git a/data/datasets/.gitignore b/data/datasets/.gitignore
index 60ba700..af871df 100644
--- a/data/datasets/.gitignore
+++ b/data/datasets/.gitignore
@@ -1 +1,2 @@
 /enron_spam
+/poleval
diff --git a/data/preprocessed/.gitignore b/data/preprocessed/.gitignore
new file mode 100644
index 0000000..8cfcddc
--- /dev/null
+++ b/data/preprocessed/.gitignore
@@ -0,0 +1,2 @@
+/poleval
+/enron_spam
diff --git a/dvc.lock b/dvc.lock
index e0d6202..d20b0ea 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -68,3 +68,45 @@ stages:
       md5: 376bd1619c08b4989564788e74de8e06.dir
       size: 7870394
       nfiles: 1
+  download_dataset@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/download_dataset.py --dataset_name
+      poleval --output_dir data/datasets/poleval
+    deps:
+    - path: experiments/scripts/download_dataset.py
+      md5: 9eb915fd5b9216965db519f686408a51
+      size: 887
+    outs:
+    - path: data/datasets/poleval/
+      md5: 826f974f794e24efcb5aedb054d1fd55.dir
+      size: 1688836
+      nfiles: 3
+  preprocess_dataset@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name poleval
+    deps:
+    - path: data/datasets/poleval/
+      md5: 826f974f794e24efcb5aedb054d1fd55.dir
+      size: 1688836
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 1d911edcd336cacaec482e6b7570eb1a
+      size: 2716
+    outs:
+    - path: data/preprocessed/poleval/
+      md5: 8daba6ad0597214499ac9b96e8e47c9f.dir
+      size: 501920
+      nfiles: 1
+  preprocess_dataset@enron_spam:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
+    deps:
+    - path: data/datasets/enron_spam/
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
+      size: 53096069
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 1d911edcd336cacaec482e6b7570eb1a
+      size: 2716
+    outs:
+    - path: data/preprocessed/enron_spam/
+      md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir
+      size: 10639521
+      nfiles: 1
diff --git a/dvc.yaml b/dvc.yaml
index c035ccb..05c7ca8 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -13,6 +13,20 @@ stages:
         - experiments/scripts/download_dataset.py
       outs:
         - data/datasets/${item}/
+  preprocess_dataset:
+    foreach:
+      - enron_spam
+      - poleval
+    do:
+       wdir: .
+       cmd: >-
+         PYTHONPATH=. python experiments/scripts/tag_dataset.py
+         --dataset_name ${item}
+       deps:
+         - experiments/scripts/tag_dataset.py
+         - data/datasets/${item}/
+       outs:
+         - data/preprocessed/${item}/
   get_model:
     foreach:
       - enron_spam
diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
new file mode 100644
index 0000000..494bfe6
--- /dev/null
+++ b/experiments/scripts/tag_dataset.py
@@ -0,0 +1,80 @@
+"""Script for running tagger on datasets."""
+import click
+import pandas as pd
+from lpmn_client_biz import Connection, IOType, Task, download
+import json
+import os
+from tqdm import tqdm
+from multiprocessing import cpu_count, Pool
+
+TOKENS = 'tokens'
+ORTH = 'orth'
+LEXEMES = 'lexemes'
+LEMMA = 'lemma'
+MSTAG = 'mstag'
+TEXT = 'text'
+LEMMAS = 'lemmas'
+TAGS = 'tags'
+
+
+def tag_sentence(connection: Connection, sentence: str, lang: str):
+    task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}],
+                connection=connection)
+    output_file_id = task.run(sentence, IOType.TEXT)
+    tokens = []
+    try:
+        clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
+        tokens = clarin_json[TOKENS]
+    except json.decoder.JSONDecodeError:
+        downloaded = download(connection, output_file_id, IOType.FILE)
+        with open(downloaded, 'r') as file:
+            lines = [json.loads(line) for line in file.readlines()]
+            for line in lines:
+                tokens.extend(line[TOKENS])
+        os.remove(downloaded)
+    lemmas, tags = [], []
+    for token in tokens:
+        lexeme = token[LEXEMES][0]
+        lemmas.append(lexeme[LEMMA])
+        tags.append(lexeme[MSTAG])
+    return lemmas, tags
+
+
+@click.command()
+@click.option(
+    "--dataset_name",
+    help="Dataset name",
+    type=str,
+)
+def main(dataset_name: str):
+    """Tags the dataset's test split with lemmas and morphosyntactic tags."""
+    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
+    test_with_tags = pd.DataFrame(test)
+    conn = Connection(config_file="experiments/configs/config.yml")
+    lemmas_col, tags_col = [], []
+    cpus = cpu_count()
+    with Pool(processes=cpus) as pool:
+        results = []
+        for idx in tqdm(range(0, len(test), cpus)):
+            end = min(idx + cpus, len(test))
+            for sentence in test[TEXT][idx:end]:
+                results.append(pool.apply_async(tag_sentence, args=[conn,
+                                                                    sentence,
+                                                                    lang]))
+            for res in results:
+                lemmas, tags = res.get()
+                lemmas_col.append(lemmas)
+                tags_col.append(tags)
+            results = []
+    test_with_tags[LEMMAS] = lemmas_col
+    test_with_tags[TAGS] = tags_col
+
+    output_dir = f"data/preprocessed/{dataset_name}"
+    os.makedirs(output_dir, exist_ok=True)
+    with open(f"{output_dir}/test.jsonl", mode="wt") as fd:
+        fd.write(test_with_tags.to_json(orient='records', lines=True))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 83b9c69..62256fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ click
 scikit-learn
 dvc[s3]
 shap
+lpmn_client_biz
 
 --find-links https://download.pytorch.org/whl/torch_stable.html
 torch==1.12.0+cu116
-- 
GitLab