From 60c45d3e8f4dd67c7c1dae290745ff3653f7e329 Mon Sep 17 00:00:00 2001
From: pwalkow <pwalkow@gpu-server.ws.clarin>
Date: Thu, 9 Mar 2023 18:39:52 +0100
Subject: [PATCH] Add tags

---
 dvc.lock                           | 42 +++++++++++++++---------------
 dvc.yaml                           |  9 ++++---
 experiments/scripts/classify.py    |  2 +-
 experiments/scripts/explain.py     |  2 +-
 experiments/scripts/tag_dataset.py | 30 +++++++++++++--------
 text_attacks/models/poleval.py     | 13 +++++++++
 6 files changed, 61 insertions(+), 37 deletions(-)
 create mode 100644 text_attacks/models/poleval.py

diff --git a/dvc.lock b/dvc.lock
index d20b0ea..b3d2e13 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -16,9 +16,9 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name enron_spam
       --output_dir data/models/enron_spam
     deps:
-    - path: data/datasets/enron_spam
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
       nfiles: 3
     - path: experiments/scripts/get_model.py
       md5: 5050f51b4019bba97af47971f6c7cab4
@@ -32,21 +32,21 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name enron_spam
       --output_dir data/classification/enron_spam
     deps:
-    - path: data/datasets/enron_spam/
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
-      nfiles: 3
     - path: data/models/enron_spam/
       md5: 3e16b22f59532c66beeadea958e0579a.dir
       size: 18505614
       nfiles: 6
+    - path: data/preprocessed/enron_spam/
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
     - path: experiments/scripts/classify.py
-      md5: 50f55b90eb47cbf448d83f8392dd37b6
-      size: 1102
+      md5: ba9284c90847fbbd0f2a6cca414d9636
+      size: 1106
     outs:
     - path: data/classification/enron_spam
-      md5: c7d42825b98b289f6a5ed3be1af14413.dir
-      size: 2763843
+      md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir
+      size: 10674882
       nfiles: 2
   explain@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
@@ -88,13 +88,13 @@ stages:
       size: 1688836
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 1d911edcd336cacaec482e6b7570eb1a
-      size: 2716
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
     outs:
     - path: data/preprocessed/poleval/
-      md5: 8daba6ad0597214499ac9b96e8e47c9f.dir
-      size: 501920
-      nfiles: 1
+      md5: 854387459b193c5eba6db1273ca5ad23.dir
+      size: 2277282
+      nfiles: 3
   preprocess_dataset@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
     deps:
@@ -103,10 +103,10 @@ stages:
       size: 53096069
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 1d911edcd336cacaec482e6b7570eb1a
-      size: 2716
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
     outs:
     - path: data/preprocessed/enron_spam/
-      md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir
-      size: 10639521
-      nfiles: 1
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
diff --git a/dvc.yaml b/dvc.yaml
index 05c7ca8..a110e67 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -30,6 +30,7 @@ stages:
   get_model:
     foreach:
       - enron_spam
+      - poleval
     do:
       wdir: .
       cmd: >-
@@ -38,12 +39,13 @@ stages:
         --output_dir data/models/${item}
       deps:
         - experiments/scripts/get_model.py
-        - data/datasets/${item}
+        - data/preprocessed/${item}
       outs:
         - data/models/${item}/
   classify:
     foreach:
       - enron_spam
+      - poleval
     do:
       wdir: .
       cmd: >-
@@ -53,12 +55,13 @@ stages:
       deps:
         - experiments/scripts/classify.py
         - data/models/${item}/
-        - data/datasets/${item}/
+        - data/preprocessed/${item}/
       outs:
         - data/classification/${item}
   explain:
     foreach:
       - enron_spam
+      - poleval
     do:
       wdir: .
       cmd: >-
@@ -68,6 +71,6 @@ stages:
       deps:
         - experiments/scripts/explain.py
         - data/models/${item}
-        - data/datasets/${item}
+        - data/preprocessed/${item}
       outs:
         - data/explanations/${item}/
diff --git a/experiments/scripts/classify.py b/experiments/scripts/classify.py
index b642d9b..9639d29 100644
--- a/experiments/scripts/classify.py
+++ b/experiments/scripts/classify.py
@@ -28,7 +28,7 @@ def main(
     classify = get_classify_function(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
     test_x = test["text"].tolist()
     test_y = test["label"]
     pred_y = classify(test_x)
diff --git a/experiments/scripts/explain.py b/experiments/scripts/explain.py
index a9576fc..6cdb216 100644
--- a/experiments/scripts/explain.py
+++ b/experiments/scripts/explain.py
@@ -43,7 +43,7 @@ def main(
     model, tokenizer = get_model_and_tokenizer(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/adversarial.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/adversarial.jsonl", lines=True)
     test_x = test["text"].tolist()
 
     predict = build_predict_fun(model, tokenizer)
diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
index 0c266d3..e1b0671 100644
--- a/experiments/scripts/tag_dataset.py
+++ b/experiments/scripts/tag_dataset.py
@@ -17,10 +17,14 @@ LEMMAS = 'lemmas'
 TAGS = 'tags'
 
 
-def tag_sentence(connection: Connection, sentence: str, lang: str):
-    task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}],
-                connection=connection)
-    output_file_id = task.run(sentence, IOType.TEXT)
+def tag_sentence(sentence: str, lang: str):
+    connection = Connection(config_file="experiments/configs/config.yml")
+    lpmn = ["morphodita",
+            {"posconverter":
+                 {"input_format": "ccl", "output_format": "json"}}] \
+        if lang == 'pl' else [{"spacy": {"lang": "en"}}]
+    task = Task(lpmn, connection=connection)
+    output_file_id = task.run(str(sentence), IOType.TEXT)
     tokens = []
     try:
         clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
@@ -40,7 +44,7 @@ def tag_sentence(connection: Connection, sentence: str, lang: str):
     return lemmas, tags
 
 
-def process_file(dataset_df, connection, lang, output_path):
+def process_file(dataset_df, lang, output_path):
     test_with_tags = pd.DataFrame(dataset_df)
     lemmas_col, tags_col = [], []
     cpus = cpu_count()
@@ -49,9 +53,7 @@ def process_file(dataset_df, connection, lang, output_path):
         for idx in tqdm(range(0, len(dataset_df), cpus)):
             end = min(idx+cpus, len(dataset_df) + 1)
             for sentence in dataset_df[TEXT][idx:end]:
-                results.append(pool.apply_async(tag_sentence, args=(connection,
-                                                                    sentence,
-                                                                    lang,)))
+                results.append(pool.apply_async(tag_sentence, args=[sentence, lang]))
             for res in results:
                 lemmas, tags = res.get()
                 lemmas_col.append(lemmas)
@@ -73,15 +75,21 @@ def process_file(dataset_df, connection, lang, output_path):
 def main(dataset_name: str):
     """Downloads the dataset to the output directory."""
     lang = 'en' if dataset_name == 'enron_spam' else 'pl'
-    conn = Connection(config_file="experiments/configs/config.yml")
     output_dir = f"data/preprocessed/{dataset_name}"
     os.makedirs(output_dir, exist_ok=True)
 
     input_dir = f"data/datasets/{dataset_name}"
     for file in os.listdir(input_dir):
         if os.path.isfile(os.path.join(input_dir, file)):
-            process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
-                         conn, lang, os.path.join(output_dir, file))
+            if file == "test.jsonl":
+                process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
+                             lang, os.path.join(output_dir, file))
+            else:
+                test_with_tags = pd.DataFrame(pd.read_json(os.path.join(input_dir, file), lines=True))
+                test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))]
+                test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))]
+                with open(os.path.join(output_dir, file), mode="wt") as fd:
+                    fd.write(test_with_tags.to_json(orient='records', lines=True))
 
 
 if __name__ == "__main__":
diff --git a/text_attacks/models/poleval.py b/text_attacks/models/poleval.py
new file mode 100644
index 0000000..a037f8d
--- /dev/null
+++ b/text_attacks/models/poleval.py
@@ -0,0 +1,13 @@
+"""Classification model for poleval"""
+
+
+def get_model_and_tokenizer():
+    return None, None
+
+
+def get_classify_function():
+
+    def fun(texts):
+        return "dummy"
+
+    return fun
-- 
GitLab