From 0c324297ebb678eda77d16b8d609fa995d99f2af Mon Sep 17 00:00:00 2001
From: pwalkow <pwalkow@gpu-server.ws.clarin>
Date: Thu, 9 Mar 2023 16:11:32 +0100
Subject: [PATCH] Tag every file in the dataset directory, not only test.jsonl

---
 experiments/scripts/tag_dataset.py | 46 ++++++++++++++++++------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
index 494bfe6..0c266d3 100644
--- a/experiments/scripts/tag_dataset.py
+++ b/experiments/scripts/tag_dataset.py
@@ -40,28 +40,18 @@ def tag_sentence(connection: Connection, sentence: str, lang: str):
     return lemmas, tags
 
 
-@click.command()
-@click.option(
-    "--dataset_name",
-    help="Dataset name",
-    type=str,
-)
-def main(dataset_name: str):
-    """Downloads the dataset to the output directory."""
-    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
-    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
-    test_with_tags = pd.DataFrame(test)
-    conn = Connection(config_file="experiments/configs/config.yml")
+def process_file(dataset_df, connection, lang, output_path):
+    test_with_tags = pd.DataFrame(dataset_df)
     lemmas_col, tags_col = [], []
     cpus = cpu_count()
     with Pool(processes=cpus) as pool:
         results = []
-        for idx in tqdm(range(0, len(test), cpus)):
-            end = min(idx+cpus, len(test) + 1)
-            for sentence in test[TEXT][idx:end]:
-                results.append(pool.apply_async(tag_sentence, args=[conn,
+        for idx in tqdm(range(0, len(dataset_df), cpus)):
+            end = min(idx+cpus, len(dataset_df) + 1)
+            for sentence in dataset_df[TEXT][idx:end]:
+                results.append(pool.apply_async(tag_sentence, args=(connection,
                                                                     sentence,
-                                                                    lang]))
+                                                                    lang,)))
             for res in results:
                 lemmas, tags = res.get()
                 lemmas_col.append(lemmas)
@@ -70,10 +60,28 @@ def main(dataset_name: str):
     test_with_tags[LEMMAS] = lemmas_col
     test_with_tags[TAGS] = tags_col
 
+    with open(output_path, mode="wt") as fd:
+        fd.write(test_with_tags.to_json(orient='records', lines=True))
+
+
+@click.command()
+@click.option(
+    "--dataset_name",
+    help="Dataset name",
+    type=str,
+)
+def main(dataset_name: str):
+    """Tags every file of the dataset and saves the results as JSON lines."""
+    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    conn = Connection(config_file="experiments/configs/config.yml")
     output_dir = f"data/preprocessed/{dataset_name}"
     os.makedirs(output_dir, exist_ok=True)
-    with open(f"{output_dir}/test.jsonl", mode="wt") as fd:
-        fd.write(test_with_tags.to_json(orient='records', lines=True))
+
+    input_dir = f"data/datasets/{dataset_name}"
+    for file in os.listdir(input_dir):
+        if os.path.isfile(os.path.join(input_dir, file)):
+            process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
+                         conn, lang, os.path.join(output_dir, file))
 
 
 if __name__ == "__main__":
-- 
GitLab