Commit ce77fdf8 authored by Paweł Walkowiak

Add tag ds

parent d9116b98
@@ -97,7 +97,7 @@ def data_producer(queue_out, dataset_df, queue_recurse, queue_log, log_file):
     except Exception as e:
         queue_log.put(f"Error in data producer: {e}")
         with open(log_file, "a") as f:
-            f.write("Producer failed with {e}\n")
+            f.write(f"Producer failed with {e}\n")
     queue_out.put(None)
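
The one-character change above matters because, without the f prefix, the braces are written to the log verbatim instead of interpolating the caught exception; a minimal illustration:

e = ValueError("boom")
print("Producer failed with {e}")   # writes the literal text: Producer failed with {e}
print(f"Producer failed with {e}")  # writes: Producer failed with boom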
"""Script for running tagger on datasets."""
import click
import pandas as pd
from lpmn_client_biz import Connection, IOType, Task, download, upload
from lpmn_client_biz import Connection, run_json_lines
import json
import os
from tqdm import tqdm
@@ -21,72 +21,84 @@ ORTHS = "orths"
NER = "ner"
def tag_sentences(sentences, lang: str):
results = {}
connection = Connection(config_file="experiments/configs/config.yml")
lpmn = [[{"postagger": {"lang": lang}}], 'makezip']
input_dir = str(uuid.uuid4())
os.makedirs(input_dir)
for idx, sentence in enumerate(sentences):
with open(f'{input_dir}/file_{idx}',
'w', encoding='utf8') as fout:
fout.write(sentence)
uploaded = upload(connection, input_dir)
task = Task(lpmn, connection)
result = task.run(uploaded, IOType.FILE, verbose=True)
archive_path = download(
connection,
result,
IOType.FILE,
filename=f'{uuid.uuid4()}.zip'
def tag_sentences(input_path, output_path, lang: str):
run_json_lines(
Connection(config_file='experiments/configs/config.yml'),
[{'postagger': {'method': 'ner', 'lang': lang}}],
input_path,
output_path,
TEXT,
["tagset", "tokens", "spans"],
verbose=True
)
-    output_path = archive_path.replace('.zip', '')
-    shutil.unpack_archive(archive_path, output_path)
-    files = sorted(os.listdir(output_path), key=lambda x: int(x.split('_')[1]))
-    for j, filename in enumerate(files):
-        with open(f'{output_path}/{filename}', 'r') as file:
-            lines = [json.loads(line) for line in file.readlines()]
-            lemmas, tags, orths = [], [], []
-            if len(lines) > 0:
-                for idx, line in enumerate(lines):
-                    tokens = line[TOKENS]
-                    for token in tokens:
-                        lexeme = token[LEXEMES][0]
-                        lemmas.append(lexeme[LEMMA])
-                        tags.append(lexeme[MSTAG])
-                        orths.append(token[ORTH])
-            else:
-                tokens = lines[0][TOKENS]
-                for token in tokens:
-                    lexeme = token[LEXEMES][0]
-                    lemmas.append(lexeme[LEMMA])
-                    tags.append(lexeme[MSTAG])
-                    orths.append(token[ORTH])
-            results[int(filename.split('_')[1])] = {
-                LEMMAS: lemmas,
-                TAGS: tags,
-                ORTHS: orths
-            }
-    shutil.rmtree(input_dir)
-    os.remove(archive_path)
-    shutil.rmtree(output_path)
-    return results
-def process_file(dataset_df, lang):
-    test_with_tags = pd.DataFrame(dataset_df)
-    lemmas_col, tags_col, orth_col = [], [], []
-
-    tagged_sentences = tag_sentences(dataset_df[TEXT].tolist(), lang)
-    for idx, tokens in tagged_sentences.items():
-        lemmas_col.append(tokens[LEMMAS])
-        tags_col.append(tokens[TAGS])
-        orth_col.append(tokens[ORTHS])
-    test_with_tags[LEMMAS] = lemmas_col
-    test_with_tags[TAGS] = tags_col
-    test_with_tags[ORTHS] = orth_col
-    return test_with_tags
+# def tag_sentences(sentences, lang: str):
+#     results = {}
+#     connection = Connection(config_file="experiments/configs/config.yml")
+#     lpmn = [[{"postagger": {"lang": lang}}], 'makezip']
+#     input_dir = str(uuid.uuid4())
+#     os.makedirs(input_dir)
+#     for idx, sentence in enumerate(sentences):
+#         with open(f'{input_dir}/file_{idx}',
+#                   'w', encoding='utf8') as fout:
+#             fout.write(sentence)
+#
+#     uploaded = upload(connection, input_dir)
+#     task = Task(lpmn, connection)
+#     result = task.run(uploaded, IOType.FILE, verbose=True)
+#     archive_path = download(
+#         connection,
+#         result,
+#         IOType.FILE,
+#         filename=f'{uuid.uuid4()}.zip'
+#     )
+#     output_path = archive_path.replace('.zip', '')
+#     shutil.unpack_archive(archive_path, output_path)
+#     files = sorted(os.listdir(output_path), key=lambda x: int(x.split('_')[1]))
+#     for j, filename in enumerate(files):
+#         with open(f'{output_path}/{filename}', 'r') as file:
+#             lines = [json.loads(line) for line in file.readlines()]
+#             lemmas, tags, orths = [], [], []
+#             if len(lines) > 0:
+#                 for idx, line in enumerate(lines):
+#                     tokens = line[TOKENS]
+#                     for token in tokens:
+#                         lexeme = token[LEXEMES][0]
+#                         lemmas.append(lexeme[LEMMA])
+#                         tags.append(lexeme[MSTAG])
+#                         orths.append(token[ORTH])
+#             else:
+#                 tokens = lines[0][TOKENS]
+#                 for token in tokens:
+#                     lexeme = token[LEXEMES][0]
+#                     lemmas.append(lexeme[LEMMA])
+#                     tags.append(lexeme[MSTAG])
+#                     orths.append(token[ORTH])
+#             results[int(filename.split('_')[1])] = {
+#                 LEMMAS: lemmas,
+#                 TAGS: tags,
+#                 ORTHS: orths
+#             }
+#     shutil.rmtree(input_dir)
+#     os.remove(archive_path)
+#     shutil.rmtree(output_path)
+#     return results
+# def process_file(dataset_df, lang):
+#     test_with_tags = pd.DataFrame(dataset_df)
+#     lemmas_col, tags_col, orth_col = [], [], []
+#
+#     tagged_sentences = tag_sentences(dataset_df[TEXT].tolist(), lang)
+#     for idx, tokens in tagged_sentences.items():
+#         lemmas_col.append(tokens[LEMMAS])
+#         tags_col.append(tokens[TAGS])
+#         orth_col.append(tokens[ORTHS])
+#     test_with_tags[LEMMAS] = lemmas_col
+#     test_with_tags[TAGS] = tags_col
+#     test_with_tags[ORTHS] = orth_col
+#     return test_with_tags
 def add_ner(dataset_df, language):
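
For orientation, a minimal sketch of how the refactored tag_sentences could be driven end to end. tag_sentences and the TEXT field name come from the script above; the helper name tag_dataframe, the file paths, and the exact JSON-lines layout returned by run_json_lines (one record per input line, in input order, carrying the requested "tokens" and "spans" fields) are assumptions for illustration, not taken from the lpmn_client_biz documentation.

import json
import pandas as pd

def tag_dataframe(dataset_df: pd.DataFrame, lang: str,
                  input_path: str = "tagger_input.jsonl",
                  output_path: str = "tagger_output.jsonl") -> pd.DataFrame:
    # Write one JSON object per dataframe row; TEXT is the column/field name used in the script.
    with open(input_path, "w", encoding="utf8") as fout:
        for text in dataset_df[TEXT]:
            fout.write(json.dumps({TEXT: text}, ensure_ascii=False) + "\n")

    # Refactored function from this commit: runs the postagger via run_json_lines.
    tag_sentences(input_path, output_path, lang)

    # Assumed: one output record per input line, in order, with the requested fields present.
    with open(output_path, encoding="utf8") as fin:
        records = [json.loads(line) for line in fin]

    tagged = pd.DataFrame(dataset_df)
    tagged["tokens"] = [record.get("tokens") for record in records]
    tagged["spans"] = [record.get("spans") for record in records]
    return tagged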
@@ -20,6 +20,7 @@ torch==1.12.0+cu116
 --index-url https://pypi.clarin-pl.eu/simple/
 plwn-api==0.24
 lpmn_client_biz>=2.1.1
+lpmn_client_biz==2.1.5
 clarin_json
 datetime