From 60c45d3e8f4dd67c7c1dae290745ff3653f7e329 Mon Sep 17 00:00:00 2001 From: pwalkow <pwalkow@gpu-server.ws.clarin> Date: Thu, 9 Mar 2023 18:39:52 +0100 Subject: [PATCH] Add tags --- dvc.lock | 42 +++++++++++++++--------------- dvc.yaml | 9 ++++--- experiments/scripts/classify.py | 2 +- experiments/scripts/explain.py | 2 +- experiments/scripts/tag_dataset.py | 30 +++++++++++++-------- text_attacks/models/poleval.py | 13 +++++++++ 6 files changed, 61 insertions(+), 37 deletions(-) create mode 100644 text_attacks/models/poleval.py diff --git a/dvc.lock b/dvc.lock index d20b0ea..b3d2e13 100644 --- a/dvc.lock +++ b/dvc.lock @@ -16,9 +16,9 @@ stages: cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name enron_spam --output_dir data/models/enron_spam deps: - - path: data/datasets/enron_spam - md5: 66d44efedf37990b1989c81bbee085e0.dir - size: 53096069 + - path: data/preprocessed/enron_spam + md5: b75efba1a62182dc8ac32acd1faf92ed.dir + size: 61709260 nfiles: 3 - path: experiments/scripts/get_model.py md5: 5050f51b4019bba97af47971f6c7cab4 @@ -32,21 +32,21 @@ stages: cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name enron_spam --output_dir data/classification/enron_spam deps: - - path: data/datasets/enron_spam/ - md5: 66d44efedf37990b1989c81bbee085e0.dir - size: 53096069 - nfiles: 3 - path: data/models/enron_spam/ md5: 3e16b22f59532c66beeadea958e0579a.dir size: 18505614 nfiles: 6 + - path: data/preprocessed/enron_spam/ + md5: b75efba1a62182dc8ac32acd1faf92ed.dir + size: 61709260 + nfiles: 3 - path: experiments/scripts/classify.py - md5: 50f55b90eb47cbf448d83f8392dd37b6 - size: 1102 + md5: ba9284c90847fbbd0f2a6cca414d9636 + size: 1106 outs: - path: data/classification/enron_spam - md5: c7d42825b98b289f6a5ed3be1af14413.dir - size: 2763843 + md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir + size: 10674882 nfiles: 2 explain@enron_spam: cmd: PYTHONPATH=. 
python experiments/scripts/explain.py --dataset_name enron_spam @@ -88,13 +88,13 @@ stages: size: 1688836 nfiles: 3 - path: experiments/scripts/tag_dataset.py - md5: 1d911edcd336cacaec482e6b7570eb1a - size: 2716 + md5: 2c4e097b3a278c12d19858f988232b44 + size: 3435 outs: - path: data/preprocessed/poleval/ - md5: 8daba6ad0597214499ac9b96e8e47c9f.dir - size: 501920 - nfiles: 1 + md5: 854387459b193c5eba6db1273ca5ad23.dir + size: 2277282 + nfiles: 3 preprocess_dataset@enron_spam: cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam deps: @@ -103,10 +103,10 @@ stages: size: 53096069 nfiles: 3 - path: experiments/scripts/tag_dataset.py - md5: 1d911edcd336cacaec482e6b7570eb1a - size: 2716 + md5: 2c4e097b3a278c12d19858f988232b44 + size: 3435 outs: - path: data/preprocessed/enron_spam/ - md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir - size: 10639521 - nfiles: 1 + md5: b75efba1a62182dc8ac32acd1faf92ed.dir + size: 61709260 + nfiles: 3 diff --git a/dvc.yaml b/dvc.yaml index 05c7ca8..a110e67 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -30,6 +30,7 @@ stages: get_model: foreach: - enron_spam + - poleval do: wdir: . cmd: >- @@ -38,12 +39,13 @@ stages: --output_dir data/models/${item} deps: - experiments/scripts/get_model.py - - data/datasets/${item} + - data/preprocessed/${item} outs: - data/models/${item}/ classify: foreach: - enron_spam + - poleval do: wdir: . cmd: >- @@ -53,12 +55,13 @@ stages: deps: - experiments/scripts/classify.py - data/models/${item}/ - - data/datasets/${item}/ + - data/preprocessed/${item}/ outs: - data/classification/${item} explain: foreach: - enron_spam + - poleval do: wdir: . 
cmd: >- @@ -68,6 +71,6 @@ stages: deps: - experiments/scripts/explain.py - data/models/${item} - - data/datasets/${item} + - data/preprocessed/${item} outs: - data/explanations/${item}/ diff --git a/experiments/scripts/classify.py b/experiments/scripts/classify.py index b642d9b..9639d29 100644 --- a/experiments/scripts/classify.py +++ b/experiments/scripts/classify.py @@ -28,7 +28,7 @@ def main( classify = get_classify_function( dataset_name=dataset_name, ) - test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True) + test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True) test_x = test["text"].tolist() test_y = test["label"] pred_y = classify(test_x) diff --git a/experiments/scripts/explain.py b/experiments/scripts/explain.py index a9576fc..6cdb216 100644 --- a/experiments/scripts/explain.py +++ b/experiments/scripts/explain.py @@ -43,7 +43,7 @@ def main( model, tokenizer = get_model_and_tokenizer( dataset_name=dataset_name, ) - test = pd.read_json(f"data/datasets/{dataset_name}/adversarial.jsonl", lines=True) + test = pd.read_json(f"data/preprocessed/{dataset_name}/adversarial.jsonl", lines=True) test_x = test["text"].tolist() predict = build_predict_fun(model, tokenizer) diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py index 0c266d3..e1b0671 100644 --- a/experiments/scripts/tag_dataset.py +++ b/experiments/scripts/tag_dataset.py @@ -17,10 +17,14 @@ LEMMAS = 'lemmas' TAGS = 'tags' -def tag_sentence(connection: Connection, sentence: str, lang: str): - task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}], - connection=connection) - output_file_id = task.run(sentence, IOType.TEXT) +def tag_sentence(sentence: str, lang: str): + connection = Connection(config_file="experiments/configs/config.yml") + lpmn = ["morphodita", + {"posconverter": + {"input_format": "ccl", "output_format": "json"}}] \ + if lang == 'pl' else [{"spacy": {"lang": "en"}}] + task = Task(lpmn, 
connection=connection) + output_file_id = task.run(str(sentence), IOType.TEXT) tokens = [] try: clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8")) @@ -40,7 +44,7 @@ def tag_sentence(connection: Connection, sentence: str, lang: str): return lemmas, tags -def process_file(dataset_df, connection, lang, output_path): +def process_file(dataset_df, lang, output_path): test_with_tags = pd.DataFrame(dataset_df) lemmas_col, tags_col = [], [] cpus = cpu_count() @@ -49,9 +53,7 @@ def process_file(dataset_df, connection, lang, output_path): for idx in tqdm(range(0, len(dataset_df), cpus)): end = min(idx+cpus, len(dataset_df) + 1) for sentence in dataset_df[TEXT][idx:end]: - results.append(pool.apply_async(tag_sentence, args=(connection, - sentence, - lang,))) + results.append(pool.apply_async(tag_sentence, args=[sentence, lang])) for res in results: lemmas, tags = res.get() lemmas_col.append(lemmas) @@ -73,15 +75,21 @@ def process_file(dataset_df, connection, lang, output_path): def main(dataset_name: str): """Downloads the dataset to the output directory.""" lang = 'en' if dataset_name == 'enron_spam' else 'pl' - conn = Connection(config_file="experiments/configs/config.yml") output_dir = f"data/preprocessed/{dataset_name}" os.makedirs(output_dir, exist_ok=True) input_dir = f"data/datasets/{dataset_name}" for file in os.listdir(input_dir): if os.path.isfile(os.path.join(input_dir, file)): - process_file(pd.read_json(os.path.join(input_dir, file), lines=True), - conn, lang, os.path.join(output_dir, file)) + if file == "test.jsonl": + process_file(pd.read_json(os.path.join(input_dir, file), lines=True), + lang, os.path.join(output_dir, file)) + else: + test_with_tags = pd.DataFrame(pd.read_json(os.path.join(input_dir, file), lines=True)) + test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))] + test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))] + with open(os.path.join(output_dir, file), mode="wt") as fd: + 
fd.write(test_with_tags.to_json(orient='records', lines=True)) if __name__ == "__main__": diff --git a/text_attacks/models/poleval.py b/text_attacks/models/poleval.py new file mode 100644 index 0000000..a037f8d --- /dev/null +++ b/text_attacks/models/poleval.py @@ -0,0 +1,13 @@ +"""Classification model for poleval""" + + +def get_model_and_tokenizer(): + return None, None + + +def get_classify_function(): + + def fun(texts): + return "dummy" + + return fun -- GitLab