diff --git a/.gitignore b/.gitignore
index 976a184864ad03354ff818d147fc6bf059e6dcf7..853a525fe88b9836e04cb5a30b396f1627b8d327 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,3 +153,6 @@ dmypy.json
 cython_debug/
 
 .idea/
+
+# Lpmn config
+experiments/configs/config.yml
diff --git a/data/datasets/.gitignore b/data/datasets/.gitignore
index 60ba70084bebf52a8521c364d3a4b019028fe084..af871dfd6ce1c3aab7e8d1a405df6390acab6f65 100644
--- a/data/datasets/.gitignore
+++ b/data/datasets/.gitignore
@@ -1 +1,2 @@
 /enron_spam
+/poleval
diff --git a/data/preprocessed/.gitignore b/data/preprocessed/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..8cfcddc78ec875a51b9da2648f6a001136ec87fe
--- /dev/null
+++ b/data/preprocessed/.gitignore
@@ -0,0 +1,2 @@
+/poleval
+/enron_spam
diff --git a/dvc.lock b/dvc.lock
index e0d6202837a1f42b7033074e8abb65adfbc92239..346ede5ec88b823d474a3c0dc3adddaad6c1f578 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -16,9 +16,9 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name
      enron_spam --output_dir data/models/enron_spam
     deps:
-    - path: data/datasets/enron_spam
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
       nfiles: 3
     - path: experiments/scripts/get_model.py
       md5: 5050f51b4019bba97af47971f6c7cab4
@@ -32,39 +32,81 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name
      enron_spam --output_dir data/classification/enron_spam
     deps:
-    - path: data/datasets/enron_spam/
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
-      nfiles: 3
     - path: data/models/enron_spam/
       md5: 3e16b22f59532c66beeadea958e0579a.dir
       size: 18505614
       nfiles: 6
+    - path: data/preprocessed/enron_spam/
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
     - path: experiments/scripts/classify.py
-      md5: 50f55b90eb47cbf448d83f8392dd37b6
-      size: 1102
+      md5: ba9284c90847fbbd0f2a6cca414d9636
+      size: 1106
     outs:
     - path: data/classification/enron_spam
-      md5: c7d42825b98b289f6a5ed3be1af14413.dir
-      size: 2763843
+      md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir
+      size: 10674882
       nfiles: 2
   explain@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name
      enron_spam --output_dir data/explanations/enron_spam
     deps:
-    - path: data/datasets/enron_spam
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
-      nfiles: 3
     - path: data/models/enron_spam
       md5: 3e16b22f59532c66beeadea958e0579a.dir
       size: 18505614
       nfiles: 6
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
     - path: experiments/scripts/explain.py
-      md5: c85cbb774f2682ee39948e701fa0b0ca
-      size: 1445
+      md5: 4e40a6415038ec6eb4140b54ff65c9c0
+      size: 1449
     outs:
     - path: data/explanations/enron_spam/
-      md5: 376bd1619c08b4989564788e74de8e06.dir
+      md5: 345282e7c4e774d55aba55ed56ec464f.dir
       size: 7870394
       nfiles: 1
+  download_dataset@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/download_dataset.py --dataset_name
+      poleval --output_dir data/datasets/poleval
+    deps:
+    - path: experiments/scripts/download_dataset.py
+      md5: 9eb915fd5b9216965db519f686408a51
+      size: 887
+    outs:
+    - path: data/datasets/poleval/
+      md5: 826f974f794e24efcb5aedb054d1fd55.dir
+      size: 1688836
+      nfiles: 3
+  preprocess_dataset@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name poleval
+    deps:
+    - path: data/datasets/poleval/
+      md5: 826f974f794e24efcb5aedb054d1fd55.dir
+      size: 1688836
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
+    outs:
+    - path: data/preprocessed/poleval/
+      md5: 854387459b193c5eba6db1273ca5ad23.dir
+      size: 2277282
+      nfiles: 3
+  preprocess_dataset@enron_spam:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
+    deps:
+    - path: data/datasets/enron_spam/
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
+      size: 53096069
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
+    outs:
+    - path: data/preprocessed/enron_spam/
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
diff --git a/dvc.yaml b/dvc.yaml
index c035ccba805ed740cf2cdba55f6217d2a7f97712..533298e55d4632d5a8413d34b917d531bf334396 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -13,9 +13,24 @@ stages:
       - experiments/scripts/download_dataset.py
       outs:
       - data/datasets/${item}/
+  preprocess_dataset:
+    foreach:
+      - enron_spam
+      - poleval
+    do:
+      wdir: .
+      cmd: >-
+        PYTHONPATH=. python experiments/scripts/tag_dataset.py
+        --dataset_name ${item}
+      deps:
+      - experiments/scripts/tag_dataset.py
+      - data/datasets/${item}/
+      outs:
+      - data/preprocessed/${item}/
   get_model:
     foreach:
       - enron_spam
+      # - poleval
     do:
       wdir: .
       cmd: >-
@@ -24,12 +39,13 @@ stages:
         --output_dir data/models/${item}
       deps:
      - experiments/scripts/get_model.py
-      - data/datasets/${item}
+      - data/preprocessed/${item}
       outs:
       - data/models/${item}/
   classify:
     foreach:
       - enron_spam
+      # - poleval
     do:
       wdir: .
       cmd: >-
@@ -39,12 +55,13 @@ stages:
         --dataset_name ${item} --output_dir data/classification/${item}
       deps:
       - experiments/scripts/classify.py
       - data/models/${item}/
-      - data/datasets/${item}/
+      - data/preprocessed/${item}/
       outs:
       - data/classification/${item}
   explain:
     foreach:
       - enron_spam
+      # - poleval
     do:
       wdir: .
       cmd: >-
@@ -54,6 +71,6 @@ stages:
       deps:
       - experiments/scripts/explain.py
       - data/models/${item}
-      - data/datasets/${item}
+      - data/preprocessed/${item}
       outs:
       - data/explanations/${item}/
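
Note: with the new preprocess_dataset stage in place, every downstream stage
(get_model, classify, explain) reads data/preprocessed/${item}/ instead of
data/datasets/${item}/, so models and explanations always see the splits
enriched by tag_dataset.py. A minimal sketch of what an enriched split holds,
assuming the pipeline has been reproduced for enron_spam (paths as in the
stages above):

    import pandas as pd

    # tag_dataset.py keeps the original columns and adds per-document
    # "lemmas" and "tags" lists produced by the tagger.
    test = pd.read_json("data/preprocessed/enron_spam/test.jsonl", lines=True)
    print(test[["text", "lemmas", "tags"]].head())
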
diff --git a/experiments/scripts/attack.py b/experiments/scripts/attack.py
new file mode 100644
index 0000000000000000000000000000000000000000..0980b6037a6d9b69dc683db8ca44dc2ecd468d70
--- /dev/null
+++ b/experiments/scripts/attack.py
@@ -0,0 +1,61 @@
+"""Script for running attacks on datasets."""
+import click
+import pandas as pd
+import os
+from tqdm import tqdm
+from multiprocessing import cpu_count, Pool
+from text_attacks.utils import get_classify_function
+from textfooler import TextFooler
+
+
+TEXT = 'text'
+LEMMAS = 'lemmas'
+TAGS = 'tags'
+
+
+def spoil_sentence(sentence, lemmas, tags, lang, classify_fun, similarity):
+    attack = TextFooler(lang)
+    return attack.process(sentence, lemmas, tags, classify_fun, similarity)
+
+
+@click.command()
+@click.option(
+    "--dataset_name",
+    help="Dataset name",
+    type=str,
+)
+def main(dataset_name: str):
+    """Runs adversarial attacks on the preprocessed test split."""
+    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    output_dir = f"data/results/{dataset_name}"
+    input_file = f"data/preprocessed/{dataset_name}/test.jsonl"
+    os.makedirs(output_dir, exist_ok=True)
+    output_path = os.path.join(output_dir, 'test.jsonl')
+    classify = get_classify_function(
+        dataset_name=dataset_name,
+    )
+    dataset_df = pd.read_json(input_file, lines=True)
+    spoiled = []
+    similarity = 0.95
+    cpus = cpu_count()
+    with Pool(processes=cpus) as pool:
+        results = []
+        # Submit one task per sentence, one CPU-sized chunk at a time,
+        # draining the results between chunks.
+        for idx in tqdm(range(0, len(dataset_df), cpus)):
+            end = min(idx + cpus, len(dataset_df))
+            rows = dataset_df.iloc[idx:end][[TEXT, LEMMAS, TAGS]]
+            for sentence, lemmas, tags in rows.itertuples(index=False):
+                results.append(pool.apply_async(
+                    spoil_sentence,
+                    args=[sentence, lemmas, tags, lang, classify, similarity]))
+            for res in results:
+                spoiled_sent = res.get()
+                spoiled.append(spoiled_sent)
+            results = []
+
+    with open(output_path, mode="wt") as fd:
+        fd.write(pd.DataFrame(
+            {"spoiled": spoiled}).to_json(
+            orient='records', lines=True))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/experiments/scripts/classify.py b/experiments/scripts/classify.py
index b642d9b11cc2a1d7e90130113c7c438e5d22ac85..9639d298904c1a2815f0b34f8cbb6894df6c8527 100644
--- a/experiments/scripts/classify.py
+++ b/experiments/scripts/classify.py
@@ -28,7 +28,7 @@ def main(
     classify = get_classify_function(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
     test_x = test["text"].tolist()
     test_y = test["label"]
     pred_y = classify(test_x)
diff --git a/experiments/scripts/explain.py b/experiments/scripts/explain.py
index a9576fcd18ab2d33b39270cd105820708e4e2c60..6cdb2165e86e38d3c5b5a1b5089ac48c48e0ccb1 100644
--- a/experiments/scripts/explain.py
+++ b/experiments/scripts/explain.py
@@ -43,7 +43,7 @@ def main(
     model, tokenizer = get_model_and_tokenizer(
         dataset_name=dataset_name,
    )
-    test = pd.read_json(f"data/datasets/{dataset_name}/adversarial.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/adversarial.jsonl", lines=True)
     test_x = test["text"].tolist()
     predict = build_predict_fun(model, tokenizer)
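
Note: attack.py fans the per-sentence work out to a process pool, but only
cpu_count() tasks at a time, draining the pending results between chunks so
unfinished futures never pile up in memory; tag_dataset.py below uses the
same pattern. A self-contained sketch of the pattern, with a hypothetical
work() standing in for spoil_sentence:

    from multiprocessing import Pool, cpu_count

    def work(item):
        # Stand-in for the real per-sentence job.
        return item * item

    if __name__ == "__main__":
        items = list(range(100))
        cpus = cpu_count()
        collected = []
        with Pool(processes=cpus) as pool:
            for start in range(0, len(items), cpus):
                chunk = items[start:start + cpus]
                futures = [pool.apply_async(work, args=[it]) for it in chunk]
                # Drain this chunk before submitting the next one; results
                # come back in submission order.
                collected.extend(f.get() for f in futures)
        print(len(collected))  # 100
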
diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b0671fc6ba9111847dabd3e1dea7a1fb7d71fb
--- /dev/null
+++ b/experiments/scripts/tag_dataset.py
@@ -0,0 +1,96 @@
+"""Script for running tagger on datasets."""
+import click
+import pandas as pd
+from lpmn_client_biz import Connection, IOType, Task, download
+import json
+import os
+from tqdm import tqdm
+from multiprocessing import cpu_count, Pool
+
+TOKENS = 'tokens'
+ORTH = 'orth'
+LEXEMES = 'lexemes'
+LEMMA = 'lemma'
+MSTAG = 'mstag'
+TEXT = 'text'
+LEMMAS = 'lemmas'
+TAGS = 'tags'
+
+
+def tag_sentence(sentence: str, lang: str):
+    connection = Connection(config_file="experiments/configs/config.yml")
+    lpmn = ["morphodita",
+            {"posconverter":
+                {"input_format": "ccl", "output_format": "json"}}] \
+        if lang == 'pl' else [{"spacy": {"lang": "en"}}]
+    task = Task(lpmn, connection=connection)
+    output_file_id = task.run(str(sentence), IOType.TEXT)
+    tokens = []
+    try:
+        clarin_json = json.loads(download(connection, output_file_id,
+                                          IOType.TEXT).decode("utf-8"))
+        tokens = clarin_json[TOKENS]
+    except json.decoder.JSONDecodeError:
+        # Large responses come back as a file of JSON lines instead of a
+        # single JSON document.
+        downloaded = download(connection, output_file_id, IOType.FILE)
+        with open(downloaded, 'r') as file:
+            lines = [json.loads(line) for line in file.readlines()]
+        for line in lines:
+            tokens.extend(line[TOKENS])
+        os.remove(downloaded)
+    lemmas, tags = [], []
+    for token in tokens:
+        # Keep only the first lexeme of each token.
+        lexeme = token[LEXEMES][0]
+        lemmas.append(lexeme[LEMMA])
+        tags.append(lexeme[MSTAG])
+    return lemmas, tags
+
+
+def process_file(dataset_df, lang, output_path):
+    test_with_tags = pd.DataFrame(dataset_df)
+    lemmas_col, tags_col = [], []
+    cpus = cpu_count()
+    with Pool(processes=cpus) as pool:
+        results = []
+        # Tag one CPU-sized chunk of sentences at a time, draining the
+        # results between chunks.
+        for idx in tqdm(range(0, len(dataset_df), cpus)):
+            end = min(idx + cpus, len(dataset_df))
+            for sentence in dataset_df[TEXT][idx:end]:
+                results.append(pool.apply_async(tag_sentence,
+                                                args=[sentence, lang]))
+            for res in results:
+                lemmas, tags = res.get()
+                lemmas_col.append(lemmas)
+                tags_col.append(tags)
+            results = []
+    test_with_tags[LEMMAS] = lemmas_col
+    test_with_tags[TAGS] = tags_col
+
+    with open(output_path, mode="wt") as fd:
+        fd.write(test_with_tags.to_json(orient='records', lines=True))
+
+
+@click.command()
+@click.option(
+    "--dataset_name",
+    help="Dataset name",
+    type=str,
+)
+def main(dataset_name: str):
+    """Tags the dataset and stores it under data/preprocessed."""
+    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    output_dir = f"data/preprocessed/{dataset_name}"
+    os.makedirs(output_dir, exist_ok=True)
+
+    input_dir = f"data/datasets/{dataset_name}"
+    for file in os.listdir(input_dir):
+        if os.path.isfile(os.path.join(input_dir, file)):
+            if file == "test.jsonl":
+                process_file(pd.read_json(os.path.join(input_dir, file),
+                                          lines=True),
+                             lang, os.path.join(output_dir, file))
+            else:
+                # Other splits are copied through with empty lemma/tag
+                # columns so all files share one schema.
+                test_with_tags = pd.DataFrame(pd.read_json(
+                    os.path.join(input_dir, file), lines=True))
+                test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))]
+                test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))]
+                with open(os.path.join(output_dir, file), mode="wt") as fd:
+                    fd.write(test_with_tags.to_json(orient='records',
+                                                    lines=True))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
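
Note: tag_sentence assumes the service returns CLARIN-style JSON in which
every token carries a list of lexemes, and it keeps only the first lexeme's
lemma and mstag. A sketch of that assumed payload shape and of the reduction
to the lemmas/tags columns (the sample values are illustrative, not real
service output):

    import json

    sample = json.loads("""
    {"tokens": [
     {"orth": "Kot", "lexemes": [{"lemma": "kot", "mstag": "subst:sg:nom:m2"}]},
     {"orth": "śpi", "lexemes": [{"lemma": "spać", "mstag": "fin:sg:ter:imperf"}]}
    ]}
    """)

    lemmas = [token["lexemes"][0]["lemma"] for token in sample["tokens"]]
    tags = [token["lexemes"][0]["mstag"] for token in sample["tokens"]]
    print(lemmas)  # ['kot', 'spać']
    print(tags)    # ['subst:sg:nom:m2', 'fin:sg:ter:imperf']
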
diff --git a/requirements.txt b/requirements.txt
index 83b9c69cc24fc13e377f4b09937beba3b44c18dc..fec55bda346006486c107831bf7b436564332a9e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,11 @@ click
 scikit-learn
 dvc[s3]
 shap
+lpmn_client_biz
 --find-links https://download.pytorch.org/whl/torch_stable.html
 torch==1.12.0+cu116
+
+--index-url https://pypi.clarin-pl.eu/simple/
+plwn-api
+git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop
diff --git a/text_attacks/models/poleval.py b/text_attacks/models/poleval.py
new file mode 100644
index 0000000000000000000000000000000000000000..a037f8d0ff30367e20ffabf015b211aa992cf7ce
--- /dev/null
+++ b/text_attacks/models/poleval.py
@@ -0,0 +1,13 @@
+"""Classification model for poleval."""
+
+
+def get_model_and_tokenizer():
+    # Placeholder until a dedicated poleval model is trained.
+    return None, None
+
+
+def get_classify_function():
+
+    def fun(texts):
+        # Placeholder: one dummy label per input text, matching the
+        # list-in/list-out contract expected by classify.py.
+        return ["dummy"] * len(texts)
+
+    return fun
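
Note: classify.py and attack.py only require that a dataset's model module
expose a get_classify_function returning a callable that maps a list of texts
to a list of labels (classify.py compares the result against test["label"]).
The poleval module above is a stub that satisfies this contract with dummy
labels until a real model is trained; a usage sketch:

    from text_attacks.models.poleval import get_classify_function

    classify = get_classify_function()
    preds = classify(["przykładowy tekst", "inny tekst"])
    assert len(preds) == 2  # one (dummy) label per input text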