diff --git a/data/models/plwn/.gitignore b/data/models/plwn/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d7c5fc6e0e51f2643632ad8ad24c45a9123fb05d
--- /dev/null
+++ b/data/models/plwn/.gitignore
@@ -0,0 +1 @@
+/plwn-15012022.db
diff --git a/data/models/plwn/plwn-15012022.db.dvc b/data/models/plwn/plwn-15012022.db.dvc
new file mode 100644
index 0000000000000000000000000000000000000000..dea2296c7d873ef67cf09bfa92496a84580cec07
--- /dev/null
+++ b/data/models/plwn/plwn-15012022.db.dvc
@@ -0,0 +1,4 @@
+outs:
+- md5: 143e38207b4a64d10fccab8e145ce38b
+  size: 207540224
+  path: plwn-15012022.db
diff --git a/dvc.lock b/dvc.lock
index 52672d93fe9bcbe1ae3e0ed339c6e3b6eb7a3bcd..d773e5820ba1783e5ac99460dbebcf3c4184feb4 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -180,3 +180,22 @@ stages:
       md5: 6831f104f7c20541548fe72250c45706.dir
       size: 31286120
       nfiles: 2
+  attack@enron_spam:
+    cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name enron_spam
+    deps:
+    - path: data/models/enron_spam
+      md5: 3e16b22f59532c66beeadea958e0579a.dir
+      size: 18505614
+      nfiles: 6
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
+    - path: experiments/scripts/attack.py
+      md5: 80f4725e81f070494d926c4916bd9cf4
+      size: 3352
+    outs:
+    - path: data/results/enron_spam/
+      md5: fc35de09ce5e2aa7043325b2819c0aa0.dir
+      size: 4869
+      nfiles: 1
diff --git a/dvc.yaml b/dvc.yaml
index 92afbaa266962d0bbc8d0f316ec7b6e38c6633d9..88eae99b12db4ba48eccbe23d64ef60b83529082 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -75,3 +75,18 @@ stages:
         - data/preprocessed/${item}
       outs:
         - data/explanations/${item}/
+  attack:
+    foreach:
+      - enron_spam
+    do:
+      wdir: .
+      cmd: >-
+        PYTHONPATH=. python experiments/scripts/attack.py
+        --dataset_name ${item}
+      deps:
+        - experiments/scripts/attack.py
+        - data/models/${item}
+        - data/preprocessed/${item}
+      outs:
+        - data/results/${item}/
+
diff --git a/experiments/scripts/attack.py b/experiments/scripts/attack.py
index 0980b6037a6d9b69dc683db8ca44dc2ecd468d70..2874037108b6d44d4539db6c4d0891692a92550b 100644
--- a/experiments/scripts/attack.py
+++ b/experiments/scripts/attack.py
@@ -1,22 +1,40 @@
 """Script for running attacks on datasets."""
 import click
 import pandas as pd
-import json
 import os
 
 from tqdm import tqdm
 from multiprocessing import cpu_count, Pool
 
 from text_attacks.utils import get_classify_function
-from textfooler import Attack, TextFooler
+from textfooler import Attack, TextFooler, BaseLine, process
 
 TEXT = 'text'
 LEMMAS = 'lemmas'
 TAGS = 'tags'
+ATTACK_SUMMARY = "attacks_summary"
+ATTACK_SUCCEEDED = "attacks_succeeded"
+SIMILARITY = "similarity"
+CHANGED = "changed"
+CHANGED_WORDS = "changed_words"
+SUCCEEDED = "succeeded"
+ALL = "all"
+DIFF = "diff"
+EXPECTED = "expected"
+ACTUAL = "actual"
+COSINE_SCORE = "cosine_score"
+CLASS = "class"
 
-def spoil_sentence(sentence, lemmas, tags, lang, classify_fun, similarity):
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+DEFAULT_RES = {"spoiled": {"attacks_summary": {"succeeded": 0, "all": 1}, "attacks_succeeded": []}}
+
+
+
+def spoil_sentence(sentence, lemmas, tags, lang, similarity, max_sub):
     attack = TextFooler(lang)
-    return attack.process(sentence, lemmas, tags, classify_fun, similarity)
+    # attack = BaseLine(lang, 0.5, 0.4, 0.3)
+    return attack.spoil(sentence, [], lemmas, tags, similarity, max_sub)
 
 
 @click.command()
@@ -33,23 +51,62 @@ def main(dataset_name: str):
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, 'test.jsonl')
     classify = get_classify_function(
-        dataset_name=dataset_name,
+        dataset_name=dataset_name
     )
     dataset_df = pd.read_json(input_file, lines=True)
-    spoiled = []
-    similarity = 0.95
+    # dataset_df = dataset_df[:10]
+
+    spoiled, results = [], []
+    similarity, max_sub = 0.95, 1
     cpus = cpu_count()
-    with Pool(processes=cpus) as pool:
-        results = []
-        for idx in tqdm(range(0, len(dataset_df), cpus)):
-            end = min(idx+cpus, len(dataset_df) + 1)
-            for sentence, lemmas, tags in dataset_df[[TEXT, LEMMAS, TAGS], idx:end]:
-                results.append(pool.apply_async(spoil_sentence, args=[sentence, lemmas,
-                                                                      tags, lang, classify, similarity]))
-            for res in results:
-                spoiled_sent = res.get()
-                spoiled.append(spoiled_sent)
-            results = []
+    classes = classify(dataset_df[TEXT].tolist())
+    # used_id = 0
+    # sent_nbr = len(dataset_df[TEXT])
+    # with Pool(processes=cpus) as pool:
+    #     for idx in range(0, min(cpus, sent_nbr)):
+    #         sentence, lemmas, tags = dataset_df[TEXT][idx], \
+    #                                  dataset_df[LEMMAS][idx], \
+    #                                  dataset_df[TAGS][idx]
+    #
+    #         results.append(pool.apply_async(spoil_sentence, args=[sentence,
+    #                                                               lemmas,
+    #                                                               tags,
+    #                                                               lang,
+    #                                                               similarity,
+    #                                                               max_sub]))
+    #         used_id = idx
+    #     count = len(results)
+    #     while count and used_id < sent_nbr:
+    #         ready = 0
+    #         to_rm = []
+    #         for r in results:
+    #             if r.ready():
+    #                 ready += 1
+    #                 changed_sent = r.get()
+    #                 if changed_sent:
+    #                     spoiled.append(process(changed_sent, classes[i], classify))
+    #                 to_rm.append(r)
+    #         count = len(results) - ready
+    #         results = [res for res in results if res not in to_rm]
+    #         h_bound = min(used_id + cpus - len(results), sent_nbr)
+    #         for i in range(used_id + 1, h_bound):
+    #             used_id += 1
+    #             sentence, lemmas, tags = dataset_df[TEXT][idx], \
+    #                                      dataset_df[LEMMAS][idx], \
+    #                                      dataset_df[TAGS][idx]
+    #
+    #             results.append(pool.apply_async(spoil_sentence, args=[sentence,
+    #                                                                   lemmas,
+    #                                                                   tags,
+    #                                                                   lang,
+    #                                                                   similarity,
+    #                                                                   max_sub]))
+
+    for i, cols in tqdm(dataset_df[[TEXT, LEMMAS, TAGS]].iterrows(), total=len(dataset_df)):
+        sentence, lemmas, tags = cols[0], cols[1], cols[2]
+        changed_sent = spoil_sentence(sentence, lemmas, tags, lang, similarity, max_sub)
+        if changed_sent:
+            spoiled.append(process(changed_sent, classes[i], classify))
 
     with open(output_path, mode="wt") as fd:
         fd.write(pd.DataFrame(