Commit 4c04019c authored by Paweł Walkowiak

Experiments

parent 3545a63b
dvc.lock
@@ -709,12 +709,12 @@ stages:
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: 9518ec9af275d6a12fede47dff6767e1
size: 11530
md5: c464fe658004e1d0b2f45bf0dbdbfb42
size: 11947
outs:
- path: data/results/attack_xai_local/poleval/
md5: 7597e90d1ddfa82615e79f6821d90e1b.dir
size: 188754
md5: 3f47355fe91d8df7cb5b598da22bccdc.dir
size: 275308
nfiles: 2
attack_xai_discard_local@poleval:
cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name poleval
@@ -749,12 +749,12 @@ stages:
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: 9518ec9af275d6a12fede47dff6767e1
size: 11530
md5: 5c37737865b0e3524be76396330e683f
size: 9556
outs:
- path: data/results/attack_xai/poleval/
md5: d368af0f7069a5f43b9cf6f3a0422522.dir
size: 189001
md5: 1ed13c64a5ae2ed24598be64b36ad26e.dir
size: 84472229
nfiles: 2
attack_xai_discard@poleval:
cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name poleval
@@ -769,12 +769,12 @@ stages:
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: 9518ec9af275d6a12fede47dff6767e1
size: 11530
md5: e46bca05ac076c87522c7318257026ba
size: 10247
outs:
- path: data/results/attack_xai_discard/poleval/
md5: 83bbb41d4e1303329330c981cf50ece6.dir
size: 188316
md5: 70dbf6dacdfb2be5e41e462bd9b6ad8d.dir
size: 8606443
nfiles: 2
attack_basic@poleval:
cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name poleval
@@ -789,12 +789,12 @@ stages:
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: 9518ec9af275d6a12fede47dff6767e1
size: 11530
md5: d2a15a7f4c3d065c67db7caf4aaa0dae
size: 9556
outs:
- path: data/results/attack_basic/poleval/
md5: 2ba20316e75c6401e764a42c8c9ba02d.dir
size: 220962
md5: 70849358008ba01eebce555a4a1e1482.dir
size: 238609
nfiles: 2
attack_textfooler_discard@poleval:
cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name poleval
@@ -809,12 +809,12 @@ stages:
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: 9518ec9af275d6a12fede47dff6767e1
size: 11530
md5: d2a15a7f4c3d065c67db7caf4aaa0dae
size: 9556
outs:
- path: data/results/attack_textfooler_discard/poleval/
md5: 71e521e256665812795d75e545ee4e9a.dir
size: 205890
md5: 884f622d1b9a8e531583ff6dcabe3f95.dir
size: 1536088
nfiles: 2
attack_textfooler@poleval:
cmd: 'PYTHONPATH=. python experiments/scripts/attack.py --dataset_name poleval
@@ -829,10 +829,30 @@ stages:
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: 9518ec9af275d6a12fede47dff6767e1
size: 11530
md5: 5c37737865b0e3524be76396330e683f
size: 9556
outs:
- path: data/results/attack_textfooler/poleval/
md5: 1cc24839e9c653182cb312f5c66a6a88.dir
size: 275318
md5: 37e5f7b554c1959fbf1fabb84bf32ed8.dir
size: 89141787
nfiles: 2
attack_xai_char_discard@poleval:
cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name poleval
--attack_type attack_xai_char_discard
deps:
- path: data/classification/poleval
md5: f207458f9365a74672c31b5ffb2a83af.dir
size: 787456
nfiles: 2
- path: data/models/poleval
md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
size: 501609312
nfiles: 7
- path: experiments/scripts/attack.py
md5: aa42fd50ddee64a0002e210270376b88
size: 10247
outs:
- path: data/results/attack_xai_char_discard/poleval/
md5: 134ee8022b841597f6a14796bdbbcf30.dir
size: 142837300
nfiles: 2
dvc.yaml
@@ -199,4 +199,37 @@ stages:
- data/classification/${item}
outs:
- data/results/attack_basic/${item}/
attack_xai_char_discard:
foreach:
- enron_spam
- poleval
- 20_news
- wiki_pl
do:
wdir: .
cmd: >-
PYTHONPATH=. python experiments/scripts/attack.py
--dataset_name ${item} --attack_type attack_xai_char_discard
deps:
- experiments/scripts/attack.py
- data/models/${item}
- data/classification/${item}
outs:
- data/results/attack_xai_char_discard/${item}/
attack_xai_char_discard_local:
foreach:
- enron_spam
- poleval
- 20_news
- wiki_pl
do:
wdir: .
cmd: >-
PYTHONPATH=. python experiments/scripts/attack.py
--dataset_name ${item} --attack_type attack_xai_char_discard_local
deps:
- experiments/scripts/attack.py
- data/models/${item}
- data/classification/${item}
outs:
- data/results/attack_xai_char_discard_local/${item}/
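Each foreach stage above expands into one DVC stage per dataset; the expanded attack_xai_char_discard@poleval entry appears in the dvc.lock hunk earlier in this commit. The following minimal Python sketch is for illustration only (not part of the commit) and shows the stage names and commands the matrix generates:

# Illustration (not part of the commit): expansion of the attack_xai_char_discard
# foreach matrix into per-dataset stages and the command each stage runs.
datasets = ["enron_spam", "poleval", "20_news", "wiki_pl"]
for item in datasets:
    stage = f"attack_xai_char_discard@{item}"
    cmd = ("PYTHONPATH=. python experiments/scripts/attack.py "
           f"--dataset_name {item} --attack_type attack_xai_char_discard")
    print(f"{stage}: {cmd}")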
experiments/scripts/attack.py
@@ -15,7 +15,7 @@ from multiprocessing import Process
from multiprocessing import Queue, Manager
from threading import Thread
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
TEXT = "text"
LEMMAS = "lemmas"
@@ -43,6 +43,7 @@ SYNONYM = "synonym"
DISCARD = "discard"
GLOBAL = "global"
LOCAL = "local"
CHAR_DISCARD = "char_discard"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -69,18 +69,28 @@ def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, queues_k
item = 1
test_y, pred_y = [], []
spoiled_sents = []
ch_suc, ch_all = 0, 0
ch_suc, ch_all, synonyms_nbr = 0, 0, 0
samples, samples_succ = 0, 0
count_tokens, sum_tokens = 0, 0
end_time = time()
while item is not None:
item = queue_in.get()
if item is not None:
processed_nbr += 1
spoiled, class_test, class_pred = item
spoiled, class_test, class_pred, synonym_nbr = item
test_y.append(class_test)
pred_y.append(class_pred)
queue_log.put(f"Processed and saved {processed_nbr} in {time() - start} s")
samples_succ = samples_succ + 1 if spoiled[ATTACK_SUMMARY][SUCCEEDED] > 0 else samples_succ
samples += 1
for success in spoiled[ATTACK_SUCCEEDED]:
if CHANGED_WORDS in success:
count_tokens += len(success[CHANGED_WORDS])
sum_tokens += 1
ch_suc += spoiled[ATTACK_SUMMARY][SUCCEEDED]
ch_all += spoiled[ATTACK_SUMMARY][ALL]
synonyms_nbr += synonym_nbr
spoiled_sents.append(spoiled)
if processed_nbr == cases_nbr:
for que_kill in queues_kill:
@@ -94,12 +94,27 @@ def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, queues_k
with open(output_file, 'wt') as fd:
fd.write(pd.DataFrame(spoiled_sents).to_json(
orient="records", lines=True))
np.savetxt(f"{output_dir}/metrics.txt", confusion_matrix(test_y, pred_y))
with open(f"{output_dir}/metrics.txt", mode="at") as fd:
fd.write('\n')
fd.write(classification_report(test_y, pred_y))
fd.write('\n')
fd.write(f"succeeded {ch_suc} all {ch_all}")
metrics = {
"confusion_matrix": confusion_matrix(test_y, pred_y).tolist(),
"classification_report": classification_report(test_y, pred_y, output_dict=True),
"attacks_succeeded": ch_suc,
"attacks_all": ch_all,
"synonyms_nbr": synonyms_nbr,
"success_rate": ch_suc / ch_all,
"success_rate_per_synonym": ch_suc / synonyms_nbr,
"time": time() - start,
"samples": samples,
"samples_succ": samples_succ,
"count_tokens": count_tokens,
"sum_tokens": sum_tokens,
"%F": (samples - samples_succ) / samples if samples > 0 else 0,
"%C": count_tokens / sum_tokens if sum_tokens > 0 else 0,
"BLEU": 0,
"P": 0
}
with open(f"{output_dir}/metrics.json", mode="w") as fd:
json.dump(metrics, fd)
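# Worked example (hypothetical counters, not experiment results) of the aggregate
# fields written above: "%F" is the fraction of samples for which no attack
# succeeded, "%C" the average number of changed words per successful attack.
#   samples, samples_succ = 100, 60
#   count_tokens, sum_tokens = 150, 60
#   ch_suc, ch_all, synonyms_nbr = 75, 400, 1200
#   success_rate             = 75 / 400          # 0.1875
#   success_rate_per_synonym = 75 / 1200         # 0.0625
#   "%F" = (100 - 60) / 100                      # 0.40
#   "%C" = 150 / 60                              # 2.5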
def classify_queue(queue_in, queue_out, queue_log, dataset_name, cuda_device):
@@ -115,11 +115,11 @@ def classify_queue(queue_in, queue_out, queue_log, dataset_name, cuda_device):
item = queue_in.get()
queue_log.put("Classify got from queue")
if item is not None:
sent_id, org_sentence, y_pred, changed_sents = item
sent_id, org_sentence, y_pred, changed_sents, synonyms_nbr = item
sentences = [sent[TEXT] for sent in changed_sents]
queue_log.put(f"Classifying sentences {len(sentences)}, id {sent_id}")
classified = classify_fun(sentences) if sentences else []
queue_out.put((sent_id, org_sentence, changed_sents, y_pred, classified))
queue_out.put((sent_id, org_sentence, changed_sents, y_pred, classified, synonyms_nbr))
queue_log.put(f"Classified sentences {sent_id}")
@@ -134,7 +134,9 @@ def log_info_queue(queue):
print("Logging queue")
while True:
item = queue.get()
print(item)
if item is not None:
print(item)
print("Logging queue finished")
def load_dir_files(dir_path):
@@ -187,6 +215,10 @@ def main(dataset_name: str, attack_type: str):
importance = load_xai_importance(f"data/explanations/{dataset_name}")
xai_global, xai_local = importance[0], importance[1]
xai_sub = 5
max_sub = 8
char_delete_size = 0.4
similarity_bound = 0.7
params = {
"attack_textfooler": [lang, SYNONYM],
"attack_textfooler_discard": [lang, DISCARD],
@@ -194,61 +226,18 @@ def main(dataset_name: str, attack_type: str):
"attack_xai": [lang, xai_global, xai_local, GLOBAL, SYNONYM, xai_sub],
"attack_xai_discard": [lang, xai_global, xai_local, GLOBAL, DISCARD, xai_sub],
"attack_xai_local": [lang, xai_global, xai_local, LOCAL, SYNONYM, xai_sub],
"attack_xai_discard_local": [lang, xai_global, xai_local, LOCAL, DISCARD, xai_sub]
"attack_xai_discard_local": [lang, xai_global, xai_local, LOCAL, DISCARD, xai_sub],
"attack_xai_char_discard": [lang, xai_global, xai_local, GLOBAL, CHAR_DISCARD, xai_sub, char_delete_size],
"attack_xai_char_discard_local": [lang, xai_global, xai_local, LOCAL, CHAR_DISCARD, xai_sub, char_delete_size]
}[attack_type]
output_dir = f"data/results/{attack_type}/{dataset_name}/"
input_file = f"data/classification/{dataset_name}/test.jsonl"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "test.jsonl")
dataset_df = pd.read_json(input_file, lines=True)
max_sub = 1
m = Manager()
queues = [m.Queue(maxsize=QUEUE_SIZE) for _ in range(6)]
sim = Similarity(queues[5], 0.95, "distiluse-base-multilingual-cased-v1")
processes = [
Process(target=data_producer, args=(queues[0], dataset_df,)), # loading data file_in -> 0
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
# spoiling 0 -> 1
Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)),
Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)), # cosim 1 -> 2
Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, "6")),
Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, "4")),
# classify changed 2 -> 3
Process(target=run_queue, args=(queues[3], queues[4], queues[5], process,)), # process 3 -> 4
Process(target=data_saver, args=(queues[4], queues[5], output_path, output_dir, len(dataset_df), queues, 30))
# saving 4 -> file_out
]
[p.start() for p in processes]
log_que = Thread(target=log_queues, args=(queues[:5],))
log_que.daemon = True
@@ -256,6 +245,25 @@
info_que = Thread(target=log_info_queue, args=(queues[5],))
info_que.daemon = True
info_que.start()
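# Queue topology (matching the inline comments below):
#   queues[0]: rows read by data_producer from the input file
#   queues[1]: perturbed sentences produced by the spoil_queue workers
#   queues[2]: perturbations kept by the similarity filter
#   queues[3]: classifier predictions for the perturbed sentences
#   queues[4]: processed attack records consumed by data_saver
#   queues[5]: log messages drained by log_info_queue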
processes_nbr = 30
sim = Similarity(queues[5], similarity_bound, "distiluse-base-multilingual-cased-v1")
processes = [Process(target=data_producer, args=(queues[0], dataset_df,))] # loading data file_in -> 0
processes.extend([Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params))
for _ in range(processes_nbr)]) # spoiling 0 -> 1
processes.extend([Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)),
Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)), # cosim 1 -> 2
Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, "3")),
Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, "3")),
# classify changed 2 -> 3
Process(target=run_queue, args=(queues[3], queues[4], queues[5], process,)), # process 3 -> 4
Process(target=data_saver, args=(queues[4], queues[5], output_path, output_dir, len(dataset_df), queues, processes_nbr+6)) # saving 4 -> file_out
])
[p.start() for p in processes]
# wait for all processes to finish
[p.join() for p in processes]
log_que.join(timeout=0.5)