From 77ee2c903ed2b0442dcc7ada578d9a57f3e43df7 Mon Sep 17 00:00:00 2001
From: pwalkow <pwalkow@gpu-server.ws.clarin>
Date: Fri, 17 Mar 2023 17:24:36 +0100
Subject: [PATCH] 10 spoilers

---
 experiments/scripts/attack.py | 80 ++++++++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 20 deletions(-)

diff --git a/experiments/scripts/attack.py b/experiments/scripts/attack.py
index d9a0db8..721d46f 100644
--- a/experiments/scripts/attack.py
+++ b/experiments/scripts/attack.py
@@ -1,4 +1,5 @@
 """Script for running attacks on datasets."""
+import importlib
 import json
 
 import click
@@ -7,14 +8,14 @@ import os
 import torch
 from tqdm import tqdm
 
-from text_attacks.utils import get_classify_function
 from textfooler import Attack, TextFooler, Similarity, BaseLine, \
     process, run_queue, filter_similarity_queue, spoil_queue
-from time import sleep
+from time import sleep, time
 from multiprocessing import Process
 from multiprocessing import Queue, Manager
-import multiprocess
 from threading import Thread
+from sklearn.metrics import classification_report, confusion_matrix
+import numpy as np
 
 TEXT = "text"
 LEMMAS = "lemmas"
@@ -46,27 +47,55 @@ DEFAULT_RES = {
 }
 
 
-def data_producer(queue_out, input_file):
-    dataset_df = pd.read_json(input_file, lines=True)
+def data_producer(queue_out, dataset_df):
     for i, cols in tqdm(
         dataset_df[[TEXT, ID, LEMMAS, TAGS, ORTHS]].iterrows(), total=len(dataset_df)
     ):
         sentence, sent_id, lemmas, tags, orths = cols[0], cols[1], \
             cols[2], cols[3], cols[4]
         queue_out.put([sentence, orths, [], lemmas, tags, sent_id])
-
 
 
-def data_saver(queue_in, output_file):
+def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, que_kill, to_kill_nbr):
+    processed_nbr, start = 0, time()
     item = 1
+    test_y, pred_y = [], []
+    spoiled_sents = []
+    ch_suc, ch_all = 0, 0
     while item is not None:
         item = queue_in.get()
         if item is not None:
-            with open(output_file, 'a') as file_out:
-                json.dump(item, file_out, indent=2)
-
-
-def classify_queue(queue_in, queue_out, queue_log, classify_fun):
+            processed_nbr += 1
+            spoiled, class_test, class_pred = item
+            test_y.append(class_test)
+            pred_y.append(class_pred)
+            queue_log.put(f"Processed and saved {processed_nbr} in {time() - start} s")
+            ch_suc += spoiled[ATTACK_SUMMARY][SUCCEEDED]
+            ch_all += spoiled[ATTACK_SUMMARY][ALL]
+            spoiled_sents.append(spoiled)
+        if processed_nbr == cases_nbr:
+            [que_kill.put(None) for _ in range(to_kill_nbr)]
+    with open(output_file, 'a') as fd:
+        fd.write(
+            pd.DataFrame(spoiled_sents).to_json(
+                orient="records", lines=True
+            )
+        )
+    np.savetxt(f"{output_dir}/metrics.txt", confusion_matrix(test_y, pred_y))
+    with open(f"{output_dir}/metrics.txt", mode="at") as fd:
+        fd.write('\n')
+        fd.write(classification_report(test_y, pred_y))
+        fd.write('\n')
+        fd.write(f"succeeded {ch_suc} all {ch_all}")
+
+
+def classify_queue(queue_in, queue_out, queue_log, dataset_name):
+    fun = getattr(
+        importlib.import_module(f"text_attacks.models.{dataset_name}"),
+        "get_classify_function",
+    )
+    classify_fun = fun(device="cuda" if torch.cuda.is_available() else "cpu")
+    queue_log.put(f"Classify device {'cuda' if torch.cuda.is_available() else 'cpu'}")
     item = True
     while item is not None:
         item = queue_in.get()
@@ -75,9 +104,10 @@ def classify_queue(queue_in, queue_out, queue_log, classify_fun):
             sent_id, org_sentence, changed_sents = item
             sentences = [org_sentence]
             sentences.extend([sent[TEXT] for sent in changed_sents])
-            queue_log.put(f"Classifying sentences {sentences[:100]}")
+            queue_log.put(f"Classifying sentences {len(sentences)}, id {sent_id}")
             classified = classify_fun(sentences)
             queue_out.put((sent_id, org_sentence, changed_sents, classified))
+            queue_log.put(f"Classified sentences {sent_id}")
     queue_out.put(None)
 
 
@@ -123,21 +153,31 @@ def main(dataset_name: str, attack_type: str):
     input_file = f"data/preprocessed/{dataset_name}/test.jsonl"
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, "test.jsonl")
-    classify = get_classify_function(dataset_name=dataset_name, device="cpu")
+    dataset_df = pd.read_json(input_file, lines=True)
     max_sub = 1
 
     m = Manager()
     queues = [m.Queue(maxsize=QUEUE_SIZE) for _ in range(6)]
 
     sim = Similarity(queues[5], 0.95, "distiluse-base-multilingual-cased-v1")
-    processes = [Process(target=data_producer, args=(queues[0], input_file,)),  # loading data file_in -> 0
-                 Process(target=spoil_queue, args=(queues[0], queues[1], max_sub, attack_type, params)),  # spoiling 0 -> 1
+    processes = [Process(target=data_producer, args=(queues[0], dataset_df,)),  # loading data file_in -> 0
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),
+                 Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params)),  # spoiling 0 -> 1
                  Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)),  # cosim 1 -> 2
-                 multiprocess.Process(target=classify_queue, args=(queues[2], queues[3], queues[5], classify, )),  # classify changed 2 -> 3
-                 Process(target=run_queue, args=(queues[3], queues[4], process,)),  # process 3 -> 4
-                 Process(target=data_saver, args=(queues[4], output_path,))]  # saving 4 -> file_out
+                 Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, )),  # classify changed 2 -> 3
+                 Process(target=run_queue, args=(queues[3], queues[4], queues[5], process,)),  # process 3 -> 4
+                 Process(target=data_saver, args=(queues[4], queues[5], output_path, output_dir, len(dataset_df),
                                                   queues[0], 11))]  # saving 4 -> file_out
     [p.start() for p in processes]
-    log_que = Thread(target=log_queues, args=(queues, ))
+    log_que = Thread(target=log_queues, args=(queues[:5], ))
     log_que.daemon = True
    log_que.start()
     info_que = Thread(target=log_info_queue, args=(queues[5], ))
--
GitLab