Commit 9de50e73 authored by Mateusz Gniewkowski

Merge branch 'textfooler' into 'master'

Add attacks

See merge request adversarial-attacks/text-attacks!2
parents 2dc58c39 13b97a2d
.gitignore
/plwn-15012022.db

plwn-15012022.db.dvc
outs:
- md5: 143e38207b4a64d10fccab8e145ce38b
  size: 207540224
  path: plwn-15012022.db
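Editor's note: the outs entry pins the PLWN database by content hash; DVC caches the file and checks it out by this md5. For a single binary file like this one, DVC's md5 is the plain MD5 of the file's bytes (directory outputs use a separate ".dir" manifest hash), so the pin can be verified with the standard library once the file has been pulled. A minimal sketch:

import hashlib

# Recompute the content hash DVC pinned for plwn-15012022.db.
# Assumes the file is present locally (e.g. after `dvc pull`).
md5 = hashlib.md5()
with open("plwn-15012022.db", "rb") as fh:
    for chunk in iter(lambda: fh.read(1 << 20), b""):
        md5.update(chunk)

assert md5.hexdigest() == "143e38207b4a64d10fccab8e145ce38b"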
dvc.lock
@@ -180,3 +180,22 @@ stages:
       md5: 6831f104f7c20541548fe72250c45706.dir
       size: 31286120
       nfiles: 2
+  attack@enron_spam:
+    cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name enron_spam
+    deps:
+    - path: data/models/enron_spam
+      md5: 3e16b22f59532c66beeadea958e0579a.dir
+      size: 18505614
+      nfiles: 6
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
+    - path: experiments/scripts/attack.py
+      md5: 80f4725e81f070494d926c4916bd9cf4
+      size: 3352
+    outs:
+    - path: data/results/enron_spam/
+      md5: fc35de09ce5e2aa7043325b2819c0aa0.dir
+      size: 4869
+      nfiles: 1
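Editor's note: the stage's single output file under data/results/enron_spam/ is the test.jsonl written by attack.py (see the script diff below). A minimal sketch for inspecting it after `dvc repro attack@enron_spam`; the exact record layout is an assumption based on the constants defined in the script, not something this diff confirms:

import pandas as pd

# Load the attack results; "test.jsonl" is the file name used in attack.py.
# The record fields are not shown in this diff, so just list what is there.
results = pd.read_json("data/results/enron_spam/test.jsonl", lines=True)
print(results.columns.tolist())
print(results.head())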
dvc.yaml
@@ -75,3 +75,18 @@ stages:
       - data/preprocessed/${item}
       outs:
       - data/explanations/${item}/
+  attack:
+    foreach:
+    - enron_spam
+    do:
+      wdir: .
+      cmd: >-
+        PYTHONPATH=. python experiments/scripts/attack.py
+        --dataset_name ${item}
+      deps:
+      - experiments/scripts/attack.py
+      - data/models/${item}
+      - data/preprocessed/${item}
+      outs:
+      - data/results/${item}/
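Editor's note: the foreach/do block is a templated stage; each list item expands into a stage named attack@<item>, which is exactly the attack@enron_spam entry recorded in dvc.lock above, and adding a dataset means adding one more list item. A small sketch that prints the expanded stage names (assumes PyYAML is installed; `dvc stage list` reports the same information):

import yaml  # PyYAML, assumed available

# Print the stage names DVC generates from the foreach list in dvc.yaml.
with open("dvc.yaml") as fh:
    stages = yaml.safe_load(fh)["stages"]

for item in stages["attack"]["foreach"]:
    print(f"attack@{item}")  # -> attack@enron_spam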
"""Script for running attacks on datasets.""" """Script for running attacks on datasets."""
import click import click
import pandas as pd import pandas as pd
import json
import os import os
from tqdm import tqdm from tqdm import tqdm
from multiprocessing import cpu_count, Pool from multiprocessing import cpu_count, Pool
from text_attacks.utils import get_classify_function from text_attacks.utils import get_classify_function
from textfooler import Attack, TextFooler from textfooler import Attack, TextFooler, BaseLine, process
TEXT = 'text' TEXT = 'text'
LEMMAS = 'lemmas' LEMMAS = 'lemmas'
TAGS = 'tags' TAGS = 'tags'
ATTACK_SUMMARY = "attacks_summary"
ATTACK_SUCCEEDED = "attacks_succeeded"
SIMILARITY = "similarity"
CHANGED = "changed"
CHANGED_WORDS = "changed_words"
SUCCEEDED = "succeeded"
ALL = "all"
DIFF = "diff"
EXPECTED = "expected"
ACTUAL = "actual"
COSINE_SCORE = "cosine_score"
CLASS = "class"
def spoil_sentence(sentence, lemmas, tags, lang, classify_fun, similarity): os.environ["TOKENIZERS_PARALLELISM"] = "false"
DEFAULT_RES = {"spoiled": {"attacks_summary": {"succeeded": 0, "all": 1}, "attacks_succeeded": []}}
def spoil_sentence(sentence, lemmas, tags, lang, similarity, max_sub):
attack = TextFooler(lang) attack = TextFooler(lang)
return attack.process(sentence, lemmas, tags, classify_fun, similarity) # attack = BaseLine(lang, 0.5, 0.4, 0.3)
return attack.spoil(sentence, [], lemmas, tags, similarity, max_sub)
@click.command() @click.command()
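Editor's note: the new spoil_sentence builds a fresh TextFooler per sentence and delegates to its spoil method; the commented-out BaseLine line suggests a swappable attack backend. A hypothetical standalone call, with placeholder inputs rather than project data (the signature comes from the definition above; lang="en" is an assumption for enron_spam):

# Placeholder inputs for illustration only; real values come from the
# preprocessed dataset's text, lemmas and tags columns.
sentence = "Congratulations, you have won a free prize"
lemmas = ["congratulations", "you", "have", "win", "a", "free", "prize"]
tags = ["NN", "PRP", "VBP", "VBN", "DT", "JJ", "NN"]

changed = spoil_sentence(sentence, lemmas, tags,
                         lang="en", similarity=0.95, max_sub=1)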
@@ -33,23 +51,62 @@ def main(dataset_name: str):
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, 'test.jsonl')
     classify = get_classify_function(
-        dataset_name=dataset_name,
+        dataset_name=dataset_name
     )
     dataset_df = pd.read_json(input_file, lines=True)
-    spoiled = []
-    similarity = 0.95
+    # dataset_df = dataset_df[:10]
+    spoiled, results = [], []
+    similarity, max_sub = 0.95, 1
     cpus = cpu_count()
-    with Pool(processes=cpus) as pool:
-        results = []
-        for idx in tqdm(range(0, len(dataset_df), cpus)):
-            end = min(idx+cpus, len(dataset_df) + 1)
-            for sentence, lemmas, tags in dataset_df[[TEXT, LEMMAS, TAGS], idx:end]:
-                results.append(pool.apply_async(spoil_sentence, args=[sentence, lemmas,
-                                                                      tags, lang, classify, similarity]))
-            for res in results:
-                spoiled_sent = res.get()
-                spoiled.append(spoiled_sent)
-            results = []
+    classes = classify(dataset_df[TEXT].tolist())
+    # used_id = 0
+    # sent_nbr = len(dataset_df[TEXT])
+    # with Pool(processes=cpus) as pool:
+    #     for idx in range(0, min(cpus, sent_nbr)):
+    #         sentence, lemmas, tags = dataset_df[TEXT][idx], \
+    #                                  dataset_df[LEMMAS][idx], \
+    #                                  dataset_df[TAGS][idx]
+    #
+    #         results.append(pool.apply_async(spoil_sentence, args=[sentence,
+    #                                                               lemmas,
+    #                                                               tags,
+    #                                                               lang,
+    #                                                               similarity,
+    #                                                               max_sub]))
+    #         used_id = idx
+    #     count = len(results)
+    #     while count and used_id < sent_nbr:
+    #         ready = 0
+    #         to_rm = []
+    #         for r in results:
+    #             if r.ready():
+    #                 ready += 1
+    #                 changed_sent = r.get()
+    #                 if changed_sent:
+    #                     spoiled.append(process(changed_sent, classes[i], classify))
+    #                 to_rm.append(r)
+    #         count = len(results) - ready
+    #         results = [res for res in results if res not in to_rm]
+    #         h_bound = min(used_id + cpus - len(results), sent_nbr)
+    #         for i in range(used_id + 1, h_bound):
+    #             used_id += 1
+    #             sentence, lemmas, tags = dataset_df[TEXT][idx], \
+    #                                      dataset_df[LEMMAS][idx], \
+    #                                      dataset_df[TAGS][idx]
+    #
+    #             results.append(pool.apply_async(spoil_sentence, args=[sentence,
+    #                                                                   lemmas,
+    #                                                                   tags,
+    #                                                                   lang,
+    #                                                                   similarity,
+    #                                                                   max_sub]))
+    for i, cols in tqdm(dataset_df[[TEXT, LEMMAS, TAGS]].iterrows(), total=len(dataset_df)):
+        sentence, lemmas, tags = cols[0], cols[1], cols[2]
+        changed_sent = spoil_sentence(sentence, lemmas, tags, lang, similarity, max_sub)
+        if changed_sent:
+            spoiled.append(process(changed_sent, classes[i], classify))
     with open(output_path, mode="wt") as fd:
         fd.write(pd.DataFrame(
...
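Editor's note on the hunk above: the large commented-out block schedules spoil_sentence across a Pool and polls ready() by hand (and it reads dataset_df[...][idx] where i was presumably meant inside the refill loop, and uses an undefined i in classes[i]); the merged code falls back to a sequential loop instead. If the parallel path is revived, Pool.starmap gives the same fan-out with far less bookkeeping. A sketch meant to sit inside main() and reuse its locals (cpus, classes, classify, spoiled, lang, similarity, max_sub), not a tested replacement:

# Replaces the sequential loop inside main(). spoil_sentence is a
# module-level function, so it pickles cleanly for worker processes.
args = [(sentence, lemmas, tags, lang, similarity, max_sub)
        for sentence, lemmas, tags
        in zip(dataset_df[TEXT], dataset_df[LEMMAS], dataset_df[TAGS])]

with Pool(processes=cpus) as pool:
    # starmap preserves input order, so classes[i] still matches row i.
    for i, changed_sent in enumerate(pool.starmap(spoil_sentence, args)):
        if changed_sent:
            spoiled.append(process(changed_sent, classes[i], classify))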