Text Attacks

Commit 2c2ffad8 authored 1 year ago by Paweł Walkowiak

Save experiments

parent 4c04019c
Showing 2 changed files with 71 additions and 30 deletions:

  dvc.lock                        +24 −4
  experiments/scripts/attack.py   +47 −26
dvc.lock +24 −4
@@ -433,12 +433,12 @@ stages:
       size: 501711136
       nfiles: 7
     - path: experiments/scripts/attack.py
-      md5: 87f54ee4e2a08f1259d9d8b2d01fe1b9
-      size: 12061
+      md5: fa754531f756242413103dd4a039ecbb
+      size: 10650
     outs:
     - path: data/results/attack_xai/wiki_pl/
-      md5: e24c456f63d8e13b92fcab51e0726141.dir
-      size: 8287334
+      md5: ff52c5a1f070d3b935437f149ba0ef1f.dir
+      size: 387376283
       nfiles: 2
   attack_xai_local@wiki_pl:
     cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name wiki_pl
@@ -856,3 +856,23 @@ stages:
       md5: 134ee8022b841597f6a14796bdbbcf30.dir
       size: 142837300
       nfiles: 2
+  attack_xai_char_discard@wiki_pl:
+    cmd: PYTHONPATH=. python experiments/scripts/attack.py --dataset_name wiki_pl
+      --attack_type attack_xai_char_discard
+    deps:
+    - path: data/classification/wiki_pl
+      md5: 88c3cea96b2cb3ddda1a82037bf6130a.dir
+      size: 44196727
+      nfiles: 2
+    - path: data/models/wiki_pl
+      md5: fd453042628fb09c080ef05d34a32cce.dir
+      size: 501711136
+      nfiles: 7
+    - path: experiments/scripts/attack.py
+      md5: 6a16ddc830a8ba50d01412600a19a4ea
+      size: 11037
+    outs:
+    - path: data/results/attack_xai_char_discard/wiki_pl/
+      md5: db1b512415d278115f76a74112f31c53.dir
+      size: 57649801
+      nfiles: 2
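The new stage entry follows the same layout as the existing ones: each dep and out records a path together with the md5 and size DVC captured when the stage ran. A minimal sketch for inspecting it, assuming PyYAML is installed and the script runs from the repository root:

import yaml  # PyYAML, assumed available in the project environment

# Load the lockfile and pull out the stage added in this commit.
with open("dvc.lock") as fh:
    lock = yaml.safe_load(fh)

stage = lock["stages"]["attack_xai_char_discard@wiki_pl"]
print(stage["cmd"])

# Each dependency/output records the path plus the md5 and size captured at run time.
for section in ("deps", "outs"):
    for entry in stage.get(section, []):
        print(f"{section}: {entry['path']} md5={entry['md5']} size={entry['size']}")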
experiments/scripts/attack.py +47 −26
@@ -15,6 +15,7 @@ from multiprocessing import Process
 from multiprocessing import Queue, Manager
 from threading import Thread
 from sklearn.metrics import classification_report, confusion_matrix
+from string import punctuation

 TEXT = "text"
@@ -36,7 +37,7 @@ EXPECTED = "expected"
 ACTUAL = "actual"
 COSINE_SCORE = "cosine_score"
 CLASS = "class"
-QUEUE_SIZE = 1000
+QUEUE_SIZE = 60
 FEATURES = "features"
 IMPORTANCE = "importance"
 SYNONYM = "synonym"
@@ -56,6 +57,11 @@ DEFAULT_RES = {
 }


+def join_punct(words):
+    punc = set(punctuation)
+    return "".join(w if set(w) <= punc else " " + w for w in words).lstrip()
+
+
 def data_producer(queue_out, dataset_df):
     for i, cols in tqdm(dataset_df[[TEXT, ID, LEMMAS, TAGS, ORTHS, PRED]].iterrows(), total=len(dataset_df)
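The new join_punct helper detokenizes a word list, gluing punctuation-only tokens directly onto the preceding word. A quick illustration with a hypothetical token list (the function body is copied from the hunk above):

from string import punctuation


def join_punct(words):
    punc = set(punctuation)
    return "".join(w if set(w) <= punc else " " + w for w in words).lstrip()


# Hypothetical tokens: the comma and question mark attach without a leading space.
tokens = ["Ala", "ma", "kota", ",", "prawda", "?"]
print(join_punct(tokens))  # -> "Ala ma kota, prawda?"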
@@ -69,7 +75,6 @@ def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, queues_k
     processed_nbr, start = 0, time()
     item = 1
     test_y, pred_y = [], []
-    spoiled_sents = []
     ch_suc, ch_all, synonyms_nbr = 0, 0, 0
     samples, samples_succ = 0, 0
     count_tokens, sum_tokens = 0, 0
@@ -79,7 +84,7 @@ def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, queues_k
         item = queue_in.get()
         if item is not None:
             processed_nbr += 1
-            spoiled, class_test, class_pred, synonym_nbr = item
+            spoiled, class_test, class_pred, synonym_nbr = process(*item)
             test_y.append(class_test)
             pred_y.append(class_pred)
             queue_log.put(f"Processed and saved {processed_nbr} in {time() - start} s")
@@ -92,7 +97,9 @@ def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, queues_k
             ch_suc += spoiled[ATTACK_SUMMARY][SUCCEEDED]
             ch_all += spoiled[ATTACK_SUMMARY][ALL]
             synonyms_nbr += synonym_nbr
-            spoiled_sents.append(spoiled)
+            with open(output_file, 'at') as fd:
+                fd.write(pd.DataFrame([spoiled]).to_json(orient="records", lines=True))
+            spoiled = None
         if processed_nbr == cases_nbr:
             for que_kill in queues_kill:
                 [que_kill.put(None) for _ in range(to_kill_nbr)]
@@ -102,9 +109,6 @@ def data_saver(queue_in, queue_log, output_file, output_dir, cases_nbr, queues_k
         if sum([q.qsize() for q in queues_kill]) == 0 and (time() - end_time) > 3600:
             for que_kill in queues_kill:
                 [que_kill.put(None) for _ in range(to_kill_nbr)]
-    with open(output_file, 'wt') as fd:
-        fd.write(pd.DataFrame(spoiled_sents).to_json(orient="records", lines=True))
-
     metrics = {"confusion_matrix": confusion_matrix(test_y, pred_y).tolist(),
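Taken together, these data_saver hunks switch from buffering every result in spoiled_sents and writing them once at the end ('wt') to appending each result to the JSON-lines file as soon as it arrives ('at'), so partial results are already on disk if a run is interrupted. A minimal sketch of the append pattern with hypothetical records:

import pandas as pd

# Hypothetical per-sentence results standing in for the attack output.
records = [{"id": 1, "succeeded": 2}, {"id": 2, "succeeded": 0}]
output_file = "results.jsonl"  # illustrative path only

for spoiled in records:
    # Append one record per iteration instead of dumping everything at the end.
    with open(output_file, "at") as fd:
        fd.write(pd.DataFrame([spoiled]).to_json(orient="records", lines=True))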
@@ -141,11 +145,17 @@ def classify_queue(queue_in, queue_out, queue_log, dataset_name, cuda_device):
         item = queue_in.get()
         queue_log.put("Classify got from queue")
         if item is not None:
-            sent_id, org_sentence, y_pred, changed_sents, synonyms_nbr = item
-            sentences = [sent[TEXT] for sent in changed_sents]
-            queue_log.put(f"Classifying sentences {len(sentences)}, id {sent_id}")
+            sent_id, org_sentence, y_pred, changed, synonyms_nbr, sent_words = item
+            sentences = []
+            for subst, _ in changed:
+                sent_words_copy = [*sent_words]
+                for idx, word_change in subst.items():
+                    sent_words_copy[idx] = word_change['word']
+                sentences.append(join_punct(sent_words_copy))
+            queue_log.put(f"Classifying sentences {synonyms_nbr}, id {sent_id}")
             classified = classify_fun(sentences) if sentences else []
-            queue_out.put((sent_id, org_sentence, changed_sents, y_pred, classified, synonyms_nbr))
+            queue_out.put((sent_id, org_sentence, changed, y_pred, classified, synonyms_nbr, sent_words))
             queue_log.put(f"Classified sentences {sent_id}")
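classify_queue now receives the original token list (sent_words) plus, for each candidate, a substitution map from token index to replacement, and rebuilds the candidate sentence itself instead of getting ready-made texts. A small illustration with hypothetical data:

# Hypothetical tokens and one substitution map ({token index: {'word': replacement}}),
# mirroring the structure unpacked in the hunk above.
sent_words = ["To", "jest", "bardzo", "dobry", "film", "."]
subst = {2: {"word": "wyjątkowo"}, 3: {"word": "udany"}}

sent_words_copy = [*sent_words]
for idx, word_change in subst.items():
    sent_words_copy[idx] = word_change["word"]

# join_punct (added earlier in this diff) then glues the tokens back together
# into "To jest wyjątkowo udany film."
print(sent_words_copy)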
@@ -214,10 +224,10 @@ def main(dataset_name: str, attack_type: str):
     if "attack_xai" in attack_type:
         importance = load_xai_importance(f"data/explanations/{dataset_name}")
         xai_global, xai_local = importance[0], importance[1]
-    xai_sub = 5
-    max_sub = 8
+    xai_sub = 0.15
+    max_sub = 3
     char_delete_size = 0.4
-    similarity_bound = 0.7
+    similarity_bound = 0.3
     params = {
         "attack_textfooler": [lang, SYNONYM],
@@ -232,34 +242,45 @@ def main(dataset_name: str, attack_type: str):
     }[attack_type]
     output_dir = f"data/results/{attack_type}/{dataset_name}/"
     input_file = f"data/classification/{dataset_name}/test.jsonl"
     os.makedirs(output_dir, exist_ok=True)
     output_path = os.path.join(output_dir, "test.jsonl")
     dataset_df = pd.read_json(input_file, lines=True)
+    test_sent_ids = ["Komputery_199721.txt",
+                     "Zydzi_976178.txt",
+                     "Kotowate_2015873.txt",
+                     "Zydzi_1602490.txt",
+                     "Pilka-nozna_2899267.txt",
+                     "Optyka_1926807.txt",
+                     "Zydzi_929483.txt",
+                     "Niemieccy-wojskowi_2410107.txt"]
+    # dataset_df = dataset_df[dataset_df['id'].isin(test_sent_ids)]
+    # dataset_df = dataset_df.reset_index(drop=True)
+    dataset_df = dataset_df[:20]
     m = Manager()
-    queues = [m.Queue(maxsize=QUEUE_SIZE) for _ in range(6)]
-    log_que = Thread(target=log_queues, args=(queues[:5],))
+    queues = [m.Queue(maxsize=QUEUE_SIZE) for _ in range(5)]
+    log_que = Thread(target=log_queues, args=(queues[:4],))
     log_que.daemon = True
     log_que.start()
-    info_que = Thread(target=log_info_queue, args=(queues[5],))
+    info_que = Thread(target=log_info_queue, args=(queues[4],))
     info_que.daemon = True
     info_que.start()
-    processes_nbr = 30
-    sim = Similarity(queues[5], similarity_bound, "distiluse-base-multilingual-cased-v1")
+    processes_nbr = 15
+    sim = Similarity(queues[4], similarity_bound, "distiluse-base-multilingual-cased-v1")
     processes = [Process(target=data_producer, args=(queues[0], dataset_df,))]  # loading data file_in -> 0
-    processes.extend([Process(target=spoil_queue, args=(queues[0], queues[1], queues[5], max_sub, attack_type, params))
+    processes.extend([Process(target=spoil_queue, args=(queues[0], queues[1], queues[4], max_sub, attack_type, params))
                       for _ in range(processes_nbr)])  # spoiling 0 -> 1
-    processes.extend([Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)),
-                      Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[5], sim)),  # cosim 1 -> 2
-                      Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, "3")),
-                      Process(target=classify_queue, args=(queues[2], queues[3], queues[5], dataset_name, "3")),
+    processes.extend([Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[4], sim)),
+                      Process(target=filter_similarity_queue, args=(queues[1], queues[2], queues[4], sim)),  # cosim 1 -> 2
+                      Process(target=classify_queue, args=(queues[2], queues[3], queues[4], dataset_name, "3")),
+                      Process(target=classify_queue, args=(queues[2], queues[3], queues[4], dataset_name, "3")),
                       # classify changed 2 -> 3
-                      Process(target=run_queue, args=(queues[3], queues[4], queues[5], process,)),  # process 3 -> 4
-                      Process(target=data_saver, args=(queues[4], queues[5], output_path, output_dir, len(dataset_df), queues, processes_nbr + 6))  # saving 4 -> file_out
+                      # Process(target=run_queue, args=(queues[3], queues[4], queues[5], process,)), # process 3 -> 4
+                      Process(target=data_saver, args=(queues[3], queues[4], output_path, output_dir, len(dataset_df), queues, processes_nbr + 6))  # saving 3 -> file_out
                       ])
     [p.start() for p in processes]
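With the run_queue stage commented out and data_saver now calling process(*item) directly (see the earlier data_saver hunk), the pipeline needs five queues instead of six: data_producer feeds queue 0, spoil_queue hands 0 to 1, filter_similarity_queue 1 to 2, classify_queue 2 to 3, data_saver drains 3, and queue 4 carries log messages. A minimal sketch of the same hand-off pattern with stub workers (all names and payloads here are hypothetical, not the project's real stages):

from multiprocessing import Manager, Process


def producer(queue_out):
    # Stand-in for data_producer: push a few items, then a sentinel.
    for i in range(3):
        queue_out.put(i)
    queue_out.put(None)


def worker(queue_in, queue_out, queue_log, label):
    # Stand-in for spoil_queue / filter_similarity_queue / classify_queue:
    # forward items downstream and log what was handled.
    while True:
        item = queue_in.get()
        if item is None:
            queue_out.put(None)
            break
        queue_log.put(f"{label} handled {item}")
        queue_out.put(item)


def saver(queue_in, queue_log):
    # Stand-in for data_saver: consume until the sentinel arrives.
    while True:
        item = queue_in.get()
        if item is None:
            break
        queue_log.put(f"saved {item}")


if __name__ == "__main__":
    m = Manager()
    queues = [m.Queue() for _ in range(5)]  # 0-3 carry data, 4 carries logs
    stages = [
        Process(target=producer, args=(queues[0],)),
        Process(target=worker, args=(queues[0], queues[1], queues[4], "spoil")),
        Process(target=worker, args=(queues[1], queues[2], queues[4], "cosim")),
        Process(target=worker, args=(queues[2], queues[3], queues[4], "classify")),
        Process(target=saver, args=(queues[3], queues[4])),
    ]
    [p.start() for p in stages]
    [p.join() for p in stages]
    while not queues[4].empty():
        print(queues[4].get())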