Text Attacks · Commit 0c324297
Authored 2 years ago by pwalkow

Change script

Parent: 786bc9f2
Showing 1 changed file with 27 additions and 19 deletions: experiments/scripts/tag_dataset.py
@@ -40,28 +40,18 @@ def tag_sentence(connection: Connection, sentence: str, lang: str):
     return lemmas, tags


-@click.command()
-@click.option(
-    "--dataset_name",
-    help="Dataset name",
-    type=str,
-)
-def main(dataset_name: str):
-    """Downloads the dataset to the output directory."""
-    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
-    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
-    test_with_tags = pd.DataFrame(test)
-    conn = Connection(config_file="experiments/configs/config.yml")
+def process_file(dataset_df, connection, lang, output_path):
+    test_with_tags = pd.DataFrame(dataset_df)
     lemmas_col, tags_col = [], []
     cpus = cpu_count()
     with Pool(processes=cpus) as pool:
         results = []
-        for idx in tqdm(range(0, len(test), cpus)):
-            end = min(idx + cpus, len(test) + 1)
-            for sentence in test[TEXT][idx:end]:
-                results.append(pool.apply_async(tag_sentence, args=[conn, sentence, lang]))
+        for idx in tqdm(range(0, len(dataset_df), cpus)):
+            end = min(idx + cpus, len(dataset_df) + 1)
+            for sentence in dataset_df[TEXT][idx:end]:
+                results.append(pool.apply_async(tag_sentence, args=(connection, sentence, lang,)))
         for res in results:
             lemmas, tags = res.get()
             lemmas_col.append(lemmas)
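The new process_file helper batches sentences through a multiprocessing Pool and collects the (lemmas, tags) pairs in submission order. Below is a minimal, self-contained sketch of that batching pattern; fake_tag and the None placeholder for the Connection object are stand-ins invented here for illustration and are not part of the repository.

# Sketch of the batched apply_async pattern used by process_file above.
# fake_tag stands in for tag_sentence; None stands in for the Connection object.
from multiprocessing import Pool, cpu_count

def fake_tag(connection, sentence, lang):
    # Pretend tagger: one lowercased "lemma" and one dummy tag per token.
    tokens = sentence.split()
    return [t.lower() for t in tokens], [f"{lang}:tag"] * len(tokens)

if __name__ == "__main__":
    sentences = ["A first example", "a second one", "and a third"]
    cpus = cpu_count()
    lemmas_col, tags_col = [], []
    with Pool(processes=cpus) as pool:
        results = []
        # Dispatch work in batches of `cpus` sentences, as the script does.
        for idx in range(0, len(sentences), cpus):
            end = min(idx + cpus, len(sentences))
            for sentence in sentences[idx:end]:
                results.append(pool.apply_async(fake_tag, args=(None, sentence, "en")))
        # AsyncResult.get() blocks until each task finishes; iterating the
        # results list in order keeps output aligned with the input sentences.
        for res in results:
            lemmas, tags = res.get()
            lemmas_col.append(lemmas)
            tags_col.append(tags)
    print(lemmas_col)
    print(tags_col)

Because apply_async already queues the tasks asynchronously, the outer batching loop mainly throttles submission; results still come back in the order they were appended.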
@@ -70,10 +60,28 @@ def main(dataset_name: str):
     test_with_tags[LEMMAS] = lemmas_col
     test_with_tags[TAGS] = tags_col
+    with open(output_path, mode="wt") as fd:
+        fd.write(test_with_tags.to_json(orient='records', lines=True))
+
+
+@click.command()
+@click.option(
+    "--dataset_name",
+    help="Dataset name",
+    type=str,
+)
+def main(dataset_name: str):
+    """Downloads the dataset to the output directory."""
+    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    conn = Connection(config_file="experiments/configs/config.yml")
     output_dir = f"data/preprocessed/{dataset_name}"
     os.makedirs(output_dir, exist_ok=True)
-    with open(f"{output_dir}/test.jsonl", mode="wt") as fd:
-        fd.write(test_with_tags.to_json(orient='records', lines=True))
+    input_dir = f"data/datasets/{dataset_name}"
+    for file in os.listdir(input_dir):
+        if os.path.isfile(os.path.join(input_dir, file)):
+            process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
+                         conn, lang, os.path.join(output_dir, file))


 if __name__ == "__main__":
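The script is still driven by the same --dataset_name option as before, e.g. python experiments/scripts/tag_dataset.py --dataset_name enron_spam, but main now tags every file found under data/datasets/<dataset_name>/ and writes a tagged copy with the same file name to data/preprocessed/<dataset_name>/, instead of only test.jsonl. A hypothetical sanity check of one output file (the exact path and the added column names depend on the TEXT/LEMMAS/TAGS constants and are assumptions here) could look like:

# Hypothetical check of one tagged output file; path and column names assumed.
import pandas as pd

df = pd.read_json("data/preprocessed/enron_spam/test.jsonl", lines=True)
print(df.columns.tolist())  # expect the original columns plus lemma/tag columns
print(len(df))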