Commit 2dc58c39 authored by Mateusz Gniewkowski

Merge branch '20_news_wiki_pl' into 'master'

20 news wiki pl

See merge request adversarial-attacks/text-attacks!3
parents 68879d57 9534bee8
Showing with 276 additions and 52 deletions
```
pip install morfeusz2
pip install -r requirements.txt
pip install --no-deps git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop
```
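These commands install the Morfeusz 2 morphological analyser, the project requirements, and the TextFooling package from the CLARIN-PL GitLab (SSH access to gitlab.clarin-pl.eu is assumed). With the environment in place, the DVC stages changed below can presumably be reproduced with `dvc repro`.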
/enron_spam
/wiki_pl
/20_news
/enron_spam
/20_news
/poleval
/wiki_pl
outs:
- md5: 999207f1c2c123c9943397b47f2c3b3a.dir
size: 23460358
nfiles: 3
path: 20_news
outs:
- md5: abcbccb3e352ed623cace1b95078bd63.dir
size: 29115538
nfiles: 3
path: wiki_pl
/enron_spam
/20_news
/wiki_pl
outs:
- md5: 43d68a67ecb8149bd6bf50db9767cb64.dir
size: 439008808
nfiles: 6
path: 20_news
outs:
- md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
path: wiki_pl
/poleval
/enron_spam
/wiki_pl
/20_news
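The fragments above appear to be the updated .gitignore entries and DVC pointer files for the newly tracked 20_news and wiki_pl dataset and model directories; their MD5 directory hashes match the dependency hashes recorded in dvc.lock below.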
@@ -41,8 +41,8 @@ stages:
size: 61709260
nfiles: 3
- path: experiments/scripts/classify.py
md5: ba9284c90847fbbd0f2a6cca414d9636
size: 1106
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
outs:
- path: data/classification/enron_spam
md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir
@@ -88,8 +88,8 @@ stages:
size: 1688836
nfiles: 3
- path: experiments/scripts/tag_dataset.py
md5: 2c4e097b3a278c12d19858f988232b44
size: 3435
md5: 8e8039b73b8ea3ce39287ed0a304af9b
size: 3750
outs:
- path: data/preprocessed/poleval/
md5: 854387459b193c5eba6db1273ca5ad23.dir
@@ -103,10 +103,80 @@ stages:
size: 53096069
nfiles: 3
- path: experiments/scripts/tag_dataset.py
md5: 2c4e097b3a278c12d19858f988232b44
size: 3435
md5: 8e8039b73b8ea3ce39287ed0a304af9b
size: 3750
outs:
- path: data/preprocessed/enron_spam/
md5: b75efba1a62182dc8ac32acd1faf92ed.dir
size: 61709260
nfiles: 3
preprocess_dataset@wiki_pl:
cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name wiki_pl
deps:
- path: data/datasets/wiki_pl/
md5: abcbccb3e352ed623cace1b95078bd63.dir
size: 29115538
nfiles: 3
- path: experiments/scripts/tag_dataset.py
md5: 8e8039b73b8ea3ce39287ed0a304af9b
size: 3750
outs:
- path: data/preprocessed/wiki_pl/
md5: 3e9b2e1e0542777e0a751d9d7f7f4241.dir
size: 55380570
nfiles: 3
classify@wiki_pl:
cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name wiki_pl
--output_dir data/classification/wiki_pl
deps:
- path: data/models/wiki_pl/
md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
- path: data/preprocessed/wiki_pl/
md5: 3e9b2e1e0542777e0a751d9d7f7f4241.dir
size: 55380570
nfiles: 3
- path: experiments/scripts/classify.py
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
outs:
- path: data/classification/wiki_pl
md5: 515330772505f489b55686545bcf23a0.dir
size: 34103198
nfiles: 2
preprocess_dataset@20_news:
cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name 20_news
deps:
- path: data/datasets/20_news/
md5: 999207f1c2c123c9943397b47f2c3b3a.dir
size: 23460358
nfiles: 3
- path: experiments/scripts/tag_dataset.py
md5: 8e8039b73b8ea3ce39287ed0a304af9b
size: 3750
outs:
- path: data/preprocessed/20_news/
md5: 1ed5ef2dabe4bc05f7377175ed11137b.dir
size: 46845669
nfiles: 3
classify@20_news:
cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name 20_news
--output_dir data/classification/20_news
deps:
- path: data/models/20_news/
md5: 43d68a67ecb8149bd6bf50db9767cb64.dir
size: 439008808
nfiles: 6
- path: data/preprocessed/20_news/
md5: 1ed5ef2dabe4bc05f7377175ed11137b.dir
size: 46845669
nfiles: 3
- path: experiments/scripts/classify.py
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
outs:
- path: data/classification/20_news
md5: 6831f104f7c20541548fe72250c45706.dir
size: 31286120
nfiles: 2
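Each new dvc.lock entry above records, for the added preprocess_dataset and classify stages of wiki_pl and 20_news, the command that was run together with the MD5 checksum, size, and file count of every dependency and output; the `.dir` suffix marks a hash computed over an entire directory.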
@@ -17,6 +17,8 @@ stages:
foreach:
- enron_spam
- poleval
- 20_news
- wiki_pl
do:
wdir: .
cmd: >-
@@ -30,7 +32,6 @@ stages:
get_model:
foreach:
- enron_spam
# - poleval
do:
wdir: .
cmd: >-
@@ -45,7 +46,8 @@ stages:
classify:
foreach:
- enron_spam
#- poleval
- 20_news
- wiki_pl
do:
wdir: .
cmd: >-
@@ -61,7 +63,6 @@ stages:
explain:
foreach:
- enron_spam
#- poleval
do:
wdir: .
cmd: >-
......
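In dvc.yaml, every item in a `foreach` list is expanded against the `do` template, which is what produces the per-dataset stage names (e.g. `preprocess_dataset@wiki_pl`, `classify@20_news`) recorded in dvc.lock above; the stage `cmd` bodies themselves are elided in this diff.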
@@ -3,6 +3,7 @@ from pathlib import Path
import click
import pandas as pd
import torch
from sklearn.metrics import classification_report
from text_attacks.utils import get_classify_function
@@ -27,6 +28,7 @@ def main(
output_dir.mkdir(parents=True, exist_ok=True)
classify = get_classify_function(
dataset_name=dataset_name,
device="cuda" if torch.cuda.is_available() else "cpu"
)
test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
test_x = test["text"].tolist()
......
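The remainder of classify.py is elided; purely for orientation, the pieces shown above would typically be combined along these lines (a sketch, not the elided code; the `label` column name is a guess, not taken from the diff):

```
pred_y = classify(test_x)  # batched predictions from the classify function built above
test_y = test["label"].tolist()  # hypothetical ground-truth column, not shown in the diff
print(classification_report(test_y, pred_y))  # uses the newly added sklearn import
```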
@@ -7,53 +7,60 @@ import os
from tqdm import tqdm
from multiprocessing import cpu_count, Pool
TOKENS = 'tokens'
ORTH = 'orth'
LEXEMES = 'lexemes'
LEMMA = 'lemma'
MSTAG = 'mstag'
TEXT = 'text'
LEMMAS = 'lemmas'
TAGS = 'tags'
TOKENS = "tokens"
ORTH = "orth"
LEXEMES = "lexemes"
LEMMA = "lemma"
MSTAG = "mstag"
TEXT = "text"
LEMMAS = "lemmas"
TAGS = "tags"
def tag_sentence(sentence: str, lang: str):
connection = Connection(config_file="experiments/configs/config.yml")
lpmn = ["morphodita",
{"posconverter":
{"input_format": "ccl", "output_format": "json"}}] \
if lang == 'pl' else [{"spacy": {"lang": "en"}}]
lpmn = [{"spacy": {"lang": "en"}}]
if lang == "pl":
lpmn = [
"morphodita",
{"posconverter": {"input_format": "ccl", "output_format": "json"}},
]
task = Task(lpmn, connection=connection)
output_file_id = task.run(str(sentence), IOType.TEXT)
tokens = []
try:
clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
clarin_json = json.loads(
download(connection, output_file_id, IOType.TEXT).decode("utf-8")
)
tokens = clarin_json[TOKENS]
except json.decoder.JSONDecodeError:
downloaded = download(connection, output_file_id, IOType.FILE)
with open(downloaded, 'r') as file:
with open(downloaded, "r") as file:
lines = [json.loads(line) for line in file.readlines()]
for line in lines:
tokens.extend(line[TOKENS])
os.remove(downloaded)
lemmas, tags = [], []
for token in tokens:
lexeme = token['lexemes'][0]
lemmas.append(lexeme['lemma'])
tags.append(lexeme['mstag'])
lexeme = token["lexemes"][0]
lemmas.append(lexeme["lemma"])
tags.append(lexeme["mstag"])
return lemmas, tags
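For illustration, the refactored `tag_sentence` could be exercised like this (a sketch; it assumes valid CLARIN-PL credentials in `experiments/configs/config.yml` and network access to the lpmn service):

```
# Hypothetical usage of tag_sentence defined above; output depends on the remote tagger.
lemmas, tags = tag_sentence("Ala ma kota.", lang="pl")
# e.g. lemmas -> ["Ala", "mieć", "kot", "."], with tags holding the matching mstag values
```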
def process_file(dataset_df, lang, output_path):
test_with_tags = pd.DataFrame(dataset_df)
lemmas_col, tags_col = [], []
cpus = cpu_count()
cpus = 8
with Pool(processes=cpus) as pool:
results = []
for idx in tqdm(range(0, len(dataset_df), cpus)):
end = min(idx+cpus, len(dataset_df) + 1)
end = min(idx + cpus, len(dataset_df) + 1)
for sentence in dataset_df[TEXT][idx:end]:
results.append(pool.apply_async(tag_sentence, args=[sentence, lang]))
results.append(
pool.apply_async(tag_sentence, args=[sentence, lang])
)
for res in results:
lemmas, tags = res.get()
lemmas_col.append(lemmas)
@@ -63,7 +70,7 @@ def process_file(dataset_df, lang, output_path):
test_with_tags[TAGS] = tags_col
with open(output_path, mode="wt") as fd:
fd.write(test_with_tags.to_json(orient='records', lines=True))
fd.write(test_with_tags.to_json(orient="records", lines=True))
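Note that the worker count is now pinned to `cpus = 8` instead of `cpu_count()`, presumably to cap the number of concurrent requests sent to the remote tagging service rather than to match local hardware.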
@click.command()
@@ -74,7 +81,12 @@ def process_file(dataset_df, lang, output_path):
)
def main(dataset_name: str):
"""Downloads the dataset to the output directory."""
lang = 'en' if dataset_name == 'enron_spam' else 'pl'
lang = {
"enron_spam": "en",
"poleval": "pl",
"20_news": "en",
"wiki_pl": "pl",
}[dataset_name]
output_dir = f"data/preprocessed/{dataset_name}"
os.makedirs(output_dir, exist_ok=True)
@@ -82,15 +94,24 @@ def main(dataset_name: str):
for file in os.listdir(input_dir):
if os.path.isfile(os.path.join(input_dir, file)):
if file == "test.jsonl":
process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
lang, os.path.join(output_dir, file))
process_file(
pd.read_json(os.path.join(input_dir, file), lines=True),
lang,
os.path.join(output_dir, file),
)
else:
test_with_tags = pd.DataFrame(pd.read_json(os.path.join(input_dir, file), lines=True))
test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))]
test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))]
test_with_tags = pd.DataFrame(
pd.read_json(os.path.join(input_dir, file), lines=True)
)
test_with_tags[LEMMAS] = [
"" for _ in range(len(test_with_tags))
]
test_with_tags[TAGS] = ["" for _ in range(len(test_with_tags))]
with open(os.path.join(output_dir, file), mode="wt") as fd:
fd.write(test_with_tags.to_json(orient='records', lines=True))
fd.write(
test_with_tags.to_json(orient="records", lines=True)
)
if __name__ == "__main__":
main()
\ No newline at end of file
main()
@@ -4,11 +4,14 @@ click
scikit-learn
dvc[s3]
shap
lpmn_client_biz
tqdm
transformers
tokenizers
sentence-transformers
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.12.0+cu116
--index-url https://pypi.clarin-pl.eu/simple/
plwn-api
git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop
lpmn_client_biz
"""Classification model for enron_spam"""
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
model_path = "./data/models/20_news"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
return model, tokenizer
def get_classify_function(device="cpu"):
model, tokenizer = get_model_and_tokenizer()
model.eval()
model = model.to(device)
def fun(texts):
logits = list()
for chunk in tqdm(
[texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
):
encoded_inputs = tokenizer(
chunk,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(device)
with torch.no_grad():
logits.append(model(**encoded_inputs).logits.cpu())
logits = torch.cat(logits, dim=0)
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
return fun
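The new classify function batches texts in chunks of 256, truncates each input to 512 tokens, runs inference under `torch.no_grad()`, and moves logits back to the CPU after every chunk, so GPU memory stays bounded even for large test sets.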
@@ -2,12 +2,13 @@
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
model_path = "data/models/endron_spam"
model_path = "./data/models/endron_spam"
if not os.path.exists(model_path):
model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -16,18 +17,27 @@ def get_model_and_tokenizer():
return model, tokenizer
def get_classify_function():
def get_classify_function(device="cpu"):
model, tokenizer = get_model_and_tokenizer()
model.eval()
model = model.to(device)
def fun(texts):
encoded_inputs = tokenizer(
texts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
logits = model(**encoded_inputs).logits
logits = list()
for chunk in tqdm(
[texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
):
encoded_inputs = tokenizer(
chunk,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(device)
with torch.no_grad():
logits.append(model(**encoded_inputs).logits.cpu())
logits = torch.cat(logits, dim=0)
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
......
"""Classification model for enron_spam"""
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
model_path = "./data/models/wiki_pl"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
return model, tokenizer
def get_classify_function(device="cpu"):
model, tokenizer = get_model_and_tokenizer()
model.eval()
model = model.to(device)
def fun(texts):
logits = list()
for chunk in tqdm(
[texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
):
encoded_inputs = tokenizer(
chunk,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(device)
with torch.no_grad():
logits.append(model(**encoded_inputs).logits.cpu())
logits = torch.cat(logits, dim=0)
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
return fun
@@ -11,13 +11,13 @@ def get_model_and_tokenizer(dataset_name):
return fun()
def get_classify_function(dataset_name):
def get_classify_function(dataset_name, device="cpu"):
"""Return get_model_and_tokenizer for a specific dataset."""
fun = getattr(
importlib.import_module(f"text_attacks.models.{dataset_name}"),
"get_classify_function",
)
return fun()
return fun(device=device)
def download_dataset(dataset_name):
......
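As a usage sketch (assuming the corresponding fine-tuned model is present under `data/models/`), the dispatcher above loads a dataset-specific module via `importlib` (which also handles module names such as `20_news` that are not valid in a plain `import` statement) and forwards the `device` argument added in this change:

```
from text_attacks.utils import get_classify_function

# Sketch: load the wiki_pl classifier on CPU and classify a single text.
classify = get_classify_function("wiki_pl", device="cpu")
print(classify(["Przykładowy tekst do klasyfikacji."]))
```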