Skip to content
Snippets Groups Projects
Commit 08e1c133 authored by MGniew's avatar MGniew
Browse files

Added 2 new datasets

parent 68879d57
Branches
No related merge requests found
/enron_spam
/wiki_pl
/20_news
/enron_spam
/20_news
/poleval
/wiki_pl
outs:
- md5: 999207f1c2c123c9943397b47f2c3b3a.dir
size: 23460358
nfiles: 3
path: 20_news
outs:
- md5: abcbccb3e352ed623cace1b95078bd63.dir
size: 29115538
nfiles: 3
path: wiki_pl
/enron_spam
/20_news
/wiki_pl
outs:
- md5: 43d68a67ecb8149bd6bf50db9767cb64.dir
size: 439008808
nfiles: 6
path: 20_news
outs:
- md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
path: wiki_pl
......@@ -3,6 +3,7 @@ from pathlib import Path
import click
import pandas as pd
import torch
from sklearn.metrics import classification_report
from text_attacks.utils import get_classify_function
......@@ -27,6 +28,7 @@ def main(
output_dir.mkdir(parents=True, exist_ok=True)
classify = get_classify_function(
dataset_name=dataset_name,
device="cuda" if torch.cuda.is_available() else "cpu"
)
test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
test_x = test["text"].tolist()
......
......@@ -2,8 +2,8 @@ datasets
transformers
click
scikit-learn
dvc[s3]
shap
dvc[s3]==2.46.0
shap==0.41.0
lpmn_client_biz
--find-links https://download.pytorch.org/whl/torch_stable.html
......
"""Classification model for enron_spam"""
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer(model_path="./data/models/20_news"):
    """Load the fine-tuned 20_news classifier and its tokenizer.

    Args:
        model_path: Directory containing the saved Hugging Face model and
            tokenizer files. Defaults to the project-local checkpoint
            produced by the training pipeline.

    Returns:
        Tuple of (model, tokenizer).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer
def get_classify_function(device="cpu"):
model, tokenizer = get_model_and_tokenizer()
model.eval()
model = model.to(device)
def fun(texts):
logits = list()
i = 0
for chunk in tqdm(
[texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
):
encoded_inputs = tokenizer(
chunk,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(device)
with torch.no_grad():
logits.append(model(**encoded_inputs).logits.cpu())
logits = torch.cat(logits, dim=0)
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
return fun
......@@ -2,12 +2,13 @@
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
model_path = "data/models/endron_spam"
model_path = "./data/models/endron_spam"
if not os.path.exists(model_path):
model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(model_path)
......@@ -16,18 +17,27 @@ def get_model_and_tokenizer():
return model, tokenizer
def get_classify_function():
def get_classify_function(device="cpu"):
model, tokenizer = get_model_and_tokenizer()
model.eval()
model = model.to(device)
def fun(texts):
encoded_inputs = tokenizer(
texts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
logits = model(**encoded_inputs).logits
logits = list()
i = 0
for chunk in tqdm(
[texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
):
encoded_inputs = tokenizer(
chunk,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(device)
with torch.no_grad():
logits.append(model(**encoded_inputs).logits.cpu())
logits = torch.cat(logits, dim=0)
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
......
"""Classification model for enron_spam"""
import os
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer(model_path="./data/models/wiki_pl"):
    """Load the fine-tuned wiki_pl classifier and its tokenizer.

    Args:
        model_path: Directory containing the saved Hugging Face model and
            tokenizer files. Defaults to the project-local checkpoint
            produced by the training pipeline.

    Returns:
        Tuple of (model, tokenizer).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer
def get_classify_function(device="cpu"):
model, tokenizer = get_model_and_tokenizer()
model.eval()
model = model.to(device)
def fun(texts):
logits = list()
i = 0
for chunk in tqdm(
[texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
):
encoded_inputs = tokenizer(
chunk,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
).to(device)
with torch.no_grad():
logits.append(model(**encoded_inputs).logits.cpu())
logits = torch.cat(logits, dim=0)
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
return fun
......@@ -11,13 +11,13 @@ def get_model_and_tokenizer(dataset_name):
return fun()
def get_classify_function(dataset_name):
def get_classify_function(dataset_name, device="cpu"):
"""Return get_model_and_tokenizer for a specific dataset."""
fun = getattr(
importlib.import_module(f"text_attacks.models.{dataset_name}"),
"get_classify_function",
)
return fun()
return fun(device=device)
def download_dataset(dataset_name):
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment