Commit d3950c57 authored by MGniew

Added poleval dataset

parent 9cb6d84a
dvc.lock
......@@ -5,11 +5,11 @@ stages:
enron_spam --output_dir data/datasets/enron_spam
deps:
- path: experiments/scripts/download_dataset.py
md5: dfcc61ca00234b3dbe0e9c04697ae40a
size: 1686
md5: 9eb915fd5b9216965db519f686408a51
size: 887
outs:
- path: data/datasets/enron_spam/
md5: b2115d2a6901cd29727f9ed294196544.dir
md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069
nfiles: 3
get_model@enron_spam:
......@@ -17,7 +17,7 @@ stages:
--output_dir data/models/enron_spam
deps:
- path: data/datasets/enron_spam
md5: b2115d2a6901cd29727f9ed294196544.dir
md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069
nfiles: 3
- path: experiments/scripts/get_model.py
......@@ -33,7 +33,7 @@ stages:
--output_dir data/classification/enron_spam
deps:
- path: data/datasets/enron_spam/
md5: b2115d2a6901cd29727f9ed294196544.dir
md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069
nfiles: 3
- path: data/models/enron_spam/
......@@ -41,19 +41,19 @@ stages:
size: 18505614
nfiles: 6
- path: experiments/scripts/classify.py
md5: 5bd1363bd8cb2742e5d8391a0287cddb
size: 1281
md5: 50f55b90eb47cbf448d83f8392dd37b6
size: 1102
outs:
- path: data/classification/enron_spam
md5: a83267cc1b9d8e210412b725f93902c0.dir
size: 326
nfiles: 1
md5: c7d42825b98b289f6a5ed3be1af14413.dir
size: 2763843
nfiles: 2
explain@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
--output_dir data/explanations/enron_spam
deps:
- path: data/datasets/enron_spam
md5: b2115d2a6901cd29727f9ed294196544.dir
md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069
nfiles: 3
- path: data/models/enron_spam
......@@ -65,6 +65,6 @@ stages:
size: 1445
outs:
- path: data/explanations/enron_spam/
md5: b05c04769355a99964e67e4b8a15f082.dir
size: 6269580
md5: 376bd1619c08b4989564788e74de8e06.dir
size: 7870394
nfiles: 1
dvc.yaml
......@@ -2,6 +2,7 @@ stages:
download_dataset:
foreach:
- enron_spam
- poleval
do:
wdir: .
cmd: >-
......
experiments/scripts/classify.py
......@@ -3,10 +3,9 @@ from pathlib import Path
import click
import pandas as pd
import torch
from sklearn.metrics import classification_report
from text_attacks.utils import get_model_and_tokenizer
from text_attacks.utils import get_classify_function
@click.command()
......@@ -24,29 +23,22 @@ def main(
    dataset_name: str,
    output_dir: Path,
):
    """Downloads the dataset to the output directory."""
    """Classifies the test data and saves results to the output directory."""
    output_dir.mkdir(parents=True, exist_ok=True)
    model, tokenizer = get_model_and_tokenizer(
    classify = get_classify_function(
        dataset_name=dataset_name,
    )
    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
    test_x = test["text"].tolist()
    test_y = test["label"]
    encoded_inputs = tokenizer(
        test_x,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    logits = model(**encoded_inputs).logits
    pred_y = torch.argmax(logits, dim=1).tolist()
    pred_y = [model.config.id2label[p] for p in pred_y]
    pred_y = classify(test_x)
    with open(output_dir / "metrics.txt", mode="wt") as fd:
        fd.write(classification_report(test_y, pred_y))
    test["pred_label"] = pred_y
    test.to_json(output_dir / "test.jsonl", orient="records", lines=True)


if __name__ == "__main__":
    main()
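A minimal usage sketch of the refactored flow, assuming the enron_spam model defined further down; the example texts are invented:

from text_attacks.utils import get_classify_function

# Resolve the per-dataset classifier; for enron_spam this loads the
# BERT-tiny spam model and returns a callable mapping texts to labels.
classify = get_classify_function(dataset_name="enron_spam")

preds = classify(["Win a free cruise today!", "Meeting moved to 3 pm."])
print(preds)  # expected: something like ['spam', 'ham']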
experiments/scripts/download_dataset.py
......@@ -2,29 +2,8 @@
from pathlib import Path
import click
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def convert(dataset):
    train = pd.DataFrame(dataset["train"].to_dict())
    test = pd.DataFrame(dataset["test"].to_dict())
    train["label"] = train["label_text"]
    train = train.rename(columns={"message_id": "id"})
    train = train.drop(columns=["label_text", "subject", "message", "date"])
    test["label"] = test["label_text"]
    test = test.rename(columns={"message_id": "id"})
    test = test.drop(columns=["label_text", "subject", "message", "date"])
    adversarial, test = train_test_split(
        test,
        test_size=0.9,
        stratify=test["label"]
    )
    return train, test, adversarial


from text_attacks.utils import download_dataset
@click.command()
......@@ -43,12 +22,9 @@ def main(
    output_dir: Path,
):
    """Downloads the dataset to the output directory."""
    dataset_mappings = {
        "enron_spam": "SetFit/enron_spam",
    }
    output_dir.mkdir(parents=True, exist_ok=True)
    dataset = load_dataset(dataset_mappings[dataset_name])
    train, test, adversarial = convert(dataset)
    train, test, adversarial = download_dataset(dataset_name)
    train.to_json(output_dir / "train.jsonl", orient="records", lines=True)
    test.to_json(output_dir / "test.jsonl", orient="records", lines=True)
    adversarial.to_json(
......
"""Classification model for enron_spam"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
dataset = load_dataset("SetFit/enron_spam")
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["label"] = train["label_text"]
train = train.rename(columns={"message_id": "id"})
train = train.drop(columns=["label_text", "subject", "message", "date"])
test["label"] = test["label_text"]
test = test.rename(columns={"message_id": "id"})
test = test.drop(columns=["label_text", "subject", "message", "date"])
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial
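A rough sanity check of what this download_dataset() yields, assuming the SetFit/enron_spam schema (message_id, text, label, label_text, subject, message, date): the frames reduce to id/text/label, and the 0.9 test_size leaves 10% of the original test split in the adversarial pool.

train, test, adversarial = download_dataset()

# Columns remaining after the rename/drop steps above (assumed upstream schema).
print(sorted(train.columns))  # expected: ['id', 'label', 'text']

# train_test_split(test_size=0.9) keeps 90% in `test` and 10% in `adversarial`,
# stratified by label.
print(round(len(adversarial) / (len(adversarial) + len(test)), 2))  # ~0.1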
"""Download and preprecess poleval"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
dataset = load_dataset("poleval2019_cyberbullying", "task01")
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["id"] = list(range(len(train)))
train["label"] = [
"hate" if lab == 1 else "normal" for lab in train["label"]
]
test["id"] = list(range(len(test)))
test["label"] = [
"hate" if lab == 1 else "normal" for lab in test["label"]
]
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial
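A small illustration of the label remapping above, assuming task01 uses integer labels with 1 marking harmful tweets; the label list is invented:

# Integer poleval labels are rewritten as strings so downstream stages can
# treat poleval and enron_spam uniformly.
labels = [0, 1, 0, 0, 1]
print(["hate" if lab == 1 else "normal" for lab in labels])
# ['normal', 'hate', 'normal', 'normal', 'hate']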
"""Classification model for enron_spam"""
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
tokenizer = AutoTokenizer.from_pretrained(
"mrm8488/bert-tiny-finetuned-enron-spam-detection"
)
model = AutoModelForSequenceClassification.from_pretrained(
"mrm8488/bert-tiny-finetuned-enron-spam-detection"
)
model_path = "data/models/endron_spam"
if not os.path.exists(model_path):
model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.config.id2label = {0: "ham", 1: "spam"}
return model, tokenizer
def get_classify_function():
model, tokenizer = get_model_and_tokenizer()
def fun(texts):
encoded_inputs = tokenizer(
texts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
logits = model(**encoded_inputs).logits
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
return fun
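The id2label override above is what turns argmax indices into the string labels the rest of the pipeline expects; a standalone illustration with invented logits:

import torch

# Invented logits for two emails: the first scores higher on class 1 ("spam").
logits = torch.tensor([[-1.2, 2.3],
                       [0.8, -0.5]])

id2label = {0: "ham", 1: "spam"}
pred_y = torch.argmax(logits, dim=1).tolist()   # [1, 0]
print([id2label[p] for p in pred_y])            # ['spam', 'ham']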
text_attacks/utils.py
......@@ -11,3 +11,18 @@ def get_model_and_tokenizer(dataset_name):
    return fun()


def get_classify_function(dataset_name):
    """Return the classify function for a specific dataset."""
    fun = getattr(
        importlib.import_module(f"text_attacks.models.{dataset_name}"),
        "get_classify_function",
    )
    return fun()


def download_dataset(dataset_name):
    """Download the train/test/adversarial splits for a specific dataset."""
    fun = getattr(
        importlib.import_module(f"text_attacks.datasets.{dataset_name}"),
        "download_dataset",
    )
    return fun()
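A short sketch of the dispatch convention these helpers rely on, assuming the module layout added in this commit (text_attacks/datasets/poleval.py, text_attacks/models/enron_spam.py):

from text_attacks.utils import download_dataset, get_classify_function

# importlib resolves text_attacks.datasets.poleval.download_dataset
# and calls it, returning the train/test/adversarial dataframes.
train, test, adversarial = download_dataset("poleval")

# Same pattern for models: text_attacks.models.enron_spam.get_classify_function.
classify = get_classify_function("enron_spam")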