Commit d3950c57 authored by MGniew

Added poleval dataset

parent 9cb6d84a
dvc.lock
@@ -5,11 +5,11 @@ stages:
       enron_spam --output_dir data/datasets/enron_spam
     deps:
     - path: experiments/scripts/download_dataset.py
-      md5: dfcc61ca00234b3dbe0e9c04697ae40a
-      size: 1686
+      md5: 9eb915fd5b9216965db519f686408a51
+      size: 887
     outs:
     - path: data/datasets/enron_spam/
-      md5: b2115d2a6901cd29727f9ed294196544.dir
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
       size: 53096069
       nfiles: 3
   get_model@enron_spam:
@@ -17,7 +17,7 @@ stages:
       --output_dir data/models/enron_spam
     deps:
     - path: data/datasets/enron_spam
-      md5: b2115d2a6901cd29727f9ed294196544.dir
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
       size: 53096069
       nfiles: 3
     - path: experiments/scripts/get_model.py
@@ -33,7 +33,7 @@ stages:
      --output_dir data/classification/enron_spam
     deps:
     - path: data/datasets/enron_spam/
-      md5: b2115d2a6901cd29727f9ed294196544.dir
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
       size: 53096069
       nfiles: 3
     - path: data/models/enron_spam/
@@ -41,19 +41,19 @@ stages:
       size: 18505614
       nfiles: 6
     - path: experiments/scripts/classify.py
-      md5: 5bd1363bd8cb2742e5d8391a0287cddb
-      size: 1281
+      md5: 50f55b90eb47cbf448d83f8392dd37b6
+      size: 1102
     outs:
     - path: data/classification/enron_spam
-      md5: a83267cc1b9d8e210412b725f93902c0.dir
-      size: 326
-      nfiles: 1
+      md5: c7d42825b98b289f6a5ed3be1af14413.dir
+      size: 2763843
+      nfiles: 2
   explain@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
       --output_dir data/explanations/enron_spam
     deps:
     - path: data/datasets/enron_spam
-      md5: b2115d2a6901cd29727f9ed294196544.dir
+      md5: 66d44efedf37990b1989c81bbee085e0.dir
       size: 53096069
       nfiles: 3
     - path: data/models/enron_spam
@@ -65,6 +65,6 @@ stages:
       size: 1445
     outs:
     - path: data/explanations/enron_spam/
-      md5: b05c04769355a99964e67e4b8a15f082.dir
-      size: 6269580
+      md5: 376bd1619c08b4989564788e74de8e06.dir
+      size: 7870394
       nfiles: 1
dvc.yaml
@@ -2,6 +2,7 @@ stages:
   download_dataset:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
experiments/scripts/classify.py
@@ -3,10 +3,9 @@ from pathlib import Path
 import click
 import pandas as pd
-import torch
 from sklearn.metrics import classification_report
-from text_attacks.utils import get_model_and_tokenizer
+from text_attacks.utils import get_classify_function
 @click.command()
@@ -21,32 +20,25 @@ from text_attacks.utils import get_model_and_tokenizer
     type=click.Path(path_type=Path),
 )
 def main(
-        dataset_name: str,
-        output_dir: Path,
+    dataset_name: str,
+    output_dir: Path,
 ):
-    """Downloads the dataset to the output directory."""
+    """Classifies the test data and saves results to the output directory."""
     output_dir.mkdir(parents=True, exist_ok=True)
-    model, tokenizer = get_model_and_tokenizer(
+    classify = get_classify_function(
         dataset_name=dataset_name,
     )
     test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
     test_x = test["text"].tolist()
     test_y = test["label"]
-    encoded_inputs = tokenizer(
-        test_x,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=512
-    )
-    logits = model(**encoded_inputs).logits
-    pred_y = torch.argmax(logits, dim=1).tolist()
-    pred_y = [model.config.id2label[p] for p in pred_y]
+    pred_y = classify(test_x)
     with open(output_dir / "metrics.txt", mode="wt") as fd:
         fd.write(classification_report(test_y, pred_y))
     test["pred_label"] = pred_y
     test.to_json(output_dir / "test.jsonl", orient="records", lines=True)
 if __name__ == "__main__":
     main()
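Since the refactored script still writes both metrics.txt and test.jsonl (the classification output now tracks two files in dvc.lock), the report can also be recomputed offline from the saved predictions. A minimal sketch, not part of this commit, assuming the classify stage has already produced data/classification/enron_spam/test.jsonl with the label and pred_label columns shown above:

import pandas as pd
from sklearn.metrics import classification_report

# read the predictions written by experiments/scripts/classify.py
preds = pd.read_json("data/classification/enron_spam/test.jsonl", lines=True)
# compare gold labels against the model's predictions
print(classification_report(preds["label"], preds["pred_label"]))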
experiments/scripts/download_dataset.py
@@ -2,29 +2,8 @@
 from pathlib import Path
 import click
 import pandas as pd
-from datasets import load_dataset
-from sklearn.model_selection import train_test_split
-def convert(dataset):
-    train = pd.DataFrame(dataset["train"].to_dict())
-    test = pd.DataFrame(dataset["test"].to_dict())
-    train["label"] = train["label_text"]
-    train = train.rename(columns={"message_id": "id"})
-    train = train.drop(columns=["label_text", "subject", "message", "date"])
-    test["label"] = test["label_text"]
-    test = test.rename(columns={"message_id": "id"})
-    test = test.drop(columns=["label_text", "subject", "message", "date"])
-    adversarial, test = train_test_split(
-        test,
-        test_size=0.9,
-        stratify=test["label"]
-    )
-    return train, test, adversarial
+from text_attacks.utils import download_dataset
 @click.command()
@@ -43,12 +22,9 @@ def main(
     output_dir: Path,
 ):
     """Downloads the dataset to the output directory."""
-    dataset_mappings = {
-        "enron_spam": "SetFit/enron_spam",
-    }
     output_dir.mkdir(parents=True, exist_ok=True)
-    dataset = load_dataset(dataset_mappings[dataset_name])
-    train, test, adversarial = convert(dataset)
+    train, test, adversarial = download_dataset(dataset_name)
     train.to_json(output_dir / "train.jsonl", orient="records", lines=True)
     test.to_json(output_dir / "test.jsonl", orient="records", lines=True)
     adversarial.to_json(
"""Classification model for enron_spam"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
dataset = load_dataset("SetFit/enron_spam")
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["label"] = train["label_text"]
train = train.rename(columns={"message_id": "id"})
train = train.drop(columns=["label_text", "subject", "message", "date"])
test["label"] = test["label_text"]
test = test.rename(columns={"message_id": "id"})
test = test.drop(columns=["label_text", "subject", "message", "date"])
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial
"""Download and preprecess poleval"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
dataset = load_dataset("poleval2019_cyberbullying", "task01")
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["id"] = list(range(len(train)))
train["label"] = [
"hate" if lab == 1 else "normal" for lab in train["label"]
]
test["id"] = list(range(len(test)))
test["label"] = [
"hate" if lab == 1 else "normal" for lab in test["label"]
]
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial
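A quick, illustrative sanity check for the two dataset modules above (not part of the commit). It assumes the modules live under text_attacks/datasets/, as implied by the importlib dispatch in text_attacks/utils.py, and that the splits expose a text column alongside id and label; running it downloads poleval2019_cyberbullying from the Hugging Face hub:

from text_attacks.datasets.poleval import download_dataset

train, test, adversarial = download_dataset()
for name, frame in [("train", train), ("test", test), ("adversarial", adversarial)]:
    # each split should carry id/text/label columns and both label values
    print(name, len(frame), sorted(frame["label"].unique()))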
"""Classification model for enron_spam"""
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
tokenizer = AutoTokenizer.from_pretrained(
"mrm8488/bert-tiny-finetuned-enron-spam-detection"
)
model = AutoModelForSequenceClassification.from_pretrained(
"mrm8488/bert-tiny-finetuned-enron-spam-detection"
)
model_path = "data/models/endron_spam"
if not os.path.exists(model_path):
model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.config.id2label = {0: "ham", 1: "spam"}
return model, tokenizer
def get_classify_function():
model, tokenizer = get_model_and_tokenizer()
def fun(texts):
encoded_inputs = tokenizer(
texts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
logits = model(**encoded_inputs).logits
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
return pred_y
return fun
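Illustrative usage of get_classify_function (not part of the commit); the module path is assumed from the importlib dispatch in text_attacks/utils.py, and the two example strings are made up:

from text_attacks.models.enron_spam import get_classify_function

classify = get_classify_function()
labels = classify([
    "Congratulations, you have won a free prize, click here now!",
    "Meeting moved to 3pm, agenda attached.",
])
# labels is a list of "ham"/"spam" strings, one per input text
print(labels)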
text_attacks/utils.py
@@ -10,4 +10,19 @@ def get_model_and_tokenizer(dataset_name):
     )
     return fun()
+
+def get_classify_function(dataset_name):
+    """Return the classify function for a specific dataset."""
+    fun = getattr(
+        importlib.import_module(f"text_attacks.models.{dataset_name}"),
+        "get_classify_function",
+    )
+    return fun()
+
+def download_dataset(dataset_name):
+    fun = getattr(
+        importlib.import_module(f"text_attacks.datasets.{dataset_name}"),
+        "download_dataset",
+    )
+    return fun()
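The two helpers above define the extension contract: adding another dataset only requires a module under text_attacks/datasets/ that exposes download_dataset(), plus an entry in the dvc.yaml foreach list so the download stage runs for it. A minimal sketch of such a module; my_dataset and its single example row are hypothetical placeholders:

# text_attacks/datasets/my_dataset.py (hypothetical)
import pandas as pd


def download_dataset():
    # must return (train, test, adversarial) DataFrames with id/text/label columns
    train = pd.DataFrame({"id": [0], "text": ["example text"], "label": ["normal"]})
    test = pd.DataFrame({"id": [1], "text": ["another example"], "label": ["normal"]})
    adversarial = test.copy()
    return train, test, adversarial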