Skip to content
Snippets Groups Projects
Commit d3950c57 authored by MGniew's avatar MGniew
Browse files

Added poleval dataset

parent 9cb6d84a
No related merge requests found
...@@ -5,11 +5,11 @@ stages: ...@@ -5,11 +5,11 @@ stages:
enron_spam --output_dir data/datasets/enron_spam enron_spam --output_dir data/datasets/enron_spam
deps: deps:
- path: experiments/scripts/download_dataset.py - path: experiments/scripts/download_dataset.py
md5: dfcc61ca00234b3dbe0e9c04697ae40a md5: 9eb915fd5b9216965db519f686408a51
size: 1686 size: 887
outs: outs:
- path: data/datasets/enron_spam/ - path: data/datasets/enron_spam/
md5: b2115d2a6901cd29727f9ed294196544.dir md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069 size: 53096069
nfiles: 3 nfiles: 3
get_model@enron_spam: get_model@enron_spam:
...@@ -17,7 +17,7 @@ stages: ...@@ -17,7 +17,7 @@ stages:
--output_dir data/models/enron_spam --output_dir data/models/enron_spam
deps: deps:
- path: data/datasets/enron_spam - path: data/datasets/enron_spam
md5: b2115d2a6901cd29727f9ed294196544.dir md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069 size: 53096069
nfiles: 3 nfiles: 3
- path: experiments/scripts/get_model.py - path: experiments/scripts/get_model.py
...@@ -33,7 +33,7 @@ stages: ...@@ -33,7 +33,7 @@ stages:
--output_dir data/classification/enron_spam --output_dir data/classification/enron_spam
deps: deps:
- path: data/datasets/enron_spam/ - path: data/datasets/enron_spam/
md5: b2115d2a6901cd29727f9ed294196544.dir md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069 size: 53096069
nfiles: 3 nfiles: 3
- path: data/models/enron_spam/ - path: data/models/enron_spam/
...@@ -41,19 +41,19 @@ stages: ...@@ -41,19 +41,19 @@ stages:
size: 18505614 size: 18505614
nfiles: 6 nfiles: 6
- path: experiments/scripts/classify.py - path: experiments/scripts/classify.py
md5: 5bd1363bd8cb2742e5d8391a0287cddb md5: 50f55b90eb47cbf448d83f8392dd37b6
size: 1281 size: 1102
outs: outs:
- path: data/classification/enron_spam - path: data/classification/enron_spam
md5: a83267cc1b9d8e210412b725f93902c0.dir md5: c7d42825b98b289f6a5ed3be1af14413.dir
size: 326 size: 2763843
nfiles: 1 nfiles: 2
explain@enron_spam: explain@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
--output_dir data/explanations/enron_spam --output_dir data/explanations/enron_spam
deps: deps:
- path: data/datasets/enron_spam - path: data/datasets/enron_spam
md5: b2115d2a6901cd29727f9ed294196544.dir md5: 66d44efedf37990b1989c81bbee085e0.dir
size: 53096069 size: 53096069
nfiles: 3 nfiles: 3
- path: data/models/enron_spam - path: data/models/enron_spam
...@@ -65,6 +65,6 @@ stages: ...@@ -65,6 +65,6 @@ stages:
size: 1445 size: 1445
outs: outs:
- path: data/explanations/enron_spam/ - path: data/explanations/enron_spam/
md5: b05c04769355a99964e67e4b8a15f082.dir md5: 376bd1619c08b4989564788e74de8e06.dir
size: 6269580 size: 7870394
nfiles: 1 nfiles: 1
...@@ -2,6 +2,7 @@ stages: ...@@ -2,6 +2,7 @@ stages:
download_dataset: download_dataset:
foreach: foreach:
- enron_spam - enron_spam
- poleval
do: do:
wdir: . wdir: .
cmd: >- cmd: >-
......
...@@ -3,10 +3,9 @@ from pathlib import Path ...@@ -3,10 +3,9 @@ from pathlib import Path
import click import click
import pandas as pd import pandas as pd
import torch
from sklearn.metrics import classification_report from sklearn.metrics import classification_report
from text_attacks.utils import get_model_and_tokenizer from text_attacks.utils import get_classify_function
@click.command() @click.command()
...@@ -21,32 +20,25 @@ from text_attacks.utils import get_model_and_tokenizer ...@@ -21,32 +20,25 @@ from text_attacks.utils import get_model_and_tokenizer
type=click.Path(path_type=Path), type=click.Path(path_type=Path),
) )
def main( def main(
dataset_name: str, dataset_name: str,
output_dir: Path, output_dir: Path,
): ):
"""Downloads the dataset to the output directory.""" """Classifies the test data and saves results to the output directory."""
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
classify = get_classify_function(
model, tokenizer = get_model_and_tokenizer(
dataset_name=dataset_name, dataset_name=dataset_name,
) )
test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True) test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
test_x = test["text"].tolist() test_x = test["text"].tolist()
test_y = test["label"] test_y = test["label"]
encoded_inputs = tokenizer( pred_y = classify(test_x)
test_x,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
logits = model(**encoded_inputs).logits
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
with open(output_dir / "metrics.txt", mode="wt") as fd: with open(output_dir / "metrics.txt", mode="wt") as fd:
fd.write(classification_report(test_y, pred_y)) fd.write(classification_report(test_y, pred_y))
test["pred_label"] = pred_y
test.to_json(output_dir / "test.jsonl", orient="records", lines=True)
if __name__ == "__main__": if __name__ == "__main__":
main() main()
...@@ -2,29 +2,8 @@ ...@@ -2,29 +2,8 @@
from pathlib import Path from pathlib import Path
import click import click
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def convert(dataset):
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["label"] = train["label_text"]
train = train.rename(columns={"message_id": "id"})
train = train.drop(columns=["label_text", "subject", "message", "date"])
test["label"] = test["label_text"]
test = test.rename(columns={"message_id": "id"})
test = test.drop(columns=["label_text", "subject", "message", "date"])
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial from text_attacks.utils import download_dataset
@click.command() @click.command()
...@@ -43,12 +22,9 @@ def main( ...@@ -43,12 +22,9 @@ def main(
output_dir: Path, output_dir: Path,
): ):
"""Downloads the dataset to the output directory.""" """Downloads the dataset to the output directory."""
dataset_mappings = {
"enron_spam": "SetFit/enron_spam",
}
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
dataset = load_dataset(dataset_mappings[dataset_name])
train, test, adversarial = convert(dataset) train, test, adversarial = download_dataset(dataset_name)
train.to_json(output_dir / "train.jsonl", orient="records", lines=True) train.to_json(output_dir / "train.jsonl", orient="records", lines=True)
test.to_json(output_dir / "test.jsonl", orient="records", lines=True) test.to_json(output_dir / "test.jsonl", orient="records", lines=True)
adversarial.to_json( adversarial.to_json(
......
"""Download and preprocess enron_spam"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
    """Load ``SetFit/enron_spam`` and return ``(train, test, adversarial)``.

    For both the train and test splits the ``label`` column is set from
    ``label_text``, ``message_id`` is renamed to ``id``, and the
    ``label_text``, ``subject``, ``message`` and ``date`` columns are dropped.
    The loaded test split is then divided, stratified by label, so that
    ``test`` keeps 90% of the rows and ``adversarial`` gets the rest.

    No ``random_state`` is passed to ``train_test_split``, so the division
    depends on NumPy's global random state.
    """
    raw = load_dataset("SetFit/enron_spam")
    unused_columns = ["label_text", "subject", "message", "date"]
    id_rename = {"message_id": "id"}

    train_df = pd.DataFrame(raw["train"].to_dict())
    train_df["label"] = train_df["label_text"]
    train_df = train_df.rename(columns=id_rename)
    train_df = train_df.drop(columns=unused_columns)

    full_test_df = pd.DataFrame(raw["test"].to_dict())
    full_test_df["label"] = full_test_df["label_text"]
    full_test_df = full_test_df.rename(columns=id_rename)
    full_test_df = full_test_df.drop(columns=unused_columns)

    # train_test_split returns (remainder, test_size part): the 10% remainder
    # becomes the adversarial set, the 90% part stays as the test set.
    adversarial_df, test_df = train_test_split(
        full_test_df,
        test_size=0.9,
        stratify=full_test_df["label"],
    )
    return train_df, test_df, adversarial_df
"""Download and preprocess poleval"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
    """Load PolEval 2019 cyberbullying (task01) as ``(train, test, adversarial)``.

    Each split receives a sequential integer ``id`` column (0..len-1), and its
    ``label`` column is replaced by text: ``"hate"`` where the value equals 1,
    ``"normal"`` otherwise. The loaded test split is then divided, stratified
    by label, so that ``test`` keeps 90% of the rows and ``adversarial`` gets
    the rest.

    No ``random_state`` is passed to ``train_test_split``, so the division
    depends on NumPy's global random state.
    """
    raw = load_dataset("poleval2019_cyberbullying", "task01")

    frames = []
    for split_name in ("train", "test"):
        frame = pd.DataFrame(raw[split_name].to_dict())
        frame["id"] = list(range(len(frame)))
        frame["label"] = [
            "hate" if lab == 1 else "normal" for lab in frame["label"]
        ]
        frames.append(frame)
    train_df, full_test_df = frames

    # train_test_split returns (remainder, test_size part): the 10% remainder
    # becomes the adversarial set, the 90% part stays as the test set.
    adversarial_df, test_df = train_test_split(
        full_test_df,
        test_size=0.9,
        stratify=full_test_df["label"],
    )
    return train_df, test_df, adversarial_df
"""Classification model for enron_spam""" """Classification model for enron_spam"""
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
    """Load the enron_spam sequence classifier and its tokenizer.

    A local copy under ``data/models/enron_spam`` is used when that path
    exists; otherwise the ``mrm8488/bert-tiny-finetuned-enron-spam-detection``
    identifier is passed to ``from_pretrained`` instead.

    Returns:
        A ``(model, tokenizer)`` tuple. ``model.config.id2label`` is set to
        ``{0: "ham", 1: "spam"}``.
    """
    # Must match the directory the pipeline writes the model to. A misspelled
    # path makes the existence check always fail and silently forces the
    # remote fallback.
    # NOTE(review): assumes the directory holds files loadable by
    # from_pretrained (saved by the get_model stage) - confirm.
    model_path = "data/models/enron_spam"
    if not os.path.exists(model_path):
        model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.config.id2label = {0: "ham", 1: "spam"}
    return model, tokenizer
def get_classify_function():
    """Build a function that labels texts with the enron_spam classifier.

    The model and tokenizer are loaded once, when this factory is called, and
    captured by the returned closure.

    Returns:
        A function ``fun(texts)`` that tokenizes ``texts`` in a single padded
        batch (truncated to 512 tokens), runs the model, and returns a list
        with one ``model.config.id2label`` entry per input.
    """
    model, tokenizer = get_model_and_tokenizer()

    def fun(texts):
        encoded_inputs = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        # Inference only: disable autograd so no computation graph is built
        # or kept alive for the whole batch.
        with torch.no_grad():
            logits = model(**encoded_inputs).logits
        predicted_ids = torch.argmax(logits, dim=1).tolist()
        return [model.config.id2label[idx] for idx in predicted_ids]

    return fun
...@@ -10,4 +10,19 @@ def get_model_and_tokenizer(dataset_name): ...@@ -10,4 +10,19 @@ def get_model_and_tokenizer(dataset_name):
) )
return fun() return fun()
def get_classify_function(dataset_name):
    """Return the classify function provided for a specific dataset.

    Imports ``text_attacks.models.<dataset_name>``, looks up that module's
    ``get_classify_function`` and returns the result of calling it with no
    arguments.
    """
    module = importlib.import_module(f"text_attacks.models.{dataset_name}")
    factory = getattr(module, "get_classify_function")
    return factory()
def download_dataset(dataset_name):
    """Run the dataset-specific download function for ``dataset_name``.

    Imports ``text_attacks.datasets.<dataset_name>``, looks up that module's
    ``download_dataset`` and returns the result of calling it with no
    arguments.
    """
    module = importlib.import_module(f"text_attacks.datasets.{dataset_name}")
    downloader = getattr(module, "download_dataset")
    return downloader()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment