From d3950c57a5d9dcd6acbb453c8bda759dfc324310 Mon Sep 17 00:00:00 2001 From: MGniew <m.f.gniewkowski@gmail.com> Date: Thu, 9 Mar 2023 00:14:13 +0100 Subject: [PATCH] Added poleval dataset --- dvc.lock | 26 +++++++++---------- dvc.yaml | 1 + experiments/scripts/classify.py | 26 +++++++------------ experiments/scripts/download_dataset.py | 30 +++------------------- text_attacks/datasets/__init__.py | 0 text_attacks/datasets/enron_spam.py | 25 +++++++++++++++++++ text_attacks/datasets/poleval.py | 27 ++++++++++++++++++++ text_attacks/models/enron_spam.py | 33 ++++++++++++++++++++----- text_attacks/utils.py | 17 ++++++++++++- 9 files changed, 121 insertions(+), 64 deletions(-) create mode 100644 text_attacks/datasets/__init__.py create mode 100644 text_attacks/datasets/enron_spam.py create mode 100644 text_attacks/datasets/poleval.py diff --git a/dvc.lock b/dvc.lock index 6f7bc72..e0d6202 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,11 +5,11 @@ stages: enron_spam --output_dir data/datasets/enron_spam deps: - path: experiments/scripts/download_dataset.py - md5: dfcc61ca00234b3dbe0e9c04697ae40a - size: 1686 + md5: 9eb915fd5b9216965db519f686408a51 + size: 887 outs: - path: data/datasets/enron_spam/ - md5: b2115d2a6901cd29727f9ed294196544.dir + md5: 66d44efedf37990b1989c81bbee085e0.dir size: 53096069 nfiles: 3 get_model@enron_spam: @@ -17,7 +17,7 @@ stages: --output_dir data/models/enron_spam deps: - path: data/datasets/enron_spam - md5: b2115d2a6901cd29727f9ed294196544.dir + md5: 66d44efedf37990b1989c81bbee085e0.dir size: 53096069 nfiles: 3 - path: experiments/scripts/get_model.py @@ -33,7 +33,7 @@ stages: --output_dir data/classification/enron_spam deps: - path: data/datasets/enron_spam/ - md5: b2115d2a6901cd29727f9ed294196544.dir + md5: 66d44efedf37990b1989c81bbee085e0.dir size: 53096069 nfiles: 3 - path: data/models/enron_spam/ @@ -41,19 +41,19 @@ stages: size: 18505614 nfiles: 6 - path: experiments/scripts/classify.py - md5: 5bd1363bd8cb2742e5d8391a0287cddb - size: 1281 
+ md5: 50f55b90eb47cbf448d83f8392dd37b6 + size: 1102 outs: - path: data/classification/enron_spam - md5: a83267cc1b9d8e210412b725f93902c0.dir - size: 326 - nfiles: 1 + md5: c7d42825b98b289f6a5ed3be1af14413.dir + size: 2763843 + nfiles: 2 explain@enron_spam: cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam --output_dir data/explanations/enron_spam deps: - path: data/datasets/enron_spam - md5: b2115d2a6901cd29727f9ed294196544.dir + md5: 66d44efedf37990b1989c81bbee085e0.dir size: 53096069 nfiles: 3 - path: data/models/enron_spam @@ -65,6 +65,6 @@ stages: size: 1445 outs: - path: data/explanations/enron_spam/ - md5: b05c04769355a99964e67e4b8a15f082.dir - size: 6269580 + md5: 376bd1619c08b4989564788e74de8e06.dir + size: 7870394 nfiles: 1 diff --git a/dvc.yaml b/dvc.yaml index 2c8de24..c035ccb 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -2,6 +2,7 @@ stages: download_dataset: foreach: - enron_spam + - poleval do: wdir: . cmd: >- diff --git a/experiments/scripts/classify.py b/experiments/scripts/classify.py index 4466560..b642d9b 100644 --- a/experiments/scripts/classify.py +++ b/experiments/scripts/classify.py @@ -3,10 +3,9 @@ from pathlib import Path import click import pandas as pd -import torch from sklearn.metrics import classification_report -from text_attacks.utils import get_model_and_tokenizer +from text_attacks.utils import get_classify_function @click.command() @@ -21,32 +20,25 @@ from text_attacks.utils import get_model_and_tokenizer type=click.Path(path_type=Path), ) def main( - dataset_name: str, - output_dir: Path, + dataset_name: str, + output_dir: Path, ): - """Downloads the dataset to the output directory.""" + """Classifies the test data and saves results to the output directory.""" output_dir.mkdir(parents=True, exist_ok=True) - - model, tokenizer = get_model_and_tokenizer( + classify = get_classify_function( dataset_name=dataset_name, ) test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True) test_x = 
test["text"].tolist() test_y = test["label"] - encoded_inputs = tokenizer( - test_x, - return_tensors="pt", - padding=True, - truncation=True, - max_length=512 - ) - logits = model(**encoded_inputs).logits - pred_y = torch.argmax(logits, dim=1).tolist() - pred_y = [model.config.id2label[p] for p in pred_y] + pred_y = classify(test_x) with open(output_dir / "metrics.txt", mode="wt") as fd: fd.write(classification_report(test_y, pred_y)) + test["pred_label"] = pred_y + test.to_json(output_dir / "test.jsonl", orient="records", lines=True) + if __name__ == "__main__": main() diff --git a/experiments/scripts/download_dataset.py b/experiments/scripts/download_dataset.py index ce84cb6..94eafcf 100644 --- a/experiments/scripts/download_dataset.py +++ b/experiments/scripts/download_dataset.py @@ -2,29 +2,8 @@ from pathlib import Path import click -import pandas as pd -from datasets import load_dataset -from sklearn.model_selection import train_test_split - - -def convert(dataset): - train = pd.DataFrame(dataset["train"].to_dict()) - test = pd.DataFrame(dataset["test"].to_dict()) - - train["label"] = train["label_text"] - train = train.rename(columns={"message_id": "id"}) - train = train.drop(columns=["label_text", "subject", "message", "date"]) - - test["label"] = test["label_text"] - test = test.rename(columns={"message_id": "id"}) - test = test.drop(columns=["label_text", "subject", "message", "date"]) - adversarial, test = train_test_split( - test, - test_size=0.9, - stratify=test["label"] - ) - return train, test, adversarial +from text_attacks.utils import download_dataset @click.command() @@ -43,12 +22,9 @@ def main( output_dir: Path, ): """Downloads the dataset to the output directory.""" - dataset_mappings = { - "enron_spam": "SetFit/enron_spam", - } output_dir.mkdir(parents=True, exist_ok=True) - dataset = load_dataset(dataset_mappings[dataset_name]) - train, test, adversarial = convert(dataset) + + train, test, adversarial = download_dataset(dataset_name) 
train.to_json(output_dir / "train.jsonl", orient="records", lines=True) test.to_json(output_dir / "test.jsonl", orient="records", lines=True) adversarial.to_json( diff --git a/text_attacks/datasets/__init__.py b/text_attacks/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/text_attacks/datasets/enron_spam.py b/text_attacks/datasets/enron_spam.py new file mode 100644 index 0000000..35b21f5 --- /dev/null +++ b/text_attacks/datasets/enron_spam.py @@ -0,0 +1,25 @@ +"""Download and preprocess enron_spam""" +import pandas as pd +from datasets import load_dataset +from sklearn.model_selection import train_test_split + + +def download_dataset(): + dataset = load_dataset("SetFit/enron_spam") + train = pd.DataFrame(dataset["train"].to_dict()) + test = pd.DataFrame(dataset["test"].to_dict()) + + train["label"] = train["label_text"] + train = train.rename(columns={"message_id": "id"}) + train = train.drop(columns=["label_text", "subject", "message", "date"]) + + test["label"] = test["label_text"] + test = test.rename(columns={"message_id": "id"}) + test = test.drop(columns=["label_text", "subject", "message", "date"]) + adversarial, test = train_test_split( + test, + test_size=0.9, + stratify=test["label"] + ) + + return train, test, adversarial diff --git a/text_attacks/datasets/poleval.py b/text_attacks/datasets/poleval.py new file mode 100644 index 0000000..193ee02 --- /dev/null +++ b/text_attacks/datasets/poleval.py @@ -0,0 +1,27 @@ +"""Download and preprocess poleval""" +import pandas as pd +from datasets import load_dataset +from sklearn.model_selection import train_test_split + + +def download_dataset(): + dataset = load_dataset("poleval2019_cyberbullying", "task01") + train = pd.DataFrame(dataset["train"].to_dict()) + test = pd.DataFrame(dataset["test"].to_dict()) + + train["id"] = list(range(len(train))) + train["label"] = [ + "hate" if lab == 1 else "normal" for lab in train["label"] + ] + + test["id"] = list(range(len(test))) + 
test["label"] = [ + "hate" if lab == 1 else "normal" for lab in test["label"] + ] + adversarial, test = train_test_split( + test, + test_size=0.9, + stratify=test["label"] + ) + + return train, test, adversarial diff --git a/text_attacks/models/enron_spam.py b/text_attacks/models/enron_spam.py index d87f3e4..063a52a 100644 --- a/text_attacks/models/enron_spam.py +++ b/text_attacks/models/enron_spam.py @@ -1,14 +1,35 @@ """Classification model for enron_spam""" +import os + +import torch from transformers import AutoTokenizer, AutoModelForSequenceClassification def get_model_and_tokenizer(): - tokenizer = AutoTokenizer.from_pretrained( - "mrm8488/bert-tiny-finetuned-enron-spam-detection" - ) - model = AutoModelForSequenceClassification.from_pretrained( - "mrm8488/bert-tiny-finetuned-enron-spam-detection" - ) + model_path = "data/models/endron_spam" + if not os.path.exists(model_path): + model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection" + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = AutoModelForSequenceClassification.from_pretrained(model_path) model.config.id2label = {0: "ham", 1: "spam"} return model, tokenizer + + +def get_classify_function(): + model, tokenizer = get_model_and_tokenizer() + + def fun(texts): + encoded_inputs = tokenizer( + texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=512 + ) + logits = model(**encoded_inputs).logits + pred_y = torch.argmax(logits, dim=1).tolist() + pred_y = [model.config.id2label[p] for p in pred_y] + return pred_y + + return fun diff --git a/text_attacks/utils.py b/text_attacks/utils.py index 82659af..e47d520 100644 --- a/text_attacks/utils.py +++ b/text_attacks/utils.py @@ -10,4 +10,19 @@ def get_model_and_tokenizer(dataset_name): ) return fun() - + +def get_classify_function(dataset_name): + """Return the classify function for a specific dataset.""" + fun = getattr( + importlib.import_module(f"text_attacks.models.{dataset_name}"), + "get_classify_function", + 
) + return fun() + + +def download_dataset(dataset_name): + fun = getattr( + importlib.import_module(f"text_attacks.datasets.{dataset_name}"), + "download_dataset", + ) + return fun() -- GitLab