From 6c9a8020aee2cc21de40e4d1647141f6354dfba4 Mon Sep 17 00:00:00 2001
From: MGniew <m.f.gniewkowski@gmail.com>
Date: Mon, 27 Mar 2023 00:31:27 +0200
Subject: [PATCH] poleval

---
 .gitignore                     |   1 +
 data/classification/.gitignore |   1 +
 data/explanations/.gitignore   |   1 +
 data/models/.gitignore         |   1 +
 dvc.lock                       |  56 +++++++++++
 dvc.yaml                       |   3 +
 text_attacks/models/poleval.py | 173 ++++++++++++++++++++++++++++++++-
 7 files changed, 233 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 853a525..92e0643 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+tmp/
 
 # PyInstaller
 # Usually these files are written by a python script from a template
diff --git a/data/classification/.gitignore b/data/classification/.gitignore
index e695872..27e43f9 100644
--- a/data/classification/.gitignore
+++ b/data/classification/.gitignore
@@ -1,3 +1,4 @@
 /enron_spam
 /wiki_pl
 /20_news
+/poleval
diff --git a/data/explanations/.gitignore b/data/explanations/.gitignore
index e695872..27e43f9 100644
--- a/data/explanations/.gitignore
+++ b/data/explanations/.gitignore
@@ -1,3 +1,4 @@
 /enron_spam
 /wiki_pl
 /20_news
+/poleval
diff --git a/data/models/.gitignore b/data/models/.gitignore
index ea22867..f37e1f6 100644
--- a/data/models/.gitignore
+++ b/data/models/.gitignore
@@ -1,3 +1,4 @@
 /enron_spam
 /20_news
 /wiki_pl
+/poleval
diff --git a/dvc.lock b/dvc.lock
index 3db2d0d..bbd4afc 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -639,3 +639,59 @@ stages:
       md5: d9595138a6aca9cad4c39b1c5fdbfce1.dir
       size: 38396600
       nfiles: 2
+  get_model@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name poleval
+      --output_dir data/models/poleval
+    deps:
+    - path: data/preprocessed/poleval
+      md5: b0ea9f0ad1dba6d3b474c0a3cedf866e.dir
+      size: 2812175
+      nfiles: 3
+    - path: experiments/scripts/get_model.py
+      md5: 5050f51b4019bba97af47971f6c7cab4
+      size: 747
+    outs:
+    - path: data/models/poleval/
+      md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
+      size: 501609312
+      nfiles: 7
+  explain@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name poleval
+      --output_dir data/explanations/poleval
+    deps:
+    - path: data/models/poleval
+      md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
+      size: 501609312
+      nfiles: 7
+    - path: data/preprocessed/poleval
+      md5: b0ea9f0ad1dba6d3b474c0a3cedf866e.dir
+      size: 2812175
+      nfiles: 3
+    - path: experiments/scripts/explain.py
+      md5: 4d82a557627f59c884f52fd7994ed80a
+      size: 4617
+    outs:
+    - path: data/explanations/poleval/
+      md5: 246ec6832b514bbfc7ff6bcdb0d9f292.dir
+      size: 2377833
+      nfiles: 2003
+  classify@poleval:
+    cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name poleval
+      --output_dir data/classification/poleval
+    deps:
+    - path: data/models/poleval/
+      md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
+      size: 501609312
+      nfiles: 7
+    - path: data/preprocessed/poleval/
+      md5: b0ea9f0ad1dba6d3b474c0a3cedf866e.dir
+      size: 2812175
+      nfiles: 3
+    - path: experiments/scripts/classify.py
+      md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
+      size: 1181
+    outs:
+    - path: data/classification/poleval
+      md5: f207458f9365a74672c31b5ffb2a83af.dir
+      size: 787456
+      nfiles: 2
diff --git a/dvc.yaml b/dvc.yaml
index 0455a1b..e0cd0d3 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -32,6 +32,7 @@ stages:
   get_model:
     foreach:
       - enron_spam
+      - poleval
     do:
       wdir: .
       cmd: >-
@@ -46,6 +47,7 @@ stages:
   classify:
     foreach:
       - enron_spam
+      - poleval
       - 20_news
       - wiki_pl
     do:
@@ -63,6 +65,7 @@ stages:
   explain:
     foreach:
       - enron_spam
+      - poleval
       - 20_news
       - wiki_pl
     do:
diff --git a/text_attacks/models/poleval.py b/text_attacks/models/poleval.py
index a037f8d..cc2b0c1 100644
--- a/text_attacks/models/poleval.py
+++ b/text_attacks/models/poleval.py
@@ -1,13 +1,180 @@
-"""Classification model for enron_spam"""
+"""Classification model for poleval (hate-speech detection)."""
+import os
+
+import pandas as pd
+import torch
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+from tqdm import tqdm
+from transformers import AutoConfig, BertForSequenceClassification
+from transformers import AutoTokenizer, TrainingArguments, Trainer
+from transformers import AutoModelForSequenceClassification
+
+
+class Dataset(torch.utils.data.Dataset):
+    """Wraps tokenizer encodings and labels for the HF Trainer."""
+
+    def __init__(self, encodings, labels):
+        self.encodings = encodings
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
+        item["labels"] = torch.tensor([self.labels[idx]])
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = pred.predictions.argmax(-1)
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        labels, preds, average='binary'
+    )
+    acc = accuracy_score(labels, preds)
+    return {
+        'accuracy': acc,
+        'f1': f1,
+        'precision': precision,
+        'recall': recall
+    }
+
+
+def train_model():
+    # Fine-tune the wiki_pl checkpoint on poleval; the classification head
+    # is re-initialized for two labels (0: normal, 1: hate).
+    tokenizer = AutoTokenizer.from_pretrained("./data/models/wiki_pl")
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "./data/models/wiki_pl", num_labels=2,
+        ignore_mismatched_sizes=True
+    )
+
+    test = pd.read_json("data/preprocessed/poleval/test.jsonl", lines=True)
+    train = pd.read_json("data/preprocessed/poleval/train.jsonl", lines=True)
+    y_test = [0 if y == "normal" else 1 for y in test["label"]]
+    y_train = [0 if y == "normal" else 1 for y in train["label"]]
+    x_test = test["text"].tolist()
+    x_train = train["text"].tolist()
+
+    train_encodings = tokenizer(
+        x_train, truncation=True, padding=True, max_length=512
+    )
+    train_dataset = Dataset(train_encodings, y_train)
+
+    test_encodings = tokenizer(
+        x_test, truncation=True, padding=True, max_length=512
+    )
+    test_dataset = Dataset(test_encodings, y_test)
+
+    training_args = TrainingArguments(
+        output_dir='./tmp',
+        num_train_epochs=100,
+        warmup_steps=100,
+        weight_decay=0.01,
+        per_device_train_batch_size=32,
+        per_device_eval_batch_size=32,
+        logging_dir='./tmp/logs',
+        logging_steps=500,
+        save_steps=500,
+        save_total_limit=10,
+        learning_rate=1e-5,
+        load_best_model_at_end=True,
+        evaluation_strategy="steps",
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=test_dataset,
+        compute_metrics=compute_metrics,
+    )
+    trainer.train()
+    return model, tokenizer
+
+
+def train_model_old():
+    # Legacy variant: trains a tiny BERT from scratch, reusing the config of
+    # an enron-spam checkpoint with a multilingual vocabulary.
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
+    config = AutoConfig.from_pretrained(
+        "mrm8488/bert-tiny-finetuned-enron-spam-detection"
+    )
+    config.update({"vocab_size": tokenizer.vocab_size})
+
+    model = BertForSequenceClassification(config)
+    test = pd.read_json("data/preprocessed/poleval/test.jsonl", lines=True)
+    train = pd.read_json("data/preprocessed/poleval/train.jsonl", lines=True)
+    y_test = [0 if y == "normal" else 1 for y in test["label"]]
+    y_train = [0 if y == "normal" else 1 for y in train["label"]]
+    x_test = test["text"].tolist()
+    x_train = train["text"].tolist()
+
+    train_encodings = tokenizer(
+        x_train, truncation=True, padding=True, max_length=512
+    )
+    train_dataset = Dataset(train_encodings, y_train)
+
+    test_encodings = tokenizer(
+        x_test, truncation=True, padding=True, max_length=512
+    )
+    test_dataset = Dataset(test_encodings, y_test)
+
+    training_args = TrainingArguments(
+        output_dir='./tmp',
+        num_train_epochs=250,
+        warmup_steps=100,
+        weight_decay=0.01,
+        logging_dir='./tmp/logs',
+        logging_steps=1000,
+        save_steps=1000,
+        save_total_limit=10,
+        load_best_model_at_end=True,
+        evaluation_strategy="steps",
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=test_dataset,
+        compute_metrics=compute_metrics,
+    )
+    trainer.train()
+    return model, tokenizer
 
 
 def get_model_and_tokenizer():
-    return None, None
+    model_path = "./data/models/poleval/"
+    if not os.path.exists(model_path + "config.json"):
+        # No saved checkpoint yet: fine-tune one from the wiki_pl model.
+        model, tokenizer = train_model()
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    # Set the label mapping in both branches so downstream code can rely on it.
+    model.config.id2label = {0: "normal", 1: "hate"}
+    return model, tokenizer
 
 
-def get_classify_function():
+def get_classify_function(device="cpu"):
+    model, tokenizer = get_model_and_tokenizer()
+    model.eval()
+    model = model.to(device)
+
     def fun(texts):
-        return "dummy"
+        # Run inference in chunks of 128 texts to bound memory usage.
+        logits = list()
+        for chunk in tqdm(
+            [texts[pos:pos + 128] for pos in range(0, len(texts), 128)]
+        ):
+            encoded_inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(device)
+            with torch.no_grad():
+                logits.append(model(**encoded_inputs).logits.cpu())
+        logits = torch.cat(logits, dim=0)
+        pred_y = torch.argmax(logits, dim=1).tolist()
+        pred_y = [model.config.id2label[p] for p in pred_y]
+        return pred_y
 
     return fun
-- 
GitLab
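
Usage note (not part of the patch): a minimal sketch of exercising the new
classify function from the repository root. It assumes torch, transformers,
pandas, scikit-learn and tqdm are installed and that data/preprocessed/poleval
exists; the sample strings below are made up for illustration.

    PYTHONPATH=. python - <<'EOF'
    from text_attacks.models.poleval import get_classify_function

    # Loads ./data/models/poleval/ if a checkpoint is present,
    # otherwise fine-tunes one from ./data/models/wiki_pl first.
    classify = get_classify_function(device="cpu")

    # Hypothetical inputs; real data lives in data/preprocessed/poleval/.
    print(classify(["zwykly komentarz", "obrazliwy wpis"]))
    # Output is a list of labels, e.g. ["normal", "hate"]
    EOF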
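
A related caveat: train_model() returns the fine-tuned model without writing
it to disk, so the cached branch of get_model_and_tokenizer() is only taken if
something else saves the checkpoint (presumably experiments/scripts/get_model.py,
which is not shown in this patch). A sketch of that save step using the standard
Hugging Face API:

    from text_attacks.models.poleval import get_model_and_tokenizer

    model, tokenizer = get_model_and_tokenizer()
    # Hypothetical persistence step; get_model.py likely does the equivalent.
    model.save_pretrained("./data/models/poleval/")
    tokenizer.save_pretrained("./data/models/poleval/")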