Commit 6c9a8020 authored by MGniew

poleval

parent ce744073
@@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
tmp/
# PyInstaller
# Usually these files are written by a python script from a template
......
/enron_spam
/wiki_pl
/20_news
/poleval

/enron_spam
/wiki_pl
/20_news
/poleval

/enron_spam
/20_news
/wiki_pl
/poleval
@@ -639,3 +639,59 @@ stages:
      md5: d9595138a6aca9cad4c39b1c5fdbfce1.dir
      size: 38396600
      nfiles: 2
  get_model@poleval:
    cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name poleval
      --output_dir data/models/poleval
    deps:
    - path: data/preprocessed/poleval
      md5: b0ea9f0ad1dba6d3b474c0a3cedf866e.dir
      size: 2812175
      nfiles: 3
    - path: experiments/scripts/get_model.py
      md5: 5050f51b4019bba97af47971f6c7cab4
      size: 747
    outs:
    - path: data/models/poleval/
      md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
      size: 501609312
      nfiles: 7
  explain@poleval:
    cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name poleval
      --output_dir data/explanations/poleval
    deps:
    - path: data/models/poleval
      md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
      size: 501609312
      nfiles: 7
    - path: data/preprocessed/poleval
      md5: b0ea9f0ad1dba6d3b474c0a3cedf866e.dir
      size: 2812175
      nfiles: 3
    - path: experiments/scripts/explain.py
      md5: 4d82a557627f59c884f52fd7994ed80a
      size: 4617
    outs:
    - path: data/explanations/poleval/
      md5: 246ec6832b514bbfc7ff6bcdb0d9f292.dir
      size: 2377833
      nfiles: 2003
  classify@poleval:
    cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name poleval
      --output_dir data/classification/poleval
    deps:
    - path: data/models/poleval/
      md5: 8f806cb1b2eb0dd097811d42e4bf9c2d.dir
      size: 501609312
      nfiles: 7
    - path: data/preprocessed/poleval/
      md5: b0ea9f0ad1dba6d3b474c0a3cedf866e.dir
      size: 2812175
      nfiles: 3
    - path: experiments/scripts/classify.py
      md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
      size: 1181
    outs:
    - path: data/classification/poleval
      md5: f207458f9365a74672c31b5ffb2a83af.dir
      size: 787456
      nfiles: 2
@@ -32,6 +32,7 @@ stages:
  get_model:
    foreach:
    - enron_spam
    - poleval
    do:
      wdir: .
      cmd: >-
@@ -46,6 +47,7 @@ stages:
  classify:
    foreach:
    - enron_spam
    - poleval
    - 20_news
    - wiki_pl
    do:
@@ -63,6 +65,7 @@ stages:
  explain:
    foreach:
    - enron_spam
    - poleval
    - 20_news
    - wiki_pl
    do:
......
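For context, the poleval entries added above plug the new dataset into DVC's `foreach` matrices: each listed dataset expands into its own stage, which is why dvc.lock gains the `get_model@poleval`, `classify@poleval`, and `explain@poleval` entries shown earlier. Each expanded stage can also be reproduced on its own with the standard DVC CLI, e.g. `dvc repro get_model@poleval`.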
"""Classification model for enron_spam"""
import os
import pandas as pd
from transformers import AutoConfig, BertForSequenceClassification
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import BertConfig, AutoModelForSequenceClassification
import random
import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)
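
# Note: each Dataset item is a dict of encoder tensors plus a one-element
# "labels" tensor; Trainer/BertForSequenceClassification flattens the labels
# when computing the cross-entropy loss, so the extra dimension is harmless.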

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
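
# A quick sanity check of compute_metrics with a mocked prediction object
# (SimpleNamespace stands in for transformers' EvalPrediction; the numbers
# are made up):
#
#     from types import SimpleNamespace
#     import numpy as np
#     pred = SimpleNamespace(
#         label_ids=np.array([0, 1, 1, 0]),
#         predictions=np.array([[.9, .1], [.2, .8], [.6, .4], [.7, .3]]),
#     )
#     compute_metrics(pred)
#     # -> {'accuracy': 0.75, 'f1': 0.667, 'precision': 1.0, 'recall': 0.5}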

def train_model():
    # fine-tune the Polish wiki_pl checkpoint with a fresh 2-class head
    tokenizer = AutoTokenizer.from_pretrained("./data/models/wiki_pl")
    model = AutoModelForSequenceClassification.from_pretrained(
        "./data/models/wiki_pl", num_labels=2,
        ignore_mismatched_sizes=True
    )
    test = pd.read_json("data/preprocessed/poleval/test.jsonl", lines=True)
    train = pd.read_json("data/preprocessed/poleval/train.jsonl", lines=True)
    # binarise labels: "normal" -> 0, everything else ("hate") -> 1
    y_test = [0 if y == "normal" else 1 for y in test["label"]]
    y_train = [0 if y == "normal" else 1 for y in train["label"]]
    x_test = test["text"].tolist()
    x_train = train["text"].tolist()
    train_encodings = tokenizer(
        x_train, truncation=True, padding=True, max_length=512
    )
    train_dataset = Dataset(train_encodings, y_train)
    test_encodings = tokenizer(
        x_test, truncation=True, padding=True, max_length=512
    )
    test_dataset = Dataset(test_encodings, y_test)
    training_args = TrainingArguments(
        output_dir='./tmp',
        num_train_epochs=100,
        warmup_steps=100,
        weight_decay=0.01,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        logging_dir='./tmp/logs',
        logging_steps=500,
        save_steps=500,
        save_total_limit=10,
        learning_rate=1e-5,
        load_best_model_at_end=True,
        evaluation_strategy="steps",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return model, tokenizer
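
# Note: train_model() only returns the fine-tuned model and tokenizer;
# persisting them to data/models/poleval (the stage's `outs` path in
# dvc.lock) is assumed to happen in the calling get_model.py script, e.g.
# via the standard transformers API:
#
#     model, tokenizer = train_model()
#     model.save_pretrained("data/models/poleval")
#     tokenizer.save_pretrained("data/models/poleval")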

def train_model_old():
    # older variant: train a freshly initialised BERT (config borrowed from a
    # tiny enron-spam model) instead of fine-tuning the wiki_pl checkpoint
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    config = AutoConfig.from_pretrained(
        "mrm8488/bert-tiny-finetuned-enron-spam-detection"
    )
    config.update({"vocab_size": tokenizer.vocab_size})
    model = BertForSequenceClassification(config)
    test = pd.read_json("data/preprocessed/poleval/test.jsonl", lines=True)
    train = pd.read_json("data/preprocessed/poleval/train.jsonl", lines=True)
    y_test = [0 if y == "normal" else 1 for y in test["label"]]
    y_train = [0 if y == "normal" else 1 for y in train["label"]]
    x_test = test["text"].tolist()
    x_train = train["text"].tolist()
    train_encodings = tokenizer(
        x_train, truncation=True, padding=True, max_length=512
    )
    train_dataset = Dataset(train_encodings, y_train)
    test_encodings = tokenizer(
        x_test, truncation=True, padding=True, max_length=512
    )
    test_dataset = Dataset(test_encodings, y_test)
    training_args = TrainingArguments(
        output_dir='./tmp',
        num_train_epochs=250,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./tmp/logs',
        logging_steps=1000,
        save_steps=1000,
        save_total_limit=10,
        load_best_model_at_end=True,
        evaluation_strategy="steps",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return model, tokenizer

def get_model_and_tokenizer():
    model_path = "./data/models/poleval/"
    if not os.path.exists(model_path + "config.json"):
        # no saved checkpoint yet: train one from the wiki_pl base model
        model, tokenizer = train_model()
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.config.id2label = {0: "normal", 1: "hate"}
    return model, tokenizer
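
# get_model_and_tokenizer() is train-or-load: it fine-tunes a fresh model
# when no checkpoint exists under ./data/models/poleval/, otherwise it loads
# the saved one, and in both cases restores the normal/hate label mapping.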

def get_classify_function(device="cpu"):
    model, tokenizer = get_model_and_tokenizer()
    model.eval()
    model = model.to(device)

    def fun(texts):
        # classify in chunks of 128 texts to keep memory bounded
        logits = list()
        for chunk in tqdm(
            [texts[pos:pos + 128] for pos in range(0, len(texts), 128)]
        ):
            encoded_inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)
            with torch.no_grad():
                logits.append(model(**encoded_inputs).logits.cpu())
        logits = torch.cat(logits, dim=0)
        pred_y = torch.argmax(logits, dim=1).tolist()
        pred_y = [model.config.id2label[p] for p in pred_y]
        return pred_y

    return fun
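
A minimal usage sketch for the classify function above (the sample text and device choice are made up):

    classify = get_classify_function(device="cuda" if torch.cuda.is_available() else "cpu")
    classify(["some example tweet"])  # -> e.g. ["normal"]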