Newer
Older
"""Download and preprecess poleval"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def download_dataset():
dataset = load_dataset("poleval2019_cyberbullying", "task01")
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["id"] = list(range(len(train)))
train["label"] = [
"hate" if lab == 1 else "normal" for lab in train["label"]
]
test["id"] = list(range(len(test)))
test["label"] = [
"hate" if lab == 1 else "normal" for lab in test["label"]
]
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial