Skip to content
Snippets Groups Projects
poleval.py 768 B
Newer Older
MGniew's avatar
MGniew committed
"""Download and preprecess poleval"""
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split


def download_dataset():
    dataset = load_dataset("poleval2019_cyberbullying", "task01")
    train = pd.DataFrame(dataset["train"].to_dict())
    test = pd.DataFrame(dataset["test"].to_dict())

    train["id"] = list(range(len(train)))
    train["label"] = [
        "hate" if lab == 1 else "normal" for lab in train["label"]
    ]

    test["id"] = list(range(len(test)))
    test["label"] = [
        "hate" if lab == 1 else "normal" for lab in test["label"]
    ]
    adversarial, test = train_test_split(
        test,
        test_size=0.9,
        stratify=test["label"]
    )

    return train, test, adversarial