Commit 13b1f119 authored by MGniew

Basic pipeline

.dvc/.gitignore 0 → 100644
/config.local
/tmp
/cache
.dvcignore 0 → 100644
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore
.gitignore 0 → 100644
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.idea/
data/classification/.gitignore 0 → 100644
/enron_spam
data/datasets/.gitignore 0 → 100644
/enron_spam
data/explanations/.gitignore 0 → 100644
/enron_spam
data/models/.gitignore 0 → 100644
/enron_spam
dvc.lock 0 → 100644
schema: '2.0'
stages:
  download_dataset@enron_spam:
    cmd: PYTHONPATH=. python experiments/scripts/download_dataset.py --dataset_name
      enron_spam --output_dir data/datasets/enron_spam
    deps:
    - path: experiments/scripts/download_dataset.py
      md5: dfcc61ca00234b3dbe0e9c04697ae40a
      size: 1686
    outs:
    - path: data/datasets/enron_spam/
      md5: b2115d2a6901cd29727f9ed294196544.dir
      size: 53096069
      nfiles: 3
  get_model@enron_spam:
    cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name enron_spam
      --output_dir data/models/enron_spam
    deps:
    - path: experiments/scripts/get_model.py
      md5: 5050f51b4019bba97af47971f6c7cab4
      size: 747
    outs:
    - path: data/models/enron_spam/
      md5: 3e16b22f59532c66beeadea958e0579a.dir
      size: 18505614
      nfiles: 6
  classify@enron_spam:
    cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name enron_spam
      --output_dir data/classification/enron_spam
    deps:
    - path: experiments/scripts/classify.py
      md5: 5bd1363bd8cb2742e5d8391a0287cddb
      size: 1281
    outs:
    - path: data/classification/enron_spam/
      md5: a83267cc1b9d8e210412b725f93902c0.dir
      size: 326
      nfiles: 1
  explain@enron_spam:
    cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
      --output_dir data/explanations/enron_spam
    deps:
    - path: experiments/scripts/explain.py
      md5: c85cbb774f2682ee39948e701fa0b0ca
      size: 1445
    outs:
    - path: data/explanations/enron_spam/
      md5: 147226f0423c899e283cdbbcc223d8e0.dir
      size: 6269580
      nfiles: 1
dvc.yaml 0 → 100644
stages:
  download_dataset:
    foreach:
    - enron_spam
    do:
      wdir: .
      cmd: >-
        PYTHONPATH=. python experiments/scripts/download_dataset.py
        --dataset_name ${item}
        --output_dir data/datasets/${item}
      deps:
      - experiments/scripts/download_dataset.py
      outs:
      - data/datasets/${item}/
  get_model:
    foreach:
    - enron_spam
    do:
      wdir: .
      cmd: >-
        PYTHONPATH=. python experiments/scripts/get_model.py
        --dataset_name ${item}
        --output_dir data/models/${item}
      deps:
      - experiments/scripts/get_model.py
      outs:
      - data/models/${item}/
  classify:
    foreach:
    - enron_spam
    do:
      wdir: .
      cmd: >-
        PYTHONPATH=. python experiments/scripts/classify.py
        --dataset_name ${item}
        --output_dir data/classification/${item}
      deps:
      - experiments/scripts/classify.py
      outs:
      - data/classification/${item}/
  explain:
    foreach:
    - enron_spam
    do:
      wdir: .
      cmd: >-
        PYTHONPATH=. python experiments/scripts/explain.py
        --dataset_name ${item}
        --output_dir data/explanations/${item}
      deps:
      - experiments/scripts/explain.py
      outs:
      - data/explanations/${item}/
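
Note on the pipeline definition above: each stage is a foreach over the dataset list, so DVC expands the stage names to download_dataset@enron_spam, get_model@enron_spam, and so on, which is exactly how they appear in dvc.lock. Assuming a standard DVC checkout, the whole pipeline is reproduced with dvc repro, and a single expanded stage can be targeted by its generated name:

dvc repro                      # run all stages that are out of date
dvc repro classify@enron_spam  # run only the classify stage for enron_spam
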
"""Classification results."""
from pathlib import Path
import click
import pandas as pd
import torch
from sklearn.metrics import classification_report
from text_attacks.utils import get_model_and_tokenizer
@click.command()
@click.option(
"--dataset_name",
help="Dataset name",
type=str,
)
@click.option(
"--output_dir",
help="Path to output directory",
type=click.Path(path_type=Path),
)
def main(
dataset_name: str,
output_dir: Path,
):
"""Downloads the dataset to the output directory."""
output_dir.mkdir(parents=True, exist_ok=True)
model, tokenizer = get_model_and_tokenizer(
dataset_name=dataset_name,
)
test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
test_x = test["text"].tolist()
test_y = test["label"]
encoded_inputs = tokenizer(
test_x,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
logits = model(**encoded_inputs).logits
pred_y = torch.argmax(logits, dim=1).tolist()
pred_y = [model.config.id2label[p] for p in pred_y]
with open(output_dir / "metrics.txt", mode="wt") as fd:
fd.write(classification_report(test_y, pred_y))
if __name__ == "__main__":
main()
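
The script above pushes the entire test split through the model in a single forward pass, which is fine for the tiny BERT checkpoint used here but can exhaust memory with larger models. A minimal batched variant (predict_in_batches is an illustrative sketch, not part of the commit):

import torch

def predict_in_batches(model, tokenizer, texts, batch_size=32):
    """Classify texts in fixed-size chunks to bound peak memory."""
    preds = []
    for i in range(0, len(texts), batch_size):
        batch = tokenizer(
            texts[i:i + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            logits = model(**batch).logits
        preds.extend(torch.argmax(logits, dim=1).tolist())
    return preds
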
"""Script for downloading and converting datasets."""
from pathlib import Path
import click
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
def convert(dataset):
train = pd.DataFrame(dataset["train"].to_dict())
test = pd.DataFrame(dataset["test"].to_dict())
train["label"] = train["label_text"]
train = train.rename(columns={"message_id": "id"})
train = train.drop(columns=["label_text", "subject", "message", "date"])
test["label"] = test["label_text"]
test = test.rename(columns={"message_id": "id"})
test = test.drop(columns=["label_text", "subject", "message", "date"])
adversarial, test = train_test_split(
test,
test_size=0.9,
stratify=test["label"]
)
return train, test, adversarial
@click.command()
@click.option(
"--dataset_name",
help="Dataset name",
type=str,
)
@click.option(
"--output_dir",
help="Path to output directory",
type=click.Path(path_type=Path),
)
def main(
dataset_name: str,
output_dir: Path,
):
"""Downloads the dataset to the output directory."""
dataset_mappings = {
"enron_spam": "SetFit/enron_spam",
}
output_dir.mkdir(parents=True, exist_ok=True)
dataset = load_dataset(dataset_mappings[dataset_name])
train, test, adversarial = convert(dataset)
train.to_json(output_dir / "train.jsonl", orient="records", lines=True)
test.to_json(output_dir / "test.jsonl", orient="records", lines=True)
adversarial.to_json(
output_dir / "adversarial.jsonl",
orient="records",
lines=True
)
if __name__ == "__main__":
main()
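
A detail worth flagging in convert: with test_size=0.9, train_test_split returns the smaller 10% slice first, so adversarial receives 10% of the original test set and test keeps the remaining 90%, stratified by label. A quick self-contained check (the data here is illustrative):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({
    "text": [f"msg {i}" for i in range(100)],
    "label": ["ham", "spam"] * 50,
})
adversarial, test = train_test_split(df, test_size=0.9, stratify=df["label"])
print(len(adversarial), len(test))  # 10 90
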
"""XAI results."""
import pickle
from pathlib import Path
import click
import pandas as pd
import shap
import torch
from text_attacks.utils import get_model_and_tokenizer
def build_predict_fun(model, tokenizer):
def f(x):
encoded_inputs = torch.tensor(
[tokenizer.encode(
v, padding='max_length', max_length=512, truncation=True
) for v in x])
logits = model(encoded_inputs).logits
return logits
return f
@click.command()
@click.option(
"--dataset_name",
help="Dataset name",
type=str,
)
@click.option(
"--output_dir",
help="Path to output directory",
type=click.Path(path_type=Path),
)
def main(
dataset_name: str,
output_dir: Path,
):
"""Downloads the dataset to the output directory."""
output_dir.mkdir(parents=True, exist_ok=True)
model, tokenizer = get_model_and_tokenizer(
dataset_name=dataset_name,
)
test = pd.read_json(f"data/datasets/{dataset_name}/adversarial.jsonl", lines=True)
test_x = test["text"].tolist()
predict = build_predict_fun(model, tokenizer)
explainer = shap.Explainer(
predict,
masker=tokenizer,
output_names=list(model.config.id2label.values())
)
shap_values = explainer(test_x)
with open(output_dir / "shap_values.pickle", mode="wb") as fd:
pickle.dump(shap_values, fd)
if __name__ == "__main__":
main()
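
The SHAP values are stored as a plain pickle, so they can be reloaded and visualized later without rerunning the explainer. A minimal sketch, assuming the default output path from dvc.yaml:

import pickle

import shap

with open("data/explanations/enron_spam/shap_values.pickle", mode="rb") as fd:
    shap_values = pickle.load(fd)

# Token-level attribution view for the first explained document.
shap.plots.text(shap_values[0])
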
"""Downloads pretrained model from huggingface or trains new one."""
from pathlib import Path
import click
from text_attacks.utils import get_model_and_tokenizer
@click.command()
@click.option(
"--dataset_name",
help="Dataset name",
type=str,
)
@click.option(
"--output_dir",
help="Path to output directory",
type=click.Path(path_type=Path),
)
def main(
dataset_name: str,
output_dir: Path,
):
"""Downloads the dataset to the output directory."""
output_dir.mkdir(parents=True, exist_ok=True)
model, tokenizer = get_model_and_tokenizer(
dataset_name=dataset_name,
)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
if __name__ == "__main__":
main()
requirements.txt 0 → 100644
datasets
transformers
click
scikit-learn
dvc[s3]
shap
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.12.0+cu116
"""Classification model for enron_spam"""
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def get_model_and_tokenizer():
tokenizer = AutoTokenizer.from_pretrained(
"mrm8488/bert-tiny-finetuned-enron-spam-detection"
)
model = AutoModelForSequenceClassification.from_pretrained(
"mrm8488/bert-tiny-finetuned-enron-spam-detection"
)
model.config.id2label = {0: "ham", 1: "spam"}
return model, tokenizer
"""Utility functions."""
import importlib
def get_model_and_tokenizer(dataset_name):
"""Return get_model_and_tokenizer for a specific dataset."""
fun = getattr(
importlib.import_module(f"text_attacks.models.{dataset_name}"),
"get_model_and_tokenizer",
)
return fun()
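
Because get_model_and_tokenizer resolves the model module by dataset name, supporting a new dataset only requires a module with a matching name under text_attacks/models/ (plus an entry in the dvc.yaml foreach lists). A hypothetical text_attacks/models/my_dataset.py (module and checkpoint names are placeholders, not part of the commit):

"""Classification model for my_dataset (illustrative placeholder)."""
from transformers import AutoTokenizer, AutoModelForSequenceClassification


def get_model_and_tokenizer():
    # Any sequence-classification checkpoint works here; this name is
    # a placeholder, not part of the commit.
    checkpoint = "some-org/some-text-classifier"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return model, tokenizer
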