diff --git a/README.md b/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e5bd7a2c75e3512d3d53df746cbc9a197111cabf 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,5 @@
+```
+pip install morfeusz2
+pip install -r requirements.txt
+pip install --no-deps git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop
+```
diff --git a/data/classification/.gitignore b/data/classification/.gitignore
index 60ba70084bebf52a8521c364d3a4b019028fe084..e69587204a79358e2809c210d36b6519d6be3b81 100644
--- a/data/classification/.gitignore
+++ b/data/classification/.gitignore
@@ -1 +1,3 @@
 /enron_spam
+/wiki_pl
+/20_news
diff --git a/data/datasets/.gitignore b/data/datasets/.gitignore
index af871dfd6ce1c3aab7e8d1a405df6390acab6f65..43bd163e72f8f4c3216a67dd67aaa506107fbda5 100644
--- a/data/datasets/.gitignore
+++ b/data/datasets/.gitignore
@@ -1,2 +1,4 @@
 /enron_spam
+/20_news
 /poleval
+/wiki_pl
diff --git a/data/datasets/20_news.dvc b/data/datasets/20_news.dvc
new file mode 100644
index 0000000000000000000000000000000000000000..00b5cf40552d824c249c9e692753ce5dbdf3b4d5
--- /dev/null
+++ b/data/datasets/20_news.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: 999207f1c2c123c9943397b47f2c3b3a.dir
+  size: 23460358
+  nfiles: 3
+  path: 20_news
diff --git a/data/datasets/wiki_pl.dvc b/data/datasets/wiki_pl.dvc
new file mode 100644
index 0000000000000000000000000000000000000000..f0f2afeb3fb36b5b1a88e081bc18151e9f3500dd
--- /dev/null
+++ b/data/datasets/wiki_pl.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: abcbccb3e352ed623cace1b95078bd63.dir
+  size: 29115538
+  nfiles: 3
+  path: wiki_pl
diff --git a/data/models/.gitignore b/data/models/.gitignore
index 60ba70084bebf52a8521c364d3a4b019028fe084..ea22867615bba98d219c12d7f14467d051a33e80 100644
--- a/data/models/.gitignore
+++ b/data/models/.gitignore
@@ -1 +1,3 @@
 /enron_spam
+/20_news
+/wiki_pl
diff --git a/data/models/20_news.dvc b/data/models/20_news.dvc
new file mode 100644
index 0000000000000000000000000000000000000000..d667d5706620ffdbfc2e6148aefc3a781540abf4
--- /dev/null
+++ b/data/models/20_news.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: 43d68a67ecb8149bd6bf50db9767cb64.dir
+  size: 439008808
+  nfiles: 6
+  path: 20_news
diff --git a/data/models/wiki_pl.dvc b/data/models/wiki_pl.dvc
new file mode 100644
index 0000000000000000000000000000000000000000..fdf58d54d28455296247165dff7d827def75296c
--- /dev/null
+++ b/data/models/wiki_pl.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: fd453042628fb09c080ef05d34a32cce.dir
+  size: 501711136
+  nfiles: 7
+  path: wiki_pl
diff --git a/data/preprocessed/.gitignore b/data/preprocessed/.gitignore
index 8cfcddc78ec875a51b9da2648f6a001136ec87fe..7fdd029740deec7ff19796ce6b503b9ca1a4c89b 100644
--- a/data/preprocessed/.gitignore
+++ b/data/preprocessed/.gitignore
@@ -1,2 +1,4 @@
 /poleval
 /enron_spam
+/wiki_pl
+/20_news
diff --git a/dvc.lock b/dvc.lock
index 346ede5ec88b823d474a3c0dc3adddaad6c1f578..52672d93fe9bcbe1ae3e0ed339c6e3b6eb7a3bcd 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -41,8 +41,8 @@ stages:
       size: 61709260
       nfiles: 3
     - path: experiments/scripts/classify.py
-      md5: ba9284c90847fbbd0f2a6cca414d9636
-      size: 1106
+      md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
+      size: 1181
     outs:
     - path: data/classification/enron_spam
       md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir
@@ -88,8 +88,8 @@ stages:
       size: 1688836
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 2c4e097b3a278c12d19858f988232b44
-      size: 3435
+      md5: 8e8039b73b8ea3ce39287ed0a304af9b
+      size: 3750
     outs:
     - path: data/preprocessed/poleval/
       md5: 854387459b193c5eba6db1273ca5ad23.dir
@@ -103,10 +103,80 @@ stages:
       size: 53096069
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 2c4e097b3a278c12d19858f988232b44
-      size: 3435
+      md5: 8e8039b73b8ea3ce39287ed0a304af9b
+      size: 3750
     outs:
     - path: data/preprocessed/enron_spam/
       md5: b75efba1a62182dc8ac32acd1faf92ed.dir
       size: 61709260
       nfiles: 3
+  preprocess_dataset@wiki_pl:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name wiki_pl
+    deps:
+    - path: data/datasets/wiki_pl/
+      md5: abcbccb3e352ed623cace1b95078bd63.dir
+      size: 29115538
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 8e8039b73b8ea3ce39287ed0a304af9b
+      size: 3750
+    outs:
+    - path: data/preprocessed/wiki_pl/
+      md5: 3e9b2e1e0542777e0a751d9d7f7f4241.dir
+      size: 55380570
+      nfiles: 3
+  classify@wiki_pl:
+    cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name wiki_pl
+      --output_dir data/classification/wiki_pl
+    deps:
+    - path: data/models/wiki_pl/
+      md5: fd453042628fb09c080ef05d34a32cce.dir
+      size: 501711136
+      nfiles: 7
+    - path: data/preprocessed/wiki_pl/
+      md5: 3e9b2e1e0542777e0a751d9d7f7f4241.dir
+      size: 55380570
+      nfiles: 3
+    - path: experiments/scripts/classify.py
+      md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
+      size: 1181
+    outs:
+    - path: data/classification/wiki_pl
+      md5: 515330772505f489b55686545bcf23a0.dir
+      size: 34103198
+      nfiles: 2
+  preprocess_dataset@20_news:
+    cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name 20_news
+    deps:
+    - path: data/datasets/20_news/
+      md5: 999207f1c2c123c9943397b47f2c3b3a.dir
+      size: 23460358
+      nfiles: 3
+    - path: experiments/scripts/tag_dataset.py
+      md5: 8e8039b73b8ea3ce39287ed0a304af9b
+      size: 3750
+    outs:
+    - path: data/preprocessed/20_news/
+      md5: 1ed5ef2dabe4bc05f7377175ed11137b.dir
+      size: 46845669
+      nfiles: 3
+  classify@20_news:
+    cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name 20_news
+      --output_dir data/classification/20_news
+    deps:
+    - path: data/models/20_news/
+      md5: 43d68a67ecb8149bd6bf50db9767cb64.dir
+      size: 439008808
+      nfiles: 6
+    - path: data/preprocessed/20_news/
+      md5: 1ed5ef2dabe4bc05f7377175ed11137b.dir
+      size: 46845669
+      nfiles: 3
+    - path: experiments/scripts/classify.py
+      md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
+      size: 1181
+    outs:
+    - path: data/classification/20_news
+      md5: 6831f104f7c20541548fe72250c45706.dir
+      size: 31286120
+      nfiles: 2
diff --git a/dvc.yaml b/dvc.yaml
index 533298e55d4632d5a8413d34b917d531bf334396..92afbaa266962d0bbc8d0f316ec7b6e38c6633d9 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -17,6 +17,8 @@ stages:
     foreach:
     - enron_spam
     - poleval
+    - 20_news
+    - wiki_pl
     do:
       wdir: .
       cmd: >-
@@ -30,7 +32,6 @@ stages:
   get_model:
     foreach:
     - enron_spam
-    # - poleval
     do:
       wdir: .
       cmd: >-
@@ -45,7 +46,8 @@ stages:
   classify:
     foreach:
     - enron_spam
-    #- poleval
+    - 20_news
+    - wiki_pl
     do:
       wdir: .
       cmd: >-
@@ -61,7 +63,6 @@ stages:
   explain:
     foreach:
     - enron_spam
-    #- poleval
     do:
       wdir: .
       cmd: >-
diff --git a/experiments/scripts/classify.py b/experiments/scripts/classify.py
index 9639d298904c1a2815f0b34f8cbb6894df6c8527..ab34bd70e815f74effd4044efa041f3ebb5249d6 100644
--- a/experiments/scripts/classify.py
+++ b/experiments/scripts/classify.py
@@ -3,6 +3,7 @@ from pathlib import Path
 
 import click
 import pandas as pd
+import torch
 from sklearn.metrics import classification_report
 
 from text_attacks.utils import get_classify_function
@@ -27,6 +28,7 @@ def main(
     output_dir.mkdir(parents=True, exist_ok=True)
     classify = get_classify_function(
         dataset_name=dataset_name,
+        device="cuda" if torch.cuda.is_available() else "cpu"
     )
     test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
     test_x = test["text"].tolist()
diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
index e1b0671fc6ba9111847dabd3e1dea7a1fb7d71fb..743052a011db3cb742c3d8f1b7112c1532285835 100644
--- a/experiments/scripts/tag_dataset.py
+++ b/experiments/scripts/tag_dataset.py
@@ -7,53 +7,60 @@ import os
 from tqdm import tqdm
 from multiprocessing import cpu_count, Pool
 
-TOKENS = 'tokens'
-ORTH = 'orth'
-LEXEMES = 'lexemes'
-LEMMA = 'lemma'
-MSTAG = 'mstag'
-TEXT = 'text'
-LEMMAS = 'lemmas'
-TAGS = 'tags'
+TOKENS = "tokens"
+ORTH = "orth"
+LEXEMES = "lexemes"
+LEMMA = "lemma"
+MSTAG = "mstag"
+TEXT = "text"
+LEMMAS = "lemmas"
+TAGS = "tags"
 
 
 def tag_sentence(sentence: str, lang: str):
     connection = Connection(config_file="experiments/configs/config.yml")
-    lpmn = ["morphodita",
-            {"posconverter":
-                 {"input_format": "ccl", "output_format": "json"}}] \
-        if lang == 'pl' else [{"spacy": {"lang": "en"}}]
+    lpmn = [{"spacy": {"lang": "en"}}]
+    if lang == "pl":
+        lpmn = [
+            "morphodita",
+            {"posconverter": {"input_format": "ccl", "output_format": "json"}},
+        ]
+
     task = Task(lpmn, connection=connection)
     output_file_id = task.run(str(sentence), IOType.TEXT)
     tokens = []
     try:
-        clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
+        clarin_json = json.loads(
+            download(connection, output_file_id, IOType.TEXT).decode("utf-8")
+        )
         tokens = clarin_json[TOKENS]
     except json.decoder.JSONDecodeError:
         downloaded = download(connection, output_file_id, IOType.FILE)
-        with open(downloaded, 'r') as file:
+        with open(downloaded, "r") as file:
             lines = [json.loads(line) for line in file.readlines()]
             for line in lines:
                 tokens.extend(line[TOKENS])
         os.remove(downloaded)
     lemmas, tags = [], []
     for token in tokens:
-        lexeme = token['lexemes'][0]
-        lemmas.append(lexeme['lemma'])
-        tags.append(lexeme['mstag'])
+        lexeme = token["lexemes"][0]
+        lemmas.append(lexeme["lemma"])
+        tags.append(lexeme["mstag"])
     return lemmas, tags
 
 
 def process_file(dataset_df, lang, output_path):
     test_with_tags = pd.DataFrame(dataset_df)
     lemmas_col, tags_col = [], []
-    cpus = cpu_count()
+    cpus = 8
     with Pool(processes=cpus) as pool:
         results = []
         for idx in tqdm(range(0, len(dataset_df), cpus)):
-            end = min(idx+cpus, len(dataset_df) + 1)
+            end = min(idx + cpus, len(dataset_df) + 1)
             for sentence in dataset_df[TEXT][idx:end]:
-                results.append(pool.apply_async(tag_sentence, args=[sentence, lang]))
+                results.append(
+                    pool.apply_async(tag_sentence, args=[sentence, lang])
+                )
         for res in results:
             lemmas, tags = res.get()
             lemmas_col.append(lemmas)
@@ -63,7 +70,7 @@ def process_file(dataset_df, lang, output_path):
     test_with_tags[TAGS] = tags_col
 
     with open(output_path, mode="wt") as fd:
-        fd.write(test_with_tags.to_json(orient='records', lines=True))
+        fd.write(test_with_tags.to_json(orient="records", lines=True))
 
 
 
@@ -74,7 +81,12 @@ def process_file(dataset_df, lang, output_path):
 )
 def main(dataset_name: str):
     """Downloads the dataset to the output directory."""
-    lang = 'en' if dataset_name == 'enron_spam' else 'pl'
+    lang = {
+        "enron_spam": "en",
+        "poleval": "pl",
+        "20_news": "en",
+        "wiki_pl": "pl",
+    }[dataset_name]
 
     output_dir = f"data/preprocessed/{dataset_name}"
     os.makedirs(output_dir, exist_ok=True)
@@ -82,15 +94,24 @@ def main(dataset_name: str):
     for file in os.listdir(input_dir):
         if os.path.isfile(os.path.join(input_dir, file)):
             if file == "test.jsonl":
-                process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
-                             lang, os.path.join(output_dir, file))
+                process_file(
+                    pd.read_json(os.path.join(input_dir, file), lines=True),
+                    lang,
+                    os.path.join(output_dir, file),
+                )
             else:
-                test_with_tags = pd.DataFrame(pd.read_json(os.path.join(input_dir, file), lines=True))
-                test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))]
-                test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))]
+                test_with_tags = pd.DataFrame(
+                    pd.read_json(os.path.join(input_dir, file), lines=True)
+                )
+                test_with_tags[LEMMAS] = [
+                    "" for _ in range(len(test_with_tags))
+                ]
+                test_with_tags[TAGS] = ["" for _ in range(len(test_with_tags))]
                 with open(os.path.join(output_dir, file), mode="wt") as fd:
-                    fd.write(test_with_tags.to_json(orient='records', lines=True))
+                    fd.write(
+                        test_with_tags.to_json(orient="records", lines=True)
+                    )
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/requirements.txt b/requirements.txt
index fec55bda346006486c107831bf7b436564332a9e..78255946c4d875b554424d2091bbdc35bb3798be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,11 +4,14 @@ click
 scikit-learn
 dvc[s3]
 shap
-lpmn_client_biz
+tqdm
+transformers
+tokenizers
+sentence-transformers
 --find-links https://download.pytorch.org/whl/torch_stable.html
 torch==1.12.0+cu116
 --index-url https://pypi.clarin-pl.eu/simple/
 plwn-api
-git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop
+lpmn_client_biz
 
diff --git a/text_attacks/models/20_news.py b/text_attacks/models/20_news.py
new file mode 100644
index 0000000000000000000000000000000000000000..53712fa403a55b03a12aaf6962cdfcd6f4c503ce
--- /dev/null
+++ b/text_attacks/models/20_news.py
@@ -0,0 +1,40 @@
+"""Classification model for 20_news"""
+
+import torch
+from tqdm import tqdm
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+
+def get_model_and_tokenizer():
+    model_path = "./data/models/20_news"
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    return model, tokenizer
+
+
+def get_classify_function(device="cpu"):
+    model, tokenizer = get_model_and_tokenizer()
+    model.eval()
+    model = model.to(device)
+
+    def fun(texts):
+        logits = list()
+        for chunk in tqdm(
+            [texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
+        ):
+            encoded_inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(device)
+            with torch.no_grad():
+                logits.append(model(**encoded_inputs).logits.cpu())
+        logits = torch.cat(logits, dim=0)
+        pred_y = torch.argmax(logits, dim=1).tolist()
+        pred_y = [model.config.id2label[p] for p in pred_y]
+        return pred_y
+
+    return fun
diff --git a/text_attacks/models/enron_spam.py b/text_attacks/models/enron_spam.py
index 063a52a0c6cb7f09a804e00f19fc90d69944aa0e..9a1946d83a2ab8ad886d66ee8303cb283d74884b 100644
--- a/text_attacks/models/enron_spam.py
+++ b/text_attacks/models/enron_spam.py
@@ -2,12 +2,13 @@
 import os
 
 import torch
+from tqdm import tqdm
 
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
 def get_model_and_tokenizer():
-    model_path = "data/models/endron_spam"
+    model_path = "./data/models/enron_spam"
     if not os.path.exists(model_path):
         model_path = "mrm8488/bert-tiny-finetuned-enron-spam-detection"
     tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -16,18 +17,26 @@ def get_model_and_tokenizer():
     return model, tokenizer
 
 
-def get_classify_function():
+def get_classify_function(device="cpu"):
     model, tokenizer = get_model_and_tokenizer()
+    model.eval()
+    model = model.to(device)
 
     def fun(texts):
-        encoded_inputs = tokenizer(
-            texts,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512
-        )
-        logits = model(**encoded_inputs).logits
+        logits = list()
+        for chunk in tqdm(
+            [texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
+        ):
+            encoded_inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(device)
+            with torch.no_grad():
+                logits.append(model(**encoded_inputs).logits.cpu())
+        logits = torch.cat(logits, dim=0)
         pred_y = torch.argmax(logits, dim=1).tolist()
         pred_y = [model.config.id2label[p] for p in pred_y]
         return pred_y
diff --git a/text_attacks/models/wiki_pl.py b/text_attacks/models/wiki_pl.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad153955d6e5acfc89f4f922465fb624c1ecf5d
--- /dev/null
+++ b/text_attacks/models/wiki_pl.py
@@ -0,0 +1,40 @@
+"""Classification model for wiki_pl"""
+
+import torch
+from tqdm import tqdm
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+
+def get_model_and_tokenizer():
+    model_path = "./data/models/wiki_pl"
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    return model, tokenizer
+
+
+def get_classify_function(device="cpu"):
+    model, tokenizer = get_model_and_tokenizer()
+    model.eval()
+    model = model.to(device)
+
+    def fun(texts):
+        logits = list()
+        for chunk in tqdm(
+            [texts[pos:pos + 256] for pos in range(0, len(texts), 256)]
+        ):
+            encoded_inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=512
+            ).to(device)
+            with torch.no_grad():
+                logits.append(model(**encoded_inputs).logits.cpu())
+        logits = torch.cat(logits, dim=0)
+        pred_y = torch.argmax(logits, dim=1).tolist()
+        pred_y = [model.config.id2label[p] for p in pred_y]
+        return pred_y
+
+    return fun
diff --git a/text_attacks/utils.py b/text_attacks/utils.py
index e47d5209c6c84c6b31d6836aef75051a6c66b57f..6a0588292c9542cc5b2d8bb5bd6a1437150276d0 100644
--- a/text_attacks/utils.py
+++ b/text_attacks/utils.py
@@ -11,13 +11,13 @@ def get_model_and_tokenizer(dataset_name):
     return fun()
 
 
-def get_classify_function(dataset_name):
+def get_classify_function(dataset_name, device="cpu"):
    """Return get_model_and_tokenizer for a specific dataset."""
     fun = getattr(
         importlib.import_module(f"text_attacks.models.{dataset_name}"),
         "get_classify_function",
     )
-    return fun()
+    return fun(device=device)
 
 
 def download_dataset(dataset_name):
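Usage note: the patch threads a `device` argument from `experiments/scripts/classify.py` through `text_attacks.utils.get_classify_function` down to the per-dataset model modules, which now run no-grad inference in chunks of 256 texts instead of one full-batch forward pass. A minimal sketch of calling the new API directly, assuming the DVC-tracked model and preprocessed test split for the chosen dataset have already been pulled locally:

```python
import pandas as pd
import torch

from text_attacks.utils import get_classify_function

# Mirror classify.py: prefer GPU when available, fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Any dataset with a module under text_attacks/models/ works here:
# "enron_spam", "20_news", or "wiki_pl".
classify = get_classify_function(dataset_name="wiki_pl", device=device)

# Predictions come back as label names (mapped via model.config.id2label).
test = pd.read_json("data/preprocessed/wiki_pl/test.jsonl", lines=True)
pred_y = classify(test["text"].tolist())
```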