From 34e11ddee3a52bb18248b0c78f501bfcba300cce Mon Sep 17 00:00:00 2001
From: pwalkow <pwalkow@gpu-server.ws.clarin>
Date: Tue, 14 Mar 2023 16:15:46 +0100
Subject: [PATCH] Add orths

---
 experiments/scripts/attack.py      | 61 +++---------------------------
 experiments/scripts/tag_dataset.py | 26 +++++++------
 2 files changed, 20 insertions(+), 67 deletions(-)

diff --git a/experiments/scripts/attack.py b/experiments/scripts/attack.py
index 3acd0fa..79c954f 100644
--- a/experiments/scripts/attack.py
+++ b/experiments/scripts/attack.py
@@ -3,7 +3,6 @@ import click
 import pandas as pd
 import os
 from tqdm import tqdm
-from multiprocessing import cpu_count, Pool
 
 from text_attacks.utils import get_classify_function
 from textfooler import Attack, TextFooler, BaseLine, process
@@ -11,6 +10,7 @@ from textfooler import Attack, TextFooler, BaseLine, process
 TEXT = "text"
 LEMMAS = "lemmas"
 TAGS = "tags"
+ORTHS = "orths"
 
 ATTACK_SUMMARY = "attacks_summary"
 ATTACK_SUCCEEDED = "attacks_succeeded"
@@ -35,12 +35,6 @@ DEFAULT_RES = {
 }
 
 
-def spoil_sentence(sentence, lemmas, tags, lang, similarity, max_sub):
-    attack = TextFooler(lang)
-    # attack = BaseLine(lang, 0.5, 0.4, 0.3)
-    return attack.spoil(sentence, [], lemmas, tags, similarity, max_sub)
-
-
 @click.command()
 @click.option(
     "--dataset_name",
@@ -61,62 +55,17 @@ def main(dataset_name: str):
     output_path = os.path.join(output_dir, "test.jsonl")
     classify = get_classify_function(dataset_name=dataset_name)
     dataset_df = pd.read_json(input_file, lines=True)
-    # dataset_df = dataset_df[:10]
 
     spoiled, results = [], []
     similarity, max_sub = 0.95, 1
-    cpus = cpu_count()
     classes = classify(dataset_df[TEXT].tolist())
-    # used_id = 0
-    # sent_nbr = len(dataset_df[TEXT])
-    # with Pool(processes=cpus) as pool:
-    #     for idx in range(0, min(cpus, sent_nbr)):
-    #         sentence, lemmas, tags = dataset_df[TEXT][idx], \
-    #                                  dataset_df[LEMMAS][idx], \
-    #                                  dataset_df[TAGS][idx]
-    #
-
     lang = "en" if dataset_name == "enron_spam" else "pl"
-    #         results.append(pool.apply_async(spoil_sentence, args=[sentence,
-    #                                                               lemmas,
-    #                                                               tags,
-    #                                                               lang,
-    #                                                               similarity,
-    #                                                               max_sub]))
-    #         used_id = idx
-    #     count = len(results)
-    #     while count and used_id < sent_nbr:
-    #         ready = 0
-    #         to_rm = []
-    #         for r in results:
-    #             if r.ready():
-    #                 ready += 1
-    #                 changed_sent = r.get()
-    #                 if changed_sent:
-    #                     spoiled.append(process(changed_sent, classes[i], classify))
-    #                 to_rm.append(r)
-    #         count = len(results) - ready
-    #         results = [res for res in results if res not in to_rm]
-    #         h_bound = min(used_id + cpus - len(results), sent_nbr)
-    #         for i in range(used_id + 1, h_bound):
-    #             used_id += 1
-    #             sentence, lemmas, tags = dataset_df[TEXT][idx], \
-    #                                      dataset_df[LEMMAS][idx], \
-    #                                      dataset_df[TAGS][idx]
-    #
-    #             results.append(pool.apply_async(spoil_sentence, args=[sentence,
-    #                                                                   lemmas,
-    #                                                                   tags,
-    #                                                                   lang,
-    #                                                                   similarity,
-    #                                                                   max_sub]))
+    attack = TextFooler(lang)
 
     for i, cols in tqdm(
-        dataset_df[[TEXT, LEMMAS, TAGS]].iterrows(), total=len(dataset_df)
+        dataset_df[[TEXT, LEMMAS, TAGS, ORTHS]].iterrows(), total=len(dataset_df)
     ):
-        sentence, lemmas, tags = cols[0], cols[1], cols[2]
-        changed_sent = spoil_sentence(
-            sentence, lemmas, tags, lang, similarity, max_sub
-        )
+        sentence, lemmas, tags, orths = cols[0], cols[1], cols[2], cols[3]
+        changed_sent = attack.spoil(sentence, [], lemmas, tags, orths, similarity, max_sub)
         if changed_sent:
             spoiled.append(process(changed_sent, classes[i], classify))
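
Note: the following sketch is an illustration for review and is not part of
the patch. It shows how the rewritten loop in attack.py is expected to consume
the new "orths" column; the TextFooler.spoil argument order is taken from the
call added above, and the toy row values are made up.

    import pandas as pd
    from textfooler import TextFooler

    # One pre-tagged record with parallel per-token lists, shaped like the
    # output of the patched tag_dataset.py (values are illustrative only).
    dataset_df = pd.DataFrame([{
        "text": "Ala ma kota",
        "orths": ["Ala", "ma", "kota"],
        "lemmas": ["Ala", "mieć", "kot"],
        "tags": ["subst:sg:nom:f", "fin:sg:ter:imperf", "subst:sg:acc:m2"],
    }])

    attack = TextFooler("pl")          # built once, outside the loop
    similarity, max_sub = 0.95, 1
    for i, cols in dataset_df[["text", "lemmas", "tags", "orths"]].iterrows():
        sentence, lemmas, tags, orths = cols[0], cols[1], cols[2], cols[3]
        # Same call as in the patched loop; the empty list is passed through
        # unchanged, exactly as attack.py does.
        changed_sent = attack.spoil(
            sentence, [], lemmas, tags, orths, similarity, max_sub
        )
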
diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py
index 743052a..f099412 100644
--- a/experiments/scripts/tag_dataset.py
+++ b/experiments/scripts/tag_dataset.py
@@ -15,6 +15,7 @@ MSTAG = "mstag"
 TEXT = "text"
 LEMMAS = "lemmas"
 TAGS = "tags"
+ORTHS = "orths"
 
 
 def tag_sentence(sentence: str, lang: str):
@@ -41,17 +42,18 @@ def tag_sentence(sentence: str, lang: str):
     for line in lines:
         tokens.extend(line[TOKENS])
     os.remove(downloaded)
-    lemmas, tags = [], []
+    lemmas, tags, orths = [], [], []
     for token in tokens:
-        lexeme = token["lexemes"][0]
-        lemmas.append(lexeme["lemma"])
-        tags.append(lexeme["mstag"])
-    return lemmas, tags
+        lexeme = token[LEXEMES][0]
+        lemmas.append(lexeme[LEMMA])
+        tags.append(lexeme[MSTAG])
+        orths.append(token[ORTH])
+    return lemmas, tags, orths
 
 
 def process_file(dataset_df, lang, output_path):
     test_with_tags = pd.DataFrame(dataset_df)
-    lemmas_col, tags_col = [], []
+    lemmas_col, tags_col, orth_col = [], [], []
     cpus = 8
     with Pool(processes=cpus) as pool:
         results = []
@@ -62,12 +64,14 @@ def process_file(dataset_df, lang, output_path):
                 pool.apply_async(tag_sentence, args=[sentence, lang])
             )
             for res in results:
-                lemmas, tags = res.get()
+                lemmas, tags, orths = res.get()
                 lemmas_col.append(lemmas)
                 tags_col.append(tags)
+                orth_col.append(orths)
             results = []
     test_with_tags[LEMMAS] = lemmas_col
     test_with_tags[TAGS] = tags_col
+    test_with_tags[ORTHS] = orth_col
 
     with open(output_path, mode="wt") as fd:
         fd.write(test_with_tags.to_json(orient="records", lines=True))
@@ -103,10 +107,10 @@ def main(dataset_name: str):
             test_with_tags = pd.DataFrame(
                 pd.read_json(os.path.join(input_dir, file), lines=True)
            )
-            test_with_tags[LEMMAS] = [
-                "" for _ in range(len(test_with_tags))
-            ]
-            test_with_tags[TAGS] = ["" for _ in range(len(test_with_tags))]
+            empty_list = [[] for _ in range(len(test_with_tags))]
+            test_with_tags[LEMMAS] = empty_list
+            test_with_tags[TAGS] = empty_list
+            test_with_tags[ORTHS] = empty_list
             with open(os.path.join(output_dir, file), mode="wt") as fd:
                 fd.write(
                     test_with_tags.to_json(orient="records", lines=True)
--
GitLab
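
Note: likewise an illustration, not part of the patch. It sketches the
fallback branch of the patched main() in tag_dataset.py, where datasets that
are copied through without tagging get empty per-token lists in all three new
columns before being written out as JSONL; the input DataFrame here is made up.

    import pandas as pd

    df = pd.DataFrame({"text": ["some untagged document", "another one"]})
    # Mirrors the patched code: one empty list per row, assigned to the
    # "lemmas", "tags" and "orths" columns alike.
    empty_list = [[] for _ in range(len(df))]
    df["lemmas"] = empty_list
    df["tags"] = empty_list
    df["orths"] = empty_list
    print(df.to_json(orient="records", lines=True))
    # -> {"text":"some untagged document","lemmas":[],"tags":[],"orths":[]}
    #    {"text":"another one","lemmas":[],"tags":[],"orths":[]}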