From 0c324297ebb678eda77d16b8d609fa995d99f2af Mon Sep 17 00:00:00 2001 From: pwalkow <pwalkow@gpu-server.ws.clarin> Date: Thu, 9 Mar 2023 16:11:32 +0100 Subject: [PATCH] Change script --- experiments/scripts/tag_dataset.py | 46 ++++++++++++++++++------------ 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py index 494bfe6..0c266d3 100644 --- a/experiments/scripts/tag_dataset.py +++ b/experiments/scripts/tag_dataset.py @@ -40,28 +40,18 @@ def tag_sentence(connection: Connection, sentence: str, lang: str): return lemmas, tags -@click.command() -@click.option( - "--dataset_name", - help="Dataset name", - type=str, -) -def main(dataset_name: str): - """Downloads the dataset to the output directory.""" - lang = 'en' if dataset_name == 'enron_spam' else 'pl' - test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True) - test_with_tags = pd.DataFrame(test) - conn = Connection(config_file="experiments/configs/config.yml") +def process_file(dataset_df, connection, lang, output_path): + test_with_tags = pd.DataFrame(dataset_df) lemmas_col, tags_col = [], [] cpus = cpu_count() with Pool(processes=cpus) as pool: results = [] - for idx in tqdm(range(0, len(test), cpus)): - end = min(idx+cpus, len(test) + 1) - for sentence in test[TEXT][idx:end]: - results.append(pool.apply_async(tag_sentence, args=[conn, + for idx in tqdm(range(0, len(dataset_df), cpus)): + end = min(idx+cpus, len(dataset_df) + 1) + for sentence in dataset_df[TEXT][idx:end]: + results.append(pool.apply_async(tag_sentence, args=(connection, sentence, - lang])) + lang,))) for res in results: lemmas, tags = res.get() lemmas_col.append(lemmas) @@ -70,10 +60,28 @@ def main(dataset_name: str): test_with_tags[LEMMAS] = lemmas_col test_with_tags[TAGS] = tags_col + with open(output_path, mode="wt") as fd: + fd.write(test_with_tags.to_json(orient='records', lines=True)) + + +@click.command() +@click.option( + "--dataset_name", + help="Dataset name", + type=str, +) +def main(dataset_name: str): + """Downloads the dataset to the output directory.""" + lang = 'en' if dataset_name == 'enron_spam' else 'pl' + conn = Connection(config_file="experiments/configs/config.yml") output_dir = f"data/preprocessed/{dataset_name}" os.makedirs(output_dir, exist_ok=True) - with open(f"{output_dir}/test.jsonl", mode="wt") as fd: - fd.write(test_with_tags.to_json(orient='records', lines=True)) + + input_dir = f"data/datasets/{dataset_name}" + for file in os.listdir(input_dir): + if os.path.isfile(os.path.join(input_dir, file)): + process_file(pd.read_json(os.path.join(input_dir, file), lines=True), + conn, lang, os.path.join(output_dir, file)) if __name__ == "__main__": -- GitLab