From 6fb12f8d3a581f5966da5087bcb341094b1c9517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martyna=20Wi=C4=85cek?= <martyna.wiacek@ipipan.waw.pl> Date: Thu, 30 Nov 2023 00:52:23 +0100 Subject: [PATCH] Changes to include calculation of matrices for COMBO v.2.0 --- README_data_preparation.md | 61 +++++++++++++++ .../prepare/{ => mt}/nc11de2en/prepare.sh | 0 .../prepare/mt/nc11de2en/prepare_combo.sh | 21 +++++ .../{ => mt}/nc11de2en/prepare_nc11_de2en.py | 0 .../mt/nc11de2en/prepare_nc11_de2en_combo.py | 29 +++++++ .../prepare/{ => mt}/nc11en2de/prepare.sh | 0 .../prepare/mt/nc11en2de/prepare_combo.sh | 21 +++++ .../{ => mt}/nc11en2de/prepare_nc11_en2de.py | 0 .../mt/nc11en2de/prepare_nc11_en2de_combo.py | 29 +++++++ .../prepare/{ => mt}/wmt16en2de/prepare.sh | 0 .../prepare/mt/wmt16en2de/prepare_combo.sh | 21 +++++ .../wmt16en2de/prepare_wmt16_en2de.py | 0 .../wmt16en2de/prepare_wmt16_en2de_combo.py | 29 +++++++ .../prepare/{ => mt}/wmt18en2tr/prepare.sh | 0 .../prepare/mt/wmt18en2tr/prepare_combo.sh | 21 +++++ .../wmt18en2tr/prepare_wmt18_en2tr.py | 0 .../wmt18en2tr/prepare_wmt18_en2tr_combo.py | 29 +++++++ .../prepare/reasoning/hellaswag/prepare.py | 29 +++++++ .../reasoning/hellaswag/prepare_combo.sh | 17 ++++ .../prepare/reasoning/winogender/prepare.py | 29 +++++++ .../reasoning/winogender/prepare_combo.sh | 15 ++++ .../prepare/reasoning/winogrande/prepare.py | 29 +++++++ .../reasoning/winogrande/prepare_combo.sh | 15 ++++ .../data/prepare/sts_nli/mnli/prepare.py | 29 +++++++ .../prepare/sts_nli/mnli/prepare_combo.sh | 15 ++++ .../data/prepare/sts_nli/nli/prepare.py | 29 +++++++ .../data/prepare/sts_nli/nli/prepare_combo.sh | 15 ++++ .../data/prepare/sts_nli/wiki/prepare.py | 29 +++++++ .../prepare/sts_nli/wiki/prepare_combo.sh | 15 ++++ .../data/prepare/wmt17en2de/prepare.sh | 78 ------------------- .../prepare/wmt17en2de/prepare_wmt17_en2de.py | 54 ------------- .../{ => mt}/vanilla/nc11de2en/predict.sh | 0 .../vanilla/nc11de2en/predict_nc11_de2en.py | 0 .../vanilla/nc11de2en/train_nc11_de2en.py | 0 scripts/prepare_combo.py | 52 +++++++++++++ scripts/prepare_combo_simcse_hellaswag.py | 64 +++++++++++++++ scripts/prepare_combo_simcse_mnli.py | 54 +++++++++++++ scripts/prepare_combo_simcse_nli.py | 72 +++++++++++++++++ scripts/prepare_combo_simcse_wiki.py | 53 +++++++++++++ scripts/prepare_combo_simcse_winogender.py | 45 +++++++++++ scripts/prepare_combo_simcse_winogrande.py | 51 ++++++++++++ scripts/utils.py | 40 ++++++++++ 42 files changed, 958 insertions(+), 132 deletions(-) create mode 100644 README_data_preparation.md rename experiments/data/prepare/{ => mt}/nc11de2en/prepare.sh (100%) create mode 100755 experiments/data/prepare/mt/nc11de2en/prepare_combo.sh rename experiments/data/prepare/{ => mt}/nc11de2en/prepare_nc11_de2en.py (100%) create mode 100644 experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py rename experiments/data/prepare/{ => mt}/nc11en2de/prepare.sh (100%) create mode 100755 experiments/data/prepare/mt/nc11en2de/prepare_combo.sh rename experiments/data/prepare/{ => mt}/nc11en2de/prepare_nc11_en2de.py (100%) create mode 100644 experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py rename experiments/data/prepare/{ => mt}/wmt16en2de/prepare.sh (100%) create mode 100755 experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh rename experiments/data/prepare/{ => mt}/wmt16en2de/prepare_wmt16_en2de.py (100%) create mode 100644 experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py rename experiments/data/prepare/{ => 
mt}/wmt18en2tr/prepare.sh (100%) create mode 100755 experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh rename experiments/data/prepare/{ => mt}/wmt18en2tr/prepare_wmt18_en2tr.py (100%) create mode 100644 experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py create mode 100644 experiments/data/prepare/reasoning/hellaswag/prepare.py create mode 100755 experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh create mode 100644 experiments/data/prepare/reasoning/winogender/prepare.py create mode 100755 experiments/data/prepare/reasoning/winogender/prepare_combo.sh create mode 100644 experiments/data/prepare/reasoning/winogrande/prepare.py create mode 100755 experiments/data/prepare/reasoning/winogrande/prepare_combo.sh create mode 100644 experiments/data/prepare/sts_nli/mnli/prepare.py create mode 100755 experiments/data/prepare/sts_nli/mnli/prepare_combo.sh create mode 100644 experiments/data/prepare/sts_nli/nli/prepare.py create mode 100755 experiments/data/prepare/sts_nli/nli/prepare_combo.sh create mode 100644 experiments/data/prepare/sts_nli/wiki/prepare.py create mode 100755 experiments/data/prepare/sts_nli/wiki/prepare_combo.sh delete mode 100755 experiments/data/prepare/wmt17en2de/prepare.sh delete mode 100644 experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py rename experiments/predict/{ => mt}/vanilla/nc11de2en/predict.sh (100%) rename experiments/predict/{ => mt}/vanilla/nc11de2en/predict_nc11_de2en.py (100%) rename experiments/train/{ => mt}/vanilla/nc11de2en/train_nc11_de2en.py (100%) create mode 100644 scripts/prepare_combo.py create mode 100644 scripts/prepare_combo_simcse_hellaswag.py create mode 100644 scripts/prepare_combo_simcse_mnli.py create mode 100644 scripts/prepare_combo_simcse_nli.py create mode 100644 scripts/prepare_combo_simcse_wiki.py create mode 100644 scripts/prepare_combo_simcse_winogender.py create mode 100644 scripts/prepare_combo_simcse_winogrande.py create mode 100644 scripts/utils.py diff --git a/README_data_preparation.md b/README_data_preparation.md new file mode 100644 index 0000000..47b3f21 --- /dev/null +++ b/README_data_preparation.md @@ -0,0 +1,61 @@ +# Data preparation + +In order to prepare the data for the experiments on enriching attention with +dependency parsing information, in the form of the dependency tree distribution over possible heads, we need to calculate +such matrices using the COMBO morphological analyzer and parser. + +The version used in this project is combo-lightning, which can be downloaded from +[https://gitlab.clarin-pl.eu/syntactic-tools/combo-lightning/-/branches](https://gitlab.clarin-pl.eu/syntactic-tools/combo-lightning/-/branches). + +## Prerequisites + +1. Access to a SLURM cluster with Python 3.9+ and CUDA 11.0.3+ installed. +2. Access to the mrunner tool, which simplifies experiment management on the cluster. +3. The datasets, which are accessible online. + +## Data selection for tasks + +The tasks chosen for these experiments are those that can potentially +benefit from syntactic information. The tasks should also use models based on the transformer encoder-decoder (or encoder-only) +architecture, so that the attention mechanism can be enriched. + +### Machine Translation + +Machine translation is a natural choice for a task that can benefit from syntactic information. The task is to translate +a sentence from one language into another: the model is trained to predict the next word in the target language +given the source sentence and the previously generated target words. The attention mechanism is used to focus on the most important +words in the source sentence. The syntactic information can be used to enrich the attention mechanism with +the dependency tree distribution over possible heads.
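+
+One conceivable way to use such a matrix (purely illustrative; the actual integration in this
+repository's training code may differ) is to mix the encoder's attention weights with the
+parser's head distribution. The names `syntax_biased_attention`, `dep_dist` and `weight` below
+are hypothetical:
+
+```python
+import numpy as np
+
+def syntax_biased_attention(scores, dep_dist, weight=0.5):
+    """Convexly combine softmaxed attention scores with a dependency head distribution.
+
+    scores:   (L, L) raw attention scores for one head
+    dep_dist: (L, L) row-stochastic matrix; dep_dist[i, j] = P(head of token i is token j)
+    """
+    attn = np.exp(scores - scores.max(axis=-1, keepdims=True))
+    attn /= attn.sum(axis=-1, keepdims=True)          # standard softmax over source positions
+    return (1.0 - weight) * attn + weight * dep_dist  # rows still sum to 1
+```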
+ +The datasets created for this task are: +- NC11 de2en (small dataset from German to English, the "harder" direction to train) +- NC11 en2de (the same small dataset in the potentially easier direction) +- WMT16 en2de (quite a large dataset of about 4.5M sentence pairs; represents a high-resource setting) +- WMT18 en2tr (small dataset of about 200k sentence pairs; represents a low-resource setting) + +### Semantic Textual Similarity (Natural Language Inference) + +This is also quite a common task in NLP: the model is trained to predict a similarity score for a pair of sentences. +The attention mechanism is used to focus on the most important words in both sentences, and the syntactic +information can again be used to enrich it with the dependency tree distribution over possible heads. + +The datasets created for this task are: +- NLI (small dataset of about 500k sentence pairs) +- MNLI (large dataset with varying domains and numbers of test cases) +- Wiki1M (large dataset of about 1M sentences) + +### Reasoning + +The task is to choose the correct answer (e.g. a pronoun referent or a sentence ending) given the context. + +The datasets created for this task are: +- Winogrande (small dataset of about 50k sentences) +- Winogender (small dataset focusing on the gender of missing pronouns) +- HellaSwag (dataset that contains contexts together with four possible endings each) + +## Data preparation + +The data was either downloaded from the Internet or derived from it by various preprocessing steps. + +We used COMBO models trained on UD v2.13 for dependency parsing of English and German (these +are the languages seen by the encoder in the machine translation task and in the other tasks).
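+
+For every input sentence, the preparation scripts store the matrices predicted by COMBO as a
+compressed NumPy archive. As a minimal sketch of what the output looks like (assuming the keys
+written by `scripts/prepare_combo.py` and a hypothetical output path), a single archive can be
+inspected like this:
+
+```python
+import numpy as np
+
+arrays = np.load('combo_preds/train/1.npz')        # one archive per sentence, numbered from 1
+heads = arrays['relation_distribution_softmax']    # row i: distribution over possible heads of token i
+labels = arrays['relation_label_distribution_softmax']
+print(heads.shape, heads.sum(axis=-1))             # square matrix whose rows sum to 1
+```
\ No newline at end of file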
diff --git a/experiments/data/prepare/nc11de2en/prepare.sh b/experiments/data/prepare/mt/nc11de2en/prepare.sh similarity index 100% rename from experiments/data/prepare/nc11de2en/prepare.sh rename to experiments/data/prepare/mt/nc11de2en/prepare.sh diff --git a/experiments/data/prepare/mt/nc11de2en/prepare_combo.sh b/experiments/data/prepare/mt/nc11de2en/prepare_combo.sh new file mode 100755 index 0000000..22eaa7e --- /dev/null +++ b/experiments/data/prepare/mt/nc11de2en/prepare_combo.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt" +INPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD" +OUTPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/test $MODEL_DIR/model-de-hdt-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/valid $MODEL_DIR/model-de-hdt-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/train $MODEL_DIR/model-de-hdt-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/nc11de2en/prepare_nc11_de2en.py b/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en.py similarity index 100% rename from experiments/data/prepare/nc11de2en/prepare_nc11_de2en.py rename to experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en.py diff --git a/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py b/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py new file mode 100644 index 0000000..7635478 --- /dev/null +++ b/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/mt/nc11de2en/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/nc11en2de/prepare.sh b/experiments/data/prepare/mt/nc11en2de/prepare.sh similarity index 100% rename from experiments/data/prepare/nc11en2de/prepare.sh rename to experiments/data/prepare/mt/nc11en2de/prepare.sh diff --git a/experiments/data/prepare/mt/nc11en2de/prepare_combo.sh b/experiments/data/prepare/mt/nc11en2de/prepare_combo.sh new file mode 100755 index 0000000..74c1673 --- /dev/null +++ b/experiments/data/prepare/mt/nc11en2de/prepare_combo.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +SRC=en +TGT=de + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt" +INPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD" +OUTPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/test $MODEL_DIR/model-en-gum-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/valid $MODEL_DIR/model-en-gum-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/train $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done."
\ No newline at end of file diff --git a/experiments/data/prepare/nc11en2de/prepare_nc11_en2de.py b/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de.py similarity index 100% rename from experiments/data/prepare/nc11en2de/prepare_nc11_en2de.py rename to experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de.py diff --git a/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py b/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py new file mode 100644 index 0000000..cc12839 --- /dev/null +++ b/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/mt/nc11en2de/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/wmt16en2de/prepare.sh b/experiments/data/prepare/mt/wmt16en2de/prepare.sh similarity index 100% rename from experiments/data/prepare/wmt16en2de/prepare.sh rename to experiments/data/prepare/mt/wmt16en2de/prepare.sh diff --git a/experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh b/experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh new file mode 100755 index 0000000..8f460ce --- /dev/null +++ b/experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +SRC=en +TGT=de + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt" +INPUT_DIR="${PROJ_DIR}/wmt16${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD" +OUTPUT_DIR="${PROJ_DIR}/wmt16${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/test $MODEL_DIR/model-en-gum-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/valid $MODEL_DIR/model-en-gum-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/train $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done."
\ No newline at end of file diff --git a/experiments/data/prepare/wmt16en2de/prepare_wmt16_en2de.py b/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de.py similarity index 100% rename from experiments/data/prepare/wmt16en2de/prepare_wmt16_en2de.py rename to experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de.py diff --git a/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py b/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py new file mode 100644 index 0000000..7635478 --- /dev/null +++ b/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/wmt18en2tr/prepare.sh b/experiments/data/prepare/mt/wmt18en2tr/prepare.sh similarity index 100% rename from experiments/data/prepare/wmt18en2tr/prepare.sh rename to experiments/data/prepare/mt/wmt18en2tr/prepare.sh diff --git a/experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh b/experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh new file mode 100755 index 0000000..510cce6 --- /dev/null +++ b/experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh @@ -0,0 +1,21 @@ +#! /usr/bin/env bash + +SRC=en +TGT=tr + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt" +INPUT_DIR="${PROJ_DIR}/wmt18${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD" +OUTPUT_DIR="${PROJ_DIR}/wmt18${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/test $MODEL_DIR/model-en-gum-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/valid $MODEL_DIR/model-en-gum-ud213.tar.gz + +python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \ $OUTPUT_DIR/train $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done."
\ No newline at end of file diff --git a/experiments/data/prepare/wmt18en2tr/prepare_wmt18_en2tr.py b/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr.py similarity index 100% rename from experiments/data/prepare/wmt18en2tr/prepare_wmt18_en2tr.py rename to experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr.py diff --git a/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py b/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py new file mode 100644 index 0000000..4bf5ab5 --- /dev/null +++ b/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/hellaswag/prepare.py b/experiments/data/prepare/reasoning/hellaswag/prepare.py new file mode 100644 index 0000000..8be1bb4 --- /dev/null +++ b/experiments/data/prepare/reasoning/hellaswag/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh b/experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh new file mode 100755 index 0000000..cf70b1d --- /dev/null +++ b/experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh @@ -0,0 +1,17 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/reasoning" +INPUT_DIR="${PROJ_DIR}/hellaswag" +OUTPUT_DIR="${PROJ_DIR}/hellaswag/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + + +python3 ./scripts/prepare_combo_simcse_hellaswag.py $INPUT_DIR \ $OUTPUT_DIR/hellaswag $MODEL_DIR/model-en-gum-ud213.tar.gz + + +echo "All done."
\ No newline at end of file diff --git a/experiments/data/prepare/reasoning/winogender/prepare.py b/experiments/data/prepare/reasoning/winogender/prepare.py new file mode 100644 index 0000000..5e618d1 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogender/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/reasoning/winogender/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/winogender/prepare_combo.sh b/experiments/data/prepare/reasoning/winogender/prepare_combo.sh new file mode 100755 index 0000000..740f519 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogender/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/reasoning" +INPUT_DIR="${PROJ_DIR}/winogender" +OUTPUT_DIR="${PROJ_DIR}/winogender/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_winogender.py $INPUT_DIR/winogender_all_sentences.tsv \ +$OUTPUT_DIR/winogender $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/reasoning/winogrande/prepare.py b/experiments/data/prepare/reasoning/winogrande/prepare.py new file mode 100644 index 0000000..68173f0 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogrande/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/reasoning/winogrande/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/winogrande/prepare_combo.sh b/experiments/data/prepare/reasoning/winogrande/prepare_combo.sh new file mode 100755 index 0000000..9c1cec5 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogrande/prepare_combo.sh @@ -0,0 +1,15 @@ +#! 
/usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/reasoning" +INPUT_DIR="${PROJ_DIR}/winogrande" +OUTPUT_DIR="${PROJ_DIR}/winogrande/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_winogrande.py $INPUT_DIR \ +$OUTPUT_DIR/winogrande $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/sts_nli/mnli/prepare.py b/experiments/data/prepare/sts_nli/mnli/prepare.py new file mode 100644 index 0000000..c891999 --- /dev/null +++ b/experiments/data/prepare/sts_nli/mnli/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/sts_nli/mnli/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/sts_nli/mnli/prepare_combo.sh b/experiments/data/prepare/sts_nli/mnli/prepare_combo.sh new file mode 100755 index 0000000..9dec42f --- /dev/null +++ b/experiments/data/prepare/sts_nli/mnli/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/nli_sts" +INPUT_DIR="${PROJ_DIR}/mnli" +OUTPUT_DIR="${PROJ_DIR}/mnli/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_mnli.py $INPUT_DIR \ +$OUTPUT_DIR/mnli $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." 
\ No newline at end of file diff --git a/experiments/data/prepare/sts_nli/nli/prepare.py b/experiments/data/prepare/sts_nli/nli/prepare.py new file mode 100644 index 0000000..870c140 --- /dev/null +++ b/experiments/data/prepare/sts_nli/nli/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/sts_nli/nli/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/sts_nli/nli/prepare_combo.sh b/experiments/data/prepare/sts_nli/nli/prepare_combo.sh new file mode 100755 index 0000000..4a62564 --- /dev/null +++ b/experiments/data/prepare/sts_nli/nli/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/nli_sts" +INPUT_DIR="${PROJ_DIR}/nli" +OUTPUT_DIR="${PROJ_DIR}/nli/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_nli.py $INPUT_DIR/nli_for_simcse.csv \ +$OUTPUT_DIR/nli $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/sts_nli/wiki/prepare.py b/experiments/data/prepare/sts_nli/wiki/prepare.py new file mode 100644 index 0000000..f36110f --- /dev/null +++ b/experiments/data/prepare/sts_nli/wiki/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/sts_nli/wiki/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/sts_nli/wiki/prepare_combo.sh b/experiments/data/prepare/sts_nli/wiki/prepare_combo.sh new file mode 100755 index 0000000..3cbd62c --- /dev/null +++ b/experiments/data/prepare/sts_nli/wiki/prepare_combo.sh @@ -0,0 +1,15 @@ +#! 
/usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/nli_sts" +INPUT_DIR="${PROJ_DIR}/wiki" +OUTPUT_DIR="${PROJ_DIR}/wiki/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_wiki.py $INPUT_DIR/wiki1m_for_simcse.txt \ +$OUTPUT_DIR/wiki $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/wmt17en2de/prepare.sh b/experiments/data/prepare/wmt17en2de/prepare.sh deleted file mode 100755 index 912a08e..0000000 --- a/experiments/data/prepare/wmt17en2de/prepare.sh +++ /dev/null @@ -1,78 +0,0 @@ -#! /usr/bin/env bash - -SRC=en -TGT=de - -PROJ_DIR="/syntax_enhanced_mt" -OUTPUT_DIR="$PROJ_DIR/data/wmt17${SRC}2${TGT}/corpus" -MOSES_DIR="$PROJ_DIR/tools/mosesdecoder" -SCRIPTS_DIR="./scripts" - -rm -rf $OUTPUT_DIR -mkdir -p $OUTPUT_DIR - -echo "Downloading WMT17 De-En. This may take a while..." -wget -nc -nv -O ${OUTPUT_DIR}/corpus.tc.de.gz \ - http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.de.gz -wget -nc -nv -O ${OUTPUT_DIR}/corpus.tc.en.gz \ - http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.en.gz - -echo "Downloading dev/test sets" -wget -nc -nv -O ${OUTPUT_DIR}/dev.tgz \ - http://data.statmt.org/wmt18/translation-task/preprocessed/de-en/dev.tgz - -# Extract everything -echo "Extracting all files..." -gunzip ${OUTPUT_DIR}/corpus.tc.${TGT}.gz ${OUTPUT_DIR}/corpus.tc.${SRC}.gz -mv ${OUTPUT_DIR}/corpus.tc.${TGT} ${OUTPUT_DIR}/train.tok.${TGT} -mv ${OUTPUT_DIR}/corpus.tc.${SRC} ${OUTPUT_DIR}/train.tok.${SRC} -mkdir -p "${OUTPUT_DIR}/dev" -tar -xvzf "${OUTPUT_DIR}/dev.tgz" -C "${OUTPUT_DIR}/dev" -cp ${OUTPUT_DIR}/dev/newstest2016.tc.${TGT} ${OUTPUT_DIR}/valid.tok.${TGT} -cp ${OUTPUT_DIR}/dev/newstest2016.tc.${SRC} ${OUTPUT_DIR}/valid.tok.${SRC} -cp ${OUTPUT_DIR}/dev/newstest2017.tc.${TGT} ${OUTPUT_DIR}/test.tok.${TGT} -cp ${OUTPUT_DIR}/dev/newstest2017.tc.${SRC} ${OUTPUT_DIR}/test.tok.${SRC} - -# Remove raw data -rm -r ${OUTPUT_DIR}/dev* - -# Tokenize data -for f in ${OUTPUT_DIR}/*.${SRC}; do - echo "Tokenizing $f..." - ${MOSES_DIR}/scripts/tokenizer/tokenizer.perl -q -l ${SRC} -threads 8 < $f > ${f%.*}.tok.${SRC} -done -for f in ${OUTPUT_DIR}/*.${TGT}; do - echo "Tokenizing $f..." - ${MOSES_DIR}/scripts/tokenizer/tokenizer.perl -q -l ${TGT} -threads 8 < $f > ${f%.*}.tok.${TGT} -done - -## Clean train corpus -f=${OUTPUT_DIR}/train.tok.${TGT} -fbase=${f%.*} -echo "Cleaning ${fbase}..." -${MOSES_DIR}/scripts/training/clean-corpus-n.perl $fbase ${SRC} ${TGT} "${fbase}.clean" 1 80 -# -# CoreNLP tokenization -for f in "${OUTPUT_DIR}/train.tok.clean.${TGT}" "${OUTPUT_DIR}/valid.tok.${TGT}" "${OUTPUT_DIR}/test.tok.${TGT}"; do - fbase=${f%.*} - echo "CoreNLP tokenizing ${fbase}..." - python3 ${SCRIPTS_DIR}/corenlp_tok.py $fbase $SRC $TGT -done - -# Learn Shared BPE -for merge_ops in 32000; do - echo "Learning BPE with merge_ops=${merge_ops}. This may take a while..." - cat "${OUTPUT_DIR}/train.tok.clean.tok.${SRC}" "${OUTPUT_DIR}/train.tok.clean.tok.${TGT}" | \ - subword-nmt learn-bpe -s $merge_ops > "${OUTPUT_DIR}/bpe.${merge_ops}" - - echo "Apply BPE with merge_ops=${merge_ops} to tokenized files..." 
- for lang in ${TGT} ${SRC}; do - for f in ${OUTPUT_DIR}/*tok.tok.${lang} ${OUTPUT_DIR}/train.tok.clean.tok.${lang}; do - outfile="${f%.*}.bpe.${merge_ops}.${lang}" - subword-nmt apply-bpe -c "${OUTPUT_DIR}/bpe.${merge_ops}" < $f > "${outfile}" - done - done - -done - -echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py b/experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py deleted file mode 100644 index fed35c8..0000000 --- a/experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py +++ /dev/null @@ -1,54 +0,0 @@ -from mrunner.helpers.specification_helper import create_experiments_helper - -base_config = {}#{"run.job_class": "@jobs.AnyJob"} - -params_grid = { - "idx": [0], -} - -DATA_DIR = '/data_preparation/nc11de2en/corpus/_nc11de2en' - -pascal_params=f"{DATA_DIR} " \ - "--save-dir $CKPTS " \ - "--arch transformer_wmt_en_de " \ - "--dropout 0.3 " \ - "--share-all-embeddings " \ - "--optimizer adam "\ - "--adam-betas (0.9,0.997) " \ - "--adam-eps 1e-09 " \ - "--clip-norm 0.0 " \ - "--lr-scheduler inverse_sqrt " \ - "--warmup-init-lr 1e-07 " \ - "--warmup-updates 8000 " \ - "--lr 0.001 " \ - "--min-lr 1e-09 " \ - "--weight-decay 0.0 " \ - "--criterion label_smoothed_cross_entropy " \ - "--label-smoothing 0.1 " \ - "--max-tokens 2048 " \ - "--max-update 20000 " \ - "--no-progress-bar " \ - "--log-format json " \ - "--log-interval 100 " \ - "--save-interval 500000 " \ - "--save-interval-updates 500 " \ - "--keep-interval-updates 1" \ - "--best-checkpoint-metric bleu " \ - "--maximize-best-checkpoint-metric" \ -"" - -experiments_list = create_experiments_helper( - experiment_name="mt_syntax", - project_name="mt_syntax/prepare_data", - base_config=base_config, - params_grid=params_grid, - script=f" ./experiments/data/prepare/wmt17en2de/prepare.sh ", - exclude=[ - "docs", "checkpoints", "fairseq", "tools", "fairseq_data", "corpus", "examples", "tests", - ".pytest_cache", "alpacka.egg-info", "out", ".git" - ], - python_path="", - tags=["quality-data_preparation"], - with_neptune=False, - env={}, -) diff --git a/experiments/predict/vanilla/nc11de2en/predict.sh b/experiments/predict/mt/vanilla/nc11de2en/predict.sh similarity index 100% rename from experiments/predict/vanilla/nc11de2en/predict.sh rename to experiments/predict/mt/vanilla/nc11de2en/predict.sh diff --git a/experiments/predict/vanilla/nc11de2en/predict_nc11_de2en.py b/experiments/predict/mt/vanilla/nc11de2en/predict_nc11_de2en.py similarity index 100% rename from experiments/predict/vanilla/nc11de2en/predict_nc11_de2en.py rename to experiments/predict/mt/vanilla/nc11de2en/predict_nc11_de2en.py diff --git a/experiments/train/vanilla/nc11de2en/train_nc11_de2en.py b/experiments/train/mt/vanilla/nc11de2en/train_nc11_de2en.py similarity index 100% rename from experiments/train/vanilla/nc11de2en/train_nc11_de2en.py rename to experiments/train/mt/vanilla/nc11de2en/train_nc11_de2en.py diff --git a/scripts/prepare_combo.py b/scripts/prepare_combo.py new file mode 100644 index 0000000..d8fd61f --- /dev/null +++ b/scripts/prepare_combo.py @@ -0,0 +1,52 @@ +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + +def main(file_path: str, directory_to_save: str, combo_model_path:str ): + '''Main function to prepare the npz files with dependency parsing 
relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + # `batch` is imported from scripts/utils.py; no need to redefine it locally + with open(file_path, 'r') as file: + data = file.readlines() + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE))): + # each input line holds a Python list literal of tokens produced by the upstream + # tokenization step; eval() is applied to these locally generated files only + preds = predictor([eval(x.replace('\n', '')) for x in batch_data]) + for i, pred in enumerate(preds): + rel_dist = pred.relation_distribution + label_dist = pred.relation_label_distribution + rel_dist_softmax = softmax(rel_dist, axis=-1) + label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1) + + # archives are numbered 1..N in corpus order + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + # the *_trimmed arrays drop the first row/column (the artificial ROOT node) + np.savez( + os.path.join(directory_to_save, npz_order + '.npz'), + relation_distribution=rel_dist, + relation_distribution_trimmed=rel_dist[1:, 1:], + relation_distribution_softmax=rel_dist_softmax, + relation_distribution_trimmed_softmax=rel_dist_softmax[1:, 1:], + relation_label_distribution=label_dist, + relation_label_distribution_softmax=label_dist_softmax + ) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_hellaswag.py b/scripts/prepare_combo_simcse_hellaswag.py new file mode 100644 index 0000000..e4ff68f --- /dev/null +++ b/scripts/prepare_combo_simcse_hellaswag.py @@ -0,0 +1,64 @@ +import json +import math +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 512 + + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + files = [f for f in os.listdir(file_path) if 'jsonl' in f] + + for jsonl_file in files: + + with open(os.path.join(file_path, jsonl_file), 'r') as file: + + data = list(file) + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)): + + jsons = [json.loads(x) for x in batch_data] + + ctx_a = [x['ctx_a'] for x in jsons] + ctx_b = [x['ctx_b'] for x in jsons] + endings = [x['endings'] for x in jsons] + + sentences2 = [] + for ctx_b_sentence, endings_list in zip(ctx_b, endings): + for ending in endings_list: + sentences2.append(ctx_b_sentence + ' ' + ending) + + pred_sentences1 = predictor(ctx_a) + pred_sentences2 = predictor(sentences2)
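+ + # HellaSwag provides four candidate endings per context, so the flat list of + # ctx_b+ending predictions is regrouped into chunks of four, one chunk per example: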
+ pred_sentences2 = [pred_sentences2[i:i + 4] for i in range(0, len(pred_sentences2), 4)] + + for i, (sent1, sent2) in enumerate(zip(pred_sentences1, pred_sentences2)): + pred_sentences1_npz = get_attributes_of_sentence(sent1) + endings_attributes = [get_attributes_of_sentence(x) for x in sent2] + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + save_npz_to_directory(pred_sentences1_npz, 'pred_sentences1', os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order) + + for j, ending_attributes in enumerate(endings_attributes): + save_npz_to_directory(ending_attributes, 'pred_endings_{}'.format(j), os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_mnli.py b/scripts/prepare_combo_simcse_mnli.py new file mode 100644 index 0000000..781ac19 --- /dev/null +++ b/scripts/prepare_combo_simcse_mnli.py @@ -0,0 +1,54 @@ +import json +import math +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + files = [f for f in os.listdir(file_path) if 'jsonl' in f] + + for jsonl_file in files: + + with open(os.path.join(file_path, jsonl_file), 'r') as file: + data = list(file) + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)): + + jsons = [json.loads(x) for x in batch_data] + + sentences1 = [x['sentence1'] for x in jsons] + sentences2 = [x['sentence2'] for x in jsons] + + pred_sentences1 = predictor(sentences1) + pred_sentences2 = predictor(sentences2) + + for i, (sent1, sent2) in enumerate(zip(pred_sentences1, pred_sentences2)): + pred_sentences1_npz = get_attributes_of_sentence(sent1) + pred_sentences2_npz = get_attributes_of_sentence(sent2) + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + save_npz_to_directory(pred_sentences1_npz, 'pred_sentences1', os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order) + save_npz_to_directory(pred_sentences2_npz, 'pred_sentences2', os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_nli.py b/scripts/prepare_combo_simcse_nli.py new file mode 100644 index 0000000..9c7707e --- /dev/null +++ b/scripts/prepare_combo_simcse_nli.py @@ -0,0 +1,72 @@ +import math +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +import pandas as pd +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + # column names in nli_for_simcse.csv + sent0 = 'sent0' + sent1 = 'sent1' + hard_neg = 'hard_neg' + + # get_attributes_of_sentence and save_npz_to_directory are imported from scripts/utils.py
+ data = pd.read_csv(file_path, sep=',', header=0) + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)): + # resume point of an earlier interrupted run; lower it to recompute from scratch + if batch_no < 196: + continue + preds_sent0 = predictor([x.replace('\n', '') for x in batch_data[sent0]]) + preds_sent1 = predictor([x.replace('\n', '') for x in batch_data[sent1]]) + preds_hard_neg = predictor([x.replace('\n', '') for x in batch_data[hard_neg]]) + for i, (pred0, pred1, pred_hard_neg) in enumerate(zip(preds_sent0, preds_sent1, preds_hard_neg)): + pred0_npz = get_attributes_of_sentence(pred0) + pred1_npz = get_attributes_of_sentence(pred1) + pred_hard_neg_npz = get_attributes_of_sentence(pred_hard_neg) + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + save_npz_to_directory(pred0_npz, 'pred0', directory_to_save, npz_order) + save_npz_to_directory(pred1_npz, 'pred1', directory_to_save, npz_order) + save_npz_to_directory(pred_hard_neg_npz, 'pred_hard_neg', directory_to_save, npz_order) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_wiki.py b/scripts/prepare_combo_simcse_wiki.py new file mode 100644 index 0000000..e8e695c --- /dev/null +++ b/scripts/prepare_combo_simcse_wiki.py @@ -0,0 +1,53 @@ +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + with open(file_path, 'r') as file: + data = file.readlines() + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE))): + # resume point of an earlier interrupted run; lower it to recompute from scratch + if batch_no < 3622: + continue + try: + preds = predictor([x.replace('\n', '') for x in batch_data]) + except Exception: + # skip batches that COMBO fails to parse + continue + + for i, pred in enumerate(preds): + rel_dist = pred.relation_distribution + label_dist = pred.relation_label_distribution + rel_dist_softmax = softmax(rel_dist, axis=-1) + label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1) + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + np.savez( + os.path.join(directory_to_save, npz_order + '.npz'), + relation_distribution=rel_dist, + relation_distribution_trimmed=rel_dist[1:, 1:], + relation_distribution_softmax=rel_dist_softmax, + relation_distribution_trimmed_softmax=rel_dist_softmax[1:, 1:], + relation_label_distribution=label_dist, + relation_label_distribution_softmax=label_dist_softmax + ) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_winogender.py b/scripts/prepare_combo_simcse_winogender.py new file mode 100644 index 0000000..76b513b
--- /dev/null +++ b/scripts/prepare_combo_simcse_winogender.py @@ -0,0 +1,45 @@ +import json +import math +import os +import sys + +import pandas as pd +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + # `batch` is imported from scripts/utils.py; no need to redefine it locally + # column name in winogender_all_sentences.tsv + sentence = 'sentence' + + data = pd.read_csv(file_path, sep='\t', header=0) + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)): + preds_sent0 = predictor([x.replace('\n', '') for x in batch_data[sentence]]) + for i, pred in enumerate(preds_sent0): + pred0_npz = get_attributes_of_sentence(pred) + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + save_npz_to_directory(pred0_npz, sentence, directory_to_save, npz_order) + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_winogrande.py b/scripts/prepare_combo_simcse_winogrande.py new file mode 100644 index 0000000..19b0774 --- /dev/null +++ b/scripts/prepare_combo_simcse_winogrande.py @@ -0,0 +1,51 @@ +import json +import math +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + files = [f for f in os.listdir(file_path) if 'jsonl' in f] + + for jsonl_file in files: + + with open(os.path.join(file_path, jsonl_file), 'r') as file: + data = list(file) + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)): + + jsons = [json.loads(x) for x in batch_data] + + sentences = [x['sentence'] for x in jsons] + + pred_sentences = predictor(sentences) + + for i, sent1 in enumerate(pred_sentences): + pred_sentences1_npz = get_attributes_of_sentence(sent1) + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + save_npz_to_directory(pred_sentences1_npz, 'pred_sentences', os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/utils.py b/scripts/utils.py new file mode 100644 index 0000000..383f53a --- /dev/null +++ b/scripts/utils.py @@ -0,0 +1,40 @@
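+"""Shared helpers for the prepare_combo* data preparation scripts.
+
+Illustrative usage (paths are hypothetical; the prepare_combo*.sh scripts pass real ones):
+
+    predictor = initialize_with_combo('model-en-gum-ud213.tar.gz')
+    n = 0
+    for chunk in batch(sentences, 256):
+        for pred in predictor(chunk):
+            n += 1
+            save_npz_to_directory(get_attributes_of_sentence(pred), 'pred', 'out_dir', str(n))
+"""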
+import os +from typing import Iterable, List + +import numpy as np +from scipy.special import softmax + +from combo.predict import COMBO +# NOTE: assumption about the combo package layout; adjust if Sentence lives elsewhere +# in your COMBO version +from combo.data import Sentence + + +def initialize_with_combo(combo_model_path: str) -> COMBO: + ''' Initialize COMBO model from pretrained model path''' + return COMBO.from_pretrained(combo_model_path, cuda_device=0) + + +def predict_with_combo(predictor: COMBO, text: str) -> List[Sentence]: + ''' Predict with COMBO model''' + return predictor(text) + + +def get_attributes_of_sentence(sentence: Sentence) -> dict: + ''' Get the relation distribution and relation label distribution of a sentence, + together with softmaxed variants of both''' + rel_dist = sentence.relation_distribution + label_dist = sentence.relation_label_distribution + rel_dist_softmax = softmax(rel_dist, axis=-1) + label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1) + # the *_trimmed arrays drop the first row/column (the artificial ROOT node) + return { + 'relation_distribution': rel_dist, + 'relation_distribution_trimmed': rel_dist[1:, 1:], + 'relation_distribution_softmax': rel_dist_softmax, + 'relation_distribution_trimmed_softmax': rel_dist_softmax[1:, 1:], + 'relation_label_distribution': label_dist, + 'relation_label_distribution_softmax': label_dist_softmax + } + + +def save_npz_to_directory(npz: dict, npz_name: str, directory: str, npz_order: str): + ''' Save dictionary to npz file in a directory''' + os.makedirs(os.path.join(directory, npz_order), exist_ok=True) + np.savez( + os.path.join(directory, npz_order, npz_name + '.npz'), + **npz + ) + +def batch(iterable: Iterable, n: int = 1): + ''' Batch an iterable into chunks of size n''' + l = len(iterable) + for ndx in range(0, l, n): + yield iterable[ndx:min(ndx + n, l)] -- GitLab