diff --git a/README_data_preparation.md b/README_data_preparation.md
new file mode 100644
index 0000000000000000000000000000000000000000..47b3f2191b77831c454e556e93bcab917c255fc0
--- /dev/null
+++ b/README_data_preparation.md
@@ -0,0 +1,61 @@
+# Data preparation
+
+To prepare the data for the experiments on enriching attention with dependency parsing
+information, in the form of a dependency tree distribution over possible heads, we need to
+compute this distribution matrix with the COMBO morphological analyzer and parser.
+
+The version used in this project is combo-lightning, which can be downloaded from
+[the combo-lightning repository](https://gitlab.clarin-pl.eu/syntactic-tools/combo-lightning/-/branches).
+
+## Prerequisites
+
+1. Access to a SLURM cluster with Python 3.9+ and CUDA 11.0.3+ installed.
+2. Access to the mrunner tool, which simplifies experiment management on the cluster.
+3. The datasets, which are accessible online.
+
+## Data selection for tasks
+
+The tasks chosen for these experiments are those that can potentially benefit from
+syntactic information. The tasks should also use models based on a transformer
+encoder-decoder (or encoder-only) architecture, so that the attention mechanism is
+available for enrichment.
+
+### Machine Translation
+
+Machine translation is an obvious choice for a task that can benefit from syntactic
+information. The task is to translate a sentence from one language into another: the model
+is trained to predict the next word in the target language given the source sentence and
+the previously generated target words. The attention mechanism is used to focus on the
+most important words in the source sentence, and the syntactic information can enrich it
+with the dependency tree distribution over possible heads.
+
+The datasets created for this task are:
+- NC11 de2en (small German-to-English dataset; this is the "harder" direction to train)
+- NC11 en2de (the same small dataset in the reverse, potentially easier-to-train direction)
+- WMT16 en2de (quite large dataset of about 4.5M sentence pairs - represents a high-resource setting)
+- WMT18 en2tr (small dataset of about 200k sentence pairs - represents a low-resource setting)
+
+### Semantic Textual Similarity (Natural Language Inference)
+
+This is also a quite common task in NLP: the model is trained to predict a similarity
+score between two sentences. The attention mechanism is used to focus on the most
+important words in the sentences, and the syntactic information can enrich it with the
+dependency tree distribution over possible heads.
+
+The datasets created for this task are:
+- NLI (small dataset of about 500k sentence pairs)
+- MNLI (large dataset with varying domains and numbers of test cases)
+- Wiki1M (large dataset of about 1M sentences)
+
+### Reasoning
+
+The task is to predict the answer to a question given its context.
+
+The datasets created for this task are:
+- Winogrande (small dataset of about 50k sentence pairs)
+- Winogender (small dataset focusing on the gender of missing pronouns)
+- HellaSwag (dataset that contains sentences and possible endings of those sentences)
+
+## Data preparation
+
+The data was either downloaded from the Internet or created from scratch by various
+preprocessing steps.
+
+We used COMBO models trained on UD v2.13 to create dependency parsing models for English
+and German (these are the languages seen by the encoder in the machine translation task
+and the other tasks).
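+
+As a quick illustration, the snippet below sketches how such a head-distribution matrix is
+obtained for a single sentence (a minimal sketch based on `scripts/prepare_combo.py`; the
+model archive name is a placeholder, and a GPU is assumed via `cuda_device=0`):
+
+```python
+from combo.predict import COMBO
+from scipy.special import softmax
+
+# Load a pretrained COMBO model (archive name is an example placeholder).
+predictor = COMBO.from_pretrained("model-en-gum-ud213.tar.gz", cuda_device=0)
+
+pred = predictor(["The quick brown fox jumps over the lazy dog ."])[0]
+
+# relation_distribution holds arc scores with an extra row/column for the
+# artificial ROOT position; a softmax over the last axis yields, for each
+# token, a probability distribution over its possible heads.
+heads = softmax(pred.relation_distribution, axis=-1)
+```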
\ No newline at end of file
diff --git a/experiments/data/prepare/nc11de2en/prepare.sh b/experiments/data/prepare/mt/nc11de2en/prepare.sh
similarity index 100%
rename from experiments/data/prepare/nc11de2en/prepare.sh
rename to experiments/data/prepare/mt/nc11de2en/prepare.sh
diff --git a/experiments/data/prepare/mt/nc11de2en/prepare_combo.sh b/experiments/data/prepare/mt/nc11de2en/prepare_combo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..22eaa7ee60f3e0acd8d54cd3fdec31110486a078
--- /dev/null
+++ b/experiments/data/prepare/mt/nc11de2en/prepare_combo.sh
@@ -0,0 +1,21 @@
+#! /usr/bin/env bash
+
+SRC=de
+TGT=en
+
+PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt"
+INPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD"
+OUTPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds"
+SCRIPTS_DIR="./scripts"
+MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/"
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/test $MODEL_DIR/model-de-hdt-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/valid $MODEL_DIR/model-de-hdt-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/train $MODEL_DIR/model-de-hdt-ud213.tar.gz
+
+echo "All done."
\ No newline at end of file
diff --git a/experiments/data/prepare/nc11de2en/prepare_nc11_de2en.py b/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en.py
similarity index 100%
rename from experiments/data/prepare/nc11de2en/prepare_nc11_de2en.py
rename to experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en.py
diff --git a/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py b/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py
new file mode 100644
index 0000000000000000000000000000000000000000..763547861d79c1ca4ac15cf9b183eead166e37f1
--- /dev/null
+++ b/experiments/data/prepare/mt/nc11de2en/prepare_nc11_de2en_combo.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from mrunner.helpers.specification_helper import create_experiments_helper
+
+base_config = {}  # {"run.job_class": "@jobs.AnyJob"}
+
+params_grid = {
+    "idx": [0],
+}
+
+exclude_fairseq = [
+    "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples"
+] + [str(path) for path in Path('fairseq').rglob('*.so')]
+
+experiments_list = create_experiments_helper(
+    experiment_name="mt_syntax",
+    project_name="mt_syntax/prepare_data",
+    base_config=base_config,
+    params_grid=params_grid,
+    script=f" ./experiments/data/prepare/mt/nc11de2en/prepare_combo.sh ",
+    exclude=[
+        "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus",
+        ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea"
+    ] + exclude_fairseq,
+    python_path="",
+    tags=["quality-data_preparation"],
+    with_neptune=False,
+    env={},
+)
diff --git a/experiments/data/prepare/nc11en2de/prepare.sh b/experiments/data/prepare/mt/nc11en2de/prepare.sh
similarity index 100%
rename from experiments/data/prepare/nc11en2de/prepare.sh
rename to experiments/data/prepare/mt/nc11en2de/prepare.sh
diff --git a/experiments/data/prepare/mt/nc11en2de/prepare_combo.sh b/experiments/data/prepare/mt/nc11en2de/prepare_combo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..74c167399a4dfdb6c2cafec9352ce5093eb31499
--- /dev/null
+++ b/experiments/data/prepare/mt/nc11en2de/prepare_combo.sh
@@ -0,0 +1,21 @@
+#! /usr/bin/env bash
+
+SRC=en
+TGT=de
+
+PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt"
+INPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD"
+OUTPUT_DIR="${PROJ_DIR}/nc11${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds"
+SCRIPTS_DIR="./scripts"
+MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/"
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/test $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/valid $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/train $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+echo "All done."
\ No newline at end of file
diff --git a/experiments/data/prepare/nc11en2de/prepare_nc11_en2de.py b/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de.py
similarity index 100%
rename from experiments/data/prepare/nc11en2de/prepare_nc11_en2de.py
rename to experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de.py
diff --git a/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py b/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc12839d3c79881acf987bc8fefe1eb61e340bef
--- /dev/null
+++ b/experiments/data/prepare/mt/nc11en2de/prepare_nc11_en2de_combo.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from mrunner.helpers.specification_helper import create_experiments_helper
+
+base_config = {}  # {"run.job_class": "@jobs.AnyJob"}
+
+params_grid = {
+    "idx": [0],
+}
+
+exclude_fairseq = [
+    "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples"
+] + [str(path) for path in Path('fairseq').rglob('*.so')]
+
+experiments_list = create_experiments_helper(
+    experiment_name="mt_syntax",
+    project_name="mt_syntax/prepare_data",
+    base_config=base_config,
+    params_grid=params_grid,
+    script=f" ./experiments/data/prepare/mt/nc11en2de/prepare_combo.sh ",
+    exclude=[
+        "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus",
+        ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea"
+    ] + exclude_fairseq,
+    python_path="",
+    tags=["quality-data_preparation"],
+    with_neptune=False,
+    env={},
+)
diff --git a/experiments/data/prepare/wmt16en2de/prepare.sh b/experiments/data/prepare/mt/wmt16en2de/prepare.sh
similarity index 100%
rename from experiments/data/prepare/wmt16en2de/prepare.sh
rename to experiments/data/prepare/mt/wmt16en2de/prepare.sh
diff --git a/experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh b/experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8f460ceef31d2ea70b0c1a052646a54e5b5a6581
--- /dev/null
+++ b/experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh
@@ -0,0 +1,21 @@
+#! /usr/bin/env bash
+
+SRC=en
+TGT=de
+
+PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt"
+INPUT_DIR="${PROJ_DIR}/wmt16${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD"
+OUTPUT_DIR="${PROJ_DIR}/wmt16${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds"
+SCRIPTS_DIR="./scripts"
+MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/"
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/test $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/valid $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/train $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+echo "All done."
\ No newline at end of file
diff --git a/experiments/data/prepare/wmt16en2de/prepare_wmt16_en2de.py b/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de.py
similarity index 100%
rename from experiments/data/prepare/wmt16en2de/prepare_wmt16_en2de.py
rename to experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de.py
diff --git a/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py b/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py
new file mode 100644
index 0000000000000000000000000000000000000000..763547861d79c1ca4ac15cf9b183eead166e37f1
--- /dev/null
+++ b/experiments/data/prepare/mt/wmt16en2de/prepare_wmt16_en2de_combo.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from mrunner.helpers.specification_helper import create_experiments_helper
+
+base_config = {}  # {"run.job_class": "@jobs.AnyJob"}
+
+params_grid = {
+    "idx": [0],
+}
+
+exclude_fairseq = [
+    "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples"
+] + [str(path) for path in Path('fairseq').rglob('*.so')]
+
+experiments_list = create_experiments_helper(
+    experiment_name="mt_syntax",
+    project_name="mt_syntax/prepare_data",
+    base_config=base_config,
+    params_grid=params_grid,
+    script=f" ./experiments/data/prepare/mt/wmt16en2de/prepare_combo.sh ",
+    exclude=[
+        "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus",
+        ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea"
+    ] + exclude_fairseq,
+    python_path="",
+    tags=["quality-data_preparation"],
+    with_neptune=False,
+    env={},
+)
diff --git a/experiments/data/prepare/wmt18en2tr/prepare.sh b/experiments/data/prepare/mt/wmt18en2tr/prepare.sh
similarity index 100%
rename from experiments/data/prepare/wmt18en2tr/prepare.sh
rename to experiments/data/prepare/mt/wmt18en2tr/prepare.sh
diff --git a/experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh b/experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..510cce6e11881908c227ad7194fd894169f759a6
--- /dev/null
+++ b/experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh
@@ -0,0 +1,21 @@
+#! /usr/bin/env bash
+
+SRC=en
+TGT=tr
+
+PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/mt"
+INPUT_DIR="${PROJ_DIR}/wmt18${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD"
+OUTPUT_DIR="${PROJ_DIR}/wmt18${SRC}2${TGT}/sp/corpus_0606/lambo_GSD_separate_interpunction/lambo_GSD/combo_preds"
+SCRIPTS_DIR="./scripts"
+MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/"
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/test.unescaped.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/test $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/valid.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/valid $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+python3 ./scripts/prepare_combo.py $INPUT_DIR/train.unescaped.no_qst.clean.pp.dedup.norm.fix.lambo.tok_for_combo.$SRC \
+$OUTPUT_DIR/train $MODEL_DIR/model-en-gum-ud213.tar.gz
+
+echo "All done."
\ No newline at end of file
diff --git a/experiments/data/prepare/wmt18en2tr/prepare_wmt18_en2tr.py b/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr.py
similarity index 100%
rename from experiments/data/prepare/wmt18en2tr/prepare_wmt18_en2tr.py
rename to experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr.py
diff --git a/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py b/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf5ab5ef7f7088e44ee80327d1818764c34b192
--- /dev/null
+++ b/experiments/data/prepare/mt/wmt18en2tr/prepare_wmt18_en2tr_combo.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from mrunner.helpers.specification_helper import create_experiments_helper
+
+base_config = {}  # {"run.job_class": "@jobs.AnyJob"}
+
+params_grid = {
+    "idx": [0],
+}
+
+exclude_fairseq = [
+    "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples"
+] + [str(path) for path in Path('fairseq').rglob('*.so')]
+
+experiments_list = create_experiments_helper(
+    experiment_name="mt_syntax",
+    project_name="mt_syntax/prepare_data",
+    base_config=base_config,
+    params_grid=params_grid,
+    script=f" ./experiments/data/prepare/mt/wmt18en2tr/prepare_combo.sh ",
+    exclude=[
+        "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus",
+        ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea"
+    ] + exclude_fairseq,
+    python_path="",
+    tags=["quality-data_preparation"],
+    with_neptune=False,
+    env={},
+)
diff --git a/experiments/data/prepare/reasoning/hellaswag/prepare.py b/experiments/data/prepare/reasoning/hellaswag/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..8be1bb4c9d497233702ea5cae121f183d620df7a
--- /dev/null
+++ b/experiments/data/prepare/reasoning/hellaswag/prepare.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from mrunner.helpers.specification_helper import create_experiments_helper
+
+base_config = {}  # {"run.job_class": "@jobs.AnyJob"}
+
+params_grid = {
+    "idx": [0],
+}
+
+exclude_fairseq = [
+    "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples"
+] + [str(path) for path in Path('fairseq').rglob('*.so')]
+
+experiments_list = create_experiments_helper(
+    experiment_name="mt_syntax",
+    project_name="mt_syntax/prepare_data",
+    base_config=base_config,
+    params_grid=params_grid,
+    script=f" ./experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh ",
+    exclude=[
+        "docs", "data", "models", "test_comet", "build", "checkpoints", "tools",
"fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh b/experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh new file mode 100755 index 0000000000000000000000000000000000000000..cf70b1d0563a5960bb45cb43295638ca3b542a7a --- /dev/null +++ b/experiments/data/prepare/reasoning/hellaswag/prepare_combo.sh @@ -0,0 +1,17 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/reasoning" +INPUT_DIR="${PROJ_DIR}/hellaswag" +OUTPUT_DIR="${PROJ_DIR}/hellaswag/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + + +python3 ./scripts/prepare_combo_simcse_hellaswag.py $INPUT_DIR \ +$OUTPUT_DIR/hellaswag $MODEL_DIR/model-en-gum-ud213.tar.gz + + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/reasoning/winogender/prepare.py b/experiments/data/prepare/reasoning/winogender/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..5e618d18d9ab0e5f14e2b006dde9f0a8cc922076 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogender/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/reasoning/winogender/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/winogender/prepare_combo.sh b/experiments/data/prepare/reasoning/winogender/prepare_combo.sh new file mode 100755 index 0000000000000000000000000000000000000000..740f51918e54f369cbd1dd0d7d4f0ea63f30b4e5 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogender/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/reasoning" +INPUT_DIR="${PROJ_DIR}/winogender" +OUTPUT_DIR="${PROJ_DIR}/winogender/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_winogender.py $INPUT_DIR/winogender_all_sentences.tsv \ +$OUTPUT_DIR/winogender $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." 
\ No newline at end of file diff --git a/experiments/data/prepare/reasoning/winogrande/prepare.py b/experiments/data/prepare/reasoning/winogrande/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..68173f09cff5ab7f4f276ca867c2f48c8ca528f9 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogrande/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/reasoning/winogrande/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/reasoning/winogrande/prepare_combo.sh b/experiments/data/prepare/reasoning/winogrande/prepare_combo.sh new file mode 100755 index 0000000000000000000000000000000000000000..9c1cec50dd5046bcb6c502e81b55a9a93eb5e141 --- /dev/null +++ b/experiments/data/prepare/reasoning/winogrande/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/reasoning" +INPUT_DIR="${PROJ_DIR}/winogrande" +OUTPUT_DIR="${PROJ_DIR}/winogrande/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_winogrande.py $INPUT_DIR \ +$OUTPUT_DIR/winogrande $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." 
\ No newline at end of file diff --git a/experiments/data/prepare/sts_nli/mnli/prepare.py b/experiments/data/prepare/sts_nli/mnli/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..c891999a7e659413e4e96e17ba7d8eb607c4d085 --- /dev/null +++ b/experiments/data/prepare/sts_nli/mnli/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/sts_nli/mnli/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/sts_nli/mnli/prepare_combo.sh b/experiments/data/prepare/sts_nli/mnli/prepare_combo.sh new file mode 100755 index 0000000000000000000000000000000000000000..9dec42f22b615201e9b39d421233c8fcf4e0911d --- /dev/null +++ b/experiments/data/prepare/sts_nli/mnli/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/nli_sts" +INPUT_DIR="${PROJ_DIR}/mnli" +OUTPUT_DIR="${PROJ_DIR}/mnli/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_mnli.py $INPUT_DIR \ +$OUTPUT_DIR/mnli $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/sts_nli/nli/prepare.py b/experiments/data/prepare/sts_nli/nli/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..870c140a7850b45a6451e26686eda534f0ef7b65 --- /dev/null +++ b/experiments/data/prepare/sts_nli/nli/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/sts_nli/nli/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/sts_nli/nli/prepare_combo.sh b/experiments/data/prepare/sts_nli/nli/prepare_combo.sh new file mode 100755 index 0000000000000000000000000000000000000000..4a62564fa1edfa35605c1eb1cfb543be801bc1d8 --- /dev/null +++ b/experiments/data/prepare/sts_nli/nli/prepare_combo.sh @@ -0,0 +1,15 @@ +#! 
/usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/nli_sts" +INPUT_DIR="${PROJ_DIR}/nli" +OUTPUT_DIR="${PROJ_DIR}/nli/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_nli.py $INPUT_DIR/nli_for_simcse.csv \ +$OUTPUT_DIR/nli $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/sts_nli/wiki/prepare.py b/experiments/data/prepare/sts_nli/wiki/prepare.py new file mode 100644 index 0000000000000000000000000000000000000000..f36110fc220245982f881cbf66bccf81a8af6737 --- /dev/null +++ b/experiments/data/prepare/sts_nli/wiki/prepare.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from mrunner.helpers.specification_helper import create_experiments_helper + +base_config = {}#{"run.job_class": "@jobs.AnyJob"} + +params_grid = { + "idx": [0], +} + +exclude_fairseq = [ + "fairseq/docs", "fairseq/tests", "fairseq/fairseq.egg-info", "fairseq/examples" +] + [str(path) for path in Path('fairseq').rglob('*.so')] + +experiments_list = create_experiments_helper( + experiment_name="mt_syntax", + project_name="mt_syntax/prepare_data", + base_config=base_config, + params_grid=params_grid, + script=f" ./experiments/data/prepare/sts_nli/wiki/prepare_combo.sh ", + exclude=[ + "docs", "data", "models", "test_comet", "build", "checkpoints", "tools", "fairseq_data", "corpus", + ".pytest_cache", "alpacka.egg-info", "out", "__pycache__", ".idea" + ] + exclude_fairseq, + python_path="", + tags=["quality-data_preparation"], + with_neptune=False, + env={}, +) diff --git a/experiments/data/prepare/sts_nli/wiki/prepare_combo.sh b/experiments/data/prepare/sts_nli/wiki/prepare_combo.sh new file mode 100755 index 0000000000000000000000000000000000000000..3cbd62cbf19506438d57ee09b5194d87d2b00b80 --- /dev/null +++ b/experiments/data/prepare/sts_nli/wiki/prepare_combo.sh @@ -0,0 +1,15 @@ +#! /usr/bin/env bash + +SRC=de +TGT=en + +PROJ_DIR="/net/pr2/projects/plgrid/plgg_nlp/martyna/SYNTAX_DATA/nli_sts" +INPUT_DIR="${PROJ_DIR}/wiki" +OUTPUT_DIR="${PROJ_DIR}/wiki/combo_preds" +SCRIPTS_DIR="./scripts" +MODEL_DIR="/net/pr2/projects/plgrid/plgg_nlp/" + +python3 ./scripts/prepare_combo_simcse_wiki.py $INPUT_DIR/wiki1m_for_simcse.txt \ +$OUTPUT_DIR/wiki $MODEL_DIR/model-en-gum-ud213.tar.gz + +echo "All done." \ No newline at end of file diff --git a/experiments/data/prepare/wmt17en2de/prepare.sh b/experiments/data/prepare/wmt17en2de/prepare.sh deleted file mode 100755 index 912a08e35021e2a32a125c923c71907cf2da19e2..0000000000000000000000000000000000000000 --- a/experiments/data/prepare/wmt17en2de/prepare.sh +++ /dev/null @@ -1,78 +0,0 @@ -#! /usr/bin/env bash - -SRC=en -TGT=de - -PROJ_DIR="/syntax_enhanced_mt" -OUTPUT_DIR="$PROJ_DIR/data/wmt17${SRC}2${TGT}/corpus" -MOSES_DIR="$PROJ_DIR/tools/mosesdecoder" -SCRIPTS_DIR="./scripts" - -rm -rf $OUTPUT_DIR -mkdir -p $OUTPUT_DIR - -echo "Downloading WMT17 De-En. This may take a while..." -wget -nc -nv -O ${OUTPUT_DIR}/corpus.tc.de.gz \ - http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.de.gz -wget -nc -nv -O ${OUTPUT_DIR}/corpus.tc.en.gz \ - http://data.statmt.org/wmt17/translation-task/preprocessed/de-en/corpus.tc.en.gz - -echo "Downloading dev/test sets" -wget -nc -nv -O ${OUTPUT_DIR}/dev.tgz \ - http://data.statmt.org/wmt18/translation-task/preprocessed/de-en/dev.tgz - -# Extract everything -echo "Extracting all files..." 
-gunzip ${OUTPUT_DIR}/corpus.tc.${TGT}.gz ${OUTPUT_DIR}/corpus.tc.${SRC}.gz -mv ${OUTPUT_DIR}/corpus.tc.${TGT} ${OUTPUT_DIR}/train.tok.${TGT} -mv ${OUTPUT_DIR}/corpus.tc.${SRC} ${OUTPUT_DIR}/train.tok.${SRC} -mkdir -p "${OUTPUT_DIR}/dev" -tar -xvzf "${OUTPUT_DIR}/dev.tgz" -C "${OUTPUT_DIR}/dev" -cp ${OUTPUT_DIR}/dev/newstest2016.tc.${TGT} ${OUTPUT_DIR}/valid.tok.${TGT} -cp ${OUTPUT_DIR}/dev/newstest2016.tc.${SRC} ${OUTPUT_DIR}/valid.tok.${SRC} -cp ${OUTPUT_DIR}/dev/newstest2017.tc.${TGT} ${OUTPUT_DIR}/test.tok.${TGT} -cp ${OUTPUT_DIR}/dev/newstest2017.tc.${SRC} ${OUTPUT_DIR}/test.tok.${SRC} - -# Remove raw data -rm -r ${OUTPUT_DIR}/dev* - -# Tokenize data -for f in ${OUTPUT_DIR}/*.${SRC}; do - echo "Tokenizing $f..." - ${MOSES_DIR}/scripts/tokenizer/tokenizer.perl -q -l ${SRC} -threads 8 < $f > ${f%.*}.tok.${SRC} -done -for f in ${OUTPUT_DIR}/*.${TGT}; do - echo "Tokenizing $f..." - ${MOSES_DIR}/scripts/tokenizer/tokenizer.perl -q -l ${TGT} -threads 8 < $f > ${f%.*}.tok.${TGT} -done - -## Clean train corpus -f=${OUTPUT_DIR}/train.tok.${TGT} -fbase=${f%.*} -echo "Cleaning ${fbase}..." -${MOSES_DIR}/scripts/training/clean-corpus-n.perl $fbase ${SRC} ${TGT} "${fbase}.clean" 1 80 -# -# CoreNLP tokenization -for f in "${OUTPUT_DIR}/train.tok.clean.${TGT}" "${OUTPUT_DIR}/valid.tok.${TGT}" "${OUTPUT_DIR}/test.tok.${TGT}"; do - fbase=${f%.*} - echo "CoreNLP tokenizing ${fbase}..." - python3 ${SCRIPTS_DIR}/corenlp_tok.py $fbase $SRC $TGT -done - -# Learn Shared BPE -for merge_ops in 32000; do - echo "Learning BPE with merge_ops=${merge_ops}. This may take a while..." - cat "${OUTPUT_DIR}/train.tok.clean.tok.${SRC}" "${OUTPUT_DIR}/train.tok.clean.tok.${TGT}" | \ - subword-nmt learn-bpe -s $merge_ops > "${OUTPUT_DIR}/bpe.${merge_ops}" - - echo "Apply BPE with merge_ops=${merge_ops} to tokenized files..." - for lang in ${TGT} ${SRC}; do - for f in ${OUTPUT_DIR}/*tok.tok.${lang} ${OUTPUT_DIR}/train.tok.clean.tok.${lang}; do - outfile="${f%.*}.bpe.${merge_ops}.${lang}" - subword-nmt apply-bpe -c "${OUTPUT_DIR}/bpe.${merge_ops}" < $f > "${outfile}" - done - done - -done - -echo "All done." 
\ No newline at end of file diff --git a/experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py b/experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py deleted file mode 100644 index fed35c88617c1b8672209d71f3d7eaf33617952f..0000000000000000000000000000000000000000 --- a/experiments/data/prepare/wmt17en2de/prepare_wmt17_en2de.py +++ /dev/null @@ -1,54 +0,0 @@ -from mrunner.helpers.specification_helper import create_experiments_helper - -base_config = {}#{"run.job_class": "@jobs.AnyJob"} - -params_grid = { - "idx": [0], -} - -DATA_DIR = '/data_preparation/nc11de2en/corpus/_nc11de2en' - -pascal_params=f"{DATA_DIR} " \ - "--save-dir $CKPTS " \ - "--arch transformer_wmt_en_de " \ - "--dropout 0.3 " \ - "--share-all-embeddings " \ - "--optimizer adam "\ - "--adam-betas (0.9,0.997) " \ - "--adam-eps 1e-09 " \ - "--clip-norm 0.0 " \ - "--lr-scheduler inverse_sqrt " \ - "--warmup-init-lr 1e-07 " \ - "--warmup-updates 8000 " \ - "--lr 0.001 " \ - "--min-lr 1e-09 " \ - "--weight-decay 0.0 " \ - "--criterion label_smoothed_cross_entropy " \ - "--label-smoothing 0.1 " \ - "--max-tokens 2048 " \ - "--max-update 20000 " \ - "--no-progress-bar " \ - "--log-format json " \ - "--log-interval 100 " \ - "--save-interval 500000 " \ - "--save-interval-updates 500 " \ - "--keep-interval-updates 1" \ - "--best-checkpoint-metric bleu " \ - "--maximize-best-checkpoint-metric" \ -"" - -experiments_list = create_experiments_helper( - experiment_name="mt_syntax", - project_name="mt_syntax/prepare_data", - base_config=base_config, - params_grid=params_grid, - script=f" ./experiments/data/prepare/wmt17en2de/prepare.sh ", - exclude=[ - "docs", "checkpoints", "fairseq", "tools", "fairseq_data", "corpus", "examples", "tests", - ".pytest_cache", "alpacka.egg-info", "out", ".git" - ], - python_path="", - tags=["quality-data_preparation"], - with_neptune=False, - env={}, -) diff --git a/experiments/predict/vanilla/nc11de2en/predict.sh b/experiments/predict/mt/vanilla/nc11de2en/predict.sh similarity index 100% rename from experiments/predict/vanilla/nc11de2en/predict.sh rename to experiments/predict/mt/vanilla/nc11de2en/predict.sh diff --git a/experiments/predict/vanilla/nc11de2en/predict_nc11_de2en.py b/experiments/predict/mt/vanilla/nc11de2en/predict_nc11_de2en.py similarity index 100% rename from experiments/predict/vanilla/nc11de2en/predict_nc11_de2en.py rename to experiments/predict/mt/vanilla/nc11de2en/predict_nc11_de2en.py diff --git a/experiments/train/vanilla/nc11de2en/train_nc11_de2en.py b/experiments/train/mt/vanilla/nc11de2en/train_nc11_de2en.py similarity index 100% rename from experiments/train/vanilla/nc11de2en/train_nc11_de2en.py rename to experiments/train/mt/vanilla/nc11de2en/train_nc11_de2en.py diff --git a/scripts/prepare_combo.py b/scripts/prepare_combo.py new file mode 100644 index 0000000000000000000000000000000000000000..d8fd61ff10357410e47ebaa8996a9f32853cd718 --- /dev/null +++ b/scripts/prepare_combo.py @@ -0,0 +1,52 @@ +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + +def main(file_path: str, directory_to_save: str, combo_model_path:str ): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + def 
batch(iterable, n=1):
+        # Local helper; note that it shadows the batch() imported from utils.
+        l = len(iterable)
+        for ndx in range(0, l, n):
+            yield iterable[ndx:min(ndx + n, l)]
+
+    with open(file_path, 'r') as file:
+        data = file.readlines()
+        for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE))):
+            # Each input line holds a Python-list repr of a pre-tokenized sentence, so it
+            # is eval'd back into a list. The files are locally produced, trusted input;
+            # ast.literal_eval would be the safer choice for untrusted data.
+            preds = predictor([eval(x.replace('\n', '')) for x in batch_data])
+            for i, pred in enumerate(preds):
+                rel_dist = pred.relation_distribution
+                label_dist = pred.relation_label_distribution
+                rel_dist_softmax = softmax(rel_dist, axis=-1)
+                label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1)
+
+                npz_order = str(batch_no * BATCH_SIZE + i + 1)
+
+                np.savez(
+                    os.path.join(directory_to_save, npz_order + '.npz'),
+                    relation_distribution=rel_dist,
+                    relation_distribution_trimmed=rel_dist[1:, 1:],
+                    relation_distribution_softmax=rel_dist_softmax,
+                    relation_distribution_trimmed_softmax=rel_dist_softmax[1:, 1:],
+                    relation_label_distribution=label_dist,
+                    relation_label_distribution_softmax=label_dist_softmax
+                )
+
+
+if __name__ == '__main__':
+    argv = sys.argv[1:]
+    file_path = argv[0]
+    directory_to_save = argv[1]
+    combo_model_path = argv[2]
+    main(file_path, directory_to_save, combo_model_path)
diff --git a/scripts/prepare_combo_simcse_hellaswag.py b/scripts/prepare_combo_simcse_hellaswag.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ff68f753a231ce0014e257d9d5a9062712b389
--- /dev/null
+++ b/scripts/prepare_combo_simcse_hellaswag.py
@@ -0,0 +1,64 @@
+import json
+import math
+import os
+import sys
+
+from combo.predict import COMBO
+from scipy.special import softmax
+import numpy as np
+from tqdm import tqdm
+from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch
+
+
+BATCH_SIZE = 512
+
+
+def main(file_path: str, directory_to_save: str, combo_model_path: str):
+    '''Prepare the npz files with the dependency parsing relation matrices predicted by the COMBO model.'''
+
+    predictor = initialize_with_combo(combo_model_path)
+    os.makedirs(directory_to_save, exist_ok=True)
+
+    files = [f for f in os.listdir(file_path) if 'jsonl' in f]
+
+    for jsonl_file in files:
+
+        with open(os.path.join(file_path, jsonl_file), 'r') as file:
+
+            data = list(file)
+            for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)):
+
+                jsons = [json.loads(x) for x in batch_data]
+
+                ctx_a = [x['ctx_a'] for x in jsons]
+                ctx_b = [x['ctx_b'] for x in jsons]
+                endings = [x['endings'] for x in jsons]
+
+                sentences2 = []
+                for ctx_b_sentence, endings_list in zip(ctx_b, endings):
+                    for ending in endings_list:
+                        sentences2.append(ctx_b_sentence + ' ' + ending)
+
+                pred_sentences1 = predictor(ctx_a)
+                pred_sentences2 = predictor(sentences2)
+
+                # Regroup the flat list of ending predictions into one group of four
+                # per context (HellaSwag has four candidate endings per example).
+                pred_sentences2 = [pred_sentences2[i:i + 4] for i in range(0, len(pred_sentences2), 4)]
+
+                for i, (sent1, sent2) in enumerate(zip(pred_sentences1, pred_sentences2)):
+                    pred_sentences1_npz = get_attributes_of_sentence(sent1)
+                    endings_attributes = [get_attributes_of_sentence(x) for x in sent2]
+
+                    npz_order = str(batch_no * BATCH_SIZE + i + 1)
+
+                    save_npz_to_directory(pred_sentences1_npz, 'pred_sentences1', os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order)
+
+                    for j, ending_attributes in enumerate(endings_attributes):
+                        save_npz_to_directory(ending_attributes, 'pred_endings_{}'.format(j), os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order)
+
+
+if __name__ == '__main__':
+    argv = sys.argv[1:]
+    file_path = argv[0]
+    directory_to_save = argv[1]
+ combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_mnli.py b/scripts/prepare_combo_simcse_mnli.py new file mode 100644 index 0000000000000000000000000000000000000000..781ac1939c7717c7388fec2102f3ad0ef4012bce --- /dev/null +++ b/scripts/prepare_combo_simcse_mnli.py @@ -0,0 +1,54 @@ +import json +import math +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + + +def main(file_path:str, directory_to_save:str, combo_model_path:str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + files = [f for f in os.listdir(file_path) if 'jsonl' in f] + + for jsonl_file in files: + + with open(os.path.join(file_path, jsonl_file), 'r') as file: + data = list(file) + for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)): + + jsons = [json.loads(x) for x in batch_data] + + sentences1 = [x['sentence1'] for x in jsons] + sentences2 = [x['sentence2'] for x in jsons] + + pred_sentences1 = predictor(sentences1) + pred_sentences2 = predictor(sentences2) + + for i, (sent1, sent2) in enumerate(zip(pred_sentences1, pred_sentences2)): + pred_sentences1_npz = get_attributes_of_sentence(sent1) + pred_sentences2_npz = get_attributes_of_sentence(sent2) + + npz_order = str(batch_no * BATCH_SIZE + i + 1) + + save_npz_to_directory(pred_sentences1_npz, 'pred_sentences1', directory_to_save + jsonl_file.replace('.jsonl', ''), npz_order) + save_npz_to_directory(pred_sentences2_npz, 'pred_sentences2', directory_to_save + jsonl_file.replace('.jsonl', ''), npz_order) + + +if __name__ == '__main__': + argv = sys.argv[1:] + file_path = argv[0] + directory_to_save = argv[1] + combo_model_path = argv[2] + main(file_path, directory_to_save, combo_model_path) diff --git a/scripts/prepare_combo_simcse_nli.py b/scripts/prepare_combo_simcse_nli.py new file mode 100644 index 0000000000000000000000000000000000000000..9c7707e281eadf73480759676345aac577537c6d --- /dev/null +++ b/scripts/prepare_combo_simcse_nli.py @@ -0,0 +1,72 @@ +import math +import os +import sys + +from combo.predict import COMBO +from scipy.special import softmax +import numpy as np +from tqdm import tqdm +import pandas as pd +from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch + + +BATCH_SIZE = 256 + +def main(file_path: str, directory_to_save: str, combo_model_path: str): + '''Main function to prepare the npz files with dependency parsing relation matrix for the combo model.''' + + + predictor = initialize_with_combo(combo_model_path) + os.makedirs(directory_to_save, exist_ok=True) + + sent0 = 'sent0' + sent1 = 'sent1' + hard_neg = 'hard_neg' + + def get_attributes_of_sentence(sentence): + rel_dist = sentence.relation_distribution + label_dist = sentence.relation_label_distribution + rel_dist_softmax = softmax(rel_dist, axis=-1) + label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1) + return { + 'relation_distribution': rel_dist, + 'relation_distribution_trimmed': rel_dist[1:, 1:], + 'relation_distribution_softmax': rel_dist_softmax, + 'relation_distribution_trimmed_softmax': 
rel_dist_softmax[1:, 1:],
+            'relation_label_distribution': label_dist,
+            'relation_label_distribution_softmax': label_dist_softmax
+        }
+
+    # (This local helper mirrors save_npz_to_directory in scripts/utils.py.)
+    def save_npz_to_directory(npz, npz_name, directory, npz_order):
+        os.makedirs(os.path.join(directory, npz_order), exist_ok=True)
+        np.savez(
+            os.path.join(directory, npz_order, npz_name + '.npz'),
+            **npz
+        )
+
+    data = pd.read_csv(file_path, sep=',', header=0)
+    for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)):
+        # Resume point of a previous partial run; drop or adjust to start from scratch.
+        if batch_no < 196:
+            continue
+        preds_sent0 = predictor([x.replace('\n', '') for x in batch_data[sent0]])
+        preds_sent1 = predictor([x.replace('\n', '') for x in batch_data[sent1]])
+        preds_hard_neg = predictor([x.replace('\n', '') for x in batch_data[hard_neg]])
+        for i, (pred0, pred1, pred_hard_neg) in enumerate(zip(preds_sent0, preds_sent1, preds_hard_neg)):
+            pred0_npz = get_attributes_of_sentence(pred0)
+            pred1_npz = get_attributes_of_sentence(pred1)
+            pred_hard_neg_npz = get_attributes_of_sentence(pred_hard_neg)
+
+            npz_order = str(batch_no * BATCH_SIZE + i + 1)
+
+            save_npz_to_directory(pred0_npz, 'pred0', directory_to_save, npz_order)
+            save_npz_to_directory(pred1_npz, 'pred1', directory_to_save, npz_order)
+            save_npz_to_directory(pred_hard_neg_npz, 'pred_hard_neg', directory_to_save, npz_order)
+
+
+if __name__ == '__main__':
+    argv = sys.argv[1:]
+    file_path = argv[0]
+    directory_to_save = argv[1]
+    combo_model_path = argv[2]
+    main(file_path, directory_to_save, combo_model_path)
diff --git a/scripts/prepare_combo_simcse_wiki.py b/scripts/prepare_combo_simcse_wiki.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8e695caed5b2fa47e54e4d3ff42692bdd16e9e6
--- /dev/null
+++ b/scripts/prepare_combo_simcse_wiki.py
@@ -0,0 +1,53 @@
+import os
+import sys
+
+from combo.predict import COMBO
+from scipy.special import softmax
+import numpy as np
+from tqdm import tqdm
+from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch
+
+
+BATCH_SIZE = 256
+
+def main(file_path: str, directory_to_save: str, combo_model_path: str):
+    '''Prepare the npz files with the dependency parsing relation matrices predicted by the COMBO model.'''
+
+    predictor = initialize_with_combo(combo_model_path)
+    os.makedirs(directory_to_save, exist_ok=True)
+
+    with open(file_path, 'r') as file:
+        data = file.readlines()
+        for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE))):
+            # Resume point of a previous partial run; drop or adjust to start from scratch.
+            if batch_no < 3622:
+                continue
+            try:
+                preds = predictor([x.replace('\n', '') for x in batch_data])
+            except Exception:
+                # Skip batches that COMBO fails to parse.
+                continue
+
+            for i, pred in enumerate(preds):
+                rel_dist = pred.relation_distribution
+                label_dist = pred.relation_label_distribution
+                rel_dist_softmax = softmax(rel_dist, axis=-1)
+                label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1)
+
+                npz_order = str(batch_no * BATCH_SIZE + i + 1)
+
+                np.savez(
+                    os.path.join(directory_to_save, npz_order + '.npz'),
+                    relation_distribution=rel_dist,
+                    relation_distribution_trimmed=rel_dist[1:, 1:],
+                    relation_distribution_softmax=rel_dist_softmax,
+                    relation_distribution_trimmed_softmax=rel_dist_softmax[1:, 1:],
+                    relation_label_distribution=label_dist,
+                    relation_label_distribution_softmax=label_dist_softmax
+                )
+
+
+if __name__ == '__main__':
+    argv = sys.argv[1:]
+    file_path = argv[0]
+    directory_to_save = argv[1]
+    combo_model_path = argv[2]
+    main(file_path, directory_to_save, combo_model_path)
diff --git a/scripts/prepare_combo_simcse_winogender.py b/scripts/prepare_combo_simcse_winogender.py
new file mode 100644
index 0000000000000000000000000000000000000000..76b513b9929e13ede89e51e509ff903a89af94c3
--- /dev/null
+++ b/scripts/prepare_combo_simcse_winogender.py
@@ -0,0 +1,45 @@
+import json
+import math
+import os
+import sys
+
+import pandas as pd
+from combo.predict import COMBO
+from scipy.special import softmax
+import numpy as np
+from tqdm import tqdm
+from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch
+
+
+BATCH_SIZE = 256
+
+def main(file_path: str, directory_to_save: str, combo_model_path: str):
+    '''Prepare the npz files with the dependency parsing relation matrices predicted by the COMBO model.'''
+
+    predictor = initialize_with_combo(combo_model_path)
+    os.makedirs(directory_to_save, exist_ok=True)
+
+    sentence = 'sentence'
+
+    data = pd.read_csv(file_path, sep='\t', header=0)
+    for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)):
+        preds_sent0 = predictor([x.replace('\n', '') for x in batch_data[sentence]])
+        for i, pred in enumerate(preds_sent0):
+            pred0_npz = get_attributes_of_sentence(pred)
+
+            npz_order = str(batch_no * BATCH_SIZE + i + 1)
+
+            save_npz_to_directory(pred0_npz, sentence, directory_to_save, npz_order)
+
+if __name__ == '__main__':
+    argv = sys.argv[1:]
+    file_path = argv[0]
+    directory_to_save = argv[1]
+    combo_model_path = argv[2]
+    main(file_path, directory_to_save, combo_model_path)
diff --git a/scripts/prepare_combo_simcse_winogrande.py b/scripts/prepare_combo_simcse_winogrande.py
new file mode 100644
index 0000000000000000000000000000000000000000..19b0774053765e5174aed5b6a2d43fb0a318b24a
--- /dev/null
+++ b/scripts/prepare_combo_simcse_winogrande.py
@@ -0,0 +1,51 @@
+import json
+import math
+import os
+import sys
+
+from combo.predict import COMBO
+from scipy.special import softmax
+import numpy as np
+from tqdm import tqdm
+from utils import initialize_with_combo, predict_with_combo, get_attributes_of_sentence, save_npz_to_directory, batch
+
+
+BATCH_SIZE = 256
+
+
+def main(file_path: str, directory_to_save: str, combo_model_path: str):
+    '''Prepare the npz files with the dependency parsing relation matrices predicted by the COMBO model.'''
+
+    predictor = initialize_with_combo(combo_model_path)
+    os.makedirs(directory_to_save, exist_ok=True)
+
+    files = [f for f in os.listdir(file_path) if 'jsonl' in f]
+
+    for jsonl_file in files:
+
+        with open(os.path.join(file_path, jsonl_file), 'r') as file:
+            data = list(file)
+            for batch_no, batch_data in tqdm(enumerate(batch(data, BATCH_SIZE)), total=math.ceil(len(data) / BATCH_SIZE)):
+
+                jsons = [json.loads(x) for x in batch_data]
+
+                sentences = [x['sentence'] for x in jsons]
+
+                pred_sentences = predictor(sentences)
+
+                for i, sent1 in enumerate(pred_sentences):
+                    pred_sentences1_npz = get_attributes_of_sentence(sent1)
+
+                    npz_order = str(batch_no * BATCH_SIZE + i + 1)
+
+                    # os.path.join keeps the split name as a subdirectory (as in the hellaswag script).
+                    save_npz_to_directory(pred_sentences1_npz, 'pred_sentences', os.path.join(directory_to_save, jsonl_file.replace('.jsonl', '')), npz_order)
+
+
+if __name__ == '__main__':
+    argv = sys.argv[1:]
+    file_path = argv[0]
+    directory_to_save = argv[1]
+    combo_model_path = argv[2]
+    main(file_path, directory_to_save, combo_model_path)
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..383f53ae000af75478ee1378f93c22e640faebe0
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,40 @@
+import os
+from typing import Iterable, List
+
+import numpy as np
+from scipy.special import softmax
+
+from combo.predict import COMBO
+# NOTE: the import path of Sentence may differ between COMBO releases; adjust if needed.
+from combo.data import Sentence
+
+
+def initialize_with_combo(combo_model_path: str) -> COMBO:
+    '''Initialize a COMBO model from a pretrained model archive.'''
+    return COMBO.from_pretrained(combo_model_path, cuda_device=0)
+
+
+def predict_with_combo(predictor: COMBO, text: str) -> List[Sentence]:
+    '''Run COMBO prediction on the given text.'''
+    return predictor(text)
+
+
+def get_attributes_of_sentence(sentence: Sentence) -> dict:
+    '''Get the relation (head) distribution and relation label distribution of a
+    sentence, together with their softmaxed variants. The trimmed variants drop
+    the first row and column, which correspond to the artificial ROOT position.'''
+    rel_dist = sentence.relation_distribution
+    label_dist = sentence.relation_label_distribution
+    rel_dist_softmax = softmax(rel_dist, axis=-1)
+    label_dist_softmax = softmax(label_dist[1:, 1:], axis=-1)
+    return {
+        'relation_distribution': rel_dist,
+        'relation_distribution_trimmed': rel_dist[1:, 1:],
+        'relation_distribution_softmax': rel_dist_softmax,
+        'relation_distribution_trimmed_softmax': rel_dist_softmax[1:, 1:],
+        'relation_label_distribution': label_dist,
+        'relation_label_distribution_softmax': label_dist_softmax
+    }
+
+
+def save_npz_to_directory(npz: dict, npz_name: str, directory: str, npz_order: str):
+    '''Save a dictionary of arrays as an .npz file in directory/npz_order/.'''
+    os.makedirs(os.path.join(directory, npz_order), exist_ok=True)
+    np.savez(
+        os.path.join(directory, npz_order, npz_name + '.npz'),
+        **npz
+    )
+
+
+def batch(iterable: Iterable, n: int = 1):
+    '''Yield successive chunks of size n from an indexable iterable.'''
+    l = len(iterable)
+    for ndx in range(0, l, n):
+        yield iterable[ndx:min(ndx + n, l)]
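As a usage note for the files produced above: each saved `.npz` can be loaded back with `numpy.load`; a minimal sketch (the path below is a placeholder for one of the generated `combo_preds` files):

```python
import numpy as np

# Each .npz bundles the raw and softmaxed distributions written by the scripts above.
arrays = np.load("combo_preds/1.npz")
print(arrays.files)
# ['relation_distribution', 'relation_distribution_trimmed', ...]

# Trimmed + softmaxed variant: the ROOT row/column has been dropped, so row i
# is the (approximate) head distribution for the (i+1)-th token of the sentence.
heads = arrays["relation_distribution_trimmed_softmax"]
print(heads.shape)
```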