From 7d793d41138abe1ac3f8822d7d2a6b43b8f93e39 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Sat, 15 Jan 2022 15:14:03 +0100
Subject: [PATCH] Version update

---
 README.md                         | 27 +++++----
 combo/config.template.jsonnet     | 47 ++++++---------
 combo/data/dataset.py             | 23 ++------
 combo/main.py                     |  3 -
 combo/predict.py                  | 14 +----
 docs/installation.md              |  6 +-
 scripts/predict_ud_transformer.py | 63 --------------------
 scripts/train_ud_transformer.py   | 96 -------------------------------
 setup.py                          |  2 +-
 9 files changed, 44 insertions(+), 237 deletions(-)
 delete mode 100644 scripts/predict_ud_transformer.py
 delete mode 100644 scripts/train_ud_transformer.py

diff --git a/README.md b/README.md
index 76e436b..e758bc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Clone this repository and install COMBO (we suggest creating a virtualenv/conda
 environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
@@ -42,21 +42,24 @@ We encourage you to use the [beginner's tutorial](https://colab.research.google.
 
 ## Citing
 
-### Accepted at EMNLP'21 demo session :tada: :fire:
-
-If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://arxiv.org/abs/2109.05361)
+If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://aclanthology.org/2021.emnlp-demo.7)
 ```bibtex
-@misc{klimaszewski2021combo,
-      title={COMBO: State-of-the-Art Morphosyntactic Analysis},
-      author={Mateusz Klimaszewski and Alina Wróblewska},
-      year={2021},
-      eprint={2109.05361},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
+@inproceedings{klimaszewski-wroblewska-2021-combo-state,
+    title = "{COMBO}: State-of-the-Art Morphosyntactic Analysis",
+    author = "Klimaszewski, Mateusz  and
+      Wr{\'o}blewska, Alina",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-demo.7",
+    pages = "50--62",
+    abstract = "We introduce COMBO {--} a fully neural NLP system for accurate part-of-speech tagging, morphological analysis, lemmatisation, and (enhanced) dependency parsing. It predicts categorical morphosyntactic features whilst also exposes their vector representations, extracted from hidden layers. COMBO is an easy to install Python package with automatically downloadable pre-trained models for over 40 languages. It maintains a balance between efficiency and quality. As it is an end-to-end system and its modules are jointly trained, its training is competitively fast. As its models are optimised for accuracy, they achieve often better prediction quality than SOTA. The COMBO library is available at: https://gitlab.clarin-pl.eu/syntactic-tools/combo.",
 }
 ```
 
-If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16/)
+If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16)
 ```bibtex
 @inproceedings{klimaszewski-wroblewska-2021-combo,
     title = "{COMBO}: A New Module for {EUD} Parsing",
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index 53013ef..4e44f42 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -77,11 +77,6 @@ local in_features(name) = !(std.length(std.find(name, features)) == 0);
 local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
 local use_transformer = pretrained_transformer_name != null;
 
-# Transformer encoder options
-local use_transformer_encoder = if std.length(std.extVar("use_transformer_encoder")) == "True" then true else false;
-local num_layers_transformer_encoder = 6;
-local num_attention_heads = 8;
-
 # Verify some configuration requirements
 assert in_features("token"): "Key 'token' must be in features!";
 assert in_features("char"): "Key 'char' must be in features!";
@@ -257,32 +252,22 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
       },
     },
     loss_weights: loss_weights,
-    seq_encoder: if use_transformer_encoder then {
-      type: "pytorch_transformer",
-      input_dim: (char_dim + projected_embedding_dim +
-                  (if in_features('xpostag') then xpostag_dim else 0) +
-                  (if in_features('lemma') then lemma_char_dim else 0) +
-                  (if in_features('upostag') then upostag_dim else 0) +
-                  (if in_features('feats') then feats_dim else 0)),
-      num_layers: num_layers_transformer_encoder,
-      feedforward_hidden_dim: hidden_size,
-      num_attention_heads: num_attention_heads,
-      positional_encoding: "sinusoidal"} else {
-      type: "combo_encoder",
-      layer_dropout_probability: 0.33,
-      stacked_bilstm: {
-        input_size:
-          (char_dim + projected_embedding_dim +
-          (if in_features('xpostag') then xpostag_dim else 0) +
-          (if in_features('lemma') then lemma_char_dim else 0) +
-          (if in_features('upostag') then upostag_dim else 0) +
-          (if in_features('feats') then feats_dim else 0)),
-        hidden_size: hidden_size,
-        num_layers: num_layers,
-        recurrent_dropout_probability: 0.33,
-        layer_dropout_probability: 0.33
-      }
+    seq_encoder: {
+      type: "combo_encoder",
+      layer_dropout_probability: 0.33,
+      stacked_bilstm: {
+        input_size:
+          (char_dim + projected_embedding_dim +
+          (if in_features('xpostag') then xpostag_dim else 0) +
+          (if in_features('lemma') then lemma_char_dim else 0) +
+          (if in_features('upostag') then upostag_dim else 0) +
+          (if in_features('feats') then feats_dim else 0)),
+        hidden_size: hidden_size,
+        num_layers: num_layers,
+        recurrent_dropout_probability: 0.33,
+        layer_dropout_probability: 0.33
       },
+    },
     dependency_relation: {
       type: "combo_dependency_parsing_from_vocab",
       vocab_namespace: 'deprel_labels',
@@ -404,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/dataset.py b/combo/data/dataset.py
index 4b0352a..bdc8b20 100644
--- a/combo/data/dataset.py
+++ b/combo/data/dataset.py
@@ -8,7 +8,7 @@ import torch
 from allennlp import data as allen_data
 from allennlp.common import checks, util
 from allennlp.data import fields as allen_fields, vocabulary
-from conllu import parser, TokenList
+from conllu import parser
 from dataclasses import dataclass
 from overrides import overrides
 
@@ -27,7 +27,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
             features: List[str] = None,
             targets: List[str] = None,
             use_sem: bool = False,
-            max_input_embedder: int = None,
             **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -49,7 +48,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                 "Remove {} from either features or targets.".format(intersection)
             )
         self.use_sem = use_sem
-        self.max_input_embedder = max_input_embedder
 
         # *.conllu readers configuration
         fields = list(parser.DEFAULT_FIELDS)
@@ -90,16 +88,13 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
 
     @overrides
     def text_to_instance(self, tree: conllu.TokenList) -> allen_data.Instance:
-        if self.max_input_embedder:
-            tree = TokenList(tokens = tree.tokens[: self.max_input_embedder],
-                             metadata = tree.metadata)
         fields_: Dict[str, allen_data.Field] = {}
         tree_tokens = [t for t in tree if isinstance(t["id"], int)]
         tokens = [_Token(t["token"],
-                        pos_=t.get("upostag"),
-                        tag_=t.get("xpostag"),
-                        lemma_=t.get("lemma"),
-                        feats_=t.get("feats"))
+                         pos_=t.get("upostag"),
+                         tag_=t.get("xpostag"),
+                         lemma_=t.get("lemma"),
+                         feats_=t.get("feats"))
                   for t in tree_tokens]
 
         # features
@@ -122,11 +117,7 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                                                                      text_field,
                                                                      label_namespace="feats_labels")
                 elif target_name == "head":
-                    if self.max_input_embedder:
-                        target_values = [0 if v == "_" else int(v) for v in target_values]
-                        target_values = [v for v in target_values if v < self.max_input_embedder]
-                    else:
-                        target_values = [0 if v == "_" else int(v) for v in target_values]
+                    target_values = [0 if v == "_" else int(v) for v in target_values]
                     fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
                                                                            label_namespace=target_name + "_labels")
                 elif target_name == "deps":
@@ -139,8 +130,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                     t_deps = t["deps"]
                     if t_deps and t_deps != "_":
                         for rel, head in t_deps:
-                            if int(head) >= self.max_input_embedder:
-                                continue
                             # EmoryNLP skips the first edge, if there are two edges between the same
                             # nodes. Thanks to that one is in a tree and another in a graph.
                             # This snippet follows that approach.
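The hunks above drop the reader-level truncation, leaving `text_to_instance` with a plain id filter. Below is a minimal standalone sketch (not part of the patch) of that filter, exercising the `conllu` package directly on an invented toy sentence; note that COMBO's reader renames the default `form` column to `token`, which is why the diff reads `t["token"]` where this sketch uses `t["form"]`.

```python
# Multiword-token ids such as "1-2" parse to tuples in the conllu package,
# so isinstance(t["id"], int) keeps only plain syntactic words.
import conllu

rows = [
    ["1-2", "vámonos", "_", "_", "_", "_", "_", "_", "_", "_"],  # multiword token
    ["1", "vamos", "ir", "VERB", "_", "_", "0", "root", "_", "_"],
    ["2", "nos", "nosotros", "PRON", "_", "_", "1", "obj", "_", "_"],
]
tree = conllu.parse("\n".join("\t".join(r) for r in rows))[0]

words = [t for t in tree if isinstance(t["id"], int)]
print([(t["id"], t["form"]) for t in words])  # [(1, 'vamos'), (2, 'nos')]
```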
diff --git a/combo/main.py b/combo/main.py
index 10b04ef..d1e0292 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -57,8 +57,6 @@ flags.DEFINE_string(name="serialization_dir", default=None,
                     help="Model serialization directory (default - system temp dir).")
 flags.DEFINE_boolean(name="tensorboard", default=False,
                      help="When provided model will log tensorboard metrics.")
-flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
-                     help="Indicator whether to use transformer encoder or BiLSTM (default)")
 
 # Finetune after training flags
 flags.DEFINE_list(name="finetuning_training_data_path", default="",
@@ -199,7 +197,6 @@ def _get_ext_vars(finetuning: bool = False) -> Dict:
         "num_epochs": str(FLAGS.num_epochs),
         "word_batch_size": str(FLAGS.word_batch_size),
         "use_tensorboard": str(FLAGS.tensorboard),
-        "use_transformer_encoder": str(FLAGS.use_transformer_encoder)
     }
 
 
diff --git a/combo/predict.py b/combo/predict.py
index b235389..01a0837 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -228,8 +228,7 @@ class COMBO(predictor.Predictor):
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
                         batch_size: int = 1024,
-                        cuda_device: int = -1,
-                        max_input_embedder: int = None):
+                        cuda_device: int = -1):
         util.import_module_and_submodules("combo.commands")
         util.import_module_and_submodules("combo.models")
         util.import_module_and_submodules("combo.training")
@@ -246,13 +245,6 @@ class COMBO(predictor.Predictor):
 
         archive = models.load_archive(model_path, cuda_device=cuda_device)
         model = archive.model
-        dataset_reader = allen_data.DatasetReader.from_params(archive.config["dataset_reader"],
-                                                              max_input_embedder = max_input_embedder)
-
-        logger.info("Using pretrained transformer embedder may require truncating tokenized sentences.")
-        if max_input_embedder:
-            logger.info(f"Currently they are truncated to {max_input_embedder} first tokens")
-        else:
-            logger.info("Currently they are not truncated")
-
+        dataset_reader = allen_data.DatasetReader.from_params(
+            archive.config["dataset_reader"])
         return cls(model, dataset_reader, tokenizer, batch_size)
diff --git a/docs/installation.md b/docs/installation.md
index 422bed2..6354582 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@ Clone this repository and install COMBO (we suggest using virtualenv/conda with
 Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/scripts/predict_ud_transformer.py b/scripts/predict_ud_transformer.py
deleted file mode 100644
index c24832c..0000000
--- a/scripts/predict_ud_transformer.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pathlib
-
-from absl import app
-from absl import flags
-
-from scripts import utils
-
-LANG = ["Polish"]
-TREEBANKS = {"Polish" : "UD_Polish-PDB"}
-
-FLAGS = flags.FLAGS
-flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo",
-                    help="Path to data directory.")
-flags.DEFINE_string(name="models_dir", default="/home/pszenny/combo/tmp/",
-                    help="Model serialization dir.")
-flags.DEFINE_integer(name="cuda_device", default=-1,
-                     help="Cuda device id (-1 for cpu).")
-flags.DEFINE_boolean(name="expect_prefix", default=True,
-                     help="Whether to expect allennlp prefix.")
-flags.DEFINE_integer(name="batch_size", default=32,
-                     help="Batch size.")
-
-def run(_):
-    for encoder in ["BiLSTM","transformer"]:
-        models_dir = pathlib.Path(FLAGS.models_dir)
-        for model_dir in models_dir.iterdir():
-            lang = model_dir.name
-            if lang not in LANG:
-                print("Skipping unknown directory: ", lang)
-                continue
-
-            if FLAGS.expect_prefix:
-                model_dir = pathlib.Path(models_dir) / (lang + "/" + encoder + "/")
-                model_dir = list(model_dir.iterdir())
-                assert len(model_dir) == 1, f"There is incorrect count of models {model_dir}"
-                model_dir = model_dir[0]
-
-            data_dir = pathlib.Path(FLAGS.data_dir)
-            data_dir = data_dir / TREEBANKS[lang]
-            files = list(data_dir.iterdir())
-            test_file = [f for f in files if "test" in f.name and ".conllu" in f.name]
-            assert len(test_file) == 1, f"Couldn't find training file."
-            test_file = test_file[0]
-
-            output_pred = data_dir / f'{lang}_pred.conllu'
-            command = f"""combo --mode predict --model_path {model_dir}
-                      --input_file {test_file}
-                      --output_file {output_pred}
-                      --cuda_device {FLAGS.cuda_device}
-                      --batch_size {FLAGS.batch_size}
-                      --silent
-                      """
-            utils.execute_command(command)
-    return 1
-
-
-
-def main():
-    app.run(run)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/train_ud_transformer.py b/scripts/train_ud_transformer.py
deleted file mode 100644
index 9ecabae..0000000
--- a/scripts/train_ud_transformer.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""Script to train Dependency Parsing models based on UD 2.x data."""
-import pathlib
-
-from absl import app
-from absl import flags
-
-from scripts import utils
-
-# # ls -1 | xargs -i echo "\"{}\","
-# UD 2.7
-TREEBANKS = ["UD_Polish-PDB"]
-embedding_model = "allegro/herbert-base-cased"
-
-FLAGS = flags.FLAGS
-flags.DEFINE_list(name="treebanks", default=TREEBANKS,
-                  help=f"Treebanks to train. Possible values: {TREEBANKS}.")
-flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo/",
-                    help="Path to UD data directory.")
-flags.DEFINE_string(name="serialization_dir", default="/home/pszenny/combo/tmp/",
-                    help="Model serialization directory.")
-flags.DEFINE_integer(name="cuda_device", default=-1,
-                     help="Cuda device id (-1 for cpu).")
-flags.DEFINE_string(name="train_config_path", default="/home/pszenny/combo/combo/config.template.jsonnet",
-                    help="Directory of jsonnet config file")
-flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
-                     help="Indicator whether to use transformer encoder or BiLSTM (default)")
-
-def run(_):
-    treebanks_dir = pathlib.Path(FLAGS.data_dir)
-
-    for treebank in FLAGS.treebanks:
-        assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
-        treebank_dir = treebanks_dir / treebank
-        treebank_parts = treebank[3:].split("-")
-        language = treebank_parts[0]
-
-        files = list(treebank_dir.iterdir())
-
-        training_file = [f for f in files if "train" in f.name and ".conllu" in f.name]
-        assert len(training_file) == 1, f"Couldn't find training file."
-        training_file_path = training_file[0]
-
-        valid_file = [f for f in files if "dev" in f.name and ".conllu" in f.name]
-        assert len(valid_file) == 1, f"Couldn't find validation file."
-        valid_file_path = valid_file[0]
-
-        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/BiLSTM/")
-        serialization_dir.mkdir(exist_ok=True, parents=True)
-
-        word_batch_size = 2500
-
-        command = f"""time combo --mode train
-            --cuda_device {FLAGS.cuda_device}
-            --training_data_path {training_file_path}
-            --validation_data_path {valid_file_path}
-            --pretrained_transformer_name {embedding_model}
-            --serialization_dir {serialization_dir}
-            --use_transformer_encoder {FLAGS.use_transformer_encoder}
-            --config_path {FLAGS.train_config_path}
-            --notensorboard
-            --word_batch_size {word_batch_size}
-            --targets deprel,head,upostag,lemma,feats,xpostag
-            """
-
-        utils.execute_command(command)
-
-        FLAGS.use_transformer_encoder = True
-        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/transformer/")
-        serialization_dir.mkdir(exist_ok=True, parents=True)
-
-        command = f"""time combo --mode train
-            --cuda_device {FLAGS.cuda_device}
-            --training_data_path {training_file_path}
-            --validation_data_path {valid_file_path}
-            --pretrained_transformer_name {embedding_model}
-            --serialization_dir {serialization_dir}
-            --use_transformer_encoder {FLAGS.use_transformer_encoder}
-            --config_path {FLAGS.train_config_path}
-            --notensorboard
-            --word_batch_size {word_batch_size}
-            --targets deprel,head,upostag,lemma,feats,xpostag
-            """
-
-        utils.execute_command(command)
-
-
-
-
-
-
-def main():
-    app.run(run)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/setup.py b/setup.py
index 876909d..0e28601 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
-- 
GitLab
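As a quick smoke test of the bumped release, the sketch below mirrors the README quick start; it is not part of the patch, and `polish-herbert-base` is one of the downloadable pre-trained model names from COMBO's docs.

```python
# After this patch, from_pretrained() no longer accepts the removed
# max_input_embedder keyword, and the reader no longer truncates sentences.
from combo.predict import COMBO

nlp = COMBO.from_pretrained("polish-herbert-base", cuda_device=-1)
sentence = nlp("Witaj, świecie!")
print(sentence.tokens)
```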