Commit 7d793d41 authored by Łukasz Pszenny

Version update

parent 03bc35b2
1 merge request: !39 Develop
@@ -13,7 +13,7 @@
Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a number of required packages will be installed):
```bash
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
```
Run the following commands in your Python console to make predictions with a pre-trained model:
```python
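# NOTE: the original example is collapsed in this diff view; the lines below
# are a minimal sketch based on the COMBO predictor API, and the model name
# "polish-herbert-base" is illustrative.
from combo.predict import COMBO

nlp = COMBO.from_pretrained("polish-herbert-base")
sentence = nlp("Moje zdanie.")
print(sentence.tokens)
```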
@@ -42,21 +42,24 @@ We encourage you to use the [beginner's tutorial](https://colab.research.google.
## Citing
### Accepted at EMNLP'21 demo session :tada: :fire:
If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://arxiv.org/abs/2109.05361)
If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://aclanthology.org/2021.emnlp-demo.7)
```bibtex
@misc{klimaszewski2021combo,
title={COMBO: State-of-the-Art Morphosyntactic Analysis},
author={Mateusz Klimaszewski and Alina Wróblewska},
year={2021},
eprint={2109.05361},
archivePrefix={arXiv},
primaryClass={cs.CL}
@inproceedings{klimaszewski-wroblewska-2021-combo-state,
title = "{COMBO}: State-of-the-Art Morphosyntactic Analysis",
author = "Klimaszewski, Mateusz and
Wr{\'o}blewska, Alina",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-demo.7",
pages = "50--62",
abstract = "We introduce COMBO {--} a fully neural NLP system for accurate part-of-speech tagging, morphological analysis, lemmatisation, and (enhanced) dependency parsing. It predicts categorical morphosyntactic features whilst also exposes their vector representations, extracted from hidden layers. COMBO is an easy to install Python package with automatically downloadable pre-trained models for over 40 languages. It maintains a balance between efficiency and quality. As it is an end-to-end system and its modules are jointly trained, its training is competitively fast. As its models are optimised for accuracy, they achieve often better prediction quality than SOTA. The COMBO library is available at: https://gitlab.clarin-pl.eu/syntactic-tools/combo.",
}
```
If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16/)
If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16)
```bibtex
@inproceedings{klimaszewski-wroblewska-2021-combo,
title = "{COMBO}: A New Module for {EUD} Parsing",
...
@@ -77,11 +77,6 @@ local in_features(name) = !(std.length(std.find(name, features)) == 0);
local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
local use_transformer = pretrained_transformer_name != null;
# Transformer encoder options
local use_transformer_encoder = if std.length(std.extVar("use_transformer_encoder")) == "True" then true else false;
local num_layers_transformer_encoder = 6;
local num_attention_heads = 8;
# Verify some configuration requirements
assert in_features("token"): "Key 'token' must be in features!";
assert in_features("char"): "Key 'char' must be in features!";
@@ -257,17 +252,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
},
},
loss_weights: loss_weights,
seq_encoder: if use_transformer_encoder then {
type: "pytorch_transformer",
input_dim: (char_dim + projected_embedding_dim +
(if in_features('xpostag') then xpostag_dim else 0) +
(if in_features('lemma') then lemma_char_dim else 0) +
(if in_features('upostag') then upostag_dim else 0) +
(if in_features('feats') then feats_dim else 0)),
num_layers: num_layers_transformer_encoder,
feedforward_hidden_dim: hidden_size,
num_attention_heads: num_attention_heads,
positional_encoding: "sinusoidal"} else {
seq_encoder: {
type: "combo_encoder",
layer_dropout_probability: 0.33,
stacked_bilstm: {
@@ -281,7 +266,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
num_layers: num_layers,
recurrent_dropout_probability: 0.33,
layer_dropout_probability: 0.33
}
},
},
dependency_relation: {
type: "combo_dependency_parsing_from_vocab",
...
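The hunk above removes the optional transformer sequence encoder, so `seq_encoder` is always the BiLSTM-based `combo_encoder`. Roughly, the two alternatives correspond to the following PyTorch modules (a hedged sketch; the dimensions are illustrative, not the config's computed values):

```python
import torch.nn as nn

# Sketch of what the removed "pytorch_transformer" branch configured
# (sinusoidal positional encodings were added to the inputs separately)...
input_dim, hidden_size = 160, 512  # illustrative; the config summed the feature dims

transformer_encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=input_dim, nhead=8,
                               dim_feedforward=hidden_size),
    num_layers=6,
)

# ...versus the stacked bidirectional LSTM that combo_encoder wraps.
bilstm_encoder = nn.LSTM(input_size=input_dim, hidden_size=hidden_size,
                         num_layers=2, bidirectional=True, dropout=0.33)
```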
@@ -8,7 +8,7 @@ import torch
from allennlp import data as allen_data
from allennlp.common import checks, util
from allennlp.data import fields as allen_fields, vocabulary
from conllu import parser, TokenList
from conllu import parser
from dataclasses import dataclass
from overrides import overrides
@@ -27,7 +27,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
features: List[str] = None,
targets: List[str] = None,
use_sem: bool = False,
max_input_embedder: int = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
@@ -49,7 +48,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
"Remove {} from either features or targets.".format(intersection)
)
self.use_sem = use_sem
self.max_input_embedder = max_input_embedder
# *.conllu readers configuration
fields = list(parser.DEFAULT_FIELDS)
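For context, the reader builds on the `conllu` package's parser; a minimal sketch of how a CoNLL-U file is turned into `TokenList`s (the file name is illustrative):

```python
import conllu

# Parse a CoNLL-U file into TokenLists, roughly as the dataset reader does.
with open("sample.conllu", encoding="utf-8") as f:
    for tree in conllu.parse(f.read()):
        # Multiword tokens carry tuple ids such as (1, "-", 2); keep plain tokens only.
        tokens = [t for t in tree if isinstance(t["id"], int)]
        print(tree.metadata.get("sent_id"), len(tokens))
```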
@@ -90,9 +88,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
@overrides
def text_to_instance(self, tree: conllu.TokenList) -> allen_data.Instance:
if self.max_input_embedder:
tree = TokenList(tokens = tree.tokens[: self.max_input_embedder],
metadata = tree.metadata)
fields_: Dict[str, allen_data.Field] = {}
tree_tokens = [t for t in tree if isinstance(t["id"], int)]
tokens = [_Token(t["token"],
@@ -122,10 +117,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
text_field,
label_namespace="feats_labels")
elif target_name == "head":
if self.max_input_embedder:
target_values = [0 if v == "_" else int(v) for v in target_values]
target_values = [v for v in target_values if v < self.max_input_embedder]
else:
target_values = [0 if v == "_" else int(v) for v in target_values]
fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
label_namespace=target_name + "_labels")
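With the truncation branch gone, head values are converted uniformly: the CoNLL-U placeholder `_` maps to 0 and everything else is parsed as an integer. For example:

```python
# Illustrative input/output of the surviving head conversion.
target_values = ["2", "_", "0", "5"]
assert [0 if v == "_" else int(v) for v in target_values] == [2, 0, 0, 5]
```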
@@ -139,8 +130,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
t_deps = t["deps"]
if t_deps and t_deps != "_":
for rel, head in t_deps:
if int(head) >= self.max_input_embedder:
continue
# EmoryNLP skips the first edge if there are two edges between the same
# nodes. Thanks to that, one edge is in the tree and the other in the graph.
# This snippet follows that approach.
...
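The EmoryNLP convention mentioned in the comment above can be illustrated with a small self-contained sketch (a hypothetical helper, not the repository's actual code):

```python
def skip_duplicate_edges(deps):
    """Keep only the later of two consecutive edges to the same head, so one
    relation can live in the basic tree and the other in the enhanced graph.
    `deps` is a list of (relation, head) pairs; purely illustrative."""
    kept = []
    for rel, head in deps:
        if kept and kept[-1][1] == head:
            kept.pop()  # skip the first of two edges between the same nodes
        kept.append((rel, head))
    return kept

# Two edges to head 3: only the second ("conj") survives.
assert skip_duplicate_edges([("obj", 3), ("conj", 3)]) == [("conj", 3)]
```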
@@ -57,8 +57,6 @@ flags.DEFINE_string(name="serialization_dir", default=None,
help="Model serialization directory (default - system temp dir).")
flags.DEFINE_boolean(name="tensorboard", default=False,
help="When provided model will log tensorboard metrics.")
flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
help="Indicator whether to use transformer encoder or BiLSTM (default)")
# Finetune after training flags
flags.DEFINE_list(name="finetuning_training_data_path", default="",
@@ -199,7 +197,6 @@ def _get_ext_vars(finetuning: bool = False) -> Dict:
"num_epochs": str(FLAGS.num_epochs),
"word_batch_size": str(FLAGS.word_batch_size),
"use_tensorboard": str(FLAGS.tensorboard),
"use_transformer_encoder": str(FLAGS.use_transformer_encoder)
}
...
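Note that `_get_ext_vars` stringifies every value before handing it to Jsonnet, which is why config templates compare against the literal text "True" rather than a boolean. A sketch with the `_jsonnet` bindings (file name illustrative):

```python
import _jsonnet  # Jsonnet's Python bindings

# Every ext var arrives in Jsonnet as a string, so str(FLAGS.tensorboard)
# becomes the text "True"/"False" that the template must test against.
config_json = _jsonnet.evaluate_file(
    "config.template.jsonnet",
    ext_vars={"use_tensorboard": str(True)},
)
```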
@@ -228,8 +228,7 @@ class COMBO(predictor.Predictor):
@classmethod
def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
batch_size: int = 1024,
cuda_device: int = -1,
max_input_embedder: int = None):
cuda_device: int = -1):
util.import_module_and_submodules("combo.commands")
util.import_module_and_submodules("combo.models")
util.import_module_and_submodules("combo.training")
@@ -246,13 +245,6 @@
archive = models.load_archive(model_path, cuda_device=cuda_device)
model = archive.model
dataset_reader = allen_data.DatasetReader.from_params(archive.config["dataset_reader"],
max_input_embedder = max_input_embedder)
logger.info("Using pretrained transformer embedder may require truncating tokenized sentences.")
if max_input_embedder:
logger.info(f"Currently they are truncated to {max_input_embedder} first tokens")
else:
logger.info("Currently they are not truncated")
dataset_reader = allen_data.DatasetReader.from_params(
archive.config["dataset_reader"])
return cls(model, dataset_reader, tokenizer, batch_size)
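With `max_input_embedder` removed, `from_pretrained` takes only a tokenizer, batch size, and device. A minimal usage sketch (model name and sentence are illustrative):

```python
from combo.predict import COMBO

# Truncation is no longer configured here; inputs are passed through as-is.
nlp = COMBO.from_pretrained("polish-herbert-base", batch_size=1024, cuda_device=-1)
prediction = nlp("Ala ma kota.")
```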
@@ -2,7 +2,7 @@
Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
```bash
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
combo --helpfull
```
@@ -11,7 +11,7 @@ combo --helpfull
python -m venv venv
source venv/bin/activate
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
```
### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
conda create -n combo python=3.8
conda activate combo
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
```
## Problems & solutions
...
import pathlib
from absl import app
from absl import flags
from scripts import utils
LANG = ["Polish"]
TREEBANKS = {"Polish" : "UD_Polish-PDB"}
FLAGS = flags.FLAGS
flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo",
help="Path to data directory.")
flags.DEFINE_string(name="models_dir", default="/home/pszenny/combo/tmp/",
help="Model serialization dir.")
flags.DEFINE_integer(name="cuda_device", default=-1,
help="Cuda device id (-1 for cpu).")
flags.DEFINE_boolean(name="expect_prefix", default=True,
help="Whether to expect allennlp prefix.")
flags.DEFINE_integer(name="batch_size", default=32,
help="Batch size.")
def run(_):
for encoder in ["BiLSTM","transformer"]:
models_dir = pathlib.Path(FLAGS.models_dir)
for model_dir in models_dir.iterdir():
lang = model_dir.name
if lang not in LANG:
print("Skipping unknown directory: ", lang)
continue
if FLAGS.expect_prefix:
model_dir = pathlib.Path(models_dir) / (lang + "/" + encoder + "/")
model_dir = list(model_dir.iterdir())
                assert len(model_dir) == 1, f"Incorrect number of model directories: {model_dir}"
model_dir = model_dir[0]
data_dir = pathlib.Path(FLAGS.data_dir)
data_dir = data_dir / TREEBANKS[lang]
files = list(data_dir.iterdir())
test_file = [f for f in files if "test" in f.name and ".conllu" in f.name]
            assert len(test_file) == 1, "Couldn't find test file."
test_file = test_file[0]
output_pred = data_dir / f'{lang}_pred.conllu'
command = f"""combo --mode predict --model_path {model_dir}
--input_file {test_file}
--output_file {output_pred}
--cuda_device {FLAGS.cuda_device}
--batch_size {FLAGS.batch_size}
--silent
"""
utils.execute_command(command)
def main():
app.run(run)
if __name__ == "__main__":
main()
\ No newline at end of file
"""Script to train Dependency Parsing models based on UD 2.x data."""
import pathlib
from absl import app
from absl import flags
from scripts import utils
# ls -1 | xargs -i echo "\"{}\","
# UD 2.7
TREEBANKS = ["UD_Polish-PDB"]
embedding_model = "allegro/herbert-base-cased"
FLAGS = flags.FLAGS
flags.DEFINE_list(name="treebanks", default=TREEBANKS,
help=f"Treebanks to train. Possible values: {TREEBANKS}.")
flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo/",
help="Path to UD data directory.")
flags.DEFINE_string(name="serialization_dir", default="/home/pszenny/combo/tmp/",
help="Model serialization directory.")
flags.DEFINE_integer(name="cuda_device", default=-1,
help="Cuda device id (-1 for cpu).")
flags.DEFINE_string(name="train_config_path", default="/home/pszenny/combo/combo/config.template.jsonnet",
help="Directory of jsonnet config file")
flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
help="Indicator whether to use transformer encoder or BiLSTM (default)")
def run(_):
treebanks_dir = pathlib.Path(FLAGS.data_dir)
for treebank in FLAGS.treebanks:
assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
treebank_dir = treebanks_dir / treebank
treebank_parts = treebank[3:].split("-")
language = treebank_parts[0]
files = list(treebank_dir.iterdir())
training_file = [f for f in files if "train" in f.name and ".conllu" in f.name]
assert len(training_file) == 1, f"Couldn't find training file."
training_file_path = training_file[0]
valid_file = [f for f in files if "dev" in f.name and ".conllu" in f.name]
assert len(valid_file) == 1, f"Couldn't find validation file."
valid_file_path = valid_file[0]
serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/BiLSTM/")
serialization_dir.mkdir(exist_ok=True, parents=True)
word_batch_size = 2500
command = f"""time combo --mode train
--cuda_device {FLAGS.cuda_device}
--training_data_path {training_file_path}
--validation_data_path {valid_file_path}
--pretrained_transformer_name {embedding_model}
--serialization_dir {serialization_dir}
--use_transformer_encoder {FLAGS.use_transformer_encoder}
--config_path {FLAGS.train_config_path}
--notensorboard
--word_batch_size {word_batch_size}
--targets deprel,head,upostag,lemma,feats,xpostag
"""
utils.execute_command(command)
FLAGS.use_transformer_encoder = True
serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/transformer/")
serialization_dir.mkdir(exist_ok=True, parents=True)
command = f"""time combo --mode train
--cuda_device {FLAGS.cuda_device}
--training_data_path {training_file_path}
--validation_data_path {valid_file_path}
--pretrained_transformer_name {embedding_model}
--serialization_dir {serialization_dir}
--use_transformer_encoder {FLAGS.use_transformer_encoder}
--config_path {FLAGS.train_config_path}
--notensorboard
--word_batch_size {word_batch_size}
--targets deprel,head,upostag,lemma,feats,xpostag
"""
utils.execute_command(command)
def main():
app.run(run)
if __name__ == "__main__":
main()
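Both scripts hand their multi-line command strings to `scripts.utils.execute_command`; a minimal sketch of what such a helper presumably does (assumed behavior, not the repository's exact implementation):

```python
import subprocess

def execute_command(command: str) -> None:
    # Collapse the multi-line f-string into argv tokens and run the process,
    # raising if the command exits with a non-zero status.
    subprocess.run(command.split(), check=True)
```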
@@ -23,7 +23,7 @@ REQUIREMENTS = [
setup(
name='combo',
version='1.0.4',
version='1.0.5',
author='Mateusz Klimaszewski',
author_email='M.Klimaszewski@ii.pw.edu.pl',
install_requires=REQUIREMENTS,
...