From 35c211f79c8127d42f4a9b6f1a2b20934a583bc0 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Wed, 20 Oct 2021 01:34:33 +0200
Subject: [PATCH 1/3] Truncation in token indexer and script to train COMBO on
 PDB with transformer_encoder

---
 combo/config.template.jsonnet                 | 45 ++++++---
 ...etrained_transformer_mismatched_indexer.py | 37 ++++++-
 combo/main.py                                 |  3 +
 scripts/predict_ud_transformer.py             | 63 ++++++++++++
 scripts/train_ud_transformer.py               | 96 +++++++++++++++++++
 5 files changed, 227 insertions(+), 17 deletions(-)
 create mode 100644 scripts/predict_ud_transformer.py
 create mode 100644 scripts/train_ud_transformer.py

diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index c602d9c..53013ef 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -77,6 +77,11 @@ local in_features(name) = !(std.length(std.find(name, features)) == 0);
 local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
 local use_transformer = pretrained_transformer_name != null;
 
+# Transformer encoder options
+local use_transformer_encoder = if std.extVar("use_transformer_encoder") == "True" then true else false;
+local num_layers_transformer_encoder = 6;
+local num_attention_heads = 8;
+
 # Verify some configuration requirements
 assert in_features("token"): "Key 'token' must be in features!";
 assert in_features("char"): "Key 'char' must be in features!";
@@ -252,22 +257,32 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             },
         },
         loss_weights: loss_weights,
-        seq_encoder: {
-            type: "combo_encoder",
-            layer_dropout_probability: 0.33,
-            stacked_bilstm: {
-                input_size:
-                (char_dim + projected_embedding_dim +
-                (if in_features('xpostag') then xpostag_dim else 0) +
-                (if in_features('lemma') then lemma_char_dim else 0) +
-                (if in_features('upostag') then upostag_dim else 0) +
-                (if in_features('feats') then feats_dim else 0)),
-                hidden_size: hidden_size,
-                num_layers: num_layers,
-                recurrent_dropout_probability: 0.33,
-                layer_dropout_probability: 0.33
+        seq_encoder: if use_transformer_encoder then {
+                type: "pytorch_transformer",
+                input_dim: (char_dim + projected_embedding_dim +
+                    (if in_features('xpostag') then xpostag_dim else 0) +
+                    (if in_features('lemma') then lemma_char_dim else 0) +
+                    (if in_features('upostag') then upostag_dim else 0) +
+                    (if in_features('feats') then feats_dim else 0)),
+                num_layers: num_layers_transformer_encoder,
+                feedforward_hidden_dim: hidden_size,
+                num_attention_heads: num_attention_heads,
+                positional_encoding: "sinusoidal"} else {
+                type: "combo_encoder",
+                layer_dropout_probability: 0.33,
+                stacked_bilstm: {
+                    input_size:
+                    (char_dim + projected_embedding_dim +
+                    (if in_features('xpostag') then xpostag_dim else 0) +
+                    (if in_features('lemma') then lemma_char_dim else 0) +
+                    (if in_features('upostag') then upostag_dim else 0) +
+                    (if in_features('feats') then feats_dim else 0)),
+                    hidden_size: hidden_size,
+                    num_layers: num_layers,
+                    recurrent_dropout_probability: 0.33,
+                    layer_dropout_probability: 0.33
+                }
             },
-        },
         dependency_relation: {
             type: "combo_dependency_parsing_from_vocab",
             vocab_namespace: 'deprel_labels',
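
When use_transformer_encoder is set, the new seq_encoder branch builds AllenNLP's registered "pytorch_transformer" encoder instead of the stacked BiLSTM. A minimal sketch of the equivalent module construction follows; the input dimension is an illustrative placeholder (in the config it is the sum of the character and projected embedding dimensions plus any enabled feature dimensions, and it must be divisible by the number of attention heads):

```python
# Sketch only: constructs the same encoder the "pytorch_transformer" branch configures.
from allennlp.modules.seq2seq_encoders import PytorchTransformer

input_dim = 512  # placeholder; the config computes char_dim + projected_embedding_dim + feature dims
encoder = PytorchTransformer(
    input_dim=input_dim,
    num_layers=6,                    # num_layers_transformer_encoder
    feedforward_hidden_dim=512,      # hidden_size
    num_attention_heads=8,           # num_attention_heads
    positional_encoding="sinusoidal",
)
```
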
diff --git a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
index 3eee80e..b9a4e3c 100644
--- a/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
+++ b/combo/data/token_indexers/pretrained_transformer_mismatched_indexer.py
@@ -1,12 +1,14 @@
 from typing import Optional, Dict, Any, List, Tuple
 
 from allennlp import data
-from allennlp.data import token_indexers, tokenizers
+from allennlp.data import token_indexers, tokenizers, IndexedTokenList, vocabulary
+from overrides import overrides
 
 
 @data.TokenIndexer.register("pretrained_transformer_mismatched_fixed")
 class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransformerMismatchedIndexer):
-    """TODO(mklimasz) Remove during next allennlp update, fixed on allennlp master."""
 
     def __init__(self, model_name: str, namespace: str = "tags", max_length: int = None,
                  tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs) -> None:
@@ -24,6 +26,37 @@ class PretrainedTransformerMismatchedIndexer(token_indexers.PretrainedTransforme
         self._num_added_start_tokens = self._matched_indexer._num_added_start_tokens
         self._num_added_end_tokens = self._matched_indexer._num_added_end_tokens
 
+    @overrides
+    def tokens_to_indices(self,
+                          tokens: List[data.Token],
+                          vocabulary: vocabulary.Vocabulary) -> IndexedTokenList:
+        """
+        Overridden to raise an error when the number of wordpiece tokens needed to embed a sentence
+        exceeds the maximum input length of the model.
+        """
+        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
+
+        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize(
+            [t.ensure_text() for t in tokens])
+
+        if len(wordpieces) > self._tokenizer.max_len_single_sentence:
+            raise ValueError("Following sentence consists of more wordpiece tokens that the model can process:\n" +\
+                             " ".join([str(x) for x in tokens[:10]]) + " ... \n" + \
+                             f"Maximal input: {self._tokenizer.max_len_single_sentence}\n"+ \
+                             f"Current input: {len(wordpieces)}")
+
+        offsets = [x if x is not None else (-1, -1) for x in offsets]
+
+        output: IndexedTokenList = {
+            "token_ids": [t.text_id for t in wordpieces],
+            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
+            "type_ids": [t.type_id for t in wordpieces],
+            "offsets": offsets,
+            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
+        }
+
+        return self._matched_indexer._postprocess_output(output)
+
 
 class PretrainedTransformerIndexer(token_indexers.PretrainedTransformerIndexer):
 
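
The overridden tokens_to_indices raises instead of silently mis-indexing when a sentence needs more wordpieces than the embedder accepts. Below is a standalone sketch of the same guard using a HuggingFace tokenizer directly; the model name matches the embedding model used in the training script further down, and the sample words are an assumption:

```python
# Sketch of the length guard outside AllenNLP; raises like the indexer above would.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
words = ["Zdanie", "do", "sprawdzenia", "."]  # word-level tokens, e.g. one CoNLL-U sentence

wordpieces = [wp for word in words for wp in tokenizer.tokenize(word)]
if len(wordpieces) > tokenizer.max_len_single_sentence:
    raise ValueError(f"Sentence requires {len(wordpieces)} wordpieces, "
                     f"but the model accepts at most {tokenizer.max_len_single_sentence}.")
```
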
diff --git a/combo/main.py b/combo/main.py
index d1e0292..10b04ef 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -57,6 +57,8 @@ flags.DEFINE_string(name="serialization_dir", default=None,
                     help="Model serialization directory (default - system temp dir).")
 flags.DEFINE_boolean(name="tensorboard", default=False,
                      help="When provided model will log tensorboard metrics.")
+flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
+                    help="Indicator whether to use transformer encoder or BiLSTM (default)")
 
 # Finetune after training flags
 flags.DEFINE_list(name="finetuning_training_data_path", default="",
@@ -197,6 +199,7 @@ def _get_ext_vars(finetuning: bool = False) -> Dict:
         "num_epochs": str(FLAGS.num_epochs),
         "word_batch_size": str(FLAGS.word_batch_size),
         "use_tensorboard": str(FLAGS.tensorboard),
+        "use_transformer_encoder": str(FLAGS.use_transformer_encoder)
     }
 
 
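
_get_ext_vars stringifies the boolean flag, so config.template.jsonnet receives "True" or "False" and must compare std.extVar("use_transformer_encoder") against that string. A small sketch with the jsonnet Python bindings (assumed to be installed) showing the comparison the template performs:

```python
# Sketch: booleans arrive in the template as the strings "True"/"False".
import _jsonnet

snippet = 'std.extVar("use_transformer_encoder") == "True"'
print(_jsonnet.evaluate_snippet("flag", snippet,
                                ext_vars={"use_transformer_encoder": str(True)}))   # -> true
print(_jsonnet.evaluate_snippet("flag", snippet,
                                ext_vars={"use_transformer_encoder": str(False)}))  # -> false
```
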
diff --git a/scripts/predict_ud_transformer.py b/scripts/predict_ud_transformer.py
new file mode 100644
index 0000000..c24832c
--- /dev/null
+++ b/scripts/predict_ud_transformer.py
@@ -0,0 +1,63 @@
+import pathlib
+
+from absl import app
+from absl import flags
+
+from scripts import utils
+
+LANG = ["Polish"]
+TREEBANKS = {"Polish" : "UD_Polish-PDB"}
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo",
+                    help="Path to data directory.")
+flags.DEFINE_string(name="models_dir", default="/home/pszenny/combo/tmp/",
+                    help="Model serialization dir.")
+flags.DEFINE_integer(name="cuda_device", default=-1,
+                     help="Cuda device id (-1 for cpu).")
+flags.DEFINE_boolean(name="expect_prefix", default=True,
+                     help="Whether to expect allennlp prefix.")
+flags.DEFINE_integer(name="batch_size", default=32,
+                     help="Batch size.")
+
+def run(_):
+    for encoder in ["BiLSTM","transformer"]:
+        models_dir = pathlib.Path(FLAGS.models_dir)
+        for model_dir in models_dir.iterdir():
+            lang = model_dir.name
+            if lang not in LANG:
+                print("Skipping unknown directory: ", lang)
+                continue
+
+            if FLAGS.expect_prefix:
+                model_dir = pathlib.Path(models_dir) / (lang + "/" + encoder + "/")
+                model_dir = list(model_dir.iterdir())
+                assert len(model_dir) == 1, f"Incorrect number of models in {model_dir}"
+                model_dir = model_dir[0]
+
+            data_dir = pathlib.Path(FLAGS.data_dir)
+            data_dir = data_dir / TREEBANKS[lang]
+            files = list(data_dir.iterdir())
+            test_file = [f for f in files if "test" in f.name and ".conllu" in f.name]
+            assert len(test_file) == 1, "Couldn't find test file."
+            test_file = test_file[0]
+
+            output_pred = data_dir / f'{lang}_pred.conllu'
+            command = f"""combo --mode predict --model_path {model_dir}
+            --input_file {test_file}
+            --output_file {output_pred}
+            --cuda_device {FLAGS.cuda_device}
+            --batch_size {FLAGS.batch_size}
+            --silent
+            """
+            utils.execute_command(command)
+            return 1
+
+
+
+def main():
+    app.run(run)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/train_ud_transformer.py b/scripts/train_ud_transformer.py
new file mode 100644
index 0000000..9ecabae
--- /dev/null
+++ b/scripts/train_ud_transformer.py
@@ -0,0 +1,96 @@
+"""Script to train Dependency Parsing models based on UD 2.x data."""
+import pathlib
+
+from absl import app
+from absl import flags
+
+from scripts import utils
+
+# # ls -1 | xargs -i echo "\"{}\","
+# UD 2.7
+TREEBANKS = ["UD_Polish-PDB"]
+embedding_model = "allegro/herbert-base-cased"
+
+FLAGS = flags.FLAGS
+flags.DEFINE_list(name="treebanks", default=TREEBANKS,
+                  help=f"Treebanks to train. Possible values: {TREEBANKS}.")
+flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo/",
+                    help="Path to UD data directory.")
+flags.DEFINE_string(name="serialization_dir", default="/home/pszenny/combo/tmp/",
+                    help="Model serialization directory.")
+flags.DEFINE_integer(name="cuda_device", default=-1,
+                     help="Cuda device id (-1 for cpu).")
+flags.DEFINE_string(name="train_config_path", default="/home/pszenny/combo/combo/config.template.jsonnet",
+                    help="Directory of jsonnet config file")
+flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
+                    help="Indicator whether to use transformer encoder or BiLSTM (default)")
+
+def run(_):
+    treebanks_dir = pathlib.Path(FLAGS.data_dir)
+
+    for treebank in FLAGS.treebanks:
+        assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
+        treebank_dir = treebanks_dir / treebank
+        treebank_parts = treebank[3:].split("-")
+        language = treebank_parts[0]
+
+        files = list(treebank_dir.iterdir())
+
+        training_file = [f for f in files if "train" in f.name and ".conllu" in f.name]
+        assert len(training_file) == 1, f"Couldn't find training file."
+        training_file_path = training_file[0]
+
+        valid_file = [f for f in files if "dev" in f.name and ".conllu" in f.name]
+        assert len(valid_file) == 1, f"Couldn't find validation file."
+        valid_file_path = valid_file[0]
+
+        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/BiLSTM/")
+        serialization_dir.mkdir(exist_ok=True, parents=True)
+
+        word_batch_size = 2500
+
+        command = f"""time combo --mode train
+        --cuda_device {FLAGS.cuda_device}
+        --training_data_path {training_file_path}
+        --validation_data_path {valid_file_path}
+        --pretrained_transformer_name {embedding_model}
+        --serialization_dir {serialization_dir}
+        --use_transformer_encoder {FLAGS.use_transformer_encoder}
+        --config_path {FLAGS.train_config_path}
+        --notensorboard
+        --word_batch_size {word_batch_size}
+        --targets deprel,head,upostag,lemma,feats,xpostag
+        """
+
+        utils.execute_command(command)
+
+        FLAGS.use_transformer_encoder = True
+        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/transformer/")
+        serialization_dir.mkdir(exist_ok=True, parents=True)
+
+        command = f"""time combo --mode train
+        --cuda_device {FLAGS.cuda_device}
+        --training_data_path {training_file_path}
+        --validation_data_path {valid_file_path}
+        --pretrained_transformer_name {embedding_model}
+        --serialization_dir {serialization_dir}
+        --use_transformer_encoder {FLAGS.use_transformer_encoder}
+        --config_path {FLAGS.train_config_path}
+        --notensorboard
+        --word_batch_size {word_batch_size}
+        --targets deprel,head,upostag,lemma,feats,xpostag
+        """
+
+        utils.execute_command(command)
+
+
+
+
+
+
+def main():
+    app.run(run)
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab


From 03bc35b22f488d643b0746e6fcf4bd4c2a0dd3a4 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Thu, 7 Oct 2021 09:44:07 +0200
Subject: [PATCH 2/3] Truncation when text_to_instance is called.

---
 combo/data/dataset.py | 23 +++++++++++++++++------
 combo/predict.py      | 14 +++++++++++---
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/combo/data/dataset.py b/combo/data/dataset.py
index bdc8b20..4b0352a 100644
--- a/combo/data/dataset.py
+++ b/combo/data/dataset.py
@@ -8,7 +8,7 @@ import torch
 from allennlp import data as allen_data
 from allennlp.common import checks, util
 from allennlp.data import fields as allen_fields, vocabulary
-from conllu import parser
+from conllu import parser, TokenList
 from dataclasses import dataclass
 from overrides import overrides
 
@@ -27,6 +27,7 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
             features: List[str] = None,
             targets: List[str] = None,
             use_sem: bool = False,
+            max_input_embedder: int = None,
             **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -48,6 +49,7 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                 "Remove {} from either features or targets.".format(intersection)
             )
         self.use_sem = use_sem
+        self.max_input_embedder = max_input_embedder
 
         # *.conllu readers configuration
         fields = list(parser.DEFAULT_FIELDS)
@@ -88,13 +90,16 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
 
     @overrides
     def text_to_instance(self, tree: conllu.TokenList) -> allen_data.Instance:
+        if self.max_input_embedder:
+            tree = TokenList(tokens=tree.tokens[:self.max_input_embedder],
+                             metadata=tree.metadata)
         fields_: Dict[str, allen_data.Field] = {}
         tree_tokens = [t for t in tree if isinstance(t["id"], int)]
         tokens = [_Token(t["token"],
-                         pos_=t.get("upostag"),
-                         tag_=t.get("xpostag"),
-                         lemma_=t.get("lemma"),
-                         feats_=t.get("feats"))
+                  pos_=t.get("upostag"),
+                  tag_=t.get("xpostag"),
+                  lemma_=t.get("lemma"),
+                  feats_=t.get("feats"))
                   for t in tree_tokens]
 
         # features
@@ -117,7 +122,11 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                                                                               text_field,
                                                                               label_namespace="feats_labels")
                     elif target_name == "head":
-                        target_values = [0 if v == "_" else int(v) for v in target_values]
+                        if self.max_input_embedder:
+                            target_values = [0 if v == "_" else int(v) for v in target_values]
+                            target_values = [v for v in target_values if v < self.max_input_embedder]
+                        else:
+                            target_values = [0 if v == "_" else int(v) for v in target_values]
                         fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
                                                                                label_namespace=target_name + "_labels")
                     elif target_name == "deps":
@@ -130,6 +139,8 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                             t_deps = t["deps"]
                             if t_deps and t_deps != "_":
                                 for rel, head in t_deps:
+                                    if self.max_input_embedder and int(head) >= self.max_input_embedder:
+                                        continue
                                     # EmoryNLP skips the first edge, if there are two edges between the same
                                     # nodes. Thanks to that one is in a tree and another in a graph.
                                     # This snippet follows that approach.
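
The reader-level truncation above keeps the sentence metadata while dropping tokens beyond max_input_embedder. A small sketch of the same operation using the conllu library directly (the sample sentence and limit are made up):

```python
# Sketch of the truncation performed in text_to_instance above.
from conllu import TokenList, parse

tree = parse("# text = Ala ma kota .\n"
             "1\tAla\tAla\tPROPN\t_\t_\t2\tnsubj\t_\t_\n"
             "2\tma\tmieć\tVERB\t_\t_\t0\troot\t_\t_\n"
             "3\tkota\tkot\tNOUN\t_\t_\t2\tobj\t_\t_\n"
             "4\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\t_\n")[0]

max_input_embedder = 2  # illustrative limit
truncated = TokenList(tokens=list(tree)[:max_input_embedder], metadata=tree.metadata)
print([token["form"] for token in truncated])  # ['Ala', 'ma']
```
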
diff --git a/combo/predict.py b/combo/predict.py
index 01a0837..b235389 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -228,7 +228,8 @@ class COMBO(predictor.Predictor):
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
                         batch_size: int = 1024,
-                        cuda_device: int = -1):
+                        cuda_device: int = -1,
+                        max_input_embedder: int = None):
         util.import_module_and_submodules("combo.commands")
         util.import_module_and_submodules("combo.models")
         util.import_module_and_submodules("combo.training")
@@ -245,6 +246,13 @@ class COMBO(predictor.Predictor):
 
         archive = models.load_archive(model_path, cuda_device=cuda_device)
         model = archive.model
-        dataset_reader = allen_data.DatasetReader.from_params(
-            archive.config["dataset_reader"])
+        dataset_reader = allen_data.DatasetReader.from_params(archive.config["dataset_reader"],
+                                                              max_input_embedder=max_input_embedder)
+
+        logger.info("Using a pretrained transformer embedder may require truncating tokenized sentences.")
+        if max_input_embedder:
+            logger.info(f"Sentences are truncated to the first {max_input_embedder} tokens.")
+        else:
+            logger.info("Sentences are not truncated.")
+
         return cls(model, dataset_reader, tokenizer, batch_size)
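
With the extended classmethod, truncation can be requested when loading a released model. A usage sketch follows; the model name and token limit are illustrative, not prescribed by this patch:

```python
# Sketch: load a pretrained COMBO model and truncate overly long sentences
# to the first 500 tokens before embedding them.
from combo.predict import COMBO

nlp = COMBO.from_pretrained("polish-herbert-base", max_input_embedder=500)
sentence = nlp("A very long sentence to parse ...")
```
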
-- 
GitLab


From 7d793d41138abe1ac3f8822d7d2a6b43b8f93e39 Mon Sep 17 00:00:00 2001
From: pszenny <pszenny@e-science.pl>
Date: Sat, 15 Jan 2022 15:14:03 +0100
Subject: [PATCH 3/3] Version update

---
 README.md                         | 27 +++++----
 combo/config.template.jsonnet     | 47 ++++++---------
 combo/data/dataset.py             | 23 ++------
 combo/main.py                     |  3 -
 combo/predict.py                  | 14 +----
 docs/installation.md              |  6 +-
 scripts/predict_ud_transformer.py | 63 --------------------
 scripts/train_ud_transformer.py   | 96 -------------------------------
 setup.py                          |  2 +-
 9 files changed, 44 insertions(+), 237 deletions(-)
 delete mode 100644 scripts/predict_ud_transformer.py
 delete mode 100644 scripts/train_ud_transformer.py

diff --git a/README.md b/README.md
index 76e436b..e758bc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
@@ -42,21 +42,24 @@ We encourage you to use the [beginner's tutorial](https://colab.research.google.
 
 ## Citing
 
-### Accepted at EMNLP'21 demo session :tada: :fire:
-
-If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://arxiv.org/abs/2109.05361)
+If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://aclanthology.org/2021.emnlp-demo.7)
 ```bibtex
-@misc{klimaszewski2021combo,
-      title={COMBO: State-of-the-Art Morphosyntactic Analysis}, 
-      author={Mateusz Klimaszewski and Alina Wróblewska},
-      year={2021},
-      eprint={2109.05361},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
+@inproceedings{klimaszewski-wroblewska-2021-combo-state,
+    title = "{COMBO}: State-of-the-Art Morphosyntactic Analysis",
+    author = "Klimaszewski, Mateusz  and
+      Wr{\'o}blewska, Alina",
+    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = nov,
+    year = "2021",
+    address = "Online and Punta Cana, Dominican Republic",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2021.emnlp-demo.7",
+    pages = "50--62",
+    abstract = "We introduce COMBO {--} a fully neural NLP system for accurate part-of-speech tagging, morphological analysis, lemmatisation, and (enhanced) dependency parsing. It predicts categorical morphosyntactic features whilst also exposes their vector representations, extracted from hidden layers. COMBO is an easy to install Python package with automatically downloadable pre-trained models for over 40 languages. It maintains a balance between efficiency and quality. As it is an end-to-end system and its modules are jointly trained, its training is competitively fast. As its models are optimised for accuracy, they achieve often better prediction quality than SOTA. The COMBO library is available at: https://gitlab.clarin-pl.eu/syntactic-tools/combo.",
 }
 ```
 
-If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16/)
+If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16)
 ```bibtex
 @inproceedings{klimaszewski-wroblewska-2021-combo,
     title = "{COMBO}: A New Module for {EUD} Parsing",
diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index 53013ef..4e44f42 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -77,11 +77,6 @@ local in_features(name) = !(std.length(std.find(name, features)) == 0);
 local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
 local use_transformer = pretrained_transformer_name != null;
 
-# Transformer encoder options
-local use_transformer_encoder = if std.extVar("use_transformer_encoder") == "True" then true else false;
-local num_layers_transformer_encoder = 6;
-local num_attention_heads = 8;
-
 # Verify some configuration requirements
 assert in_features("token"): "Key 'token' must be in features!";
 assert in_features("char"): "Key 'char' must be in features!";
@@ -257,32 +252,22 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             },
         },
         loss_weights: loss_weights,
-        seq_encoder: if use_transformer_encoder then {
-                type: "pytorch_transformer",
-                input_dim: (char_dim + projected_embedding_dim +
-                    (if in_features('xpostag') then xpostag_dim else 0) +
-                    (if in_features('lemma') then lemma_char_dim else 0) +
-                    (if in_features('upostag') then upostag_dim else 0) +
-                    (if in_features('feats') then feats_dim else 0)),
-                num_layers: num_layers_transformer_encoder,
-                feedforward_hidden_dim: hidden_size,
-                num_attention_heads: num_attention_heads,
-                positional_encoding: "sinusoidal"} else {
-                type: "combo_encoder",
-                layer_dropout_probability: 0.33,
-                stacked_bilstm: {
-                    input_size:
-                    (char_dim + projected_embedding_dim +
-                    (if in_features('xpostag') then xpostag_dim else 0) +
-                    (if in_features('lemma') then lemma_char_dim else 0) +
-                    (if in_features('upostag') then upostag_dim else 0) +
-                    (if in_features('feats') then feats_dim else 0)),
-                    hidden_size: hidden_size,
-                    num_layers: num_layers,
-                    recurrent_dropout_probability: 0.33,
-                    layer_dropout_probability: 0.33
-                }
+        seq_encoder: {
+            type: "combo_encoder",
+            layer_dropout_probability: 0.33,
+            stacked_bilstm: {
+                input_size:
+                (char_dim + projected_embedding_dim +
+                (if in_features('xpostag') then xpostag_dim else 0) +
+                (if in_features('lemma') then lemma_char_dim else 0) +
+                (if in_features('upostag') then upostag_dim else 0) +
+                (if in_features('feats') then feats_dim else 0)),
+                hidden_size: hidden_size,
+                num_layers: num_layers,
+                recurrent_dropout_probability: 0.33,
+                layer_dropout_probability: 0.33
             },
+        },
         dependency_relation: {
             type: "combo_dependency_parsing_from_vocab",
             vocab_namespace: 'deprel_labels',
@@ -404,4 +389,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file
diff --git a/combo/data/dataset.py b/combo/data/dataset.py
index 4b0352a..bdc8b20 100644
--- a/combo/data/dataset.py
+++ b/combo/data/dataset.py
@@ -8,7 +8,7 @@ import torch
 from allennlp import data as allen_data
 from allennlp.common import checks, util
 from allennlp.data import fields as allen_fields, vocabulary
-from conllu import parser, TokenList
+from conllu import parser
 from dataclasses import dataclass
 from overrides import overrides
 
@@ -27,7 +27,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
             features: List[str] = None,
             targets: List[str] = None,
             use_sem: bool = False,
-            max_input_embedder: int = None,
             **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -49,7 +48,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                 "Remove {} from either features or targets.".format(intersection)
             )
         self.use_sem = use_sem
-        self.max_input_embedder = max_input_embedder
 
         # *.conllu readers configuration
         fields = list(parser.DEFAULT_FIELDS)
@@ -90,16 +88,13 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
 
     @overrides
     def text_to_instance(self, tree: conllu.TokenList) -> allen_data.Instance:
-        if self.max_input_embedder:
-            tree = TokenList(tokens=tree.tokens[:self.max_input_embedder],
-                             metadata=tree.metadata)
         fields_: Dict[str, allen_data.Field] = {}
         tree_tokens = [t for t in tree if isinstance(t["id"], int)]
         tokens = [_Token(t["token"],
-                  pos_=t.get("upostag"),
-                  tag_=t.get("xpostag"),
-                  lemma_=t.get("lemma"),
-                  feats_=t.get("feats"))
+                         pos_=t.get("upostag"),
+                         tag_=t.get("xpostag"),
+                         lemma_=t.get("lemma"),
+                         feats_=t.get("feats"))
                   for t in tree_tokens]
 
         # features
@@ -122,11 +117,7 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                                                                               text_field,
                                                                               label_namespace="feats_labels")
                     elif target_name == "head":
-                        if self.max_input_embedder:
-                            target_values = [0 if v == "_" else int(v) for v in target_values]
-                            target_values = [v for v in target_values if v < self.max_input_embedder]
-                        else:
-                            target_values = [0 if v == "_" else int(v) for v in target_values]
+                        target_values = [0 if v == "_" else int(v) for v in target_values]
                         fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
                                                                                label_namespace=target_name + "_labels")
                     elif target_name == "deps":
@@ -139,8 +130,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
                             t_deps = t["deps"]
                             if t_deps and t_deps != "_":
                                 for rel, head in t_deps:
-                                    if self.max_input_embedder and int(head) >= self.max_input_embedder:
-                                        continue
                                     # EmoryNLP skips the first edge, if there are two edges between the same
                                     # nodes. Thanks to that one is in a tree and another in a graph.
                                     # This snippet follows that approach.
diff --git a/combo/main.py b/combo/main.py
index 10b04ef..d1e0292 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -57,8 +57,6 @@ flags.DEFINE_string(name="serialization_dir", default=None,
                     help="Model serialization directory (default - system temp dir).")
 flags.DEFINE_boolean(name="tensorboard", default=False,
                      help="When provided model will log tensorboard metrics.")
-flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
-                    help="Indicator whether to use transformer encoder or BiLSTM (default)")
 
 # Finetune after training flags
 flags.DEFINE_list(name="finetuning_training_data_path", default="",
@@ -199,7 +197,6 @@ def _get_ext_vars(finetuning: bool = False) -> Dict:
         "num_epochs": str(FLAGS.num_epochs),
         "word_batch_size": str(FLAGS.word_batch_size),
         "use_tensorboard": str(FLAGS.tensorboard),
-        "use_transformer_encoder": str(FLAGS.use_transformer_encoder)
     }
 
 
diff --git a/combo/predict.py b/combo/predict.py
index b235389..01a0837 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -228,8 +228,7 @@ class COMBO(predictor.Predictor):
     @classmethod
     def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
                         batch_size: int = 1024,
-                        cuda_device: int = -1,
-                        max_input_embedder: int = None):
+                        cuda_device: int = -1):
         util.import_module_and_submodules("combo.commands")
         util.import_module_and_submodules("combo.models")
         util.import_module_and_submodules("combo.training")
@@ -246,13 +245,6 @@ class COMBO(predictor.Predictor):
 
         archive = models.load_archive(model_path, cuda_device=cuda_device)
         model = archive.model
-        dataset_reader = allen_data.DatasetReader.from_params(archive.config["dataset_reader"],
-                                                              max_input_embedder=max_input_embedder)
-
-        logger.info("Using a pretrained transformer embedder may require truncating tokenized sentences.")
-        if max_input_embedder:
-            logger.info(f"Sentences are truncated to the first {max_input_embedder} tokens.")
-        else:
-            logger.info("Sentences are not truncated.")
-
+        dataset_reader = allen_data.DatasetReader.from_params(
+            archive.config["dataset_reader"])
         return cls(model, dataset_reader, tokenizer, batch_size)
diff --git a/docs/installation.md b/docs/installation.md
index 422bed2..6354582 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -2,7 +2,7 @@
 Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
 ```bash
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 combo --helpfull
 ```
 
@@ -11,7 +11,7 @@ combo --helpfull
 python -m venv venv
 source venv/bin/activate
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
 conda create -n combo python=3.8
 conda activate combo
 pip install -U pip setuptools wheel
-pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
+pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
 ```
 
 ## Problems & solutions
diff --git a/scripts/predict_ud_transformer.py b/scripts/predict_ud_transformer.py
deleted file mode 100644
index c24832c..0000000
--- a/scripts/predict_ud_transformer.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import pathlib
-
-from absl import app
-from absl import flags
-
-from scripts import utils
-
-LANG = ["Polish"]
-TREEBANKS = {"Polish" : "UD_Polish-PDB"}
-
-FLAGS = flags.FLAGS
-flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo",
-                    help="Path to data directory.")
-flags.DEFINE_string(name="models_dir", default="/home/pszenny/combo/tmp/",
-                    help="Model serialization dir.")
-flags.DEFINE_integer(name="cuda_device", default=-1,
-                     help="Cuda device id (-1 for cpu).")
-flags.DEFINE_boolean(name="expect_prefix", default=True,
-                     help="Whether to expect allennlp prefix.")
-flags.DEFINE_integer(name="batch_size", default=32,
-                     help="Batch size.")
-
-def run(_):
-    for encoder in ["BiLSTM","transformer"]:
-        models_dir = pathlib.Path(FLAGS.models_dir)
-        for model_dir in models_dir.iterdir():
-            lang = model_dir.name
-            if lang not in LANG:
-                print("Skipping unknown directory: ", lang)
-                continue
-
-            if FLAGS.expect_prefix:
-                model_dir = pathlib.Path(models_dir) / (lang + "/" + encoder + "/")
-                model_dir = list(model_dir.iterdir())
-                assert len(model_dir) == 1, f"Incorrect number of models in {model_dir}"
-                model_dir = model_dir[0]
-
-            data_dir = pathlib.Path(FLAGS.data_dir)
-            data_dir = data_dir / TREEBANKS[lang]
-            files = list(data_dir.iterdir())
-            test_file = [f for f in files if "test" in f.name and ".conllu" in f.name]
-            assert len(test_file) == 1, "Couldn't find test file."
-            test_file = test_file[0]
-
-            output_pred = data_dir / f'{lang}_pred.conllu'
-            command = f"""combo --mode predict --model_path {model_dir}
-            --input_file {test_file}
-            --output_file {output_pred}
-            --cuda_device {FLAGS.cuda_device}
-            --batch_size {FLAGS.batch_size}
-            --silent
-            """
-            utils.execute_command(command)
-            return 1
-
-
-
-def main():
-    app.run(run)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/train_ud_transformer.py b/scripts/train_ud_transformer.py
deleted file mode 100644
index 9ecabae..0000000
--- a/scripts/train_ud_transformer.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""Script to train Dependency Parsing models based on UD 2.x data."""
-import pathlib
-
-from absl import app
-from absl import flags
-
-from scripts import utils
-
-# # ls -1 | xargs -i echo "\"{}\","
-# UD 2.7
-TREEBANKS = ["UD_Polish-PDB"]
-embedding_model = "allegro/herbert-base-cased"
-
-FLAGS = flags.FLAGS
-flags.DEFINE_list(name="treebanks", default=TREEBANKS,
-                  help=f"Treebanks to train. Possible values: {TREEBANKS}.")
-flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo/",
-                    help="Path to UD data directory.")
-flags.DEFINE_string(name="serialization_dir", default="/home/pszenny/combo/tmp/",
-                    help="Model serialization directory.")
-flags.DEFINE_integer(name="cuda_device", default=-1,
-                     help="Cuda device id (-1 for cpu).")
-flags.DEFINE_string(name="train_config_path", default="/home/pszenny/combo/combo/config.template.jsonnet",
-                    help="Directory of jsonnet config file")
-flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
-                    help="Indicator whether to use transformer encoder or BiLSTM (default)")
-
-def run(_):
-    treebanks_dir = pathlib.Path(FLAGS.data_dir)
-
-    for treebank in FLAGS.treebanks:
-        assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
-        treebank_dir = treebanks_dir / treebank
-        treebank_parts = treebank[3:].split("-")
-        language = treebank_parts[0]
-
-        files = list(treebank_dir.iterdir())
-
-        training_file = [f for f in files if "train" in f.name and ".conllu" in f.name]
-        assert len(training_file) == 1, f"Couldn't find training file."
-        training_file_path = training_file[0]
-
-        valid_file = [f for f in files if "dev" in f.name and ".conllu" in f.name]
-        assert len(valid_file) == 1, f"Couldn't find validation file."
-        valid_file_path = valid_file[0]
-
-        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/BiLSTM/")
-        serialization_dir.mkdir(exist_ok=True, parents=True)
-
-        word_batch_size = 2500
-
-        command = f"""time combo --mode train
-        --cuda_device {FLAGS.cuda_device}
-        --training_data_path {training_file_path}
-        --validation_data_path {valid_file_path}
-        --pretrained_transformer_name {embedding_model}
-        --serialization_dir {serialization_dir}
-        --use_transformer_encoder {FLAGS.use_transformer_encoder}
-        --config_path {FLAGS.train_config_path}
-        --notensorboard
-        --word_batch_size {word_batch_size}
-        --targets deprel,head,upostag,lemma,feats,xpostag
-        """
-
-        utils.execute_command(command)
-
-        FLAGS.use_transformer_encoder = True
-        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/transformer/")
-        serialization_dir.mkdir(exist_ok=True, parents=True)
-
-        command = f"""time combo --mode train
-        --cuda_device {FLAGS.cuda_device}
-        --training_data_path {training_file_path}
-        --validation_data_path {valid_file_path}
-        --pretrained_transformer_name {embedding_model}
-        --serialization_dir {serialization_dir}
-        --use_transformer_encoder {FLAGS.use_transformer_encoder}
-        --config_path {FLAGS.train_config_path}
-        --notensorboard
-        --word_batch_size {word_batch_size}
-        --targets deprel,head,upostag,lemma,feats,xpostag
-        """
-
-        utils.execute_command(command)
-
-
-
-
-
-
-def main():
-    app.run(run)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/setup.py b/setup.py
index 876909d..0e28601 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.4',
+    version='1.0.5',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
-- 
GitLab