From 8d0fd6c911cb3cb740e4ad271e569bf42fab756c Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Mon, 11 Jan 2021 09:04:08 +0100
Subject: [PATCH 1/4] Add configuration files to the packaged library and pip
 installation instructions.

---
 Makefile                                                 | 2 +-
 README.md                                                | 7 +++----
 .../config.graph.template.jsonnet                        | 0
 config.template.jsonnet => combo/config.template.jsonnet | 0
 combo/data/dataset.py                                    | 5 ++++-
 combo/main.py                                            | 5 +++--
 docs/installation.md                                     | 11 +++++------
 setup.py                                                 | 3 ++-
 tests/test_main.py                                       | 2 +-
 9 files changed, 19 insertions(+), 16 deletions(-)
 rename config.graph.template.jsonnet => combo/config.graph.template.jsonnet (100%)
 rename config.template.jsonnet => combo/config.template.jsonnet (100%)

diff --git a/Makefile b/Makefile
index df0c483..af1b424 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 clean:
-	rm -rf COMBO.egg-info
+	rm -rf combo.egg-info
 	rm -rf .eggs
 	rm -rf .pytest_cache
 
diff --git a/README.md b/README.md
index a9c2113..d48aa40 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,8 @@
 ## Quick start
-Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed):
+Install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed):
 ```bash
-git clone https://gitlab.clarin-pl.eu/syntactic-tools/clarinbiz/combo.git
-cd combo
-python setup.py develop
+pip install -U pip setuptools wheel
+pip install --index-url https://pypi.clarin-pl.eu/simple combo
 ```
 Run the following commands in your Python console to make predictions with a pre-trained model:
 ```python
diff --git a/config.graph.template.jsonnet b/combo/config.graph.template.jsonnet
similarity index 100%
rename from config.graph.template.jsonnet
rename to combo/config.graph.template.jsonnet
diff --git a/config.template.jsonnet b/combo/config.template.jsonnet
similarity index 100%
rename from config.template.jsonnet
rename to combo/config.template.jsonnet
diff --git a/combo/data/dataset.py b/combo/data/dataset.py
index 48b68b1..9aabe19 100644
--- a/combo/data/dataset.py
+++ b/combo/data/dataset.py
@@ -1,5 +1,6 @@
 import copy
 import logging
+import pathlib
 from typing import Union, List, Dict, Iterable, Optional, Any, Tuple
 
 import conllu
@@ -79,7 +80,9 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
         file_path = [file_path] if len(file_path.split(",")) == 0 else file_path.split(",")
 
         for conllu_file in file_path:
-            with open(conllu_file, "r") as file:
+            file = pathlib.Path(conllu_file)
+            assert conllu_file and file.exists(), f"File with path '{conllu_file}' does not exist!"
+            with file.open("r") as file:
                 for annotation in conllu.parse_incr(file, fields=self.fields, field_parsers=self.field_parsers):
                     yield self.text_to_instance(annotation)
 
diff --git a/combo/main.py b/combo/main.py
index dc9a0d7..d1e0292 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -1,6 +1,7 @@
 """Main entry point."""
 import logging
 import os
+import pathlib
 import tempfile
 from typing import Dict
 
@@ -31,7 +32,7 @@ flags.DEFINE_string(name="output_file", default="output.log",
                     help="Predictions result file.")
 
 # Training flags
-flags.DEFINE_list(name="training_data_path", default="./tests/fixtures/example.conllu",
+flags.DEFINE_list(name="training_data_path", default=[],
                   help="Training data path(s)")
 flags.DEFINE_alias(name="training_data", original_name="training_data_path")
 flags.DEFINE_list(name="validation_data_path", default="",
@@ -62,7 +63,7 @@ flags.DEFINE_list(name="finetuning_training_data_path", default="",
                   help="Training data path(s)")
 flags.DEFINE_list(name="finetuning_validation_data_path", default="",
                   help="Validation data path(s)")
-flags.DEFINE_string(name="config_path", default="config.template.jsonnet",
+flags.DEFINE_string(name="config_path", default=str(pathlib.Path(__file__).parent / "config.template.jsonnet"),
                     help="Config file path.")
 
 # Test after training flags
diff --git a/docs/installation.md b/docs/installation.md
index bf741f9..ed2ae3f 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,9 +1,8 @@
 # Installation
-Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
+Install COMBO (we suggest using virtualenv/conda with Python 3.6+):
 ```bash
-git clone https://gitlab.clarin-pl.eu/syntactic-tools/clarinbiz/combo.git
-cd combo
-python setup.py develop
+pip install -U pip setuptools wheel
+pip install --index-url https://pypi.clarin-pl.eu/simple combo
 combo --helpfull
 ```
 
@@ -11,8 +10,8 @@ combo --helpfull
 ```bash
 python -m venv venv
 source venv/bin/activate
-pip install --upgrade pip
-python setup.py develop
+pip install -U pip setuptools wheel
+pip install --index-url https://pypi.clarin-pl.eu/simple combo
 ```
 
 ## Problems & solutions
diff --git a/setup.py b/setup.py
index 07cd2e3..d946c84 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ REQUIREMENTS = [
 ]
 
 setup(
-    name='COMBO',
+    name='combo',
     version='1.0.0b1',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
@@ -31,6 +31,7 @@ setup(
     setup_requires=['pytest-runner', 'pytest-pylint'],
     tests_require=['pytest', 'pylint'],
     python_requires='>=3.6',
+    package_data={'combo': ['config.graph.template.jsonnet', 'config.template.jsonnet']},
     entry_points={'console_scripts': ['combo = combo.main:main']},
     classifiers=[
         'Development Status :: 4 - Beta',
diff --git a/tests/test_main.py b/tests/test_main.py
index 448e64d..107f681 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -41,7 +41,7 @@ class TrainingEndToEndTest(unittest.TestCase):
             "word_batch_size": "1",
             "use_tensorboard": "False"
         }
-        params = Params.from_file(os.path.join(self.PROJECT_ROOT, "config.template.jsonnet"),
+        params = Params.from_file(os.path.join(self.MODULE_ROOT, "config.template.jsonnet"),
                                   ext_vars=ext_vars)
 
         # when
-- 
GitLab

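For reference, patch 1/4 works because setuptools' package_data copies both
jsonnet templates next to the combo modules, so the --config_path flag default
in combo/main.py can resolve them relative to the installed package rather
than the current working directory. A minimal sketch of that lookup, assuming
the pip-installed package from this series (the assert is illustrative, not
part of the patch):

    import pathlib

    import combo  # the package installed from pypi.clarin-pl.eu

    # Same resolution as the --config_path default in combo/main.py:
    # after installation the template sits inside the combo package dir.
    config_path = pathlib.Path(combo.__file__).parent / "config.template.jsonnet"
    assert config_path.exists(), f"template not packaged: {config_path}"
    print(config_path)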

From f3f0c4a8bf0f84d67db14aad0eb90b28dae9715f Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Mon, 11 Jan 2021 13:21:43 +0100
Subject: [PATCH 2/4] Cleaner download bar.

---
 combo/utils/download.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/combo/utils/download.py b/combo/utils/download.py
index b2c6b2e..003b64c 100644
--- a/combo/utils/download.py
+++ b/combo/utils/download.py
@@ -1,6 +1,5 @@
 import errno
 import logging
-import math
 import os
 
 import requests
@@ -23,18 +22,18 @@ def download_file(model_name, force=False):
     if os.path.exists(location) and not force:
         logger.debug("Using cached model.")
         return location
-    chunk_size = 1024 * 10
+    chunk_size = 1024
     logger.info(url)
     try:
         with _requests_retry_session(retries=2).get(url, stream=True) as r:
-            total_length = math.ceil(int(r.headers.get("content-length")) / chunk_size)
+            pbar = tqdm.tqdm(unit="B", total=int(r.headers.get("content-length")),
+                             unit_divisor=chunk_size, unit_scale=True)
             with open(location, "wb") as f:
-                with tqdm.tqdm(total=total_length) as pbar:
-                    for chunk in r.raw.stream(chunk_size, decode_content=False):
+                with pbar:
+                    for chunk in r.iter_content(chunk_size):
                         if chunk:
                             f.write(chunk)
-                            f.flush()
-                            pbar.update(1)
+                            pbar.update(len(chunk))
     except exceptions.RetryError:
         raise ConnectionError(f"Couldn't find or download model {model_name}.tar.gz. "
                               "Check if model name is correct or try again later!")
-- 
GitLab

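For reference, patch 2/4 reduces the download loop to a standard
requests/tqdm pattern: size the bar in bytes from Content-Length and advance
it by the length of each chunk from iter_content, which may yield chunks
shorter than chunk_size. A standalone sketch under those assumptions; the URL
and output file name are placeholders, not COMBO endpoints:

    import requests
    import tqdm

    url = "https://example.com/model.tar.gz"  # placeholder URL
    chunk_size = 1024

    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total = int(r.headers.get("content-length", 0))
        # unit="B" + unit_scale renders human-readable sizes (e.g. 1.2MB);
        # unit_divisor controls the 1024-based scaling steps.
        with tqdm.tqdm(unit="B", total=total, unit_scale=True,
                       unit_divisor=chunk_size) as pbar:
            with open("model.tar.gz", "wb") as f:
                for chunk in r.iter_content(chunk_size):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))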

From d2a17a768b4165a4a6017d1e71b3849132e7a0f1 Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Wed, 13 Jan 2021 11:30:20 +0100
Subject: [PATCH 3/4] Add model license information, make dataset_reader a
 public attribute, exclude treebanks without data from the training script.

---
 combo/predict.py | 11 +++----
 docs/models.md   |  9 +++++-
 scripts/train.py | 74 ++++++++++++++++++++----------------------------
 3 files changed, 44 insertions(+), 50 deletions(-)

diff --git a/combo/predict.py b/combo/predict.py
index bd9f5d4..e528a18 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -29,8 +29,9 @@ class COMBO(predictor.Predictor):
         super().__init__(model, dataset_reader)
         self.batch_size = batch_size
         self.vocab = model.vocab
-        self._dataset_reader.generate_labels = False
-        self._dataset_reader.lazy = True
+        self.dataset_reader = self._dataset_reader
+        self.dataset_reader.generate_labels = False
+        self.dataset_reader.lazy = True
         self._tokenizer = tokenizer
         self.without_sentence_embedding = False
         self.line_to_conllu = line_to_conllu
@@ -112,7 +113,7 @@ class COMBO(predictor.Predictor):
             tokens = sentence
         else:
             raise ValueError("Input must be either string or list of strings.")
-        return self._dataset_reader.text_to_instance(tokens2conllu(tokens))
+        return self.dataset_reader.text_to_instance(tokens2conllu(tokens))
 
     @overrides
     def load_line(self, line: str) -> common.JsonDict:
@@ -125,7 +126,7 @@ class COMBO(predictor.Predictor):
         if self.without_sentence_embedding:
             outputs.sentence_embedding = []
         if self.line_to_conllu:
-            return sentence2conllu(outputs, keep_semrel=self._dataset_reader.use_sem).serialize()
+            return sentence2conllu(outputs, keep_semrel=self.dataset_reader.use_sem).serialize()
         else:
             return outputs.to_json()
 
@@ -134,7 +135,7 @@ class COMBO(predictor.Predictor):
         return {"sentence": sentence}
 
     def _to_input_instance(self, sentence: data.Sentence) -> allen_data.Instance:
-        return self._dataset_reader.text_to_instance(sentence2conllu(sentence))
+        return self.dataset_reader.text_to_instance(sentence2conllu(sentence))
 
     def _predictions_as_tree(self, predictions: Dict[str, Any], instance: allen_data.Instance):
         tree = instance.fields["metadata"]["input"]
diff --git a/docs/models.md b/docs/models.md
index 94eed03..96bd7e9 100644
--- a/docs/models.md
+++ b/docs/models.md
@@ -4,8 +4,15 @@ COMBO provides pre-trained models for:
 - morphosyntactic prediction (i.e. part-of-speech tagging, morphosyntactic analysis, lemmatisation and dependency parsing) trained on the treebanks from [Universal Dependencies repository](https://universaldependencies.org),
 - enhanced dependency parsing trained on IWPT 2020 shared task [data](https://universaldependencies.org/iwpt20/data.html).
 
-Pre-trained models list with the **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing)
+## Pre-trained models
+The list of **pre-trained models** with their **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing).
 Please notice that the name in the brackets matches the name used in [Automatic Download](models.md#Automatic download).
+
+### License
+Models are distributed under the same license as the data they were trained on.
+
+See [Universal Dependencies v2.7 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/license-ud-2.7) and [Universal Dependencies v2.5 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/licence-UD-2.5) for details.
+
 ## Manual download
 
 The pre-trained models can be downloaded from [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
diff --git a/scripts/train.py b/scripts/train.py
index dc75344..accca4a 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -10,24 +10,17 @@ from scripts import utils
 # UD 2.7
 TREEBANKS = [
     "UD_Afrikaans-AfriBooms",
-    "UD_Akkadian-PISANDUB",
-    "UD_Akkadian-RIAO",
-    "UD_Akuntsu-TuDeT",
-    "UD_Albanian-TSA",
-    "UD_Amharic-ATT",
-    "UD_Ancient_Greek-Perseus",
-    "UD_Ancient_Greek-PROIEL",
-    "UD_Apurina-UFPA",
+    # "UD_Albanian-TSA", No training data
+    # "UD_Amharic-ATT", No training data
     "UD_Arabic-NYUAD",
     "UD_Arabic-PADT",
     "UD_Arabic-PUD",
     "UD_Armenian-ArmTDP",
-    "UD_Assyrian-AS",
-    "UD_Bambara-CRB",
+    # "UD_Assyrian-AS", No training data
+    # "UD_Bambara-CRB", No training data
     "UD_Basque-BDT",
     "UD_Belarusian-HSE",
-    "UD_Bhojpuri-BHTB",
-    "UD_Breton-KEB",
+    # "UD_Breton-KEB", No training data
     "UD_Bulgarian-BTB",
     "UD_Buryat-BDT",
     "UD_Cantonese-HK",
@@ -48,17 +41,9 @@ TREEBANKS = [
     "UD_Czech-PUD",
     "UD_Danish-DDT",
     "UD_Dutch-Alpino",
-    "UD_Dutch-LassySmall",
-    "UD_English-ESL",
+    # END OF FIRST RUN
     "UD_English-EWT",
-    "UD_English-GUM",
-    "UD_English-GUMReddit",
-    "UD_English-LinES",
-    "UD_English-ParTUT",
-    "UD_English-Pronouns",
-    "UD_English-PUD",
-    "UD_Erzya-JR",
-    "UD_Estonian-EDT",
+    # "UD_Erzya-JR", No training data
     "UD_Estonian-EWT",
     "UD_Faroese-FarPaHC",
     "UD_Faroese-OFT",
@@ -98,7 +83,7 @@ TREEBANKS = [
     "UD_Italian-PUD",
     "UD_Italian-TWITTIRO",
     "UD_Italian-VIT",
-    # "UD_Japanese-BCCWJ", no data
+    # "UD_Japanese-BCCWJ", No public data
     "UD_Japanese-GSD",
     "UD_Japanese-Modern",
     "UD_Japanese-PUD",
@@ -119,9 +104,9 @@ TREEBANKS = [
     "UD_Latvian-LVTB",
     "UD_Lithuanian-ALKSNIS",
     "UD_Lithuanian-HSE",
-    "UD_Livvi-KKPP",
+    # end batch 2
     "UD_Maltese-MUDT",
-    "UD_Manx-Cadhan",
+    # "UD_Manx-Cadhan", No training data
     "UD_Marathi-UFAL",
     "UD_Mbya_Guarani-Dooley",
     "UD_Mbya_Guarani-Thomas",
@@ -153,8 +138,7 @@ TREEBANKS = [
     "UD_Russian-PUD",
     "UD_Russian-SynTagRus",
     "UD_Russian-Taiga",
-    "UD_Sanskrit-UFAL",
-    "UD_Sanskrit-Vedic",
+    # "UD_Sanskrit-UFAL", No training data
     "UD_Scottish_Gaelic-ARCOSG",
     "UD_Serbian-SET",
     "UD_Skolt_Sami-Giellagas",
@@ -167,31 +151,22 @@ TREEBANKS = [
     "UD_Spanish-GSD",
     "UD_Spanish-PUD",
     "UD_Swedish-LinES",
-    "UD_Swedish-PUD",
-    "UD_Swedish_Sign_Language-SSLC",
-    "UD_Swedish-Talbanken",
-    "UD_Swiss_German-UZH",
-    "UD_Tagalog-TRG",
-    "UD_Tagalog-Ugnayan",
-    "UD_Tamil-MWTT",
-    "UD_Tamil-TTB",
+    # "UD_Tagalog-TRG", No training data
+    # "UD_Tamil-MWTT", No training data
     "UD_Telugu-MTG",
-    "UD_Thai-PUD",
-    "UD_Tupinamba-TuDeT",
+    # "UD_Thai-PUD", No training data
     "UD_Turkish-BOUN",
     "UD_Turkish-GB",
     "UD_Turkish_German-SAGT",
     "UD_Turkish-IMST",
     "UD_Turkish-PUD",
     "UD_Ukrainian-IU",
-    "UD_Upper_Sorbian-UFAL",
+    # "UD_Upper_Sorbian-UFAL", No validation data
     "UD_Urdu-UDTB",
     "UD_Uyghur-UDT",
     "UD_Vietnamese-VTB",
-    "UD_Warlpiri-UFAL",
-    "UD_Welsh-CCG",
-    "UD_Wolof-WTB",
-    "UD_Yoruba-YTB",
+    # "UD_Welsh-CCG", No validation data
+    # "UD_Yoruba-YTB", No training data
 ]
 
 FLAGS = flags.FLAGS
@@ -250,13 +225,24 @@ def run(_):
         """
 
         # Datasets without XPOS
-        if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Hungarian-Szeged"}:
+        if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Danish-DDT", "UD_Hungarian-Szeged", "UD_French-GSD",
+                        "UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}:
             command = command + " --targets deprel,head,upostag,lemma,feats"
 
+        # Datasets without LEMMA and FEATS
+        if treebank in {"UD_Maltese-MUDT"}:
+            command = command + " --targets deprel,head,upostag,xpostag"
+
+        # Datasets without XPOS and FEATS
+        if treebank in {"UD_Telugu-MTG"}:
+            command = command + " --targets deprel,head,upostag,lemma"
+
         # Reduce word_batch_size
         word_batch_size = 2500
-        if treebank in {"UD_German-HDT"}:
+        if treebank in {"UD_German-HDT", "UD_Marathi-UFAL"}:
             word_batch_size = 1000
+        elif treebank in {"UD_Telugu-MTG"}:
+            word_batch_size = 500
         command = command + f" --word_batch_size {word_batch_size}"
 
         utils.execute_command(command)
-- 
GitLab

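With dataset_reader public after patch 3/4, downstream code can adjust the
reader without reaching into the underscored attribute. A usage sketch,
assuming the COMBO.from_pretrained entry point shown in the README quick
start ("polish-herbert-base" is an example model name from the models
spreadsheet):

    from combo.predict import COMBO

    # Any pre-trained model listed in docs/models.md can be substituted.
    nlp = COMBO.from_pretrained("polish-herbert-base")

    # Public attribute after this patch (was self._dataset_reader).
    nlp.dataset_reader.lazy = True

    sentence = nlp("To jest przykładowe zdanie.")
    print(sentence.tokens)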

From 6943a2673f80614266e585a6244c8300cf6a67d3 Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Wed, 13 Jan 2021 14:00:56 +0100
Subject: [PATCH 4/4] Release 1.0.0b2.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index d946c84..1a9bfd0 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ REQUIREMENTS = [
 
 setup(
     name='combo',
-    version='1.0.0b1',
+    version='1.0.0b2',
     author='Mateusz Klimaszewski',
     author_email='M.Klimaszewski@ii.pw.edu.pl',
     install_requires=REQUIREMENTS,
-- 
GitLab