From 8d0fd6c911cb3cb740e4ad271e569bf42fab756c Mon Sep 17 00:00:00 2001 From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com> Date: Mon, 11 Jan 2021 09:04:08 +0100 Subject: [PATCH 1/4] Add configuration files to packaged library and pip installation instructions. --- Makefile | 2 +- README.md | 5 ++--- .../config.graph.template.jsonnet | 0 config.template.jsonnet => combo/config.template.jsonnet | 0 combo/data/dataset.py | 5 ++++- combo/main.py | 5 +++-- docs/installation.md | 9 ++++----- setup.py | 3 ++- tests/test_main.py | 2 +- 9 files changed, 17 insertions(+), 14 deletions(-) rename config.graph.template.jsonnet => combo/config.graph.template.jsonnet (100%) rename config.template.jsonnet => combo/config.template.jsonnet (100%) diff --git a/Makefile b/Makefile index df0c483..af1b424 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ clean: - rm -rf COMBO.egg-info + rm -rf combo.egg-info rm -rf .eggs rm -rf .pytest_cache diff --git a/README.md b/README.md index a9c2113..d48aa40 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,8 @@ ## Quick start Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a bundle of required packages will be installed): ```bash -git clone https://gitlab.clarin-pl.eu/syntactic-tools/clarinbiz/combo.git -cd combo -python setup.py develop +pip install -U pip setuptools wheel +pip install --index-url https://pypi.clarin-pl.eu/simple combo ``` Run the following commands in your Python console to make predictions with a pre-trained model: ```python diff --git a/config.graph.template.jsonnet b/combo/config.graph.template.jsonnet similarity index 100% rename from config.graph.template.jsonnet rename to combo/config.graph.template.jsonnet diff --git a/config.template.jsonnet b/combo/config.template.jsonnet similarity index 100% rename from config.template.jsonnet rename to combo/config.template.jsonnet diff --git a/combo/data/dataset.py b/combo/data/dataset.py index 
48b68b1..9aabe19 100644 --- a/combo/data/dataset.py +++ b/combo/data/dataset.py @@ -1,5 +1,6 @@ import copy import logging +import pathlib from typing import Union, List, Dict, Iterable, Optional, Any, Tuple import conllu @@ -79,7 +80,9 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader): file_path = [file_path] if len(file_path.split(",")) == 0 else file_path.split(",") for conllu_file in file_path: - with open(conllu_file, "r") as file: + file = pathlib.Path(conllu_file) + assert conllu_file and file.exists(), f"File with path '{conllu_file}' does not exists!" + with file.open("r"): for annotation in conllu.parse_incr(file, fields=self.fields, field_parsers=self.field_parsers): yield self.text_to_instance(annotation) diff --git a/combo/main.py b/combo/main.py index dc9a0d7..d1e0292 100644 --- a/combo/main.py +++ b/combo/main.py @@ -1,6 +1,7 @@ """Main entry point.""" import logging import os +import pathlib import tempfile from typing import Dict @@ -31,7 +32,7 @@ flags.DEFINE_string(name="output_file", default="output.log", help="Predictions result file.") # Training flags -flags.DEFINE_list(name="training_data_path", default="./tests/fixtures/example.conllu", +flags.DEFINE_list(name="training_data_path", default=[], help="Training data path(s)") flags.DEFINE_alias(name="training_data", original_name="training_data_path") flags.DEFINE_list(name="validation_data_path", default="", @@ -62,7 +63,7 @@ flags.DEFINE_list(name="finetuning_training_data_path", default="", help="Training data path(s)") flags.DEFINE_list(name="finetuning_validation_data_path", default="", help="Validation data path(s)") -flags.DEFINE_string(name="config_path", default="config.template.jsonnet", +flags.DEFINE_string(name="config_path", default=str(pathlib.Path(__file__).parent / "config.template.jsonnet"), help="Config file path.") # Test after training flags diff --git a/docs/installation.md b/docs/installation.md index bf741f9..ed2ae3f 100644 --- a/docs/installation.md 
+++ b/docs/installation.md @@ -1,9 +1,8 @@ # Installation Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+): ```bash -git clone https://gitlab.clarin-pl.eu/syntactic-tools/clarinbiz/combo.git -cd combo -python setup.py develop +pip install -U pip setuptools wheel +pip install --index-url https://pypi.clarin-pl.eu/simple combo combo --helpfull ``` @@ -11,8 +10,8 @@ combo --helpfull ```bash python -m venv venv source venv/bin/activate -pip install --upgrade pip -python setup.py develop +pip install -U pip setuptools wheel +pip install --index-url https://pypi.clarin-pl.eu/simple combo ``` ## Problems & solutions diff --git a/setup.py b/setup.py index 07cd2e3..d946c84 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ REQUIREMENTS = [ ] setup( - name='COMBO', + name='combo', version='1.0.0b1', author='Mateusz Klimaszewski', author_email='M.Klimaszewski@ii.pw.edu.pl', @@ -31,6 +31,7 @@ setup( setup_requires=['pytest-runner', 'pytest-pylint'], tests_require=['pytest', 'pylint'], python_requires='>=3.6', + package_data={'combo': ['config.graph.template.jsonnet', 'config.template.jsonnet']}, entry_points={'console_scripts': ['combo = combo.main:main']}, classifiers=[ 'Development Status :: 4 - Beta', diff --git a/tests/test_main.py b/tests/test_main.py index 448e64d..107f681 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -41,7 +41,7 @@ class TrainingEndToEndTest(unittest.TestCase): "word_batch_size": "1", "use_tensorboard": "False" } - params = Params.from_file(os.path.join(self.PROJECT_ROOT, "config.template.jsonnet"), + params = Params.from_file(os.path.join(self.MODULE_ROOT, "config.template.jsonnet"), ext_vars=ext_vars) # when -- GitLab From f3f0c4a8bf0f84d67db14aad0eb90b28dae9715f Mon Sep 17 00:00:00 2001 From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com> Date: Mon, 11 Jan 2021 13:21:43 +0100 Subject: [PATCH 2/4] Cleaner download bar. 
--- combo/utils/download.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/combo/utils/download.py b/combo/utils/download.py index b2c6b2e..003b64c 100644 --- a/combo/utils/download.py +++ b/combo/utils/download.py @@ -1,6 +1,5 @@ import errno import logging -import math import os import requests @@ -23,18 +22,18 @@ def download_file(model_name, force=False): if os.path.exists(location) and not force: logger.debug("Using cached model.") return location - chunk_size = 1024 * 10 + chunk_size = 1024 logger.info(url) try: with _requests_retry_session(retries=2).get(url, stream=True) as r: - total_length = math.ceil(int(r.headers.get("content-length")) / chunk_size) + pbar = tqdm.tqdm(unit="B", total=int(r.headers.get("content-length")), + unit_divisor=chunk_size, unit_scale=True) with open(location, "wb") as f: - with tqdm.tqdm(total=total_length) as pbar: - for chunk in r.raw.stream(chunk_size, decode_content=False): + with pbar: + for chunk in r.iter_content(chunk_size): if chunk: f.write(chunk) - f.flush() - pbar.update(1) + pbar.update(len(chunk)) except exceptions.RetryError: raise ConnectionError(f"Couldn't find or download model {model_name}.tar.gz. " "Check if model name is correct or try again later!") -- GitLab From d2a17a768b4165a4a6017d1e71b3849132e7a0f1 Mon Sep 17 00:00:00 2001 From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com> Date: Wed, 13 Jan 2021 11:30:20 +0100 Subject: [PATCH 3/4] Add models license information, make dataset_reader public attribute, exclude treebanks without data from script. 
--- combo/predict.py | 11 +++---- docs/models.md | 9 +++++- scripts/train.py | 74 ++++++++++++++++++++---------------------------- 3 files changed, 44 insertions(+), 50 deletions(-) diff --git a/combo/predict.py b/combo/predict.py index bd9f5d4..e528a18 100644 --- a/combo/predict.py +++ b/combo/predict.py @@ -29,8 +29,9 @@ class COMBO(predictor.Predictor): super().__init__(model, dataset_reader) self.batch_size = batch_size self.vocab = model.vocab - self._dataset_reader.generate_labels = False - self._dataset_reader.lazy = True + self.dataset_reader = self._dataset_reader + self.dataset_reader.generate_labels = False + self.dataset_reader.lazy = True self._tokenizer = tokenizer self.without_sentence_embedding = False self.line_to_conllu = line_to_conllu @@ -112,7 +113,7 @@ class COMBO(predictor.Predictor): tokens = sentence else: raise ValueError("Input must be either string or list of strings.") - return self._dataset_reader.text_to_instance(tokens2conllu(tokens)) + return self.dataset_reader.text_to_instance(tokens2conllu(tokens)) @overrides def load_line(self, line: str) -> common.JsonDict: @@ -125,7 +126,7 @@ class COMBO(predictor.Predictor): if self.without_sentence_embedding: outputs.sentence_embedding = [] if self.line_to_conllu: - return sentence2conllu(outputs, keep_semrel=self._dataset_reader.use_sem).serialize() + return sentence2conllu(outputs, keep_semrel=self.dataset_reader.use_sem).serialize() else: return outputs.to_json() @@ -134,7 +135,7 @@ class COMBO(predictor.Predictor): return {"sentence": sentence} def _to_input_instance(self, sentence: data.Sentence) -> allen_data.Instance: - return self._dataset_reader.text_to_instance(sentence2conllu(sentence)) + return self.dataset_reader.text_to_instance(sentence2conllu(sentence)) def _predictions_as_tree(self, predictions: Dict[str, Any], instance: allen_data.Instance): tree = instance.fields["metadata"]["input"] diff --git a/docs/models.md b/docs/models.md index 94eed03..96bd7e9 100644 --- 
a/docs/models.md +++ b/docs/models.md @@ -4,8 +4,15 @@ COMBO provides pre-trained models for: - morphosyntactic prediction (i.e. part-of-speech tagging, morphosyntactic analysis, lemmatisation and dependency parsing) trained on the treebanks from [Universal Dependencies repository](https://universaldependencies.org), - enhanced dependency parsing trained on IWPT 2020 shared task [data](https://universaldependencies.org/iwpt20/data.html). -Pre-trained models list with the **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing) +## Pre-trained models +**Pre-trained models** list with the **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing) Please note that the name in the brackets matches the name used in [Automatic Download](models.md#Automatic download). + +### License +Models are licensed under the same license as the data used to train them. + +See [Universal Dependencies v2.7 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/license-ud-2.7) and [Universal Dependencies v2.5 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/licence-UD-2.5) for details. + ## Manual download The pre-trained models can be downloaded from [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
diff --git a/scripts/train.py b/scripts/train.py index dc75344..accca4a 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -10,24 +10,17 @@ from scripts import utils # UD 2.7 TREEBANKS = [ "UD_Afrikaans-AfriBooms", - "UD_Akkadian-PISANDUB", - "UD_Akkadian-RIAO", - "UD_Akuntsu-TuDeT", - "UD_Albanian-TSA", - "UD_Amharic-ATT", - "UD_Ancient_Greek-Perseus", - "UD_Ancient_Greek-PROIEL", - "UD_Apurina-UFPA", + # "UD_Albanian-TSA", No training data + # "UD_Amharic-ATT", No training data "UD_Arabic-NYUAD", "UD_Arabic-PADT", "UD_Arabic-PUD", "UD_Armenian-ArmTDP", - "UD_Assyrian-AS", - "UD_Bambara-CRB", + # "UD_Assyrian-AS", No training data + # "UD_Bambara-CRB", No training data "UD_Basque-BDT", "UD_Belarusian-HSE", - "UD_Bhojpuri-BHTB", - "UD_Breton-KEB", + # "UD_Breton-KEB", No training data "UD_Bulgarian-BTB", "UD_Buryat-BDT", "UD_Cantonese-HK", @@ -48,17 +41,9 @@ TREEBANKS = [ "UD_Czech-PUD", "UD_Danish-DDT", "UD_Dutch-Alpino", - "UD_Dutch-LassySmall", - "UD_English-ESL", + #END OF FIRST RUN "UD_English-EWT", - "UD_English-GUM", - "UD_English-GUMReddit", - "UD_English-LinES", - "UD_English-ParTUT", - "UD_English-Pronouns", - "UD_English-PUD", - "UD_Erzya-JR", - "UD_Estonian-EDT", + # "UD_Erzya-JR", No training data "UD_Estonian-EWT", "UD_Faroese-FarPaHC", "UD_Faroese-OFT", @@ -98,7 +83,7 @@ TREEBANKS = [ "UD_Italian-PUD", "UD_Italian-TWITTIRO", "UD_Italian-VIT", - # "UD_Japanese-BCCWJ", no data + # "UD_Japanese-BCCWJ", No public data "UD_Japanese-GSD", "UD_Japanese-Modern", "UD_Japanese-PUD", @@ -119,9 +104,9 @@ TREEBANKS = [ "UD_Latvian-LVTB", "UD_Lithuanian-ALKSNIS", "UD_Lithuanian-HSE", - "UD_Livvi-KKPP", + # end batch 2 "UD_Maltese-MUDT", - "UD_Manx-Cadhan", + # "UD_Manx-Cadhan", No training data "UD_Marathi-UFAL", "UD_Mbya_Guarani-Dooley", "UD_Mbya_Guarani-Thomas", @@ -153,8 +138,7 @@ TREEBANKS = [ "UD_Russian-PUD", "UD_Russian-SynTagRus", "UD_Russian-Taiga", - "UD_Sanskrit-UFAL", - "UD_Sanskrit-Vedic", + # "UD_Sanskrit-UFAL", No training data 
"UD_Scottish_Gaelic-ARCOSG", "UD_Serbian-SET", "UD_Skolt_Sami-Giellagas", @@ -167,31 +151,22 @@ TREEBANKS = [ "UD_Spanish-GSD", "UD_Spanish-PUD", "UD_Swedish-LinES", - "UD_Swedish-PUD", - "UD_Swedish_Sign_Language-SSLC", - "UD_Swedish-Talbanken", - "UD_Swiss_German-UZH", - "UD_Tagalog-TRG", - "UD_Tagalog-Ugnayan", - "UD_Tamil-MWTT", - "UD_Tamil-TTB", + # "UD_Tagalog-TRG", No training data + # "UD_Tamil-MWTT", No training data "UD_Telugu-MTG", - "UD_Thai-PUD", - "UD_Tupinamba-TuDeT", + # "UD_Thai-PUD", No training data "UD_Turkish-BOUN", "UD_Turkish-GB", "UD_Turkish_German-SAGT", "UD_Turkish-IMST", "UD_Turkish-PUD", "UD_Ukrainian-IU", - "UD_Upper_Sorbian-UFAL", + # "UD_Upper_Sorbian-UFAL", No validation data "UD_Urdu-UDTB", "UD_Uyghur-UDT", "UD_Vietnamese-VTB", - "UD_Warlpiri-UFAL", - "UD_Welsh-CCG", - "UD_Wolof-WTB", - "UD_Yoruba-YTB", + # "UD_Welsh-CCG", No validation data + # "UD_Yoruba-YTB", No training data ] FLAGS = flags.FLAGS @@ -250,13 +225,24 @@ def run(_): """ # Datasets without XPOS - if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Hungarian-Szeged"}: + if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Danish-DDT", "UD_Hungarian-Szeged", "UD_French-GSD", + "UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}: command = command + " --targets deprel,head,upostag,lemma,feats" + # Datasets without LEMMA and FEATS + if treebank in {"UD_Maltese-MUDT"}: + command = command + " --targets deprel,head,upostag,xpostag" + + # Datasets without XPOS and FEATS + if treebank in {"UD_Telugu-MTG"}: + command = command + " --targets deprel,head,upostag,lemma" + # Reduce word_batch_size word_batch_size = 2500 - if treebank in {"UD_German-HDT"}: + if treebank in {"UD_German-HDT", "UD_Marathi-UFAL"}: word_batch_size = 1000 + elif treebank in {"UD_Telugu-MTG"}: + word_batch_size = 500 command = command + f" --word_batch_size {word_batch_size}" utils.execute_command(command) -- GitLab From 6943a2673f80614266e585a6244c8300cf6a67d3 Mon Sep 17 00:00:00 2001 From: Mateusz 
Klimaszewski <mk.klimaszewski@gmail.com> Date: Wed, 13 Jan 2021 14:00:56 +0100 Subject: [PATCH 4/4] Release 1.0.0b2. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d946c84..1a9bfd0 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ REQUIREMENTS = [ setup( name='combo', - version='1.0.0b1', + version='1.0.0b2', author='Mateusz Klimaszewski', author_email='M.Klimaszewski@ii.pw.edu.pl', install_requires=REQUIREMENTS, -- GitLab