From 275921294df20f832035e24d036219c3625f3593 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alina=20Wr=C3=B3blewska?= <alina@ipipan.waw.pl>
Date: Tue, 28 Jul 2020 18:13:38 +0200
Subject: [PATCH 1/5] Readme added

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c5d0fd0..0ef5706 100644
--- a/README.md
+++ b/README.md
@@ -82,7 +82,7 @@ Input: one sentence per line.
 Output: List of token jsons.
 
 ```bash
-combo --mode predict --model_path your_model_tar_gz --input_file your_text_file --output_file your_output_file --silent --noconllu_format
+combo --mode predict --model_path your_model_tar_gz --input_file your_text_file --output_file your_output_file --silent
 ```
 #### Advanced
-- GitLab

From 3fcd99d1297bb9b923c94fd2ae7702fe6cc0ae5e Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Sat, 7 Nov 2020 19:27:00 +0100
Subject: [PATCH 2/5] Add header for readme.md

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 0ef5706..0213823 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,14 @@
+# COMBO
+<p align="center">
+  A GPL-3.0 system, built on top of PyTorch and AllenNLP, for morphosyntactic analysis.
+</p>
+<hr/>
+<p align="center">
+  <a href="https://github.com/ipipan/combo/blob/master/LICENSE">
+    <img alt="License" src="https://img.shields.io/github/license/ipipan/combo.svg?color=blue&cachedrop">
+  </a>
+</p>
+
 ## Installation
 
 Clone this repository and run:
-- GitLab

From a5d36a1cabb190deb463dc6b870fe57c9dc576a7 Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Mon, 9 Nov 2020 10:22:28 +0100
Subject: [PATCH 3/5] Add information about pre-trained models.

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 0213823..87a4657 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,14 @@
 </a>
 </p>
 
+[Pre-trained models!](http://mozart.ipipan.waw.pl/~mklimaszewski/models/)
+```python
+import combo.predict as predict
+nlp = predict.SemanticMultitaskPredictor.from_pretrained("polish-herbert-base")
+sentence = nlp("Moje zdanie.")
+print(sentence.tokens)
+```
+
 ## Installation
 
 Clone this repository and run:
-- GitLab

From aa2b4d90bb4d4fe940ab37e11776aca2ea8b5969 Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Mon, 9 Nov 2020 12:59:44 +0100
Subject: [PATCH 4/5] Fix console prediction.
---
 combo/main.py    | 3 +++
 combo/predict.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/combo/main.py b/combo/main.py
index c7aac87..44ad091 100644
--- a/combo/main.py
+++ b/combo/main.py
@@ -146,6 +146,9 @@ def run(_):
     else:
         use_dataset_reader = FLAGS.conllu_format
     predictor = _get_predictor()
+    if FLAGS.input_file == "-":
+        use_dataset_reader = False
+        predictor.without_sentence_embedding = True
     if use_dataset_reader:
         predictor.line_to_conllu = True
     if FLAGS.silent:
diff --git a/combo/predict.py b/combo/predict.py
index ebbb372..b6c7172 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -32,6 +32,7 @@ class SemanticMultitaskPredictor(predictor.Predictor):
         self._dataset_reader.generate_labels = False
         self._dataset_reader.lazy = True
         self._tokenizer = tokenizer
+        self.without_sentence_embedding = False
         self.line_to_conllu = line_to_conllu
 
     def __call__(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
@@ -127,6 +128,8 @@ class SemanticMultitaskPredictor(predictor.Predictor):
     def dump_line(self, outputs: data.Sentence) -> str:
         # Check whether serialized (str) tree or token's list
         # Serialized tree has already separators between lines
+        if self.without_sentence_embedding:
+            outputs.sentence_embedding = []
         if self.line_to_conllu:
             return sentence2conllu(outputs, keep_semrel=self._dataset_reader.use_sem).serialize()
         else:
-- GitLab
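With the fix above, passing `--input_file "-"` switches COMBO to console input, bypasses the CoNLL-U dataset reader, and drops the sentence embedding from each dumped line. A minimal usage sketch (the model archive name is a placeholder, and piping via `echo` assumes `"-"` is read as standard input, which the flag handling suggests but does not show):

```bash
# Console mode enabled by the fix: feed one sentence on stdin and print
# the parsed tokens without the sentence embedding.
echo "Moje zdanie." | combo --mode predict \
  --model_path polish-herbert-base.tar.gz \
  --input_file "-" \
  --nosilent
```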
From 5a79601340d546d2e254812e6ca4b8b5be38bf84 Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Tue, 10 Nov 2020 14:43:10 +0100
Subject: [PATCH 5/5] Split documentation into multiple markdown files.

---
 README.md            | 122 ++++++-------------------------------
 docs/installation.md |  13 +++++
 docs/models.md       |  19 +++++++
 docs/prediction.md   |  40 ++++++++++++++
 docs/training.md     |  52 ++++++++++++++++++
 5 files changed, 139 insertions(+), 107 deletions(-)
 create mode 100644 docs/installation.md
 create mode 100644 docs/models.md
 create mode 100644 docs/prediction.md
 create mode 100644 docs/training.md

diff --git a/README.md b/README.md
index 87a4657..19491b3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # COMBO
 <p align="center">
-  A GPL-3.0 system, built on top of PyTorch and AllenNLP, for morphosyntactic analysis.
+  A language-independent NLP system for dependency parsing, part-of-speech tagging, lemmatisation and more, built on top of PyTorch and AllenNLP.
 </p>
 <hr/>
 <p align="center">
@@ -9,118 +9,26 @@
 </a>
 </p>
 
-[Pre-trained models!](http://mozart.ipipan.waw.pl/~mklimaszewski/models/)
-```python
-import combo.predict as predict
-nlp = predict.SemanticMultitaskPredictor.from_pretrained("polish-herbert-base")
-sentence = nlp("Moje zdanie.")
-print(sentence.tokens)
-```
-
-## Installation
-
-Clone this repository and run:
+## Quick start
+Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
 ```bash
+git clone https://github.com/ipipan/combo.git
+cd combo
 python setup.py develop
 ```
-
-### Problems & solutions
-* **jsonnet** installation error
-
-use `conda install -c conda-forge jsonnet=0.15.0`
-
-## Training
-
-Command:
-```bash
-combo --mode train \
-    --training_data_path your_training_path \
-    --validation_data_path your_validation_path
-```
-
-Options:
-```bash
-combo --helpfull
-```
-
-Examples (for clarity without training/validation data paths):
-
-* train on gpu 0
-
-  ```bash
-  combo --mode train --cuda_device 0
-  ```
-
-* use pretrained embeddings:
-
-  ```bash
-  combo --mode train --pretrained_tokens your_pretrained_embeddings_path --embedding_dim your_embeddings_dim
-  ```
-
-* use pretrained transformer embeddings:
-
-  ```bash
-  combo --mode train --pretrained_transformer_name your_choosen_pretrained_transformer
-  ```
-
-* predict only dependency tree:
-
-  ```bash
-  combo --mode train --targets head,deprel
-  ```
-
-* use part-of-speech tags for predicting only dependency tree
-
-  ```bash
-  combo --mode train --targets head,deprel --features token,char,upostag
-  ```
-
-Advanced configuration: [Configuration](#configuration)
-
-## Prediction
-
-### ConLLU file prediction:
-Input and output are both in `*.conllu` format.
-```bash
-combo --mode predict --model_path your_model_tar_gz --input_file your_conllu_file --output_file your_output_file --silent
-```
-
-### Console
-Works for models where input was text-based only.
-
-Interactive testing in console (load model and just type sentence in console).
-
-```bash
-combo --mode predict --model_path your_model_tar_gz --input_file "-" --nosilent
-```
-### Raw text
-Works for models where input was text-based only.
-
-Input: one sentence per line.
-
-Output: List of token jsons.
-
-```bash
-combo --mode predict --model_path your_model_tar_gz --input_file your_text_file --output_file your_output_file --silent
-```
-#### Advanced
-
-There are 2 tokenizers: whitespace and spacy-based (`en_core_web_sm` model).
-
-Use either `--predictor_name semantic-multitask-predictor` or `--predictor_name semantic-multitask-predictor-spacy`.
-
-### Python
+Run the following lines in your Python console to make predictions with a pre-trained model:
 ```python
 import combo.predict as predict
-model_path = "your_model.tar.gz"
-nlp = predict.SemanticMultitaskPredictor.from_pretrained(model_path)
-sentence = nlp("Sentence to parse.")
+nlp = predict.SemanticMultitaskPredictor.from_pretrained("polish-herbert-base")
+sentence = nlp("Moje zdanie.")
+print(sentence.tokens)
 ```
-## Configuration
+## Details
+
+- [**Installation**](docs/installation.md)
+- [**Pre-trained models**](docs/models.md)
+- [**Training**](docs/training.md)
+- [**Prediction**](docs/prediction.md)
 
-### Advanced
-Config template [config.template.jsonnet](config.template.jsonnet) is formed in `allennlp` format so you can freely modify it.
-There is configuration for all the training/model parameters (learning rates, epochs number etc.).
-Some of them use `jsonnet` syntax to get values from configuration flags, however most of them can be modified directly there.
diff --git a/docs/installation.md b/docs/installation.md
new file mode 100644
index 0000000..559c9c7
--- /dev/null
+++ b/docs/installation.md
@@ -0,0 +1,13 @@
+# Installation
+Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
+```bash
+git clone https://github.com/ipipan/combo.git
+cd combo
+python setup.py develop
+combo --helpfull
+```
+
+## Problems & solutions
+* **jsonnet** installation error
+
+run `conda install -c conda-forge jsonnet=0.15.0`
diff --git a/docs/models.md b/docs/models.md
new file mode 100644
index 0000000..485f761
--- /dev/null
+++ b/docs/models.md
@@ -0,0 +1,19 @@
+# Models
+
+Pre-trained models are available [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
+
+## Automatic download
+The Python `from_pretrained` method downloads a pre-trained model if the provided name (without the `.tar.gz` extension) matches one of the names listed [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
+```python
+import combo.predict as predict
+
+nlp = predict.SemanticMultitaskPredictor.from_pretrained("polish-herbert-base")
+```
+Otherwise, it looks for the model in the local environment.
+
+## Console prediction/Local model
+If you want to use the console version of COMBO, you need to download a pre-trained model manually
+```bash
+wget http://mozart.ipipan.waw.pl/~mklimaszewski/models/polish-herbert-base.tar.gz
+```
+and pass it as a parameter (see [prediction doc](prediction.md)).
diff --git a/docs/prediction.md b/docs/prediction.md
new file mode 100644
index 0000000..89cc74c
--- /dev/null
+++ b/docs/prediction.md
@@ -0,0 +1,40 @@
+# Prediction
+
+## CoNLL-U file prediction
+Input and output are both in `*.conllu` format.
+```bash
+combo --mode predict --model_path your_model_tar_gz --input_file your_conllu_file --output_file your_output_file --silent
+```
+
+## Console
+Works only for models trained on text-based input.
+
+Interactive testing in the console: load the model and simply type a sentence.
+
+```bash
+combo --mode predict --model_path your_model_tar_gz --input_file "-" --nosilent
+```
+## Raw text
+Works only for models trained on text-based input.
+
+Input: one sentence per line.
+
+Output: a list of token JSONs.
+
+```bash
+combo --mode predict --model_path your_model_tar_gz --input_file your_text_file --output_file your_output_file --silent --noconllu_format
+```
+### Advanced
+
+There are two tokenizers: whitespace-based and spaCy-based (`en_core_web_sm` model).
+
+Use either `--predictor_name semantic-multitask-predictor` or `--predictor_name semantic-multitask-predictor-spacy`.
+
+## Python
+```python
+import combo.predict as predict
+
+model_path = "your_model.tar.gz"
+nlp = predict.SemanticMultitaskPredictor.from_pretrained(model_path)
+sentence = nlp("Sentence to parse.")
+```
diff --git a/docs/training.md b/docs/training.md
new file mode 100644
index 0000000..9dc430a
--- /dev/null
+++ b/docs/training.md
@@ -0,0 +1,52 @@
+# Training
+
+Command:
+```bash
+combo --mode train \
+  --training_data_path your_training_path \
+  --validation_data_path your_validation_path
+```
+
+Options:
+```bash
+combo --helpfull
+```
+
+Examples (for clarity without training/validation data paths):
+
+* train on GPU 0:
+
+  ```bash
+  combo --mode train --cuda_device 0
+  ```
+
+* use pretrained embeddings:
+
+  ```bash
+  combo --mode train --pretrained_tokens your_pretrained_embeddings_path --embedding_dim your_embeddings_dim
+  ```
+
+* use pretrained transformer embeddings:
+
+  ```bash
+  combo --mode train --pretrained_transformer_name your_chosen_pretrained_transformer
+  ```
+
+* predict only the dependency tree:
+
+  ```bash
+  combo --mode train --targets head,deprel
+  ```
+
+* use part-of-speech tags for predicting only the dependency tree:
+
+  ```bash
+  combo --mode train --targets head,deprel --features token,char,upostag
+  ```
+
+## Configuration
+
+### Advanced
+The config template [config.template.jsonnet](config.template.jsonnet) follows the `allennlp` format, so you can modify it freely.
+It covers all the training/model parameters (learning rates, number of epochs, etc.).
+Some of them use `jsonnet` syntax to read values from configuration flags; however, most of them can be modified directly in the file.
\ No newline at end of file
-- GitLab
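The training examples above show each flag in isolation, but they compose into a single command. A hedged sketch (the data paths and the transformer name are placeholders, not values shipped with COMBO):

```bash
# Illustrative combined invocation: GPU 0, transformer embeddings,
# and prediction targets restricted to the dependency tree.
combo --mode train \
  --training_data_path your_training_path \
  --validation_data_path your_validation_path \
  --cuda_device 0 \
  --pretrained_transformer_name your_chosen_pretrained_transformer \
  --targets head,deprel
```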