Commit d2a17a76 authored by Mateusz Klimaszewski

Add models license information, make dataset_reader public attribute, exclude treebanks without data from script.
parent f3f0c4a8
@@ -29,8 +29,9 @@ class COMBO(predictor.Predictor):
         super().__init__(model, dataset_reader)
         self.batch_size = batch_size
         self.vocab = model.vocab
-        self._dataset_reader.generate_labels = False
-        self._dataset_reader.lazy = True
+        self.dataset_reader = self._dataset_reader
+        self.dataset_reader.generate_labels = False
+        self.dataset_reader.lazy = True
         self._tokenizer = tokenizer
         self.without_sentence_embedding = False
         self.line_to_conllu = line_to_conllu
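With the reader exposed as a public `dataset_reader` attribute, callers no longer have to reach into the private `_dataset_reader`. A minimal usage sketch, assuming the `from_pretrained` loader and callable predictor from the COMBO README; the model name is illustrative:

```python
from combo import predict

# Model name is illustrative; any pre-trained COMBO model works the same way.
nlp = predict.COMBO.from_pretrained("polish-herbert-base")

# The reader is now public API; e.g. one can inspect whether the model was
# trained with semantic relations (used when serializing to CoNLL-U).
# Before this commit the same access required nlp._dataset_reader.
print(nlp.dataset_reader.use_sem)

sentence = nlp("Witaj świecie!")
```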
@@ -112,7 +113,7 @@ class COMBO(predictor.Predictor):
             tokens = sentence
         else:
             raise ValueError("Input must be either string or list of strings.")
-        return self._dataset_reader.text_to_instance(tokens2conllu(tokens))
+        return self.dataset_reader.text_to_instance(tokens2conllu(tokens))

     @overrides
     def load_line(self, line: str) -> common.JsonDict:
@@ -125,7 +126,7 @@ class COMBO(predictor.Predictor):
         if self.without_sentence_embedding:
             outputs.sentence_embedding = []
         if self.line_to_conllu:
-            return sentence2conllu(outputs, keep_semrel=self._dataset_reader.use_sem).serialize()
+            return sentence2conllu(outputs, keep_semrel=self.dataset_reader.use_sem).serialize()
         else:
             return outputs.to_json()
@@ -134,7 +135,7 @@ class COMBO(predictor.Predictor):
         return {"sentence": sentence}

     def _to_input_instance(self, sentence: data.Sentence) -> allen_data.Instance:
-        return self._dataset_reader.text_to_instance(sentence2conllu(sentence))
+        return self.dataset_reader.text_to_instance(sentence2conllu(sentence))

     def _predictions_as_tree(self, predictions: Dict[str, Any], instance: allen_data.Instance):
         tree = instance.fields["metadata"]["input"]
@@ -4,8 +4,15 @@ COMBO provides pre-trained models for:
 - morphosyntactic prediction (i.e. part-of-speech tagging, morphosyntactic analysis, lemmatisation and dependency parsing) trained on treebanks from the [Universal Dependencies repository](https://universaldependencies.org),
 - enhanced dependency parsing trained on IWPT 2020 shared task [data](https://universaldependencies.org/iwpt20/data.html).

-Pre-trained models list with the **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing)
+## Pre-trained models
+The list of **pre-trained models**, together with their **evaluation results**, is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing).
+Note that the name in brackets matches the name used in [Automatic Download](models.md#Automatic download).
+
+### License
+Models are released under the same licenses as the data they were trained on.
+See the [Universal Dependencies v2.7 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/license-ud-2.7) and the [Universal Dependencies v2.5 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/licence-UD-2.5) for details.
+
 ## Manual download
 The pre-trained models can be downloaded from [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
@@ -10,24 +10,17 @@ from scripts import utils
# UD 2.7
TREEBANKS = [
"UD_Afrikaans-AfriBooms",
"UD_Akkadian-PISANDUB",
"UD_Akkadian-RIAO",
"UD_Akuntsu-TuDeT",
"UD_Albanian-TSA",
"UD_Amharic-ATT",
"UD_Ancient_Greek-Perseus",
"UD_Ancient_Greek-PROIEL",
"UD_Apurina-UFPA",
# "UD_Albanian-TSA", No training data
# "UD_Amharic-ATT", No training data
"UD_Arabic-NYUAD",
"UD_Arabic-PADT",
"UD_Arabic-PUD",
"UD_Armenian-ArmTDP",
"UD_Assyrian-AS",
"UD_Bambara-CRB",
# "UD_Assyrian-AS", No training data
# "UD_Bambara-CRB", No training data
"UD_Basque-BDT",
"UD_Belarusian-HSE",
"UD_Bhojpuri-BHTB",
"UD_Breton-KEB",
# "UD_Breton-KEB", No training data
"UD_Bulgarian-BTB",
"UD_Buryat-BDT",
"UD_Cantonese-HK",
@@ -48,17 +41,9 @@ TREEBANKS = [
"UD_Czech-PUD",
"UD_Danish-DDT",
"UD_Dutch-Alpino",
"UD_Dutch-LassySmall",
"UD_English-ESL",
#END OF FIRST RUN
"UD_English-EWT",
"UD_English-GUM",
"UD_English-GUMReddit",
"UD_English-LinES",
"UD_English-ParTUT",
"UD_English-Pronouns",
"UD_English-PUD",
"UD_Erzya-JR",
"UD_Estonian-EDT",
# "UD_Erzya-JR", No training data
"UD_Estonian-EWT",
"UD_Faroese-FarPaHC",
"UD_Faroese-OFT",
@@ -98,7 +83,7 @@ TREEBANKS = [
"UD_Italian-PUD",
"UD_Italian-TWITTIRO",
"UD_Italian-VIT",
# "UD_Japanese-BCCWJ", no data
# "UD_Japanese-BCCWJ", No public data
"UD_Japanese-GSD",
"UD_Japanese-Modern",
"UD_Japanese-PUD",
@@ -119,9 +104,9 @@ TREEBANKS = [
"UD_Latvian-LVTB",
"UD_Lithuanian-ALKSNIS",
"UD_Lithuanian-HSE",
"UD_Livvi-KKPP",
# end batch 2
"UD_Maltese-MUDT",
"UD_Manx-Cadhan",
# "UD_Manx-Cadhan", No training data
"UD_Marathi-UFAL",
"UD_Mbya_Guarani-Dooley",
"UD_Mbya_Guarani-Thomas",
@@ -153,8 +138,7 @@ TREEBANKS = [
"UD_Russian-PUD",
"UD_Russian-SynTagRus",
"UD_Russian-Taiga",
"UD_Sanskrit-UFAL",
"UD_Sanskrit-Vedic",
# "UD_Sanskrit-UFAL", No training data
"UD_Scottish_Gaelic-ARCOSG",
"UD_Serbian-SET",
"UD_Skolt_Sami-Giellagas",
@@ -167,31 +151,22 @@ TREEBANKS = [
"UD_Spanish-GSD",
"UD_Spanish-PUD",
"UD_Swedish-LinES",
"UD_Swedish-PUD",
"UD_Swedish_Sign_Language-SSLC",
"UD_Swedish-Talbanken",
"UD_Swiss_German-UZH",
"UD_Tagalog-TRG",
"UD_Tagalog-Ugnayan",
"UD_Tamil-MWTT",
"UD_Tamil-TTB",
# "UD_Tagalog-TRG", No training data
# "UD_Tamil-MWTT", No training data
"UD_Telugu-MTG",
"UD_Thai-PUD",
"UD_Tupinamba-TuDeT",
# "UD_Thai-PUD", No training data
"UD_Turkish-BOUN",
"UD_Turkish-GB",
"UD_Turkish_German-SAGT",
"UD_Turkish-IMST",
"UD_Turkish-PUD",
"UD_Ukrainian-IU",
"UD_Upper_Sorbian-UFAL",
# "UD_Upper_Sorbian-UFAL", No validation data
"UD_Urdu-UDTB",
"UD_Uyghur-UDT",
"UD_Vietnamese-VTB",
"UD_Warlpiri-UFAL",
"UD_Welsh-CCG",
"UD_Wolof-WTB",
"UD_Yoruba-YTB",
# "UD_Welsh-CCG", No validation data
# "UD_Yoruba-YTB", No training data
]
FLAGS = flags.FLAGS
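The exclusions in the list above were applied by hand, with a comment noting the missing split. A hypothetical helper that detects the same condition automatically, assuming the standard UD release layout in which a treebank directory contains a `*-ud-train.conllu` file exactly when training data exists (the `ud-treebanks-v2.7` root is an assumption about the local checkout):

```python
import pathlib

def has_training_data(treebank_dir: str) -> bool:
    """Return True if the UD treebank directory ships a *-ud-train.conllu split."""
    return any(pathlib.Path(treebank_dir).glob("*-ud-train.conllu"))

# Example: keep only treebanks that can actually be trained on.
trainable = [t for t in TREEBANKS if has_training_data(f"ud-treebanks-v2.7/{t}")]
```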
@@ -250,13 +225,24 @@ def run(_):
     """
     # Datasets without XPOS
-    if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Hungarian-Szeged"}:
+    if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Danish-DDT", "UD_Hungarian-Szeged", "UD_French-GSD",
+                    "UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}:
         command = command + " --targets deprel,head,upostag,lemma,feats"
+    # Datasets without LEMMA and FEATS
+    if treebank in {"UD_Maltese-MUDT"}:
+        command = command + " --targets deprel,head,upostag,xpostag"
+    # Datasets without XPOS and FEATS
+    if treebank in {"UD_Telugu-MTG"}:
+        command = command + " --targets deprel,head,upostag,lemma"
     # Reduce word_batch_size
     word_batch_size = 2500
-    if treebank in {"UD_German-HDT"}:
+    if treebank in {"UD_German-HDT", "UD_Marathi-UFAL"}:
         word_batch_size = 1000
+    elif treebank in {"UD_Telugu-MTG"}:
+        word_batch_size = 500
     command = command + f" --word_batch_size {word_batch_size}"
     utils.execute_command(command)
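For testing these overrides in isolation, the batch-size chain above can be read as a pure function. A hypothetical refactoring, not part of this commit:

```python
def word_batch_size_for(treebank: str) -> int:
    # Mirrors the if/elif chain above: the very large UD_German-HDT and the
    # tiny UD_Marathi-UFAL both get 1000, UD_Telugu-MTG gets 500, and every
    # other treebank keeps the default of 2500.
    if treebank in {"UD_German-HDT", "UD_Marathi-UFAL"}:
        return 1000
    if treebank in {"UD_Telugu-MTG"}:
        return 500
    return 2500

assert word_batch_size_for("UD_Telugu-MTG") == 500
assert word_batch_size_for("UD_Polish-PDB") == 2500
```

Note that for UD_Telugu-MTG both overrides fire: the reduced target set (`--targets deprel,head,upostag,lemma`, since the treebank annotates neither XPOS nor FEATS) and the smallest word batch size.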