Skip to content
Snippets Groups Projects
Commit d2a17a76 authored by Mateusz Klimaszewski's avatar Mateusz Klimaszewski
Browse files

Add models license information, make dataset_reader public attribute, exclude...

Add models license information, make dataset_reader public attribute, exclude treebanks without data from script.
parent f3f0c4a8
No related branches found
No related tags found
2 merge requests!20Release 1.0.0b2.,!19Release 1.0.0b2.
Pipeline #2258 passed
...@@ -29,8 +29,9 @@ class COMBO(predictor.Predictor): ...@@ -29,8 +29,9 @@ class COMBO(predictor.Predictor):
super().__init__(model, dataset_reader) super().__init__(model, dataset_reader)
self.batch_size = batch_size self.batch_size = batch_size
self.vocab = model.vocab self.vocab = model.vocab
self._dataset_reader.generate_labels = False self.dataset_reader = self._dataset_reader
self._dataset_reader.lazy = True self.dataset_reader.generate_labels = False
self.dataset_reader.lazy = True
self._tokenizer = tokenizer self._tokenizer = tokenizer
self.without_sentence_embedding = False self.without_sentence_embedding = False
self.line_to_conllu = line_to_conllu self.line_to_conllu = line_to_conllu
...@@ -112,7 +113,7 @@ class COMBO(predictor.Predictor): ...@@ -112,7 +113,7 @@ class COMBO(predictor.Predictor):
tokens = sentence tokens = sentence
else: else:
raise ValueError("Input must be either string or list of strings.") raise ValueError("Input must be either string or list of strings.")
return self._dataset_reader.text_to_instance(tokens2conllu(tokens)) return self.dataset_reader.text_to_instance(tokens2conllu(tokens))
@overrides @overrides
def load_line(self, line: str) -> common.JsonDict: def load_line(self, line: str) -> common.JsonDict:
...@@ -125,7 +126,7 @@ class COMBO(predictor.Predictor): ...@@ -125,7 +126,7 @@ class COMBO(predictor.Predictor):
if self.without_sentence_embedding: if self.without_sentence_embedding:
outputs.sentence_embedding = [] outputs.sentence_embedding = []
if self.line_to_conllu: if self.line_to_conllu:
return sentence2conllu(outputs, keep_semrel=self._dataset_reader.use_sem).serialize() return sentence2conllu(outputs, keep_semrel=self.dataset_reader.use_sem).serialize()
else: else:
return outputs.to_json() return outputs.to_json()
...@@ -134,7 +135,7 @@ class COMBO(predictor.Predictor): ...@@ -134,7 +135,7 @@ class COMBO(predictor.Predictor):
return {"sentence": sentence} return {"sentence": sentence}
def _to_input_instance(self, sentence: data.Sentence) -> allen_data.Instance: def _to_input_instance(self, sentence: data.Sentence) -> allen_data.Instance:
return self._dataset_reader.text_to_instance(sentence2conllu(sentence)) return self.dataset_reader.text_to_instance(sentence2conllu(sentence))
def _predictions_as_tree(self, predictions: Dict[str, Any], instance: allen_data.Instance): def _predictions_as_tree(self, predictions: Dict[str, Any], instance: allen_data.Instance):
tree = instance.fields["metadata"]["input"] tree = instance.fields["metadata"]["input"]
......
...@@ -4,8 +4,15 @@ COMBO provides pre-trained models for: ...@@ -4,8 +4,15 @@ COMBO provides pre-trained models for:
- morphosyntactic prediction (i.e. part-of-speech tagging, morphosyntactic analysis, lemmatisation and dependency parsing) trained on the treebanks from [Universal Dependencies repository](https://universaldependencies.org), - morphosyntactic prediction (i.e. part-of-speech tagging, morphosyntactic analysis, lemmatisation and dependency parsing) trained on the treebanks from [Universal Dependencies repository](https://universaldependencies.org),
- enhanced dependency parsing trained on IWPT 2020 shared task [data](https://universaldependencies.org/iwpt20/data.html). - enhanced dependency parsing trained on IWPT 2020 shared task [data](https://universaldependencies.org/iwpt20/data.html).
Pre-trained models list with the **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing) ## Pre-trained models
**Pre-trained models** list with the **evaluation results** is available in the [spreadsheet](https://docs.google.com/spreadsheets/d/1WFYc2aLRa1jw7le030HOacv9fc4zmtqiZtRQY6gl5mc/edit?usp=sharing)
Please note that the name in brackets matches the name used in [Automatic Download](models.md#Automatic download). Please note that the name in brackets matches the name used in [Automatic Download](models.md#Automatic download).
### License
Models are licensed under the same license as the data used to train them.
See [Universal Dependencies v2.7 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/license-ud-2.7) and [Universal Dependencies v2.5 License Agreement](https://lindat.mff.cuni.cz/repository/xmlui/page/licence-UD-2.5) for details.
## Manual download ## Manual download
The pre-trained models can be downloaded from [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/). The pre-trained models can be downloaded from [here](http://mozart.ipipan.waw.pl/~mklimaszewski/models/).
......
...@@ -10,24 +10,17 @@ from scripts import utils ...@@ -10,24 +10,17 @@ from scripts import utils
# UD 2.7 # UD 2.7
TREEBANKS = [ TREEBANKS = [
"UD_Afrikaans-AfriBooms", "UD_Afrikaans-AfriBooms",
"UD_Akkadian-PISANDUB", # "UD_Albanian-TSA", No training data
"UD_Akkadian-RIAO", # "UD_Amharic-ATT", No training data
"UD_Akuntsu-TuDeT",
"UD_Albanian-TSA",
"UD_Amharic-ATT",
"UD_Ancient_Greek-Perseus",
"UD_Ancient_Greek-PROIEL",
"UD_Apurina-UFPA",
"UD_Arabic-NYUAD", "UD_Arabic-NYUAD",
"UD_Arabic-PADT", "UD_Arabic-PADT",
"UD_Arabic-PUD", "UD_Arabic-PUD",
"UD_Armenian-ArmTDP", "UD_Armenian-ArmTDP",
"UD_Assyrian-AS", # "UD_Assyrian-AS", No training data
"UD_Bambara-CRB", # "UD_Bambara-CRB", No training data
"UD_Basque-BDT", "UD_Basque-BDT",
"UD_Belarusian-HSE", "UD_Belarusian-HSE",
"UD_Bhojpuri-BHTB", # "UD_Breton-KEB", No training data
"UD_Breton-KEB",
"UD_Bulgarian-BTB", "UD_Bulgarian-BTB",
"UD_Buryat-BDT", "UD_Buryat-BDT",
"UD_Cantonese-HK", "UD_Cantonese-HK",
...@@ -48,17 +41,9 @@ TREEBANKS = [ ...@@ -48,17 +41,9 @@ TREEBANKS = [
"UD_Czech-PUD", "UD_Czech-PUD",
"UD_Danish-DDT", "UD_Danish-DDT",
"UD_Dutch-Alpino", "UD_Dutch-Alpino",
"UD_Dutch-LassySmall", #END OF FIRST RUN
"UD_English-ESL",
"UD_English-EWT", "UD_English-EWT",
"UD_English-GUM", # "UD_Erzya-JR", No training data
"UD_English-GUMReddit",
"UD_English-LinES",
"UD_English-ParTUT",
"UD_English-Pronouns",
"UD_English-PUD",
"UD_Erzya-JR",
"UD_Estonian-EDT",
"UD_Estonian-EWT", "UD_Estonian-EWT",
"UD_Faroese-FarPaHC", "UD_Faroese-FarPaHC",
"UD_Faroese-OFT", "UD_Faroese-OFT",
...@@ -98,7 +83,7 @@ TREEBANKS = [ ...@@ -98,7 +83,7 @@ TREEBANKS = [
"UD_Italian-PUD", "UD_Italian-PUD",
"UD_Italian-TWITTIRO", "UD_Italian-TWITTIRO",
"UD_Italian-VIT", "UD_Italian-VIT",
# "UD_Japanese-BCCWJ", no data # "UD_Japanese-BCCWJ", No public data
"UD_Japanese-GSD", "UD_Japanese-GSD",
"UD_Japanese-Modern", "UD_Japanese-Modern",
"UD_Japanese-PUD", "UD_Japanese-PUD",
...@@ -119,9 +104,9 @@ TREEBANKS = [ ...@@ -119,9 +104,9 @@ TREEBANKS = [
"UD_Latvian-LVTB", "UD_Latvian-LVTB",
"UD_Lithuanian-ALKSNIS", "UD_Lithuanian-ALKSNIS",
"UD_Lithuanian-HSE", "UD_Lithuanian-HSE",
"UD_Livvi-KKPP", # end batch 2
"UD_Maltese-MUDT", "UD_Maltese-MUDT",
"UD_Manx-Cadhan", # "UD_Manx-Cadhan", No training data
"UD_Marathi-UFAL", "UD_Marathi-UFAL",
"UD_Mbya_Guarani-Dooley", "UD_Mbya_Guarani-Dooley",
"UD_Mbya_Guarani-Thomas", "UD_Mbya_Guarani-Thomas",
...@@ -153,8 +138,7 @@ TREEBANKS = [ ...@@ -153,8 +138,7 @@ TREEBANKS = [
"UD_Russian-PUD", "UD_Russian-PUD",
"UD_Russian-SynTagRus", "UD_Russian-SynTagRus",
"UD_Russian-Taiga", "UD_Russian-Taiga",
"UD_Sanskrit-UFAL", # "UD_Sanskrit-UFAL", No training data
"UD_Sanskrit-Vedic",
"UD_Scottish_Gaelic-ARCOSG", "UD_Scottish_Gaelic-ARCOSG",
"UD_Serbian-SET", "UD_Serbian-SET",
"UD_Skolt_Sami-Giellagas", "UD_Skolt_Sami-Giellagas",
...@@ -167,31 +151,22 @@ TREEBANKS = [ ...@@ -167,31 +151,22 @@ TREEBANKS = [
"UD_Spanish-GSD", "UD_Spanish-GSD",
"UD_Spanish-PUD", "UD_Spanish-PUD",
"UD_Swedish-LinES", "UD_Swedish-LinES",
"UD_Swedish-PUD", # "UD_Tagalog-TRG", No training data
"UD_Swedish_Sign_Language-SSLC", # "UD_Tamil-MWTT", No training data
"UD_Swedish-Talbanken",
"UD_Swiss_German-UZH",
"UD_Tagalog-TRG",
"UD_Tagalog-Ugnayan",
"UD_Tamil-MWTT",
"UD_Tamil-TTB",
"UD_Telugu-MTG", "UD_Telugu-MTG",
"UD_Thai-PUD", # "UD_Thai-PUD", No training data
"UD_Tupinamba-TuDeT",
"UD_Turkish-BOUN", "UD_Turkish-BOUN",
"UD_Turkish-GB", "UD_Turkish-GB",
"UD_Turkish_German-SAGT", "UD_Turkish_German-SAGT",
"UD_Turkish-IMST", "UD_Turkish-IMST",
"UD_Turkish-PUD", "UD_Turkish-PUD",
"UD_Ukrainian-IU", "UD_Ukrainian-IU",
"UD_Upper_Sorbian-UFAL", # "UD_Upper_Sorbian-UFAL", No validation data
"UD_Urdu-UDTB", "UD_Urdu-UDTB",
"UD_Uyghur-UDT", "UD_Uyghur-UDT",
"UD_Vietnamese-VTB", "UD_Vietnamese-VTB",
"UD_Warlpiri-UFAL", # "UD_Welsh-CCG", No validation data
"UD_Welsh-CCG", # "UD_Yoruba-YTB", No training data
"UD_Wolof-WTB",
"UD_Yoruba-YTB",
] ]
FLAGS = flags.FLAGS FLAGS = flags.FLAGS
...@@ -250,13 +225,24 @@ def run(_): ...@@ -250,13 +225,24 @@ def run(_):
""" """
# Datasets without XPOS # Datasets without XPOS
if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Hungarian-Szeged"}: if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Danish-DDT", "UD_Hungarian-Szeged", "UD_French-GSD",
"UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}:
command = command + " --targets deprel,head,upostag,lemma,feats" command = command + " --targets deprel,head,upostag,lemma,feats"
# Datasets without LEMMA and FEATS
if treebank in {"UD_Maltese-MUDT"}:
command = command + " --targets deprel,head,upostag,xpostag"
# Datasets without XPOS and FEATS
if treebank in {"UD_Telugu-MTG"}:
command = command + " --targets deprel,head,upostag,lemma"
# Reduce word_batch_size # Reduce word_batch_size
word_batch_size = 2500 word_batch_size = 2500
if treebank in {"UD_German-HDT"}: if treebank in {"UD_German-HDT", "UD_Marathi-UFAL"}:
word_batch_size = 1000 word_batch_size = 1000
elif treebank in {"UD_Telugu-MTG"}:
word_batch_size = 500
command = command + f" --word_batch_size {word_batch_size}" command = command + f" --word_batch_size {word_batch_size}"
utils.execute_command(command) utils.execute_command(command)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment