From 9218e33cbcabe69e7f0fd1a7b1fed0ea53da29be Mon Sep 17 00:00:00 2001 From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com> Date: Thu, 4 Mar 2021 12:00:06 +0100 Subject: [PATCH] Extend training configuration. --- scripts/train.py | 4 ++++ scripts/train_eud.py | 11 +++++++++-- scripts/utils.py | 7 +++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/train.py b/scripts/train.py index accca4a..950ee82 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -229,6 +229,10 @@ def run(_): "UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}: command = command + " --targets deprel,head,upostag,lemma,feats" + # Datasets without FEATS + if treebank in {"UD_Japanese-GSD", "UD_Korean-Kaist"}: + command = command + " --targets deprel,head,upostag,xpostag,lemma" + # Datasets without LEMMA and FEATS if treebank in {"UD_Maltese-MUDT"}: command = command + " --targets deprel,head,upostag,xpostag" diff --git a/scripts/train_eud.py b/scripts/train_eud.py index 4904e0b..ba13a27 100644 --- a/scripts/train_eud.py +++ b/scripts/train_eud.py @@ -105,7 +105,8 @@ def run(_): serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang serialization_dir.mkdir(exist_ok=True, parents=True) - utils.execute_command("".join(f"""combo --mode train + + command = f"""combo --mode train --training_data {train_path} --validation_data {dev_path} --targets feats,upostag,xpostag,head,deprel,lemma,deps @@ -115,7 +116,13 @@ def run(_): --word_batch_size 2500 --config_path {pathlib.Path.cwd() / 'config.graph.template.jsonnet'} --notensorboard - """.splitlines())) + """ + + # Datasets without XPOS + if lang in {"fr"}: + command = command + " --targets deprel,head,upostag,lemma,feats" + + utils.execute_command("".join(command.splitlines())) def main(): diff --git a/scripts/utils.py b/scripts/utils.py index 5dda2b8..ebfec3e 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -4,6 +4,13 @@ import subprocess LANG2TRANSFORMER = { "en": "bert-base-cased", "pl": "allegro/herbert-base-cased", + "zh": "bert-base-chinese", + "fi": "TurkuNLP/bert-base-finnish-cased-v1", + "ja": "cl-tohoku/bert-base-japanese", + "ko": "kykim/bert-kor-base", + "de": "dbmdz/bert-base-german-cased", + "ar": "aubmindlab/bert-base-arabertv2", + "eu": "ixa-ehu/berteus-base-cased" } -- GitLab