From 9218e33cbcabe69e7f0fd1a7b1fed0ea53da29be Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Thu, 4 Mar 2021 12:00:06 +0100
Subject: [PATCH] Extend training configuration.

---
 scripts/train.py     |  4 ++++
 scripts/train_eud.py | 11 +++++++++--
 scripts/utils.py     |  7 +++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/scripts/train.py b/scripts/train.py
index accca4a..950ee82 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -229,6 +229,10 @@ def run(_):
                         "UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}:
             command = command + " --targets deprel,head,upostag,lemma,feats"
 
+        # Datasets without FEATS
+        if treebank in {"UD_Japanese-GSD", "UD_Korean-Kaist"}:
+            command = command + " --targets deprel,head,upostag,xpostag,lemma"
+
         # Datasets without LEMMA and FEATS
         if treebank in {"UD_Maltese-MUDT"}:
             command = command + " --targets deprel,head,upostag,xpostag"
diff --git a/scripts/train_eud.py b/scripts/train_eud.py
index 4904e0b..ba13a27 100644
--- a/scripts/train_eud.py
+++ b/scripts/train_eud.py
@@ -105,7 +105,8 @@ def run(_):
 
         serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang
         serialization_dir.mkdir(exist_ok=True, parents=True)
-        utils.execute_command("".join(f"""combo --mode train
+
+        command = f"""combo --mode train
         --training_data {train_path}
         --validation_data {dev_path}
         --targets feats,upostag,xpostag,head,deprel,lemma,deps
@@ -115,7 +116,13 @@ def run(_):
         --word_batch_size 2500
         --config_path {pathlib.Path.cwd() / 'config.graph.template.jsonnet'}
         --notensorboard
-        """.splitlines()))
+        """
+
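+        # Datasets without XPOS. NOTE(review): this override also omits `deps`, unlike the default targets above - confirm intended.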
+        if lang in {"fr"}:
+            command = command + " --targets deprel,head,upostag,lemma,feats"
+
+        utils.execute_command("".join(command.splitlines()))
 
 
 def main():
diff --git a/scripts/utils.py b/scripts/utils.py
index 5dda2b8..ebfec3e 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -4,6 +4,13 @@ import subprocess
 LANG2TRANSFORMER = {
     "en": "bert-base-cased",
     "pl": "allegro/herbert-base-cased",
+    "zh": "bert-base-chinese",
+    "fi": "TurkuNLP/bert-base-finnish-cased-v1",
+    "ja": "cl-tohoku/bert-base-japanese",
+    "ko": "kykim/bert-kor-base",
+    "de": "dbmdz/bert-base-german-cased",
+    "ar": "aubmindlab/bert-base-arabertv2",
+    "eu": "ixa-ehu/berteus-base-cased"
 }
 
 
-- 
GitLab