From bcce7338a5325c44c2b7a6a3e77ad97fc6d11aa5 Mon Sep 17 00:00:00 2001
From: Mateusz Klimaszewski <mk.klimaszewski@gmail.com>
Date: Thu, 7 Jan 2021 13:27:29 +0100
Subject: [PATCH] Fix language extraction in training script and add script for
 downloading fasttext embeddings.

---
 scripts/download_fasttext.py | 306 +++++++++++++++++++++++++++++++++++
 scripts/train.py             |   2 +-
 2 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 scripts/download_fasttext.py

diff --git a/scripts/download_fasttext.py b/scripts/download_fasttext.py
new file mode 100644
index 0000000..1919fa2
--- /dev/null
+++ b/scripts/download_fasttext.py
@@ -0,0 +1,306 @@
+import pathlib
+
+from absl import app
+from absl import flags
+
+from scripts import utils
+
+# egrep -o 'https?://[^ ]+vec.gz' links.txt
+# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
+LINKS = [
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.vec.gz",
+]
+
+CODE_2_LANG = {
+    "af": "Afrikaans",
+    "aii": "Assyrian",
+    "ajp": "South_Levantine_Arabic",
+    "akk": "Akkadian",
+    "am": "Amharic",
+    "apu": "Apurina",
+    "aqz": "Akuntsu",
+    "ar": "Arabic",
+    "be": "Belarusian",
+    "bg": "Bulgarian",
+    "bho": "Bhojpuri",
+    "bm": "Bambara",
+    "br": "Breton",
+    "bxr": "Buryat",
+    "ca": "Catalan",
+    "ckt": "Chukchi",
+    "cop": "Coptic",
+    "cs": "Czech",
+    "cu": "Old_Church_Slavonic",
+    "cy": "Welsh",
+    "da": "Danish",
+    "de": "German",
+    "el": "Greek",
+    "en": "English",
+    "es": "Spanish",
+    "et": "Estonian",
+    "eu": "Basque",
+    "fa": "Persian",
+    "fi": "Finnish",
+    "fo": "Faroese",
+    "fr": "French",
+    "fro": "Old_French",
+    "ga": "Irish",
+    "gd": "Scottish_Gaelic",
+    "gl": "Galician",
+    "got": "Gothic",
+    "grc": "Ancient_Greek",
+    "gsw": "Swiss_German",
+    "gun": "Mbya_Guarani",
+    "gv": "Manx",
+    "he": "Hebrew",
+    "hi": "Hindi",
+    "hr": "Croatian",
+    "hsb": "Upper_Sorbian",
+    "hu": "Hungarian",
+    "hy": "Armenian",
+    "id": "Indonesian",
+    "is": "Icelandic",
+    "it": "Italian",
+    "ja": "Japanese",
+    "kfm": "Khunsari",
+    "kk": "Kazakh",
+    "kmr": "Kurmanji",
+    "ko": "Korean",
+    "koi": "Komi_Permyak",
+    "kpv": "Komi_Zyrian",
+    "krl": "Karelian",
+    "la": "Latin",
+    "lt": "Lithuanian",
+    "lv": "Latvian",
+    "lzh": "Classical_Chinese",
+    "mdf": "Moksha",
+    "mr": "Marathi",
+    "mt": "Maltese",
+    "myu": "Munduruku",
+    "myv": "Erzya",
+    "nl": "Dutch",
+    "no": "Norwegian",
+    "nyq": "Nayini",
+    "olo": "Livvi",
+    "orv": "Old_Russian",
+    "otk": "Old_Turkish",
+    "pcm": "Naija",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "qhe": "Hindi_English",
+    "qtd": "Turkish_German",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sa": "Sanskrit",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "sme": "North_Sami",
+    "sms": "Skolt_Sami",
+    "soj": "Soi",
+    "sq": "Albanian",
+    "sr": "Serbian",
+    "sv": "Swedish",
+    "swl": "Swedish_Sign_Language",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "th": "Thai",
+    "tl": "Tagalog",
+    "tpn": "Tupinamba",
+    "tr": "Turkish",
+    "ug": "Uyghur",
+    "uk": "Ukrainian",
+    "ur": "Urdu",
+    "vi": "Vietnamese",
+    "wbp": "Warlpiri",
+    "wo": "Wolof",
+    "yo": "Yoruba",
+    "yue": "Cantonese",
+    "zh": "Chinese",
+}
+
+FLAGS = flags.FLAGS  # absl flag container; run() reads FLAGS.output_dir from it
+flags.DEFINE_string(name="output_dir", default="",  # "" -> pathlib.Path("") in run(), i.e. the current directory
+                    help="Path to store embeddings.")
+
+
+def run(_):
+    """Issue a wget for each LINKS archive whose language code is in CODE_2_LANG."""
+    root = pathlib.Path(FLAGS.output_dir)
+    for url in LINKS:
+        code = url.split(".")[-4]  # ".../cc.af.300.vec.gz" -> "af"
+        if code not in CODE_2_LANG:
+            print(f"Unknown code {code}.")
+            continue
+        language = CODE_2_LANG[code]
+        language_dir = root / language
+        language_dir.mkdir(exist_ok=True, parents=True)
+        target = language_dir / 'vectors.vec.gz'
+        if target.exists():
+            print(f"Vectors for {language} already exists, skipping.")
+            continue
+        utils.execute_command(f"wget -O {target} {url}")
+
+
+def main():
+    app.run(run)  # absl entry point: `run` is passed as the main callable
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/train.py b/scripts/train.py
index 4bd342a..dc75344 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -212,7 +212,7 @@ def run(_):
     for treebank in FLAGS.treebanks:
         assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
         treebank_dir = treebanks_dir / treebank
-        treebank_parts = treebank.split("_")[1].split("-")
+        treebank_parts = treebank[3:].split("-")
         language = treebank_parts[0]
 
         files = list(treebank_dir.iterdir())
-- 
GitLab