diff --git a/scripts/download_fasttext.py b/scripts/download_fasttext.py
new file mode 100644
index 0000000000000000000000000000000000000000..1919fa214672e0161d49b95bf2b7dffa2ff1002a
--- /dev/null
+++ b/scripts/download_fasttext.py
@@ -0,0 +1,345 @@
+"""Download pre-trained fastText embeddings (``vectors-crawl`` release).
+
+For every URL in LINKS whose language code is listed in CODE_2_LANG, the
+archive is saved as ``<output_dir>/<language name>/vectors.vec.gz``.
+"""
+
+import pathlib
+import shlex
+
+from absl import app
+from absl import flags
+
+from scripts import utils
+
+# One archive URL per language code (presumably taken from the page below):
+# egrep -o 'https?://[^ ]+vec.gz' links.txt
+# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
+LINKS = [
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.vec.gz",
+    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.vec.gz",
+]
+
+# Language code -> language name. Only URLs whose code is a key here are
+# downloaded; the name is used as the per-language output directory.
+CODE_2_LANG = {
+    "af": "Afrikaans",
+    "aii": "Assyrian",
+    "ajp": "South_Levantine_Arabic",
+    "akk": "Akkadian",
+    "am": "Amharic",
+    "apu": "Apurina",
+    "aqz": "Akuntsu",
+    "ar": "Arabic",
+    "be": "Belarusian",
+    "bg": "Bulgarian",
+    "bho": "Bhojpuri",
+    "bm": "Bambara",
+    "br": "Breton",
+    "bxr": "Buryat",
+    "ca": "Catalan",
+    "ckt": "Chukchi",
+    "cop": "Coptic",
+    "cs": "Czech",
+    "cu": "Old_Church_Slavonic",
+    "cy": "Welsh",
+    "da": "Danish",
+    "de": "German",
+    "el": "Greek",
+    "en": "English",
+    "es": "Spanish",
+    "et": "Estonian",
+    "eu": "Basque",
+    "fa": "Persian",
+    "fi": "Finnish",
+    "fo": "Faroese",
+    "fr": "French",
+    "fro": "Old_French",
+    "ga": "Irish",
+    "gd": "Scottish_Gaelic",
+    "gl": "Galician",
+    "got": "Gothic",
+    "grc": "Ancient_Greek",
+    "gsw": "Swiss_German",
+    "gun": "Mbya_Guarani",
+    "gv": "Manx",
+    "he": "Hebrew",
+    "hi": "Hindi",
+    "hr": "Croatian",
+    "hsb": "Upper_Sorbian",
+    "hu": "Hungarian",
+    "hy": "Armenian",
+    "id": "Indonesian",
+    "is": "Icelandic",
+    "it": "Italian",
+    "ja": "Japanese",
+    "kfm": "Khunsari",
+    "kk": "Kazakh",
+    "kmr": "Kurmanji",
+    "ko": "Korean",
+    "koi": "Komi_Permyak",
+    "kpv": "Komi_Zyrian",
+    "krl": "Karelian",
+    "la": "Latin",
+    "lt": "Lithuanian",
+    "lv": "Latvian",
+    "lzh": "Classical_Chinese",
+    "mdf": "Moksha",
+    "mr": "Marathi",
+    "mt": "Maltese",
+    "myu": "Munduruku",
+    "myv": "Erzya",
+    "nl": "Dutch",
+    "no": "Norwegian",
+    "nyq": "Nayini",
+    "olo": "Livvi",
+    "orv": "Old_Russian",
+    "otk": "Old_Turkish",
+    "pcm": "Naija",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "qhe": "Hindi_English",
+    "qtd": "Turkish_German",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sa": "Sanskrit",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "sme": "North_Sami",
+    "sms": "Skolt_Sami",
+    "soj": "Soi",
+    "sq": "Albanian",
+    "sr": "Serbian",
+    "sv": "Swedish",
+    "swl": "Swedish_Sign_Language",
+    "ta": "Tamil",
+    "te": "Telugu",
+    "th": "Thai",
+    "tl": "Tagalog",
+    "tpn": "Tupinamba",
+    "tr": "Turkish",
+    "ug": "Uyghur",
+    "uk": "Ukrainian",
+    "ur": "Urdu",
+    "vi": "Vietnamese",
+    "wbp": "Warlpiri",
+    "wo": "Wolof",
+    "yo": "Yoruba",
+    "yue": "Cantonese",
+    "zh": "Chinese",
+}
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string(name="output_dir", default="",
+                    help="Path to store embeddings.")
+
+
+def run(_):
+    """Download the vector archive of every language in CODE_2_LANG.
+
+    For each URL in LINKS whose language code is a key of CODE_2_LANG, the
+    archive is stored as ``<output_dir>/<language name>/vectors.vec.gz``.
+    URLs with any other code are reported and skipped, as are languages whose
+    archive already exists.
+
+    Args:
+        _: Argument supplied by ``app.run``; unused.
+    """
+    output_dir = pathlib.Path(FLAGS.output_dir)
+    for link in LINKS:
+        # Every URL ends in "cc.<code>.300.vec.gz", which makes the language
+        # code the fourth dot-separated field counted from the end.
+        lang_code = link.split(".")[-4]
+
+        if lang_code not in CODE_2_LANG:
+            print(f"Unknown code {lang_code}.")
+            continue
+
+        language = CODE_2_LANG[lang_code]
+        language_dir = output_dir / language
+        language_dir.mkdir(exist_ok=True, parents=True)
+        target = language_dir / "vectors.vec.gz"
+        if target.exists():
+            print(f"Vectors for {language} already exists, skipping.")
+            continue
+
+        # `wget -O` creates its output file before any data arrives, so an
+        # interrupted or failed download would leave a truncated archive that
+        # the existence check above accepts on the next run. Download under a
+        # temporary name and rename only once a non-empty file is in place.
+        # NOTE(review): assumes utils.execute_command blocks until wget exits
+        # -- confirm.
+        partial = language_dir / "vectors.vec.gz.part"
+        utils.execute_command(
+            f"wget -O {shlex.quote(str(partial))} {shlex.quote(link)}")
+        if partial.exists() and partial.stat().st_size > 0:
+            partial.replace(target)
+        else:
+            print(f"Download of {link} failed.")
+            if partial.exists():
+                partial.unlink()
+
+
+def main():
+    """Entry point: hand ``run`` to ``app.run``."""
+    app.run(run)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/train.py b/scripts/train.py index 4bd342a4cb2bb560c2713bb68ecf5145715d65ad..dc75344432a52cfb64b4931582aaa7d963b6839b 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -212,7 +212,7 @@ def run(_): for treebank in FLAGS.treebanks: assert treebank in TREEBANKS, f"Unknown treebank {treebank}." 
treebank_dir = treebanks_dir / treebank - treebank_parts = treebank.split("_")[1].split("-") + treebank_parts = treebank[3:].split("-") language = treebank_parts[0] files = list(treebank_dir.iterdir())