"""Download fastText Common Crawl word vectors into per-language directories."""

import pathlib
import shlex

from absl import app
from absl import flags

from scripts import utils

# egrep -o 'https?://[^ ]+vec.gz' links.txt
# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
LINKS = [
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.vec.gz",
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.vec.gz",
]

# Language code -> name of the sub-directory created under the output
# directory. Links whose code is missing from this table are skipped by run().
CODE_2_LANG = {
    "af": "Afrikaans",
    "aii": "Assyrian",
    "ajp": "South_Levantine_Arabic",
    "akk": "Akkadian",
    "am": "Amharic",
    "apu": "Apurina",
    "aqz": "Akuntsu",
    "ar": "Arabic",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bho": "Bhojpuri",
    "bm": "Bambara",
    "br": "Breton",
    "bxr": "Buryat",
    "ca": "Catalan",
    "ckt": "Chukchi",
    "cop": "Coptic",
    "cs": "Czech",
    "cu": "Old_Church_Slavonic",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "fi": "Finnish",
    "fo": "Faroese",
    "fr": "French",
    "fro": "Old_French",
    "ga": "Irish",
    "gd": "Scottish_Gaelic",
    "gl": "Galician",
    "got": "Gothic",
    "grc": "Ancient_Greek",
    "gsw": "Swiss_German",
    "gun": "Mbya_Guarani",
    "gv": "Manx",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "hsb": "Upper_Sorbian",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "kfm": "Khunsari",
    "kk": "Kazakh",
    "kmr": "Kurmanji",
    "ko": "Korean",
    "koi": "Komi_Permyak",
    "kpv": "Komi_Zyrian",
    "krl": "Karelian",
    "la": "Latin",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "lzh": "Classical_Chinese",
    "mdf": "Moksha",
    "mr": "Marathi",
    "mt": "Maltese",
    "myu": "Munduruku",
    "myv": "Erzya",
    "nl": "Dutch",
    "no": "Norwegian",
    "nyq": "Nayini",
    "olo": "Livvi",
    "orv": "Old_Russian",
    "otk": "Old_Turkish",
    "pcm": "Naija",
    "pl": "Polish",
    "pt": "Portuguese",
    "qhe": "Hindi_English",
    "qtd": "Turkish_German",
    "ro": "Romanian",
    "ru": "Russian",
    "sa": "Sanskrit",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sme": "North_Sami",
    "sms": "Skolt_Sami",
    "soj": "Soi",
    "sq": "Albanian",
    "sr": "Serbian",
    "sv": "Swedish",
    "swl": "Swedish_Sign_Language",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tpn": "Tupinamba",
    "tr": "Turkish",
    "ug": "Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "wbp": "Warlpiri",
    "wo": "Wolof",
    "yo": "Yoruba",
    "yue": "Cantonese",
    "zh": "Chinese",
}

# File name given to every downloaded archive inside its language directory.
_VECTORS_FILENAME = "vectors.vec.gz"

FLAGS = flags.FLAGS
flags.DEFINE_string(name="output_dir", default="", help="Path to store embeddings.")


def _lang_code(link: str) -> str:
    """Return the language code embedded in a ``cc.<code>.300.vec.gz`` link.

    The code is the fourth dot-separated field counted from the end of the
    link, e.g. ``"af"`` for ``.../cc.af.300.vec.gz``.
    """
    return link.split(".")[-4]


def _download(link: str, target: pathlib.Path) -> bool:
    """Fetch ``link`` into ``target`` via wget; return True if data arrived.

    ``wget -O`` creates its output file as soon as it starts, so downloading
    straight into ``target`` would leave an empty or truncated archive behind
    after a failure, and the "already exists" check in ``run`` would then skip
    it forever. The download therefore goes to a ``.part`` sibling that is
    only moved into place once it is non-empty.

    NOTE(review): assumes ``utils.execute_command`` blocks until wget has
    exited, and takes a shell-style command string -- confirm against the
    helper. If it raises on failure, the ``.part`` file is left in place and
    is overwritten by the next attempt.
    """
    partial = target.with_name(target.name + ".part")
    # shlex.quote leaves ordinary paths/URLs unchanged and only adds quoting
    # when the value contains whitespace or shell metacharacters.
    command = f"wget -O {shlex.quote(str(partial))} {shlex.quote(link)}"
    utils.execute_command(command)

    if partial.exists() and partial.stat().st_size > 0:
        partial.replace(target)
        return True
    if partial.exists():
        # Zero-byte leftover from a failed request; do not keep it around.
        partial.unlink()
    return False


def run(_):
    """Download every known language's vectors into ``FLAGS.output_dir``.

    For each entry of ``LINKS`` whose language code appears in
    ``CODE_2_LANG``, the archive is stored as
    ``<output_dir>/<language name>/vectors.vec.gz``. Links with an unknown
    code, and languages whose archive already exists, are reported and
    skipped.

    Args:
        _: Unused; supplied by ``app.run``.
    """
    output_dir = pathlib.Path(FLAGS.output_dir)
    for link in LINKS:
        lang_code = _lang_code(link)
        language = CODE_2_LANG.get(lang_code)
        if language is None:
            print(f"Unknown code {lang_code}.")
            continue

        lang_dir = output_dir / language
        lang_dir.mkdir(exist_ok=True, parents=True)

        target = lang_dir / _VECTORS_FILENAME
        if target.exists():
            print(f"Vectors for {language} already exists, skipping.")
            continue

        if not _download(link, target):
            print(f"Download of {link} produced no data, nothing stored.")


def main():
    """Script entry point: hand control to absl, which parses flags and calls ``run``."""
    app.run(run)


if __name__ == "__main__":
    main()