# download_fasttext.py
import pathlib
import shlex

from absl import app
from absl import flags

from scripts import utils
# Download URLs of the pretrained fastText Common Crawl vectors listed at
# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
# NOTE(review): the list was presumably extracted from a saved copy of that
# page with:  egrep -o 'https?://[^ ]+vec.gz' links.txt

# Every archive shares the same URL shape; only the language code differs.
_CRAWL_VECTOR_URL = (
    "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{code}.300.vec.gz"
)

# Language codes in the order the URLs are processed (ten per row).
_CRAWL_VECTOR_CODES = (
    "af", "sq", "als", "am", "ar", "an", "hy", "as", "ast", "az",
    "ba", "eu", "bar", "be", "bn", "bh", "bpy", "bs", "br", "bg",
    "my", "ca", "ceb", "bcl", "ce", "zh", "cv", "co", "hr", "cs",
    "da", "dv", "nl", "pa", "arz", "eml", "en", "myv", "eo", "et",
    "hif", "fi", "fr", "gl", "ka", "de", "gom", "el", "gu", "ht",
    "he", "mrj", "hi", "hu", "is", "io", "ilo", "id", "ia", "ga",
    "it", "ja", "jv", "kn", "pam", "kk", "km", "ky", "ko", "ku",
    "ckb", "la", "lv", "li", "lt", "lmo", "nds", "lb", "mk", "mai",
    "mg", "ms", "ml", "mt", "gv", "mr", "mzn", "mhr", "min", "xmf",
    "mwl", "mn", "nah", "nap", "ne", "new", "frr", "nso", "no", "nn",
    "oc", "or", "os", "pfl", "ps", "fa", "pms", "pl", "pt", "qu",
    "ro", "rm", "ru", "sah", "sa", "sc", "sco", "gd", "sr", "sh",
    "scn", "sd", "si", "sk", "sl", "so", "azb", "es", "su", "sw",
    "sv", "tl", "tg", "ta", "tt", "te", "th", "bo", "tr", "tk",
    "uk", "hsb", "ur", "ug", "uz", "vec", "vi", "vo", "wa", "war",
    "cy", "vls", "fy", "pnb", "yi", "yo", "diq", "zea",
)

LINKS = [_CRAWL_VECTOR_URL.format(code=code) for code in _CRAWL_VECTOR_CODES]
# (language code, language name) pairs, sorted by code. The name is used as
# the per-language output directory; links whose code is missing here are
# skipped by the downloader.
# NOTE(review): the names look like Universal Dependencies treebank language
# names (underscores instead of spaces) - confirm against the consumer.
_CODE_LANG_PAIRS = (
    ("af", "Afrikaans"),
    ("aii", "Assyrian"),
    ("ajp", "South_Levantine_Arabic"),
    ("akk", "Akkadian"),
    ("am", "Amharic"),
    ("apu", "Apurina"),
    ("aqz", "Akuntsu"),
    ("ar", "Arabic"),
    ("be", "Belarusian"),
    ("bg", "Bulgarian"),
    ("bho", "Bhojpuri"),
    ("bm", "Bambara"),
    ("br", "Breton"),
    ("bxr", "Buryat"),
    ("ca", "Catalan"),
    ("ckt", "Chukchi"),
    ("cop", "Coptic"),
    ("cs", "Czech"),
    ("cu", "Old_Church_Slavonic"),
    ("cy", "Welsh"),
    ("da", "Danish"),
    ("de", "German"),
    ("el", "Greek"),
    ("en", "English"),
    ("es", "Spanish"),
    ("et", "Estonian"),
    ("eu", "Basque"),
    ("fa", "Persian"),
    ("fi", "Finnish"),
    ("fo", "Faroese"),
    ("fr", "French"),
    ("fro", "Old_French"),
    ("ga", "Irish"),
    ("gd", "Scottish_Gaelic"),
    ("gl", "Galician"),
    ("got", "Gothic"),
    ("grc", "Ancient_Greek"),
    ("gsw", "Swiss_German"),
    ("gun", "Mbya_Guarani"),
    ("gv", "Manx"),
    ("he", "Hebrew"),
    ("hi", "Hindi"),
    ("hr", "Croatian"),
    ("hsb", "Upper_Sorbian"),
    ("hu", "Hungarian"),
    ("hy", "Armenian"),
    ("id", "Indonesian"),
    ("is", "Icelandic"),
    ("it", "Italian"),
    ("ja", "Japanese"),
    ("kfm", "Khunsari"),
    ("kk", "Kazakh"),
    ("kmr", "Kurmanji"),
    ("ko", "Korean"),
    ("koi", "Komi_Permyak"),
    ("kpv", "Komi_Zyrian"),
    ("krl", "Karelian"),
    ("la", "Latin"),
    ("lt", "Lithuanian"),
    ("lv", "Latvian"),
    ("lzh", "Classical_Chinese"),
    ("mdf", "Moksha"),
    ("mr", "Marathi"),
    ("mt", "Maltese"),
    ("myu", "Munduruku"),
    ("myv", "Erzya"),
    ("nl", "Dutch"),
    ("no", "Norwegian"),
    ("nyq", "Nayini"),
    ("olo", "Livvi"),
    ("orv", "Old_Russian"),
    ("otk", "Old_Turkish"),
    ("pcm", "Naija"),
    ("pl", "Polish"),
    ("pt", "Portuguese"),
    ("qhe", "Hindi_English"),
    ("qtd", "Turkish_German"),
    ("ro", "Romanian"),
    ("ru", "Russian"),
    ("sa", "Sanskrit"),
    ("sk", "Slovak"),
    ("sl", "Slovenian"),
    ("sme", "North_Sami"),
    ("sms", "Skolt_Sami"),
    ("soj", "Soi"),
    ("sq", "Albanian"),
    ("sr", "Serbian"),
    ("sv", "Swedish"),
    ("swl", "Swedish_Sign_Language"),
    ("ta", "Tamil"),
    ("te", "Telugu"),
    ("th", "Thai"),
    ("tl", "Tagalog"),
    ("tpn", "Tupinamba"),
    ("tr", "Turkish"),
    ("ug", "Uyghur"),
    ("uk", "Ukrainian"),
    ("ur", "Urdu"),
    ("vi", "Vietnamese"),
    ("wbp", "Warlpiri"),
    ("wo", "Wolof"),
    ("yo", "Yoruba"),
    ("yue", "Cantonese"),
    ("zh", "Chinese"),
)

# Mapping from language code to language name, in the order listed above.
CODE_2_LANG = dict(_CODE_LANG_PAIRS)
# Command-line flag: directory under which one sub-directory per language is
# created. The empty default makes pathlib resolve it to the current working
# directory.
flags.DEFINE_string(
    name="output_dir",
    default="",
    help="Path to store embeddings.",
)

# Global absl flag registry; flag values are read from it inside run().
FLAGS = flags.FLAGS
def run(_):
    """Download the fastText vectors for every language with a known name.

    Iterates over ``LINKS``, takes the language code from each URL and, when
    the code is a key of ``CODE_2_LANG``, stores the archive as
    ``<output_dir>/<language name>/vectors.vec.gz``. Links with an unknown
    code are reported and skipped; languages whose archive file already
    exists are reported and skipped as well.

    Args:
        _: Unused positional argument supplied by ``app.run``.
    """
    output_dir = pathlib.Path(FLAGS.output_dir)
    for link in LINKS:
        # URLs end in "cc.<code>.300.vec.gz", so the language code is the
        # fourth dot-separated field counted from the end.
        lang_code = link.split(".")[-4]
        if lang_code not in CODE_2_LANG:
            print(f"Unknown code {lang_code}.")
            continue

        language = CODE_2_LANG[lang_code]
        lang_dir = output_dir / language
        lang_dir.mkdir(exist_ok=True, parents=True)

        target = lang_dir / "vectors.vec.gz"
        if target.exists():
            print(f"Vectors for {language} already exists, skipping.")
            continue

        # Quote both operands so an output directory containing spaces or
        # shell metacharacters cannot split or alter the command. Strings
        # made only of safe characters (ordinary paths, these URLs) are left
        # unchanged by shlex.quote, so the usual command line is identical.
        # NOTE(review): assumes utils.execute_command passes the string to a
        # shell or a shlex-style parser - confirm.
        # NOTE(review): wget -O may leave an empty or partial file behind
        # when a download fails, which the exists() check above would then
        # treat as complete on the next run - verify and clean up if needed.
        utils.execute_command(
            f"wget -O {shlex.quote(str(target))} {shlex.quote(link)}"
        )
def main():
    """Entry point: let absl parse the command-line flags, then call run()."""
    app.run(main=run)


# Start the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()