diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py
index 6ff74bcdf950584987de13b5a29dd8944ae9d4d6..2c724f408b35a8d8f964f4087fdf00fd91ead4ea 100644
--- a/src/lambo/examples/run_pretraining.py
+++ b/src/lambo/examples/run_pretraining.py
@@ -2,6 +2,8 @@
 Script from pretraining models using OSCAR corpora
 """
 import gzip
+from urllib.error import HTTPError
+
 import importlib_resources as resources
 from pathlib import Path
 
@@ -39,7 +41,14 @@ if __name__ == '__main__':
             continue
         print("Language: " + language)
         print("Downloading corpus...")
-        download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        try:
+            download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        except HTTPError as err:
+            # Only HTTP 404 is treated as "language unavailable" and skipped;
+            # every other HTTP error is propagated unchanged.
+            if err.code != 404:
+                raise
+            print("Language unavailable in OSCAR. moving on...")
+            continue
         with gzip.open(tmppath) as jsonfile:
             train_documents, test_documents = read_jsonl_to_documents(jsonfile)
         print("Generated " + str(len(train_documents)) + " documents.")