From 7fffa5953255f62644a3614abb4faa874487343c Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Wed, 23 Nov 2022 20:01:25 +0100
Subject: [PATCH] Added handling for unknown language in OSCAR.

---
 src/lambo/examples/run_pretraining.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py
index 6ff74bc..2c724f4 100644
--- a/src/lambo/examples/run_pretraining.py
+++ b/src/lambo/examples/run_pretraining.py
@@ -2,6 +2,8 @@
 Script from pretraining models using OSCAR corpora
 """
 import gzip
+from urllib.error import HTTPError
+
 import importlib_resources as resources
 from pathlib import Path
 
@@ -39,7 +41,14 @@ if __name__ == '__main__':
             continue
         print("Language: " + language)
         print("Downloading corpus...")
-        download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        try:
+            download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        except HTTPError as err:
+            if err.code==404:
+                print("Language unavailable in OSCAR. moving on...")
+                continue
+            else:
+                raise err
         with gzip.open(tmppath) as jsonfile:
             train_documents, test_documents = read_jsonl_to_documents(jsonfile)
         print("Generated " + str(len(train_documents)) + " documents.")
-- 
GitLab