From 7fffa5953255f62644a3614abb4faa874487343c Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Wed, 23 Nov 2022 20:01:25 +0100
Subject: [PATCH] Added handling for unknown language in OSCAR.

---
 src/lambo/examples/run_pretraining.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/lambo/examples/run_pretraining.py b/src/lambo/examples/run_pretraining.py
index 6ff74bc..2c724f4 100644
--- a/src/lambo/examples/run_pretraining.py
+++ b/src/lambo/examples/run_pretraining.py
@@ -2,6 +2,8 @@
 Script from pretraining models using OSCAR corpora
 """
 import gzip
+from urllib.error import HTTPError
+
 import importlib_resources as resources
 from pathlib import Path
 
@@ -39,7 +41,14 @@ if __name__ == '__main__':
             continue
         print("Language: " + language)
         print("Downloading corpus...")
-        download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        try:
+            download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
+        except HTTPError as err:
+            # 404: language is unavailable in OSCAR, so skip it; re-raise any other HTTP error.
+            if err.code != 404:
+                raise
+            print("Language unavailable in OSCAR. moving on...")
+            continue
         with gzip.open(tmppath) as jsonfile:
             train_documents, test_documents = read_jsonl_to_documents(jsonfile)
         print("Generated " + str(len(train_documents)) + " documents.")
-- 
GitLab