diff --git a/src/lambo/utils/oscar.py b/src/lambo/utils/oscar.py index bda4c66da81d3ef2176c112d04c21a0016d3608c..0b588a1b3a97c2877972c7ee4b3c311537e1dacd 100644 --- a/src/lambo/utils/oscar.py +++ b/src/lambo/utils/oscar.py @@ -4,6 +4,7 @@ Functions used to obtain multilingual corpora from `OSCAR <https://oscar-corpus. import json import random import urllib +import time from urllib.error import HTTPError from lambo.data.document import Document @@ -46,10 +47,11 @@ def download_archive1_from_oscar(language, path, OSCAR_LOGIN, OSCAR_PASSWORD, re return except HTTPError as err: error = err - if i == retry - 1: + if i == retry - 1 or err.code<500: raise error - time = ((i + 1) * (i + 1) * (i + 1) * 15) - print("[Got " + str(error.code) + ", retrying after " + str(time) + " seconds...]") + secs = ((i + 1) * (i + 1) * (i + 1) * 15) + print("[Got " + str(error.code) + ", retrying after " + str(secs) + " seconds...]") + time.sleep(secs) def read_jsonl_to_documents(fileobj, MAX_LEN=3000000):