Skip to content
Snippets Groups Projects
Commit 7fffa595 authored by piotrmp's avatar piotrmp
Browse files

Added handling for unknown language in OSCAR.

parent ac107081
No related branches found
No related tags found
1 merge request: !1 Migration to UD 2.11
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
Script from pretraining models using OSCAR corpora Script from pretraining models using OSCAR corpora
""" """
import gzip import gzip
from urllib.error import HTTPError
import importlib_resources as resources import importlib_resources as resources
from pathlib import Path from pathlib import Path
...@@ -39,7 +41,14 @@ if __name__ == '__main__': ...@@ -39,7 +41,14 @@ if __name__ == '__main__':
continue continue
print("Language: " + language) print("Language: " + language)
print("Downloading corpus...") print("Downloading corpus...")
try:
download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD) download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
except HTTPError as err:
if err.code==404:
print("Language unavailable in OSCAR. moving on...")
continue
else:
raise err
with gzip.open(tmppath) as jsonfile: with gzip.open(tmppath) as jsonfile:
train_documents, test_documents = read_jsonl_to_documents(jsonfile) train_documents, test_documents = read_jsonl_to_documents(jsonfile)
print("Generated " + str(len(train_documents)) + " documents.") print("Generated " + str(len(train_documents)) + " documents.")
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment