Skip to content
Snippets Groups Projects
Commit 7fffa595 authored by piotrmp
Browse files

Added handling for unknown language in OSCAR.

parent ac107081
Branches
Tags
1 merge request: !1 "Migration to UD 2.11"
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
Script from pretraining models using OSCAR corpora Script from pretraining models using OSCAR corpora
""" """
import gzip import gzip
from urllib.error import HTTPError
import importlib_resources as resources import importlib_resources as resources
from pathlib import Path from pathlib import Path
...@@ -39,7 +41,14 @@ if __name__ == '__main__': ...@@ -39,7 +41,14 @@ if __name__ == '__main__':
continue continue
print("Language: " + language) print("Language: " + language)
print("Downloading corpus...") print("Downloading corpus...")
download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD) try:
download_archive1_from_oscar(language, tmppath, OSCAR_LOGIN, OSCAR_PASSWORD)
except HTTPError as err:
if err.code==404:
print("Language unavailable in OSCAR. moving on...")
continue
else:
raise err
with gzip.open(tmppath) as jsonfile: with gzip.open(tmppath) as jsonfile:
train_documents, test_documents = read_jsonl_to_documents(jsonfile) train_documents, test_documents = read_jsonl_to_documents(jsonfile)
print("Generated " + str(len(train_documents)) + " documents.") print("Generated " + str(len(train_documents)) + " documents.")
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment