From 1408fe0d207c3594ad44c50efe1c27afced633d5 Mon Sep 17 00:00:00 2001 From: leszeks <leszeks@e-science.pl> Date: Wed, 29 Jul 2020 16:09:02 +0200 Subject: [PATCH] merge request changes --- README.md | 12 ++++++++++++ plwn/__init__.py | 2 ++ plwn/config.ini | 2 +- plwn/download.py | 43 ++++++++++++++++++++++++++++++++++--------- setup.py | 2 +- 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index fed3cba..8e2eaa2 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,18 @@ To load this version at a later date, use `plwn.load(path)` instead of `plwn.loa >>> api = plwn.load("storage-dumps/plwn-new.db") +Downloading API dumps +===================== + +In order to download one of the dumps available at https://minio.clarin-pl.eu/ : + import plwn + plwn.download("optional_name") +The file will be downloaded to the current directory. +If optional_name is not provided, the default dump will be downloaded. +If optional_name is provided but doesn't match the name of any available dump, the process will fail +and display the possible names.
+ + Licenses ======== diff --git a/plwn/__init__.py b/plwn/__init__.py index fe9b1f0..83159a5 100644 --- a/plwn/__init__.py +++ b/plwn/__init__.py @@ -20,6 +20,7 @@ from ._loading import read from ._loading import load from ._loading import show_source_formats from ._loading import show_storage_formats +from .download import download # Import the enums that are needed for selecting and filtering from .enums import PoS, RelationKind @@ -35,4 +36,5 @@ __all__ = [ "show_source_formats", "load_default", "RelationKind", + "download", ] diff --git a/plwn/config.ini b/plwn/config.ini index 04c4bf2..ec3c6ff 100644 --- a/plwn/config.ini +++ b/plwn/config.ini @@ -1,2 +1,2 @@ [DOWNLOAD] -model = https://minio.clarin-pl.eu/public/models/plwn_api_dumps/plwn_dump_27-03-2018.sqlite \ No newline at end of file +default_model = https://minio.clarin-pl.eu/public/models/plwn_api_dumps/plwn_dump_27-03-2018.sqlite \ No newline at end of file diff --git a/plwn/download.py b/plwn/download.py index 9699eb5..21378e1 100644 --- a/plwn/download.py +++ b/plwn/download.py @@ -1,25 +1,50 @@ """Implementation of download method.""" import configparser import os +import xml.etree.ElementTree as ET +import re import requests +from six.moves.urllib.request import urlopen -models = { - "model", -} +config = configparser.ConfigParser() +config_path = os.path.join(os.path.dirname( + os.path.abspath(__file__)), "config.ini") +config.read(config_path) + + +def get_available_models(): + root = ET.parse(urlopen("https://minio.clarin-pl.eu/public")).getroot() + available_models = [] + for child in root.findall( + "{http://s3.amazonaws.com/doc/2006-03-01/}Contents"): + if "models/plwn_api_dumps/" in str( + child.find( + "{http://s3.amazonaws.com/doc/2006-03-01/}Key").text): + string = child.find( + "{http://s3.amazonaws.com/doc/2006-03-01/}Key").text + substring = r"models/plwn_api_dumps/" + available_models.append(re.sub(substring, r'', string)) + return available_models -def download(name): + +def 
download(name = "default_model"): """After called it downloads a specified database model. Currently only one model available. """ + models = get_available_models() + if name == "default_model": + url = config["DOWNLOAD"][name] + r = requests.get(url) + with open(name, "wb") as f: + f.write(r.content) + f.close() + return if name in models: - cfg = configparser.ConfigParser() - config_path = os.path.join(os.path.dirname( - os.path.abspath(__file__)), "config.ini") - cfg.read(config_path) - url = cfg["DOWNLOAD"][name] + url = config["DOWNLOAD"]["default_model"] + url = url.replace("plwn_dump_27-03-2018.sqlite",name) r = requests.get(url) with open(name, "wb") as f: f.write(r.content) diff --git a/setup.py b/setup.py index f4f5bef..fad4287 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ ENVNAME_DIST_NODEFAULT = 'PLWN_API_DIST_NO_DEFAULT_STORAGE' setup_args = dict( name='PLWN_API', - version='0.23', + version='0.24', license='LGPL-3.0+', description='Python API to access plWordNet lexicon', -- GitLab