From c287ab9de780766771318dad0b09c4181fe8b77e Mon Sep 17 00:00:00 2001
From: Wiktor Walentynowicz <wiktor.walentynowicz@pwr.edu.pl>
Date: Wed, 14 Dec 2022 12:04:42 +0000
Subject: [PATCH] Develop

---
 .gitignore          |  4 ++++
 .gitlab-ci.yml      | 37 +++++++++++++++++++++++++++++++++++++
 Dockerfile          | 17 +++++++++++++++++
 config.ini          | 19 +++++++++++++++++++
 deployment.yaml     | 42 ++++++++++++++++++++++++++++++++++++++++++
 entrypoint.py       | 41 +++++++++++++++++++++++++++++++++++++++++
 requirements.txt    |  3 +++
 src/__init__.py     |  0
 src/winer_worker.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 tox.ini             | 35 +++++++++++++++++++++++++++++++++++
 worker.py           | 28 ++++++++++++++++++++++++++++
 11 files changed, 270 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitlab-ci.yml
 create mode 100644 Dockerfile
 create mode 100644 config.ini
 create mode 100644 deployment.yaml
 create mode 100644 entrypoint.py
 create mode 100644 requirements.txt
 create mode 100644 src/__init__.py
 create mode 100644 src/winer_worker.py
 create mode 100644 tox.ini
 create mode 100644 worker.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c0f7607
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+.tox
+/data
+/venv
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..ea1c47a
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,37 @@
+image: "clarinpl/python:3.8"
+
+cache:
+  paths:
+    - .tox
+
+stages:
+  - check_style
+  - build
+
+before_script:
+  - pip install tox==3.18.1
+
+pep8:
+  stage: check_style
+  script:
+    - tox -v -e pep8
+
+build_image:
+  stage: build
+  image: 'docker:18.09.7'
+  only:
+    - master
+  services:
+    - 'docker:18.09.7-dind'
+  variables:
+    DOCKERHUB_NAME: clarinpl/$CI_PROJECT_NAME
+  before_script:
+    - ''
+  script:
+    - docker build -t $DOCKERHUB_NAME .
+    - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USERNAME" --password-stdin
+    - docker push $DOCKERHUB_NAME
+    - echo "$CI_REGISTRY_PASSWORD" | docker login --username "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"
+    - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
+    - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:latest
+    - docker push $CI_REGISTRY_IMAGE
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d704403
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,17 @@
+FROM nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04
+
+# DEBIAN_FRONTEND has to prefix the install step itself: a VAR=value prefix
+# applies only to the command it directly precedes, not to the whole && chain.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y gcc python3-dev python3-venv python3-pip
+
+COPY requirements.txt requirements.txt
+RUN python3 -m pip install -r requirements.txt && rm requirements.txt
+
+WORKDIR /home/worker
+
+COPY src src
+COPY entrypoint.py entrypoint.py
+COPY worker.py worker.py
+COPY config.ini config.ini
+
+ENTRYPOINT [ "python3", "entrypoint.py"]
\ No newline at end of file
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000..3cad108
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,19 @@
+[service]
+tool = winer
+root = /samba/requests/
+rabbit_host = $RABBIT_HOST
+rabbit_user = $RABBIT_USER
+rabbit_password = $RABBIT_PASSWORD
+queue_prefix = nlp_
+
+[tool]
+workers_number=1
+
+[logging]
+port=9981
+local_log_level=INFO
+
+[deployment]
+s3_endpoint = https://s3.clarin-pl.eu
+models_s3_location=s3://workers/winer/models
+models_cache_dir=/home/worker/models
\ No newline at end of file
diff --git a/deployment.yaml b/deployment.yaml
new file mode 100644
index 0000000..74f241f
--- /dev/null
+++ b/deployment.yaml
@@ -0,0 +1,42 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: winer
+  labels:
+    app: winer
+  namespace: nlpworkers
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: winer
+  template:
+    metadata:
+      labels:
+        app: winer
+    spec:
+      containers:
+      - name: winer
+        image: clarinpl/$CI_PROJECT_NAME:latest
+        imagePullPolicy: Always
+        volumeMounts:
+        - name: config
+          mountPath: /home/worker/config.ini
+          subPath: config.ini
+        - name: samba
+          mountPath: /samba
+        - name: models
+          mountPath: /home/worker/models/
+      volumes:
+      - name: config
+        configMap:
+          name: winer-config-ini
+      - name: samba
+        hostPath:
+          path: /samba
+          type: ""
+      - name: models
+        hostPath:
+          path: /tmp/winer-models
+          type: DirectoryOrCreate
+
diff --git a/entrypoint.py b/entrypoint.py
new file mode 100644
index 0000000..0cbaaa5
--- /dev/null
+++ b/entrypoint.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python3
+"""Container entrypoint: sync the models from S3, then run the worker."""
+import configparser
+import logging
+import sys
+from subprocess import run
+
+# Same fallback as worker.py, so that both agree on the models directory even
+# when config.ini has no `models_cache_dir` entry.
+DEFAULT_MODELS_CACHE_DIR = "/home/worker/models"
+
+parser = configparser.ConfigParser()
+parser.read("config.ini")
+deployment = parser["deployment"]
+
+s3_endpoint = deployment.get("s3_endpoint", "https://s3.clarin-pl.eu")
+s3_location = deployment.get("models_s3_location", "s3://workers/winer/models")
+local_models_location = deployment.get("models_cache_dir", DEFAULT_MODELS_CACHE_DIR)
+
+# An argument list (no shell) passes the configured values to `aws` verbatim.
+sync_cmd = [
+    "aws", "--no-sign-request", "--endpoint-url", s3_endpoint,
+    "s3", "sync", "--delete", s3_location, local_models_location,
+]
+
+# The sync stays best-effort, as before: a failure does not stop the worker,
+# but it is now reported instead of passing silently.
+# NOTE(review): the Dockerfile does not install the `aws` CLI explicitly;
+# confirm that one of the requirements provides it.
+try:
+    sync_status = run(sync_cmd).returncode
+except OSError as err:
+    logging.warning("Model sync could not be started: %s", err)
+else:
+    if sync_status != 0:
+        logging.warning("Model sync exited with status %s", sync_status)
+
+# Run the worker with the interpreter executing this script (the Dockerfile
+# installs `python3`; a bare `python` command is not guaranteed to exist) and
+# propagate its exit status to the container runtime.
+sys.exit(run([sys.executable, "worker.py"] + sys.argv[1:]).returncode)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9c67cc3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+--index-url https://pypi.clarin-pl.eu/simple/
+nlp_ws
+winer==0.2.0
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/winer_worker.py b/src/winer_worker.py
new file mode 100644
index 0000000..526352a
--- /dev/null
+++ b/src/winer_worker.py
@@ -0,0 +1,44 @@
+"""Implementation of the winer service."""
+
+import logging
+
+from winer.datafiles import read_clarin_json, write_clarin_json
+from winer.document import (
+    create_document_from_clarin_json,
+    create_entities_from_hf_outputs,
+)
+from winer.winer import Winer
+
+
+class WinerWorker:
+    """Annotate CLARIN JSON documents with entities predicted by a Winer model."""
+
+    # Name of the model that is loaded from below `models_location`.
+    DEFAULT_MODEL = "dummy"
+
+    def __init__(self, models_location: str):
+        """Load the default model.
+
+        Args:
+            models_location: Base location of the models; the model is loaded
+                from `{models_location}/{DEFAULT_MODEL}`.
+        """
+        logging.info("Loading models...")
+        self.active_model = Winer(f'{models_location}/{self.DEFAULT_MODEL}')
+
+    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
+        """Read one document, add the predicted entities and write the result.
+
+        Args:
+            input_path: Path of the CLARIN JSON file to read.
+            task_options: Task options; not used by this worker.
+            output_path: Path the annotated CLARIN JSON is written to.
+        """
+        document = create_document_from_clarin_json(read_clarin_json(input_path))
+        # `predict` is given a batch of texts; the single input's output is at 0.
+        outputs = self.active_model.predict([document.get_pretokenized_text()])
+        # NOTE(review): `add_entites` (sic) is kept as originally called;
+        # presumably it is the winer API's own spelling - confirm.
+        document.add_entites(create_entities_from_hf_outputs(outputs[0]))
+
+        write_clarin_json(document.as_clarin_json(), output_path)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..e2a26df
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,35 @@
+[tox]
+envlist = unittest,pep8
+skipsdist = True
+
+[flake8]
+exclude =
+    venv,
+    .tox,
+    .git,
+    __pycache__,
+    docs/source/conf.py,
+    build,
+    dist,
+    tests/fixtures/*,
+    *.pyc,
+    *.egg-info,
+    .cache,
+    .eggs
+    data
+    generated
+max-complexity = 10
+min_python_version = 3.8
+max-line-length = 88
+select = I,C,E,F,W,B,B950,TYP,T
+ignore = E231, W503
+
+
+[testenv:pep8]
+deps =
+    flake8
+    flake8-type-annotations
+    flake8-typing-imports
+basepython = python
+commands =
+    flake8 {posargs}
\ No newline at end of file
diff --git a/worker.py b/worker.py
new file mode 100644
index 0000000..23abb49
--- /dev/null
+++ b/worker.py
@@ -0,0 +1,28 @@
+"""Entry point of the winer NLP worker service."""
+
+import configparser
+
+import nlp_ws
+
+from src.winer_worker import WinerWorker
+
+
+class Worker(nlp_ws.NLPWorker):
+    """nlp_ws worker that delegates every task to `WinerWorker`."""
+
+    def init(self):
+        """Read the models directory from `config.ini` and load the models."""
+        config = configparser.ConfigParser()
+        config.read("config.ini")
+        deployment = config["deployment"]
+
+        models_cache_dir = deployment.get("models_cache_dir", "/home/worker/models")
+        self.winer = WinerWorker(models_cache_dir)
+
+    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
+        """Forward a single task to the winer worker."""
+        self.winer.process(input_path, task_options, output_path)
+
+
+if __name__ == "__main__":
+    nlp_ws.NLPService.main(Worker)
-- 
GitLab