diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c0f7607976d57c26533eed53266d8e56ab4f9c35 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.tox +/data +/venv \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..ea1c47aff59bdd19f99c3f4049c2e0d2c72d1b75 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,39 @@ +image: "clarinpl/python:3.8" + +cache: + paths: + - .tox + +stages: + - check_style + - build + +before_script: + - pip install tox==3.18.1 + +pep8: + stage: check_style + script: + - tox -v -e pep8 + +build_image: + stage: build + image: 'docker:18.09.7' + only: + - master + services: + - 'docker:18.09.7-dind' + variables: + DOCKERHUB_NAME: clarinpl/$CI_PROJECT_NAME + before_script: + - '' + script: + - docker build -t $DOCKERHUB_NAME . + - echo $DOCKER_PASSWORD > pass.txt + - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin + - rm pass.txt + - docker push $DOCKERHUB_NAME + - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY + - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG + - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:latest + - docker push $CI_REGISTRY_IMAGE diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..d704403ceb9505a5c9105896aedc1a4de7756565 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM 11.7.0-cudnn8-runtime-ubuntu20.04 + +RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y gcc python3-dev python3-venv python3-pip + +COPY requirements.txt requirements.txt +RUN python3 -m pip install -r requirements.txt && rm requirements.txt + +WORKDIR /home/worker + +COPY src src +COPY entrypoint.py entrypoint.py +COPY worker.py worker.py +COPY config.ini config.ini + +ENTRYPOINT [ "python3", "entrypoint.py"] \ No newline at end of file diff --git 
#!/usr/bin/python3
"""Container entrypoint: sync model files from S3, then start ``worker.py``."""
from subprocess import run
from typing import List, Optional, Tuple
import configparser
import os
import sys

# Fallbacks used when config.ini (or its [deployment] section) lacks a value.
DEFAULT_S3_ENDPOINT = "https://s3.clarin-pl.eu"
DEFAULT_S3_LOCATION = "s3://workers/winer/models"
DEFAULT_MODELS_CACHE_DIR = "/tmp/models"


def load_deployment_settings(config_path: str = "config.ini") -> Tuple[str, str, str]:
    """Read the S3 sync settings from the ``[deployment]`` config section.

    Args:
        config_path: Path of the INI file to read. A missing file or a missing
            ``[deployment]`` section is not an error; the defaults are used.

    Returns:
        Tuple ``(s3_endpoint, s3_location, local_models_location)``.
    """
    parser = configparser.ConfigParser()
    parser.read(config_path)

    def setting(key: str, default: str) -> str:
        # ``fallback`` covers both a missing option and a missing section.
        value = parser.get("deployment", key, fallback=default)
        # The sync command used to be run through a shell, which expanded
        # $VARS inside these values; keep that working without a shell.
        return os.path.expandvars(value)

    return (
        setting("s3_endpoint", DEFAULT_S3_ENDPOINT),
        setting("models_s3_location", DEFAULT_S3_LOCATION),
        setting("models_cache_dir", DEFAULT_MODELS_CACHE_DIR),
    )


def build_sync_command(
    s3_endpoint: str, s3_location: str, local_models_location: str
) -> List[str]:
    """Return the ``aws s3 sync`` argument list for refreshing the models.

    The command is an argument list rather than a shell string, so quotes,
    spaces or shell metacharacters in configured values are passed through
    verbatim instead of being interpreted by a shell.

    Args:
        s3_endpoint: URL passed to ``--endpoint-url``.
        s3_location: Source ``s3://`` location.
        local_models_location: Destination directory on the local disk.

    Returns:
        The argv list to hand to ``subprocess.run``.
    """
    return [
        "aws",
        "--no-sign-request",
        "--endpoint-url",
        s3_endpoint,
        "s3",
        "sync",
        "--delete",
        s3_location,
        local_models_location,
    ]


def main(argv: Optional[List[str]] = None) -> int:
    """Sync the models, then run ``worker.py`` and report its exit status.

    Args:
        argv: Extra arguments forwarded to ``worker.py``; defaults to
            ``sys.argv[1:]``.

    Returns:
        The return code of the ``worker.py`` process.
    """
    worker_args = sys.argv[1:] if argv is None else list(argv)
    sync_cmd = build_sync_command(*load_deployment_settings())

    # The sync stays best-effort (a failed sync never stopped the worker from
    # starting before either), but a failure is now reported, not silent.
    try:
        sync_status = run(sync_cmd).returncode
    except OSError as err:  # e.g. the aws executable is not installed
        print(f"entrypoint: cannot run {sync_cmd[0]!r}: {err}", file=sys.stderr)
    else:
        if sync_status != 0:
            print(
                f"entrypoint: model sync exited with status {sync_status}",
                file=sys.stderr,
            )

    # Reuse the interpreter running this script: the image installs only
    # ``python3``, so a bare ``python`` executable may not exist.
    interpreter = sys.executable or "python3"
    return run([interpreter, "worker.py"] + worker_args).returncode


if __name__ == "__main__":
    sys.exit(main())
"""Entry point of the winer worker service."""

import configparser

import nlp_ws

from src.winer_worker import WinerWorker

# Used when config.ini has no [deployment] section or no models_cache_dir key.
DEFAULT_MODELS_CACHE_DIR = "/home/worker/models"


class Worker(nlp_ws.NLPWorker):
    """``nlp_ws`` worker that delegates every task to a ``WinerWorker``."""

    def init(self) -> None:
        """Create the ``WinerWorker`` from the configured models directory.

        Reads ``models_cache_dir`` from the ``[deployment]`` section of
        ``config.ini``. A missing file, section or key falls back to
        ``DEFAULT_MODELS_CACHE_DIR`` instead of raising ``KeyError``.

        NOTE(review): presumably invoked by the nlp_ws framework before any
        task is processed -- confirm against the nlp_ws documentation.
        """
        parser = configparser.ConfigParser()
        parser.read("config.ini")
        # ``fallback`` covers both a missing option and a missing section.
        models_cache_dir = parser.get(
            "deployment", "models_cache_dir", fallback=DEFAULT_MODELS_CACHE_DIR
        )
        self.winer = WinerWorker(models_cache_dir)

    def process(self, input_path: str, task_options: dict, output_path: str) -> None:
        """Forward one task unchanged to ``WinerWorker.process``.

        Args:
            input_path: Path of the input file for the task.
            task_options: Options supplied with the task.
            output_path: Path the result is written to.
        """
        self.winer.process(input_path, task_options, output_path)


if __name__ == "__main__":
    nlp_ws.NLPService.main(Worker)