Commit 82f23dd1 authored by Bartłomiej Bojanowski

Add XlmRoberta worker

parent af3fc319
Merge requests: !2 Developer, !1 Add XlmRoberta worker
Pipeline #1917 passed
.gitignore
.idea
/idea
log.txt
__pycache__
.gitlab-ci.yml
image: clarinpl/python:3.8

cache:
  paths:
    - .tox

stages:
  - check_style
  - build

before_script:
  - pip install tox==2.9.1

pep8:
  stage: check_style
  script:
    - tox -v -e pep8
Dockerfile
FROM clarinpl/python:3.8
WORKDIR /home/worker

COPY ./main.py .
COPY ./requirements.txt .
COPY ./config.ini .
COPY ./src ./src

RUN apt-get update && apt-get install -y build-essential libffi-dev
RUN pip install --index-url https://pypi.clarin-pl.eu/simple/ -r requirements.txt
RUN pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html

CMD ["python3.8", "main.py"]
config.ini
[service]
tool = xlmroberta
root = /samba/requests/
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123

[tool]
workers_number = 1

[logging]
port = 9981
local_log_level = INFO

[model]
de_sent = {"file": "src/models/de-sent"}
en_sent = {"file": "src/models/en-sent"}
it_sent = {"file": "src/models/it-sent"}
ru_sent = {"file": "src/models/ru-sent"}
zh_sent = {"file": "src/models/zh-sent"}
ja_sent = {"file": "src/models/ja-sent"}
fr_sent = {"file": "src/models/fr-sent"}
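The classifier below looks a model up by concatenating the request language with a task suffix ("_sent" for whole texts, "_sent_sen" for the "sentence" task), so the key names in the [model] section have to follow that pattern. A minimal sketch of that lookup, assuming this config.ini is on disk; the model_key helper is only illustrative:

import configparser
import json

config = configparser.ConfigParser()
config.read("config.ini")

# Every value in [model] is a JSON object holding the model directory.
model_paths = {key: json.loads(value)["file"]
               for key, value in config["model"].items()}


def model_key(lang, task_type):
    # Mirrors the suffix choice made in XlmRobertaClassifier.predict.
    suffix = "_sent_sen" if task_type == "sentence" else "_sent"
    return lang + suffix


print(model_paths[model_key("de", None)])   # src/models/de-sent
# model_key("de", "sentence") would require a de_sent_sen entry,
# which this configuration does not define yet.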
main.py
import nlp_ws

from src.XlmRoberta_Worker import XlmRobertaWorker


if __name__ == '__main__':
    nlp_ws.NLPService.main(XlmRobertaWorker)
src/XlmRoberta_Worker.py
import json
import logging

import nlp_ws
from simpletransformers.classification import ClassificationModel

log = logging.getLogger(__name__)


class XlmRobertaWorker(nlp_ws.NLPWorker):
    @classmethod
    def static_init(cls, config):
        cls.config = config
        log.debug("static_init(%s)", config)

    def init(self):
        log.debug("init()")
        # Each entry in the [model] section of config.ini is a JSON object
        # whose "file" field points at a fine-tuned XLM-RoBERTa model
        # directory; load every one as a CPU-only ClassificationModel.
        models = dict()
        list_models = dict()
        for key in self.config["model"]:
            models[key] = json.loads(self.config["model"][key])
        for key, value in models.items():
            list_models[key] = ClassificationModel("xlmroberta",
                                                   value["file"],
                                                   num_labels=4,
                                                   use_cuda=False)
            log.debug("loaded model %s from %s", key, value["file"])
        self._classifier = XlmRobertaClassifier(list_models)

    def process(self, input_path, task_options, output_path):
        task = task_options.get("type", None)
        # The request file holds the raw text followed by a language marker,
        # e.g. "some text __label__de".
        with open(input_path, "r") as f:
            text = f.read()
        lang = text.split('__label__')[1].strip()
        text = text.split('__label__')[0]
        result = self._classifier.predict(text, lang, task)
        with open(output_path, "w") as f:
            json.dump(result, f, indent=4)
class XlmRobertaClassifier(object):
    def __init__(self, models):
        self.models = models
        self.labels_text = ["__label__meta_amb", "__label__meta_minus_m",
                            "__label__meta_plus_m", "__label__meta_zero"]
        self.labels_sen = ["__label__z_amb", "__label__z_minus_m",
                           "__label__z_plus_m", "__label__z_zero"]

    def predict(self, ccl, lang, task_options):
        # "sentence" requests use the sentence-level models and labels,
        # everything else falls back to the text-level ones.
        if task_options == "sentence":
            task = "_sent_sen"
            labels = self.labels_sen
        else:
            task = "_sent"
            labels = self.labels_text
        model = self.models[lang + task]
        decision, raw = model.predict([ccl])
        log.debug("raw outputs %s for labels %s", raw, labels)
        # Cast the raw outputs to plain floats so the result stays
        # JSON-serialisable.
        result = dict(zip(labels, (float(score) for score in raw[0])))
        result['decision'] = labels[decision[0]]
        result['lang'] = lang
        return result
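For reference, a sketch of the request contract implied by process(): the input file holds the raw text followed by a __label__<lang> marker, and the output file receives a JSON object with one raw score per label plus the chosen label and the language. The file name and scores here are illustrative, not taken from the repository:

# Write a request file in the format the worker expects (illustrative name).
with open("request.txt", "w") as f:
    f.write("The hotel was excellent. __label__en")

# After processing, the output file holds a JSON object of this shape
# (scores are made up):
# {
#     "__label__meta_amb": -1.2,
#     "__label__meta_minus_m": -0.8,
#     "__label__meta_plus_m": 3.4,
#     "__label__meta_zero": -0.5,
#     "decision": "__label__meta_plus_m",
#     "lang": "en"
# }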
tox.ini 0 → 100644
[tox]
envlist = pep8
skipsdist = True

[testenv:pep8]
deps =
    flake8
basepython = python3.8
commands =
    flake8 {posargs}