From 0daab175246a1b36cbde870e47429456d28f3f8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?= <bartlomiej.bojanowski@pwr.edu.pl>
Date: Wed, 30 Sep 2020 13:25:37 +0200
Subject: [PATCH] Add MultifiWorker

---
 .gitignore             |  4 ++
 .gitlab-ci.yml         | 17 +++++++++
 main.py                |  6 +++
 src/Multifit_worker.py | 84 ++++++++++++++++++++++++++++++++++++++++++
 tox.ini                | 10 +++++
 5 files changed, 121 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .gitlab-ci.yml
 create mode 100644 main.py
 create mode 100644 src/Multifit_worker.py
 create mode 100644 tox.ini

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1cf6708
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.idea
+/idea
+log.txt
+__pycache__
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..796f62b
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,17 @@
+image: clarinpl/python:3.8
+
+cache:
+  paths:
+    - .tox
+
+stages:
+  - check_style
+  - build
+
+before_script:
+  - pip install tox==2.9.1
+
+pep8:
+  stage: check_style
+  script:
+    - tox -v -e pep8
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2e3d817
--- /dev/null
+++ b/main.py
@@ -0,0 +1,6 @@
+import nlp_ws
+from src.Multifit_worker import MultifitWorker
+
+
+if __name__ == '__main__':
+    nlp_ws.NLPService.main(MultifitWorker)
diff --git a/src/Multifit_worker.py b/src/Multifit_worker.py
new file mode 100644
index 0000000..f486174
--- /dev/null
+++ b/src/Multifit_worker.py
@@ -0,0 +1,84 @@
+"""nlp_ws worker that classifies text with MultiFiT models."""
+import json
+import logging
+
+import nlp_ws
+from fastai.text import load_learner
+from fastai.torch_core import to_np
+
+log = logging.getLogger(__name__)
+
+# Separates the text to classify from its language code in input files.
+LANG_MARKER = "__label__"
+
+
+class MultifitWorker(nlp_ws.NLPWorker):
+    """Worker that classifies one input file per task."""
+
+    @classmethod
+    def static_init(cls, config):
+        """Store the service configuration on the worker class."""
+        cls.config = config
+        log.debug("static_init(%s)", config)
+
+    def init(self):
+        """Create the classifier used by this worker instance."""
+        log.debug("init()")
+        self._classifier = MultifitClassifier()
+
+    def process(self, input_path, task_options, output_path):
+        """Classify the text in input_path and write JSON to output_path.
+
+        The input must look like ``TEXT__label__LANG``; ``LANG``
+        selects the model file.  A ``"type"`` task option equal to
+        ``"sentence"`` selects the sentence-level model.
+
+        Raises:
+            ValueError: if the marker or the language code is missing.
+        """
+        task = task_options.get("type", None)
+        with open(input_path, "r", encoding="utf-8") as f:
+            content = f.read()
+        parts = content.split(LANG_MARKER)
+        # Strip so a trailing newline does not end up in the model name.
+        lang = parts[1].strip() if len(parts) > 1 else ""
+        if not lang:
+            raise ValueError(
+                "no language code after '%s' in input" % LANG_MARKER)
+        result = self._classifier.predict(
+            parts[0], lang=lang, task_options=task)
+        log.debug("result: %s", result)
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(result, f, indent=4)
+
+
+class MultifitClassifier(object):
+    """Loads MultiFiT models and pairs class probabilities with labels."""
+
+    def __init__(self):
+        """Define the label sets of the text and sentence models."""
+        self.labels_text = ["__label__meta_amb", "__label__meta_minus_m",
+                            "__label__meta_plus_m", "__label__meta_zero"]
+        self.labels_sen = ["__label__z_amb", "__label__z_minus_m",
+                           "__label__z_plus_m", "__label__z_zero"]
+
+    def predict(self, ccl, lang=None, task_options=None):
+        """Return ``[(probability, label), ...]`` for the given text.
+
+        ``lang`` is the prefix of the model file under ``models/``;
+        ``task_options == "sentence"`` picks the sentence-level model
+        and labels.  Probabilities are returned as strings.
+        """
+        if task_options == "sentence":
+            path = lang + "-sent-sen.pkl"
+            labels = self.labels_sen
+        else:
+            path = lang + "-sent.pkl"
+            labels = self.labels_text
+        # NOTE(review): load_learner unpickles the file and ``lang``
+        # comes from the task input, so only trusted model files and
+        # language codes should reach this point.
+        learner = load_learner("models", path)
+        results = learner.predict("xxbos " + str(ccl))
+        probabilities = [str(x) for x in to_np(results[2])]
+        # A list, not a bare zip: the caller serialises it with json.
+        return list(zip(probabilities, labels))
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..7e68f0e
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,10 @@
+[tox]
+envlist = pep8
+skipsdist = True
+
+[testenv:pep8]
+deps =
+    flake8
+basepython = python3.8
+commands =
+    flake8 {posargs}
-- 
GitLab