diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1cf670843fa7ee148f2553819b46cc726520e5a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.idea +/idea +log.txt +__pycache__ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..796f62bc42889573258b21a53e13c6c960e4c754 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,17 @@ +image: clarinpl/python:3.8 + +cache: + paths: + - .tox + +stages: + - check_style + - build + +before_script: + - pip install tox==2.9.1 + +pep8: + stage: check_style + script: + - tox -v -e pep8 \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..024aa087e4af57e0c9bdbce369d22e7d7495a5d3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM clarinpl/python:3.8 + +WORKDIR /home/worker +COPY ./main.py . +COPY ./requirements.txt . +COPY ./config.ini . +COPY ./src ./src +RUN apt-get install -y build-essential libffi-dev +RUN pip install --index-url https://pypi.clarin-pl.eu/simple/ -r requirements.txt +RUN pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html +CMD ["python3.8", "main.py"] diff --git a/config.ini b/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..0744953b7e5ad2e825e509145c8ceb930bc63299 --- /dev/null +++ b/config.ini @@ -0,0 +1,24 @@ +[service] +tool = xlmroberta + +root = /samba/requests/ +rabbit_host = 10.17.0.85 +rabbit_user = clarin +rabbit_password = clarin123 + +[tool] +workers_number = 1 + +[logging] +port = 9981 +local_log_level = INFO + +[model] +de_sent = {"file": "src/models/de-sent"} +en_sent = {"file": "src/models/de-sent"} +it_sent = {"file": "src/models/de-sent"} +ru_sent = {"file": "src/models/de-sent"} +zh_sent = {"file": "src/models/de-sent"} +ja_sent = {"file": "src/models/de-sent"} +fr_sent = {"file": "src/models/de-sent"} + diff --git 
class XlmRobertaWorker(nlp_ws.NLPWorker):
    """NLP worker that classifies a text with per-language XLM-RoBERTa models.

    The models are listed in the ``model`` section of the service
    configuration: each entry maps a model key (e.g. ``de_sent``) to a JSON
    object whose ``"file"`` field is the location of the model to load.
    """

    # Marker that separates the text to classify from the language code in
    # the input file: ``<text>__label__<lang>``.
    _LANG_MARKER = "__label__"

    @classmethod
    def static_init(cls, config):
        """Keep the service configuration on the class for ``init``."""
        cls.config = config
        log.debug("static_init(%s)", config)

    def init(self):
        """Load every configured model and build the classifier around them."""
        log.debug("init()")
        model_section = self.config["model"]
        # Parse all entries first so a malformed entry fails before any
        # (slow) model loading starts.
        model_specs = {key: json.loads(model_section[key])
                       for key in model_section}
        loaded_models = {}
        for key, spec in model_specs.items():
            loaded_models[key] = ClassificationModel("xlmroberta",
                                                     spec["file"],
                                                     num_labels=4,
                                                     use_cuda=False)
            log.debug("loaded model %s from %s", key, spec["file"])
        self._classifier = XlmRobertaClassifier(loaded_models)

    def process(self, input_path, task_options, output_path):
        """Classify the text stored in ``input_path`` and dump the result.

        The input file must contain ``<text>__label__<lang>``; the language
        code selects the model.  ``task_options["type"]`` (optional) selects
        the task variant and is passed to the classifier unchanged.  The
        result dictionary is written to ``output_path`` as indented JSON.

        Raises:
            ValueError: if the input file has no ``__label__<lang>`` part.
            KeyError: if no model is loaded for the requested language/task.
        """
        task = task_options.get("type", None)
        # Explicit encoding: the texts are multilingual and must not depend
        # on the locale of the container the worker runs in.
        with open(input_path, "r", encoding="utf-8") as f:
            content = f.read()
        parts = content.split(self._LANG_MARKER)
        if len(parts) < 2:
            raise ValueError(
                "input file %r has no '%s<lang>' part"
                % (input_path, self._LANG_MARKER))
        text = parts[0]
        # A trailing newline after the language code would otherwise end up
        # in the model key and make the lookup fail.
        lang = parts[1].strip()
        result = self._classifier.predict(text, lang, task)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)
class XlmRobertaClassifier:
    """Pick the model for a language/task and format its prediction.

    ``models`` maps model keys to objects exposing ``predict(list_of_texts)``
    that returns ``(decisions, raw_outputs)``.  Keys are built as
    ``<lang>_sent`` for the default task and ``<lang>_sent_sen`` for the
    ``"sentence"`` task.
    """

    def __init__(self, models):
        self.models = models
        # NOTE(review): the last label of each list starts with three
        # underscores, unlike the others - looks like a typo.  Kept as is
        # because these strings are the keys of the emitted result; confirm
        # with the consumers before changing.
        self.labels_text = ["__label__meta_amb", "__label__meta_minus_m",
                            "__label__meta_plus_m", "___label__meta_zero"]
        self.labels_sen = ["__label__z_amb", "__label__z_minus_m",
                           "__label__z_plus_m", "___label__z_zero"]

    def predict(self, ccl, lang, task_options):
        """Classify ``ccl`` with the model for ``lang`` and the given task.

        Args:
            ccl: text to classify.
            lang: language code used as the prefix of the model key.
            task_options: ``"sentence"`` selects the sentence-level model and
                labels; any other value selects the text-level ones.

        Returns:
            dict mapping every label to its raw score (plain ``float``),
            plus ``"decision"`` (the winning label) and ``"lang"``.

        Raises:
            KeyError: if no model is loaded for the language/task pair.
        """
        if task_options == "sentence":
            suffix, labels = "_sent_sen", self.labels_sen
        else:
            suffix, labels = "_sent", self.labels_text
        model_key = lang + suffix
        try:
            model = self.models[model_key]
        except KeyError:
            raise KeyError(
                "no model loaded for %r (available: %s)"
                % (model_key, sorted(self.models))) from None
        decision, raw = model.predict([ccl])
        logging.getLogger(__name__).debug(
            "model %s: raw=%s labels=%s", model_key, raw, labels)
        # Convert to built-in floats: model outputs are typically numpy
        # scalars, which the json module refuses to serialise.
        result = {label: float(score) for label, score in zip(labels, raw[0])}
        result['decision'] = labels[decision[0]]
        result['lang'] = lang
        return result