diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1cf670843fa7ee148f2553819b46cc726520e5a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.idea +/idea +log.txt +__pycache__ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..796f62bc42889573258b21a53e13c6c960e4c754 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,17 @@ +image: clarinpl/python:3.8 + +cache: + paths: + - .tox + +stages: + - check_style + - build + +before_script: + - pip install tox==2.9.1 + +pep8: + stage: check_style + script: + - tox -v -e pep8 \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..198ac470ee48ca1c08108ca2f302124de844fc24 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM clarinpl/python:3.8 + +WORKDIR /home/worker +COPY ./src ./src +COPY ./main.py . +COPY ./requirements.txt . +COPY ./config.ini . +COPY ./models ./models +RUN apt-get install -y build-essential libffi-dev +RUN pip install --index-url https://pypi.clarin-pl.eu/simple/ -r requirements.txt +RUN pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html +RUN python -m laserembeddings download-models +RUN python -m nltk.downloader punkt +CMD ["python3.8", "main.py"] diff --git a/config.ini b/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..d473f9474434abe246c1a3f90075d03f2beccd7a --- /dev/null +++ b/config.ini @@ -0,0 +1,19 @@ +[service] +tool = bilstm_laser + +root = /samba/requests/ +rabbit_host = 10.17.0.85 +rabbit_user = clarin +rabbit_password = clarin123 + +[tool] +workers_number = 1 + +[logging] +port = 9981 +local_log_level = INFO + +[model] +model_a = {"file": "models/model_a.hdf5", "labels": ["__label__meta_minus_m", "__label__meta_plus_m", "__label_meta_zero", "__label_meta_amb"]} +model_b = {"file": "models/model_b.hdf5", "labels": 
import json
import operator
import logging

import keras
import numpy as np
from nltk.tokenize import sent_tokenize
from laserembeddings import Laser
from keras import backend as K
import nlp_ws

log = logging.getLogger(__name__)

# NLTK punkt tokenizer model names keyed by ISO 639-1 language code.
# NOTE(review): Turkish is "tr" — the original mapped it from "tu", which
# is not a valid ISO 639-1 code, so LASER/NLTK could never accept it.
LANGUAGE_NAMES = {
    'pl': 'polish', 'en': 'english', 'cs': 'czech', 'da': 'danish',
    'nl': 'dutch', 'et': 'estonian', 'fi': 'finnish', 'fr': 'french',
    'de': 'german', 'el': 'greek', 'it': 'italian', 'no': 'norwegian',
    'pt': 'portuguese', 'ru': 'russian', 'sl': 'slovenian',
    'es': 'spanish', 'sv': 'swedish', 'tr': 'turkish',
}

# Dimensionality of a LASER sentence embedding vector.
LASER_DIM = 1024


class BiLstmLaserWorker(nlp_ws.NLPWorker):
    """nlp_ws worker: sentiment classification with BiLSTM models over
    LASER sentence embeddings."""

    @classmethod
    def static_init(cls, config):
        """Remember the service configuration (called once per worker class)."""
        cls.config = config
        log.debug("static_init(%s)", config)

    def init(self):
        """Load every model declared in the ``[model]`` config section.

        Each config value is a JSON object of the form
        ``{"file": "<hdf5 path>", "labels": [...]}``.
        """
        log.debug("init()")
        models = {key: json.loads(raw)
                  for key, raw in self.config["model"].items()}
        self._classifier = BiLstmClassifier(models)

    def process(self, input_path, task_options, output_path):
        """Classify the text in *input_path* and dump a JSON result.

        The input file carries the language inline:
        ``<text>__label__<iso-code>``.

        :param input_path: path of the file to classify
        :param task_options: may carry ``"model"`` — an iterable of model
            names to run; all loaded models are used when absent
        :param output_path: destination of the JSON mapping
            model name -> winning label
        :raises ValueError: when the ``__label__`` language marker is missing
        """
        requested = task_options.get("model", None)
        log.info("requested models: %s", requested)
        with open(input_path, "r") as f:
            raw = f.read()
        text, sep, lang = raw.partition('__label__')
        if not sep:
            # The original indexed split(...)[1] and died with a bare
            # IndexError here; fail with an explicit message instead.
            raise ValueError(
                "input file lacks the '__label__<lang>' language marker")
        lang = lang.strip()  # tolerate a trailing newline after the code
        result = self._classifier.predict(text, lang=lang,
                                          task_options=requested)
        log.info("result: %s", result)
        with open(output_path, "w") as f:
            json.dump(result, f, indent=4)


def loss_coeff_determination(y_true, y_pred):
    """Custom training loss (residual / total sum-of-squares ratio).

    Only referenced so ``keras.models.load_model`` can deserialize the
    saved networks; it is never invoked at inference time.
    NOTE(review): the denominator is not guarded against ``SS_tot == 0`` —
    kept as-is to match the behaviour the models were trained with.
    """
    ss_res = K.sum(K.square(y_true - y_pred))
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ss_res / ss_tot


class BiLstmClassifier(object):
    """Runs one or more Keras BiLSTM models on LASER sentence embeddings."""

    def __init__(self, model_settings):
        """Load each model file and remember its label set.

        :param model_settings: mapping of model name ->
            ``{"file": "<hdf5 path>", "labels": [label, ...]}``
        """
        self.models = {}
        self.classes = []
        for name, settings in model_settings.items():
            self.models[name] = {
                "model": keras.models.load_model(
                    settings["file"],
                    custom_objects={
                        "loss_coeff_determination": loss_coeff_determination,
                    }),
                "labels": settings["labels"],
            }
        self.laser = Laser()

    def predict(self, ccl, models=None, task_options=None, k=1,
                threshold=0.0, lang=None):
        """Classify *ccl* with every selected model.

        :param ccl: raw input text; sentences are extracted with NLTK
        :param task_options: iterable of model names to run; ``None`` = all
        :param lang: ISO 639-1 language code of the text
        :param models, k, threshold: retained for interface compatibility;
            currently unused
        :return: dict mapping model name -> label with the highest score
        :raises KeyError: on an unknown language code or model name
        """
        if task_options is None:
            selected = self.models
        else:
            selected = {name: self.models[name] for name in task_options}

        log.debug("detected language: %s", lang)
        sentences = sent_tokenize(ccl, language=LANGUAGE_NAMES[lang])

        # One "document" = the sequence of its sentence embeddings,
        # shaped (batch=1, n_sentences, LASER_DIM).
        batch = np.zeros((1, len(sentences), LASER_DIM))
        batch[0, :] = self.laser.embed_sentences(sentences, lang=lang)

        result = {}
        for name, entry in selected.items():
            scores = next(iter(entry["model"].predict(batch)))
            # Clamp each score into [0, 1] and pick the arg-max label on
            # *floats*. The original stringified the scores first, which made
            # max() compare lexicographically and mis-rank values rendered in
            # scientific notation (e.g. "1e-05" sorted above "0.9").
            label_scores = {
                entry["labels"][i]: min(max(float(score), 0.0), 1.0)
                for i, score in enumerate(scores)
            }
            result[name] = max(label_scores.items(),
                               key=operator.itemgetter(1))[0]
        return result