Commit 981ca4ed authored by Bartosz Walkowiak

Merge branch 'develop' into 'master'

Develop

Closes #2

See merge request !5
parents c7a95bb7 4c05429c
Pipeline #14610 passed with stages in 1 minute and 26 seconds
image: 'clarinpl/python:3.6'

cache:
  paths:
    - .tox

stages:
  - check_style
  - build

before_script:
  - pip install tox==2.9.1

.check_style_template:
  before_script:
    - pip install tox==2.9.1

pep8:
  extends: .check_style_template
  stage: check_style
  script:
    - tox -v -e pep8

docstyle:
  extends: .check_style_template
  stage: check_style
  script:
    - tox -v -e docstyle

build_image:
  stage: build
  image: 'docker:18.09.7'
@@ -22,8 +31,6 @@ build_image:
    - master
  services:
    - 'docker:18.09.7-dind'
  before_script:
    - ''
  script:
    - docker build -t $CI_REGISTRY_IMAGE .
    - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
......
@@ -18,4 +18,4 @@ location_params = \
cmd = main_cmd + location_params
run(cmd, shell=True)
run(["python3.6", "main.py", "service"])
run(["python3.6", "main.py"])
"""Implementation of hask service."""
import argparse

import lex_ws

from src.word2vec_worker import W2vWorker


def get_args():
    """Gets command line arguments."""
    parser = argparse.ArgumentParser(description="Topic Modeling")
    subparsers = parser.add_subparsers(dest="algorithm")
    subparsers.required = True
    subparsers.add_parser(
        "service",
        help="Run as a service"
    )
    return parser.parse_args()


def main():
    """Runs the program."""
    args = get_args()
    generators = {
        "service": lambda: lex_ws.LexService.main(W2vWorker),
    }
    gen_fn = generators.get(args.algorithm, lambda: None)
    gen_fn()


if __name__ == "__main__":
    main()
lex_ws.LexService.main(W2vWorker)
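
Editor's note, not part of the diff: the required subparser is why the launcher script above now has to append "service" on the command line. A minimal standalone sketch, assuming only the standard library:

# Minimal sketch: the subparser created in get_args() is marked required, so
# parsing an empty argument list exits with an argparse error, while
# ["service"] selects the service generator.
import argparse

parser = argparse.ArgumentParser(description="Topic Modeling")
subparsers = parser.add_subparsers(dest="algorithm")
subparsers.required = True
subparsers.add_parser("service", help="Run as a service")

print(parser.parse_args(["service"]).algorithm)  # prints: service
# parser.parse_args([]) would raise SystemExit, because a subcommand is mandatory.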
@@ -112,9 +112,9 @@ class W2vController:
                                           top_n=top_n)))
        return most_similar

    def __selectbestmodelformatlist(self, list):
    def __selectbestmodelformatlist(self, pos_neg_list):
        result = []
        for el in list:
        for el in pos_neg_list:
            result.append(self.__selectbestmodelformat(el))
        return result
@@ -144,17 +144,19 @@ class W2vController:
    def doc2vec(self, words, counts):
        """Returns the count-weighted average vector of the given words."""
        ind = 0
        sum = 0
        total_sum = 0
        for word, count in zip(words, counts):
            if self.__model.contains(word):
                ind = ind + count
                if count == 1:
                    sum = sum + self.__model.vector_representation(word)
                    total_sum = total_sum + \
                        self.__model.vector_representation(word)
                else:
                    sum = sum + self.__model.vector_representation(word) * count
                    total_sum = total_sum + self.__model.vector_representation(
                        word) * count
        if ind > 0:
            sum = sum / ind
        return sum.tolist()
            total_sum = total_sum / ind
        return total_sum.tolist()
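
Editor's note: to make the weighting above concrete, a minimal sketch assuming numpy, with made-up 3-dimensional vectors standing in for self.__model.vector_representation():

# Minimal sketch of the count-weighted averaging done in doc2vec().
import numpy as np

vectors = {"kot": np.array([1.0, 0.0, 0.0]),
           "pies": np.array([0.0, 1.0, 0.0])}
words, counts = ["kot", "pies"], [2, 1]

total_sum, ind = 0, 0
for word, count in zip(words, counts):
    if word in vectors:                              # stands in for model.contains(word)
        ind += count
        total_sum = total_sum + vectors[word] * count
if ind > 0:
    total_sum = total_sum / ind                      # count-weighted mean vector
print(total_sum.tolist())                            # approx. [0.667, 0.333, 0.0]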
    def similarity_betweens(self, first_words, second_words):
        """."""
@@ -230,7 +232,3 @@ class W2vController:
        else:
            word = splitted_phrase[0]
        return ModelFormatGenerator(word)


if __name__ == '__main__':
    pass

# __all__ = ["bar", "spam", "eggs"]
# from W2vModel import W2vModel
# from W2vController import W2vController
@@ -10,11 +10,10 @@ REPLACE_CHARACTERS_IN_WORD = {
ALL_SPEECH_PARTS = \
    ['noun', 'adj', 'verb', 'num', 'adv', 'pron', 'prep',
     'conj', 'interj', 'burk', 'qub', 'xxx', 'interp', 'aglt']
print(ALL_SPEECH_PARTS)

__noun, __adj, __verb, __num, __adv, __pron, __prep, __conj, \
    __interj, __burk, __qub, __xxx, __interp, __aglt = ALL_SPEECH_PARTS

# 'noun', 'adj', 'verb', 'num', 'adv', 'pron', 'prep', 'conj',
# 'interj', 'burk', 'qub', 'xxx', 'interp', 'aglt'

NKJP_POS_MAPPINGS = {
    'subst': __noun, 'depr': __noun, 'ger': __noun, 'brev': __noun,
    'adj': __adj, 'adja': __adj, 'adjp': __adj, 'adjc': __adj,
......
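
Editor's note: the full mapping table is truncated in this diff; a tiny self-contained excerpt, for illustration only, of how fine-grained NKJP tags collapse onto the coarse classes in ALL_SPEECH_PARTS:

# Illustrative excerpt, not the real NKJP_POS_MAPPINGS table.
coarse_noun, coarse_adj = 'noun', 'adj'
EXCERPT = {'subst': coarse_noun, 'depr': coarse_noun, 'ger': coarse_noun,
           'adja': coarse_adj, 'adjc': coarse_adj}
print(EXCERPT['ger'])   # prints: noun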
@@ -21,8 +21,6 @@ class W2vModel(object):
    def most_similar_full(self, positive, negative, top_n=10):
        """Returns the top_n words closest to positive and furthest from negative."""
        print(positive)
        print(negative)
        return self.__model.most_similar(positive, negative, topn=top_n)

    def most_similar(self, lwsp, top_n=10):
@@ -44,7 +42,3 @@ class W2vModel(object):
    def vocab_size(self):
        """Returns size of the model's vocabulary."""
        return len(self.__model.vocab)


if __name__ == '__main__':
    pass
@@ -7,7 +7,6 @@ import lex_ws
from src.w2v_service.W2vController import W2vController

my_logger = logging.getLogger(__name__)
_log = logging.getLogger(__name__)

MODELS_PATHS = {
@@ -22,29 +21,48 @@ class W2vWorker(lex_ws.LexWorker):
    @classmethod
    def static_init(cls, config):
        """Initializes W2vWorker worker."""
        # cls.__service = W2vService(model_type=config["tool"]["model_type"])
        """Initializes W2vWorker worker.

        :param config: Configuration dict storing info about model type to use.
        :type config: dict
        """
        my_logger.info("Loading models...")
        model_path = MODELS_PATHS[config["tool"]["model_type"]]
        cls.__model = W2vController(model_path)
        my_logger.info("Loading finished.")
        return
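
Editor's note: a hypothetical configuration for static_init(), for illustration only; the real keys of MODELS_PATHS are not shown in this diff, so the model_type value is a placeholder:

# "w2v_default" is a made-up key standing in for whatever MODELS_PATHS defines.
example_config = {"tool": {"model_type": "w2v_default"}}
# W2vWorker.static_init(example_config) would then resolve
# MODELS_PATHS["w2v_default"] and wrap that model path in a W2vController.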
    def process(self, input):
        """Running lex process."""
    def process(self, service_input):
        """Running lex process.

        :param service_input: Input containing information to process
        :type service_input: dict
        :return: Service results.
        :rtype: list | dict
        """
        my_logger.info("Doing work!")
        if "function" in input:
            res = self._evaluate_function(input["function"], input)
        if "function" in service_input:
            res = self._evaluate_function(
                service_input["function"], service_input)
        else:
            res = self._evaluate_operation(input["task"], input)
            res = self._evaluate_operation(service_input["task"], service_input)
        my_logger.info("Work done!")
        return res
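
Editor's note: two made-up inputs illustrating the dispatch above; a "function" key routes to _evaluate_function, otherwise the "task" key is handed to _evaluate_operation:

# Illustrative inputs only; field values are invented.
function_request = {"function": "getInfo"}
task_request = {"task": "vector", "lexeme": "kot"}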
    def _evaluate_function(self, function_type, input):
    def _evaluate_function(self, function_type, input_to_evaluate):
        """Evaluate input received by the service.

        :param function_type: function to be performed on input.
        :type function_type: str
        :param input_to_evaluate: Data to be evaluated by the worker.
        :type input_to_evaluate: dict
        :return: response from the model.
        :rtype: dict | list
        """
        response = {}
        if function_type == "list":
            element = input["element"]
            element = input_to_evaluate["element"]
            if "lang" not in element or element["lang"] != "pl":
                return response
            if "lemma" not in element:
@@ -57,13 +75,13 @@ class W2vWorker(lex_ws.LexWorker):
                    "json"], "url": "http://ws.clarin-pl.eu/w2vdemo.shtml"}
        elif function_type == "get":
            element = input["element"]
            element = input_to_evaluate["element"]
            if "lang" not in element or element["lang"] != "pl":
                return response
            if "lemma" not in element:
                return response
            element = input["element"]
            element = input_to_evaluate["element"]
            return self.__model.most_similar(element["lemma"])
        elif function_type == "getInfo":
            with open("src/info.json", "rt", encoding="utf8") as f:
@@ -72,26 +90,51 @@
        return response
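
Editor's note, values invented: the "get" branch above only answers for Polish elements that carry a lemma, otherwise the empty response is returned:

# Made-up "get" requests: only the first passes the lang/lemma checks above.
accepted = {"function": "get", "element": {"lang": "pl", "lemma": "kot"}}
rejected = {"function": "get", "element": {"lang": "en", "lemma": "cat"}}  # -> {}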
    def _evaluate_operation(self, operation_type, input):
    def _evaluate_operation(self, operation_type, input_to_evaluate):
        """Evaluates operation passed by task.

        :param operation_type: task which is supposed to be done.
            Can be one of: notmatch, kbestposneg, kbest, similarity,
            similarities, doc2vec, vector or all.
        :type operation_type: str
        :raise Exception: If the provided operation is not supported.
        :param input_to_evaluate: Data to be evaluated by the worker.
        :type input_to_evaluate: dict
        :return: task response
        :rtype: dict | list
        """
        model = self.__model
        if operation_type == "notmatch":
            return model.most_not_match(input["options"]["words"])
            return model.most_not_match(input_to_evaluate["options"]["words"])
        if operation_type == "kbestposneg":
            return model.most_similar_full(input["options"]["positive"],
                                           input["options"]["negative"])
            return model.most_similar_full(
                input_to_evaluate["options"]["positive"],
                input_to_evaluate["options"]["negative"]
            )
        if operation_type == "kbest":
            return model.most_similar(input["lexeme"])
            return model.most_similar(input_to_evaluate["lexeme"])
        elif operation_type == "similarity":
            return model.similarity_between(input["lexeme"], input["lexeme2"])
            return model.similarity_between(
                input_to_evaluate["lexeme"],
                input_to_evaluate["lexeme2"]
            )
        elif operation_type == "similarities":
            return model.similarity_betweens(input["lexeme1"], input["lexeme2"])
            return model.similarity_betweens(
                input_to_evaluate["lexeme1"],
                input_to_evaluate["lexeme2"]
            )
        elif operation_type == "doc2vec":
            return model.doc2vec(input["lexeme1"], input["counts1"])
            return model.doc2vec(
                input_to_evaluate["lexeme1"],
                input_to_evaluate["counts1"]
            )
        elif operation_type == "vector":
            return model.vector_representations(input["lexeme"])
            return model.vector_representations(input_to_evaluate["lexeme"])
        elif operation_type == "all" or not operation_type:
            return {"vector": model.vector_representations(input["lexeme"]),
                    "kbest": model.most_similar(input["lexeme"])}
            return {
                "vector": model.vector_representations(
                    input_to_evaluate["lexeme"]),
                "kbest": model.most_similar(input_to_evaluate["lexeme"])
            }
        else:
            raise Exception(
......
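
Editor's note: illustrative payload shapes for each operation handled above; the key names come from the diff, the values are made up:

# Illustrative payloads only; key names mirror _evaluate_operation().
operation_examples = {
    "vector": {"task": "vector", "lexeme": "kot"},
    "kbest": {"task": "kbest", "lexeme": "kot"},
    "similarity": {"task": "similarity", "lexeme": "kot", "lexeme2": "pies"},
    "similarities": {"task": "similarities",
                     "lexeme1": ["kot", "dom"], "lexeme2": ["pies", "okno"]},
    "doc2vec": {"task": "doc2vec",
                "lexeme1": ["kot", "pies"], "counts1": [2, 1]},
    "kbestposneg": {"task": "kbestposneg",
                    "options": {"positive": ["krol", "kobieta"],
                                "negative": ["mezczyzna"]}},
    "notmatch": {"task": "notmatch",
                 "options": {"words": ["kot", "pies", "okno"]}},
}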