Skip to content
Snippets Groups Projects
Select Git revision
  • master
  • vertical_relations
  • lu_without_semantic_frames
  • hierarchy
  • additional-unification-filters
  • v0.1.1
  • v0.1.0
  • v0.0.9
  • v0.0.8
  • v0.0.7
  • v0.0.6
  • v0.0.5
  • v0.0.4
  • v0.0.3
  • v0.0.2
  • v0.0.1
16 results

start_import.py

Blame
  • W2vController.py 8.65 KiB
    """Implementation of W2vController."""
    
    # unicode
    from src.w2v_service import constants
    from src.w2v_service.gensim_wrapper import W2vModel
    
    
    class ModelFormatGenerator:
        """Builds the vocabulary keys used to query the word2vec model.

        A key is ``lemma + DELIMITER_BEETWEN_LEMMA_SPEECH_PART + coarse_pos``
        (e.g. ``u'piec::noun'``). Only NKJP tags are understood for now.
        """

        # TODO def __init__(self, word, pos, pos_type='nkjp'):
        def __init__(self, lemma, pos=None):
            """Store the lemma and resolve the optional NKJP tag.

            :param lemma: raw lemma; every character listed in
                ``constants.REPLACE_CHARACTERS_IN_WORD`` is substituted
                before the lemma is used in a key.
            :param pos: optional NKJP tag. Only the part before the first
                ``':'`` is looked up in ``constants.NKJP_POS_MAPPINGS``.
                When the tag is omitted or has no mapping, ``coarse_pos``
                is ``None`` and generate_all_possible_gensim_formats()
                produces one key per entry of ``constants.ALL_SPEECH_PARTS``.
            """
            self.unparsed_lemma = lemma
            self.pos = pos
            self.coarse_pos = (
                None if pos is None else self.__parse_to_coarse_pos(pos))
            self.lemma = self.__parse_to_gensim_format(lemma)

        @staticmethod
        def __parse_to_coarse_pos(pos):
            """Map an NKJP tag to a coarse speech part, or None if unmapped."""
            # only for nkjp: the grammatical class is the first ':' field
            fleksem = pos.split(':')[0]
            return constants.NKJP_POS_MAPPINGS.get(fleksem)

        @staticmethod
        def __parse_to_gensim_format(lemma):
            """Apply every configured character replacement to the lemma."""
            for character, replacement in \
                    constants.REPLACE_CHARACTERS_IN_WORD.items():
                lemma = lemma.replace(character, replacement)
            return lemma

        def __build_key(self, speech_part):
            """Join the parsed lemma and a speech part with the delimiter."""
            return '{}{}{}'.format(
                self.lemma,
                constants.DELIMITER_BEETWEN_LEMMA_SPEECH_PART,
                speech_part)

        def __gensim_format(self):
            """Return the key for this lemma and its resolved coarse pos.

            Uses the same delimiter constant as
            generate_all_possible_gensim_formats() so both helpers agree on
            the key layout (previously a single ':' was hard-coded here).
            """
            return self.__build_key(self.coarse_pos)

        def generate_all_possible_gensim_formats(self):
            """Generate all possible query forms for this lemma.

            If a pos with a known mapping was passed to the constructor, the
            list contains exactly one form built from its coarse speech part.
            Otherwise one form per entry of ``constants.ALL_SPEECH_PARTS`` is
            returned.

            :return: list of query keys, e.g. [u'piec::noun', u'piec::verb']
            """
            if self.coarse_pos is None:
                speech_parts = constants.ALL_SPEECH_PARTS
            else:
                speech_parts = [self.coarse_pos]
            return [self.__build_key(sp) for sp in speech_parts]
    
    
    class W2vController:
        """Query facade over a ``W2vModel``.

        Unless stated otherwise, words are given as ``'lemma'`` or
        ``'lemma:nkjp_tag'`` and are expanded to model keys through
        ``ModelFormatGenerator``. ``doc2vec`` and ``similarity_betweens``
        take raw model keys instead and perform no expansion.
        """

        def __init__(self, path):
            """Create the underlying model.

            :param path: passed unchanged to ``W2vModel``.
            """
            self.__model = W2vModel(path)

        def __resolve_top_n(self, top_n):
            """Clamp ``top_n`` to the vocabulary size.

            :param top_n: requested number of results, or the string 'max'.
            :return: the vocabulary size for 'max' or for values above it,
                otherwise ``top_n`` unchanged.
            """
            max_top_n = self.__model.vocab_size()
            # The sentinel has to be tested first: ordering a str against an
            # int raises TypeError on Python 3, so 'max' never worked when
            # the comparison came first.
            if top_n == 'max' or top_n > max_top_n:
                return max_top_n
            return top_n

        def most_similar(self, word, top_n=100):
            """Return the nearest neighbours for every known form of a word.

            :param word: 'lemma' or 'lemma:nkjp_tag'
            :param top_n: number of most similar words, or 'max' for the
                whole vocabulary
            :return: list of ``(model_key, neighbours)`` tuples, one per
                generated key that the model contains
            """
            top_n = self.__resolve_top_n(top_n)
            leksem = self.__parse_word_to_format_generator_object(word)
            return self.__most_similar_to_leksem(leksem, top_n=top_n)

        def most_not_match(self, words):
            """Return the element that does not match the others.

            :param words: iterable of 'lemma' or 'lemma:nkjp_tag' strings
            :return: whatever the model's ``doesnt_match`` returns for the
                resolved keys
            """
            words = self.__selectbestmodelformatlist(words)
            return self.__model.doesnt_match(words)

        def most_similar_full(self, positive, negative, top_n=100):
            """Run a positive/negative similarity query.

            :param positive: iterable of 'lemma' or 'lemma:nkjp_tag' strings
            :param negative: iterable of 'lemma' or 'lemma:nkjp_tag' strings
            :param top_n: number of most similar words, or 'max' for the
                whole vocabulary
            :return: single-element list ``[("all", model_result)]``
            """
            top_n = self.__resolve_top_n(top_n)
            positive = self.__selectbestmodelformatlist(positive)
            negative = self.__selectbestmodelformatlist(negative)
            result = self.__model.most_similar_full(positive,
                                                    negative,
                                                    top_n=top_n)
            return [("all", result)]

        def __selectbestmodelformatlist(self, pos_neg_list):
            """Resolve every word of the list to a single model key."""
            return [self.__selectbestmodelformat(el) for el in pos_neg_list]

        def __selectbestmodelformat(self, word):
            """Return the first generated key the model contains.

            NOTE(review): an unknown word yields "" which is then passed on
            to the model by the callers - confirm W2vModel tolerates that.
            """
            leksem = self.__parse_word_to_format_generator_object(word)
            for lwsp in leksem.generate_all_possible_gensim_formats():
                if self.__model.contains(lwsp):
                    return lwsp
            return ""

        def vector_representations(self, word):
            """Return every vector stored for the given word.

            Be aware that one word can have many vectors,
            e.g. 'piec' as noun and as verb.

            :param word: 'lemma' or 'lemma:nkjp_tag'
            :return: list of ``(model_key, vector_as_list)`` tuples
            """
            leksem = self.__parse_word_to_format_generator_object(word)
            return [
                (form, self.__model.vector_representation(form).tolist())
                for form in leksem.generate_all_possible_gensim_formats()
                if self.__model.contains(form)]

        def doc2vec(self, words, counts):
            """Return the count-weighted mean vector of the known words.

            :param words: raw model keys (no pos expansion is done here)
            :param counts: weight of each word, aligned with ``words``
            :return: the averaged vector as a list; an empty list when none
                of the words is in the model (this case used to fail with
                AttributeError because the int accumulator has no tolist())
            """
            weighted = []
            total_count = 0
            for word, count in zip(words, counts):
                if not self.__model.contains(word):
                    continue
                vector = self.__model.vector_representation(word)
                weighted.append(vector if count == 1 else vector * count)
                total_count = total_count + count
            if not weighted:
                return []
            total_sum = sum(weighted)
            # Mirror the original guard: only normalise by a positive weight.
            if total_count > 0:
                total_sum = total_sum / total_count
            return total_sum.tolist()

        def similarity_betweens(self, first_words, second_words):
            """Return the distance matrix between two sets of model keys.

            Words missing from the model are dropped from both lists.

            :param first_words: raw model keys
            :param second_words: raw model keys
            :return: dict with "row" (kept first words), "column" (kept
                second words) and "dist", a list holding one inner list per
                kept second word with ``1 - similarity`` for each kept
                first word
            """
            firsts = [el for el in first_words if self.__model.contains(el)]
            seconds = [el for el in second_words if self.__model.contains(el)]
            similarities = [
                [1 - self.__model.similarity(el1, el2) for el1 in firsts]
                for el2 in seconds]
            return {"row": firsts, "column": seconds, "dist": similarities}

        def similarity_between(self, first_word, second_word):
            """Return similarities between all known forms of two words.

            :param first_word: 'lemma' or 'lemma:nkjp_tag'
            :param second_word: 'lemma' or 'lemma:nkjp_tag'
            :return: list of ``(first_lemma, first_pos, second_lemma,
                second_pos, similarity)`` tuples
            """
            first_leksem = self.__parse_word_to_format_generator_object(first_word)
            second_leksem = self.__parse_word_to_format_generator_object(
                second_word)
            return self.__get_similarity_between_leksems(first_leksem,
                                                         second_leksem)

        def __get_similarity_between_leksems(self, first_leksem, second_leksem):
            """Compare every known form of one leksem with those of another."""
            delimiter = constants.DELIMITER_BEETWEN_LEMMA_SPEECH_PART
            # Membership does not depend on the pairing, so filter each side
            # once instead of re-checking inside the nested loop.
            first_forms = [
                form
                for form in first_leksem.generate_all_possible_gensim_formats()
                if self.__model.contains(form)]
            second_forms = [
                form
                for form in second_leksem.generate_all_possible_gensim_formats()
                if self.__model.contains(form)]

            similarities = []
            for first_form in first_forms:
                # rsplit keeps a lemma containing the delimiter in one piece.
                first_lek, first_pos = first_form.rsplit(delimiter, 1)
                for second_form in second_forms:
                    # TODO what if first_form == second_form happens to come
                    # up here? e.g. piec::verb
                    similarity = self.__model.similarity(
                        first_form, second_form).item()
                    second_lek, second_pos = second_form.rsplit(delimiter, 1)
                    similarities.append(
                        (first_lek,
                         first_pos,
                         second_lek,
                         second_pos,
                         similarity))
            return similarities

        def __most_similar_to_leksem(self, leksem, top_n):
            """Query neighbours for every generated key the model contains."""
            return [
                (lwsp, self.__model.most_similar(lwsp, top_n=top_n))
                for lwsp in leksem.generate_all_possible_gensim_formats()
                if self.__model.contains(lwsp)]

        @staticmethod
        def __parse_word_to_format_generator_object(phrase):
            """Split 'lemma[:tag...]' and wrap it in a ModelFormatGenerator.

            Only the first two ':'-separated fields are used.
            NOTE(review): for an input such as 'piec::noun' the second field
            is the empty string, so the given speech part is presumably
            ignored - confirm whether that form should be supported.
            """
            splitted_phrase = phrase.split(':')
            word = splitted_phrase[0]
            if len(splitted_phrase) >= 2:
                return ModelFormatGenerator(word, splitted_phrase[1])
            return ModelFormatGenerator(word)