# Select Git revision
# start_import.py
# W2vController.py 8.65 KiB
"""Implementation of W2vController."""
# unicode
from src.w2v_service import constants
from src.w2v_service.gensim_wrapper import W2vModel
class ModelFormatGenerator:
    """Build the query strings used to look a lemma up in the model.

    A query has the form ``<lemma><delimiter><speech part>`` where the
    delimiter is ``constants.DELIMITER_BEETWEN_LEMMA_SPEECH_PART``.
    """

    # TODO def __init__(self, word, pos, pos_type='nkjp'):
    def __init__(self, lemma, pos=None):
        """Store the lemma and, when a tag is given, its coarse speech part.

        Only NKJP tags are supported for now.

        :param lemma: raw lemma; kept untouched in ``unparsed_lemma`` and
            stored in ``lemma`` after the character replacements from
            ``constants.REPLACE_CHARACTERS_IN_WORD``.
        :param pos: optional tag; only the part before the first ``':'``
            is looked up in ``constants.NKJP_POS_MAPPINGS``. When omitted
            (or not found in the mapping) ``coarse_pos`` is ``None`` and
            ``generate_all_possible_gensim_formats()`` yields one query
            per entry of ``constants.ALL_SPEECH_PARTS``.
        """
        self.unparsed_lemma = lemma
        self.pos = pos
        if pos is None:
            self.coarse_pos = None
        else:
            self.coarse_pos = self.__parse_to_coarse_pos(pos)
        self.lemma = self.__parse_to_gensim_format(lemma)

    @staticmethod
    def __parse_to_coarse_pos(pos):
        """Map the leading field of *pos* to a coarse speech part, or None.

        Only valid for NKJP tags.
        """
        fleksem = pos.split(':')[0]
        if fleksem not in constants.NKJP_POS_MAPPINGS:
            return None
        return constants.NKJP_POS_MAPPINGS[fleksem]

    @staticmethod
    def __parse_to_gensim_format(lemma):
        """Apply every replacement of REPLACE_CHARACTERS_IN_WORD to *lemma*."""
        replacements = constants.REPLACE_CHARACTERS_IN_WORD
        for character, replacement in replacements.items():
            lemma = lemma.replace(character, replacement)
        return lemma

    def __gensim_format(self, speech_part=None):
        """Return the query string for this lemma and *speech_part*.

        Single place where a query is assembled, so the delimiter is always
        the shared constant rather than a hard-coded colon.

        :param speech_part: speech part to append; defaults to
            ``self.coarse_pos``.
        """
        if speech_part is None:
            speech_part = self.coarse_pos
        return '{}{}{}'.format(self.lemma,
                               constants.DELIMITER_BEETWEN_LEMMA_SPEECH_PART,
                               speech_part)

    def generate_all_possible_gensim_formats(self):
        """Generate all possible query forms for this lemma.

        If a tag that maps to a coarse speech part was passed to the
        constructor, the list contains exactly one query for that speech
        part; otherwise it contains one query per entry of
        ``constants.ALL_SPEECH_PARTS``.

        :return: list of query strings, e.g. ``['piec::noun', 'piec::verb']``
            (assuming the delimiter constant is ``'::'``).
        """
        if self.coarse_pos is None:
            speech_parts = constants.ALL_SPEECH_PARTS
        else:
            speech_parts = [self.coarse_pos]
        return [self.__gensim_format(sp) for sp in speech_parts]
class W2vController:
    """Answer similarity and vector queries against a ``W2vModel``.

    Words passed to the lemma-based methods are turned into
    ``ModelFormatGenerator`` objects, which produce the query strings
    actually looked up in the model.
    """

    def __init__(self, path):
        """Create the wrapped ``W2vModel`` from *path*."""
        self.__model = W2vModel(path)

    def __resolve_top_n(self, top_n):
        """Clamp *top_n* to the vocabulary size; ``'max'`` means "all".

        The ``'max'`` marker is tested first: comparing a str with an int
        raises ``TypeError`` on Python 3, so the numeric comparison must
        only run for numeric values.
        """
        max_top_n = self.__model.vocab_size()
        if top_n == 'max' or top_n > max_top_n:
            return max_top_n
        return top_n

    def most_similar(self, word, top_n=100):
        """Return the most similar entries for every known form of *word*.

        :param word: lemma, optionally followed by ``':'`` and a tag.
        :param top_n: number of most similar words, or ``'max'`` for the
            whole vocabulary; values above the vocabulary size are clamped.
        :return: list of ``(query_form, model.most_similar(query_form))``
            tuples, one per query form present in the model.
        """
        top_n = self.__resolve_top_n(top_n)
        leksem = self.__parse_word_to_format_generator_object(word)
        return self.__most_similar_to_leksem(leksem, top_n=top_n)

    def most_not_match(self, words):
        """Return the element of *words* that does not match the others.

        :param words: iterable of lemmas, each optionally followed by
            ``':'`` and a tag.
        :return: result of ``model.doesnt_match`` on the resolved forms.
        """
        words = self.__selectbestmodelformatlist(words)
        return self.__model.doesnt_match(words)

    def most_similar_full(self, positive, negative, top_n=100):
        """Return the most similar entries for positive/negative word lists.

        :param positive: iterable of lemmas (optionally with ``':'`` + tag).
        :param negative: iterable of lemmas (optionally with ``':'`` + tag).
        :param top_n: number of most similar words, or ``'max'`` for the
            whole vocabulary; values above the vocabulary size are clamped.
        :return: one-element list ``[("all", model_result)]``.
        """
        top_n = self.__resolve_top_n(top_n)
        positive = self.__selectbestmodelformatlist(positive)
        negative = self.__selectbestmodelformatlist(negative)
        return [
            ("all", self.__model.most_similar_full(positive,
                                                   negative,
                                                   top_n=top_n)),
        ]

    def __selectbestmodelformatlist(self, pos_neg_list):
        """Resolve every element of *pos_neg_list* to its model form."""
        return [self.__selectbestmodelformat(el) for el in pos_neg_list]

    def __selectbestmodelformat(self, word):
        """Return the first query form of *word* present in the model.

        Returns ``""`` when no form is known.
        NOTE(review): the empty string is passed on to the model by the
        callers — confirm the model handles it as intended.
        """
        leksem = self.__parse_word_to_format_generator_object(word)
        for lwsp in leksem.generate_all_possible_gensim_formats():
            if self.__model.contains(lwsp):
                return lwsp
        return ""

    def vector_representations(self, word):
        """Return every vector the model holds for *word*.

        Be aware that one word can have many vectors, e.g. 'piec' as a
        noun and as a verb.

        :param word: word to get vector representations for.
        :return: list of ``(query_form, vector_as_list)`` tuples.
        """
        leksem = self.__parse_word_to_format_generator_object(word)
        return [
            (form, self.__model.vector_representation(form).tolist())
            for form in leksem.generate_all_possible_gensim_formats()
            if self.__model.contains(form)
        ]

    def doc2vec(self, words, counts):
        """Return the count-weighted mean vector of the known *words*.

        Words are looked up in the model exactly as given; unknown words
        are skipped together with their counts.

        :param words: iterable of model keys.
        :param counts: iterable of weights, paired with *words* by position.
        :return: the weighted mean as a list, or ``[]`` when none of the
            words is present in the model.
        """
        weight_total = 0
        vector_sum = None
        for word, count in zip(words, counts):
            if not self.__model.contains(word):
                continue
            weighted = self.__model.vector_representation(word) * count
            if vector_sum is None:
                vector_sum = weighted
            else:
                vector_sum = vector_sum + weighted
            weight_total = weight_total + count
        if vector_sum is None:
            # Nothing was found: there is no vector to convert with tolist().
            return []
        if weight_total > 0:
            vector_sum = vector_sum / weight_total
        return vector_sum.tolist()

    def similarity_betweens(self, first_words, second_words):
        """Return the distance matrix between two lists of model keys.

        Words absent from the model are dropped. The distance is
        ``1 - model.similarity(a, b)``.

        :return: dict with ``"row"`` (kept first words), ``"column"`` (kept
            second words) and ``"dist"``: one list per kept second word
            holding the distances to every kept first word.
        """
        firsts = [el for el in first_words if self.__model.contains(el)]
        seconds = [el for el in second_words if self.__model.contains(el)]
        similarities = [
            [1 - self.__model.similarity(el1, el2) for el1 in firsts]
            for el2 in seconds
        ]
        return {"row": firsts, "column": seconds, "dist": similarities}

    def similarity_between(self, first_word, second_word):
        """Return similarities between all known forms of two words.

        :param first_word: lemma, optionally followed by ``':'`` and a tag.
        :param second_word: lemma, optionally followed by ``':'`` and a tag.
        :return: list of ``(first_lemma, first_pos, second_lemma,
            second_pos, similarity)`` tuples.
        """
        first_leksem = self.__parse_word_to_format_generator_object(first_word)
        second_leksem = self.__parse_word_to_format_generator_object(
            second_word)
        return self.__get_similarity_between_leksems(first_leksem,
                                                     second_leksem)

    def __get_similarity_between_leksems(self, first_leksem, second_leksem):
        """Pair every known form of both leksems and compute similarity."""
        delimiter = constants.DELIMITER_BEETWEN_LEMMA_SPEECH_PART
        # The known forms do not depend on the pairing, so filter them once.
        first_forms = [
            form
            for form in first_leksem.generate_all_possible_gensim_formats()
            if self.__model.contains(form)
        ]
        second_forms = [
            form
            for form in second_leksem.generate_all_possible_gensim_formats()
            if self.__model.contains(form)
        ]
        similarities = []
        for first_form in first_forms:
            for second_form in second_forms:
                # TODO what if first_form == second_form happen to be
                # paired, e.g. piec::verb?
                # .item() presumably unwraps a NumPy scalar into a plain
                # Python number — verify against W2vModel.similarity.
                similarity = self.__model.similarity(
                    first_form, second_form).item()
                first_lek, first_pos = first_form.split(delimiter)
                second_lek, second_pos = second_form.split(delimiter)
                similarities.append(
                    (first_lek, first_pos, second_lek, second_pos,
                     similarity))
        return similarities

    def __most_similar_to_leksem(self, leksem, top_n):
        """Return ``(form, most_similar)`` for every known form of *leksem*."""
        return [
            (lwsp, self.__model.most_similar(lwsp, top_n=top_n))
            for lwsp in leksem.generate_all_possible_gensim_formats()
            if self.__model.contains(lwsp)
        ]

    @staticmethod
    def __parse_word_to_format_generator_object(phrase):
        """Split *phrase* on ``':'`` into lemma and optional tag.

        The first field is the lemma and the second, when present, is
        passed on as the tag; further fields are ignored.
        NOTE(review): a phrase written with a double colon
        ('lemma::pos') yields an empty second field here — confirm which
        input format callers are expected to use.
        """
        splitted_phrase = phrase.split(':')
        if len(splitted_phrase) >= 2:
            return ModelFormatGenerator(splitted_phrase[0],
                                        splitted_phrase[1])
        return ModelFormatGenerator(splitted_phrase[0])