Commit c39e1fef authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski

Merge branch 'develop' into 'current'

# Conflicts:
#   keyword_assignment_tool/config/config.ini
#   setup.py
parents 1e1fe461 3fd2a505
# Unreleased
## Added
- tests for ResultContainer
- handle binary vectors included in zip archive
## Fixed
- value of a single occurrence for a category now depends on the number of
different categories related to the analyzed concept - a fairer approach
- fixed version of the category vectors
- fixed version of the concept-categories mapping - with reduced relationships
leading to common categories
## Removed
- Removed plwn mappings to dbpedia categories
# 0.6
## Added
- Parameters can now be changed between tasks - added the possibility to reload
the config from the config file
- Added the possibility to apply weights to the resulting ranking. Possible only when
......
......@@ -59,7 +59,6 @@ class GTPPRMC(WSDAlgorithmInterface):
return p
def prepare_weights_map(self, vertex_id_weight_map, graph, min_weight=0.5):
print(type(graph))
w = graph.new_vertex_property('double', val=0.0)
# w = graph.use_graph_tool().new_vertex_property('double', val=min_weight)
for node, weight in vertex_id_weight_map.iteritems():
......@@ -67,6 +66,9 @@ class GTPPRMC(WSDAlgorithmInterface):
node = graph.vertex(node)
else:
node = node.use_graph_tool()
# if weight < 0.55:
# weight = 0.0
# print("{} -> {}".format(str(node).ljust(10), weight))
w[node] = weight
return w
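A minimal, self-contained illustration of how such a graph-tool property map behaves (toy graph and weights; assumes graph-tool is installed):

```python
from graph_tool import Graph

g = Graph()
g.add_vertex(5)  # vertices 0..4

# double-valued vertex property, initialized to 0.0 for every vertex,
# mirroring graph.new_vertex_property('double', val=0.0) above
w = g.new_vertex_property('double', val=0.0)

# assign weights for a subset of vertices, as prepare_weights_map does
for v_id, weight in {0: 0.9, 3: 0.42}.items():
    w[g.vertex(v_id)] = weight

print([w[v] for v in g.vertices()])  # [0.9, 0.0, 0.0, 0.42, 0.0]
```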
......@@ -122,14 +124,14 @@ class GTPPRMC(WSDAlgorithmInterface):
# pers_prop = self.prepare_v(nodes_context, graph, options.init_pers())
# logger.debug("RANKING:\n{}".format(url_rank_map.__repr__()))
# _log.info("start pprmc algorithm...")
_log.info("start pprmc algorithm...")
ranking = pprmc(
g,
pers=pers_prop,
epsilon=options.damping_factor(),
rw_count=options.max_iter(),
weight=weights_prop)
# _log.info("done.")
_log.info("pprmc algorithm finished.")
# filtering moved to results.py
# _log.debug("Start building res map...")
......
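The implementation of `pprmc` is not part of this diff. As a rough, illustrative sketch only (not the project's code): a Monte Carlo personalized PageRank runs many random walks that restart from the personalization distribution `pers` and uses visit frequencies as rank estimates. The parameter names below mirror the call above, but the walk-termination probability and the use of `weight` as a per-node multiplier are assumptions.

```python
import random

def weighted_choice(items, weights):
    # sample one item proportionally to its weight
    r = random.random() * sum(weights)
    for item, w in zip(items, weights):
        r -= w
        if r <= 0:
            return item
    return items[-1]

def pprmc_sketch(adjacency, pers, rw_count=10000, damping=0.85, weight=None):
    """Monte Carlo personalized PageRank (illustrative only).

    adjacency - dict: node -> list of neighbours (every node must be a key)
    pers      - dict: node -> restart probability mass
    weight    - optional dict: node -> multiplier applied to the final rank
    """
    visits = dict((n, 0) for n in adjacency)
    start_nodes = list(pers.keys())
    start_weights = [pers[n] for n in start_nodes]
    for _ in range(rw_count):
        # start each walk from the personalization distribution
        node = weighted_choice(start_nodes, start_weights)
        while True:
            visits[node] += 1
            # stop the walk with probability (1 - damping) or at a sink
            if random.random() > damping or not adjacency[node]:
                break
            node = random.choice(adjacency[node])
    total = float(sum(visits.values()))
    rank = dict((n, v / total) for n, v in visits.items())
    if weight:  # a guess at how the weight map might be consumed
        rank = dict((n, r * weight.get(n, 1.0)) for n, r in rank.items())
    return rank

adj = {'a': ['b'], 'b': ['a', 'c'], 'c': ['a']}
print(pprmc_sketch(adj, pers={'a': 1.0}))
```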
......@@ -3,27 +3,33 @@
; directory or should be absolute paths
; Keep in mind that only files included in the setup.py installation file will
; be available in the 'data' directory.
; graph_file_path = graph-2018-11-13-categories-only-broader.graphml
graph_file_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/graph-2018-11-13-categories-only-broader.graphml
; graph_file_path = wibi-graph.graphml
graph_file_path = graph-2018-11-13-categories-only-broader.graphml
;graph_file_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/graph-2018-11-13-categories-only-broader.graphml
; concepts_categories_mapping = concept_category_index.txt.zip
concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index.txt.zip
; concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v4_many.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
concepts_categories_mapping = concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
; path to txt file with wikipedia2vec vectors
; categories_vectors = categories_vectors.txt.zip
categories_vectors = categories_vectors_v2.txt.zip
; categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors.txt.zip
; concepts_vectors = concepts_vectors.txt.zip
; for development purposes
; categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors_v2.txt.zip
concepts_vectors = concepts_vectors.txt.zip
; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_vectors.txt.zip
; for development purposes
; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin
;concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin.zip
; path to a file with a list of categories which will be used to filter the
; list of results
; use_only_categories = MTC_depth_2.txt
use_only_categories = MTC_depth_2-BN-descr-extended.txt
; use_only_categories = MTC_depth_3.txt
categories_weights = cat_weights_MTC2.txt
; use_only_categories = MTC_depth_2-BN-descr-extended.txt
; categories_weights = cat_weights_MTC2.txt
[results]
as_json = true
n_best_keywords = 10
; n_best_keywords = 300
; minimal value of the score from the pprmc algorithm for any keyword.
; Only keywords with a score above this value will be returned in the keyword list.
; Note: only used when 'n_best_keywords' is not given.
......
......@@ -131,15 +131,20 @@ class ConfigReader(object):
def should_use_json_output_format(self):
return self.conf.getboolean(S_RESULTS, O_JSON_FORMAT)
def _get_file_path_attr(self, section, key):
def _get_file_path_attr(self, section, key, check_if_exists=True):
# Returns the absolute path from the config, or the absolute path to the
# resource dir joined with the filename from the config
filename = self.conf.get(section, key)
if os.path.isabs(filename):
return filename
try:
return pkg_resources.resource_filename(RESOURCE_MODULE, filename)
except Exception:
raise ValueError(
"file with name '{}' doesn't exist in module '{}'.".format(
filename, RESOURCE_MODULE))
res_path = filename
else:
try:
res_path = pkg_resources.resource_filename(
RESOURCE_MODULE, filename)
except Exception:
raise ValueError(
"file with name '{}' doesn't exist in module '{}'.".format(
filename, RESOURCE_MODULE))
if check_if_exists and not os.path.isfile(res_path):
raise IOError("File not found: {}".format(res_path))
return res_path
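For context, a minimal sketch of the resolution order implemented by `_get_file_path_attr` (the resource module name here is a placeholder for `RESOURCE_MODULE`, whose value is not shown in this diff):

```python
import os
import pkg_resources

def resolve(filename, resource_module='keyword_assignment_tool.data',
            check_if_exists=True):
    # absolute paths are returned verbatim; bare filenames are looked up
    # in the package's resource directory
    if os.path.isabs(filename):
        res_path = filename
    else:
        res_path = pkg_resources.resource_filename(resource_module, filename)
    if check_if_exists and not os.path.isfile(res_path):
        raise IOError("File not found: {}".format(res_path))
    return res_path

# resolve('/tmp/graph.graphml') -> '/tmp/graph.graphml' (if it exists)
# resolve('graph.graphml')      -> <install dir>/keyword_assignment_tool/data/graph.graphml
```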
from __future__ import division
from collections import defaultdict, OrderedDict
import operator
from keyword_assignment_tool.utils import utils
import logging
_log = logging.getLogger()
class DocumentContext(object):
......@@ -67,16 +71,8 @@ class DocumentContext(object):
self._sorted_node_counters_map = OrderedDict(
self._sorted_node_counters)
cat_cum_counter_map = defaultdict(int)
for concept in concepts:
related_cats = self.concept_cats_map[concept]
concept_occurences = self._concept_counters_map[concept]
for rel_cat in related_cats:
cat_cum_counter_map[rel_cat] += concept_occurences
self._sorted_cat_cum_counter = sorted(
cat_cum_counter_map.items(), key=operator.itemgetter(1))
self._sorted_cat_cum_counter_map = OrderedDict(
self._sorted_cat_cum_counter)
self._sorted_cat_cum_counter, self._sorted_cat_cum_counter_map = \
self._calculate_cat_cum_counter(concepts)
@property
def concept_cats_map(self):
......@@ -87,7 +83,8 @@ class DocumentContext(object):
return self._concept_nodes_map
@property
def concept_counters_map(self):
def concept_counters_map(
self): # FIXME change name - it's not a map (dict)
return self._sorted_concept_counters
@property
......@@ -147,13 +144,15 @@ class DocumentContext(object):
# alternative way
self._node_cum_counter_map = {}
counted_cats = set(self._sorted_cat_cum_counter_map.keys())
# import pudb
# pudb.set_trace()
for v_id in g_v_ids:
try:
self._node_cum_counter_map[
v_id] = self._sorted_cat_cum_counter_map[
self._vertex_id_lod_url_map[v_id]]
except:
pass
vertex_lod_url = self._vertex_id_lod_url_map[v_id]
if vertex_lod_url in counted_cats:
self._node_cum_counter_map[v_id] = \
self._sorted_cat_cum_counter_map[vertex_lod_url]
# print("warn: ", e)
else:
# g_vertices = g.vertices()
# self._node_cum_counter_map = {v: self._sorted_cat_cum_counter_map\
......@@ -175,27 +174,105 @@ class DocumentContext(object):
@property
def document_vector(self):
if not self._doc_vec and self._cats_keyed_vectors:
self._doc_vec = self._build_dummy_doc_vec()
# self._doc_vec = self._build_dummy_doc_vec()
self._doc_vec = self._build_mean_doc_concepts_vec()
# self._doc_vec = self._build_dummy_doc_vec_single_concept(
# 'http://dbpedia.org/resource/Anthropology')
return self._doc_vec
def _calculate_cat_cum_counter(self, concepts):
'''
Note: the value of a single occurrence for a category depends on the number
of different categories related to the analyzed concept.
This is because a single occurrence of a concept should have an equal
influence on the graph for different concepts (concept relevance is taken
into account elsewhere).
'''
cat_cum_counter_map = defaultdict(int)
for concept in concepts:
related_cats = self.concept_cats_map[concept]
cat_for_concept_weight = 1. / float(len(related_cats))
concept_occurences = self._concept_counters_map[concept]
for rel_cat in related_cats:
cat_cum_counter_map[rel_cat] += (
concept_occurences * cat_for_concept_weight)
sorted_cat_cum_counter = sorted(
cat_cum_counter_map.items(), key=operator.itemgetter(1))
return sorted_cat_cum_counter, OrderedDict(sorted_cat_cum_counter)
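A worked example of the note above, with made-up concepts and counts:

```python
from collections import defaultdict

# one broad concept, one narrow concept, 3 occurrences each
concept_cats = {'Concept_A': ['Cat_1', 'Cat_2', 'Cat_3', 'Cat_4'],
                'Concept_B': ['Cat_1']}
occurrences = {'Concept_A': 3, 'Concept_B': 3}

cat_cum_counter = defaultdict(float)
for concept, cats in concept_cats.items():
    weight = 1.0 / len(cats)  # 0.25 for Concept_A, 1.0 for Concept_B
    for cat in cats:
        cat_cum_counter[cat] += occurrences[concept] * weight

# Cat_1: 3*0.25 + 3*1.0 = 3.75; Cat_2..Cat_4: 0.75 each.
# Each occurrence spreads exactly 1.0 of mass, however many categories
# the concept maps to.
print(dict(cat_cum_counter))
```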
def _build_dummy_doc_vec(
self): # FIXME takes vector for random wikipedia (dbpedia) concept
concepts = self._concepts_reader.get_concepts_occurences().keys()
urls_with_vectors = self._cats_keyed_vectors.vocab.keys()
for c in concepts:
if c in urls_with_vectors:
return self._cats_keyed_vectors.get_vector(c)
concepts = set(self._concepts_reader.get_concepts_occurences().keys())
urls_with_vectors = set(self._concepts_keyed_vectors.vocab.keys())
common = concepts.intersection(urls_with_vectors)
if not common:
raise ValueError("Cannot make dummy vector for document.")
c = next(iter(common))
return self._concepts_keyed_vectors.get_vector(c)
# for c in concepts:
# if c in urls_with_vectors:
# return self._cats_keyed_vectors.get_vector(c)
def _build_dummy_doc_vec_single_concept(
self, concept
): # FIXME takes vector for random wikipedia (dbpedia) concept
urls_with_vectors = set(self._concepts_keyed_vectors.vocab.keys())
if concept in urls_with_vectors:
return self._concepts_keyed_vectors.get_vector(concept)
else:
raise ValueError("Cannot make dummy vector for document.")
def _build_mean_doc_concepts_vec(self):
def _build_mean_doc_concepts_vec(self,
use_weights=True,
min_perc_occurences=0.1,
min_occurences=2):
'''Takes all vectors for dbpedia concepts present in the document and
calculates the mean vector. '''
concepts_occurences_map = None
if use_weights:
concepts_occurences_map = self._concept_counters_map
fixed_perc_occurs = int(
max(concepts_occurences_map.values()) * min_perc_occurences)
min_threshold = max(fixed_perc_occurs, min_occurences)
_log.info("Min threshold: {}".format(min_threshold))
# removing rare concepts
concepts_occurences_map = {k: v \
for k, v in concepts_occurences_map.iteritems()\
if v >= min_threshold}
if self._concepts_keyed_vectors:
concepts = self._concepts_reader.get_concepts_occurences().keys()
urls_with_vectors = self._concepts_keyed_vectors.vocab.keys()
concepts_vectors = [self._cats_keyed_vectors.get_vector(c) for c in concepts \
if c in urls_with_vectors]
return utils.calculate_mean_vector(concepts_vectors)
concepts_from_doc = set(
self._concepts_reader.get_concepts_occurences().keys())
urls_with_vectors = set(self._concepts_keyed_vectors.vocab.keys())
vectorized_concepts = urls_with_vectors.intersection(
concepts_from_doc)
weights = None
concepts_urls = None
if use_weights and concepts_occurences_map:
weighted_concepts = set(concepts_occurences_map.keys())
concepts_vectors_map = {sel_c: self._concepts_keyed_vectors.get_vector(sel_c) \
for sel_c in vectorized_concepts if sel_c in weighted_concepts}
concepts_urls = concepts_vectors_map.keys()
weights = [concepts_occurences_map[c] for c in concepts_urls]
# _log.info("Weights for concepts for doc vector: {}".format(
# concepts_occurences_map))
else:
concepts_vectors_map = {sel_c: self._concepts_keyed_vectors.get_vector(sel_c) \
for sel_c in vectorized_concepts}
concepts_vectors = concepts_vectors_map.values()
if not concepts_urls:
concepts_urls = concepts_vectors_map.keys()
# concepts_vectors = [self._cats_keyed_vectors.get_vector(c) for c in concepts \
# if c in urls_with_vectors]
_log.debug(
"Building mean vector for document from {} vectors ...".format(
len(concepts_vectors)))
_log.debug(
"Selected concepts from document to calculate mean vector: \n{}"
.format('\n'.join(concepts_urls)))
return utils.calculate_mean_vector(
concepts_vectors, weights=weights)
else:
raise ValueError(
'''Cannot calculate mean vector for concepts in document -
......
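To make the rare-concept threshold above concrete, a small sketch with made-up counts:

```python
# made-up occurrence counts for concepts in one document
counts = {'c1': 30, 'c2': 5, 'c3': 1, 'c4': 2}

min_perc_occurences = 0.1
min_occurences = 2

# 10% of the most frequent concept's count, floored to an int
fixed_perc = int(max(counts.values()) * min_perc_occurences)  # int(3.0) == 3
min_threshold = max(fixed_perc, min_occurences)               # 3

kept = {k: v for k, v in counts.items() if v >= min_threshold}
print(kept)  # {'c1': 30, 'c2': 5} - rare concepts c3, c4 are dropped
```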
......@@ -389,6 +389,21 @@ def run(
if cats_keyed_vectors:
resources['vertex_id_weight_map'] = prepare_graph_vertex_weight_map(
doc_context, lod_url_vertex_id_map, cats_keyed_vectors)
# DEBUG PRINTS
if debug_output_writer:
debug_output_writer.write(
"APPLIED WEIGHTS FOR CATEGORIES NODES:\n")
sorted_v_id_weight = sorted(
resources['vertex_id_weight_map'].items(),
key=operator.itemgetter(1))
for v_id, w in sorted_v_id_weight:
cat = vertex_id_lod_url_map.get(v_id, '-')
if isinstance(cat, unicode):
cat = cat.encode("utf8")
cat = cat.replace('http://dbpedia.org/resource/Category:',
'') # just for shorter print
debug_output_writer.write("{}\t{}\n".format(cat.ljust(80), w))
# doc_context, vertex_id_vector_map, cats_keyed_vectors)
use_only_cats_list = None
......@@ -401,7 +416,7 @@ def run(
# starting the pprmc algorithm
_log.debug("Initialize the algorithm")
category_rank_map = algorithm.run(context, graph, options, resources)
category_rank_prop_map = algorithm.run(context, graph, options, resources)
if debug_output_writer:
debug_output_writer.flush()
......@@ -416,12 +431,21 @@ def run(
lod_urls = set(vertex_id_lod_url_map.values())
# _log.info("Done")
# debug checking
# _log.info("Debug check ...")
# g = graph.use_graph_tool()
# for v in g.vertices():
# if not isinstance(v, int):
# v = g.vertex(v)
# assert g.vp.lod_url[v] == vertex_id_lod_url_map[v], "%s != %s" \
# % (g.vp.lod_url[v], vertex_id_lod_url_map[v])
# save results
_log.debug("Process results")
if output_ranking or output_debug: # should provide details
# for output rank
res = ResultContainer(
category_rank_map,
category_rank_prop_map,
initial_categories=doc_context.initial_categories,
as_json=store_as_json,
n_best_kw=n_best_keywords,
......@@ -433,7 +457,7 @@ def run(
use_only_cats=use_only_cats_list)
else:
res = ResultContainer(
category_rank_map,
category_rank_prop_map,
as_json=store_as_json,
n_best_kw=n_best_keywords,
min_score_threshold=min_score_threshold,
......
......@@ -222,6 +222,7 @@ class KwazonPlugin(object):
n_best_keywords=self._kwazon_cfg.n_best_kw,
min_score_threshold=self._kwazon_cfg.min_score_threshold,
cats_keyed_vectors=self._cats_keyed_vectors,
concepts_keyed_vectors=self._concepts_keyed_vectors,
vertex_id_vector_map=self._v_id_vector_map,
use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
store_as_json=self._kwazon_cfg.as_json,
......@@ -268,6 +269,7 @@ class KwazonPlugin(object):
n_best_keywords=self._kwazon_cfg.n_best_kw,
min_score_threshold=self._kwazon_cfg.min_score_threshold,
cats_keyed_vectors=self._cats_keyed_vectors,
concepts_keyed_vectors=self._concepts_keyed_vectors,
vertex_id_vector_map=self._v_id_vector_map,
use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
store_as_json=self._kwazon_cfg.as_json,
......@@ -301,6 +303,13 @@ class KwazonPlugin(object):
categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls)
def _write_v_id_lod_url_mapping(self, output):
with open(output, 'w') as ofile:
for k, v in self._v_id_lod_url_map.iteritems():
if isinstance(v, unicode):
v = v.encode("utf8")
ofile.write("{}\t{}\n".format(k, v))
def get_config_file_path():
return run.ConfigReader().get_config_file_path()
......
......@@ -2,8 +2,8 @@ from collections import defaultdict
import logging
import codecs
# from gensim.models import KeyedVectors
# from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from wosedon.basegraph import BaseGraph
from corpus_ccl import cclutils as ccl
......@@ -196,19 +196,38 @@ def load_vectors(vectors_file_path):
to read.
@return gensim.Word2VecKeyedVectors instance
'''
raise ValueError("Temporary disabled function")
# def _load_vectors(vectors_file_path):
# if vectors_file_path:
# if not vectors_file_path.endswith('.txt'):
# raise ValueError(
# "Currently only vectors in txt representation are handled."
# )
# return KeyedVectors.load_word2vec_format(vectors_file_path)
#
# if utils.is_zip_archive(vectors_file_path):
# return utils.run_on_zipped_file(_load_vectors)
# raise ValueError("Temporary disabled function")
def _load_vectors(vecs_file_path):
if vecs_file_path:
if isinstance(vecs_file_path, basestring):
if vecs_file_path.endswith('.txt'):
# raise ValueError(
# "Currently only vectors in txt representation are handled."
# )
return KeyedVectors.load_word2vec_format(vecs_file_path)
elif vecs_file_path.endswith('.bin'):
return KeyedVectors.load(vecs_file_path, mmap='r')
else:
raise ValueError(
"Don't know how to read given file: {}".format(
vecs_file_path))
# else: # bin vectors
# return KeyedVectors.load(vecs_file_path, mmap='r')
else: # file-like object (e.g. extracted from a zip archive); assume txt vectors
return KeyedVectors.load_word2vec_format(vecs_file_path)
if utils.is_zip_archive(vectors_file_path):
_log.info("Loading vectors from zipped archive: {}".format(
vectors_file_path))
return utils.run_on_zipped_file(
# vectors_file_path, _load_vectors, with_extension='bin')
vectors_file_path,
_load_vectors,
with_extension='txt')
else:
_log.info("Loading vectors from file: {}".format(vectors_file_path))
return _load_vectors(vectors_file_path)
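For reference, the two gensim loading paths used above behave roughly as below (file paths are hypothetical; note that `.bin` here denotes a gensim-native `KeyedVectors.save()` file, as in the code above, not word2vec's binary format):

```python
from gensim.models import KeyedVectors

# plain-text word2vec format ('.txt'): parsed line by line
kv_txt = KeyedVectors.load_word2vec_format('concepts_vectors.txt')

# gensim-native save ('.bin' here): loaded with optional memory-mapping,
# which avoids reading the whole matrix into RAM at once
kv_bin = KeyedVectors.load('concepts_keyed_vectors.bin', mmap='r')

vec = kv_txt.get_vector('http://dbpedia.org/resource/Anthropology')
print(vec.shape)
```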
def prepare_graph_vertex_vec_map(graph, keyed_vectors):
......@@ -228,7 +247,7 @@ def prepare_graph_vertex_vec_map(graph, keyed_vectors):
def prepare_graph_vertex_weight_map(doc_context, lod_url_vertex_id_map,
cats_keyed_vectors):
'''Generates numpy matrix with weights for every node in
'''Generates dict with weights for every node in
graph.
Currently, these weights are based on the similarity between vectors
......@@ -239,23 +258,24 @@ def prepare_graph_vertex_weight_map(doc_context, lod_url_vertex_id_map,
Note: if a vector for a certain vertex is not found, then a vector of zeros is returned.
@return numpy vector with length equal to number of nodes in graph.
#@return numpy vector with length equal to number of nodes in graph.
@return dict with vertex id as key and weight as value
'''
raise ValueError("Temporary disabled function")
# doc_vec = doc_context.document_vector # TODO
# doc_vec = doc_vec.reshape(1, -1)
# sim_mat = cosine_similarity(doc_vec, cats_keyed_vectors.vectors)
# sim_mat = sim_mat[0]
# lod_url_from_graph = lod_url_vertex_id_map.keys()
# raise ValueError("Temporary disabled function")
doc_vec = doc_context.document_vector # TODO
doc_vec = doc_vec.reshape(1, -1)
sim_mat = cosine_similarity(doc_vec, cats_keyed_vectors.vectors)
sim_mat = sim_mat[0]
lod_url_from_graph = set(lod_url_vertex_id_map.keys())
# vertex_ids = lod_url_vertex_id_map.values()
# lod_url_form_vecs = cats_keyed_vectors.index2word
# vertex_weight_map = {}
# for idx, url in enumerate(lod_url_form_vecs):
# if url in lod_url_from_graph:
# v = lod_url_vertex_id_map[url]
# lod_url_sim = sim_mat[idx]
# vertex_weight_map[v] = lod_url_sim
# return vertex_weight_map
lod_url_from_vecs = cats_keyed_vectors.index2word
vertex_weight_map = {}
for idx, url in enumerate(lod_url_from_vecs):
if url in lod_url_from_graph:
v = lod_url_vertex_id_map[url]
lod_url_sim = sim_mat[idx]
vertex_weight_map[v] = lod_url_sim
return vertex_weight_map
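A condensed sketch of the similarity computation above, with made-up vectors (assumes numpy and scikit-learn):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

doc_vec = np.array([0.2, 0.9, 0.1]).reshape(1, -1)   # (1, dim)
cat_vectors = np.array([[0.2, 0.8, 0.0],              # (n_cats, dim)
                        [0.9, 0.1, 0.3]])

sim = cosine_similarity(doc_vec, cat_vectors)[0]      # shape (n_cats,)
print(sim)  # one similarity per category vector, in [-1, 1]
```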
def store_dict(d, path):
......
......@@ -22,7 +22,8 @@ class ResultContainer(object):
connection with concepts from document will be highlighted/marked.
Applicable only if saving ranking to file.
@param category_rank_map - dict with string as key and float as value
@param category_rank_map - graph_tool property map or numpy array
with node id and rank value
@param as_json - flag determining format of output file; if enabled
then JSON will be stored; otherwise plain text will
be stored
......@@ -42,8 +43,10 @@ class ResultContainer(object):
value greater than 0 will be included in output
ranking
'''
if category_rank_map is None:
raise ValueError("Results not present!")
if not isinstance(category_rank_map, np.ndarray):
category_rank_map = category_rank_map.a
self._unfiltered_category_rank_list = category_rank_map
self._initial_categories = set(initial_categories) \
if initial_categories else None
......@@ -170,7 +173,8 @@ class ResultContainer(object):
- sorts
- crops the ranking if n_best is given
- crops the ranking if min_score_threshold is given
@param ranking (np.ndarray) with shape(x, 2)
@param weights_map (dict: string -> float) - weights for categories.
Must be in [0, 1] range.
'''
......@@ -178,8 +182,8 @@ class ResultContainer(object):
# format ranking object
# ranking = self._category_rank_list
v_id_lod_url_map = self._vertex_id_lod_url_map
np_rank = ranking.a
idxes = np.arange(np_rank.shape[0])
np_rank = ranking
idxes = np.arange(np_rank.shape[0]) # FIXME don't need it
indexed_rank = np.column_stack((idxes, np_rank))
# remove entries with zero score
......
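For orientation, a toy reproduction of the conversion above, assuming `ranking` arrives as a numpy array (graph-tool property maps expose one via their `.a` attribute); the zero-score filtering and descending sort follow the comment and docstring, with made-up scores:

```python
import numpy as np

# stand-in for ranking.a / the numpy array handed to ResultContainer
np_rank = np.array([0.00, 0.37, 0.05, 0.21])

idxes = np.arange(np_rank.shape[0])               # vertex ids 0..n-1
indexed_rank = np.column_stack((idxes, np_rank))  # shape (n, 2)

# drop zero-score entries and sort by score, descending
nonzero = indexed_rank[indexed_rank[:, 1] > 0]
print(nonzero[nonzero[:, 1].argsort()[::-1]])
```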
......@@ -28,9 +28,16 @@ def as_unicode(txt, encoding="utf8"):
return txt
def calculate_mean_vector(vectors):
def calculate_mean_vector(vectors, weights=None):
'''
@param vectors (list) - vectors to average
@param weights (list) - optional weights, one per vector; must match the length of 'vectors'
'''
import numpy as np
return np.mean(vectors, axis=0)
if weights:
assert len(weights) == len(vectors)
return np.average(vectors, axis=0, weights=weights)
return np.average(vectors, axis=0)
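A quick usage check of the weighted averaging above, with toy vectors:

```python
import numpy as np

vectors = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]

print(np.average(vectors, axis=0))                  # [0.5  0.5 ]
print(np.average(vectors, axis=0, weights=[3, 1]))  # [0.75 0.25]
```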
def normalize_dict_values(key_values_dict):
......@@ -42,17 +49,34 @@ def is_zip_archive(file_path):
return zipfile.is_zipfile(file_path)
def run_on_zipped_file(zipped_archive, action, check=False):
def run_on_zipped_file(zipped_archive,
action,
check=False,
with_extension=None):
'''Extracts the first file from the archive and calls the given action on it.
Returns the result of this action.
If 'check' is enabled, an additional check is performed before opening
the archive.
@param with_extension - run given action on first file