Commit c6c2bb18 authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski

Add mechanism to include descriptor category in results, tests, minor changes

parent 9e680e58
# Unreleased
## Added
- list of URLs mapped to BN (Biblioteka Narodowa) descriptors
- handling of special categories (BN descriptors), e.g. including at least n
  best descriptors in the resulting ranking and marking them with the '$'
  character in the ranking (see the example below this list)
- tests for np_rank_utils
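
A hypothetical ranking fragment illustrating the new markers (the URLs come from
the bundled BN category list, but the scores, their order and the category count
are invented; the layout follows ResultContainer._store_ranking, which separates
URL and score with a tab):

    RESULTED RANKING (for 57 categories):
    $ http://dbpedia.org/resource/Category:Library_science	0.0021
    $ http://dbpedia.org/resource/Category:Archival_science	0.0004
    * http://dbpedia.org/resource/Category:History	0.0017
    http://dbpedia.org/resource/Category:Museums	0.0009
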
## Removed
- weighting with tf-idf
# 0.7
## Added
- tests for ResultContainer
- handle binary vectors included in zip archive
## Fixed
## Fixed/changed
- value for a single occurrence of a category depends on the number of different
  categories related to the analyzed concept - it's a fairer approach
- fixed version of vectors for categories
......
......@@ -3,21 +3,22 @@
; directory or should be absolute paths
; Keep in mind that only files included in the installation setup.py file will be
; available in 'data' directory.
graph_file_path = graph-2018-11-13-categories-only-broader.graphml
;graph_file_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/graph-2018-11-13-categories-only-broader.graphml
;graph_file_path = graph-2018-11-13-categories-only-broader.graphml
graph_file_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/graph-2018-11-13-categories-only-broader.graphml
; concepts_categories_mapping = concept_category_index.txt.zip
; concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v4_many.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
concepts_categories_mapping = concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
;concepts_categories_mapping = concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
; path to txt file with wikipedia2vec vectors
categories_vectors = categories_vectors_v2.txt.zip
; categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors.txt.zip
;categories_vectors = categories_vectors_v2.txt.zip
; for development purposes
; categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors_v2.txt.zip
concepts_vectors = concepts_vectors.txt.zip
; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_vectors.txt.zip
;categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors_v2.txt.zip
categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/cats_keyed_vectors_v2.bin
;concepts_vectors = concepts_vectors.txt.zip
;concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_vectors.txt.zip
concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin
; for development purposes
; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin
;concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin.zip
......@@ -34,6 +35,18 @@ n_best_keywords = 10
; Only keywords with a score above the given value will be returned in the keyword list
; Note: Only used when 'n_best_keywords' is not given.
score_min_threshold = 0.0003
; Special categories (URLs) which should be included in / excluded from the
; resulting ranking. The way they are used is defined in 'spec_cats_strategy'.
; In the current version these special categories (URLs) represent BN descriptors.
special_cats_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/BN_descriptors_linked_dbpedia_en_cats.txt
; Defines how to use the 'special_cats_path' categories when constructing the
; resulting ranking:
; - use the original ranking ('all'),
; - take only these categories ('only'),
; - exclude them from the ranking ('exclude'),
; - take the original ranking and additionally the first n of the given
;   categories (int: natural number) - REGARDLESS of the score value.
spec_cats_strategy = 1
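; Illustrative examples only - possible values inferred from the description above:
;spec_cats_strategy = all       ; keep the original ranking as is
;spec_cats_strategy = only      ; return only the special categories
;spec_cats_strategy = exclude   ; drop the special categories from the ranking
;spec_cats_strategy = 3         ; additionally include the 3 best special categories regardless of score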
[algorithm]
iterations = 100
......
......@@ -18,8 +18,6 @@ S_ALGORITHM = 'algorithm'
O_ITERATIONS = 'iterations'
O_DAMPING_FACTOR = 'damping_factor'
O_INIT_PERS = 'init_personalisation'
O_N_BEST_KW = 'n_best_keywords'
O_MIN_SCORE = 'score_min_threshold'
S_DATABASE = 'database'
O_ENDPOINT = 'endpoint'
......@@ -28,6 +26,10 @@ O_PASSWORD = 'password'
S_RESULTS = 'results'
O_JSON_FORMAT = 'as_json'
O_N_BEST_KW = 'n_best_keywords'
O_MIN_SCORE = 'score_min_threshold'
O_SPEC_CATS_PATH = 'special_cats_path'
O_SPEC_CATS_STRATEGY = 'spec_cats_strategy'
CONFIG_MODULE = 'keyword_assignment_tool.config'
RESOURCE_MODULE = 'keyword_assignment_tool.data'
......@@ -116,6 +118,14 @@ class ConfigReader(object):
if self.conf.has_option(S_RESULTS, O_MIN_SCORE):
return self.conf.getfloat(S_RESULTS, O_MIN_SCORE)
def get_spec_cats_path(self):
if self.conf.has_option(S_RESULTS, O_SPEC_CATS_PATH):
return self._get_file_path_attr(S_RESULTS, O_SPEC_CATS_PATH)
def get_spec_cats_strategy(self):
if self.conf.has_option(S_RESULTS, O_SPEC_CATS_STRATEGY):
return self.conf.get(S_RESULTS, O_SPEC_CATS_STRATEGY)
def get_init_pers(self):
return self.conf.getfloat(S_ALGORITHM, O_INIT_PERS)
......
http://dbpedia.org/resource/Category:Archaeology
http://dbpedia.org/resource/Category:Architecture
http://dbpedia.org/resource/Category:Building_engineering
http://dbpedia.org/resource/Category:Military
http://dbpedia.org/resource/Category:Safety
http://dbpedia.org/resource/Category:Archival_science
http://dbpedia.org/resource/Category:Library_science
http://dbpedia.org/resource/Category:Museums
http://dbpedia.org/resource/Category:Biology
http://dbpedia.org/resource/Category:Chemistry
http://dbpedia.org/resource/Category:Pedagogy
http://dbpedia.org/resource/Category:Education
http://dbpedia.org/resource/Category:Cultural_anthropology
http://dbpedia.org/resource/Category:Ethics
http://dbpedia.org/resource/Category:Philosophy
http://dbpedia.org/resource/Category:Physics
http://dbpedia.org/resource/Category:Astronomy
http://dbpedia.org/resource/Category:Geography
http://dbpedia.org/resource/Category:Earth_sciences
http://dbpedia.org/resource/Category:Finance
http://dbpedia.org/resource/Category:Economy
http://dbpedia.org/resource/Category:Economics
http://dbpedia.org/resource/Category:History
http://dbpedia.org/resource/Category:Computer_science
http://dbpedia.org/resource/Category:Information_technology
http://dbpedia.org/resource/Category:Technology
http://dbpedia.org/resource/Category:Engineering_disciplines
http://dbpedia.org/resource/Category:Linguistics
http://dbpedia.org/resource/Category:Sports
http://dbpedia.org/resource/Category:Culture
http://dbpedia.org/resource/Category:Arts
http://dbpedia.org/resource/Category:Literary_criticism
http://dbpedia.org/resource/Category:Mathematics
http://dbpedia.org/resource/Category:Mass_media
http://dbpedia.org/resource/Category:Communication
http://dbpedia.org/resource/Category:Health
http://dbpedia.org/resource/Category:Medicine
http://dbpedia.org/resource/Category:Science
http://dbpedia.org/resource/Category:Methodology
http://dbpedia.org/resource/Category:Environmental_protection
http://dbpedia.org/resource/Category:Veterinary_medicine
http://dbpedia.org/resource/Category:Politics
http://dbpedia.org/resource/Category:Political_science
http://dbpedia.org/resource/Category:Public_administration
http://dbpedia.org/resource/Category:Law
http://dbpedia.org/resource/Category:Psychology
http://dbpedia.org/resource/Category:Religion
http://dbpedia.org/resource/Category:Spirituality
http://dbpedia.org/resource/Category:Forestry
http://dbpedia.org/resource/Category:Agriculture
http://dbpedia.org/resource/Category:Society
http://dbpedia.org/resource/Category:Sociology
http://dbpedia.org/resource/Category:Transport
http://dbpedia.org/resource/Category:Logistics
http://dbpedia.org/resource/Category:Management
http://dbpedia.org/resource/Category:Marketing
......@@ -22,6 +22,7 @@ from loaders.concepts_urls_reader import ConceptsUrlsReader
from context.doc_context import DocumentContext
from algorithm.gt_pprmc import GTPPRMC
from results.results import ResultContainer
from results.special_cats import SpecialCategories
from utils import utils
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
......@@ -98,9 +99,9 @@ def _parse_args():
help=
'''Path to file containing vectors for dbpedia categories (urls).''')
parser.add_argument(
'--use-only-categories',
'--special-categories',
action='store',
dest='use_only_cats_path',
dest='spec_cats_path',
help=
'''Path to a file containing a list of category URLs. If given, then this
list will be used to filter the ranking - only these keywords will be given
......@@ -108,6 +109,11 @@ def _parse_args():
this list. It's used to restrict the results to return only those keywords
which represent appropriate categories (main topic classification).'''
)
parser.add_argument(
'--spec-cat-strategy',
action='store',
dest='spec_cats_strategy',
help='''Explained in the config file.''')
parser.add_argument(
'--categories-weights',
action='store',
......@@ -115,7 +121,7 @@ def _parse_args():
help=
'''Path to file containing categories and corresponding weights which
will be applied to the resulting ranking. Currently, used only if
'use_only_cats_path' is given.''')
'spec_cats_path' is given.''')
parser.add_argument(
'-r',
'--output-ranking',
......@@ -213,8 +219,10 @@ def _process_configuration(args=None, force_reload=False):
args.cats_vec_path = config.get_cats_vec_path()
if not args.concepts_vec_path or force_reload:
args.concepts_vec_path = config.get_concepts_vec_path()
if not args.use_only_cats_path or force_reload:
args.use_only_cats_path = config.get_use_only_cats_path()
if not args.spec_cats_path or force_reload:
args.spec_cats_path = config.get_spec_cats_path()
if not args.spec_cats_strategy or force_reload:
args.spec_cats_strategy = config.get_spec_cats_strategy()
if not args.cat_weights_path or force_reload:
args.cat_weights_path = config.get_cat_weights_path()
......@@ -252,7 +260,8 @@ def run(
cats_keyed_vectors=None,
concepts_keyed_vectors=None,
vertex_id_vector_map=None,
use_only_cats_path=None,
bn_descr_cats_path=None,
spec_cats_strategy=None,  # applicable only if bn_descr_cats_path is given
categories_weights=None,
as_urls=False):
'''!
......@@ -288,12 +297,22 @@ def run(
@param vertex_id_lod_url_map - dict where key is index of vertex in loaded
Graph instance and value is string with url.
@param cats_keyed_vectors - gensim.Word2VecKeyedVectors instance
@param bn_descr_cats_path - path to categories (URLs) representing BN
descriptors
@param spec_cats_strategy - defines how to use bn_descr_cats_path categories
when constructing the resulting ranking:
- use the original ranking ('all'),
- take only these categories ('only'),
- exclude them from the ranking ('exclude'),
- take the original ranking and additionally the first n of
the given categories (int: natural number) - REGARDLESS
of the score value.
@param as_urls - if true, then the doc is treated as a list of URLs (representing
concepts)
'''
params = [
'iterations', 'init_pers', 'damping_factor', 'n_best_keywords',
'min_score_threshold', 'store_as_json', 'use_only_cats_path', 'as_urls'
'min_score_threshold', 'store_as_json', 'bn_descr_cats_path', 'as_urls'
]
_, arg_val = utils.get_passed_params(inspect.currentframe(), params)
_log.info("Starting kwazon task with parameters: %s", arg_val)
......@@ -406,11 +425,6 @@ def run(
# doc_context, vertex_id_vector_map, cats_keyed_vectors)
use_only_cats_list = None
if use_only_cats_path:
# resources['use_only_cats'] = load_file_lines(use_only_cats_path)
use_only_cats_list = load_file_lines(use_only_cats_path)
# context = concepts_nodes_map
context = doc_context # FIXME
......@@ -423,6 +437,7 @@ def run(
debug_output_writer.close()
# _log.info("Build map")
# TODO move below code into static loading section
lod_url_vertex_id_map = {
v: k
......@@ -431,6 +446,14 @@ def run(
lod_urls = set(vertex_id_lod_url_map.values())
# _log.info("Done")
spec_cats = None
if bn_descr_cats_path:
# resources['use_only_cats'] = load_file_lines(use_only_cats_path)
spec_cats = SpecialCategories(
load_file_lines(bn_descr_cats_path),
lod_url_vertex_id_map,
sel_criteria=spec_cats_strategy)
# debug checking
# _log.info("Debug check ...")
# g = graph.use_graph_tool()
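The new results/special_cats.py module added by this commit is not part of this
diff; the snippet below is only a rough sketch of the interface SpecialCategories
would need, inferred from how it is constructed and called in this file. The
internal logic (here only the integer strategy, i.e. promoting the first n special
categories regardless of score) and all internal names are assumptions, not the
real implementation.

import numpy as np

class SpecialCategories(object):
    '''Sketch inferred from usage in this file; NOT the actual implementation.'''

    def __init__(self, spec_cat_urls, lod_url_vertex_id_map, sel_criteria=None):
        # vertex ids of the special (BN descriptor) categories known to the graph
        self._spec_v_ids = set(lod_url_vertex_id_map[u]
                               for u in spec_cat_urls
                               if u in lod_url_vertex_id_map)
        self._sel_criteria = sel_criteria  # e.g. 'all', 'only', 'exclude' or '3'

    def filter_ranking(self, indexed_rank):
        '''indexed_rank: numpy array of [vertex_id, score] rows.
        Returns (rearranged ranking, vertex ids of the promoted special categories).'''
        is_spec = np.in1d(indexed_rank[:, 0], list(self._spec_v_ids))
        spec_rows = indexed_rank[is_spec]
        other_rows = indexed_rank[~is_spec]
        # best-first ordering within each part
        spec_rows = spec_rows[spec_rows[:, 1].argsort()][::-1]
        other_rows = other_rows[other_rows[:, 1].argsort()][::-1]
        n = int(self._sel_criteria)  # only the integer strategy is sketched here
        top_spec = spec_rows[:n]
        rest = np.concatenate((spec_rows[n:], other_rows))
        rest = rest[rest[:, 1].argsort()][::-1]
        return np.concatenate((top_spec, rest)), [int(i) for i in top_spec[:, 0]]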
......@@ -454,7 +477,7 @@ def run(
lod_url_vertex_id_map=lod_url_vertex_id_map,
lod_urls=lod_urls,
categories_weights=categories_weights,
use_only_cats=use_only_cats_list)
spec_cats=spec_cats)
else:
res = ResultContainer(
category_rank_prop_map,
......@@ -465,7 +488,7 @@ def run(
lod_url_vertex_id_map=lod_url_vertex_id_map,
lod_urls=lod_urls,
categories_weights=categories_weights,
use_only_cats=use_only_cats_list)
spec_cats=spec_cats)
_log.debug("Store results")
res.store_results(
output, output_ranking=output_ranking, output_debug=output_debug)
......
......@@ -224,9 +224,11 @@ class KwazonPlugin(object):
cats_keyed_vectors=self._cats_keyed_vectors,
concepts_keyed_vectors=self._concepts_keyed_vectors,
vertex_id_vector_map=self._v_id_vector_map,
use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
# use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
bn_descr_cats_path=self._kwazon_cfg.spec_cats_path,
spec_cats_strategy=self._kwazon_cfg.spec_cats_strategy,
store_as_json=self._kwazon_cfg.as_json,
categories_weights=self._cat_weights_map,
# categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls)
def run_kwazon_prepared_debug(self,
......@@ -271,9 +273,11 @@ class KwazonPlugin(object):
cats_keyed_vectors=self._cats_keyed_vectors,
concepts_keyed_vectors=self._concepts_keyed_vectors,
vertex_id_vector_map=self._v_id_vector_map,
use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
# use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
bn_descr_cats_path=self._kwazon_cfg.spec_cats_path,
spec_cats_strategy=self._kwazon_cfg.spec_cats_strategy,
store_as_json=self._kwazon_cfg.as_json,
categories_weights=self._cat_weights_map,
# categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls)
def run_kwazon(self, in_cclfilepath, out_cclfilepath, reload_config=True):
......@@ -300,7 +304,9 @@ class KwazonPlugin(object):
n_best_keywords=self._kwazon_cfg.n_best_kw,
min_score_threshold=self._kwazon_cfg.min_score_threshold,
store_as_json=self._kwazon_cfg.as_json,
categories_weights=self._cat_weights_map,
bn_descr_cats_path=self._kwazon_cfg.spec_cats_path,
spec_cats_strategy=self._kwazon_cfg.spec_cats_strategy,
# categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls)
def _write_v_id_lod_url_mapping(self, output):
......
......@@ -13,7 +13,7 @@ class ResultContainer(object):
initial_categories=None,
vertex_id_lod_url_map=None,
lod_url_vertex_id_map=None,
use_only_cats=None,
spec_cats=None,
lod_urls=None,
categories_weights=None,
ignore_zero=True):
......@@ -28,9 +28,9 @@ class ResultContainer(object):
then JSON will be stored; otherwise plain text will
be stored
@param initial_categories - list of strings
@param use_only_cats - list of categories preferred as keywords. If given
then keywords (categories) not included in this list
won't be returned as result
@param spec_cats - instance of SpecialCategories with a list of categories
preferred as keywords. The strategy for handling
these categories is encapsulated inside.
@param categories_weights (dict: string -> float) - Weights which are used
to calculate the final value of the ranking
score. Currently, tf-idf values for
......@@ -52,7 +52,6 @@ class ResultContainer(object):
if initial_categories else None
self._ignore_zero = ignore_zero
self._as_json = as_json
self._use_only_cats = use_only_cats
self._vertex_id_lod_url_map = vertex_id_lod_url_map
self._lod_url_v_id_map = lod_url_vertex_id_map
self._json_key = 'keywords'
......@@ -60,13 +59,14 @@ class ResultContainer(object):
self._n_best_kw = n_best_kw
self._categories_weights = categories_weights
self._lod_urls = lod_urls
self._top_spec_cats_no = 0
# if categories_weights and use_only_cats:
# self._categories_weights = normalize_dict_values(
# categories_weights)
self._category_rank_list = self._process_ranking(
category_rank_map,
use_only_cats=use_only_cats,
spec_cats=spec_cats,
weights_map=self._categories_weights,
n_best=n_best_kw,
min_score_threshold=min_score_threshold)
......@@ -94,7 +94,10 @@ class ResultContainer(object):
# here store filtered ranking
if output_ranking:
self._store_ranking(
output_ranking, self._category_rank_list, mode='w')
output_ranking,
self._category_rank_list,
first_n_spec=self._top_spec_cats_no,
mode='w')
# here store full ranking
if output_debug:
......@@ -118,7 +121,11 @@ class ResultContainer(object):
kw_json = kw_json.encode('utf8')
ofile.write(kw_json)
def _store_ranking(self, output_file, category_rank_map, mode='w'):
def _store_ranking(self,
output_file,
category_rank_map,
first_n_spec=0,
mode='w'):
with open(output_file, mode) as ofile:
# category_rank_map = self._category_rank_list
# if self._ignore_zero:
......@@ -126,39 +133,42 @@ class ResultContainer(object):
ofile.write("RESULTED RANKING (for {} categories):\n".format(
len(category_rank_map)))
for k, v in category_rank_map:
marker = '* ' if self._initial_categories and \
(k in self._initial_categories or k.decode(\
'utf8') in self._initial_categories) else ''
marker = ''
if first_n_spec > 0:
marker = '$ '
first_n_spec -= 1
elif self._initial_categories and k in self._initial_categories:
marker = '* '
ofile.write("{}{}\t{}\n".format(marker,
k.decode('string_escape'), v))
def _sort_results(self):
self._category_rank_list = sorted(
self._category_rank_list.items(), key=operator.itemgetter(1))
self._category_rank_list.reverse() # best at the top
min_val = 1.0
for c, r in self._category_rank_list:
assert r <= min_val
min_val = r
# def _sort_results(self):
# self._category_rank_list = sorted(
# self._category_rank_list.items(), key=operator.itemgetter(1))
# self._category_rank_list.reverse() # best at the top
# min_val = 1.0
# for c, r in self._category_rank_list:
# assert r <= min_val
# min_val = r
def _filter_entries(self,
ranking,
only_from_urls=None,
n_best=None,
min_score_threshold=0.0):
if only_from_urls and self._use_only_cats:
res = [v for v in ranking \
if v[1] >= min_score_threshold and v[0] in self._use_only_cats]
else:
res = [v for v in ranking if v[1] >= min_score_threshold]
if res and n_best:
return res[:n_best]
return res
# def _filter_entries(self,
# ranking,
# only_from_urls=None,
# n_best=None,
# min_score_threshold=0.0):
# if only_from_urls and self._use_only_cats:
# res = [v for v in ranking \
# if v[1] >= min_score_threshold and v[0] in self._use_only_cats]
# else:
# res = [v for v in ranking if v[1] >= min_score_threshold]
# if res and n_best:
# return res[:n_best]
# return res
# def _get_n_best_lod_urls(self,
def _process_ranking(self,
ranking,
use_only_cats=None,
spec_cats=None,
weights_map=None,
n_best=None,
min_score_threshold=None):
......@@ -181,6 +191,8 @@ class ResultContainer(object):
# format ranking object
# ranking = self._category_rank_list
sorted_idx_rank = None
specs_indices = []
v_id_lod_url_map = self._vertex_id_lod_url_map
np_rank = ranking
idxes = np.arange(np_rank.shape[0]) # FIXME don't need it
......@@ -189,65 +201,70 @@ class ResultContainer(object):
# remove entries with zero score
indexed_rank = indexed_rank[indexed_rank[:, 1] != 0]
if use_only_cats:
lod_url_v_id_map = self._lod_url_v_id_map
if self._lod_urls:
lod_urls = self._lod_urls
else:
lod_urls = set(self._lod_url_v_id_map.keys())
sel_lod_url_v_id_map = {k: lod_url_v_id_map[k] for k in use_only_cats\
if k in lod_urls}
sel_v_ids = sel_lod_url_v_id_map.values()
if spec_cats:
# lod_url_v_id_map = self._lod_url_v_id_map
# if self._lod_urls:
# lod_urls = self._lod_urls
# else:
# lod_urls = set(self._lod_url_v_id_map.keys())
# sel_lod_url_v_id_map = {k: lod_url_v_id_map[k] for k in use_only_cats\
# if k in lod_urls}
# sel_v_ids = sel_lod_url_v_id_map.values()
# filter by vertex id
sel_indexed_rank = indexed_rank[np.where(
np.in1d(indexed_rank[:, 0], sel_v_ids))[0]]
if weights_map:
sel_v_weights = np.array([[sel_lod_url_v_id_map[cat], w] \
for cat, w in weights_map.iteritems() \
if cat in sel_lod_url_v_id_map])
# sel_indexed_rank = indexed_rank[np.where(
# np.in1d(indexed_rank[:, 0], sel_v_ids))[0]]
# fill missing values in weights array - set to 1.0 - remember
# that values in weights_map have to be normalised!
missing_v_ids = np.setdiff1d(sel_indexed_rank[:, 0],
sel_v_weights[:, 0])
missing_weights = np.column_stack(
(missing_v_ids, np.ones(missing_v_ids.shape[0])))
sel_v_weights = np.concatenate((sel_v_weights,
missing_weights))
sorted_idx_rank, specs_indices = spec_cats.filter_ranking(
indexed_rank)
self._top_spec_cats_no = len(specs_indices)
# if weights_map: # TODO disable weights
# sel_v_weights = np.array([[sel_lod_url_v_id_map[cat], w] \
# for cat, w in weights_map.iteritems() \
# if cat in sel_lod_url_v_id_map])
# filter sel_v_weights to contain only v_ids relevant for this doc
sel_v_weights = sel_v_weights[np.where(
np.in1d(sel_v_weights[:, 0], sel_indexed_rank[:, 0]))[0]]
assert sel_v_weights.shape == sel_indexed_rank.shape, \
"%s <> %s" % (sel_v_weights.shape, sel_indexed_rank.shape)
# sort both arrays to match indices (first column)
sorted_sel_v_weights = sel_v_weights[
sel_v_weights[:, 0].argsort()]
sorted_sel_indexed_rank = sel_indexed_rank[
sel_indexed_rank[:, 0].argsort()]
assert all(sorted_sel_v_weights[:, 0] ==
sorted_sel_indexed_rank[:, 0])
#
# fill missing values in weights array - set to 1.0 - remember
# that values in weights_map have to be normalised!
# missing_v_ids = np.setdiff1d(sel_indexed_rank[:, 0],
# sel_v_weights[:, 0])
# missing_weights = np.column_stack(
# (missing_v_ids, np.ones(missing_v_ids.shape[0])))
# sel_v_weights = np.concatenate((sel_v_weights,
# missing_weights))
#
# filter sel_v_weights to contain only v_ids relevant for this doc
# sel_v_weights = sel_v_weights[np.where(
# np.in1d(sel_v_weights[:, 0], sel_indexed_rank[:, 0]))[0]]
#
# assert sel_v_weights.shape == sel_indexed_rank.shape, \
# "%s <> %s" % (sel_v_weights.shape, sel_indexed_rank.shape)
# sort both arrays to match indices (first column)
# sorted_sel_v_weights = sel_v_weights[
# sel_v_weights[:, 0].argsort()]
# sorted_sel_indexed_rank = sel_indexed_rank[
# sel_indexed_rank[:, 0].argsort()]
# assert all(sorted_sel_v_weights[:, 0] ==
# sorted_sel_indexed_rank[:, 0])
#
# apply weights
# indexed_rank = np.column_stack((sorted_sel_indexed_rank[:, 0], \
# sorted_sel_indexed_rank[:, 1] * sorted_sel_v_weights[:, 1]))
# apply weights
indexed_rank = np.column_stack((sorted_sel_indexed_rank[:, 0], \
sorted_sel_indexed_rank[:, 1] * sorted_sel_v_weights[:, 1]))
else:
# use filtered rank
indexed_rank = sel_indexed_rank
# else:
# indexed_rank = sel_indexed_rank
if sorted_idx_rank is None:
sorted_idx_rank = indexed_rank[indexed_rank[:, 1].argsort()][::-1]
assert sorted_idx_rank[0][1] >= sorted_idx_rank[-1][1]
existed_v_ids = sorted_idx_rank[:, 0]
v_id_lod_url_map = {v: v_id_lod_url_map[v] for v in existed_v_ids}
if min_score_threshold:
prepared_url_rank_list = [(v_id_lod_url_map[i], rank) \
for i, rank in sorted_idx_rank \
if rank >= min_score_threshold]
if rank >= min_score_threshold \
or int(i) in specs_indices]
else:
prepared_url_rank_list = [(v_id_lod_url_map[i], rank)\
for i, rank in sorted_idx_rank]
......
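A self-contained toy illustration of the thresholding at the end of
_process_ranking above (all ids, scores, URL-to-id assignments and the threshold
are invented): an entry below the threshold survives only when its vertex id is
among the promoted special categories.

import numpy as np

# toy data: rows of [vertex_id, score], with the promoted special category
# already moved to the top (as filter_ranking would leave it)
sorted_idx_rank = np.array([[5, 0.0002],    # below the threshold, but special
                            [7, 0.0051],
                            [3, 0.0001]])   # below the threshold, not special
specs_indices = [5]
v_id_lod_url_map = {5: 'http://dbpedia.org/resource/Category:Library_science',
                    7: 'http://dbpedia.org/resource/Category:History',
                    3: 'http://dbpedia.org/resource/Category:Museums'}
min_score_threshold = 0.0003

prepared_url_rank_list = [(v_id_lod_url_map[i], rank)
                          for i, rank in sorted_idx_rank
                          if rank >= min_score_threshold or int(i) in specs_indices]
# -> [('...Category:Library_science', 0.0002), ('...Category:History', 0.0051)]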
import numpy as np
import logging
from keyword_assignment_tool.utils.np_rank_utils import filter_rank_by_ids,\
sort_rank
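
The remainder of this new file is not included in the diff. Purely as an
illustration, a test along the following lines might exercise the two imported
helpers; the signatures and behaviour of filter_rank_by_ids and sort_rank are
assumed here, not taken from the source.

def test_np_rank_utils_sketch():
    # toy ranking: rows of [vertex_id, score]
    rank = np.array([[1, 0.2], [2, 0.5], [3, 0.1]])

    # assumed signature: keep only rows whose vertex id is in the given ids
    filtered = filter_rank_by_ids(rank, [2, 3])
    assert set(filtered[:, 0]) == {2, 3}

    # assumed signature: order rows by score, best first
    ordered = sort_rank(rank)
    assert ordered[0][1] >= ordered[-1][1]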