Commit c6c2bb18 authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski

Add mechanism to include descriptor category in results, tests, minor changes

parent 9e680e58
# Unreleased # Unreleased
## Added ## Added
- list urls mapped to BN's (Biblioteka Narodowa) descriptors
- Handling special categories (BN descriptors) - e.g. including at least n
best descriptors in the resulting ranking and marking them with the '$'
character in the ranking
- tests for np_rank_utils
## Removed
Weighting with tf-idf
# 0.7
## Added
- tests for ResultContainer - tests for ResultContainer
- handle binary vectors included in zip archive - handle binary vectors included in zip archive
## Fixed ## Fixed/changed
- value for single occurrence for category depends on number of different - value for single occurrence for category depends on number of different
categories related to analyzed concept - it's more fair approach categories related to analyzed concept - it's more fair approach
- fixed version of vectors for categories - fixed version of vectors for categories
......
...@@ -3,21 +3,22 @@ ...@@ -3,21 +3,22 @@
; directory or should be absolute paths ; directory or should be absolute paths
; Keep in mind that only files included in installation setup.py file will be ; Keep in mind that only files included in installation setup.py file will be
; available in 'data' directory. ; available in 'data' directory.
graph_file_path = graph-2018-11-13-categories-only-broader.graphml ;graph_file_path = graph-2018-11-13-categories-only-broader.graphml
;graph_file_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/graph-2018-11-13-categories-only-broader.graphml graph_file_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/graph-2018-11-13-categories-only-broader.graphml
; concepts_categories_mapping = concept_category_index.txt.zip ; concepts_categories_mapping = concept_category_index.txt.zip
; concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index.txt.zip ; concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts.txt.zip ;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v4_many.txt.zip ;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v4_many.txt.zip
;concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip concepts_categories_mapping = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
concepts_categories_mapping = concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip ;concepts_categories_mapping = concept_category_index_only_dbpedia_concepts_reduced_v5_at_least_one.txt.zip
; path to txt file with wikipedia2vec vectors ; path to txt file with wikipedia2vec vectors
categories_vectors = categories_vectors_v2.txt.zip ;categories_vectors = categories_vectors_v2.txt.zip
; categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors.txt.zip
; for developing purposes ; for developing purposes
; categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors_v2.txt.zip ;categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/categories_vectors_v2.txt.zip
concepts_vectors = concepts_vectors.txt.zip categories_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/cats_keyed_vectors_v2.bin
; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_vectors.txt.zip ;concepts_vectors = concepts_vectors.txt.zip
;concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_vectors.txt.zip
concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin
; for developing purposes ; for developing purposes
; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin ; concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin
;concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin.zip ;concepts_vectors = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/concepts_keyed_vectors.bin.zip
...@@ -34,6 +35,18 @@ n_best_keywords = 10 ...@@ -34,6 +35,18 @@ n_best_keywords = 10
; Only keywords with value above given will be returned in keyword list ; Only keywords with value above given will be returned in keyword list
; Note: Only used when 'n_best_keywords' is not given. ; Note: Only used when 'n_best_keywords' is not given.
score_min_threshold = 0.0003 score_min_threshold = 0.0003
; Path to a file listing special categories (urls) which should be
; included in/excluded from the resulting ranking. How they are used is defined
; by 'spec_cats_strategy'.
; In the current version these special categories (urls) represent BN descriptors.
special_cats_path = /home/gkostkowski/WORKING_REPOS/kwazon/keyword_assignment_tool/data/BN_descriptors_linked_dbpedia_en_cats.txt
; Informs how to use the 'special_cats_path' categories when constructing the
; resulting ranking:
; - use original ranking ('all'),
; - take only them ('only'),
; - exclude them from ranking ('exclude'),
; - take the original ranking and only the first n of the given categories
; (int: natural number) - REGARDLESS of the score value.
spec_cats_strategy = 1
[algorithm] [algorithm]
iterations = 100 iterations = 100
......
...@@ -18,8 +18,6 @@ S_ALGORITHM = 'algorithm' ...@@ -18,8 +18,6 @@ S_ALGORITHM = 'algorithm'
O_ITERATIONS = 'iterations' O_ITERATIONS = 'iterations'
O_DAMPING_FACTOR = 'damping_factor' O_DAMPING_FACTOR = 'damping_factor'
O_INIT_PERS = 'init_personalisation' O_INIT_PERS = 'init_personalisation'
O_N_BEST_KW = 'n_best_keywords'
O_MIN_SCORE = 'score_min_threshold'
S_DATABASE = 'database' S_DATABASE = 'database'
O_ENDPOINT = 'endpoint' O_ENDPOINT = 'endpoint'
...@@ -28,6 +26,10 @@ O_PASSWORD = 'password' ...@@ -28,6 +26,10 @@ O_PASSWORD = 'password'
S_RESULTS = 'results' S_RESULTS = 'results'
O_JSON_FORMAT = 'as_json' O_JSON_FORMAT = 'as_json'
O_N_BEST_KW = 'n_best_keywords'
O_MIN_SCORE = 'score_min_threshold'
O_SPEC_CATS_PATH = 'special_cats_path'
O_SPEC_CATS_STRATEGY = 'spec_cats_strategy'
CONFIG_MODULE = 'keyword_assignment_tool.config' CONFIG_MODULE = 'keyword_assignment_tool.config'
RESOURCE_MODULE = 'keyword_assignment_tool.data' RESOURCE_MODULE = 'keyword_assignment_tool.data'
...@@ -116,6 +118,14 @@ class ConfigReader(object): ...@@ -116,6 +118,14 @@ class ConfigReader(object):
if self.conf.has_option(S_RESULTS, O_MIN_SCORE): if self.conf.has_option(S_RESULTS, O_MIN_SCORE):
return self.conf.getfloat(S_RESULTS, O_MIN_SCORE) return self.conf.getfloat(S_RESULTS, O_MIN_SCORE)
def get_spec_cats_path(self):
    '''!
    Read the 'special_cats_path' option from the 'results' section.

    @return value produced by self._get_file_path_attr for that option
            (presumably a resolved file path - confirm against
            _get_file_path_attr), or None when the option is not present
            in the loaded configuration.
    '''
    # The option is optional: a missing entry yields None, not an error.
    if not self.conf.has_option(S_RESULTS, O_SPEC_CATS_PATH):
        return None
    return self._get_file_path_attr(S_RESULTS, O_SPEC_CATS_PATH)
def get_spec_cats_strategy(self):
    '''!
    Read the 'spec_cats_strategy' option from the 'results' section.

    The value is returned exactly as self.conf.get provides it; no
    conversion or validation is performed here.

    @return the configured strategy value, or None when the option is not
            present in the loaded configuration.
    '''
    # The option is optional: a missing entry yields None, not an error.
    if not self.conf.has_option(S_RESULTS, O_SPEC_CATS_STRATEGY):
        return None
    return self.conf.get(S_RESULTS, O_SPEC_CATS_STRATEGY)
def get_init_pers(self): def get_init_pers(self):
return self.conf.getfloat(S_ALGORITHM, O_INIT_PERS) return self.conf.getfloat(S_ALGORITHM, O_INIT_PERS)
......
http://dbpedia.org/resource/Category:Archaeology
http://dbpedia.org/resource/Category:Architecture
http://dbpedia.org/resource/Category:Building_engineering
http://dbpedia.org/resource/Category:Military
http://dbpedia.org/resource/Category:Safety
http://dbpedia.org/resource/Category:Archival_science
http://dbpedia.org/resource/Category:Library_science
http://dbpedia.org/resource/Category:Museums
http://dbpedia.org/resource/Category:Biology
http://dbpedia.org/resource/Category:Chemistry
http://dbpedia.org/resource/Category:Pedagogy
http://dbpedia.org/resource/Category:Education
http://dbpedia.org/resource/Category:Cultural_anthropology
http://dbpedia.org/resource/Category:Ethics
http://dbpedia.org/resource/Category:Philosophy
http://dbpedia.org/resource/Category:Physics
http://dbpedia.org/resource/Category:Astronomy
http://dbpedia.org/resource/Category:Geography
http://dbpedia.org/resource/Category:Earth_sciences
http://dbpedia.org/resource/Category:Finance
http://dbpedia.org/resource/Category:Economy
http://dbpedia.org/resource/Category:Economics
http://dbpedia.org/resource/Category:History
http://dbpedia.org/resource/Category:Computer_science
http://dbpedia.org/resource/Category:Information_technology
http://dbpedia.org/resource/Category:Technology
http://dbpedia.org/resource/Category:Engineering_disciplines
http://dbpedia.org/resource/Category:Linguistics
http://dbpedia.org/resource/Category:Sports
http://dbpedia.org/resource/Category:Culture
http://dbpedia.org/resource/Category:Arts
http://dbpedia.org/resource/Category:Literary_criticism
http://dbpedia.org/resource/Category:Mathematics
http://dbpedia.org/resource/Category:Mass_media
http://dbpedia.org/resource/Category:Communication
http://dbpedia.org/resource/Category:Health
http://dbpedia.org/resource/Category:Medicine
http://dbpedia.org/resource/Category:Science
http://dbpedia.org/resource/Category:Methodology
http://dbpedia.org/resource/Category:Environmental_protection
http://dbpedia.org/resource/Category:Veterinary_medicine
http://dbpedia.org/resource/Category:Politics
http://dbpedia.org/resource/Category:Political_science
http://dbpedia.org/resource/Category:Public_administration
http://dbpedia.org/resource/Category:Law
http://dbpedia.org/resource/Category:Psychology
http://dbpedia.org/resource/Category:Religion
http://dbpedia.org/resource/Category:Spirituality
http://dbpedia.org/resource/Category:Forestry
http://dbpedia.org/resource/Category:Agriculture
http://dbpedia.org/resource/Category:Society
http://dbpedia.org/resource/Category:Sociology
http://dbpedia.org/resource/Category:Transport
http://dbpedia.org/resource/Category:Logistics
http://dbpedia.org/resource/Category:Management
http://dbpedia.org/resource/Category:Marketing
...@@ -22,6 +22,7 @@ from loaders.concepts_urls_reader import ConceptsUrlsReader ...@@ -22,6 +22,7 @@ from loaders.concepts_urls_reader import ConceptsUrlsReader
from context.doc_context import DocumentContext from context.doc_context import DocumentContext
from algorithm.gt_pprmc import GTPPRMC from algorithm.gt_pprmc import GTPPRMC
from results.results import ResultContainer from results.results import ResultContainer
from results.special_cats import SpecialCategories
from utils import utils from utils import utils
logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.basicConfig(stream=sys.stdout, level=logging.INFO)
...@@ -98,9 +99,9 @@ def _parse_args(): ...@@ -98,9 +99,9 @@ def _parse_args():
help= help=
'''Path to file containing vectors for dbpedia categories (urls).''') '''Path to file containing vectors for dbpedia categories (urls).''')
parser.add_argument( parser.add_argument(
'--use-only-categories', '--special-categories',
action='store', action='store',
dest='use_only_cats_path', dest='spec_cats_path',
help= help=
'''Path to file containing list of urls of categories. If given, then this '''Path to file containing list of urls of categories. If given, then this
list will be used to filter ranking - only these keywords will be given list will be used to filter ranking - only these keywords will be given
...@@ -108,6 +109,11 @@ def _parse_args(): ...@@ -108,6 +109,11 @@ def _parse_args():
this list. It's used to restrict results to only return these keywords this list. It's used to restrict results to only return these keywords
which represents appropriate categories (main topic classification).''' which represents appropriate categories (main topic classification).'''
) )
parser.add_argument(
'--spec-cat-strategy',
action='store',
dest='spec_cats_strategy',
help='''Explained in config file. ''')
parser.add_argument( parser.add_argument(
'--categories-weights', '--categories-weights',
action='store', action='store',
...@@ -115,7 +121,7 @@ def _parse_args(): ...@@ -115,7 +121,7 @@ def _parse_args():
help= help=
'''Path to file containing categories and corresponding weights which '''Path to file containing categories and corresponding weights which
will be applied to resulted ranking. Currently, used only if will be applied to resulted ranking. Currently, used only if
'use_only_cats_path' is given.''') 'spec_cats_path' is given.''')
parser.add_argument( parser.add_argument(
'-r', '-r',
'--output-ranking', '--output-ranking',
...@@ -213,8 +219,10 @@ def _process_configuration(args=None, force_reload=False): ...@@ -213,8 +219,10 @@ def _process_configuration(args=None, force_reload=False):
args.cats_vec_path = config.get_cats_vec_path() args.cats_vec_path = config.get_cats_vec_path()
if not args.concepts_vec_path or force_reload: if not args.concepts_vec_path or force_reload:
args.concepts_vec_path = config.get_concepts_vec_path() args.concepts_vec_path = config.get_concepts_vec_path()
if not args.use_only_cats_path or force_reload: if not args.spec_cats_path or force_reload:
args.use_only_cats_path = config.get_use_only_cats_path() args.spec_cats_path = config.get_spec_cats_path()
if not args.spec_cats_strategy or force_reload:
args.spec_cats_strategy = config.get_spec_cats_strategy()
if not args.cat_weights_path or force_reload: if not args.cat_weights_path or force_reload:
args.cat_weights_path = config.get_cat_weights_path() args.cat_weights_path = config.get_cat_weights_path()
...@@ -252,7 +260,8 @@ def run( ...@@ -252,7 +260,8 @@ def run(
cats_keyed_vectors=None, cats_keyed_vectors=None,
concepts_keyed_vectors=None, concepts_keyed_vectors=None,
vertex_id_vector_map=None, vertex_id_vector_map=None,
use_only_cats_path=None, bn_descr_cats_path=None,
spec_cats_strategy=None, # applicable only if bn_descr_cats_path given
categories_weights=None, categories_weights=None,
as_urls=False): as_urls=False):
'''! '''!
...@@ -288,12 +297,22 @@ def run( ...@@ -288,12 +297,22 @@ def run(
@param vertex_id_lod_url_map - dict where key is index of vertex in loaded @param vertex_id_lod_url_map - dict where key is index of vertex in loaded
Graph instance and value is string with url. Graph instance and value is string with url.
@param cats_keyed_vectors - gensim.Word2VecKeyedVectors instance @param cats_keyed_vectors - gensim.Word2VecKeyedVectors instance
@param bn_descr_cats_path - path to categories (urls) representing BN
descriptors
@param spec_cats_strategy - informs how to use bn_descr_cats_path categories
when constructing the resulting ranking:
- use original ranking('all'),
- take only them('only'),
- exclude them from ranking('exclude'),
- take original ranking and only first n from
given categories(int:natural number) - REGARDLESS
of the score value.
@param as_urls - if true then doc is treated as list of urls (representing @param as_urls - if true then doc is treated as list of urls (representing
concepts) concepts)
''' '''
params = [ params = [
'iterations', 'init_pers', 'damping_factor', 'n_best_keywords', 'iterations', 'init_pers', 'damping_factor', 'n_best_keywords',
'min_score_threshold', 'store_as_json', 'use_only_cats_path', 'as_urls' 'min_score_threshold', 'store_as_json', 'bn_descr_cats_path', 'as_urls'
] ]
_, arg_val = utils.get_passed_params(inspect.currentframe(), params) _, arg_val = utils.get_passed_params(inspect.currentframe(), params)
_log.info("Starting kwazon task with parameters: %s", arg_val) _log.info("Starting kwazon task with parameters: %s", arg_val)
...@@ -406,11 +425,6 @@ def run( ...@@ -406,11 +425,6 @@ def run(
# doc_context, vertex_id_vector_map, cats_keyed_vectors) # doc_context, vertex_id_vector_map, cats_keyed_vectors)
use_only_cats_list = None
if use_only_cats_path:
# resources['use_only_cats'] = load_file_lines(use_only_cats_path)
use_only_cats_list = load_file_lines(use_only_cats_path)
# context = concepts_nodes_map # context = concepts_nodes_map
context = doc_context # FIXME context = doc_context # FIXME
...@@ -423,6 +437,7 @@ def run( ...@@ -423,6 +437,7 @@ def run(
debug_output_writer.close() debug_output_writer.close()
# _log.info("Build map") # _log.info("Build map")
# TODO move below code into static loading section # TODO move below code into static loading section
lod_url_vertex_id_map = { lod_url_vertex_id_map = {
v: k v: k
...@@ -431,6 +446,14 @@ def run( ...@@ -431,6 +446,14 @@ def run(
lod_urls = set(vertex_id_lod_url_map.values()) lod_urls = set(vertex_id_lod_url_map.values())
# _log.info("Done") # _log.info("Done")
spec_cats = None
if bn_descr_cats_path:
# resources['use_only_cats'] = load_file_lines(use_only_cats_path)
spec_cats = SpecialCategories(
load_file_lines(bn_descr_cats_path),
lod_url_vertex_id_map,
sel_criteria=spec_cats_strategy)
# debug checking # debug checking
# _log.info("Debug check ...") # _log.info("Debug check ...")
# g = graph.use_graph_tool() # g = graph.use_graph_tool()
...@@ -454,7 +477,7 @@ def run( ...@@ -454,7 +477,7 @@ def run(
lod_url_vertex_id_map=lod_url_vertex_id_map, lod_url_vertex_id_map=lod_url_vertex_id_map,
lod_urls=lod_urls, lod_urls=lod_urls,
categories_weights=categories_weights, categories_weights=categories_weights,
use_only_cats=use_only_cats_list) spec_cats=spec_cats)
else: else:
res = ResultContainer( res = ResultContainer(
category_rank_prop_map, category_rank_prop_map,
...@@ -465,7 +488,7 @@ def run( ...@@ -465,7 +488,7 @@ def run(
lod_url_vertex_id_map=lod_url_vertex_id_map, lod_url_vertex_id_map=lod_url_vertex_id_map,
lod_urls=lod_urls, lod_urls=lod_urls,
categories_weights=categories_weights, categories_weights=categories_weights,
use_only_cats=use_only_cats_list) spec_cats=spec_cats)
_log.debug("Store results") _log.debug("Store results")
res.store_results( res.store_results(
output, output_ranking=output_ranking, output_debug=output_debug) output, output_ranking=output_ranking, output_debug=output_debug)
......
...@@ -224,9 +224,11 @@ class KwazonPlugin(object): ...@@ -224,9 +224,11 @@ class KwazonPlugin(object):
cats_keyed_vectors=self._cats_keyed_vectors, cats_keyed_vectors=self._cats_keyed_vectors,
concepts_keyed_vectors=self._concepts_keyed_vectors, concepts_keyed_vectors=self._concepts_keyed_vectors,
vertex_id_vector_map=self._v_id_vector_map, vertex_id_vector_map=self._v_id_vector_map,
use_only_cats_path=self._kwazon_cfg.use_only_cats_path, # use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
bn_descr_cats_path=self._kwazon_cfg.spec_cats_path,
spec_cats_strategy=self._kwazon_cfg.spec_cats_strategy,
store_as_json=self._kwazon_cfg.as_json, store_as_json=self._kwazon_cfg.as_json,
categories_weights=self._cat_weights_map, # categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls) as_urls=self._kwazon_cfg.as_urls)
def run_kwazon_prepared_debug(self, def run_kwazon_prepared_debug(self,
...@@ -271,9 +273,11 @@ class KwazonPlugin(object): ...@@ -271,9 +273,11 @@ class KwazonPlugin(object):
cats_keyed_vectors=self._cats_keyed_vectors, cats_keyed_vectors=self._cats_keyed_vectors,
concepts_keyed_vectors=self._concepts_keyed_vectors, concepts_keyed_vectors=self._concepts_keyed_vectors,
vertex_id_vector_map=self._v_id_vector_map, vertex_id_vector_map=self._v_id_vector_map,
use_only_cats_path=self._kwazon_cfg.use_only_cats_path, # use_only_cats_path=self._kwazon_cfg.use_only_cats_path,
bn_descr_cats_path=self._kwazon_cfg.spec_cats_path,
spec_cats_strategy=self._kwazon_cfg.spec_cats_strategy,
store_as_json=self._kwazon_cfg.as_json, store_as_json=self._kwazon_cfg.as_json,
categories_weights=self._cat_weights_map, # categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls) as_urls=self._kwazon_cfg.as_urls)
def run_kwazon(self, in_cclfilepath, out_cclfilepath, reload_config=True): def run_kwazon(self, in_cclfilepath, out_cclfilepath, reload_config=True):
...@@ -300,7 +304,9 @@ class KwazonPlugin(object): ...@@ -300,7 +304,9 @@ class KwazonPlugin(object):
n_best_keywords=self._kwazon_cfg.n_best_kw, n_best_keywords=self._kwazon_cfg.n_best_kw,
min_score_threshold=self._kwazon_cfg.min_score_threshold, min_score_threshold=self._kwazon_cfg.min_score_threshold,
store_as_json=self._kwazon_cfg.as_json, store_as_json=self._kwazon_cfg.as_json,
categories_weights=self._cat_weights_map, bn_descr_cats_path=self._kwazon_cfg.spec_cats_path,
spec_cats_strategy=self._kwazon_cfg.spec_cats_strategy,
# categories_weights=self._cat_weights_map,
as_urls=self._kwazon_cfg.as_urls) as_urls=self._kwazon_cfg.as_urls)
def _write_v_id_lod_url_mapping(self, output): def _write_v_id_lod_url_mapping(self, output):
......
...@@ -13,7 +13,7 @@ class ResultContainer(object): ...@@ -13,7 +13,7 @@ class ResultContainer(object):
initial_categories=None, initial_categories=None,
vertex_id_lod_url_map=None, vertex_id_lod_url_map=None,
lod_url_vertex_id_map=None, lod_url_vertex_id_map=None,
use_only_cats=None, spec_cats=None,
lod_urls=None, lod_urls=None,
categories_weights=None, categories_weights=None,
ignore_zero=True): ignore_zero=True):
...@@ -28,9 +28,9 @@ class ResultContainer(object): ...@@ -28,9 +28,9 @@ class ResultContainer(object):
then JSON will be stored; otherwise plain text will then JSON will be stored; otherwise plain text will
be stored be stored
@param initial_categories - list of strings @param initial_categories - list of strings
@param use_only_cats - list of categories preferred as keywords. If given @param spec_cats - instance of SpecialCategories with list of categories
then keywords (categories) not included in this list preferred as keywords. Strategy about how to handle
won't be returned as result these categories are encapsulated inside.
@param categories_weights (dict: string -> float) - Weights which are used @param categories_weights (dict: string -> float) - Weights which are used
to calculate ultimate value of ranking to calculate ultimate value of ranking
score. Currently, tf-idf values for score. Currently, tf-idf values for
...@@ -52,7 +52,6 @@ class ResultContainer(object): ...@@ -52,7 +52,6 @@ class ResultContainer(object):
if initial_categories else None if initial_categories else None
self._ignore_zero = ignore_zero self._ignore_zero = ignore_zero
self._as_json = as_json self._as_json = as_json
self._use_only_cats = use_only_cats
self._vertex_id_lod_url_map = vertex_id_lod_url_map self._vertex_id_lod_url_map = vertex_id_lod_url_map
self._lod_url_v_id_map = lod_url_vertex_id_map self._lod_url_v_id_map = lod_url_vertex_id_map
self._json_key = 'keywords' self._json_key = 'keywords'
...@@ -60,13 +59,14 @@ class ResultContainer(object): ...@@ -60,13 +59,14 @@ class ResultContainer(object):
self._n_best_kw = n_best_kw self._n_best_kw = n_best_kw
self._categories_weights = categories_weights self._categories_weights = categories_weights
self._lod_urls = lod_urls self._lod_urls = lod_urls
self._top_spec_cats_no = 0
# if categories_weights and use_only_cats: # if categories_weights and use_only_cats:
# self._categories_weights = normalize_dict_values( # self._categories_weights = normalize_dict_values(
# categories_weights) # categories_weights)
self._category_rank_list = self._process_ranking( self._category_rank_list = self._process_ranking(
category_rank_map, category_rank_map,
use_only_cats=use_only_cats, spec_cats=spec_cats,
weights_map=self._categories_weights, weights_map=self._categories_weights,
n_best=n_best_kw, n_best=n_best_kw,
min_score_threshold=min_score_threshold) min_score_threshold=min_score_threshold)
...@@ -94,7 +94,10 @@ class ResultContainer(object): ...@@ -94,7 +94,10 @@ class ResultContainer(object):
# here store filtered ranking # here store filtered ranking
if output_ranking: if output_ranking:
self._store_ranking( self._store_ranking(
output_ranking, self._category_rank_list, mode='w') output_ranking,
self._category_rank_list,
first_n_spec=self._top_spec_cats_no,
mode='w')
# here store full ranking # here store full ranking
if output_debug: if output_debug:
...@@ -118,7 +121,11 @@ class ResultContainer(object): ...@@ -118,7 +121,11 @@ class ResultContainer(object):
kw_json = kw_json.encode('utf8') kw_json = kw_json.encode('utf8')
ofile.write(kw_json) ofile.write(kw_json)
def _store_ranking(self, output_file, category_rank_map, mode='w'): def _store_ranking(self,
output_file,
category_rank_map,
first_n_spec=0,
mode='w'):
with open(output_file, mode) as ofile: with open(output_file, mode) as ofile:
# category_rank_map = self._category_rank_list # category_rank_map = self._category_rank_list
# if self._ignore_zero: # if self._ignore_zero:
...@@ -126,39 +133,42 @@ class ResultContainer(object): ...@@ -126,39 +133,42 @@ class ResultContainer(object):
ofile.write("RESULTED RANKING (for {} categories):\n".format( ofile.write("RESULTED RANKING (for {} categories):\n".format(
len(category_rank_map))) len(category_rank_map)))
for k, v in category_rank_map: for k, v in category_rank_map:
marker = '* ' if self._initial_categories and \ marker = ''
(k in self._initial_categories or k.decode(\ if first_n_spec > 0:
'utf8') in self._initial_categories) else '' marker = '$ '
first_n_spec -= 1
elif self._initial_categories and k in self._initial_categories:
marker = '* '
ofile.write("{}{}\t{}\n".format(marker, ofile.write("{}{}\t{}\n".format(marker,
k.decode('string_escape'), v)) k.decode('string_escape'), v))
def _sort_results(self): # def _sort_results(self):
self._category_rank_list = sorted( # self._category_rank_list = sorted(
self._category_rank_list.items(), key=operator.itemgetter(1)) # self._category_rank_list.items(), key=operator.itemgetter(1))
self._category_rank_list.reverse() # best at the top # self._category_rank_list.reverse() # best at the top
min_val = 1.0 # min_val = 1.0
for c, r in self._category_rank_list: # for c, r in self._category_rank_list:
assert r <= min_val # assert r <= min_val
min_val = r # min_val = r
def _filter_entries(self, # def _filter_entries(self,
ranking, # ranking,
only_from_urls=None, # only_from_urls=None,
n_best=None, # n_best=None,
min_score_threshold=0.0): # min_score_threshold=0.0):
if only_from_urls and self._use_only_cats: # if only_from_urls and self._use_only_cats:
res = [v for v in ranking \ # res = [v for v in ranking \
if v[1] >= min_score_threshold and v[0] in self._use_only_cats] # if v[1] >= min_score_threshold and v[0] in self._use_only_cats]
else: # else: