Commit 7e9f9751 authored by Grzegorz Kostkowski

Remove disabled code

parent 9bbdb158
......@@ -68,8 +68,6 @@ class DatabaseDao(object):
else:
sa_res = self._get_exact_match_neighb(syn_id, with_src=with_src)
sa_nodes = {url for url, src in sa_res}
# print(sa_res)
# print(sa_nodes)
res = list({
url
for sa_url in sa_nodes
......@@ -126,15 +124,12 @@ class DatabaseDao(object):
conditions=conditions_str,
output=output_flds,
coll_name=output_coll)
# print(query)
if with_src:
res = [(result[url_fld], result[src_fld]) \
for result in self._db_conn.run(query)]
else:
res = [result[url_fld] for result in self._db_conn.run(query)]
# if res:
# _log.debug("Found {} urls for lemma: {}".format(len(res), lemma))
return res
def _get_exact_match_neighb(self, node_id, by_url=False, with_src=True):
......
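(Editorial note, not part of the commit.) The DatabaseDao hunks above return either plain URLs or (url, source) pairs depending on with_src. Below is a minimal sketch of that convention with a stubbed connection; FakeConn, run_query and the field names are illustrative assumptions, not the project's API.

# Editorial sketch: the with_src result-shape convention seen above.
class FakeConn(object):
    def run(self, query):
        # pretend the query yields documents with 'url' and 'src' fields
        return [{"url": "http://example.org/A", "src": "wiki"},
                {"url": "http://example.org/B", "src": "plwordnet"}]

def run_query(conn, query, url_fld="url", src_fld="src", with_src=True):
    if with_src:
        # (url, source) pairs, as in the with_src branch of the hunk
        return [(doc[url_fld], doc[src_fld]) for doc in conn.run(query)]
    # plain URLs otherwise
    return [doc[url_fld] for doc in conn.run(query)]

print(run_query(FakeConn(), "<query string>"))                  # pairs
print(run_query(FakeConn(), "<query string>", with_src=False))  # URLs only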
import re
from corpus_ccl import cclutils as ccl
from entity_linker.utils.util import as_str
# def load_excluded_patterns(pat_path):
#
# with codecs.open(pat_path, 'r', encoding="utf8") as ifile:
token_placeholder = "TOK"
PATTERNS = [
......@@ -39,9 +36,6 @@ def process_doc(doc, tok_lemma_map, ignored_ann=None):
re.IGNORECASE)
if m and m.group():
mark_tok_as_ignored(tok, ignored_ann)
# else:
# print("warn: no lemma for token {}".format(
# as_str(tok.orth_utf8())))
return doc
......
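(Editorial note, not part of the commit.) The hunk above shows only the tail of the pattern check, so the following is a hedged guess at how a TOK placeholder pattern might be expanded and tested per token; expand_pattern, SAMPLE_PATTERNS and should_ignore are assumptions for illustration, not the module's actual logic.

import re

TOKEN_PLACEHOLDER = "TOK"
# hypothetical pattern: a token wrapped in parentheses; the real PATTERNS
# list is not visible in the hunk above
SAMPLE_PATTERNS = [r"\(\s*" + TOKEN_PLACEHOLDER + r"\s*\)"]

def expand_pattern(pattern, token_text):
    # substitute the placeholder with the escaped token text
    return pattern.replace(TOKEN_PLACEHOLDER, re.escape(token_text))

def should_ignore(token_text, context_text, patterns=SAMPLE_PATTERNS):
    # mirrors the `if m and m.group()` guard in the hunk above
    for pattern in patterns:
        m = re.search(expand_pattern(pattern, token_text), context_text,
                      re.IGNORECASE)
        if m and m.group():
            return True
    return False

print(should_ignore("skrot", "pelna nazwa ( SKROT ) dalszy tekst"))  # True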
......@@ -50,8 +50,6 @@ class DocumentFilter(object):
if self._config.ignored_pos:
tagset = self._config.tagset
ignored_pos_list = self._config.ignored_pos
# import pdb;
# pdb.set_trace()
self.add_token_filter(lambda t: not self._has_ignored_pos(
t, ignored_pos_list, tagset))
......
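(Editorial note, not part of the commit.) The DocumentFilter hunk registers a token predicate via add_token_filter. Below is a minimal sketch of that predicate-list pattern, assuming, which the diff does not show, that apply_token_filters simply requires every registered predicate to pass; SimpleTokenFilter and the dict-based tokens are illustrative only.

# Editorial sketch of the add_token_filter / apply_token_filters pattern;
# an assumption about the mechanics, not the project's DocumentFilter.
class SimpleTokenFilter(object):
    def __init__(self):
        self._token_filters = []

    def add_token_filter(self, predicate):
        # predicate: callable(token) -> bool, True means "keep the token"
        self._token_filters.append(predicate)

    def apply_token_filters(self, token):
        # a token passes only if every registered predicate accepts it
        return all(pred(token) for pred in self._token_filters)

# usage: drop tokens whose POS is on an ignored list, as the hunk above does
ignored_pos_list = {"interp", "aglt"}          # example POS labels
flt = SimpleTokenFilter()
flt.add_token_filter(lambda t: t.get("pos") not in ignored_pos_list)
print(flt.apply_token_filters({"pos": "subst"}))   # True
print(flt.apply_token_filters({"pos": "interp"}))  # False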
......@@ -33,38 +33,19 @@ class EntityLinker(object):
# doc_parser = CclParser(ccl_doc, self._config)
doc_ctx = self._doc_context
# spot wsd annotated tokens
wsd_tok_syn_id_map = {}
s_tok_concepts_map = {}
if self._is_wsd_enabled:
# wsd_tok_syn_id_map = doc_parser.get_disambiguated_token_syn_id()
wsd_tok_syn_id_map = doc_ctx.get_disambiguated_token_syn_id()
# filter wsd tokens and lemmas
# additional map created to be able to filter out by lemma
# wsd_tok_lemma_map = doc_ctx.tok_syn_as_tok_lemma_map(
# wsd_tok_syn_id_map)
wsd_tok_lemma_map = doc_ctx.get_tok_lemma_map('wsd')
# print(len(wsd_tok_syn_id_map))
# print(len(wsd_tok_lemma_map))
# print(set(
# len(wsd_tok_lemma_map.keys()) == set(
# wsd_tok_syn_id_map.keys())))
filtered_wsd_tok_lemma_map = doc_ctx\
.filter_tok_lemma_map(wsd_tok_lemma_map)
if self._use_wsd_synsets:
# filtered_wsd_tok_syn_id_map = {t: wsd_tok_syn_id_map[t] \
# for t, l in wsd_tok_lemma_map.iteritems()\
# if self._doc_filter.apply_lemma_filters(l)\
# and self._doc_filter.apply_token_filters(t)}
# import pudb
# pudb.set_trace()
filtered_wsd_tok_syn_id_map = {t: wsd_tok_syn_id_map[t] \
for t in filtered_wsd_tok_lemma_map.keys()}
wsd_tok_syn_id_map = filtered_wsd_tok_syn_id_map
# fetch URL links for sys_ids
s_tok_concepts_map = self._get_concepts_by_tok_syn_id_map(
......@@ -74,7 +55,6 @@ class EntityLinker(object):
s_tok_concepts_map = self._get_concepts_by_tok_lemma_map(
filtered_wsd_tok_lemma_map, doc_ctx.lang, extra_info='wsd')
# spot ne annotated tokens
ne_toks = doc_ctx.ne_tokens
ne_tok_lemmas_map = doc_ctx.make_tok_lemma_map(
ne_toks, token_kind='ne')
......@@ -82,35 +62,18 @@ class EntityLinker(object):
_log.debug("Found ne expressions:\n{}".format(\
str([v for k, v in ne_tok_lemmas_map.iteritems()])))
# spot mwe annotated tokens
mwe_toks = doc_ctx.mwe_tokens
# import pudb
# pudb.set_trace()
# print("mwe toks")
# print(doc_parser.print_structured_exprs(mwe_toks))
mwe_tok_lemmas_map = doc_ctx.make_tok_lemma_map(
mwe_toks, token_kind='mwe')
# mwe_tok_lemmas_map = doc_parser\
# .get_tok_lemma_by_structured_expr(mwe_toks, is_mwe=True)
# import pudb
# pudb.set_trace()
_log.debug("Found mwe expressions:\n{}".format(\
str([v for k, v in mwe_tok_lemmas_map.iteritems()])))
# fetch URL links for lemmas
# tok_lemmas_map = self._merge_mwe_ne_maps(ne_tok_lemmas_map,
# mwe_tok_lemmas_map)
# tok_lemmas_map = mwe_tok_lemmas_map
# l_tok_concepts_map = self._get_concepts_by_tok_lemma_map(
# tok_lemmas_map, lang)
# mark other tokens if _mark_without_ann - before filtering
# to not take these filtered out tokens
if self._mark_without_ann:
# get the set difference between all doc and already selected tokens
all_toks = set(doc_ctx.tokens)
# ann_toks = set(mwe_toks + ne_toks + wsd_tok_syn_id_map.values())
# without_ann_toks = all_toks.difference(mwe_toks).difference(ne_toks).difference(wsd_tok_syn_id_map.values())
without_ann_toks = all_toks.difference(
doc_ctx.mwe_tokens(flatten=True))
without_ann_toks = without_ann_toks.difference(
......@@ -119,15 +82,11 @@ class EntityLinker(object):
# wsd_tok_syn_id_map.values()) # there should be keys ?
without_ann_toks = without_ann_toks.difference(
wsd_tok_syn_id_map.values())
# without_ann_toks = all_toks.difference(ann_toks)
# build token - lemma map
if without_ann_toks:
without_ann_t_l_map = doc_ctx.get_tok_lemma(without_ann_toks)
# TODO ^ change method get_tok_lemma_by_structured_expr to handle simple list
# filter out by lemma
# without_ann_t_l_map = {t: l for t, l in without_ann_t_l_map.iteritems() \
# if self._doc_filter.apply_lemma_filters(l)\
# and self._doc_filter.apply_token_filters(t)}
without_ann_t_l_map = doc_ctx.filter_tok_lemma_map(
without_ann_t_l_map)
without_ann_concepts_map = self._get_concepts_by_tok_lemma_map(
......@@ -153,7 +112,6 @@ class EntityLinker(object):
flat_mwe_toks = _flatten_mwe_exprs(mwe_toks)
flat_ne_toks = _flatten_mwe_exprs(ne_toks)
flat_mwe_toks.extend(flat_ne_toks)
# print([t.orth_utf8() for t in flat_mwe_toks])
tok_concepts_map = l_tok_concepts_map
if s_tok_concepts_map:
s_tok_concepts_map = self._filter_out_redundant_wsd_toks(
......@@ -175,8 +133,6 @@ class EntityLinker(object):
multi-word expression, then the lemma representing the NE will be used.
This approach is used to avoid mistakes when linking concepts.'''
return merge_two_dicts(mwe_tok_lemmas_map, ne_tok_lemmas_map)
# for t, l in ne_tok_lemmas_map.iteritems():
# print(t.orth_utf8(), len(l) if isinstance(l, list) else l)
def _get_concepts_by_tok_lemma_map(self,
tok_lemmas_map,
......@@ -225,10 +181,6 @@ class EntityLinker(object):
if k in mwe_toks:
del s_tok_concepts_map[k]
return s_tok_concepts_map
# return {
# k: v
# for k, v in s_tok_concepts_map.iteritems() if k not in mwe_toks
# }
def _flatten_mwe_exprs(mwe_toks):
......
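(Editorial note, not part of the commit.) Two precedence rules run through the EntityLinker hunks above: WSD-derived concepts are dropped for tokens already covered by MWE/NE expressions, and when a token is both a NE and part of a MWE the NE lemma wins (per the _merge_mwe_ne_maps docstring). A toy sketch of both rules follows; merge_two_dicts is given the usual "second dict wins" behaviour as an assumption, and the string tokens are illustrative.

# Editorial sketch of the precedence rules visible in the hunks above.

def merge_two_dicts(a, b):
    # assumed behaviour: entries from b (the NE map) override those from a
    merged = dict(a)
    merged.update(b)
    return merged

def filter_out_redundant_wsd_toks(s_tok_concepts_map, covered_toks):
    # drop WSD concepts for tokens already covered by MWE/NE expressions;
    # the original _filter_out_redundant_wsd_toks mutates the map in place
    return {t: c for t, c in s_tok_concepts_map.items()
            if t not in covered_toks}

mwe_map = {"tok1": "nowy_jork", "tok2": "wielkie_jablko"}
ne_map = {"tok1": "Nowy_Jork"}                 # NE lemma wins for tok1
print(merge_two_dicts(mwe_map, ne_map))

wsd_concepts = {"tok1": ["url_a"], "tok3": ["url_b"]}
print(filter_out_redundant_wsd_toks(wsd_concepts, {"tok1", "tok2"}))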
......@@ -185,15 +185,11 @@ class CclParser(object):
@return dict with token (head or representative token) as key and
string (lemma) as value
'''
# import pudb
# pudb.set_trace()
tok_lemma_map = {}
for expr in expr_toks_list:
repr_tok = expr[0] if expr[0] is not None else expr[1][0]
lemma = self._lemma_resolver.get_lemma_from_token_expr(
expr, is_mwe=is_mwe)
# print("lemma for '{}': {}".format(
# self.get_structured_expr_str(expr), lemma))
if lemma:
tok_lemma_map[repr_tok] = lemma
return tok_lemma_map
......@@ -282,11 +278,6 @@ class LemmaResolver(object):
self._ignore_interp = ignore_interp
self._ignored_pos_list = config.ignored_pos
# def tokens_lemma(self, toks):
# search for head token
# for t in toks:
# if self._has_chan_ann(t, sent, self._mwe_chan_name = self._config["mwe_chan_name"])
def get_lemma_from_token_expr(self, expr_tok_list, is_mwe=False):
'''Returns the base form of a lexeme, also for multiword expressions.
Requires the following form of token sequence: [head, (tail, of, expr)].
......@@ -319,20 +310,6 @@ class LemmaResolver(object):
# doesn't have head
return self._lemma_from_seq(tail)
# is_mwe = self._has_chan_ann(tok, sent, self._config["mwe_chan_name"])
# mwe_base is stored only in token with head
# if is_mwe:
# if not token_metadata and token.has_metadata():
# token_metadata - token.get_metadata()
# is_head = tutils.is_head_of(sentence, token, self._config["mwe_chan_name"])
# if is_head and token_metadata and token_metadata.has_attribute(
# self._config["mwe_base_prop_key"]):
# print("Calling is_head_of for {}".format(token.orth_utf8()))
# return token_metadata.get_attribute(self._config["mwe_base_prop_key"])
# else:
# return token.get_preferred_lexeme(tagset).lemma_utf8()
# return None
def _lemma_from_mwe_head(self, head):
if head.has_metadata():
md = head.get_metadata()
......
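(Editorial note, not part of the commit.) The LemmaResolver docstring expects expressions shaped as [head, (tail, of, expr)], where the head may be None; the representative token is then the first token of the tail, exactly as repr_tok is chosen in the CclParser hunk above. A small sketch of that shape follows, with plain strings standing in for CCL token objects (an assumption for illustration).

# Editorial sketch of the [head, (tail, of, expr)] shape and the
# representative-token choice; strings stand in for CCL tokens.

def representative_token(expr):
    # head if present, otherwise the first token of the tail
    head, tail = expr[0], expr[1]
    return head if head is not None else tail[0]

expr_with_head = ["head_tok", ("tail_tok_1", "tail_tok_2")]
expr_without_head = [None, ("tail_tok_1", "tail_tok_2")]

print(representative_token(expr_with_head))     # 'head_tok'
print(representative_token(expr_without_head))  # 'tail_tok_1'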
......@@ -118,10 +118,6 @@ def _build_entity_linker(dao_obj, config):
return EntityLinker(dao_obj, doc_filter, config)
# def _build_doc_context(doc, output, lang, tagset):
# return doc_cfg.document_context(doc, output, lang, tagset)
def main(document_context=None, config=None, dao_obj=None, elinker=None):
if not config:
config = _process_configuration()
......@@ -149,36 +145,10 @@ def main(document_context=None, config=None, dao_obj=None, elinker=None):
.save_annotated_doc()
# def main(document_context=None, config=None, dao_obj=None, elinker=None):
# if not config:
# config = _process_configuration()
# ccl_doc = ccldoc.CclDocument(config.doc)
#
# ccl_parser = CclParser(ccl_doc, config)
# ccl_doc = ann_excluded_phrases(ccl_doc, config, doc_parser=ccl_parser)
# if not dao_obj:
# dao_obj = _build_dao(config)
# doc_parser = ccl_parser
# doc_filter = DocumentFilter(config)
# if not document_context:
# doc_context = DocumentContext(ccl_doc, doc_parser, doc_filter,
# config.lang, config.tagset)
#
# if not elinker:
# elinker = EntityLinker(dao_obj, doc_parser, doc_filter, config)
#
# tok_concepts_map = elinker.link_entities_for_doc(doc_context)
#
# CclWriter(ccl_doc, tok_concepts_map, config.output, config)\
# .save_annotated_doc()
if __name__ == "__main__":
# note: to run without installing, provide a path to a config.ini with absolute
# paths in it. This helps to avoid resolving paths via pkg_source.
#
# python run.py data/sample/00120827-no-urls.saper.lod.xml pl ./00120827.mwe.xml.wsd.elinker --config /home/gkostkowski/keyword-assignment/kwazon/entity-linker/config/config.ini
# python run.py -d data/sample/00120827-no-urls.saper.lod.xml -o ./00120827.mwe.xml.wsd.elinker --config /home/gkostkowski/keyword-assignment/kwazon/entity-linker/config/config.ini
main()
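(Editorial note, not part of the commit.) The main signature above accepts pre-built objects, which makes it possible to reuse one configuration and database connection across many documents. A hedged sketch of that batch pattern follows; it assumes the module is importable as run, that _process_configuration and _build_dao behave as the removed, commented-out variant of main suggested, and that config exposes doc and output attributes, none of which is confirmed by the diff.

# Editorial sketch: reusing one DAO and config across several documents by
# injecting them into main(). All helper names and attributes below are
# assumptions taken from the commented-out code removed in this commit.
import run

config = run._process_configuration()        # may read CLI args / config.ini
dao_obj = run._build_dao(config)             # one database connection, reused

for doc_path in ["doc1.xml", "doc2.xml"]:    # placeholder input paths
    config.doc = doc_path                    # assumed attribute
    config.output = doc_path + ".elinker"    # assumed attribute
    run.main(config=config, dao_obj=dao_obj)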