Team-Semantics / elinker

Commit 7e9f9751, authored Aug 28, 2019 by Grzegorz Kostkowski
Remove disabled code
parent 9bbdb158
Showing 6 changed files with 2 additions and 116 deletions
entity_linker/dao/db_dao.py                          +0 -5
entity_linker/filters/annotate_excluded_phrases.py   +0 -6
entity_linker/filters/document_filter.py             +0 -2
entity_linker/linker/elinker.py                      +1 -49
entity_linker/parser/ccl_parser.py                   +0 -23
entity_linker/run.py                                 +1 -31
entity_linker/dao/db_dao.py

```diff
@@ -68,8 +68,6 @@ class DatabaseDao(object):
         else:
             sa_res = self._get_exact_match_neighb(syn_id, with_src=with_src)
             sa_nodes = {url for url, src in sa_res}
-            # print(sa_res)
-            # print(sa_nodes)
             res = list({url for sa_url in sa_nodes
@@ -126,15 +124,12 @@ class DatabaseDao(object):
             conditions=conditions_str,
             output=output_flds,
             coll_name=output_coll)
-        # print(query)
         if with_src:
             res = [(result[url_fld], result[src_fld]) \
                    for result in self._db_conn.run(query)]
         else:
             res = [result[url_fld] for result in self._db_conn.run(query)]
-        # if res:
-        #     _log.debug("Found {} urls for lemma: {}".format(len(res), lemma))
         return res

     def _get_exact_match_neighb(self, node_id, by_url=False, with_src=True):
```
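For readers skimming the hunk above: the `with_src` flag decides whether the DAO returns bare URLs or (URL, source) pairs. A minimal self-contained sketch of that branching, assuming dict-like result rows; `collect_urls`, `url_fld`, and `src_fld` here are hypothetical stand-ins, not the project's actual API:

```python
def collect_urls(rows, url_fld="url", src_fld="src", with_src=True):
    """Return (url, src) tuples when with_src is set, else plain urls."""
    if with_src:
        return [(row[url_fld], row[src_fld]) for row in rows]
    return [row[url_fld] for row in rows]

rows = [{"url": "http://example.org/e1", "src": "wordnet"},
        {"url": "http://example.org/e2", "src": "dbpedia"}]
print(collect_urls(rows))                  # [('http://example.org/e1', 'wordnet'), ...]
print(collect_urls(rows, with_src=False))  # ['http://example.org/e1', ...]
```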
entity_linker/filters/annotate_excluded_phrases.py

```diff
 import re

 from corpus_ccl import cclutils as ccl

 from entity_linker.utils.util import as_str

-# def load_excluded_patterns(pat_path):
-#
-#     with codecs.open(pat_path, 'r', encoding="utf8") as ifile:

 token_placeholder = "TOK"

 PATTERNS = [
@@ -39,9 +36,6 @@ def process_doc(doc, tok_lemma_map, ignored_ann=None):
                          re.IGNORECASE)
         if m and m.group():
             mark_tok_as_ignored(tok, ignored_ann)
-        # else:
-        #     print("warn: no lemma for token {}".format(
-        #         as_str(tok.orth_utf8())))
     return doc
```
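The surviving code matches each token's lemma against exclusion patterns (case-insensitively) and marks hits as ignored. A toy sketch of that idea, assuming the patterns are regular expressions tested against lemmas; the patterns, the simplified string tokens, and `mark_excluded_lemmas` are illustrative only (the real module builds its `PATTERNS` around a `"TOK"` placeholder and works on CCL tokens):

```python
import re

# Illustrative-only exclusion patterns.
PATTERNS = [re.compile(p, re.IGNORECASE) for p in [r"^\d+$", r"^www\..*"]]

def mark_excluded_lemmas(tok_lemma_map, ignored=None):
    """Collect tokens whose lemma matches any exclusion pattern."""
    ignored = set() if ignored is None else ignored
    for tok, lemma in tok_lemma_map.items():
        if any(p.match(lemma) for p in PATTERNS):
            ignored.add(tok)  # stand-in for mark_tok_as_ignored(tok, ignored_ann)
    return ignored

print(mark_excluded_lemmas({"t1": "12345", "t2": "linguistics", "t3": "www.example.org"}))
# {'t1', 't3'}
```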
entity_linker/filters/document_filter.py

```diff
@@ -50,8 +50,6 @@ class DocumentFilter(object):
         if self._config.ignored_pos:
             tagset = self._config.tagset
             ignored_pos_list = self._config.ignored_pos
-            # import pdb;
-            # pdb.set_trace()
             self.add_token_filter(
                 lambda t: not self._has_ignored_pos(t, ignored_pos_list, tagset))
```
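The pattern in this hunk, registering a lambda predicate via `add_token_filter`, composes independent checks into one filter chain. A minimal sketch of that design with hypothetical names; only the `add_token_filter` entry point is taken from the hunk above, the rest is an assumed shape:

```python
class TokenFilter:
    """Keeps a list of predicates; a token passes only if all accept it."""

    def __init__(self):
        self._token_filters = []

    def add_token_filter(self, predicate):
        self._token_filters.append(predicate)

    def accepts(self, token):
        return all(f(token) for f in self._token_filters)

# Usage: reject tokens with an ignored POS, mirroring the lambda above.
flt = TokenFilter()
ignored_pos = {"interp", "num"}
flt.add_token_filter(lambda t: t["pos"] not in ignored_pos)
print(flt.accepts({"pos": "subst"}))   # True
print(flt.accepts({"pos": "interp"}))  # False
```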
entity_linker/linker/elinker.py

```diff
@@ -33,38 +33,19 @@ class EntityLinker(object):
-        # doc_parser = CclParser(ccl_doc, self._config)
         doc_ctx = self._doc_context
         # spot wsd annotated tokens
         wsd_tok_syn_id_map = {}
         s_tok_concepts_map = {}
         if self._is_wsd_enabled:
-            # wsd_tok_syn_id_map = doc_parser.get_disambiguated_token_syn_id()
             wsd_tok_syn_id_map = doc_ctx.get_disambiguated_token_syn_id()
             # filter wsd tokens and lemmas
             # additional map created to be able to filter out by lemma
-            # wsd_tok_lemma_map = doc_ctx.tok_syn_as_tok_lemma_map(
-            #     wsd_tok_syn_id_map)
             wsd_tok_lemma_map = doc_ctx.get_tok_lemma_map('wsd')
-            # print(len(wsd_tok_syn_id_map))
-            # print(len(wsd_tok_lemma_map))
-            # print(set(
-            #     len(wsd_tok_lemma_map.keys()) == set(
-            #         wsd_tok_syn_id_map.keys())))
             filtered_wsd_tok_lemma_map = doc_ctx\
                 .filter_tok_lemma_map(wsd_tok_lemma_map)
             if self._use_wsd_synsets:
-                # filtered_wsd_tok_syn_id_map = {t: wsd_tok_syn_id_map[t] \
-                #     for t, l in wsd_tok_lemma_map.iteritems()\
-                #     if self._doc_filter.apply_lemma_filters(l)\
-                #     and self._doc_filter.apply_token_filters(t)}
-                # import pudb
-                # pudb.set_trace()
-                filtered_wsd_tok_syn_id_map = {t: wsd_tok_syn_id_map[t] \
-                    for t in filtered_wsd_tok_lemma_map.keys()}
+                filtered_wsd_tok_syn_id_map = {t: wsd_tok_syn_id_map[t]
+                    for t in filtered_wsd_tok_lemma_map.keys()}
                 wsd_tok_syn_id_map = filtered_wsd_tok_syn_id_map
                 # fetch URL links for sys_ids
                 s_tok_concepts_map = self._get_concepts_by_tok_syn_id_map(
```
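The comprehension kept by this hunk restricts the WSD synset map to tokens whose lemmas survived filtering. A toy illustration of the same move with made-up data (in elinker.py the keys are CCL token objects, not strings):

```python
# Toy stand-ins for the maps built above.
wsd_tok_syn_id_map = {"t1": 101, "t2": 202, "t3": 303}
filtered_wsd_tok_lemma_map = {"t1": "dog", "t3": "run"}  # t2 was filtered out

# Keep only synset entries whose token passed the lemma filters.
filtered_wsd_tok_syn_id_map = {t: wsd_tok_syn_id_map[t]
                               for t in filtered_wsd_tok_lemma_map.keys()}
print(filtered_wsd_tok_syn_id_map)  # {'t1': 101, 't3': 303}
```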
```diff
@@ -74,7 +55,6 @@ class EntityLinker(object):
             s_tok_concepts_map = self._get_concepts_by_tok_lemma_map(
                 filtered_wsd_tok_lemma_map, doc_ctx.lang, extra_info='wsd')
         # spot ne annotated tokens
         ne_toks = doc_ctx.ne_tokens
         ne_tok_lemmas_map = doc_ctx.make_tok_lemma_map(ne_toks, token_kind='ne')
```
```diff
@@ -82,35 +62,18 @@ class EntityLinker(object):
         _log.debug("Found ne expressions:\n{}".format(\
             str([v for k, v in ne_tok_lemmas_map.iteritems()])))
         # spot mwe annotated tokens
         mwe_toks = doc_ctx.mwe_tokens
-        # import pudb
-        # pudb.set_trace()
-        # print("mwe toks")
-        # print(doc_parser.print_structured_exprs(mwe_toks))
         mwe_tok_lemmas_map = doc_ctx.make_tok_lemma_map(mwe_toks, token_kind='mwe')
-        # mwe_tok_lemmas_map = doc_parser\
-        #     .get_tok_lemma_by_structured_expr(mwe_toks, is_mwe=True)
-        # import pudb
-        # pudb.set_trace()
         _log.debug("Found mwe expressions:\n{}".format(\
             str([v for k, v in mwe_tok_lemmas_map.iteritems()])))
         # fetch URL links for lemmas
-        # tok_lemmas_map = self._merge_mwe_ne_maps(ne_tok_lemmas_map,
-        #                                          mwe_tok_lemmas_map)
-        # tok_lemmas_map = mwe_tok_lemmas_map
-        # l_tok_concepts_map = self._get_concepts_by_tok_lemma_map(
-        #     tok_lemmas_map, lang)
         # mark other tokens if _mark_without_ann - before filtering
         # to not take these filtered out tokens
         if self._mark_without_ann:
             # get the set difference between all doc and already selected tokens
             all_toks = set(doc_ctx.tokens)
-            # ann_toks = set(mwe_toks + ne_toks + wsd_tok_syn_id_map.values())
-            # without_ann_toks = all_toks.difference(mwe_toks).difference(ne_toks).difference(wsd_tok_syn_id_map.values())
             without_ann_toks = all_toks.difference(
                 doc_ctx.mwe_tokens(flatten=True))
             without_ann_toks = without_ann_toks.difference(
```
```diff
@@ -119,15 +82,11 @@ class EntityLinker(object):
-            #     wsd_tok_syn_id_map.values())  # there should be keys ?
             without_ann_toks = without_ann_toks.difference(
                 wsd_tok_syn_id_map.values())
-            # without_ann_toks = all_toks.difference(ann_toks)
             # build token - lemma map
             if without_ann_toks:
                 without_ann_t_l_map = doc_ctx.get_tok_lemma(without_ann_toks)
                 # TODO ^ change method get_tok_lemma_by_structured_expr to handle simple list
                 # filter out by lemma
-                # without_ann_t_l_map = {t: l for t, l in without_ann_t_l_map.iteritems() \
-                #     if self._doc_filter.apply_lemma_filters(l)\
-                #     and self._doc_filter.apply_token_filters(t)}
                 without_ann_t_l_map = doc_ctx.filter_tok_lemma_map(without_ann_t_l_map)
                 without_ann_concepts_map = self._get_concepts_by_tok_lemma_map(
```
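The two hunks above carve out the tokens that carry no annotation yet by successive set differences against the MWE, NE, and WSD tokens. A toy illustration of the same arithmetic (strings stand in for CCL token objects):

```python
all_toks = {"t1", "t2", "t3", "t4", "t5"}
mwe_toks = {"t1"}
ne_toks = {"t2"}
wsd_toks = {"t3"}

# Remove every already-annotated token, one annotation layer at a time.
without_ann_toks = all_toks.difference(mwe_toks)
without_ann_toks = without_ann_toks.difference(ne_toks)
without_ann_toks = without_ann_toks.difference(wsd_toks)
print(sorted(without_ann_toks))  # ['t4', 't5']
```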
```diff
@@ -153,7 +112,6 @@ class EntityLinker(object):
         flat_mwe_toks = _flatten_mwe_exprs(mwe_toks)
         flat_ne_toks = _flatten_mwe_exprs(ne_toks)
         flat_mwe_toks.extend(flat_ne_toks)
-        # print([t.orth_utf8() for t in flat_mwe_toks])
         tok_concepts_map = l_tok_concepts_map
         if s_tok_concepts_map:
             s_tok_concepts_map = self._filter_out_redundant_wsd_toks(
```
```diff
@@ -175,8 +133,6 @@ class EntityLinker(object):
         multi-word expression then lemma represented ne will be used.
         This approach is used to avoid mistakes when linking concepts.'''
         return merge_two_dicts(mwe_tok_lemmas_map, ne_tok_lemmas_map)
-        # for t, l in ne_tok_lemmas_map.iteritems():
-        #     print(t.orth_utf8(), len(l) if isinstance(l, list) else l)

     def _get_concepts_by_tok_lemma_map(self, tok_lemmas_map,
```
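Per the docstring, when a token is both an NE and part of an MWE, the NE lemma should win. A plausible reading of `merge_two_dicts` under that contract is the standard dict merge where the second argument's entries override the first's; the helper's real definition is not shown in this diff, so this is a hypothetical reconstruction:

```python
def merge_two_dicts(a, b):
    """Merge b into a copy of a; on key collisions b wins (assumed semantics)."""
    merged = a.copy()
    merged.update(b)
    return merged

mwe_map = {"t1": "multi word lemma", "t2": "other mwe"}
ne_map = {"t1": "Named Entity"}  # same token: the NE lemma takes precedence
print(merge_two_dicts(mwe_map, ne_map))
# {'t1': 'Named Entity', 't2': 'other mwe'}
```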
```diff
@@ -225,10 +181,6 @@ class EntityLinker(object):
             if k in mwe_toks:
                 del s_tok_concepts_map[k]
         return s_tok_concepts_map
-        # return {
-        #     k: v
-        #     for k, v in s_tok_concepts_map.iteritems() if k not in mwe_toks
-        # }

 def _flatten_mwe_exprs(mwe_toks):
```
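The kept code deletes redundant WSD entries in place, while the removed comment shows the equivalent dict-comprehension form. Both variants are sketched below with toy data; note that in-place deletion must iterate over a snapshot of the keys to avoid mutating the dict mid-iteration:

```python
s_tok_concepts_map = {"t1": ["c1"], "t2": ["c2"], "t3": ["c3"]}
mwe_toks = {"t2", "t3"}

# In-place variant (as kept in the commit): iterate over a copy of the keys.
for k in list(s_tok_concepts_map):
    if k in mwe_toks:
        del s_tok_concepts_map[k]
print(s_tok_concepts_map)  # {'t1': ['c1']}

# Comprehension variant (as in the removed comment), rebuilding the dict:
s_tok_concepts_map = {"t1": ["c1"], "t2": ["c2"], "t3": ["c3"]}
s_tok_concepts_map = {k: v for k, v in s_tok_concepts_map.items()
                      if k not in mwe_toks}
print(s_tok_concepts_map)  # {'t1': ['c1']}
```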
entity_linker/parser/ccl_parser.py

```diff
@@ -185,15 +185,11 @@ class CclParser(object):
         @return dict with token (head or representative token) as key and
         string (lemma) as value
         '''
-        # import pudb
-        # pudb.set_trace()
         tok_lemma_map = {}
         for expr in expr_toks_list:
             repr_tok = expr[0] if expr[0] is not None else expr[1][0]
             lemma = self._lemma_resolver.get_lemma_from_token_expr(
                 expr, is_mwe=is_mwe)
-            # print("lemma for '{}': {}".format(
-            #     self.get_structured_expr_str(expr), lemma))
             if lemma:
                 tok_lemma_map[repr_tok] = lemma
         return tok_lemma_map
```
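Each expression arrives in the `[head, (tail, of, expr)]` shape described by the `get_lemma_from_token_expr` docstring in the next hunk, so the representative token is the head when present, else the first tail token. A toy sketch of that selection (strings stand in for token objects):

```python
# Toy expressions in the [head, (tail, of, expr)] shape the parser expects.
expr_with_head = ("HEAD_TOK", ("TAIL_1", "TAIL_2"))
expr_without_head = (None, ("TAIL_1", "TAIL_2"))

def representative_token(expr):
    # Head token if the expression has one, otherwise the first tail token.
    return expr[0] if expr[0] is not None else expr[1][0]

print(representative_token(expr_with_head))     # HEAD_TOK
print(representative_token(expr_without_head))  # TAIL_1
```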
```diff
@@ -282,11 +278,6 @@ class LemmaResolver(object):
         self._ignore_interp = ignore_interp
         self._ignored_pos_list = config.ignored_pos

-    # def tokens_lemma(self, toks):
-        # search for head token
-        # for t in toks:
-        #     if self._has_chan_ann(t, sent, self._mwe_chan_name = self._config["mwe_chan_name"])

     def get_lemma_from_token_expr(self, expr_tok_list, is_mwe=False):
         '''Returns base form of lexeme, also for multiwords expressions.
         Requires following form of tokens sequence: [head, (tail, of, expr)].
@@ -319,20 +310,6 @@ class LemmaResolver(object):
             # doesn't have head
             return self._lemma_from_seq(tail)

-        # is_mwe = self._has_chan_ann(tok, sent, self._config["mwe_chan_name"])
-        # mwe_base is stored only in token with head
-        # if is_mwe:
-        #     if not token_metadata and token.has_metadata():
-        #         token_metadata - token.get_metadata()
-        #     is_head = tutils.is_head_of(sentence, token, self._config["mwe_chan_name"])
-        #     if is_head and token_metadata and token_metadata.has_attribute(
-        #             self._config["mwe_base_prop_key"]):
-        #         print("Calling is_head_of for {}".format(token.orth_utf8()))
-        #         return token_metadata.get_attribute(self._config["mwe_base_prop_key"])
-        #     else:
-        #         return token.get_preferred_lexeme(tagset).lemma_utf8()
-        #     return None

     def _lemma_from_mwe_head(self, head):
         if head.has_metadata():
             md = head.get_metadata()
```
entity_linker/run.py

```diff
@@ -118,10 +118,6 @@ def _build_entity_linker(dao_obj, config):
     return EntityLinker(dao_obj, doc_filter, config)

-# def _build_doc_context(doc, output, lang, tagset):
-#     return doc_cfg.document_context(doc, output, lang, tagset)

 def main(document_context=None, config=None, dao_obj=None, elinker=None):
     if not config:
         config = _process_configuration()
@@ -149,36 +145,10 @@ def main(document_context=None, config=None, dao_obj=None, elinker=None):
         .save_annotated_doc()

-# def main(document_context=None, config=None, dao_obj=None, elinker=None):
-#     if not config:
-#         config = _process_configuration()
-#     ccl_doc = ccldoc.CclDocument(config.doc)
-#
-#     ccl_parser = CclParser(ccl_doc, config)
-#     ccl_doc = ann_excluded_phrases(ccl_doc, config, doc_parser=ccl_parser)
-#     if not dao_obj:
-#         dao_obj = _build_dao(config)
-#     doc_parser = ccl_parser
-#     doc_filter = DocumentFilter(config)
-#     if not document_context:
-#         doc_context = DocumentContext(ccl_doc, doc_parser, doc_filter,
-#                                       config.lang, config.tagset)
-#
-#     if not elinker:
-#         elinker = EntityLinker(dao_obj, doc_parser, doc_filter, config)
-#
-#     tok_concepts_map = elinker.link_entities_for_doc(doc_context)
-#
-#     CclWriter(ccl_doc, tok_concepts_map, config.output, config)\
-#         .save_annotated_doc()

 if __name__ == "__main__":
     # note: to run without installing provide path to config.ini with absolute
     # paths in it. This will help to avoid resolving paths by pkg_source.
     #
-    # python run.py data/sample/00120827-no-urls.saper.lod.xml pl ./00120827.mwe.xml.wsd.elinker --config /home/gkostkowski/keyword-assignment/kwazon/entity-linker/config/config.ini
+    # python run.py -d data/sample/00120827-no-urls.saper.lod.xml -o ./00120827.mwe.xml.wsd.elinker --config /home/gkostkowski/keyword-assignment/kwazon/entity-linker/config/config.ini
     main()
```
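Note the shape of `main()`: it takes optional `document_context`, `config`, `dao_obj`, and `elinker` arguments and builds whatever the caller did not supply, so tests or embedding code can inject pre-built pieces. A stripped-down sketch of that dependency-injection pattern; the stub builders below are hypothetical stand-ins echoing the names in run.py, not the real implementations:

```python
# Hypothetical stand-ins so the sketch runs on its own.
def _process_configuration():
    return {"lang": "pl"}

def _build_dao(config):
    return "real-dao"

def _build_entity_linker(dao_obj, config):
    return ("EntityLinker", dao_obj, config["lang"])

def main(config=None, dao_obj=None, elinker=None):
    # Build each collaborator only if the caller has not injected one.
    if not config:
        config = _process_configuration()
    if not dao_obj:
        dao_obj = _build_dao(config)
    if not elinker:
        elinker = _build_entity_linker(dao_obj, config)
    return elinker

# A test can inject a fake DAO and reuse everything else:
print(main(dao_obj="fake-dao"))  # ('EntityLinker', 'fake-dao', 'pl')
```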