Commit f9038678 authored by Arkadiusz Janz

removed unnecessary and deprecated _try_decode and _try_encode functions which...

removed unnecessary and deprecated _try_decode and _try_encode functions which were used for Python 2; fixed an issue with annotation number check
parent 3df1bbf7
""" Helper functions for manipulating token attributes and annotations. """
from builtins import dict
from collections import defaultdict
from corpus2 import AnnotatedSentence_wrap_sentence as annotate_sentence
......@@ -24,7 +25,7 @@ class _RaiseClass(object):
RAISE = _RaiseClass()
def get_attributes(token, to_unicode=False):
def get_attributes(token):
"""
Get attributes of a token.
......@@ -32,8 +33,6 @@ def get_attributes(token, to_unicode=False):
Args:
token (Corpus2.token)
to_unicode (:obj:`bool`, optional): Cast keys and values to unicode.
(Default value = False)
Returns:
dict
......@@ -42,13 +41,10 @@ def get_attributes(token, to_unicode=False):
return {}
metadata = token.get_metadata()
attributes = dict(metadata.attributes())
if to_unicode:
attributes = {_try_decode(key): _try_decode(value)
for (key, value) in list(attributes.items())}
return attributes
def get_attribute(token, key, default=RAISE, to_unicode=False):
def get_attribute(token, key, default=RAISE):
"""
Get named attribute of a token.
......@@ -59,15 +55,11 @@ def get_attribute(token, key, default=RAISE, to_unicode=False):
key (object): Attribute name, automatically casted to string.
default (:obj:`object`, optional): If given, and key not found,
returns this value instead. Raises KeyError otherwise.
to_unicode (:obj:`bool`, optional): Cast value to unicode.
(Default value = False)
Returns:
str
"""
attributes = get_attributes(token, to_unicode)
if to_unicode:
key = _try_decode(key)
attributes = get_attributes(token)
if default is not RAISE:
return attributes.get(key, default)
return attributes[key]
......@@ -138,6 +130,26 @@ def get_annotations(sentence, token, tok_in_sent_index=None):
}
def group_by_annotation_number(sentence, key):
    """
    Bucket the tokens of a sentence by their annotation number.

    Every token yielded by ``sentence.tokens()`` is looked up with
    ``get_annotation(sentence, token, key)``. Tokens whose annotation
    number is not greater than zero are left out of the result.

    Args:
        sentence (Corpus2.Sentence): Sentence whose tokens are grouped.
        key (str): Annotation name forwarded to ``get_annotation``.

    Returns:
        dict: A ``defaultdict(list)`` mapping each positive annotation
            number to the tokens carrying it, in iteration order.
    """
    tokens_by_number = defaultdict(list)
    for tok in sentence.tokens():
        number = get_annotation(sentence, tok, key)
        if not number > 0:
            # Non-positive numbers are skipped, exactly as in the original.
            continue
        tokens_by_number[number].append(tok)
    return tokens_by_number
def _find_token(sentence, token):
for (index, token_in_sentence) in enumerate(sentence.tokens()):
if token_in_sentence.is_same(token):
......@@ -189,7 +201,7 @@ def set_annotation_for_token(sentence, token, key, value=None, set_head=False):
channel = ann_sentence.get_channel(key)
token_index = _find_token(sentence, token)
if value:
if value is not None:
try:
segment = int(value)
except TypeError:
......@@ -208,31 +220,3 @@ def is_head_of(sentence, token, key):
channel = ann_sentence.get_channel(key)
token_index = _find_token(sentence, token)
return channel.is_head_at(token_index)
def _try_decode(value):
try:
value = str(value)
except UnicodeEncodeError:
pass
try:
value = value.decode("utf-8")
except (UnicodeDecodeError, AttributeError):
pass
return value
def _try_encode(value):
try:
value = str(value)
except UnicodeEncodeError:
pass
try:
value = value.encode("utf-8")
except (UnicodeEncodeError, AttributeError):
pass
return value
......@@ -9,8 +9,8 @@ ENCODING = "utf-8"
__all__ = [
'copy_chunk',
'copy_sentence',
'copy_relation'
# todo: add 'copy_token' function
'copy_relation',
'sentence2str'
]
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment