Commit 0d507b8f authored by Grzegorz Kostkowski

Use ann chan number, implement ann removal, minor improvements

parent 3ff58725
Pipeline #4531 failed
@@ -20,7 +20,7 @@ build-env:
 # help:... python) dependencies
 .PHONY: rebuild-env
 rebuild-env:
-	docker build -f docker/Dockerfile --no-cache -t cclutils-base
+	docker build . -f docker/Dockerfile -t cclutils-base --no-cache
 # help: build-prod-env
 # help:... build production container (used for CI/CD deploy)
...
@@ -14,6 +14,7 @@ from corpus2 import DocumentPtr, SentencePtr, Tagset, Token
 __all__ = ["AnnotatedExpression", "DocumentAnnotations", "get_document_annotations"]
+AnnDescr = Tuple[str, str, str, int]  # ann_name, sent_id, par_id, chan_val
 AnnRepr = Union[Tuple[str, ...], str, "AnnotatedExpression"]
 SentPosition = Tuple[str, str]
 TagsetRepr = Union[str, Tagset]
@@ -37,6 +38,7 @@ class AnnotatedExpression(object):
         sent: SentencePtr,
         ann_name: str,
         tok_position: TokenPosition,
+        chan_val: int,
         tagset: Optional[TagsetRepr] = "nkjp",
         ann_base_prop_name: Optional[str] = None,
         doc: Optional[DocumentPtr] = None,
@@ -54,6 +56,7 @@ class AnnotatedExpression(object):
             ann_name: name of annotation (annotation channel).
             tok_position: position of `token` in the document (tok_sent_idx,
                 sent_id, par_id).
+            chan_val: value in annotation channel.
             tagset: name of `Tagset` object, defaults to 'nkjp'.
             ann_base_prop_name: name of property storing base form of annotation.
                 If not given then '{ann_name}_base' will be used as base prop name.
@@ -63,11 +66,6 @@ class AnnotatedExpression(object):
         self._sent = sent
         self._ann_name = ann_name
         self._ann_base_prop_name = ann_base_prop_name
-        self._pref_lex: Optional[Tuple[str, ...]] = None
-        self._tok_lemmas: Optional[Tuple[str, ...]] = None
-        self._ann_base_lemma: Optional[str] = None
-        self._ann_base_token: Optional[Token] = None
-        self._has_ann_base_lemma: Optional[bool] = None
         self._doc = doc
         if isinstance(tagset, str):
             tagset = ccl.get_tagset(tagset)
@@ -75,6 +73,12 @@ class AnnotatedExpression(object):
         self.toks_ids = set([tok_position[0]])
         self.sent_id = tok_position[1]
         self.par_id = tok_position[2]
+        self._chan_val = chan_val
+        self._pref_lex: Optional[Tuple[str, ...]] = None
+        self._tok_lemmas: Optional[Tuple[str, ...]] = None
+        self._ann_base_lemma: Optional[str] = None
+        self._ann_base_token: Optional[Token] = None
+        self._has_ann_base_lemma: Optional[bool] = None

     @property
     def annotation_name(self) -> str:
@@ -178,6 +182,11 @@ class AnnotatedExpression(object):
         indexes, sent, par = self.position
         return f"{par}:{sent}:{','.join(sorted(['t' + str(i) for i in indexes]))}"

+    @property
+    def ann_description(self) -> AnnDescr:
+        sent_id, par_id = self.position[1:]
+        return (self.annotation_name, sent_id, par_id, self._chan_val)
+
     @property
     def has_ann_base_lemma(self) -> Optional[bool]:
         """
@@ -188,14 +197,17 @@ class AnnotatedExpression(object):
             self._find_ann_base_lemma()
         return self._has_ann_base_lemma

-    def append(self, token: Token, tok_position: TokenPosition) -> None:
+    def append(self, token: Token, tok_position: TokenPosition, chan_val: int) -> None:
         """
         Extends annotation object by including next token belonging to that annotation.

         Args:
-            tok_position: (tok_sent_idx, sent_id, par_id)
+            token: token.
+            tok_position: (tok_sent_idx, sent_id, par_id) triple identifying
+                token in the document.
+            chan_val: value of the annotation channel.
         """
-        self._check_position(*tok_position)
+        self._check_position(*tok_position, chan_val)
         self._tokens.append(token)
         self.toks_ids.add(tok_position[0])
@@ -279,10 +291,13 @@ class AnnotatedExpression(object):
             (False, False, True): self.annotation_base_lemma,
         }[(as_orths, as_lexemes, as_ann_base)]

-    def _check_position(self, tok_idx: int, sent_id: str, par_id: str) -> None:
+    def _check_position(
+        self, tok_idx: int, sent_id: str, par_id: str, chan_val: int
+    ) -> None:
         """
         Checks whether newly appended token is placed in same paragraph and
-        sentence as already present one.
+        sentence as already present one. Additionally, checks whether the
+        annotation channel value for this token matches the values of the
+        other tokens.
         """
         if self.par_id and self.par_id != par_id:
             raise ValueError(
@@ -297,6 +312,12 @@ class AnnotatedExpression(object):
         if self.toks_ids and tok_idx in self.toks_ids:
             raise ValueError(f"Token at position {tok_idx} already added!")

+        if chan_val != self._chan_val:
+            raise ValueError(
+                "Token does not belong to the same annotation (channel values "
+                f"differ; is: {chan_val}, expected: {self._chan_val})!"
+            )
+
     def __eq__(self, other):
         """
         Two annotated expressions are equal if their base expressions are equal
@@ -320,6 +341,9 @@ class AnnotatedExpression(object):
             (self.annotation_name, self.position, self.tokens_pref_lexemes_lowered)
         )

+    def __len__(self):
+        return self.length
+
     def __repr__(self):
         expr_str = f"{self._ann_name}:{self.tokens_orths}"
         indexes, sent, par = self.position
@@ -346,7 +370,7 @@ class DocumentAnnotations(object):
     ):
         self._doc = doc
         self.tagset = tagset
-        self._ann_dict: Dict[Tuple[str, str, str, int], AnnotatedExpression] = {}
+        self._ann_dict: Dict[AnnDescr, AnnotatedExpression] = {}
         self._tok_pos_to_tok: Dict[TokenPosition, Token] = {}
         self._tok_dict: Dict[TokenPosition, List[AnnotatedExpression]] = defaultdict(
             list
@@ -362,10 +386,12 @@ class DocumentAnnotations(object):
     @property
     def expressions_index(
         self,
-    ) -> Dict[Tuple[str, str, str, int], AnnotatedExpression]:
+    ) -> Dict[AnnDescr, AnnotatedExpression]:
         """
         Returns index of all annotations found in the document.

+        Index preserves the original (document) order of annotations.
+
         Returns:
             Dict:
                 key: Tuple[annotation_name, sent_id, par_id, chan_val]
@@ -411,16 +437,22 @@ class DocumentAnnotations(object):
         """
         if chan_val != 0 and (not accepted or ann_name in accepted):
             sent_id, par_id = tok_pos[1], tok_pos[2]
-            ann_dict_key = (ann_name, sent_id, par_id, chan_val)
+            ann_descr: AnnDescr = (ann_name, sent_id, par_id, chan_val)
             related_ann = None
-            if ann_dict_key in self._ann_dict:
-                related_ann = self._ann_dict[ann_dict_key]
-                related_ann.append(token, tok_pos)
+            if ann_descr in self._ann_dict:
+                related_ann = self._ann_dict[ann_descr]
+                related_ann.append(token, tok_pos, chan_val)
             else:
                 related_ann = AnnotatedExpression(
-                    token, sent, ann_name, tok_pos, tagset=self.tagset, doc=self._doc
+                    token,
+                    sent,
+                    ann_name,
+                    tok_pos,
+                    chan_val,
+                    tagset=self.tagset,
+                    doc=self._doc,
                 )
-                self._ann_dict[ann_dict_key] = related_ann
+                self._ann_dict[ann_descr] = related_ann
             self._tok_dict[tok_pos].append(related_ann)
             self._tok_pos_to_tok[tok_pos] = token
@@ -448,9 +480,21 @@ class DocumentAnnotations(object):
                 token, sent, tok_pos, ann, chan_val, accepted=accepted
             )

-    def erase_annotation(
-        self, ann_name: str, sent_pos: SentPosition, chan_val: int
-    ) -> None:
+    def erase_annotation_by_descr(self, ann_descr: AnnDescr) -> None:
+        """
+        Erase annotation specified by its description in the sentence's tokens.
+
+        Technically speaking, for all tokens related to the annotation specified
+        by the input params, this method sets the value of the annotation
+        channel to 0.
+
+        Args:
+            ann_descr: Tuple describing the annotation:
+                (ann_name, sent_id, par_id, chan_val)
+        """
+        related_ann = self.expressions_index[ann_descr]
+        self.erase_annotation(related_ann)
+
+    def erase_annotation(self, ann: AnnotatedExpression) -> None:
         """
         Erase annotation in the sentence's tokens.
@@ -462,19 +506,19 @@
             ann_name: name of annotation to erase.
             chan_val: current value for annotation in annotation channel.
         """
-        related_ann = self.expressions_index[(ann_name, *sent_pos, chan_val)]
-        if related_ann:
+        if ann:
+            ann_descr = ann.ann_description
             # erase annotation in underlying tokens
-            related_ann.erase_annotation()
+            ann.erase_annotation()

             # remove from `_tok_dict` (tok_pos -> anns index)
             # TODO what about _tok_dict index?! maybe better option is to just
             # mark `AnnotatedExpression` instance as stale?
-            for pos in related_ann.tokens_positions:
-                self._tok_dict[pos].remove(related_ann)
+            for pos in ann.tokens_positions:
+                self._tok_dict[pos].remove(ann)

             # delete from base index
-            del self.expressions_index[(ann_name, *sent_pos, chan_val)]
+            del self.expressions_index[ann_descr]

     def group_by_chan_name(
         self,
@@ -485,6 +529,9 @@ class DocumentAnnotations(object):
         """
         Returns annotations grouped by channel name, in one of specified forms.

+        The original (document) order of annotations in the value lists is
+        preserved (in case there are two or more annotations of a given type
+        in the document).
+
         Args:
             as_orths: returns orths instead of `AnnotatedExpression` instances
             as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
...
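For context, a minimal usage sketch of the reworked removal API, pieced together from the diff above and the tests further down. The input file name and the import path of `get_document_annotations` are assumptions for illustration, not part of the commit; `ccl` stands for the cclutils top-level API (`ccl.read`, `ccl.get_annotation`, ...) as used in the test module below.

    import cclutils as ccl
    from cclutils.extras.annotations import get_document_annotations  # assumed import path

    doc = ccl.read("example.ccl")  # hypothetical CCL document
    anns = get_document_annotations(doc)

    # Annotations are now indexed by an AnnDescr tuple:
    # (ann_name, sent_id, par_id, chan_val)
    ann_descr = ("region", "s1", "ch1", 1)
    region_ann = anns.expressions_index[ann_descr]
    assert region_ann.ann_description == ann_descr

    # Erase either by descriptor ...
    anns.erase_annotation_by_descr(ann_descr)
    # ... or, given the expression object itself, by object:
    # anns.erase_annotation(region_ann)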
@@ -6,7 +6,7 @@ setup(
     author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
     description='''A convenient API based on Corpus2 library for analyzing textual
 corpora in CCL format.''',
-    version='1.1',
+    version='1.1.1rc1',
     packages=['cclutils', 'cclutils.extras'],
     zip_safe=False
 )
@@ -111,9 +111,9 @@ def test_erase_ann_with_base():
     sent_pos = ("s1", "ch1")
     ann_name = "room_type"
     chan_val = 1
-    dla_dwoch_osob_room_type_pos = (ann_name, *sent_pos, chan_val)
+    dla_dwoch_osob_room_type_ann_descr = (ann_name, *sent_pos, chan_val)
     dla_dwoch_osob_room_type_ann = anns.expressions_index[
-        (dla_dwoch_osob_room_type_pos)
+        dla_dwoch_osob_room_type_ann_descr
     ]
     ann_tokens = dla_dwoch_osob_room_type_ann._tokens
     ann_sent = dla_dwoch_osob_room_type_ann._sent
@@ -145,7 +145,7 @@ def test_erase_ann_with_base():
         raise AssertionError("Document does not meet initial test conditions!") from e

     # when
-    anns.erase_annotation(ann_name, sent_pos, chan_val)
+    anns.erase_annotation_by_descr(dla_dwoch_osob_room_type_ann_descr)

     # then
     assert ccl.get_annotation(ann_sent, dla_tok, "room_type") == 0
@@ -178,11 +178,13 @@ def test_erase_ann_with_base():
     # test in-memory object
     with pytest.raises(KeyError):
-        anns.expressions_index[(dla_dwoch_osob_room_type_pos)]
+        anns.expressions_index[dla_dwoch_osob_room_type_ann_descr]

     # finally re-read annotations and check
     with pytest.raises(KeyError):
-        get_document_annotations(doc).expressions_index[(dla_dwoch_osob_room_type_pos)]
+        get_document_annotations(doc).expressions_index[
+            dla_dwoch_osob_room_type_ann_descr
+        ]


 def test_erase_ann_without_base():
@@ -191,8 +193,35 @@ def test_erase_ann_without_base():
     sent_pos = ("s1", "ch1")
     ann_name = "region"
     chan_val = 1
-    pos = (ann_name, *sent_pos, chan_val)
-    region_ann = anns.expressions_index[pos]
+    ann_descr = (ann_name, *sent_pos, chan_val)
+    region_ann = anns.expressions_index[ann_descr]
+    gdansk_tok = region_ann._tokens[0]
+    ann_sent = region_ann._sent
+
+    # given
+    try:
+        assert not region_ann.has_ann_base_lemma
+        assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
+        assert ccl.get_annotation(ann_sent, gdansk_tok, "region") == 1
+    except AssertionError as e:
+        raise AssertionError("Document does not meet initial test conditions!") from e
+
+    # when
+    anns.erase_annotation_by_descr(ann_descr)
+
+    # then
+    assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
+    assert ccl.get_annotation(ann_sent, gdansk_tok, "region") == 0
+
+
+def test_erase_ann_by_obj_without_base():
+    doc = ccl.read(CCL_TEST_PATH_02)
+    anns = get_document_annotations(doc)
+    sent_pos = ("s1", "ch1")
+    ann_name = "region"
+    chan_val = 1
+    ann_descr = (ann_name, *sent_pos, chan_val)
+    region_ann = anns.expressions_index[ann_descr]
     gdansk_tok = region_ann._tokens[0]
     ann_sent = region_ann._sent
@@ -205,7 +234,7 @@ def test_erase_ann_without_base():
         raise AssertionError("Document does not meet initial test conditions!") from e

     # when
-    anns.erase_annotation(ann_name, sent_pos, chan_val)
+    anns.erase_annotation(region_ann)

     # then
     assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
...
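One behaviour added in this commit that the tests above do not exercise is the channel-value consistency check in `_check_position`: appending a token whose channel value differs from the expression's now raises `ValueError`. A hypothetical test in the style of the existing ones (an illustrative sketch, not part of the commit) could look like this, reusing the module-level `ccl`, `get_document_annotations`, `pytest` and `CCL_TEST_PATH_02` from the test file:

    def test_append_rejects_other_chan_val():
        doc = ccl.read(CCL_TEST_PATH_02)
        anns = get_document_annotations(doc)
        region_ann = anns.expressions_index[("region", "s1", "ch1", 1)]
        token = region_ann._tokens[0]
        # chan_val 2 does not match the expression's chan_val 1, so the new
        # check added to _check_position is expected to raise.
        with pytest.raises(ValueError):
            region_ann.append(token, (99, "s1", "ch1"), 2)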