Commit 0d507b8f authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski

Use ann chan number, implement ann removal, minor improvements

parent 3ff58725
Pipeline #4531 failed with stage
in 2 minutes
......@@ -20,7 +20,7 @@ build-env:
# help:... python) dependencies
.PHONY: rebuild-env
rebuild-env:
docker build -f docker/Dockerfile --no-cache -t cclutils-base
docker build . -f docker/Dockerfile -t cclutils-base --no-cache
# help: build-prod-env
# help:... build production container (used for CI/CD deploy)
......@@ -73,7 +73,7 @@ check-types-dev:
# help: test-dev
# help:... run tests inside the container (without rebuilding), mounts current
# help:... version of tests. To enable pudb (or pass other flags) run "make
# help:... version of tests. To enable pudb (or pass other flags) run "make
# help:... flags=--pudb test-dev"
.PHONY: test-dev
test-dev:
......
......@@ -14,6 +14,7 @@ from corpus2 import DocumentPtr, SentencePtr, Tagset, Token
__all__ = ["AnnotatedExpression", "DocumentAnnotations", "get_document_annotations"]
AnnDescr = Tuple[str, str, str, int] # ann_name, sent_id, par_id, chan_val
AnnRepr = Union[Tuple[str, ...], str, "AnnotatedExpression"]
SentPosition = Tuple[str, str]
TagsetRepr = Union[str, Tagset]
......@@ -37,6 +38,7 @@ class AnnotatedExpression(object):
sent: SentencePtr,
ann_name: str,
tok_position: TokenPosition,
chan_val: int,
tagset: Optional[TagsetRepr] = "nkjp",
ann_base_prop_name: Optional[str] = None,
doc: Optional[DocumentPtr] = None,
......@@ -54,6 +56,7 @@ class AnnotatedExpression(object):
ann_name: name of annotation (annotation channel).
tok_position: position of `token` in the document (tok_sent_idx,
sent_id, par_id).
chan_val: value in annotation channel.
tagset: name of `Tagset` object, defaults to 'nkjp'.
ann_base_prop_name: name of property stroring base form of annotation.
If not given then '{ann_name}_base' will be used as base prop name.
......@@ -63,11 +66,6 @@ class AnnotatedExpression(object):
self._sent = sent
self._ann_name = ann_name
self._ann_base_prop_name = ann_base_prop_name
self._pref_lex: Optional[Tuple[str, ...]] = None
self._tok_lemmas: Optional[Tuple[str, ...]] = None
self._ann_base_lemma: Optional[str] = None
self._ann_base_token: Optional[Token] = None
self._has_ann_base_lemma: Optional[bool] = None
self._doc = doc
if isinstance(tagset, str):
tagset = ccl.get_tagset(tagset)
......@@ -75,6 +73,12 @@ class AnnotatedExpression(object):
self.toks_ids = set([tok_position[0]])
self.sent_id = tok_position[1]
self.par_id = tok_position[2]
self._chan_val = chan_val
self._pref_lex: Optional[Tuple[str, ...]] = None
self._tok_lemmas: Optional[Tuple[str, ...]] = None
self._ann_base_lemma: Optional[str] = None
self._ann_base_token: Optional[Token] = None
self._has_ann_base_lemma: Optional[bool] = None
@property
def annotation_name(self) -> str:
......@@ -178,6 +182,11 @@ class AnnotatedExpression(object):
indexes, sent, par = self.position
return f"{par}:{sent}:{','.join(sorted(['t' + str(i) for i in indexes]))}"
@property
def ann_description(self) -> AnnDescr:
    """
    Return the tuple describing this annotation:
    (annotation name, sentence id, paragraph id, channel value).

    This tuple is the key under which the expression is stored in
    ``DocumentAnnotations.expressions_index``.
    """
    sent_id, par_id = self.position[1:]
    return (self.annotation_name, sent_id, par_id, self._chan_val)
@property
def has_ann_base_lemma(self) -> Optional[bool]:
"""
......@@ -188,14 +197,17 @@ class AnnotatedExpression(object):
self._find_ann_base_lemma()
return self._has_ann_base_lemma
def append(self, token: Token, tok_position: TokenPosition) -> None:
def append(self, token: Token, tok_position: TokenPosition, chan_val: int) -> None:
    """
    Extends annotation object by including next token belonging to that annotation.

    Args:
        token: token.
        tok_position: (tok_sent_idx, sent_id, par_id) triple identifying
            token in the document.
        chan_val: Value of the annotation channel.

    Raises:
        ValueError: if the token lies in a different sentence/paragraph,
            was already added, or its channel value differs from this
            annotation's channel value (see ``_check_position``).
    """
    # Validate sentence/paragraph/channel consistency before mutating state.
    self._check_position(*tok_position, chan_val)
    self._tokens.append(token)
    self.toks_ids.add(tok_position[0])
......@@ -279,10 +291,13 @@ class AnnotatedExpression(object):
(False, False, True): self.annotation_base_lemma,
}[(as_orths, as_lexemes, as_ann_base)]
def _check_position(self, tok_idx: int, sent_id: str, par_id: str) -> None:
def _check_position(
self, tok_idx: int, sent_id: str, par_id: str, chan_val: str
) -> None:
"""
Checks whether newly appended token is placed in same paragraph and
sentence as already present one.
sentence as already present one. Additionally, checks if the annotation
channel value for this token matches the values of the other tokens.
"""
if self.par_id and self.par_id != par_id:
raise ValueError(
......@@ -297,6 +312,12 @@ class AnnotatedExpression(object):
if self.toks_ids and tok_idx in self.toks_ids:
raise ValueError(f"Token at position {tok_idx} already added!")
# NOTE: adjacent string literals are concatenated by the parser, so the
# trailing space before the f-string is required — without it the message
# reads "channel valuesdiffers".
if chan_val != self._chan_val:
    raise ValueError(
        "Token does not belong to the same annotation (channel values "
        f"differ; is: {chan_val}, expected: {self._chan_val})!"
    )
def __eq__(self, other):
"""
Two annotated expressions are equal if their base expressions are equal
......@@ -320,6 +341,9 @@ class AnnotatedExpression(object):
(self.annotation_name, self.position, self.tokens_pref_lexemes_lowered)
)
def __len__(self):
    """Length of the annotated expression; delegates to the ``length`` property."""
    return self.length
def __repr__(self):
expr_str = f"{self._ann_name}:{self.tokens_orths}"
indexes, sent, par = self.position
......@@ -346,7 +370,7 @@ class DocumentAnnotations(object):
):
self._doc = doc
self.tagset = tagset
self._ann_dict: Dict[Tuple[str, str, str, int], AnnotatedExpression] = {}
self._ann_dict: Dict[AnnDescr, AnnotatedExpression] = {}
self._tok_pos_to_tok: Dict[TokenPosition, Token] = {}
self._tok_dict: Dict[TokenPosition, List[AnnotatedExpression]] = defaultdict(
list
......@@ -362,10 +386,12 @@ class DocumentAnnotations(object):
@property
def expressions_index(
self,
) -> Dict[Tuple[str, str, str, int], AnnotatedExpression]:
) -> Dict[AnnDescr, AnnotatedExpression]:
"""
Returns index of all annotations found in the document.
Index preserves original order of annotations in value lists.
Returns:
Dict:
key: Tuple[annotation_name, sent_id, par_id, chan_val]
......@@ -411,16 +437,22 @@ class DocumentAnnotations(object):
"""
if chan_val != 0 and (not accepted or ann_name in accepted):
sent_id, par_id = tok_pos[1], tok_pos[2]
ann_dict_key = (ann_name, sent_id, par_id, chan_val)
ann_descr: AnnDescr = (ann_name, sent_id, par_id, chan_val)
related_ann = None
if ann_dict_key in self._ann_dict:
related_ann = self._ann_dict[ann_dict_key]
related_ann.append(token, tok_pos)
if ann_descr in self._ann_dict:
related_ann = self._ann_dict[ann_descr]
related_ann.append(token, tok_pos, chan_val)
else:
related_ann = AnnotatedExpression(
token, sent, ann_name, tok_pos, tagset=self.tagset, doc=self._doc
token,
sent,
ann_name,
tok_pos,
chan_val,
tagset=self.tagset,
doc=self._doc,
)
self._ann_dict[ann_dict_key] = related_ann
self._ann_dict[ann_descr] = related_ann
self._tok_dict[tok_pos].append(related_ann)
self._tok_pos_to_tok[tok_pos] = token
......@@ -448,9 +480,21 @@ class DocumentAnnotations(object):
token, sent, tok_pos, ann, chan_val, accepted=accepted
)
def erase_annotation(
self, ann_name: str, sent_pos: SentPosition, chan_val: int
) -> None:
def erase_annotation_by_descr(self, ann_descr: AnnDescr) -> None:
    """
    Erase the annotation specified by its description from the sentence's
    tokens.

    Technically speaking, for all tokens related to the annotation specified
    by the input description, this method will set the values of the
    annotation channel to 0.

    Args:
        ann_descr: Tuple containing information about annotation:
            (ann name, sent id, par id, chan value)

    Raises:
        KeyError: if no annotation with the given description is present
            in ``expressions_index``.
    """
    related_ann = self.expressions_index[ann_descr]
    self.erase_annotation(related_ann)
def erase_annotation(self, ann: AnnotatedExpression) -> None:
"""
Erase annotation in sentence's tokens.
......@@ -462,19 +506,19 @@ class DocumentAnnotations(object):
ann_name: name of annotation to erase.
chan_val: current value for annotation in annotation channel.
"""
related_ann = self.expressions_index[(ann_name, *sent_pos, chan_val)]
if related_ann:
if ann:
ann_descr = ann.ann_description
# erase annotation in underlying tokens
related_ann.erase_annotation()
ann.erase_annotation()
# remove from `_tok_dict` (tok_pos -> anns index)
# TODO what about _tok_dict index?! maybe better option is to just
# mark `AnnotatedExpression` instance as stale?
for pos in related_ann.tokens_positions:
self._tok_dict[pos].remove(related_ann)
for pos in ann.tokens_positions:
self._tok_dict[pos].remove(ann)
# delete from base index
del self.expressions_index[(ann_name, *sent_pos, chan_val)]
del self.expressions_index[ann_descr]
def group_by_chan_name(
self,
......@@ -485,6 +529,9 @@ class DocumentAnnotations(object):
"""
Returns annotations grouped by channel name, in one of specified forms.
Original (document) order of annotations in value lists is preserved
(in case of two or more annotations of certain type in document).
Args:
as_orths: returns orths instead of `AnnotatedExpression` instances
as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
......
......@@ -6,7 +6,7 @@ setup(
author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
description='''A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.''',
version='1.1',
version='1.1.1rc1',
packages=['cclutils', 'cclutils.extras'],
zip_safe=False
)
......@@ -111,9 +111,9 @@ def test_erase_ann_with_base():
sent_pos = ("s1", "ch1")
ann_name = "room_type"
chan_val = 1
dla_dwoch_osob_room_type_pos = (ann_name, *sent_pos, chan_val)
dla_dwoch_osob_room_type_ann_descr = (ann_name, *sent_pos, chan_val)
dla_dwoch_osob_room_type_ann = anns.expressions_index[
(dla_dwoch_osob_room_type_pos)
dla_dwoch_osob_room_type_ann_descr
]
ann_tokens = dla_dwoch_osob_room_type_ann._tokens
ann_sent = dla_dwoch_osob_room_type_ann._sent
......@@ -145,7 +145,7 @@ def test_erase_ann_with_base():
raise AssertionError("Document does not meet initial test conditions!") from e
# when
anns.erase_annotation(ann_name, sent_pos, chan_val)
anns.erase_annotation_by_descr(dla_dwoch_osob_room_type_ann_descr)
# then
assert ccl.get_annotation(ann_sent, dla_tok, "room_type") == 0
......@@ -178,11 +178,13 @@ def test_erase_ann_with_base():
# test in-memory object
with pytest.raises(KeyError):
anns.expressions_index[(dla_dwoch_osob_room_type_pos)]
anns.expressions_index[dla_dwoch_osob_room_type_ann_descr]
# finally re-read annotations and check
with pytest.raises(KeyError):
get_document_annotations(doc).expressions_index[(dla_dwoch_osob_room_type_pos)]
get_document_annotations(doc).expressions_index[
dla_dwoch_osob_room_type_ann_descr
]
def test_erase_ann_without_base():
......@@ -191,8 +193,35 @@ def test_erase_ann_without_base():
sent_pos = ("s1", "ch1")
ann_name = "region"
chan_val = 1
pos = (ann_name, *sent_pos, chan_val)
region_ann = anns.expressions_index[pos]
ann_descr = (ann_name, *sent_pos, chan_val)
region_ann = anns.expressions_index[ann_descr]
gdansk_tok = region_ann._tokens[0]
ann_sent = region_ann._sent
# given
try:
assert not region_ann.has_ann_base_lemma
assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
assert ccl.get_annotation(ann_sent, gdansk_tok, "region") == 1
except AssertionError as e:
raise AssertionError("Document does not meet initial test conditions!") from e
# when
anns.erase_annotation_by_descr(ann_descr)
# then
assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
assert ccl.get_annotation(ann_sent, gdansk_tok, "region") == 0
def test_erase_ann_by_obj_without_base():
doc = ccl.read(CCL_TEST_PATH_02)
anns = get_document_annotations(doc)
sent_pos = ("s1", "ch1")
ann_name = "region"
chan_val = 1
ann_descr = (ann_name, *sent_pos, chan_val)
region_ann = anns.expressions_index[ann_descr]
gdansk_tok = region_ann._tokens[0]
ann_sent = region_ann._sent
......@@ -205,7 +234,7 @@ def test_erase_ann_without_base():
raise AssertionError("Document does not meet initial test conditions!") from e
# when
anns.erase_annotation(ann_name, sent_pos, chan_val)
anns.erase_annotation(region_ann)
# then
assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment