Commit 0d507b8f authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski

Use ann chan number, implement ann removal, minor improvements

parent 3ff58725
Pipeline #4531 failed with stage
in 2 minutes
......@@ -20,7 +20,7 @@ build-env:
# help:... python) dependencies
.PHONY: rebuild-env
rebuild-env:
docker build -f docker/Dockerfile --no-cache -t cclutils-base
docker build . -f docker/Dockerfile -t cclutils-base --no-cache
# help: build-prod-env
# help:... build production container (used for CI/CD deploy)
......@@ -73,7 +73,7 @@ check-types-dev:
# help: test-dev
# help:... run tests inside the container (without rebuilding), mounts current
# help:... version of tests. To enable pudb (or pass other flags) run "make
# help:... version of tests. To enable pudb (or pass other flags) run "make
# help:... flags=--pudb test-dev"
.PHONY: test-dev
test-dev:
......
......@@ -14,6 +14,7 @@ from corpus2 import DocumentPtr, SentencePtr, Tagset, Token
__all__ = ["AnnotatedExpression", "DocumentAnnotations", "get_document_annotations"]
AnnDescr = Tuple[str, str, str, int] # ann_name, sent_id, par_id, chan_val
AnnRepr = Union[Tuple[str, ...], str, "AnnotatedExpression"]
SentPosition = Tuple[str, str]
TagsetRepr = Union[str, Tagset]
......@@ -37,6 +38,7 @@ class AnnotatedExpression(object):
sent: SentencePtr,
ann_name: str,
tok_position: TokenPosition,
chan_val: int,
tagset: Optional[TagsetRepr] = "nkjp",
ann_base_prop_name: Optional[str] = None,
doc: Optional[DocumentPtr] = None,
......@@ -54,6 +56,7 @@ class AnnotatedExpression(object):
ann_name: name of annotation (annotation channel).
tok_position: position of `token` in the document (tok_sent_idx,
sent_id, par_id).
chan_val: value in annotation channel.
tagset: name of `Tagset` object, defaults to 'nkjp'.
ann_base_prop_name: name of property stroring base form of annotation.
If not given then '{ann_name}_base' will be used as base prop name.
......@@ -63,11 +66,6 @@ class AnnotatedExpression(object):
self._sent = sent
self._ann_name = ann_name
self._ann_base_prop_name = ann_base_prop_name
self._pref_lex: Optional[Tuple[str, ...]] = None
self._tok_lemmas: Optional[Tuple[str, ...]] = None
self._ann_base_lemma: Optional[str] = None
self._ann_base_token: Optional[Token] = None
self._has_ann_base_lemma: Optional[bool] = None
self._doc = doc
if isinstance(tagset, str):
tagset = ccl.get_tagset(tagset)
......@@ -75,6 +73,12 @@ class AnnotatedExpression(object):
self.toks_ids = set([tok_position[0]])
self.sent_id = tok_position[1]
self.par_id = tok_position[2]
self._chan_val = chan_val
self._pref_lex: Optional[Tuple[str, ...]] = None
self._tok_lemmas: Optional[Tuple[str, ...]] = None
self._ann_base_lemma: Optional[str] = None
self._ann_base_token: Optional[Token] = None
self._has_ann_base_lemma: Optional[bool] = None
@property
def annotation_name(self) -> str:
......@@ -178,6 +182,11 @@ class AnnotatedExpression(object):
indexes, sent, par = self.position
return f"{par}:{sent}:{','.join(sorted(['t' + str(i) for i in indexes]))}"
@property
def ann_description(self) -> AnnDescr:
    """
    Return the tuple describing this annotation:
    (annotation name, sentence id, paragraph id, channel value).

    This tuple is the key under which the expression is stored in
    ``DocumentAnnotations.expressions_index``.
    """
    sent_id, par_id = self.position[1:]
    return (self.annotation_name, sent_id, par_id, self._chan_val)
@property
def has_ann_base_lemma(self) -> Optional[bool]:
"""
......@@ -188,14 +197,17 @@ class AnnotatedExpression(object):
self._find_ann_base_lemma()
return self._has_ann_base_lemma
def append(self, token: Token, tok_position: TokenPosition) -> None:
def append(self, token: Token, tok_position: TokenPosition, chan_val: int) -> None:
    """
    Extends annotation object by including next token belonging to that annotation.

    Args:
        token: token.
        tok_position: (tok_sent_idx, sent_id, par_id) triple identifying
            token in the document.
        chan_val: Value of the annotation channel.

    Raises:
        ValueError: if the token lies in a different sentence/paragraph,
            was already added, or its channel value differs from this
            annotation's channel value (see ``_check_position``).
    """
    # Validate sentence/paragraph/channel consistency before mutating state.
    self._check_position(*tok_position, chan_val)
    self._tokens.append(token)
    self.toks_ids.add(tok_position[0])
......@@ -279,10 +291,13 @@ class AnnotatedExpression(object):
(False, False, True): self.annotation_base_lemma,
}[(as_orths, as_lexemes, as_ann_base)]
def _check_position(self, tok_idx: int, sent_id: str, par_id: str) -> None:
def _check_position(
self, tok_idx: int, sent_id: str, par_id: str, chan_val: str
) -> None:
"""
Checks whether newly appended token is placed in same paragraph and
sentence as already present one.
sentence as already present one. Additionally, checks if the annotation
channel value for this token matches the values of the other tokens.
"""
if self.par_id and self.par_id != par_id:
raise ValueError(
......@@ -297,6 +312,12 @@ class AnnotatedExpression(object):
if self.toks_ids and tok_idx in self.toks_ids:
raise ValueError(f"Token at position {tok_idx} already added!")
# NOTE: adjacent string literals are concatenated by the parser, so the
# trailing space before the f-string is required — without it the message
# reads "channel valuesdiffers".
if chan_val != self._chan_val:
    raise ValueError(
        "Token does not belong to the same annotation (channel values "
        f"differ; is: {chan_val}, expected: {self._chan_val})!"
    )
def __eq__(self, other):
"""
Two annotated expressions are equal if their base expressions are equal
......@@ -320,6 +341,9 @@ class AnnotatedExpression(object):
(self.annotation_name, self.position, self.tokens_pref_lexemes_lowered)
)
def __len__(self):
    """Length of the annotated expression; delegates to the ``length`` property."""
    return self.length
def __repr__(self):
expr_str = f"{self._ann_name}:{self.tokens_orths}"
indexes, sent, par = self.position
......@@ -346,7 +370,7 @@ class DocumentAnnotations(object):
):
self._doc = doc
self.tagset = tagset
self._ann_dict: Dict[Tuple[str, str, str, int], AnnotatedExpression] = {}
self._ann_dict: Dict[AnnDescr, AnnotatedExpression] = {}
self._tok_pos_to_tok: Dict[TokenPosition, Token] = {}
self._tok_dict: Dict[TokenPosition, List[AnnotatedExpression]] = defaultdict(
list
......@@ -362,10 +386,12 @@ class DocumentAnnotations(object):
@property
def expressions_index(
self,
) -> Dict[Tuple[str, str, str, int], AnnotatedExpression]:
) -> Dict[AnnDescr, AnnotatedExpression]:
"""
Returns index of all annotations found in the document.
Index preserves original order of annotations in value lists.
Returns:
Dict:
key: Tuple[annotation_name, sent_id, par_id, chan_val]
......@@ -411,16 +437,22 @@ class DocumentAnnotations(object):
"""
if chan_val != 0 and (not accepted or ann_name in accepted):
sent_id, par_id = tok_pos[1], tok_pos[2]
ann_dict_key = (ann_name, sent_id, par_id, chan_val)
ann_descr: AnnDescr = (ann_name, sent_id, par_id, chan_val)
related_ann = None
if ann_dict_key in self._ann_dict:
related_ann = self._ann_dict[ann_dict_key]
related_ann.append(token, tok_pos)
if ann_descr in self._ann_dict:
related_ann = self._ann_dict[ann_descr]
related_ann.append(token, tok_pos, chan_val)
else:
related_ann = AnnotatedExpression(
token, sent, ann_name, tok_pos, tagset=self.tagset, doc=self._doc
token,
sent,
ann_name,
tok_pos,
chan_val,
tagset=self.tagset,
doc=self._doc,
)
self._ann_dict[ann_dict_key] = related_ann
self._ann_dict[ann_descr] = related_ann
self._tok_dict[tok_pos].append(related_ann)
self._tok_pos_to_tok[tok_pos] = token
......@@ -448,9 +480,21 @@ class DocumentAnnotations(object):
token, sent, tok_pos, ann, chan_val, accepted=accepted
)
def erase_annotation(
self, ann_name: str, sent_pos: SentPosition, chan_val: int
) -> None:
def erase_annotation_by_descr(self, ann_descr: AnnDescr) -> None:
    """
    Erase the annotation specified by its description from the sentence's
    tokens.

    Technically speaking, for all tokens related to the annotation specified
    by the input description, this method will set the values of the
    annotation channel to 0.

    Args:
        ann_descr: Tuple containing information about annotation:
            (ann name, sent id, par id, chan value)

    Raises:
        KeyError: if no annotation with the given description is present
            in ``expressions_index``.
    """
    related_ann = self.expressions_index[ann_descr]
    self.erase_annotation(related_ann)
def erase_annotation(self, ann: AnnotatedExpression) -> None:
"""
Erase annotation in sentence's tokens.
......@@ -462,19 +506,19 @@ class DocumentAnnotations(object):
ann_name: name of annotation to erase.
chan_val: current value for annotation in annotation channel.
"""
related_ann = self.expressions_index[(ann_name, *sent_pos, chan_val)]
if related_ann:
if ann:
ann_descr = ann.ann_description
# erase annotation in underlying tokens
related_ann.erase_annotation()
ann.erase_annotation()
# remove from `_tok_dict` (tok_pos -> anns index)
# TODO what about _tok_dict index?! maybe better option is to just
# mark `AnnotatedExpression` instance as stale?
for pos in related_ann.tokens_positions:
self._tok_dict[pos].remove(related_ann)
for pos in ann.tokens_positions:
self._tok_dict[pos].remove(ann)
# delete from base index
del self.expressions_index[(ann_name, *sent_pos, chan_val)]
del self.expressions_index[ann_descr]
def group_by_chan_name(
self,
......@@ -485,6 +529,9 @@ class DocumentAnnotations(object):
"""
Returns annotations grouped by channel name, in one of specified forms.
Original (document) order of annotations in value lists is preserved
(in case of two or more annotations of certain type in document).
Args:
as_orths: returns orths instead of `AnnotatedExpression` instances
as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
......
......@@ -6,7 +6,7 @@ setup(
author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
description='''A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.''',
version='1.1',
version='1.1.1rc1',
packages=['cclutils', 'cclutils.extras'],
zip_safe=False
)
......@@ -111,9 +111,9 @@ def test_erase_ann_with_base():
sent_pos = ("s1", "ch1")
ann_name = "room_type"
chan_val = 1
dla_dwoch_osob_room_type_pos = (ann_name, *sent_pos, chan_val)
dla_dwoch_osob_room_type_ann_descr = (ann_name, *sent_pos, chan_val)
dla_dwoch_osob_room_type_ann = anns.expressions_index[
(dla_dwoch_osob_room_type_pos)
dla_dwoch_osob_room_type_ann_descr
]
ann_tokens = dla_dwoch_osob_room_type_ann._tokens
ann_sent = dla_dwoch_osob_room_type_ann._sent
......@@ -145,7 +145,7 @@ def test_erase_ann_with_base():
raise AssertionError("Document does not meet initial test conditions!") from e
# when
anns.erase_annotation(ann_name, sent_pos, chan_val)
anns.erase_annotation_by_descr(dla_dwoch_osob_room_type_ann_descr)
# then
assert ccl.get_annotation(ann_sent, dla_tok, "room_type") == 0
......@@ -178,11 +178,13 @@ def test_erase_ann_with_base():
# test in-memory object
with pytest.raises(KeyError):
anns.expressions_index[(dla_dwoch_osob_room_type_pos)]
anns.expressions_index[dla_dwoch_osob_room_type_ann_descr]
# finally re-read annotations and check
with pytest.raises(KeyError):
get_document_annotations(doc).expressions_index[(dla_dwoch_osob_room_type_pos)]
get_document_annotations(doc).expressions_index[
dla_dwoch_osob_room_type_ann_descr
]
def test_erase_ann_without_base():
......@@ -191,8 +193,35 @@ def test_erase_ann_without_base():
sent_pos = ("s1", "ch1")
ann_name = "region"
chan_val = 1
pos = (ann_name, *sent_pos, chan_val)
region_ann = anns.expressions_index[pos]
ann_descr = (ann_name, *sent_pos, chan_val)
region_ann = anns.expressions_index[ann_descr]
gdansk_tok = region_ann._tokens[0]
ann_sent = region_ann._sent
# given
try:
assert not region_ann.has_ann_base_lemma
assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
assert ccl.get_annotation(ann_sent, gdansk_tok, "region") == 1
except AssertionError as e:
raise AssertionError("Document does not meet initial test conditions!") from e
# when
anns.erase_annotation_by_descr(ann_descr)
# then
assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
assert ccl.get_annotation(ann_sent, gdansk_tok, "region") == 0
def test_erase_ann_by_obj_without_base():
doc = ccl.read(CCL_TEST_PATH_02)
anns = get_document_annotations(doc)
sent_pos = ("s1", "ch1")
ann_name = "region"
chan_val = 1
ann_descr = (ann_name, *sent_pos, chan_val)
region_ann = anns.expressions_index[ann_descr]
gdansk_tok = region_ann._tokens[0]
ann_sent = region_ann._sent
......@@ -205,7 +234,7 @@ def test_erase_ann_without_base():
raise AssertionError("Document does not meet initial test conditions!") from e
# when
anns.erase_annotation(ann_name, sent_pos, chan_val)
anns.erase_annotation(region_ann)
# then
assert ccl.get_attribute(gdansk_tok, "region_base", False) == False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment