Skip to content
Snippets Groups Projects
Commit 007dbba6 authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski
Browse files

Add soft filtering for tokens

Important note: the current implementation of soft filtering assumes that the head of
an annotation always occurs before its tail tokens, but this may not hold in all cases.
parent 57e64764
Branches
No related tags found
No related merge requests found
Pipeline #4548 failed
# Unreleased
## Added
- Added erasing of annotations in extras/annotations.py together with tests
and examples of usage,
- New functions (methods) in public interface:
- ``AnnotatedExpression.tokens_positions``
- ``AnnotatedExpression.annotation_base_token``
- ``AnnotatedExpression.has_ann_base_lemma``
- ``AnnotatedExpression.erase_annotation``
- ``DocumentAnnotations.erase_annotation``
- Add soft filtering for tokens
## Changed
- Changed names of methods:
- ``AnnotatedExpression.base_annotation_name`` becomes
``AnnotatedExpression.annotation_base_prop_name``,
- ``AnnotatedExpression.base_annotation_lemma`` becomes
``AnnotatedExpression.annotation_base_lemma``,
- minor fixes have been made in the Makefile
## Fixed
# 1.1
Add new high-level annotation module.
The purpose of new module is to provide high-level functions for
reading CCL annotations and provide easy way to get them based
on various needs.
## Added
- Implemented annotations module,
- Provided test data and implemented tests,
- Prepared a Makefile so that make can serve as a unified dev / test / CI environment,
- Provided examples of usage in README.
## Changed
- Changed .gitlab-ci.yml to use make and images defined in this repo,
......@@ -5,7 +5,7 @@
Module provide easy way to read CCL annotations.
"""
from collections import defaultdict, OrderedDict
from typing import Any, Dict, Iterable, List, Set, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Set, Optional, Tuple, Union
import cclutils as ccl
from corpus2 import DocumentPtr, SentencePtr, Tagset, Token
......@@ -419,6 +419,7 @@ class DocumentAnnotations(object):
ann_name: str,
chan_val: int,
accepted: Optional[Set[str]] = None,
create: Optional[bool] = True,
) -> None:
"""
Append token with single annotation to the index. Ignores if value
......@@ -431,6 +432,14 @@ class DocumentAnnotations(object):
contains passed token.
accepted(set): if given, only tokens with annotations specified
in this set will be added
create: specifies whether new annotation based on metadata in passed
`token` should be created. If `False`, then passed token will be
added to ann index only if annotation to which it belongs
already exists in index. Following key is used to find
annotation related with the token: (ann_name, sent_id, par_id,
chan_val).
This parameter can be used to e.g. add only selected head tokens
and all "tail" tokens.
"""
if chan_val != 0 and (not accepted or ann_name in accepted):
sent_id, par_id = tok_pos[1], tok_pos[2]
......@@ -439,7 +448,7 @@ class DocumentAnnotations(object):
if ann_descr in self._ann_dict:
related_ann = self._ann_dict[ann_descr]
related_ann.append(token, tok_pos, chan_val)
else:
elif create:
related_ann = AnnotatedExpression(
token,
sent,
......@@ -460,6 +469,7 @@ class DocumentAnnotations(object):
tok_pos: TokenPosition,
ann_name_val_dict: Dict[str, int],
accepted: Optional[Set[str]] = None,
create: Optional[bool] = True,
) -> None:
"""
Append token with all related annotations to the index.
......@@ -471,10 +481,24 @@ class DocumentAnnotations(object):
contains passed token.
accepted(set): if given, only tokens with annotations specified
in this set will be added
create: specifies whether new annotation based on metadata in passed
`token` should be created. If `False`, then passed token will be
added to ann index only if annotation to which it belongs
already exists in index. Following key is used to find
annotation related with the token: (ann_name, sent_id, par_id,
chan_val).
This parameter can be used to e.g. add only selected head tokens
and all "tail" tokens.
"""
for ann, chan_val in ann_name_val_dict.items():
self.append_token_with_ann(
token, sent, tok_pos, ann, chan_val, accepted=accepted
token,
sent,
tok_pos,
ann,
chan_val,
accepted=accepted,
create=create,
)
def erase_annotation_by_descr(self, ann_descr: AnnDescr) -> None:
......@@ -597,6 +621,7 @@ def get_document_annotations(
ccl_obj_or_path: Union[DocumentPtr, str],
tagset: Optional[Union[Tagset, str]] = "nkjp",
annotations: Optional[Set[str]] = None,
token_soft_filter: Optional[Callable[[Token], bool]] = None,
) -> DocumentAnnotations:
"""
Finds annotations in CCL document and returns in a form allowing easy access
......@@ -614,6 +639,12 @@ def get_document_annotations(
tagset: document tagset (`corpus2.Tagset` or `str` name).
annotations: set of names (string) of annotations (annotation channels)
to find. If not given, then finds all annotated expressions.
token_soft_filter: callable used to filter tokens. If callable will
return `True` then token will be included; otherwise will be excluded.
This is kind of "soft" filtering as it will not apply to tokens
containing the "tail" of already included annotation. This is
required, as in other case it would lead to incomplete multiword
annotations.
Returns:
`DocumentAnnotations` instance with gathered annotated tokens.
......@@ -624,10 +655,16 @@ def get_document_annotations(
for p in doc.paragraphs():
for s in p.sentences():
for i, t in enumerate(s.tokens()):
add_new = token_soft_filter(t) if token_soft_filter else True
anns_dict: Dict[str, int] = ccl.get_annotations(s, t, i)
tok_in_doc_pos = (i, s.id(), p.get_attribute("id"))
doc_ann.append_token_with_all_ann(
t, s, tok_in_doc_pos, anns_dict, accepted=annotations
t,
s,
tok_in_doc_pos,
anns_dict,
accepted=annotations,
create=add_new,
)
return doc_ann
......
......@@ -2,11 +2,11 @@ from setuptools import setup
setup(
name='cclutils',
author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
description='''A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.''',
version='1.1.1rc1',
packages=['cclutils', 'cclutils.extras'],
zip_safe=False
name="cclutils",
author="Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski",
description="""A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.""",
version="1.1.1rc2",
packages=["cclutils", "cclutils.extras"],
zip_safe=False,
)
......@@ -52,6 +52,8 @@
<ann chan="designation">0</ann>
<ann chan="region">1</ann>
<ann chan="room_type">0</ann>
<prop key="e01">http://dbpedia.org/resource/Gdańsk</prop>
<prop key="entities">e01</prop>
</tok>
<ns/>
<tok>
......@@ -75,6 +77,8 @@
<ann chan="room_type">0</ann>
<prop key="attraction_base">hotel</prop>
<prop key="hotel_name_base">Hotel</prop>
<prop key="e01">http://plwordnet.pwr.wroc.pl/wordnet/synset/1068</prop>
<prop key="entities">e01</prop>
</tok>
<ns/>
<tok>
......@@ -238,6 +242,8 @@
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="food_base">pełne wyżywienie</prop>
<prop key="e01">http://plwordnet.pwr.wroc.pl/wordnet/synset/96893</prop>
<prop key="entities"></prop>
</tok>
<tok>
<orth>wyżywieniem</orth>
......
......@@ -99,6 +99,24 @@ def test_restricted_ann_set():
}
def test_filtered_tokens():
    """Soft filtering keeps only annotations whose tokens pass the predicate.

    Tokens carrying an "entities" property are accepted as annotation heads;
    tail tokens of an already-accepted annotation are kept regardless (soft
    filtering), so the multi-token "food" expression stays complete.
    """

    def _entities_prop_present(tok):
        # Predicate used as the soft filter: True iff the token has an
        # "entities" property set in the CCL document.
        return ccl.get_attribute(tok, "entities", default=None) is not None

    anns = get_document_annotations(
        CCL_TEST_PATH_02,
        token_soft_filter=_entities_prop_present,
    )
    orth_index = as_expressions_orth_index(anns.expressions_index)
    expected = {
        ("region", "s1", "ch1", 1): ("Gdańsk",),
        ("attraction", "s2", "ch2", 1): ("Hotel",),
        ("hotel_name", "s2", "ch2", 1): ("Hotel",),
        ("food", "s2", "ch2", 2): ("pełnym", "wyżywieniem"),
    }
    assert orth_index == expected
def test_erase_ann_with_base():
doc = ccl.read(CCL_TEST_PATH_02)
anns = get_document_annotations(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment