Skip to content
Snippets Groups Projects
Commit 007dbba6 authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski
Browse files

Add soft filtering for tokens

Important note: the current implementation of soft filtering assumes that the head of
an annotation always occurs before its tail tokens, but this may not hold in all cases.
parent 57e64764
Branches
No related tags found
No related merge requests found
Pipeline #4548 failed
# Unreleased
## Added
- Added erasing of annotations in extras/annotations.py together with tests
and examples of usage,
- New functions (methods) in public interface:
- ``AnnotatedExpression.tokens_positions``
- ``AnnotatedExpression.annotation_base_token``
- ``AnnotatedExpression.has_ann_base_lemma``
- ``AnnotatedExpression.erase_annotation``
- ``DocumentAnnotations.erase_annotation``
- Add soft filtering for tokens
## Changed
- Changed names of methods:
- ``AnnotatedExpression.base_annotation_name`` becomes
``AnnotatedExpression.annotation_base_prop_name``,
- ``AnnotatedExpression.base_annotation_lemma`` becomes
``AnnotatedExpression.annotation_base_lemma``,
- minor fixes have been made in the Makefile
## Fixed
# 1.1
Add new high-level annotation module.
The purpose of new module is to provide high-level functions for
reading CCL annotations and provide easy way to get them based
on various needs.
## Added
- Implemented annotations module,
- Provided test data and implemented tests,
- Prepared a Makefile so that make can serve as a unified dev / test / CI environment,
- Provided examples of usage in README.
## Changed
- Changed .gitlab-ci.yml to use make and images defined in this repo,
......@@ -5,7 +5,7 @@
Module provide easy way to read CCL annotations.
"""
from collections import defaultdict, OrderedDict
from typing import Any, Dict, Iterable, List, Set, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, List, Set, Optional, Tuple, Union
import cclutils as ccl
from corpus2 import DocumentPtr, SentencePtr, Tagset, Token
......@@ -419,6 +419,7 @@ class DocumentAnnotations(object):
ann_name: str,
chan_val: int,
accepted: Optional[Set[str]] = None,
create: Optional[bool] = True,
) -> None:
"""
Append token with single annotation to the index. Ignores if value
......@@ -431,6 +432,14 @@ class DocumentAnnotations(object):
contains passed token.
accepted(set): if given, only tokens with annotations specified
in this set will be added
create: specifies whether new annotation based on metadata in passed
`token` should be created. If `False`, then passed token will be
added to ann index only if annotation to which it belongs
already exists in index. Following key is used to find
annotation related with the token: (ann_name, sent_id, par_id,
chan_val).
This parameter can be used to e.g. add only selected head tokens
and all "tail" tokens.
"""
if chan_val != 0 and (not accepted or ann_name in accepted):
sent_id, par_id = tok_pos[1], tok_pos[2]
......@@ -439,7 +448,7 @@ class DocumentAnnotations(object):
if ann_descr in self._ann_dict:
related_ann = self._ann_dict[ann_descr]
related_ann.append(token, tok_pos, chan_val)
else:
elif create:
related_ann = AnnotatedExpression(
token,
sent,
......@@ -460,6 +469,7 @@ class DocumentAnnotations(object):
tok_pos: TokenPosition,
ann_name_val_dict: Dict[str, int],
accepted: Optional[Set[str]] = None,
create: Optional[bool] = True,
) -> None:
"""
Append token with all related annotations to the index.
......@@ -471,10 +481,24 @@ class DocumentAnnotations(object):
contains passed token.
accepted(set): if given, only tokens with annotations specified
in this set will be added
create: specifies whether new annotation based on metadata in passed
`token` should be created. If `False`, then passed token will be
added to ann index only if annotation to which it belongs
already exists in index. Following key is used to find
annotation related with the token: (ann_name, sent_id, par_id,
chan_val).
This parameter can be used to e.g. add only selected head tokens
and all "tail" tokens.
"""
for ann, chan_val in ann_name_val_dict.items():
self.append_token_with_ann(
token, sent, tok_pos, ann, chan_val, accepted=accepted
token,
sent,
tok_pos,
ann,
chan_val,
accepted=accepted,
create=create,
)
def erase_annotation_by_descr(self, ann_descr: AnnDescr) -> None:
......@@ -597,6 +621,7 @@ def get_document_annotations(
ccl_obj_or_path: Union[DocumentPtr, str],
tagset: Optional[Union[Tagset, str]] = "nkjp",
annotations: Optional[Set[str]] = None,
token_soft_filter: Optional[Callable[[Token], bool]] = None,
) -> DocumentAnnotations:
"""
Finds annotations in CCL document and returns in a form allowing easy access
......@@ -614,6 +639,12 @@ def get_document_annotations(
tagset: document tagset (`corpus2.Tagset` or `str` name).
annotations: set of names (string) of annotations (annotation channels)
to find. If not given, then finds all annotated expressions.
token_soft_filter: callable used to filter tokens. If callable will
return `True` then token will be included; otherwise will be excluded.
This is kind of "soft" filtering as it will not apply to tokens
containing the "tail" of already included annotation. This is
required, as in other case it would lead to incomplete multiword
annotations.
Returns:
`DocumentAnnotations` instance with gathered annotated tokens.
......@@ -624,10 +655,16 @@ def get_document_annotations(
for p in doc.paragraphs():
for s in p.sentences():
for i, t in enumerate(s.tokens()):
add_new = token_soft_filter(t) if token_soft_filter else True
anns_dict: Dict[str, int] = ccl.get_annotations(s, t, i)
tok_in_doc_pos = (i, s.id(), p.get_attribute("id"))
doc_ann.append_token_with_all_ann(
t, s, tok_in_doc_pos, anns_dict, accepted=annotations
t,
s,
tok_in_doc_pos,
anns_dict,
accepted=annotations,
create=add_new,
)
return doc_ann
......
......@@ -2,11 +2,11 @@ from setuptools import setup
setup(
name='cclutils',
author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
description='''A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.''',
version='1.1.1rc1',
packages=['cclutils', 'cclutils.extras'],
zip_safe=False
name="cclutils",
author="Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski",
description="""A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.""",
version="1.1.1rc2",
packages=["cclutils", "cclutils.extras"],
zip_safe=False,
)
......@@ -52,6 +52,8 @@
<ann chan="designation">0</ann>
<ann chan="region">1</ann>
<ann chan="room_type">0</ann>
<prop key="e01">http://dbpedia.org/resource/Gdańsk</prop>
<prop key="entities">e01</prop>
</tok>
<ns/>
<tok>
......@@ -75,6 +77,8 @@
<ann chan="room_type">0</ann>
<prop key="attraction_base">hotel</prop>
<prop key="hotel_name_base">Hotel</prop>
<prop key="e01">http://plwordnet.pwr.wroc.pl/wordnet/synset/1068</prop>
<prop key="entities">e01</prop>
</tok>
<ns/>
<tok>
......@@ -238,6 +242,8 @@
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="food_base">pełne wyżywienie</prop>
<prop key="e01">http://plwordnet.pwr.wroc.pl/wordnet/synset/96893</prop>
<prop key="entities"></prop>
</tok>
<tok>
<orth>wyżywieniem</orth>
......
......@@ -99,6 +99,24 @@ def test_restricted_ann_set():
}
def test_filtered_tokens():
    """Soft filtering keeps only annotations whose tokens pass the predicate.

    Tokens carrying an "entities" property are accepted as annotation heads;
    tail tokens of an already-accepted annotation are kept regardless (soft
    filtering), so the multi-token "food" expression stays complete.
    """

    def _entities_prop_present(tok):
        # Predicate used as the soft filter: True iff the token has an
        # "entities" property set in the CCL document.
        return ccl.get_attribute(tok, "entities", default=None) is not None

    anns = get_document_annotations(
        CCL_TEST_PATH_02,
        token_soft_filter=_entities_prop_present,
    )
    orth_index = as_expressions_orth_index(anns.expressions_index)
    expected = {
        ("region", "s1", "ch1", 1): ("Gdańsk",),
        ("attraction", "s2", "ch2", 1): ("Hotel",),
        ("hotel_name", "s2", "ch2", 1): ("Hotel",),
        ("food", "s2", "ch2", 2): ("pełnym", "wyżywieniem"),
    }
    assert orth_index == expected
def test_erase_ann_with_base():
doc = ccl.read(CCL_TEST_PATH_02)
anns = get_document_annotations(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment