Skip to content
Snippets Groups Projects
Commit ebc8136b authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski
Browse files

Implement annotations module together with tests and docker envs

The purpose of the new module is to provide high-level functions for
reading CCL annotations and an easy way to retrieve them according
to various needs.

Scope of changes:
- implement annotations module (annotations.py)
- provide test data and implement tests (test_annotations.py)
- prepare Makefile as make can serve as unified dev / test / ci environment
  (tox is not an option, as it does not handle OS dependencies — or I
  don't know how to make it do so)
- change .gitlab-ci.yml to use make and images defined in this repo
- provide examples of usage in README
parent 3bed04eb
Branches
Tags
1 merge request!9Implement annotations module together with tests and docker envs
image: clarinpl/python:3.6
before_script:
- pip install tox==2.9.1
cache:
paths:
- .tox
stages: stages:
# - check_style - test
- push_wheel - deploy
# pep8: test:
# stage: check_style stage: test
# script: image: docker:18.09.7
# - tox -v -e pep8 services:
# - docker:18.09.7-dind
# docstyle: before_script:
# stage: check_style - apk --no-cache add make
# script: - make build-test-env
# - tox -v -e docstyle script:
- make test check-types
push_wheel: push_wheel:
stage: deploy
image: docker:18.09.7
services:
- docker:18.09.7-dind
before_script: before_script:
- pip3.6 install twine - apk --no-cache add make
- make build-prod-env
only: only:
- master - master
stage: push_wheel
when: on_success when: on_success
script: script:
- python3.6 setup.py sdist bdist_wheel - make deploy
- python3.6 -m twine upload
--repository-url https://pypi.clarin-pl.eu/
-u $PIPY_USER -p $PIPY_PASS dist/cclutils*.whl
Makefile 0 → 100644
# Suppress "Entering directory"/"Leaving directory" noise from sub-makes.
MAKEFLAGS += --no-print-directory
# help: cclutils Makefile help
# help: help
# help:... display this makefile's help information
.PHONY: help
# Prints every "# help:" line of this Makefile; sed strips the marker and
# indents continuation ("# help:...") lines. Plain "#" comments are ignored.
help:
	@grep "^# help\:" Makefile | sed 's/\# help\: *//;s/^...\s\+/\t\t/'
# help: build-env
# help:... build container with installed cclutils together with (OS and
# help:... python) dependencies
.PHONY: build-env
# Builds the cclutils-base image using the repository root as build context.
build-env:
	docker build -f docker/Dockerfile -t cclutils-base .
# help: rebuild-env
# help:... rebuild container with installed cclutils together with (OS and
# help:... python) dependencies
.PHONY: rebuild-env
# Same as build-env but bypasses Docker's layer cache.
# Fix: the build context "." was missing from the command, so `docker build`
# aborted ("requires exactly 1 argument") and the image was never rebuilt.
rebuild-env:
	docker build . -f docker/Dockerfile --no-cache -t cclutils-base
# help: build-prod-env
# help:... build production container (used for CI/CD deploy)
.PHONY: build-prod-env
# Depends on build-env so the cclutils-base image exists before the prod
# layer (docker/prod.Dockerfile) is built on top of it.
build-prod-env: build-env
	docker build . -f docker/prod.Dockerfile -t cclutils-prod
# help: deploy
# help:... build the wheel inside the production container and upload it to
# help:... the PyPI repository (expects PIPY_USER / PIPY_PASS in environment)
.PHONY: deploy
# Credentials are passed into the container via `-e` and expanded there by
# bash ($$ escapes the dollar sign for make). Fix: previously make expanded
# $(PIPY_USER)/$(PIPY_PASS) on the host, interpolating the secrets into the
# docker command line (visible in `ps`) and making the `-e` pass-through
# pointless.
deploy:
	@docker run \
		-e PIPY_USER \
		-e PIPY_PASS \
		--rm \
		-t \
		cclutils-prod bash -c \
		'python3.6 setup.py sdist bdist_wheel && python3.6 -m twine upload --repository-url https://pypi.clarin-pl.eu/ -u $$PIPY_USER -p $$PIPY_PASS dist/cclutils*.whl'
# help: build-test-env
# help:... build test container (use cache if built already)
.PHONY: build-test-env
# Depends on build-env so the cclutils-base image exists before the test
# layer (docker/test.Dockerfile) is built on top of it.
build-test-env: build-env
	docker build . -f docker/test.Dockerfile -t cclutils-test
# help: rebuild-test-env
# help:... rebuild test container (no cache)
.PHONY: rebuild-test-env
# Rebuilds the base first (via rebuild-env), then the test layer, uncached.
rebuild-test-env: rebuild-env
	docker build . --no-cache -f docker/test.Dockerfile -t cclutils-test
# help: test
# help:... run tests inside the container
# help:... need to run 'build-test-env' task at least at the first time
.PHONY: test
# The test image sets its WORKDIR to the tests directory, so bare `pytest`
# picks up the suite.
test:
	docker run --tty --rm cclutils-test pytest
# help: check-types
# help:... check type hint annotations
.PHONY: check-types
# Runs mypy against the package as installed inside the test image.
check-types:
	docker run --tty --rm cclutils-test \
		bash -c 'cd /home/install/cclutils; mypy -p extras --ignore-missing-imports'
# help: check-types-dev
# help:... check type hint annotations, mounts current version of code
.PHONY: check-types-dev
# Same as check-types, but bind-mounts the working copy so the current code
# is checked without rebuilding the image.
check-types-dev:
	docker run --tty --rm --volume $(PWD)/cclutils:/home/install/cclutils cclutils-test \
		bash -c 'cd /home/install/cclutils; mypy -p extras --ignore-missing-imports'
# help: test-dev
# help:... run tests inside the container (without rebuilding), mounts current
# help:... version of tests. To enable pudb (or pass other flags) run "make
# help:... flags=--pudb test-dev"
.PHONY: test-dev
# Fix: added --rm for consistency with the other docker-run targets, so
# repeated runs do not accumulate stopped containers.
test-dev:
	docker run --rm -t -v $(PWD)/tests:/home/install/tests cclutils-test pytest $(flags)
# help: ipython-dev
# help:... launch ipython inside the container for developing purposes;
# help:... mounting ipython directory allows to keep ipython history from
# help:... previous calls
.PHONY: ipython-dev
# The persistent .dev_ipython directory preserves IPython history between
# runs; make aborts before `docker run` if the mkdir fails.
ipython-dev:
	mkdir -p $(PWD)/.dev_ipython
	docker run -it \
		-v $(PWD)/.dev_ipython:/root/.ipython \
		-v $(PWD)/tests:/home/install/tests \
		cclutils-test ipython
...@@ -195,3 +195,247 @@ sentences = (sentence for paragraph in document.paragraphs() ...@@ -195,3 +195,247 @@ sentences = (sentence for paragraph in document.paragraphs()
for sentence in sentences: for sentence in sentences:
print(cclutils.sentence2str(sentence)) print(cclutils.sentence2str(sentence))
``` ```
Reading annotations
===================
Extracting annotations from a CCL document is available via the
`cclutils.extras.annotations` module, built on top of the core ``cclutils``
functionality.
The main function of this module is ``get_document_annotations`` which reads
annotations from CCL document (from file or ``corpus2.DocumentPtr`` object).
```python
from cclutils.extras.annotations import get_document_annotations
```
The annotations are organized with the use of two classes:
1. ``AnnotatedExpression``: represents a single annotation (annotated expression),
located in a specified paragraph and sentence. The module supports annotations
describing single-word and multiword expressions (more than one token).
1. ``DocumentAnnotations``: keeps the annotations of an entire document and
provides methods to facilitate gathering and accessing them.
#### Read annotations of a given document
1. Read all annotations
```python
>>> anns = get_document_annotations(cclutils.read('tests/data/ccl02.xml'))
>>> anns
<DocumentAnnotations for 10 annotated expressions: [<AnnotatedExpression for
annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla',
'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for
annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa',
'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation
'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>,
<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)';
('hotel',) at position: ch2>s2>t0>, <AnnotatedExpression for annotation
'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)';
('śniadanie',) at position: ch2>s2>t3>, <AnnotatedExpression for annotation
'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>,
<AnnotatedExpression for annotation 'designation': 'designation:('dla',
'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>, <AnnotatedExpression
for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position:
ch2>s2>t13>, <AnnotatedExpression for annotation 'food': 'food:('pełnym',
'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]>
```
1. Read only specified annotations
```python
>>> anns = get_document_annotations(cclutils.read('tests/data/ccl02.xml'), annotations={'designation'})
>>> anns
<DocumentAnnotations for 2 annotated expressions: [<AnnotatedExpression for
annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla',
'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for
annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko')
at position: ch2>s2>t10,t11>]>
```
#### Get annotations in one of preferred forms
1. Get annotations index containing full information about annotations
* key is a tuple containing following values: (annotation channel name,
sentence id, paragraph id, channel numeric value)
```python
>>> anns.expressions_index
defaultdict(list,
{('designation',
's1',
'ch1',
1): <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
('room_type',
's1',
'ch1',
1): <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
('region',
's1',
'ch1',
1): <AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>,
('attraction',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
('hotel_name',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
('food',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>,
('room_type',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>,
('designation',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>,
('attraction',
's2',
'ch2',
2): <AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>,
('food',
's2',
'ch2',
2): <AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>})
```
1. Get annotations grouped by annotation channel name, in one of formats:
* annotation object
* orths
* preferred lexemes
* annotation base lemma
```python
>>> anns.group_by_chan_name()
defaultdict(list,
{'designation': [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
'room_type': [<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>],
'region': [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>],
'attraction': [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>],
'hotel_name': [<AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>],
'food': [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>,
<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]})
>>> anns.group_by_chan_name(as_orths=True)
defaultdict(list,
{'designation': [('dla', 'dwóch', 'osób'), ('dla', 'dzieci')],
'room_type': [('dla', 'dwóch', 'osób'), ('łazienką',)],
'region': [('Gdańsk',)],
'attraction': [('Hotel',), ('spa',)],
'hotel_name': [('Hotel',)],
'food': [('śniadaniem',), ('pełnym', 'wyżywieniem')]})
>>> anns.group_by_chan_name(as_lexemes=True)
defaultdict(list,
{'designation': [('dla', 'dwa', 'osoba'), ('dla', 'dziecko')],
'room_type': [('dla', 'dwa', 'osoba'), ('łazienka',)],
'region': [('Gdańsk',)],
'attraction': [('hotel',), ('spa',)],
'hotel_name': [('hotel',)],
'food': [('śniadanie',), ('pełny', 'wyżywienie')]})
>>> anns.group_by_chan_name(as_ann_base=True)
defaultdict(list,
{'designation': ['dla dwóch osób', 'dla dziecka'],
'room_type': ['dla dwóch osób', 'łazienka'],
'region': [''],
'attraction': ['hotel', 'spa'],
'hotel_name': ['Hotel'],
'food': ['śniadanie', 'pełne wyżywienie']})
```
1. Get annotations grouped by token (token position), in one of formats (usage
same as in case of ``group_by_chan_name`` method):
* annotation object
* orths
* preferred lexemes
* annotation base lemma
```python
>>> anns.group_by_token()
{(1,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type'
: 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(2,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type'
: 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(3,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type'
: 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(4,
's1',
'ch1'): [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>],
(0,
's2',
'ch2'): [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>, <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel
',) at position: ch2>s2>t0>],
(3,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>],
(7,
's2',
'ch2'): [<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>],
(10,
's2',
'ch2'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
(11,
's2',
'ch2'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
(13,
's2',
'ch2'): [<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>],
(17,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>],
(18,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]}
```
1. Get annotations grouped by token, with original document order (tokens
order):
```python
>>> anns.group_by_token(retain_order=True)
OrderedDict([((1, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
((2, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
((3, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
((4, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>]),
((0, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
<AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>]),
((3, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>]),
((7, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>]),
((10, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>]),
((11, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>]),
((13, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>]),
((17, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]),
((18, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>])])
```
#### Get token by token position
1. When using above methods, you may want to get ``corpus2.Token`` object
referenced by position:
```python
>>> anns.token_by_position_index[(17, 's2', 'ch2')]
<corpus2.Token; proxy of <Swig Object of type 'Corpus2::Token *' at 0x7f71edfced80> >
```
\ No newline at end of file
"""
Package contains extras extending base functionality of `cclutils`,
utilizing core `cclutils` functions.
"""
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
"""
Module provide easy way to read CCL annotations.
"""
from collections import defaultdict, OrderedDict
from typing import Any, Dict, Iterable, List, Set, Optional, Tuple, Union
import cclutils as ccl
from corpus2 import DocumentPtr, Tagset, Token
__all__ = ["AnnotatedExpression", "DocumentAnnotations", "get_document_annotations"]
AnnRepr = Union[Tuple[str, ...], str, "AnnotatedExpression"]
TagsetRepr = Union[str, Tagset]
TokenPosition = Tuple[int, str, str]
class AnnotatedExpression(object):
    """
    Representation of an annotated expression in a CCL document.

    An expression consists of one or more tokens located in a specific
    paragraph and sentence, and carries the name of its annotation channel.

    Note: for multiword annotations, only annotations covering adjacent
    tokens are supported.
    """

    def __init__(
        self,
        token: Token,
        ann_name: str,
        tok_position: TokenPosition,
        tagset: Optional[TagsetRepr] = "nkjp",
        base_ann_name: Optional[str] = None,
        doc: Optional[DocumentPtr] = None,
    ) -> None:
        """
        Initialize with a single token.

        More tokens (in case of a mwe expression) can be added later with the
        `append` method.

        Args:
            token: corpus2 token instance (reference).
            ann_name: name of annotation (annotation channel).
            tok_position: position of `token` in the document (tok_sent_idx,
                sent_id, par_id).
            tagset: `Tagset` instance or its name, defaults to 'nkjp'.
            base_ann_name: name of the property storing the base form of the
                annotation. If not given, '{ann_name}_base' will be used as
                the base property name.
            doc: related CCL document.
        """
        self._tokens: List[Token] = [token]
        self._ann_name = ann_name
        self._base_ann_name = base_ann_name
        # Caches, computed lazily by the corresponding properties.
        self._pref_lex: Optional[Tuple[str, ...]] = None
        self._tok_orths: Optional[Tuple[str, ...]] = None
        self._base_ann_lemma: Optional[str] = None
        self._doc = doc
        if isinstance(tagset, str):
            tagset = ccl.get_tagset(tagset)
        self.tagset = tagset
        self.toks_ids = {tok_position[0]}
        self.sent_id = tok_position[1]
        self.par_id = tok_position[2]

    @property
    def annotation_name(self) -> str:
        """
        Name of annotation channel.
        """
        return self._ann_name

    @property
    def base_annotation_name(self) -> str:
        """
        Name of the property holding the base form of the annotation.

        Defaults to the annotation name with '_base' appended.
        """
        if not self._base_ann_name:
            self._base_ann_name = f"{self.annotation_name}_base"
        return self._base_ann_name

    @property
    def length(self) -> int:
        """
        Returns length of annotated phrase (number of tokens).
        """
        return len(self._tokens)

    @property
    def position(self) -> Tuple[Iterable[int], str, str]:
        """
        Returns 'coordinates' (position) of the expression in the document.

        The position is composed of the numerically sorted token indices in
        the sentence (tokens do not have real identifiers), the sentence id
        and the paragraph id. Together they identify the tokens in the
        document.
        """
        toks_ids = tuple(sorted(self.toks_ids))
        return (toks_ids, self.sent_id, self.par_id)

    @property
    def tokens_pref_lexemes(self) -> Tuple[str, ...]:
        """
        Returns preferred lexemes of tokens referred by this annotation.
        """
        if self._pref_lex is None:
            self._pref_lex = _tokens_pref_lexemes(self._tokens, self.tagset)
        return self._pref_lex

    @property
    def tokens_pref_lexemes_lowered(self) -> Tuple[str, ...]:
        """
        Returns lowercased preferred lexemes of tokens referred by this
        annotation.
        """
        return tuple(l.lower() for l in self.tokens_pref_lexemes)

    @property
    def tokens_orths(self) -> Tuple[str, ...]:
        """
        Returns orths (original text forms) of tokens referred by this
        annotation.
        """
        if self._tok_orths is None:
            self._tok_orths = tuple(t.orth_utf8() for t in self._tokens)
        return self._tok_orths

    @property
    def base_annotation_lemma(self) -> str:
        """
        Returns the base lemma of the annotation.

        Scans the included tokens for the `self.base_annotation_name`
        metadata attribute and returns the first non-empty value found.

        Returns:
            base lemma or empty string ('') if not found.
        """
        if self._base_ann_lemma is None:
            for t in self._tokens:
                if not t.has_metadata():
                    continue
                md = t.get_metadata()
                if md.has_attribute(self.base_annotation_name):
                    value = md.get_attribute(self.base_annotation_name)
                    if value:
                        self._base_ann_lemma = value
                        break
            if self._base_ann_lemma is None:
                self._base_ann_lemma = ""  # there is no base annotation
        return self._base_ann_lemma

    def _format_position(self, sep: str) -> str:
        """
        Render the position as '<par><sep><sent><sep>t<i>,t<j>,...'.

        Fix: token indices are joined in their natural (numeric) order;
        previously the 't<i>' strings were sorted lexicographically, which
        ordered e.g. 't10' before 't2'.
        """
        indexes, sent, par = self.position
        toks = ",".join("t" + str(i) for i in indexes)
        return f"{par}{sep}{sent}{sep}{toks}"

    @property
    def position_str(self) -> str:
        """
        Returns textual representation of token position.
        """
        return self._format_position(":")

    def append(self, token: Token, tok_position: TokenPosition) -> None:
        """
        Extends annotation object by including the next token belonging to
        that annotation.

        Args:
            token: corpus2 token instance (reference).
            tok_position: (tok_sent_idx, sent_id, par_id)

        Raises:
            ValueError: if the token lies in a different sentence or
                paragraph, or was already added.
        """
        self._check_position(*tok_position)
        self._tokens.append(token)
        self.toks_ids.add(tok_position[0])
        # Fix: invalidate caches that depend on the token list, so values
        # computed before this append are not served stale afterwards.
        self._pref_lex = None
        self._tok_orths = None
        self._base_ann_lemma = None

    def get_alt_repr(
        self,
        as_orths: bool = False,
        as_lexemes: bool = False,
        as_ann_base: bool = False,
    ) -> Any:
        """
        Utility method to get the annotation in one of possible
        representations:
        1) as an `AnnotatedExpression` instance (default),
        2) as a tuple of orths,
        3) as a tuple of preferred lexemes,
        4) as the base of the annotation (if specified); may differ from 3)
           in case of mwe.

        Args:
            as_orths: returns orths instead of `AnnotatedExpression` instance.
            as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
                instance.
            as_ann_base: returns the annotation base lemma instead of
                `AnnotatedExpression` instance.

        Returns:
            depending on passed flags: `AnnotatedExpression`, a tuple of
            strings, or a string.

        Raises:
            ValueError: if more than one flag is enabled.
        """
        if sum(map(bool, (as_orths, as_lexemes, as_ann_base))) > 1:
            raise ValueError(
                "No more than one flag (as_orths, as_lexemes, as_ann_base) can be enabled!"
            )
        # Returns AnnRepr but mypy does not recognize it correctly, states
        # that it is an object.
        return {
            (False, False, False): self,
            (True, False, False): self.tokens_orths,
            (False, True, False): self.tokens_pref_lexemes,
            (False, False, True): self.base_annotation_lemma,
        }[(as_orths, as_lexemes, as_ann_base)]

    def _check_position(self, tok_idx: int, sent_id: str, par_id: str) -> None:
        """
        Checks whether a newly appended token is placed in the same paragraph
        and sentence as the tokens already present, and was not added before.
        """
        if self.par_id and self.par_id != par_id:
            raise ValueError(
                "Annotation tokens must be placed in the same "
                f"paragraph! ({par_id}, {self.par_id})"
            )
        if self.sent_id and self.sent_id != sent_id:
            raise ValueError(
                "Annotation tokens must be placed in the same "
                f"sentence! ({sent_id}, {self.sent_id})"
            )
        if self.toks_ids and tok_idx in self.toks_ids:
            raise ValueError(f"Token at position {tok_idx} already added!")

    def __eq__(self, other):
        """
        Two annotated expressions are equal when they have the same
        annotation name, length and position, and their lowercased preferred
        lexemes are equal.
        """
        if not isinstance(other, AnnotatedExpression):
            return False
        if self.annotation_name != other.annotation_name:
            return False
        if self.length != other.length:
            return False
        if self.position != other.position:
            return False
        # Fix: the original implementation fell through and returned None
        # (falsy) when either lexeme tuple was empty; compare the tuples
        # directly instead.
        return (
            self.tokens_pref_lexemes_lowered == other.tokens_pref_lexemes_lowered
        )

    def __hash__(self):
        # Hash over the same fields that __eq__ compares, to keep the
        # eq/hash contract consistent.
        return hash(
            (self.annotation_name, self.position, self.tokens_pref_lexemes_lowered)
        )

    def __repr__(self):
        expr_str = f"{self._ann_name}:{self.tokens_orths}"
        return (
            f"<AnnotatedExpression for annotation '{self._ann_name}': "
            f"'{expr_str}'; {self.tokens_pref_lexemes} at position: "
            f"{self._format_position('>')}>"
        )
class DocumentAnnotations(object):
    """
    Representation of annotations in a CCL document.

    Acts as a container keeping annotations and providing methods to
    facilitate gathering and accessing such annotations. Uses
    `AnnotatedExpression` as a representation of a single annotation.
    """

    def __init__(
        self,
        tagset: Optional[TagsetRepr] = "nkjp",
        doc: Optional[DocumentPtr] = None,
    ):
        """
        Args:
            tagset: `Tagset` instance or its name, defaults to 'nkjp'.
            doc: related CCL document.
        """
        self._doc = doc
        self.tagset = tagset
        # (ann_name, sent_id, par_id, chan_val) -> AnnotatedExpression
        self._ann_dict: Dict[Tuple[str, str, str, int], AnnotatedExpression] = {}
        # token position -> all annotations covering that token
        self._tok_dict: Dict[TokenPosition, List[AnnotatedExpression]] = defaultdict(
            list
        )
        # token position -> corpus2 token object
        self._tok_pos_to_tok: Dict[TokenPosition, Token] = {}

    @property
    def anns_names(self) -> Set[str]:
        """
        Set of unique annotation names (channel names) found in the document.
        """
        return {k[0] for k in self.expressions_index}

    @property
    def expressions_index(
        self,
    ) -> Dict[Tuple[str, str, str, int], AnnotatedExpression]:
        """
        Returns the index of all annotations found in the document.

        Returns:
            Dict:
                key: Tuple[annotation_name, sent_id, par_id, chan_val]
                value: AnnotatedExpression
        """
        return self._ann_dict

    @property
    def expressions(self) -> Iterable[AnnotatedExpression]:
        """
        Yields all annotations found in the document.
        """
        yield from self.expressions_index.values()

    @property
    def token_by_position_index(self) -> Dict[TokenPosition, Token]:
        """
        Index mapping a token position (as used in this class) to the
        corresponding token object.
        """
        return self._tok_pos_to_tok

    def append_token_with_ann(
        self,
        token: Token,
        tok_pos: TokenPosition,
        ann_name: str,
        chan_val: int,
        accepted: Optional[Set[str]] = None,
    ) -> None:
        """
        Append a token with a single annotation to the index.

        The token is ignored if the value in the annotation channel is 0
        (meaning that the token is not annotated), or if `accepted` is given
        and does not contain `ann_name`.

        Args:
            token: corpus2 token instance (reference).
            tok_pos: tuple with three values representing the position of the
                token in the document: (token_sent_index, sent_id, paragraph_id)
            ann_name: name of the annotation (annotation channel).
            chan_val: numeric channel value; tokens sharing the same
                (channel, value) within one sentence form one expression.
            accepted: if given, only tokens with annotations specified
                in this set will be added
        """
        if chan_val == 0 or (accepted and ann_name not in accepted):
            return
        sent_id, par_id = tok_pos[1], tok_pos[2]
        ann_dict_key = (ann_name, sent_id, par_id, chan_val)
        related_ann = self._ann_dict.get(ann_dict_key)
        if related_ann is not None:
            # Next token of an already registered (multiword) expression.
            related_ann.append(token, tok_pos)
        else:
            related_ann = AnnotatedExpression(
                token, ann_name, tok_pos, tagset=self.tagset, doc=self._doc
            )
            self._ann_dict[ann_dict_key] = related_ann
        self._tok_dict[tok_pos].append(related_ann)
        self._tok_pos_to_tok[tok_pos] = token

    def append_token_with_all_ann(
        self,
        token: Token,
        tok_pos: TokenPosition,
        ann_name_val_dict: Dict[str, int],
        accepted: Optional[Set[str]] = None,
    ) -> None:
        """
        Append a token together with all its related annotations to the index.

        Args:
            token: corpus2 token instance (reference).
            tok_pos: tuple with three values representing the position of the
                token in the document: (token_sent_index, sent_id, paragraph_id)
            ann_name_val_dict: mapping of channel name to channel value for
                this token.
            accepted: if given, only tokens with annotations specified
                in this set will be added
        """
        for ann, chan_val in ann_name_val_dict.items():
            self.append_token_with_ann(token, tok_pos, ann, chan_val, accepted=accepted)

    def group_by_chan_name(
        self,
        as_orths: bool = False,
        as_lexemes: bool = False,
        as_ann_base: bool = False,
    ) -> Dict[str, List[AnnRepr]]:
        """
        Returns annotations grouped by channel name, in one of the forms
        supported by `AnnotatedExpression.get_alt_repr`.

        Args:
            as_orths: returns orths instead of `AnnotatedExpression` instances
            as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
                instances
            as_ann_base: returns annotation base lemmas instead of
                `AnnotatedExpression` instances

        Returns:
            dict of annotations:
                key: annotation name.
                value: depending on passed flags, list of `AnnotatedExpression`
                    or list of tuples of strings.
        """
        grouped: Dict[str, List[AnnRepr]] = defaultdict(list)
        for key, ann_obj in self.expressions_index.items():
            chan_name = key[0]
            grouped[chan_name].append(
                ann_obj.get_alt_repr(as_orths, as_lexemes, as_ann_base)
            )
        return grouped

    def group_by_token(
        self,
        retain_order: bool = False,
        as_orths: bool = False,
        as_lexemes: bool = False,
        as_ann_base: bool = False,
    ) -> Dict[TokenPosition, List[AnnRepr]]:
        """
        Returns an index of token positions (as used in this class) and the
        corresponding annotations.

        Args:
            retain_order: If enabled, returns a dict ordered by token
                position (ascending sort: by paragraph id, then by sentence
                id, finally by token index). Disabled by default, to avoid
                additional computation time when order is not important.
                NOTE(review): paragraph/sentence ids are compared as strings,
                so e.g. 's10' sorts before 's2'; document order is only
                guaranteed when ids sort lexicographically — confirm.
            as_orths: returns orths instead of `AnnotatedExpression` instances
            as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
                instances
            as_ann_base: returns annotation base lemmas instead of
                `AnnotatedExpression` instances

        Returns:
            dict:
                key: tuple representing token position.
                value: depending on passed flags, list of `AnnotatedExpression`
                    or list of tuples of strings.
                the concrete dictionary class depends on ordering:
                    without sorting: `dict`
                    document order: `OrderedDict`
        """
        grouped: Dict[TokenPosition, List[AnnRepr]] = {
            t_pos: [
                ann_obj.get_alt_repr(as_orths, as_lexemes, as_ann_base)
                for ann_obj in ann_objs
            ]
            for t_pos, ann_objs in self._tok_dict.items()
        }
        if not retain_order:
            return grouped
        return OrderedDict(
            sorted(grouped.items(), key=lambda e: (e[0][2], e[0][1], e[0][0]))
        )

    def __repr__(self):
        return (
            f"<DocumentAnnotations for {len(self._ann_dict)} annotated "
            f"expressions: {list(self._ann_dict.values())}>"
        )
def get_document_annotations(
    ccl_obj_or_path: Union[DocumentPtr, str],
    tagset: Optional[Union[Tagset, str]] = "nkjp",
    annotations: Optional[Set[str]] = None,
) -> DocumentAnnotations:
    """
    Finds annotations in a CCL document and returns them in a form allowing
    easy access to the most important information.

    By default all annotations (specified with the `ann` tag) are detected.
    The set of recognized annotations can be restricted by passing a set of
    annotation (channel) names via `annotations`.

    Check the documentation of the `DocumentAnnotations` class to find out
    how to use the returned object.

    Args:
        ccl_obj_or_path: CCL document (`corpus2.DocumentPtr` or `str` path).
        tagset: document tagset (`corpus2.Tagset` or `str` name).
        annotations: set of names (string) of annotations (annotation
            channels) to find. If not given, all annotated expressions are
            collected.

    Returns:
        `DocumentAnnotations` instance with gathered annotated tokens.
    """
    tagset = _as_corpus2_tagset(tagset)
    doc = _as_corpus2_doc(ccl_obj_or_path, tagset)
    collected = DocumentAnnotations(doc=doc, tagset=tagset)
    for paragraph in doc.paragraphs():
        for sentence in paragraph.sentences():
            for tok_idx, token in enumerate(sentence.tokens()):
                channel_values: Dict[str, int] = ccl.get_annotations(
                    sentence, token, tok_idx
                )
                position = (tok_idx, sentence.id(), paragraph.get_attribute("id"))
                collected.append_token_with_all_ann(
                    token, position, channel_values, accepted=annotations
                )
    return collected
def _as_corpus2_doc(ccl_obj_or_path, tagset):
    """
    Coerce the argument to a corpus2 document: a `DocumentPtr` is returned
    as-is, anything else is treated as a file path and read with `cclutils`.
    """
    if isinstance(ccl_obj_or_path, DocumentPtr):
        return ccl_obj_or_path
    return ccl.read(ccl_obj_or_path, tagset=tagset)
def _as_corpus2_tagset(tagset_obj_or_name):
    """
    Coerce the argument to a corpus2 tagset: a `Tagset` is returned as-is,
    anything else is treated as a tagset name and resolved with `cclutils`.
    """
    if isinstance(tagset_obj_or_name, Tagset):
        return tagset_obj_or_name
    return ccl.get_tagset(tagset_obj_or_name)
def _get_document_preferred_lexemes(
    doc: DocumentPtr, tagset: Tagset
) -> Tuple[str, ...]:
    """
    Returns preferred lexemes of every token in the document, flattened
    into a single tuple.

    The structure of the document (paragraph/sentence boundaries) does not
    impact the output tuple.
    """
    all_tokens = [
        token
        for paragraph in doc.paragraphs()
        for sentence in paragraph.sentences()
        for token in sentence.tokens()
    ]
    return _tokens_pref_lexemes(all_tokens, tagset)
def _tokens_pref_lexemes(tokens, tagset) -> Tuple[str, ...]:
"""
Returns tuple of preferred lexemes for given tokens.
"""
return tuple(t.get_preferred_lexeme(tagset).lemma_utf8() for t in tokens)
# Base stage: corpus2 Python bindings + the cclutils package installed.
FROM clarinpl/python:3.6 as cclutils-base
# Install OS-level corpus2 bindings; remove apt lists in the same layer
# so the downloaded package indexes do not bloat the image.
RUN apt-get update && apt-get install -y apt-transport-https \
        corpus2-python3.6 \
        corpus2mwe-python3.6 \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /home/install
COPY setup.py .
COPY cclutils ./cclutils
RUN python setup.py install

# Production stage: adds twine for uploading built wheels to the package index.
FROM cclutils-base as cclutils-prod
WORKDIR /home/install
RUN pip install twine

# Test stage: test sources plus test/dev dependencies; runs pytest by default.
FROM cclutils-base as cclutils-test
COPY tests ./tests
COPY requirements-test.txt .
RUN python -m pip install -r requirements-test.txt
COPY requirements-dev.txt .
RUN python -m pip install -r requirements-dev.txt
WORKDIR /home/install/tests
CMD ["pytest"]
ipython
ipykernel
mypy
pytest
pytest-icdiff
pytest-pudb
...@@ -3,10 +3,10 @@ from setuptools import setup ...@@ -3,10 +3,10 @@ from setuptools import setup
setup( setup(
name='cclutils', name='cclutils',
author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski', author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
description='''A convenient API based on Corpus2 library for analyzing textual description='''A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.''', corpora in CCL format.''',
version='1.0.3', version='1.1',
packages=['cclutils'], packages=['cclutils', 'cclutils.extras'],
zip_safe=False zip_safe=False
) )
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk id="ch1">
<sentence id="sent4">
<tok>
<orth>Oprócz</orth>
<lex disamb="1">
<base>oprócz</base>
<ctag>prep:gen</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>bogatej</orth>
<lex disamb="1">
<base>bogaty</base>
<ctag>adj:sg:gen:f:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>historii</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1">
<base>,</base>
<ctag>interp</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>fascynujących</orth>
<lex disamb="1">
<base>fascynujący</base>
<ctag>adj:pl:gen:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>zabytków</orth>
<lex disamb="1">
<base>zabytek</base>
<ctag>subst:pl:gen:m3</ctag>
</lex>
<ann chan="attraction_classes">1</ann>
<prop key="attraction_classes_base">zabytek</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1">
<base>,</base>
<ctag>interp</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>będą</orth>
<lex disamb="1">
<base>być</base>
<ctag>bedzie:pl:ter:imperf</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>piękne</orth>
<lex disamb="1">
<base>piękny</base>
<ctag>adj:pl:nom:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>widoki</orth>
<lex disamb="1">
<base>widok</base>
<ctag>subst:pl:nom:m3</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>i</orth>
<lex disamb="1">
<base>i</base>
<ctag>conj</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>zachwycająca</orth>
<lex disamb="1">
<base>zachwycający</base>
<ctag>adj:sg:nom:f:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>przyroda</orth>
<lex disamb="1">
<base>przyroda</base>
<ctag>subst:sg:nom:f</ctag>
</lex>
<ann chan="attraction_classes">2</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1">
<base>.</base>
<ctag>interp</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>przyroda</orth>
<lex disamb="1">
<base>przyroda</base>
<ctag>subst:sg:nom:f</ctag>
</lex>
<ann chan="attraction_classes">3</ann>
</tok>
</sentence>
<sentence id="sent5">
<tok>
<orth>dużo</orth>
<lex disamb="1">
<base>dużo</base>
<ctag>adj:pl:gen:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>fascynujących</orth>
<lex disamb="1">
<base>fascynujący</base>
<ctag>adj:pl:gen:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>zabytków</orth>
<lex disamb="1">
<base>zabytek</base>
<ctag>subst:pl:gen:m3</ctag>
</lex>
<ann chan="attraction_classes">1</ann>
<prop key="attraction_classes_base">zabytek</prop>
</tok>
</sentence>
</chunk>
</chunkList>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk id="ch1">
<sentence id="s1">
<tok>
<orth>Wycieczka</orth>
<lex disamb="1"><base>wycieczka</base><ctag>subst:sg:nom:f</ctag></lex>
<ann chan="designation">0</ann>
<ann chan="region">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>dla</orth>
<lex disamb="1"><base>dla</base><ctag>prep:gen</ctag></lex>
<ann chan="designation">1</ann>
<ann chan="region">0</ann>
<ann chan="room_type">1</ann>
<prop key="designation_base">dla dwóch osób</prop>
<prop key="room_type_base">dla dwóch osób</prop>
</tok>
<tok>
<orth>dwóch</orth>
<lex disamb="1"><base>dwa</base><ctag>num:pl:nom:m1:rec</ctag></lex>
<ann chan="designation">1</ann>
<ann chan="region">0</ann>
<ann chan="room_type">1</ann>
</tok>
<tok>
<orth>osób</orth>
<lex disamb="1"><base>osoba</base><ctag>subst:pl:gen:f</ctag></lex>
<ann chan="designation">1</ann>
<ann chan="region">0</ann>
<ann chan="room_type">1</ann>
<prop key="region_base">Osobie</prop>
</tok>
<tok>
<orth>Gdańsk</orth>
<lex disamb="1"><base>Gdańsk</base><ctag>subst:sg:nom:m3</ctag></lex>
<ann chan="designation">0</ann>
<ann chan="region">1</ann>
<ann chan="room_type">0</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
<ann chan="designation">0</ann>
<ann chan="region">0</ann>
<ann chan="room_type">0</ann>
</tok>
</sentence>
</chunk>
<chunk id="ch2">
<sentence id="s2">
<tok>
<orth>Hotel</orth>
<lex disamb="1"><base>hotel</base><ctag>subst:sg:nom:m3</ctag></lex>
<ann chan="attraction">1</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">1</ann>
<ann chan="room_type">0</ann>
<prop key="attraction_base">hotel</prop>
<prop key="hotel_name_base">Hotel</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>ze</orth>
<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>śniadaniem</orth>
<lex disamb="1"><base>śniadanie</base><ctag>subst:sg:nom:m3</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">1</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="food_base">śniadanie</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>z</orth>
<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>prywatną</orth>
<lex disamb="1"><base>prywatny</base><ctag>adj:sg:acc:f:pos</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>łazienką</orth>
<lex disamb="1"><base>łazienka</base><ctag>subst:sg:inst:f</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">1</ann>
<prop key="room_type_base">łazienka</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>atrakcje</orth>
<lex disamb="1"><base>atrakcja</base><ctag>subst:pl:nom:f</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>dla</orth>
<lex disamb="1"><base>dla</base><ctag>prep:gen</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">1</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="designation_base">dla dziecka</prop>
</tok>
<tok>
<orth>dzieci</orth>
<lex disamb="1"><base>dziecko</base><ctag>subst:pl:nom:n</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">1</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>spa</orth>
<lex disamb="1"><base>spa</base><ctag>subst:sg:nom:n</ctag></lex>
<ann chan="attraction">2</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="attraction_base">spa</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>z</orth>
<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>pełnym</orth>
<lex disamb="1"><base>pełny</base><ctag>adj:sg:nom:n:pos</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">2</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="food_base">pełne wyżywienie</prop>
</tok>
<tok>
<orth>wyżywieniem</orth>
<lex disamb="1"><base>wyżywienie</base><ctag>subst:sg:nom:n</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">2</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
</sentence>
</chunk>
</chunkList>
# coding: utf-8
from collections import OrderedDict
import os
from typing import Dict, Tuple
from cclutils.extras.annotations import get_document_annotations
TEST_ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
TEST_DATA_DIR = os.path.join(TEST_ROOT_DIR, "data")
CCL_TEST_PATH_01 = os.path.join(TEST_DATA_DIR, "ccl01.xml")
CCL_TEST_PATH_02 = os.path.join(TEST_DATA_DIR, "ccl02.xml")
def as_expressions_orth_index(ann_expr_index) -> Dict[Tuple[str, str, str, int], str]:
    """Map each expression key to the orths tuple of its annotation object."""
    orth_index = {}
    for key, annotation in ann_expr_index.items():
        orth_index[key] = annotation.tokens_orths
    return orth_index
def test_simple():
    """Single-channel document: all four annotated expressions are detected."""
    channel = "attraction_classes"
    doc_anns = get_document_annotations(CCL_TEST_PATH_01, "nkjp")
    by_channel = doc_anns.group_by_chan_name()
    assert len(by_channel[channel]) == 4
def test_complex():
    # Multi-channel document read with the default tagset; checks channel
    # discovery, the per-expression orth index, and both grouping views.
    anns = get_document_annotations(CCL_TEST_PATH_02)
    # All annotation channels present in the document are discovered.
    assert anns.anns_names == {
        "attraction",
        "designation",
        "food",
        "hotel_name",
        "region",
        "room_type",
    }
    # Keys are (channel, sentence id, chunk id, annotation number);
    # values are the orth forms of the tokens forming each expression.
    expressions_orth_index = as_expressions_orth_index(anns.expressions_index)
    assert expressions_orth_index == {
        ("designation", "s1", "ch1", 1): ("dla", "dwóch", "osób"),
        ("room_type", "s1", "ch1", 1): ("dla", "dwóch", "osób"),
        ("region", "s1", "ch1", 1): ("Gdańsk",),
        ("attraction", "s2", "ch2", 1): ("Hotel",),
        ("hotel_name", "s2", "ch2", 1): ("Hotel",),
        ("food", "s2", "ch2", 1): ("śniadaniem",),
        ("room_type", "s2", "ch2", 1): ("łazienką",),
        ("designation", "s2", "ch2", 1): ("dla", "dzieci"),
        ("attraction", "s2", "ch2", 2): ("spa",),
        ("food", "s2", "ch2", 2): ("pełnym", "wyżywieniem"),
    }
    # Grouping by channel with as_ann_base=True returns the `<prop>` base
    # forms; "region" has no base prop in the fixture, hence the empty string.
    assert anns.group_by_chan_name(as_ann_base=True) == {
        "designation": ["dla dwóch osób", "dla dziecka"],
        "room_type": ["dla dwóch osób", "łazienka"],
        "region": [""],
        "attraction": ["hotel", "spa"],
        "hotel_name": ["Hotel"],
        "food": ["śniadanie", "pełne wyżywienie"],
    }
    # Grouping by token preserves document order (OrderedDict) and lists, for
    # every annotated token position, all expressions that cover it — one
    # entry per channel, so tokens in overlapping channels appear twice.
    assert anns.group_by_token(retain_order=True, as_orths=True) == OrderedDict(
        [
            ((1, "s1", "ch1"), [("dla", "dwóch", "osób"), ("dla", "dwóch", "osób")]),
            ((2, "s1", "ch1"), [("dla", "dwóch", "osób"), ("dla", "dwóch", "osób")]),
            ((3, "s1", "ch1"), [("dla", "dwóch", "osób"), ("dla", "dwóch", "osób")]),
            ((4, "s1", "ch1"), [("Gdańsk",)]),
            ((0, "s2", "ch2"), [("Hotel",), ("Hotel",)]),
            ((3, "s2", "ch2"), [("śniadaniem",)]),
            ((7, "s2", "ch2"), [("łazienką",)]),
            ((10, "s2", "ch2"), [("dla", "dzieci")]),
            ((11, "s2", "ch2"), [("dla", "dzieci")]),
            ((13, "s2", "ch2"), [("spa",)]),
            ((17, "s2", "ch2"), [("pełnym", "wyżywieniem")]),
            ((18, "s2", "ch2"), [("pełnym", "wyżywieniem")]),
        ]
    )
def test_restricted_ann_set():
    # Passing `annotations` restricts collection to the listed channels:
    # only those names are reported and only their expressions are indexed.
    anns = get_document_annotations(
        CCL_TEST_PATH_02,
        annotations={
            "designation",
            "food",
            "region",
        },
    )
    # Channels outside the accepted set (attraction, hotel_name, room_type)
    # must not be reported even though they exist in the document.
    assert anns.anns_names == {
        "designation",
        "food",
        "region",
    }
    # The expression index contains only expressions of the accepted channels.
    expressions_orth_index = as_expressions_orth_index(anns.expressions_index)
    assert expressions_orth_index == {
        ("designation", "s1", "ch1", 1): ("dla", "dwóch", "osób"),
        ("region", "s1", "ch1", 1): ("Gdańsk",),
        ("food", "s2", "ch2", 1): ("śniadaniem",),
        ("designation", "s2", "ch2", 1): ("dla", "dzieci"),
        ("food", "s2", "ch2", 2): ("pełnym", "wyżywieniem"),
    }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment