Skip to content
Snippets Groups Projects
Commit ebc8136b authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski
Browse files

Implement annotations module together with tests and docker envs

The purpose of the new module is to provide high-level functions for
reading CCL annotations and an easy way to retrieve them according
to various needs.

Scope of changes:
- implement annotations module (annotations.py)
- provide test data and implement tests (test_annotations.py)
- prepare Makefile as make can serve as unified dev / test / ci environment
  (tox is not an option, as it does not handle OS dependencies — or I
  don't know how to make it do so)
- change .gitlab-ci.yml to use make and images defined in this repo
- provide examples of usage in README
parent 3bed04eb
Branches
Tags
1 merge request!9Implement annotations module together with tests and docker envs
image: clarinpl/python:3.6
before_script:
- pip install tox==2.9.1
cache:
paths:
- .tox
stages: stages:
# - check_style - test
- push_wheel - deploy
# pep8: test:
# stage: check_style stage: test
# script: image: docker:18.09.7
# - tox -v -e pep8 services:
# - docker:18.09.7-dind
# docstyle: before_script:
# stage: check_style - apk --no-cache add make
# script: - make build-test-env
# - tox -v -e docstyle script:
- make test check-types
push_wheel: push_wheel:
stage: deploy
image: docker:18.09.7
services:
- docker:18.09.7-dind
before_script: before_script:
- pip3.6 install twine - apk --no-cache add make
- make build-prod-env
only: only:
- master - master
stage: push_wheel
when: on_success when: on_success
script: script:
- python3.6 setup.py sdist bdist_wheel - make deploy
- python3.6 -m twine upload
--repository-url https://pypi.clarin-pl.eu/
-u $PIPY_USER -p $PIPY_PASS dist/cclutils*.whl
Makefile 0 → 100644
# Suppress "Entering directory"/"Leaving directory" noise from sub-makes.
MAKEFLAGS += --no-print-directory
# help: cclutils Makefile help
# help: help
# help:... display this makefile's help information
.PHONY: help
# Prints every "# help:" line of this Makefile; sed strips the marker and
# indents continuation ("# help:...") lines. Plain "#" comments are ignored.
help:
	@grep "^# help\:" Makefile | sed 's/\# help\: *//;s/^...\s\+/\t\t/'
# help: build-env
# help:... build container with installed cclutils together with (OS and
# help:... python) dependencies
.PHONY: build-env
# Builds the cclutils-base image using the repository root as build context.
build-env:
	docker build -f docker/Dockerfile -t cclutils-base .
# help: rebuild-env
# help:... rebuild container with installed cclutils together with (OS and
# help:... python) dependencies
.PHONY: rebuild-env
# Same as build-env but bypasses Docker's layer cache.
# Fix: the build context "." was missing from the command, so `docker build`
# aborted ("requires exactly 1 argument") and the image was never rebuilt.
rebuild-env:
	docker build . -f docker/Dockerfile --no-cache -t cclutils-base
# help: build-prod-env
# help:... build production container (used for CI/CD deploy)
.PHONY: build-prod-env
# Depends on build-env so the cclutils-base image exists before the prod
# layer (docker/prod.Dockerfile) is built on top of it.
build-prod-env: build-env
	docker build . -f docker/prod.Dockerfile -t cclutils-prod
# help: deploy
# help:... build the wheel inside the production container and upload it to
# help:... the PyPI repository (expects PIPY_USER / PIPY_PASS in environment)
.PHONY: deploy
# Credentials are passed into the container via `-e` and expanded there by
# bash ($$ escapes the dollar sign for make). Fix: previously make expanded
# $(PIPY_USER)/$(PIPY_PASS) on the host, interpolating the secrets into the
# docker command line (visible in `ps`) and making the `-e` pass-through
# pointless.
deploy:
	@docker run \
		-e PIPY_USER \
		-e PIPY_PASS \
		--rm \
		-t \
		cclutils-prod bash -c \
		'python3.6 setup.py sdist bdist_wheel && python3.6 -m twine upload --repository-url https://pypi.clarin-pl.eu/ -u $$PIPY_USER -p $$PIPY_PASS dist/cclutils*.whl'
# help: build-test-env
# help:... build test container (use cache if built already)
.PHONY: build-test-env
# Depends on build-env so the cclutils-base image exists before the test
# layer (docker/test.Dockerfile) is built on top of it.
build-test-env: build-env
	docker build . -f docker/test.Dockerfile -t cclutils-test
# help: rebuild-test-env
# help:... rebuild test container (no cache)
.PHONY: rebuild-test-env
# Rebuilds the base first (via rebuild-env), then the test layer, uncached.
rebuild-test-env: rebuild-env
	docker build . --no-cache -f docker/test.Dockerfile -t cclutils-test
# help: test
# help:... run tests inside the container
# help:... need to run 'build-test-env' task at least at the first time
.PHONY: test
# The test image sets its WORKDIR to the tests directory, so bare `pytest`
# picks up the suite.
test:
	docker run --tty --rm cclutils-test pytest
# help: check-types
# help:... check type hint annotations
.PHONY: check-types
# Runs mypy against the package as installed inside the test image.
check-types:
	docker run --tty --rm cclutils-test \
		bash -c 'cd /home/install/cclutils; mypy -p extras --ignore-missing-imports'
# help: check-types-dev
# help:... check type hint annotations, mounts current version of code
.PHONY: check-types-dev
# Same as check-types, but bind-mounts the working copy so the current code
# is checked without rebuilding the image.
check-types-dev:
	docker run --tty --rm --volume $(PWD)/cclutils:/home/install/cclutils cclutils-test \
		bash -c 'cd /home/install/cclutils; mypy -p extras --ignore-missing-imports'
# help: test-dev
# help:... run tests inside the container (without rebuilding), mounts current
# help:... version of tests. To enable pudb (or pass other flags) run "make
# help:... flags=--pudb test-dev"
.PHONY: test-dev
# Fix: added --rm for consistency with the other docker-run targets, so
# repeated runs do not accumulate stopped containers.
test-dev:
	docker run --rm -t -v $(PWD)/tests:/home/install/tests cclutils-test pytest $(flags)
# help: ipython-dev
# help:... launch ipython inside the container for developing purposes;
# help:... mounting ipython directory allows to keep ipython history from
# help:... previous calls
.PHONY: ipython-dev
# The persistent .dev_ipython directory preserves IPython history between
# runs; make aborts before `docker run` if the mkdir fails.
ipython-dev:
	mkdir -p $(PWD)/.dev_ipython
	docker run -it \
		-v $(PWD)/.dev_ipython:/root/.ipython \
		-v $(PWD)/tests:/home/install/tests \
		cclutils-test ipython
...@@ -195,3 +195,247 @@ sentences = (sentence for paragraph in document.paragraphs() ...@@ -195,3 +195,247 @@ sentences = (sentence for paragraph in document.paragraphs()
for sentence in sentences: for sentence in sentences:
print(cclutils.sentence2str(sentence)) print(cclutils.sentence2str(sentence))
``` ```
Reading annotations
===================
Extracting annotations from a CCL document is available via the
`cclutils.extras.annotations` module, built on top of the core ``cclutils``
functionality.
The main function of this module is ``get_document_annotations`` which reads
annotations from CCL document (from file or ``corpus2.DocumentPtr`` object).
```python
from cclutils.extras.annotations import get_document_annotations
```
The annotations are organized with the use of two classes:
1. ``AnnotatedExpression``: represents a single annotation (annotated expression),
located in a specified paragraph and sentence. The module supports annotations
describing single-word and multiword expressions (more than one token).
1. ``DocumentAnnotations``: keeps the annotations of an entire document and
provides methods to facilitate gathering and accessing them.
#### Read annotations of a given document
1. Read all annotations
```python
>>> anns = get_document_annotations(cclutils.read('tests/data/ccl02.xml'))
>>> anns
<DocumentAnnotations for 10 annotated expressions: [<AnnotatedExpression for
annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla',
'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for
annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa',
'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation
'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>,
<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)';
('hotel',) at position: ch2>s2>t0>, <AnnotatedExpression for annotation
'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)';
('śniadanie',) at position: ch2>s2>t3>, <AnnotatedExpression for annotation
'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>,
<AnnotatedExpression for annotation 'designation': 'designation:('dla',
'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>, <AnnotatedExpression
for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position:
ch2>s2>t13>, <AnnotatedExpression for annotation 'food': 'food:('pełnym',
'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]>
```
1. Read only specified annotations
```python
>>> anns = get_document_annotations(cclutils.read('tests/data/ccl02.xml'), annotations={'designation'})
>>> anns
<DocumentAnnotations for 2 annotated expressions: [<AnnotatedExpression for
annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla',
'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for
annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko')
at position: ch2>s2>t10,t11>]>
```
#### Get annotations in one of preferred forms
1. Get annotations index containing full information about annotations
* key is a tuple containing following values: (annotation channel name,
sentence id, paragraph id, channel numeric value)
```python
>>> anns.expressions_index
defaultdict(list,
{('designation',
's1',
'ch1',
1): <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
('room_type',
's1',
'ch1',
1): <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
('region',
's1',
'ch1',
1): <AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>,
('attraction',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
('hotel_name',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
('food',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>,
('room_type',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>,
('designation',
's2',
'ch2',
1): <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>,
('attraction',
's2',
'ch2',
2): <AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>,
('food',
's2',
'ch2',
2): <AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>})
```
1. Get annotations grouped by annotation channel name, in one of formats:
* annotation object
* orths
* preferred lexemes
* annotation base lemma
```python
>>> anns.group_by_chan_name()
defaultdict(list,
{'designation': [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
'room_type': [<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>],
'region': [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>],
'attraction': [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>],
'hotel_name': [<AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>],
'food': [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>,
<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]})
>>> anns.group_by_chan_name(as_orths=True)
defaultdict(list,
{'designation': [('dla', 'dwóch', 'osób'), ('dla', 'dzieci')],
'room_type': [('dla', 'dwóch', 'osób'), ('łazienką',)],
'region': [('Gdańsk',)],
'attraction': [('Hotel',), ('spa',)],
'hotel_name': [('Hotel',)],
'food': [('śniadaniem',), ('pełnym', 'wyżywieniem')]})
>>> anns.group_by_chan_name(as_lexemes=True)
defaultdict(list,
{'designation': [('dla', 'dwa', 'osoba'), ('dla', 'dziecko')],
'room_type': [('dla', 'dwa', 'osoba'), ('łazienka',)],
'region': [('Gdańsk',)],
'attraction': [('hotel',), ('spa',)],
'hotel_name': [('hotel',)],
'food': [('śniadanie',), ('pełny', 'wyżywienie')]})
>>> anns.group_by_chan_name(as_ann_base=True)
defaultdict(list,
{'designation': ['dla dwóch osób', 'dla dziecka'],
'room_type': ['dla dwóch osób', 'łazienka'],
'region': [''],
'attraction': ['hotel', 'spa'],
'hotel_name': ['Hotel'],
'food': ['śniadanie', 'pełne wyżywienie']})
```
1. Get annotations grouped by token (token position), in one of formats (usage
same as in case of ``group_by_chan_name`` method):
* annotation object
* orths
* preferred lexemes
* annotation base lemma
```python
>>> anns.group_by_token()
{(1,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type'
: 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(2,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type'
: 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(3,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type'
: 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(4,
's1',
'ch1'): [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>],
(0,
's2',
'ch2'): [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>, <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel
',) at position: ch2>s2>t0>],
(3,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>],
(7,
's2',
'ch2'): [<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>],
(10,
's2',
'ch2'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
(11,
's2',
'ch2'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
(13,
's2',
'ch2'): [<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>],
(17,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>],
(18,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]}
```
1. Get annotations grouped by token, with original document order (tokens
order):
```python
>>> anns.group_by_token(retain_order=True)
OrderedDict([((1, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
((2, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
((3, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
((4, 's1', 'ch1'),
[<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>]),
((0, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
<AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>]),
((3, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>]),
((7, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>]),
((10, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>]),
((11, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>]),
((13, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>]),
((17, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]),
((18, 's2', 'ch2'),
[<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>])])
```
#### Get token by token position
1. When using above methods, you may want to get ``corpus2.Token`` object
referenced by position:
```python
>>> anns.token_by_position_index[(17, 's2', 'ch2')]
<corpus2.Token; proxy of <Swig Object of type 'Corpus2::Token *' at 0x7f71edfced80> >
```
\ No newline at end of file
"""
Package contains extras extending base functionality of `cclutils`,
utilizing core `cclutils` functions.
"""
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
"""
Module provide easy way to read CCL annotations.
"""
from collections import defaultdict, OrderedDict
from typing import Any, Dict, Iterable, List, Set, Optional, Tuple, Union
import cclutils as ccl
from corpus2 import DocumentPtr, Tagset, Token
__all__ = ["AnnotatedExpression", "DocumentAnnotations", "get_document_annotations"]
AnnRepr = Union[Tuple[str, ...], str, "AnnotatedExpression"]
TagsetRepr = Union[str, Tagset]
TokenPosition = Tuple[int, str, str]
class AnnotatedExpression(object):
    """
    Representation of an annotated expression in a CCL document.

    An expression consists of one or more tokens located in a specific
    paragraph and sentence, and carries the name of its annotation channel.

    Note: for multiword annotations, only annotations covering adjacent
    tokens are supported.
    """

    def __init__(
        self,
        token: Token,
        ann_name: str,
        tok_position: TokenPosition,
        tagset: Optional[TagsetRepr] = "nkjp",
        base_ann_name: Optional[str] = None,
        doc: Optional[DocumentPtr] = None,
    ) -> None:
        """
        Initialize with a single token.

        More tokens (in case of a mwe expression) can be added later with the
        `append` method.

        Args:
            token: corpus2 token instance (reference).
            ann_name: name of annotation (annotation channel).
            tok_position: position of `token` in the document (tok_sent_idx,
                sent_id, par_id).
            tagset: `Tagset` instance or its name, defaults to 'nkjp'.
            base_ann_name: name of the property storing the base form of the
                annotation. If not given, '{ann_name}_base' will be used as
                the base property name.
            doc: related CCL document.
        """
        self._tokens: List[Token] = [token]
        self._ann_name = ann_name
        self._base_ann_name = base_ann_name
        # Caches, computed lazily by the corresponding properties.
        self._pref_lex: Optional[Tuple[str, ...]] = None
        self._tok_orths: Optional[Tuple[str, ...]] = None
        self._base_ann_lemma: Optional[str] = None
        self._doc = doc
        if isinstance(tagset, str):
            tagset = ccl.get_tagset(tagset)
        self.tagset = tagset
        self.toks_ids = {tok_position[0]}
        self.sent_id = tok_position[1]
        self.par_id = tok_position[2]

    @property
    def annotation_name(self) -> str:
        """
        Name of annotation channel.
        """
        return self._ann_name

    @property
    def base_annotation_name(self) -> str:
        """
        Name of the property holding the base form of the annotation.

        Defaults to the annotation name with '_base' appended.
        """
        if not self._base_ann_name:
            self._base_ann_name = f"{self.annotation_name}_base"
        return self._base_ann_name

    @property
    def length(self) -> int:
        """
        Returns length of annotated phrase (number of tokens).
        """
        return len(self._tokens)

    @property
    def position(self) -> Tuple[Iterable[int], str, str]:
        """
        Returns 'coordinates' (position) of the expression in the document.

        The position is composed of the numerically sorted token indices in
        the sentence (tokens do not have real identifiers), the sentence id
        and the paragraph id. Together they identify the tokens in the
        document.
        """
        toks_ids = tuple(sorted(self.toks_ids))
        return (toks_ids, self.sent_id, self.par_id)

    @property
    def tokens_pref_lexemes(self) -> Tuple[str, ...]:
        """
        Returns preferred lexemes of tokens referred by this annotation.
        """
        if self._pref_lex is None:
            self._pref_lex = _tokens_pref_lexemes(self._tokens, self.tagset)
        return self._pref_lex

    @property
    def tokens_pref_lexemes_lowered(self) -> Tuple[str, ...]:
        """
        Returns lowercased preferred lexemes of tokens referred by this
        annotation.
        """
        return tuple(l.lower() for l in self.tokens_pref_lexemes)

    @property
    def tokens_orths(self) -> Tuple[str, ...]:
        """
        Returns orths (original text forms) of tokens referred by this
        annotation.
        """
        if self._tok_orths is None:
            self._tok_orths = tuple(t.orth_utf8() for t in self._tokens)
        return self._tok_orths

    @property
    def base_annotation_lemma(self) -> str:
        """
        Returns the base lemma of the annotation.

        Scans the included tokens for the `self.base_annotation_name`
        metadata attribute and returns the first non-empty value found.

        Returns:
            base lemma or empty string ('') if not found.
        """
        if self._base_ann_lemma is None:
            for t in self._tokens:
                if not t.has_metadata():
                    continue
                md = t.get_metadata()
                if md.has_attribute(self.base_annotation_name):
                    value = md.get_attribute(self.base_annotation_name)
                    if value:
                        self._base_ann_lemma = value
                        break
            if self._base_ann_lemma is None:
                self._base_ann_lemma = ""  # there is no base annotation
        return self._base_ann_lemma

    def _format_position(self, sep: str) -> str:
        """
        Render the position as '<par><sep><sent><sep>t<i>,t<j>,...'.

        Fix: token indices are joined in their natural (numeric) order;
        previously the 't<i>' strings were sorted lexicographically, which
        ordered e.g. 't10' before 't2'.
        """
        indexes, sent, par = self.position
        toks = ",".join("t" + str(i) for i in indexes)
        return f"{par}{sep}{sent}{sep}{toks}"

    @property
    def position_str(self) -> str:
        """
        Returns textual representation of token position.
        """
        return self._format_position(":")

    def append(self, token: Token, tok_position: TokenPosition) -> None:
        """
        Extends annotation object by including the next token belonging to
        that annotation.

        Args:
            token: corpus2 token instance (reference).
            tok_position: (tok_sent_idx, sent_id, par_id)

        Raises:
            ValueError: if the token lies in a different sentence or
                paragraph, or was already added.
        """
        self._check_position(*tok_position)
        self._tokens.append(token)
        self.toks_ids.add(tok_position[0])
        # Fix: invalidate caches that depend on the token list, so values
        # computed before this append are not served stale afterwards.
        self._pref_lex = None
        self._tok_orths = None
        self._base_ann_lemma = None

    def get_alt_repr(
        self,
        as_orths: bool = False,
        as_lexemes: bool = False,
        as_ann_base: bool = False,
    ) -> Any:
        """
        Utility method to get the annotation in one of possible
        representations:
        1) as an `AnnotatedExpression` instance (default),
        2) as a tuple of orths,
        3) as a tuple of preferred lexemes,
        4) as the base of the annotation (if specified); may differ from 3)
           in case of mwe.

        Args:
            as_orths: returns orths instead of `AnnotatedExpression` instance.
            as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
                instance.
            as_ann_base: returns the annotation base lemma instead of
                `AnnotatedExpression` instance.

        Returns:
            depending on passed flags: `AnnotatedExpression`, a tuple of
            strings, or a string.

        Raises:
            ValueError: if more than one flag is enabled.
        """
        if sum(map(bool, (as_orths, as_lexemes, as_ann_base))) > 1:
            raise ValueError(
                "No more than one flag (as_orths, as_lexemes, as_ann_base) can be enabled!"
            )
        # Returns AnnRepr but mypy does not recognize it correctly, states
        # that it is an object.
        return {
            (False, False, False): self,
            (True, False, False): self.tokens_orths,
            (False, True, False): self.tokens_pref_lexemes,
            (False, False, True): self.base_annotation_lemma,
        }[(as_orths, as_lexemes, as_ann_base)]

    def _check_position(self, tok_idx: int, sent_id: str, par_id: str) -> None:
        """
        Checks whether a newly appended token is placed in the same paragraph
        and sentence as the tokens already present, and was not added before.
        """
        if self.par_id and self.par_id != par_id:
            raise ValueError(
                "Annotation tokens must be placed in the same "
                f"paragraph! ({par_id}, {self.par_id})"
            )
        if self.sent_id and self.sent_id != sent_id:
            raise ValueError(
                "Annotation tokens must be placed in the same "
                f"sentence! ({sent_id}, {self.sent_id})"
            )
        if self.toks_ids and tok_idx in self.toks_ids:
            raise ValueError(f"Token at position {tok_idx} already added!")

    def __eq__(self, other):
        """
        Two annotated expressions are equal when they have the same
        annotation name, length and position, and their lowercased preferred
        lexemes are equal.
        """
        if not isinstance(other, AnnotatedExpression):
            return False
        if self.annotation_name != other.annotation_name:
            return False
        if self.length != other.length:
            return False
        if self.position != other.position:
            return False
        # Fix: the original implementation fell through and returned None
        # (falsy) when either lexeme tuple was empty; compare the tuples
        # directly instead.
        return (
            self.tokens_pref_lexemes_lowered == other.tokens_pref_lexemes_lowered
        )

    def __hash__(self):
        # Hash over the same fields that __eq__ compares, to keep the
        # eq/hash contract consistent.
        return hash(
            (self.annotation_name, self.position, self.tokens_pref_lexemes_lowered)
        )

    def __repr__(self):
        expr_str = f"{self._ann_name}:{self.tokens_orths}"
        return (
            f"<AnnotatedExpression for annotation '{self._ann_name}': "
            f"'{expr_str}'; {self.tokens_pref_lexemes} at position: "
            f"{self._format_position('>')}>"
        )
class DocumentAnnotations(object):
    """
    Representation of annotations in a CCL document.

    Acts as a container keeping annotations and providing methods to
    facilitate gathering and accessing such annotations. Uses
    `AnnotatedExpression` as a representation of a single annotation.
    """

    def __init__(
        self,
        tagset: Optional[TagsetRepr] = "nkjp",
        doc: Optional[DocumentPtr] = None,
    ):
        """
        Args:
            tagset: `Tagset` instance or its name, defaults to 'nkjp'.
            doc: related CCL document.
        """
        self._doc = doc
        self.tagset = tagset
        # (ann_name, sent_id, par_id, chan_val) -> AnnotatedExpression
        self._ann_dict: Dict[Tuple[str, str, str, int], AnnotatedExpression] = {}
        # token position -> all annotations covering that token
        self._tok_dict: Dict[TokenPosition, List[AnnotatedExpression]] = defaultdict(
            list
        )
        # token position -> corpus2 token object
        self._tok_pos_to_tok: Dict[TokenPosition, Token] = {}

    @property
    def anns_names(self) -> Set[str]:
        """
        Set of unique annotation names (channel names) found in the document.
        """
        return {k[0] for k in self.expressions_index}

    @property
    def expressions_index(
        self,
    ) -> Dict[Tuple[str, str, str, int], AnnotatedExpression]:
        """
        Returns the index of all annotations found in the document.

        Returns:
            Dict:
                key: Tuple[annotation_name, sent_id, par_id, chan_val]
                value: AnnotatedExpression
        """
        return self._ann_dict

    @property
    def expressions(self) -> Iterable[AnnotatedExpression]:
        """
        Yields all annotations found in the document.
        """
        yield from self.expressions_index.values()

    @property
    def token_by_position_index(self) -> Dict[TokenPosition, Token]:
        """
        Index mapping a token position (as used in this class) to the
        corresponding token object.
        """
        return self._tok_pos_to_tok

    def append_token_with_ann(
        self,
        token: Token,
        tok_pos: TokenPosition,
        ann_name: str,
        chan_val: int,
        accepted: Optional[Set[str]] = None,
    ) -> None:
        """
        Append a token with a single annotation to the index.

        The token is ignored if the value in the annotation channel is 0
        (meaning that the token is not annotated), or if `accepted` is given
        and does not contain `ann_name`.

        Args:
            token: corpus2 token instance (reference).
            tok_pos: tuple with three values representing the position of the
                token in the document: (token_sent_index, sent_id, paragraph_id)
            ann_name: name of the annotation (annotation channel).
            chan_val: numeric channel value; tokens sharing the same
                (channel, value) within one sentence form one expression.
            accepted: if given, only tokens with annotations specified
                in this set will be added
        """
        if chan_val == 0 or (accepted and ann_name not in accepted):
            return
        sent_id, par_id = tok_pos[1], tok_pos[2]
        ann_dict_key = (ann_name, sent_id, par_id, chan_val)
        related_ann = self._ann_dict.get(ann_dict_key)
        if related_ann is not None:
            # Next token of an already registered (multiword) expression.
            related_ann.append(token, tok_pos)
        else:
            related_ann = AnnotatedExpression(
                token, ann_name, tok_pos, tagset=self.tagset, doc=self._doc
            )
            self._ann_dict[ann_dict_key] = related_ann
        self._tok_dict[tok_pos].append(related_ann)
        self._tok_pos_to_tok[tok_pos] = token

    def append_token_with_all_ann(
        self,
        token: Token,
        tok_pos: TokenPosition,
        ann_name_val_dict: Dict[str, int],
        accepted: Optional[Set[str]] = None,
    ) -> None:
        """
        Append a token together with all its related annotations to the index.

        Args:
            token: corpus2 token instance (reference).
            tok_pos: tuple with three values representing the position of the
                token in the document: (token_sent_index, sent_id, paragraph_id)
            ann_name_val_dict: mapping of channel name to channel value for
                this token.
            accepted: if given, only tokens with annotations specified
                in this set will be added
        """
        for ann, chan_val in ann_name_val_dict.items():
            self.append_token_with_ann(token, tok_pos, ann, chan_val, accepted=accepted)

    def group_by_chan_name(
        self,
        as_orths: bool = False,
        as_lexemes: bool = False,
        as_ann_base: bool = False,
    ) -> Dict[str, List[AnnRepr]]:
        """
        Returns annotations grouped by channel name, in one of the forms
        supported by `AnnotatedExpression.get_alt_repr`.

        Args:
            as_orths: returns orths instead of `AnnotatedExpression` instances
            as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
                instances
            as_ann_base: returns annotation base lemmas instead of
                `AnnotatedExpression` instances

        Returns:
            dict of annotations:
                key: annotation name.
                value: depending on passed flags, list of `AnnotatedExpression`
                    or list of tuples of strings.
        """
        grouped: Dict[str, List[AnnRepr]] = defaultdict(list)
        for key, ann_obj in self.expressions_index.items():
            chan_name = key[0]
            grouped[chan_name].append(
                ann_obj.get_alt_repr(as_orths, as_lexemes, as_ann_base)
            )
        return grouped

    def group_by_token(
        self,
        retain_order: bool = False,
        as_orths: bool = False,
        as_lexemes: bool = False,
        as_ann_base: bool = False,
    ) -> Dict[TokenPosition, List[AnnRepr]]:
        """
        Returns an index of token positions (as used in this class) and the
        corresponding annotations.

        Args:
            retain_order: If enabled, returns a dict ordered by token
                position (ascending sort: by paragraph id, then by sentence
                id, finally by token index). Disabled by default, to avoid
                additional computation time when order is not important.
                NOTE(review): paragraph/sentence ids are compared as strings,
                so e.g. 's10' sorts before 's2'; document order is only
                guaranteed when ids sort lexicographically — confirm.
            as_orths: returns orths instead of `AnnotatedExpression` instances
            as_lexemes: returns pref lexemes instead of `AnnotatedExpression`
                instances
            as_ann_base: returns annotation base lemmas instead of
                `AnnotatedExpression` instances

        Returns:
            dict:
                key: tuple representing token position.
                value: depending on passed flags, list of `AnnotatedExpression`
                    or list of tuples of strings.
                the concrete dictionary class depends on ordering:
                    without sorting: `dict`
                    document order: `OrderedDict`
        """
        grouped: Dict[TokenPosition, List[AnnRepr]] = {
            t_pos: [
                ann_obj.get_alt_repr(as_orths, as_lexemes, as_ann_base)
                for ann_obj in ann_objs
            ]
            for t_pos, ann_objs in self._tok_dict.items()
        }
        if not retain_order:
            return grouped
        return OrderedDict(
            sorted(grouped.items(), key=lambda e: (e[0][2], e[0][1], e[0][0]))
        )

    def __repr__(self):
        return (
            f"<DocumentAnnotations for {len(self._ann_dict)} annotated "
            f"expressions: {list(self._ann_dict.values())}>"
        )
def get_document_annotations(
    ccl_obj_or_path: Union[DocumentPtr, str],
    tagset: Optional[Union[Tagset, str]] = "nkjp",
    annotations: Optional[Set[str]] = None,
) -> DocumentAnnotations:
    """
    Finds annotations in a CCL document and returns them in a form allowing
    easy access to the most important information.

    By default all annotations (specified with the `ann` tag) are detected.
    The set of recognized annotations can be restricted by passing a set of
    annotation (channel) names via `annotations`.

    Check the documentation of the `DocumentAnnotations` class to find out
    how to use the returned object.

    Args:
        ccl_obj_or_path: CCL document (`corpus2.DocumentPtr` or `str` path).
        tagset: document tagset (`corpus2.Tagset` or `str` name).
        annotations: set of names (string) of annotations (annotation
            channels) to find. If not given, all annotated expressions are
            collected.

    Returns:
        `DocumentAnnotations` instance with gathered annotated tokens.
    """
    tagset = _as_corpus2_tagset(tagset)
    doc = _as_corpus2_doc(ccl_obj_or_path, tagset)
    collected = DocumentAnnotations(doc=doc, tagset=tagset)
    for paragraph in doc.paragraphs():
        for sentence in paragraph.sentences():
            for tok_idx, token in enumerate(sentence.tokens()):
                channel_values: Dict[str, int] = ccl.get_annotations(
                    sentence, token, tok_idx
                )
                position = (tok_idx, sentence.id(), paragraph.get_attribute("id"))
                collected.append_token_with_all_ann(
                    token, position, channel_values, accepted=annotations
                )
    return collected
def _as_corpus2_doc(ccl_obj_or_path, tagset):
    """
    Coerce the argument to a corpus2 document: a `DocumentPtr` is returned
    as-is, anything else is treated as a file path and read with `cclutils`.
    """
    if isinstance(ccl_obj_or_path, DocumentPtr):
        return ccl_obj_or_path
    return ccl.read(ccl_obj_or_path, tagset=tagset)
def _as_corpus2_tagset(tagset_obj_or_name):
    """
    Coerce the argument to a corpus2 tagset: a `Tagset` is returned as-is,
    anything else is treated as a tagset name and resolved with `cclutils`.
    """
    if isinstance(tagset_obj_or_name, Tagset):
        return tagset_obj_or_name
    return ccl.get_tagset(tagset_obj_or_name)
def _get_document_preferred_lexemes(
    doc: DocumentPtr, tagset: Tagset
) -> Tuple[str, ...]:
    """
    Returns preferred lexemes of every token in the document, flattened
    into a single tuple.

    The structure of the document (paragraph/sentence boundaries) does not
    impact the output tuple.
    """
    all_tokens = [
        token
        for paragraph in doc.paragraphs()
        for sentence in paragraph.sentences()
        for token in sentence.tokens()
    ]
    return _tokens_pref_lexemes(all_tokens, tagset)
def _tokens_pref_lexemes(tokens, tagset) -> Tuple[str, ...]:
"""
Returns tuple of preferred lexemes for given tokens.
"""
return tuple(t.get_preferred_lexeme(tagset).lemma_utf8() for t in tokens)
# Base stage: corpus2 Python bindings + the cclutils package installed.
FROM clarinpl/python:3.6 as cclutils-base
# Install OS-level corpus2 bindings; remove apt lists in the same layer
# so the downloaded package indexes do not bloat the image.
RUN apt-get update && apt-get install -y apt-transport-https \
        corpus2-python3.6 \
        corpus2mwe-python3.6 \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /home/install
COPY setup.py .
COPY cclutils ./cclutils
RUN python setup.py install

# Production stage: adds twine for uploading built wheels to the package index.
FROM cclutils-base as cclutils-prod
WORKDIR /home/install
RUN pip install twine

# Test stage: test sources plus test/dev dependencies; runs pytest by default.
FROM cclutils-base as cclutils-test
COPY tests ./tests
COPY requirements-test.txt .
RUN python -m pip install -r requirements-test.txt
COPY requirements-dev.txt .
RUN python -m pip install -r requirements-dev.txt
WORKDIR /home/install/tests
CMD ["pytest"]
ipython
ipykernel
mypy
pytest
pytest-icdiff
pytest-pudb
...@@ -3,10 +3,10 @@ from setuptools import setup ...@@ -3,10 +3,10 @@ from setuptools import setup
setup( setup(
name='cclutils', name='cclutils',
author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski', author='Arkadiusz Janz, Anna Gut, Dominik Kaszewski, Grzegorz Kostkowski',
description='''A convenient API based on Corpus2 library for analyzing textual description='''A convenient API based on Corpus2 library for analyzing textual
corpora in CCL format.''', corpora in CCL format.''',
version='1.0.3', version='1.1',
packages=['cclutils'], packages=['cclutils', 'cclutils.extras'],
zip_safe=False zip_safe=False
) )
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk id="ch1">
<sentence id="sent4">
<tok>
<orth>Oprócz</orth>
<lex disamb="1">
<base>oprócz</base>
<ctag>prep:gen</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>bogatej</orth>
<lex disamb="1">
<base>bogaty</base>
<ctag>adj:sg:gen:f:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>historii</orth>
<lex disamb="1">
<base>historia</base>
<ctag>subst:sg:gen:f</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1">
<base>,</base>
<ctag>interp</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>fascynujących</orth>
<lex disamb="1">
<base>fascynujący</base>
<ctag>adj:pl:gen:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>zabytków</orth>
<lex disamb="1">
<base>zabytek</base>
<ctag>subst:pl:gen:m3</ctag>
</lex>
<ann chan="attraction_classes">1</ann>
<prop key="attraction_classes_base">zabytek</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1">
<base>,</base>
<ctag>interp</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>będą</orth>
<lex disamb="1">
<base>być</base>
<ctag>bedzie:pl:ter:imperf</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>piękne</orth>
<lex disamb="1">
<base>piękny</base>
<ctag>adj:pl:nom:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>widoki</orth>
<lex disamb="1">
<base>widok</base>
<ctag>subst:pl:nom:m3</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>i</orth>
<lex disamb="1">
<base>i</base>
<ctag>conj</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>zachwycająca</orth>
<lex disamb="1">
<base>zachwycający</base>
<ctag>adj:sg:nom:f:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>przyroda</orth>
<lex disamb="1">
<base>przyroda</base>
<ctag>subst:sg:nom:f</ctag>
</lex>
<ann chan="attraction_classes">2</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1">
<base>.</base>
<ctag>interp</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>przyroda</orth>
<lex disamb="1">
<base>przyroda</base>
<ctag>subst:sg:nom:f</ctag>
</lex>
<ann chan="attraction_classes">3</ann>
</tok>
</sentence>
<sentence id="sent5">
<tok>
<orth>dużo</orth>
<lex disamb="1">
<base>dużo</base>
<ctag>adj:pl:gen:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>fascynujących</orth>
<lex disamb="1">
<base>fascynujący</base>
<ctag>adj:pl:gen:m3:pos</ctag>
</lex>
<ann chan="attraction_classes">0</ann>
</tok>
<tok>
<orth>zabytków</orth>
<lex disamb="1">
<base>zabytek</base>
<ctag>subst:pl:gen:m3</ctag>
</lex>
<ann chan="attraction_classes">1</ann>
<prop key="attraction_classes_base">zabytek</prop>
</tok>
</sentence>
</chunk>
</chunkList>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk id="ch1">
<sentence id="s1">
<tok>
<orth>Wycieczka</orth>
<lex disamb="1"><base>wycieczka</base><ctag>subst:sg:nom:f</ctag></lex>
<ann chan="designation">0</ann>
<ann chan="region">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>dla</orth>
<lex disamb="1"><base>dla</base><ctag>prep:gen</ctag></lex>
<ann chan="designation">1</ann>
<ann chan="region">0</ann>
<ann chan="room_type">1</ann>
<prop key="designation_base">dla dwóch osób</prop>
<prop key="room_type_base">dla dwóch osób</prop>
</tok>
<tok>
<orth>dwóch</orth>
<lex disamb="1"><base>dwa</base><ctag>num:pl:nom:m1:rec</ctag></lex>
<ann chan="designation">1</ann>
<ann chan="region">0</ann>
<ann chan="room_type">1</ann>
</tok>
<tok>
<orth>osób</orth>
<lex disamb="1"><base>osoba</base><ctag>subst:pl:gen:f</ctag></lex>
<ann chan="designation">1</ann>
<ann chan="region">0</ann>
<ann chan="room_type">1</ann>
<prop key="region_base">Osobie</prop>
</tok>
<tok>
<orth>Gdańsk</orth>
<lex disamb="1"><base>Gdańsk</base><ctag>subst:sg:nom:m3</ctag></lex>
<ann chan="designation">0</ann>
<ann chan="region">1</ann>
<ann chan="room_type">0</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
<ann chan="designation">0</ann>
<ann chan="region">0</ann>
<ann chan="room_type">0</ann>
</tok>
</sentence>
</chunk>
<chunk id="ch2">
<sentence id="s2">
<tok>
<orth>Hotel</orth>
<lex disamb="1"><base>hotel</base><ctag>subst:sg:nom:m3</ctag></lex>
<ann chan="attraction">1</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">1</ann>
<ann chan="room_type">0</ann>
<prop key="attraction_base">hotel</prop>
<prop key="hotel_name_base">Hotel</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>ze</orth>
<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>śniadaniem</orth>
<lex disamb="1"><base>śniadanie</base><ctag>subst:sg:nom:m3</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">1</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="food_base">śniadanie</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>z</orth>
<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>prywatną</orth>
<lex disamb="1"><base>prywatny</base><ctag>adj:sg:acc:f:pos</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>łazienką</orth>
<lex disamb="1"><base>łazienka</base><ctag>subst:sg:inst:f</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">1</ann>
<prop key="room_type_base">łazienka</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>atrakcje</orth>
<lex disamb="1"><base>atrakcja</base><ctag>subst:pl:nom:f</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>dla</orth>
<lex disamb="1"><base>dla</base><ctag>prep:gen</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">1</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="designation_base">dla dziecka</prop>
</tok>
<tok>
<orth>dzieci</orth>
<lex disamb="1"><base>dziecko</base><ctag>subst:pl:nom:n</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">1</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>spa</orth>
<lex disamb="1"><base>spa</base><ctag>subst:sg:nom:n</ctag></lex>
<ann chan="attraction">2</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="attraction_base">spa</prop>
</tok>
<ns/>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>,</orth>
<lex disamb="1"><base>,</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>z</orth>
<lex disamb="1"><base>z</base><ctag>prep:gen:nwok</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<tok>
<orth>pełnym</orth>
<lex disamb="1"><base>pełny</base><ctag>adj:sg:nom:n:pos</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">2</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
<prop key="food_base">pełne wyżywienie</prop>
</tok>
<tok>
<orth>wyżywieniem</orth>
<lex disamb="1"><base>wyżywienie</base><ctag>subst:sg:nom:n</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">2</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
<ann chan="attraction">0</ann>
<ann chan="designation">0</ann>
<ann chan="food">0</ann>
<ann chan="hotel_name">0</ann>
<ann chan="room_type">0</ann>
</tok>
</sentence>
</chunk>
</chunkList>
# coding: utf-8
from collections import OrderedDict
import os
from typing import Dict, Tuple
from cclutils.extras.annotations import get_document_annotations
TEST_ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
TEST_DATA_DIR = os.path.join(TEST_ROOT_DIR, "data")
CCL_TEST_PATH_01 = os.path.join(TEST_DATA_DIR, "ccl01.xml")
CCL_TEST_PATH_02 = os.path.join(TEST_DATA_DIR, "ccl02.xml")
def as_expressions_orth_index(ann_expr_index) -> Dict[Tuple[str, str, str, int], str]:
    """Map each expression key to the orths tuple of its annotation object."""
    orth_index = {}
    for key, annotation in ann_expr_index.items():
        orth_index[key] = annotation.tokens_orths
    return orth_index
def test_simple():
    """Single-channel document: all four annotated expressions are detected."""
    channel = "attraction_classes"
    doc_anns = get_document_annotations(CCL_TEST_PATH_01, "nkjp")
    by_channel = doc_anns.group_by_chan_name()
    assert len(by_channel[channel]) == 4
def test_complex():
    # Multi-channel document read with the default tagset; checks channel
    # discovery, the per-expression orth index, and both grouping views.
    anns = get_document_annotations(CCL_TEST_PATH_02)
    # All annotation channels present in the document are discovered.
    assert anns.anns_names == {
        "attraction",
        "designation",
        "food",
        "hotel_name",
        "region",
        "room_type",
    }
    # Keys are (channel, sentence id, chunk id, annotation number);
    # values are the orth forms of the tokens forming each expression.
    expressions_orth_index = as_expressions_orth_index(anns.expressions_index)
    assert expressions_orth_index == {
        ("designation", "s1", "ch1", 1): ("dla", "dwóch", "osób"),
        ("room_type", "s1", "ch1", 1): ("dla", "dwóch", "osób"),
        ("region", "s1", "ch1", 1): ("Gdańsk",),
        ("attraction", "s2", "ch2", 1): ("Hotel",),
        ("hotel_name", "s2", "ch2", 1): ("Hotel",),
        ("food", "s2", "ch2", 1): ("śniadaniem",),
        ("room_type", "s2", "ch2", 1): ("łazienką",),
        ("designation", "s2", "ch2", 1): ("dla", "dzieci"),
        ("attraction", "s2", "ch2", 2): ("spa",),
        ("food", "s2", "ch2", 2): ("pełnym", "wyżywieniem"),
    }
    # Grouping by channel with as_ann_base=True returns the `<prop>` base
    # forms; "region" has no base prop in the fixture, hence the empty string.
    assert anns.group_by_chan_name(as_ann_base=True) == {
        "designation": ["dla dwóch osób", "dla dziecka"],
        "room_type": ["dla dwóch osób", "łazienka"],
        "region": [""],
        "attraction": ["hotel", "spa"],
        "hotel_name": ["Hotel"],
        "food": ["śniadanie", "pełne wyżywienie"],
    }
    # Grouping by token preserves document order (OrderedDict) and lists, for
    # every annotated token position, all expressions that cover it — one
    # entry per channel, so tokens in overlapping channels appear twice.
    assert anns.group_by_token(retain_order=True, as_orths=True) == OrderedDict(
        [
            ((1, "s1", "ch1"), [("dla", "dwóch", "osób"), ("dla", "dwóch", "osób")]),
            ((2, "s1", "ch1"), [("dla", "dwóch", "osób"), ("dla", "dwóch", "osób")]),
            ((3, "s1", "ch1"), [("dla", "dwóch", "osób"), ("dla", "dwóch", "osób")]),
            ((4, "s1", "ch1"), [("Gdańsk",)]),
            ((0, "s2", "ch2"), [("Hotel",), ("Hotel",)]),
            ((3, "s2", "ch2"), [("śniadaniem",)]),
            ((7, "s2", "ch2"), [("łazienką",)]),
            ((10, "s2", "ch2"), [("dla", "dzieci")]),
            ((11, "s2", "ch2"), [("dla", "dzieci")]),
            ((13, "s2", "ch2"), [("spa",)]),
            ((17, "s2", "ch2"), [("pełnym", "wyżywieniem")]),
            ((18, "s2", "ch2"), [("pełnym", "wyżywieniem")]),
        ]
    )
def test_restricted_ann_set():
    # Passing `annotations` restricts collection to the listed channels:
    # only those names are reported and only their expressions are indexed.
    anns = get_document_annotations(
        CCL_TEST_PATH_02,
        annotations={
            "designation",
            "food",
            "region",
        },
    )
    # Channels outside the accepted set (attraction, hotel_name, room_type)
    # must not be reported even though they exist in the document.
    assert anns.anns_names == {
        "designation",
        "food",
        "region",
    }
    # The expression index contains only expressions of the accepted channels.
    expressions_orth_index = as_expressions_orth_index(anns.expressions_index)
    assert expressions_orth_index == {
        ("designation", "s1", "ch1", 1): ("dla", "dwóch", "osób"),
        ("region", "s1", "ch1", 1): ("Gdańsk",),
        ("food", "s2", "ch2", 1): ("śniadaniem",),
        ("designation", "s2", "ch2", 1): ("dla", "dzieci"),
        ("food", "s2", "ch2", 2): ("pełnym", "wyżywieniem"),
    }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment