Analysers
cclutils

Repository

wget -q -O - https://apt.clarin-pl.eu/KEY.gpg | apt-key add -
echo 'deb https://apt.clarin-pl.eu/ /' > /etc/apt/sources.list.d/clarin.list

apt-get update && apt-get install corpus2-python3.6
FROM clarinpl/python:3.6

RUN apt-get update && apt-get install -y \
    corpus2-python3.6

RUN pip install --upgrade pip && pip install cclutils
pip install cclutils --extra-index-url https://pypi.clarin-pl.eu/
import cclutils

filepath = './example.xml'
document = cclutils.read(filepath)

cclpath = './example.xml'
relpath = './example.rel.xml'
document = cclutils.read(cclpath, relpath)
document = cclutils.read(cclpath, relpath, 'nkjp')
document = cclutils.read(filepath)
...
cclutils.write(document, './out.xml')
cclutils.write(document, './out.xml', rel_path='./out.rel.xml')
cclutils.write(document, './out.xml', rel_path='./out.rel.xml', tagset='spacy')
tagset = cclutils.get_tagset('nkjp')
tagset = cclutils.get_tagset('spacy')
...
document = cclutils.read('./example.xml')
for paragraph in document.paragraphs():
    ...
    for sentence in paragraph.sentences():
        ...
        for token in sentence.tokens():
            ...
document = cclutils.read('./example.xml')

# tokens is a generator:
tokens = (token for paragraph in document.paragraphs()
    for sentence in paragraph.sentences()
    for token in sentence.tokens())

for token in tokens:
    ...
it = read_chunks_it(ccl_path)
for paragraph in it:
    pass

it = read_sentences_it(ccl_path)
for sentence in it:
    pass
>>> tagset = cclutils.get_tagset('nkjp')
>>> get_pos(token, tagset)
'subst:pl:inst:f'

>>> tagset = cclutils.get_tagset('nkjp')
>>> get_pos(token, tagset, main_only=True)
'subst'
>>> tagset = cclutils.get_tagset('nkjp')
>>> get_coarse_pos(token, tagset)
'noun'
>>> convert_to_coarse_pos('subst')
'noun'
>>> get_lexeme_lemma(token)
'samolot'
>>> token.after_space()
True
>>> token.set_wa(False)
>>> token.after_space()
False
document = cclutils.read('./example.xml')

sentences = (sentence for paragraph in document.paragraphs()
    for sentence in paragraph.sentences())

for sentence in sentences:
    print(cclutils.sentence2str(sentence))
from cclutils.extras.annotations import get_document_annotations
>>> anns = get_document_annotations(cclutils.read('tests/data/ccl02.xml'))
>>> anns
<DocumentAnnotations for 10 annotated expressions: [<AnnotatedExpression for
    annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla',
    'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for
    annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa',
    'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation
    'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>,
    <AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)';
    ('hotel',) at position: ch2>s2>t0>, <AnnotatedExpression for annotation
    'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
    <AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)';
    ('śniadanie',) at position: ch2>s2>t3>, <AnnotatedExpression for annotation
    'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>,
    <AnnotatedExpression for annotation 'designation': 'designation:('dla',
    'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>, <AnnotatedExpression
    for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position:
    ch2>s2>t13>, <AnnotatedExpression for annotation 'food': 'food:('pełnym',
    'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]>
>>> anns = get_document_annotations(cclutils.read('tests/data/ccl02.xml'), annotations={'designation'})
>>> anns
<DocumentAnnotations for 2 annotated expressions: [<AnnotatedExpression for
    annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla',
    'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for
    annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko')
    at position: ch2>s2>t10,t11>]>
>>> anns.expressions_index
defaultdict(list,
            {('designation',
            's1',
            'ch1',
            1): <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            ('room_type',
            's1',
            'ch1',
            1): <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            ('region',
            's1',
            'ch1',
            1): <AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>,
            ('attraction',
            's2',
            'ch2',
            1): <AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
            ('hotel_name',
            's2',
            'ch2',
            1): <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
            ('food',
            's2',
            'ch2',
            1): <AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>,
            ('room_type',
            's2',
            'ch2',
            1): <AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>,
            ('designation',
            's2',
            'ch2',
            1): <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>,
            ('attraction',
            's2',
            'ch2',
            2): <AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>,
            ('food',
            's2',
            'ch2',
            2): <AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>})

>>> anns.group_by_chan_name()
defaultdict(list,
            {'designation': [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            <AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
            'room_type': [<AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            <AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>],
            'region': [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>],
            'attraction': [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
            <AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>],
            'hotel_name': [<AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>],
            'food': [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>,
            <AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]})

>>> anns.group_by_chan_name(as_orths=True)
defaultdict(list,
            {'designation': [('dla', 'dwóch', 'osób'), ('dla', 'dzieci')],
            'room_type': [('dla', 'dwóch', 'osób'), ('łazienką',)],
            'region': [('Gdańsk',)],
            'attraction': [('Hotel',), ('spa',)],
            'hotel_name': [('Hotel',)],
            'food': [('śniadaniem',), ('pełnym', 'wyżywieniem')]})

>>> anns.group_by_chan_name(as_lexemes=True)
defaultdict(list,
            {'designation': [('dla', 'dwa', 'osoba'), ('dla', 'dziecko')],
            'room_type': [('dla', 'dwa', 'osoba'), ('łazienka',)],
            'region': [('Gdańsk',)],
            'attraction': [('hotel',), ('spa',)],
            'hotel_name': [('hotel',)],
            'food': [('śniadanie',), ('pełny', 'wyżywienie')]})

>>> anns.group_by_chan_name(as_ann_base=True)
defaultdict(list,
            {'designation': ['dla dwóch osób', 'dla dziecka'],
            'room_type': ['dla dwóch osób', 'łazienka'],
            'region': [''],
            'attraction': ['hotel', 'spa'],
            'hotel_name': ['Hotel'],
            'food': ['śniadanie', 'pełne wyżywienie']})
>>> anns.group_by_token()
{(1,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(2,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(3,
's1',
'ch1'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>, <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>],
(4,
's1',
'ch1'): [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>],
(0,
's2',
'ch2'): [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>, <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>],
(3,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>],
(7,
's2',
'ch2'): [<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>],
(10,
's2',
'ch2'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
(11,
's2',
'ch2'): [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>],
(13,
's2',
'ch2'): [<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>],
(17,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>],
(18,
's2',
'ch2'): [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]}
>>> anns.group_by_token(retain_order=True)
OrderedDict([((1, 's1', 'ch1'),
            [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
            ((2, 's1', 'ch1'),
            [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
            ((3, 's1', 'ch1'),
            [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>,
            <AnnotatedExpression for annotation 'room_type': 'room_type:('dla', 'dwóch', 'osób')'; ('dla', 'dwa', 'osoba') at position: ch1>s1>t1,t2,t3>]),
            ((4, 's1', 'ch1'),
            [<AnnotatedExpression for annotation 'region': 'region:('Gdańsk',)'; ('Gdańsk',) at position: ch1>s1>t4>]),
            ((0, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'attraction': 'attraction:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>,
            <AnnotatedExpression for annotation 'hotel_name': 'hotel_name:('Hotel',)'; ('hotel',) at position: ch2>s2>t0>]),
            ((3, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'food': 'food:('śniadaniem',)'; ('śniadanie',) at position: ch2>s2>t3>]),
            ((7, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'room_type': 'room_type:('łazienką',)'; ('łazienka',) at position: ch2>s2>t7>]),
            ((10, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>]),
            ((11, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'designation': 'designation:('dla', 'dzieci')'; ('dla', 'dziecko') at position: ch2>s2>t10,t11>]),
            ((13, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'attraction': 'attraction:('spa',)'; ('spa',) at position: ch2>s2>t13>]),
            ((17, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>]),
            ((18, 's2', 'ch2'),
            [<AnnotatedExpression for annotation 'food': 'food:('pełnym', 'wyżywieniem')'; ('pełny', 'wyżywienie') at position: ch2>s2>t17,t18>])])
>>> anns.token_by_position_index[(17, 's2', 'ch2')]
<corpus2.Token; proxy of <Swig Object of type 'Corpus2::Token *' at 0x7f71edfced80> >