cclutils
A convenient API, based on the Corpus2 library, for reading, writing, and processing textual corpora represented as CCL (XML) documents.
Requirements
Python 3.6, Corpus2

To install Corpus2 you first need to add a new APT source:

```shell
wget -q -O - http://apt.clarin-pl.eu/KEY.gpg | apt-key add -
echo 'deb https://apt.clarin-pl.eu/ /' > /etc/apt/sources.list.d/clarin.list
apt-get update && apt-get install corpus2-python3.6
```
It is also possible to use Docker:

```dockerfile
FROM clarinpl/python:3.6

RUN apt-get update && apt-get install -y \
    corpus2-python3.6

RUN pip install --upgrade pip && pip install cclutils
```
Install

```shell
pip install cclutils --extra-index-url https://pypi.clarin-pl.eu/
```
IO
Read a CCL file:

```python
import cclutils

filepath = './example.xml'
document = cclutils.read(filepath)
```
Read a CCL file together with relations (a REL file):

```python
cclpath = './example.xml'
relpath = './example.rel.xml'
document = cclutils.read(cclpath, relpath)
```
Specify the tagset:

```python
document = cclutils.read(cclpath, relpath, 'nkjp')
```
Write CCL

```python
document = cclutils.read(filepath)
...
cclutils.write(document, './out.xml')
```

or with relations:

```python
cclutils.write(document, './out.xml', rel_path='./out.rel.xml')
```

or specify the tagset:

```python
cclutils.write(document, './out.xml', rel_path='./out.rel.xml', tagset='spacy')
```
Get a tagset object

```python
tagset = cclutils.get_tagset('nkjp')
tagset = cclutils.get_tagset('spacy')
...
```
Document structure
The CCL format specifies a basic segmentation structure, mainly paragraphs (`<chunk>`), sentences (`<sentence>`), and tokens (`<token>`). To iterate over a document we can use dedicated API functions:
```python
document = cclutils.read('./example.xml')
for paragraph in document.paragraphs():
    ...
    for sentence in paragraph.sentences():
        ...
        for token in sentence.tokens():
            ...
```
We can also create a generator for iterating only the tokens, in a more Pythonic way:

```python
document = cclutils.read('./example.xml')

# tokens is a generator:
tokens = (token for paragraph in document.paragraphs()
                for sentence in paragraph.sentences()
                for token in sentence.tokens())
for token in tokens:
    ...
```
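The same flattening can also be written with `itertools.chain.from_iterable`. The sketch below uses nested plain lists as stand-ins for the paragraph/sentence/token hierarchy, since it only illustrates the iteration pattern, not the Corpus2 objects themselves:

```python
from itertools import chain

# Stand-in document: paragraphs -> sentences -> tokens (plain lists,
# mimicking the paragraphs()/sentences()/tokens() hierarchy).
document = [
    [["Ala", "ma", "kota"], ["Kot", "spi"]],   # paragraph 1
    [["Samolot", "leci"]],                     # paragraph 2
]

# Flatten paragraphs into sentences, then sentences into tokens, lazily.
sentences = chain.from_iterable(document)
tokens = list(chain.from_iterable(sentences))

print(tokens)  # all tokens in document order
```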
To avoid loading large CCL documents into RAM (as DOM parsers do), we can read them iteratively, chunk by chunk or sentence by sentence (a SAX-based approach):

```python
from cclutils import read_chunks_it, read_sentences_it

it = read_chunks_it(ccl_path)
for paragraph in it:
    pass

it = read_sentences_it(ccl_path)
for sentence in it:
    pass
```
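To make the streaming idea concrete, here is a minimal, self-contained sketch of SAX-style incremental parsing with `xml.etree.ElementTree.iterparse` over a tiny CCL-like snippet. This illustrates the general approach only; it is not how cclutils is implemented internally:

```python
import io
import xml.etree.ElementTree as ET

# A tiny CCL-like document kept in memory for the example;
# in practice you would pass a file path instead.
ccl = io.BytesIO(b"""<chunkList>
  <chunk><sentence id="s1"/><sentence id="s2"/></chunk>
  <chunk><sentence id="s3"/></chunk>
</chunkList>""")

sentence_ids = []
for event, elem in ET.iterparse(ccl, events=("end",)):
    if elem.tag == "sentence":
        sentence_ids.append(elem.get("id"))
        elem.clear()  # free the element once processed, keeping memory flat

print(sentence_ids)  # ['s1', 's2', 's3']
```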
Token manipulation
- Get the part of speech (simple; returns the complete tag):

```python
>>> tagset = cclutils.get_tagset('nkjp')
>>> get_pos(token, tagset)
'subst:pl:inst:f'
```

- Get the part of speech (`main_only`; returns only the main part of the tag):

```python
>>> tagset = cclutils.get_tagset('nkjp')
>>> get_pos(token, tagset, main_only=True)
'subst'
```

- Get the coarse-grained PoS (NKJP only for now):

```python
>>> tagset = cclutils.get_tagset('nkjp')
>>> get_coarse_pos(token, tagset)
'noun'
```

- Convert to a coarse-grained PoS (NKJP only for now):

```python
>>> convert_to_coarse_pos('subst')
'noun'
```

- Get the token lemma:

```python
>>> get_lexeme_lemma(token)
'samolot'
```

- Check whether a token is preceded by whitespace, and add or remove the whitespace:

```python
>>> token.after_space()
True
>>> token.set_wa(False)
>>> token.after_space()
False
```
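One place the whitespace flag matters is when reconstructing the surface text of a sentence. The sketch below uses a minimal stand-in `Token` class (hypothetical; real tokens come from Corpus2 and have a richer interface) just to show the join logic:

```python
# Minimal stand-in for a token: only the parts needed for this example.
class Token:
    def __init__(self, orth, after_space=True):
        self._orth = orth
        self._after_space = after_space

    def orth(self):
        return self._orth

    def after_space(self):
        return self._after_space


def detokenize(tokens):
    """Concatenate token orths, inserting a space before each token
    that is marked as preceded by whitespace."""
    parts = []
    for i, token in enumerate(tokens):
        if i > 0 and token.after_space():
            parts.append(" ")
        parts.append(token.orth())
    return "".join(parts)


tokens = [Token("Ala"), Token("ma"), Token("kota"), Token(".", after_space=False)]
print(detokenize(tokens))  # Ala ma kota.
```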
Sentence manipulation
- Print the sentences of a given document:

```python
document = cclutils.read('./example.xml')
sentences = (sentence for paragraph in document.paragraphs()
                      for sentence in paragraph.sentences())
for sentence in sentences:
    print(cclutils.sentence2str(sentence))
```