diff --git a/README.md b/README.md index d10e8a0339e63b1ecd27df785bed17e8d736d226..3b00726f3b6cf92060c6ff8df2e7af481bac71c0 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,25 @@ A convenient API based on Corpus2 library for reading, writing, and processing textual corpora represented as CCL (XML) documents. +###### Requirements +python3.6, corpus2 + +To install Corpus2 you have to add new source for APT: + +```bash +wget -q -O - http://apt.clarin-pl.eu/KEY.gpg | apt-key add - +echo 'deb https://apt.clarin-pl.eu/ /' > /etc/apt/sources.list.d/clarin.list + +apt-get update && apt-get install corpus2-python3.6 +``` + +Install +======= + +```bash +pip install cclutils --extra-index-url https://pypi.clarin-pl.eu/ +``` + IO ====== @@ -81,15 +100,72 @@ document = cclutils.read('./example.xml') tokens = (token for paragraph in document.paragraphs() for sentence in paragraph.sentences() for token in sentence.tokens()) + +for token in tokens: + ... +``` + +To avoid loading large CCL documents to RAM (DOM parsers) we can read them +iteratively, chunk by chunk, or sentence by sentence (SAX-based approach): + +```python +it = read_chunks_it(ccl_path) +for paragraph in it: + pass + +it = read_sentences_it(ccl_path) +for sentence in it: + pass ``` Token manipulation ================== -1. Get Part-of-Speech (simple) +1. Get Part-of-Speech (simple, returns complete <ctag>) ```python -tagset = cclutils.get_tagset('nkjp') -... -pos = get_pos(token, tagset) +>>> tagset = cclutils.get_tagset('nkjp') +>>> get_pos(token, tagset) +'subst:pl:inst:f' + +``` + +2. Get Part-of-Speech (main_only, returns only the main part of <ctag>) + +```python +>>> tagset = cclutils.get_tagset('nkjp') +>>> get_pos(token, tagset, main_only=True) +'subst' +``` + +3. Get coarse-grained PoS (NKJP only for now) + +```python +>>> tagset = cclutils.get_tagset('nkjp') +>>> get_coarse_pos(token, tagset) +'noun' +``` + +4. 
Convert to coarse-grained PoS (NKJP only for now)
+
+```python
+>>> convert_to_coarse_pos('subst')
+'noun'
+```
+
+5. Get token lemma
+
+```python
+>>> get_lexeme_lemma(token)
+'samolot'
+```
+
+6. Check if a token is preceded by whitespace. Add or remove a whitespace.
+
+```python
+>>> token.after_space()
+True
+>>> token.set_wa(False)
+>>> token.after_space()
+False
 ```
\ No newline at end of file
diff --git a/cclutils/_base.py b/cclutils/_base.py
index 971c01b6cb608e4b3ab2eb3c6c6121b0fd4aee5e..48bd5e8423706219d58ce628ae859ea2d9481757 100644
--- a/cclutils/_base.py
+++ b/cclutils/_base.py
@@ -159,3 +159,52 @@ def get_tagset(tagset):
     if isinstance(tagset, str):
         tagset = corpus2.get_named_tagset(tagset)
     return tagset
+
+
+def read_chunks_it(filepath, tagset='nkjp'):
+    """ Returns an iterable chunk (paragraph) generator.
+
+    Args:
+        filepath: a path to a CCL file
+        tagset: the name of the tagset that is used in the document or a tagset object itself.
+
+    Returns:
+        an iterable chunk generator.
+    """
+    tagset = get_tagset(tagset)
+    reader = corpus2.TokenReader_create_path_reader('ccl', tagset, filepath)
+
+    while True:
+        chunk = reader.get_next_chunk()
+
+        if not chunk:
+            break
+        yield chunk
+
+    del reader
+
+
+def read_sentences_it(filepath, tagset='nkjp'):
+    """ Returns an iterable sentence generator.
+
+    Args:
+        filepath: a path to a CCL file
+        tagset: the name of the tagset that is used in the document or a tagset object itself.
+
+    Returns:
+        an iterable sentence generator.
+
+    """
+    tagset = get_tagset(tagset)
+    reader = corpus2.TokenReader_create_path_reader('ccl', tagset, filepath)
+
+    while True:
+        sentence = reader.get_next_sentence()
+
+        if not sentence:
+            break
+        yield sentence
+
+    del reader
+