Commit f18a0312 authored by Arkadiusz Janz

Merge branch 'develop' into 'master'

Sentence/chunk generators

See merge request !1
parents 212a8e4a 72657010
......@@ -159,3 +159,52 @@ def get_tagset(tagset):
if isinstance(tagset, str):
tagset = corpus2.get_named_tagset(tagset)
return tagset
def read_chunks_it(filepath, tagset='nkjp'):
    """Lazily yield successive chunks read from a CCL file.

    Args:
        filepath: path to the CCL file to read.
        tagset: name of the tagset used in the document, or a tagset
            object itself.

    Yields:
        Chunks in the order the reader returns them. Iteration stops at
        the first falsy value returned by the reader.
    """
    resolved_tagset = get_tagset(tagset)
    ccl_reader = corpus2.TokenReader_create_path_reader(
        'ccl', resolved_tagset, filepath)

    # Prime the loop with the first chunk, then keep pulling until the
    # reader hands back a falsy value.
    current = ccl_reader.get_next_chunk()
    while current:
        yield current
        current = ccl_reader.get_next_chunk()

    # Drop the local reference to the reader once the input is exhausted.
    del ccl_reader
def read_sentences_it(filepath, tagset='nkjp'):
    """Lazily yield successive sentences read from a CCL file.

    Args:
        filepath: path to the CCL file to read.
        tagset: name of the tagset used in the document, or a tagset
            object itself.

    Yields:
        Sentences in the order the reader returns them. Iteration stops
        at the first falsy value returned by the reader.
    """
    resolved_tagset = get_tagset(tagset)
    ccl_reader = corpus2.TokenReader_create_path_reader(
        'ccl', resolved_tagset, filepath)

    # Prime the loop with the first sentence, then keep pulling until the
    # reader hands back a falsy value.
    current = ccl_reader.get_next_sentence()
    while current:
        yield current
        current = ccl_reader.get_next_sentence()

    # Drop the local reference to the reader once the input is exhausted.
    del ccl_reader
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment