Commit d46fe12a authored by Piotr Wątorski

Sentence/chunk generators

parent 212a8e4a
......@@ -159,3 +159,53 @@ def get_tagset(tagset):
if isinstance(tagset, str):
tagset = corpus2.get_named_tagset(tagset)
return tagset
def read_chunks_it(filepath, tagset='nkjp'):
    """Lazily yield the chunks of a CCL document, one at a time.

    Args:
        filepath: a path to a CCL file.
        tagset: the name of the tagset that is used in the document or a
            tagset object itself.

    Yields:
        Successive chunks as returned by the reader's ``get_next_chunk()``;
        iteration stops at the first falsy value it returns.
    """
    tagset = get_tagset(tagset)
    reader = corpus2.TokenReader_create_path_reader('ccl', tagset, filepath)
    try:
        while True:
            chunk = reader.get_next_chunk()
            if not chunk:
                break
            yield chunk
    finally:
        # Drop the reader reference even when the consumer stops iterating
        # early or reading raises, not only after the document is exhausted.
        del reader
def read_sentences_it(filepath, tagset='nkjp'):
    """Lazily yield the sentences of a CCL document, one at a time.

    Args:
        filepath: a path to a CCL file.
        tagset: the name of the tagset that is used in the document or a
            tagset object itself.

    Yields:
        Successive sentences as returned by the reader's
        ``get_next_sentence()``; iteration stops at the first falsy value
        it returns.
    """
    tagset = get_tagset(tagset)
    reader = corpus2.TokenReader_create_path_reader('ccl', tagset, filepath)
    try:
        while True:
            sentence = reader.get_next_sentence()
            if not sentence:
                break
            yield sentence
    finally:
        # Drop the reader reference even when the consumer stops iterating
        # early or reading raises, not only after the document is exhausted.
        del reader
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment