Skip to content
Snippets Groups Projects

Develop

Merged Grzegorz Kostkowski requested to merge develop into master
4 files
+ 113
30
Compare changes
  • Side-by-side
  • Inline
Files
4
+ 30
0
@@ -12,7 +12,10 @@ __all__ = [
'read',
'write',
'read_from_directory',
'read_chunks_it',
'read_sentences_it',
'get_tagset',
'sentence2str'
]
@@ -206,3 +209,30 @@ def read_sentences_it(filepath, tagset='nkjp'):
yield sentence
del reader
def sentence2str(sentence, use_bases=False, tagset='nkjp'):
    """Render a corpus2.Sentence as a plain string.

    Args:
        sentence: a sentence object (corpus2.Sentence).
        use_bases: if set to True, each token contributes the lemma of
            its preferred lexeme instead of its orth.
        tagset: a tagset name (resolved with corpus2.get_named_tagset)
            or any other value, which is handed unchanged to
            get_preferred_lexeme when use_bases is True.

    Returns:
        the token strings joined together, with a single space in front
        of every token whose after_space() is true, and with leading and
        trailing whitespace stripped.
    """
    # A tagset given by name is always resolved, even if bases are unused.
    if isinstance(tagset, str):
        tagset = corpus2.get_named_tagset(tagset)

    pieces = []
    for tok in sentence.tokens():
        if tok.after_space():
            pieces.append(" ")
        pieces.append(
            tok.get_preferred_lexeme(tagset).lemma_utf8()
            if use_bases
            else tok.orth_utf8()
        )

    # strip() drops the separator emitted before a space-preceded first token.
    return "".join(pieces).strip()
Loading