Skip to content
Snippets Groups Projects
Commit 84404f95 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

helper script for text extraction

parent f8acea67
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Usage/description text passed to OptionParser(usage=...); optparse replaces
# the %prog placeholder with the name of the running script.
descr = (
    "%prog [options] INPUT OUTPUT\n"
    "Reads input and saves as plain text. By default, paragraphs are separated with\n"
    "two newlines, sentence division is not marked."
)
from optparse import OptionParser
import sys, codecs
import corpus2
def go():
    """Convert a corpus file to plain text, driven by command-line arguments.

    Expects exactly two positional arguments: the input path and the output
    path. The input is read chunk by chunk (one chunk is treated as one
    paragraph) with a corpus2 reader created for the requested input format
    and tagset. Token orths are written to the UTF-8 encoded output file;
    the sentence separator is written after every sentence and the paragraph
    separator after every paragraph.

    Options:
        -i / --input-format  input format name (default: xces)
        -t / --tagset        tagset name (default: nkjp)
        -s / --sent-sep      sentence separator (default: empty string)
        -p / --par-sep       paragraph separator (default: two newlines)

    Exits with status 1 when the number of positional arguments is not 2.

    NOTE: this is a Python 2 script (it relies on the ``unicode`` builtin).
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    parser.add_option('-s', '--sent-sep', type='string', action='store',
                      dest='sent_sep', default='',
                      help='set the sentence separator; default: (empty)')
    # The help text used to say "sentence separator" here (copy-paste slip).
    parser.add_option('-p', '--par-sep', type='string', action='store',
                      dest='par_sep', default='\n\n',
                      help='set the paragraph separator; default: (two newlines)')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # Parenthesised single-argument print behaves the same on Python 2
        # as the original print statements (three lines, last one empty).
        print('Need to provide input and output.')
        print('See --help for details.')
        print('')
        sys.exit(1)

    fn_input, fn_output = args

    # Build the tagset and the reader BEFORE opening the output file, so that
    # a bad tagset name or an unreadable input does not leave behind a
    # truncated (empty) output file.
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, fn_input)

    with codecs.open(fn_output, 'wb', 'utf-8') as out:
        # `first` is True when the next token must not be preceded by a
        # space: at the very beginning of the output and right after any
        # non-empty separator.
        first = True
        while True:
            par = rdr.get_next_chunk()
            if options.par_sep:
                first = True  # if non-empty par separator, skip pre-spaces
            if not par:
                # A falsy chunk signals that the reader is exhausted.
                break
            for sent in par.sentences():
                if options.sent_sep:
                    first = True  # if non-empty sent sep, skip pre-spaces
                for tok in sent.tokens():
                    # presumably after_space() tells whether whitespace
                    # preceded the token in the input -- confirm in corpus2
                    if not first and tok.after_space():
                        out.write(' ')
                    out.write(unicode(tok.orth()))
                    first = False
                out.write(options.sent_sep)
            out.write(options.par_sep)
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment