From da6bd398a800c2e6d7a94997b1eb036af5796bfd Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Wed, 18 Apr 2012 10:42:11 +0200 Subject: [PATCH] fix par spacing in text extractor script --- utils/corptext.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/utils/corptext.py b/utils/corptext.py index 36a67d7..a2c8081 100755 --- a/utils/corptext.py +++ b/utils/corptext.py @@ -17,12 +17,11 @@ def go(): parser.add_option('-t', '--tagset', type='string', action='store', dest='tagset', default='nkjp', help='set the tagset used in input; default: nkjp') - parser.add_option('-s', '--sent-sep', type='string', action='store', - dest='sent_sep', default='', - help='set the sentence separator; default: (empty)') parser.add_option('-p', '--par-sep', type='string', action='store', dest='par_sep', default='\n\n', - help='set the sentence separator; default: (two newlines)') + help='set the paragraph separator; default: (two newlines)') + parser.add_option('--ignore-ns-sent', action='store_true', default=False, + dest='ignore_ns_sent', help='ignore no-space markers on sent boundaries') (options, args) = parser.parse_args() if len(args) != 2: print 'Need to provide input and output.' @@ -38,19 +37,17 @@ def go(): first = True while True: par = rdr.get_next_chunk() - if options.par_sep: - first = True # if non-empty par separator, skip pre-spaces + parfirst = True if not par: break for sent in par.sentences(): - if options.sent_sep: - first = True # if non-empty sent sep, skip pre-spaces + sentfirst = True # if non-empty sent sep, skip pre-spaces for tok in sent.tokens(): - if not first and tok.after_space(): + if not parfirst and ((sentfirst and options.ignore_ns_sent) or tok.after_space()): out.write(' ') out.write(unicode(tok.orth())) - first = False - out.write(options.sent_sep) + sentfirst = False + parfirst = False out.write(options.par_sep) if __name__ == '__main__': -- GitLab