diff --git a/src/doc/wccl-rules.py b/src/doc/wccl-rules.py index 2b239b61bb227fb395c2e753f3fb426d1352435f..39d848a23ba73f40ce1b68d222dcac008f2023e0 100755 --- a/src/doc/wccl-rules.py +++ b/src/doc/wccl-rules.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- import sys from optparse import OptionParser @@ -69,12 +68,12 @@ def process_sent(asent, wccl_file, shall_print_ann_info): match_rules = wccl_file.get_match_rules_ptr() match_rules.apply_all(asent) if shall_print_ann_info: - print '####', ' '.join(tok.orth_utf8() for tok in asent.tokens()) - print asent.annotation_info() + print('####', ' '.join(tok.orth_utf8() for tok in asent.tokens())) + print(asent.annotation_info()) for chan_name in asent.all_channels(): chan = asent.get_channel(chan_name) # get the internal representation -- annotation id vector - print chan_name, '\t', chan.dump_segments() + print(chan_name, '\t', chan.dump_segments()) # if you want to get it as a int vector, use chan.segments() # or to get one it at idx -- chan.get_segment_at(idx) # 0 means no annot there, positive values==ann indices @@ -84,7 +83,7 @@ def process_sent(asent, wccl_file, shall_print_ann_info): for ann in ann_vec: idx_text = ','.join(str(idx) for idx in ann.indices) orth_text = ' '.join(asent.tokens()[idx].orth_utf8() for idx in ann.indices) - print '\t%s\t%s\t(%s)' % (chan_name, idx_text, orth_text) + print('\t%s\t%s\t(%s)' % (chan_name, idx_text, orth_text)) def go(): @@ -147,32 +146,25 @@ def go(): writer = get_writer(options.out_path, tagset, options.output_format) # processing paragraph-by-paragraph - if options.preserve_chunks: - while True: - chunk = reader.get_next_chunk() - if not chunk: - break # end of input - # process each sentence separately - for sent in chunk.sentences(): - # wrap the sentence as an AnnotatedSentence - asent = corpus2.AnnotatedSentence.wrap_sentence(sent) - process_sent(sent, wccl_file, options.ann_info) - # save processed chunk - # NOTE: if the input sent was not AnnotatedSentence, the changes - # will be discarded - writer.write_chunk(chunk) - else: - while True: - sent = reader.get_next_sentence() - if not sent: - break # end of input + + while True: + chunk = reader.get_next_chunk() + if not chunk: + break # end of input + # process each sentence separately + for sent in chunk.sentences(): # wrap the sentence as an AnnotatedSentence asent = corpus2.AnnotatedSentence.wrap_sentence(sent) process_sent(asent, wccl_file, options.ann_info) - # save processed sentence (safe) - # NOTE: if the input sent was not AnnotatedSentence, the changes - # will be discarded - writer.write_sentence(sent) + # save processed chunk + # NOTE: if the input sent was not AnnotatedSentence, the changes + # will be discarded + if writer: + if options.preserve_chunks: + writer.write_chunk(chunk) + else: + writer.write_sentence(sent) + writer.finish() if __name__ == '__main__': go()