Skip to content
Snippets Groups Projects
Commit af5d164e authored by Jarema Radom
Browse files

Migrated wccl-rules from py2 to py3

parent 17d9424d
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
......@@ -69,12 +68,12 @@ def process_sent(asent, wccl_file, shall_print_ann_info):
match_rules = wccl_file.get_match_rules_ptr()
match_rules.apply_all(asent)
if shall_print_ann_info:
print '####', ' '.join(tok.orth_utf8() for tok in asent.tokens())
print asent.annotation_info()
print('####', ' '.join(tok.orth_utf8() for tok in asent.tokens()))
print(asent.annotation_info())
for chan_name in asent.all_channels():
chan = asent.get_channel(chan_name)
# get the internal representation -- annotation id vector
print chan_name, '\t', chan.dump_segments()
print(chan_name, '\t', chan.dump_segments())
# if you want to get it as an int vector, use chan.segments()
# or to get the one at idx -- chan.get_segment_at(idx)
# 0 means no annot there, positive values==ann indices
......@@ -84,7 +83,7 @@ def process_sent(asent, wccl_file, shall_print_ann_info):
for ann in ann_vec:
idx_text = ','.join(str(idx) for idx in ann.indices)
orth_text = ' '.join(asent.tokens()[idx].orth_utf8() for idx in ann.indices)
print '\t%s\t%s\t(%s)' % (chan_name, idx_text, orth_text)
print('\t%s\t%s\t(%s)' % (chan_name, idx_text, orth_text))
def go():
......@@ -147,32 +146,25 @@ def go():
writer = get_writer(options.out_path, tagset, options.output_format)
# processing paragraph-by-paragraph
if options.preserve_chunks:
while True:
chunk = reader.get_next_chunk()
if not chunk:
break # end of input
# process each sentence separately
for sent in chunk.sentences():
# wrap the sentence as an AnnotatedSentence
asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
process_sent(sent, wccl_file, options.ann_info)
# save processed chunk
# NOTE: if the input sent was not AnnotatedSentence, the changes
# will be discarded
writer.write_chunk(chunk)
else:
while True:
sent = reader.get_next_sentence()
if not sent:
break # end of input
while True:
chunk = reader.get_next_chunk()
if not chunk:
break # end of input
# process each sentence separately
for sent in chunk.sentences():
# wrap the sentence as an AnnotatedSentence
asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
process_sent(asent, wccl_file, options.ann_info)
# save processed sentence (safe)
# NOTE: if the input sent was not AnnotatedSentence, the changes
# will be discarded
writer.write_sentence(sent)
# save processed chunk
# NOTE: if the input sent was not AnnotatedSentence, the changes
# will be discarded
if writer:
if options.preserve_chunks:
writer.write_chunk(chunk)
else:
writer.write_sentence(sent)
writer.finish()
# Script entry point: run the command-line driver only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    go()
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment