diff --git a/utils/corpspace.py b/utils/corpspace.py new file mode 100755 index 0000000000000000000000000000000000000000..ac3b429a7159152aff331686f52141a012064342 --- /dev/null +++ b/utils/corpspace.py @@ -0,0 +1,61 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from optparse import OptionParser +import sys +import corpus2 +from StringIO import StringIO +from collections import defaultdict as dd +descr = """%prog [options] TAGOUT MORPHO OUT + +Util to synchronise no-space markers between tagger output (TAGOUT) that +contains the wanted disamb lexemes but may be devoid of no-space markers +with the tagger input containing proper no-space markers but no disambs. +""" + + + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='xces', + help='set the input format; default: xces') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='xces', + help='set the output format; default: xces') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose') + parser.add_option('-d', '--debug', action='store_true', dest='debug_mode') + (options, args) = parser.parse_args() + + if len(args) != 3: + print 'You need to provide a TAGOUT, MORPHO and OUTPUT files.' + print 'See --help for details.' + print + sys.exit(1) + + tag_fn, mor_fn, out_fn = args + tagset = corpus2.get_named_tagset(options.tagset) + + tag_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, tag_fn) + mor_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, mor_fn) + + writer = corpus2.TokenWriter.create_path_writer(options.output_format, out_fn, tagset) + + while True: + mor_sent = mor_rdr.get_next_sentence() + tag_sent = tag_rdr.get_next_sentence() + assert (not mor_sent) == (not tag_sent) + if not mor_sent: + break + for mor_tok, tag_tok in zip(mor_sent.tokens(), tag_sent.tokens()): + assert unicode(mor_tok.orth()) == unicode(tag_tok.orth()), unicode(tag_tok.orth()) + tag_tok.set_wa(mor_tok.wa()) + writer.write_sentence(tag_sent) + + writer.finish() + +if __name__ == '__main__': + go()