Skip to content
Snippets Groups Projects
Commit 5950f62c authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

util for plain text exctraction

parent 1771a56c
Branches
No related tags found
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
from optparse import OptionParser
import sys
import corpus2
from StringIO import StringIO
from collections import defaultdict as dd
descr = """%prog [options] TAGOUT MORPHO OUT
Util to synchronise no-space markers between tagger output (TAGOUT) that
contains the wanted disamb lexemes but may be devoid of no-space markers
with the tagger input containing proper no-space markers but no disambs.
"""
def go():
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='xces',
help='set the input format; default: xces')
parser.add_option('-o', '--output-format', type='string', action='store',
dest='output_format', default='xces',
help='set the output format; default: xces')
parser.add_option('-t', '--tagset', type='string', action='store',
dest='tagset', default='nkjp',
help='set the tagset used in input; default: nkjp')
parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose')
parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
(options, args) = parser.parse_args()
if len(args) != 3:
print 'You need to provide a TAGOUT, MORPHO and OUTPUT files.'
print 'See --help for details.'
print
sys.exit(1)
tag_fn, mor_fn, out_fn = args
tagset = corpus2.get_named_tagset(options.tagset)
tag_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, tag_fn)
mor_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, mor_fn)
writer = corpus2.TokenWriter.create_path_writer(options.output_format, out_fn, tagset)
while True:
mor_sent = mor_rdr.get_next_sentence()
tag_sent = tag_rdr.get_next_sentence()
assert (not mor_sent) == (not tag_sent)
if not mor_sent:
break
for mor_tok, tag_tok in zip(mor_sent.tokens(), tag_sent.tokens()):
assert unicode(mor_tok.orth()) == unicode(tag_tok.orth()), unicode(tag_tok.orth())
tag_tok.set_wa(mor_tok.wa())
writer.write_sentence(tag_sent)
writer.finish()
if __name__ == '__main__':
go()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment