Skip to content
Snippets Groups Projects
Select Git revision
  • f5e3a8389b111057a2e0899934b0e136476676a0
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

corpspace.py

Blame
  • user avatar
    Pawel Orlowicz authored
    e826e03f
    History
    corpspace.py 2.58 KiB
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    # Copyright (C) 2012 Adam Radziszewski.
    # This program is free software; you can redistribute and/or modify it
    # under the terms of the GNU Lesser General Public License as published by the Free
    # Software Foundation; either version 3 of the License, or (at your option)
    # any later version.
    #
    # This program is distributed in the hope that it will be useful, but
    # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    # or FITNESS FOR A PARTICULAR PURPOSE.
    #
    # See the LICENCE and COPYING files for more details
    
    from optparse import OptionParser
    import sys
    import corpus2
    from StringIO import StringIO
    from collections import defaultdict as dd
    # Usage/description text passed to OptionParser(usage=...) in go();
    # optparse substitutes the running program's name for %prog.
    descr = """%prog [options] TAGOUT MORPHO OUT
    
    Util to synchronise no-space markers between tagger output (TAGOUT) that
    contains the wanted disamb lexemes but may be devoid of no-space markers
    with the tagger input containing proper no-space markers but no disambs.
    """
    
    
    
    def go():
    	"""Copy per-token ``wa`` values from MORPHO onto TAGOUT and write OUT.

    	Parses the command line (three positional arguments: TAGOUT, MORPHO
    	and OUT), reads both input corpora sentence by sentence in lockstep,
    	sets each TAGOUT token's ``wa`` value to that of the corresponding
    	MORPHO token (according to the usage text this is the no-space marker
    	information) and writes the updated TAGOUT sentences to OUT.

    	Exits with status 1, after a message on stderr, unless exactly three
    	positional arguments are given.

    	Raises:
    		AssertionError: if the inputs differ in the number of sentences,
    			in the number of tokens within a sentence, or in the orth of
    			a token pair. Raised explicitly (not via ``assert``) so the
    			checks still run under ``python -O``.
    	"""
    	parser = OptionParser(usage=descr)
    	parser.add_option('-i', '--input-format', type='string', action='store',
    		dest='input_format', default='xces',
    		help='set the input format; default: xces')
    	parser.add_option('-o', '--output-format', type='string', action='store',
    		dest='output_format', default='xces',
    		help='set the output format; default: xces')
    	parser.add_option('-t', '--tagset', type='string', action='store',
    		dest='tagset', default='nkjp',
    		help='set the tagset used in input; default: nkjp')
    	# -q/-d are accepted for command-line compatibility; nothing below reads them.
    	parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose')
    	parser.add_option('-d', '--debug', action='store_true', dest='debug_mode')
    	(options, args) = parser.parse_args()

    	if len(args) != 3:
    		# Usage errors go to stderr so they never mix with regular output.
    		sys.stderr.write('You need to provide a TAGOUT, MORPHO and OUTPUT files.\n')
    		sys.stderr.write('See --help for details.\n')
    		sys.stderr.write('\n')
    		sys.exit(1)

    	tag_fn, mor_fn, out_fn = args
    	tagset = corpus2.get_named_tagset(options.tagset)

    	tag_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, tag_fn)
    	mor_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, mor_fn)

    	writer = corpus2.TokenWriter.create_path_writer(options.output_format, out_fn, tagset)

    	sent_no = 0
    	while True:
    		mor_sent = mor_rdr.get_next_sentence()
    		tag_sent = tag_rdr.get_next_sentence()
    		# Both readers must run out of sentences at the same time.
    		if (not mor_sent) != (not tag_sent):
    			raise AssertionError(
    				'TAGOUT and MORPHO differ in the number of sentences '
    				'(one input ended after %d sentences)' % sent_no)
    		if not mor_sent:
    			break
    		sent_no += 1

    		# Materialise the token sequences so their lengths can be compared;
    		# zip() alone would silently drop the surplus tokens of the longer one.
    		mor_toks = list(mor_sent.tokens())
    		tag_toks = list(tag_sent.tokens())
    		if len(mor_toks) != len(tag_toks):
    			raise AssertionError(
    				'sentence %d: TAGOUT has %d tokens but MORPHO has %d'
    				% (sent_no, len(tag_toks), len(mor_toks)))

    		for mor_tok, tag_tok in zip(mor_toks, tag_toks):
    			mor_orth = unicode(mor_tok.orth())
    			tag_orth = unicode(tag_tok.orth())
    			if mor_orth != tag_orth:
    				# %r keeps the message printable even for non-ASCII orths.
    				raise AssertionError(
    					'sentence %d: orth mismatch, TAGOUT %r vs MORPHO %r'
    					% (sent_no, tag_orth, mor_orth))
    			tag_tok.set_wa(mor_tok.wa())
    		writer.write_sentence(tag_sent)

    	writer.finish()
    	
    # Run the command-line tool only when executed as a script, not on import.
    if __name__ == '__main__':
    	go()