iobber_txt.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    # This file is part of IOBBER
    # Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
    # IOBBER is free software; you can redistribute and/or modify it
    # under the terms of the GNU Lesser General Public License as published by the Free
    # Software Foundation; either version 3 of the License, or (at your option)
    # any later version.
    #
    # IOBBER is distributed in the hope that it will be useful, but
    # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    # or FITNESS FOR A PARTICULAR PURPOSE.
    #
    # See the LICENCE and COPYING files for more details
    import sys
    from optparse import OptionParser
    
    import corpus2
    
    from wcrft import tagger
    from wcrft import corpio as tagger_io
    
    import chunker
    
    #ioformats = corpio.format_help
    #ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
    
    descr = """%prog [options] [input_files]
    
    IOBBER, configurable chunker.
    (C) 2012, Wroclaw University of Technology
    
    Processes input files through the tagger (WCRFT, which must be installed) and the chunker.
    By default, input is assumed to be plain text (UTF-8) and output is written in the CCL format.
    
    Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
    Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
    
    The default values for -c and -w are recommended, but you may need to set trained model
    directories (-C and -W).
    
    Use -O to specify the output path (by default, output is written to stdout).
    Use - to process stdin to stdout.
    
    When processing multiple files, either give the filenames directly as arguments,
    or use --batch and provide the name of a file containing a list of paths. Either
    way, each file will be chunked and its output written to FILENAME.tag.
    
    """
    
    
    def lines(pathfilename):
    	"""Return the non-empty lines of the given file, stripped of whitespace."""
    	with open(pathfilename) as f:
    		return [line.strip() for line in f if line.strip()]
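    # e.g. for a batch file whose lines are 'corpus/a.txt' and 'corpus/b.txt'
    # (illustrative names), lines() returns ['corpus/a.txt', 'corpus/b.txt'].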
    
    def go():
    	parser = OptionParser(usage=descr)
    	parser.add_option('-i', '--input-format', type='string', action='store',
    		dest='input_format', default='txt',
    		help='set the input format; default: txt')
    	parser.add_option('-o', '--output-format', type='string', action='store',
    		dest='output_format', default='ccl',
    		help='set the output format; default: ccl')
    	parser.add_option('-O', '--output-file', type='string', action='store',
    		dest='out_path', default='',
    		help='set output filename (do not write to stdout)')
    	parser.add_option('--no-chunk', action='store_false',
    		dest='shall_chunk', default=True,
    		help='don\'t run the chunker, only the tagger')
    	parser.add_option('-c', '--chunker-config', type='string', action='store',
    		dest='chunker_config', default='kpwr.ini',
    		help='use given chunker config (default: kpwr.ini)')
    	parser.add_option('-C', '--chunker-model', type='string', action='store',
    		dest='chunker_dir', default='model-kpwr11-H',
    		help='read chunker trained model from the given dir (default: model-kpwr11-H)')
    	parser.add_option('-w', '--tagger-config', type='string', action='store',
    		dest='tagger_config', default='nkjp_s2.ini',
    		help='use given tagger (wcrft) config (default: nkjp_s2.ini)')
    	parser.add_option('-W', '--tagger-model', type='string', action='store',
    		dest='tagger_dir', default='model_nkjp10_wcrft_s2',
    		help='read tagger (wcrft) trained model from the given dir (default: model_nkjp10_wcrft_s2)')
    	parser.add_option('-m', '--maca-config', type='string', action='store',
    		dest='maca_config', default='',
    		help='override maca config file specified in tagger config')
    	parser.add_option('--batch', action='store_true',
    		help='treat arguments as lists of paths to files to tag')
    	parser.add_option('-v', '--verbose', action='store_true',
    		dest='verbose', default=False,
    		help='write additional info to stderr')
    	(options, args) = parser.parse_args()
    	
    	files = args
    	
    	main(files, options.tagger_config, options.tagger_dir, options.shall_chunk,
    		options.chunker_config, options.chunker_dir, options.maca_config,
    		options.batch, options.out_path, options.verbose,
    		options.input_format, options.output_format)
    
    def main(files, tagger_config, tagger_dir, shall_chunk,
    		chunker_config, chunker_dir, maca_config,
    		batch, out_path, verbose,
    		input_format, output_format):
    
    	tagr = tagger.Tagger(tagger_config, tagger_dir)
    	if shall_chunk:
    		chunkr = chunker.Chunker(chunker_config, chunker_dir)
    
    	if maca_config != '':
    		tagr.maca_config = maca_config
    
    	# tag and chunk
    	inputs = []
    	outputs = []
    	if batch: # read each arg as input path list
    		for pathpath in files:
    			inputs.extend(lines(pathpath))
    		outputs = [path + '.tag' for path in inputs]
    	elif len(files) == 1:
    		if files[0] == '-': # stdin to stdout
    			inputs.append(None)
    			outputs.append(None)
    		else:
    			inputs.append(files[0])
    			outputs.append(out_path)
    	else: # multiple paths as args
    		inputs = files
    		outputs = [path + '.tag' for path in inputs]
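    	# e.g. files = ['a.txt', 'b.txt'] (names illustrative) maps to
    	# inputs = ['a.txt', 'b.txt'] and outputs = ['a.txt.tag', 'b.txt.tag']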
    	if inputs:
    		tagr.load_model()
    		if shall_chunk:
    			chunkr.load_model()
    			assert (tagr.tagset.name()
    					== chunkr.tagset.name()), ('Tagger and chunker configs must '
    							+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
    								chunkr.tagset.name()))
    		for in_path, out_path in zip(inputs, outputs):
    			if in_path and verbose:
    				sys.stderr.write('Processing %s...\n' % in_path)
    			reader = tagger_io.get_reader(in_path, tagr.tagset, input_format, tagr.maca_config)
    			writer = tagger_io.get_writer(out_path, tagr.tagset, output_format)
    			while True:
    				par = reader.get_next_chunk() # here `chunk' denotes paragraph
    				if not par:
    					break # end of input
    				# prepare new paragraph: chunker will need AnnotatedSentence objects
    				new_par = corpus2.Chunk()
    				for attr_key in par.attributes():
    					new_par.set_attribute(
    						attr_key, par.get_attribute(attr_key))
    
    				for sent in par.sentences():
    					# let it be wrapped and cloned if necessary
    					# this will make sure that later wrapping will
    					# not cause any further cloning, so references to new_sent
    					# and new_asent will be referring to the same underlying obj
    					new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
    					new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
    					# preserve_ambiguity = False
    					tagr.tag_sentence(new_sent, False)
    					if shall_chunk:
    						chunkr.tag_sentence(new_sent)
    					# create a new paragraph with the new sentence
    					new_par.append(new_sent)
    				# save tagged paragraph
    				writer.write_chunk(new_par)
    			del reader
    			writer.finish()
    			del writer
    	else:
    		sys.stderr.write('Nothing to do. See --help\n')
    
    
    if __name__ == '__main__':
    	go()
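    
    # A minimal sketch of calling the pipeline programmatically rather than via
    # the command line (file names are illustrative; the config/model names are
    # the module defaults above and may need adjusting for a local installation):
    #
    #   import iobber_txt
    #   iobber_txt.main(['input.txt'], 'nkjp_s2.ini', 'model_nkjp10_wcrft_s2',
    #   	True, 'kpwr.ini', 'model-kpwr11-H', '', False, 'out.xml', False,
    #   	'txt', 'ccl')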