#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of IOBBER
# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
# IOBBER is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# IOBBER is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details
"""Command-line front end that runs input through the WCRFT tagger and IOBBER.

Each input is read paragraph by paragraph, every sentence is disambiguated
by the tagger and then annotated by the chunker, and the resulting
paragraphs are written out with the requested writer.
"""
import sys
from optparse import OptionParser

import corpus2

from wcrft import tagger
from wcrft import corpio as tagger_io

import chunker

# Usage text handed to OptionParser; %prog is expanded by optparse.
descr = """%prog [options] [input_files]

IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology

Processes input files through the tagger (WCRFT, must be installed) and the chunker.
By default input is assumed to be plain text (UTF-8) and writes output in the CCL format.

Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.

The default values for -c and -w are recommended, but you may need to set trained model
directories (-C and -W).

Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.

When processing multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.tag.

"""


def lines(pathfilename):
    """Return the non-blank lines of a text file, stripped of whitespace.

    pathfilename -- path of the file to read.

    Lines that are empty after stripping are skipped; the order of the
    remaining lines is preserved.
    """
    with open(pathfilename) as f:
        # Strip each line once, then drop the ones that end up empty.
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def _build_parser():
    """Create the OptionParser describing all command-line options."""
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='txt',
        help='set the input format; default: txt')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='ccl',
        help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
        dest='out_path', default='',
        help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunker-config', type='string', action='store',
        dest='chunker_config', default='kpwr.ini',
        help='use given chunker config (default: kpwr.ini)')
    parser.add_option('-C', '--chunker-model', type='string', action='store',
        dest='chunker_dir', default='',
        help='read chunker trained model from the given dir')
    parser.add_option('-w', '--tagger-config', type='string', action='store',
        dest='tagger_config', default='nkjp.ini',
        help='use given tagger (wcrft) config (default: nkjp.ini)')
    parser.add_option('-W', '--tagger-model', type='string', action='store',
        dest='tagger_dir', default='',
        help='read tagger (wcrft) trained model from the given dir')
    parser.add_option('-m', '--maca-config', type='string', action='store',
        dest='maca_config', default='',
        help='override maca config file specified in tagger config')
    # dest/default spelled out so options.batch is always a real boolean.
    parser.add_option('--batch', action='store_true',
        dest='batch', default=False,
        help='treat arguments as lists of paths to files to tag')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='write additional info to stderr')
    return parser


def _plan_io(options, files):
    """Work out parallel lists of input and output paths.

    options -- parsed options (uses .batch and .out_path).
    files -- positional command-line arguments.

    Returns (inputs, outputs), two lists of equal length. In --batch mode
    and when several paths are given, each output is the input path with
    '.tag' appended and -O is not consulted. A single '-' argument yields
    [None], [None] (stdin to stdout). Both lists are empty when there is
    nothing to process.
    """
    if options.batch:
        # Every argument names a file listing the actual input paths.
        inputs = []
        for list_path in files:
            inputs.extend(lines(list_path))
        return inputs, [path + '.tag' for path in inputs]
    if len(files) == 1:
        if files[0] == '-':
            return [None], [None]
        return [files[0]], [options.out_path]
    # Zero or several paths given directly as arguments.
    inputs = list(files)
    return inputs, [path + '.tag' for path in inputs]


def _process_stream(tagr, chunkr, reader, writer):
    """Tag and chunk every paragraph from reader and pass it to writer."""
    while True:
        par = reader.get_next_chunk()  # here `chunk' denotes paragraph
        if not par:
            break  # end of input
        # prepare new paragraph: chunker will need AnnotatedSentence objects
        new_par = corpus2.Chunk()
        for attr_key in par.attributes():
            new_par.set_attr(attr_key, par.get_attribute(attr_key))

        for sent in par.sentences():
            # let it be wrapped and cloned if necessary
            # this will make sure that later wrapping will
            # not cause any further cloning, so references to new_sent
            # and new_asent will be referring to the same underlying obj
            new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
            # preserve_ambiguity = False
            tagr.disambiguate_sentence(new_sent)
            chunkr.tag_sentence(new_sent)
            # create a new paragraph with the new sentence
            new_par.append(new_sent)
        # save tagged paragraph
        writer.write_chunk(new_par)


def go():
    """Parse the command line and run the tagger + chunker pipeline.

    Builds the chunker and the tagger from the given configs, decides which
    inputs map to which outputs, loads both models and processes each input
    in turn. When no input was given, a hint is written to stderr instead.

    Raises AssertionError when the tagger and the chunker are configured
    for different tagsets.
    """
    parser = _build_parser()
    (options, args) = parser.parse_args()

    chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
    tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)

    if options.maca_config:
        tagr.maca_config = options.maca_config

    # TODO option not to use chunker
    # tag and chunk
    inputs, outputs = _plan_io(options, args)
    if not inputs:
        sys.stderr.write('Nothing to do. See --help\n')
        return

    tagr.load_model()
    chunkr.load_model()
    # Explicit check rather than `assert`, so it is not stripped under
    # `python -O`; AssertionError is kept so the failure type is unchanged.
    if tagr.tagset.name() != chunkr.tagset.name():
        raise AssertionError(
            'Tagger and chunker config must operate on the same tagset: '
            '%s v. %s' % (tagr.tagset.name(), chunkr.tagset.name()))

    for in_path, out_path in zip(inputs, outputs):
        if in_path and options.verbose:
            sys.stderr.write('Processing %s...\n' % in_path)
        reader = tagger_io.get_reader(
            in_path, tagr.tagset, options.input_format, tagr.maca_config)
        writer = tagger_io.get_writer(
            out_path, tagr.tagset, options.output_format)
        _process_stream(tagr, chunkr, reader, writer)


if __name__ == '__main__':
    go()