#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of IOBBER
# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
# IOBBER is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# IOBBER is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details
"""Command-line driver that tags input files with WCRFT before chunking.

For every input, paragraphs are read with the tagger's I/O layer, each
sentence is disambiguated by the tagger, and the paragraph is written out.
The chunker model is loaded, but chunking itself is still a TODO.
"""
import sys
from optparse import OptionParser

from wcrft import tagger
from wcrft import corpio as tagger_io

import chunker

# Note about the extra input formats, followed by the tagger's own format
# help text when the I/O module provides one.
# NOTE(review): `format_help` is taken from the previously commented-out
# assignment; it is looked up defensively -- confirm it exists in wcrft.corpio.
ioformats = (
    '\nINFO: formats: txt premorph; require installed Maca and Morfeusz'
    + getattr(tagger_io, 'format_help', '').replace(
        'input formats: ccl', 'input formats: txt premorph ccl'))

descr = """%prog [options] TODO
""" + ioformats


def lines(pathfilename):
    """Return the non-blank lines of the given file, stripped of whitespace."""
    with open(pathfilename) as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def _build_parser():
    """Create the option parser describing the command-line interface."""
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='txt',
                      help='set the input format; default: txt')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
                      dest='out_path', default='',
                      help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunker-config', type='string', action='store',
                      dest='chunker_config', default='kpwr.ini',
                      help='use given chunker config (default: kpwr.ini)')
    parser.add_option('-C', '--chunker-model', type='string', action='store',
                      dest='chunker_dir', default='',
                      help='read chunker trained model from the given dir')
    parser.add_option('-w', '--tagger-config', type='string', action='store',
                      dest='tagger_config', default='nkjp.ini',
                      help='use given tagger (wcrft) config (default: nkjp.ini)')
    parser.add_option('-W', '--tagger-model', type='string', action='store',
                      dest='tagger_dir', default='',
                      help='read tagger (wcrft) trained model from the given dir')
    parser.add_option('-m', '--maca-config', type='string', action='store',
                      dest='maca_config', default='',
                      help='override maca config file specified in tagger config')
    parser.add_option('--batch', action='store_true',
                      dest='batch', default=False,
                      help='treat arguments as lists of paths to files to tag')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='write additional info to stderr')
    return parser


def _resolve_paths(options, files):
    """Map positional arguments to parallel lists of input and output paths.

    * ``--batch``: every argument is a file listing input paths, one per
      line; each input is written to ``<input>.tag``.
    * a single ``-`` argument: a (None, None) pair, meaning stdin to stdout.
    * a single path: written to ``options.out_path``.
    * several paths: each input is written to ``<input>.tag``
      (``options.out_path`` is not used in this case).
    """
    if options.batch:
        inputs = []
        for list_path in files:
            inputs.extend(lines(list_path))
        return inputs, [path + '.tag' for path in inputs]
    if len(files) == 1:
        if files[0] == '-':
            return [None], [None]
        return [files[0]], [options.out_path]
    inputs = list(files)
    return inputs, [path + '.tag' for path in inputs]


def go():
    """Parse the command line, then tag every requested input.

    Exits with a usage error when no positional argument is given.
    """
    parser = _build_parser()
    (options, args) = parser.parse_args()

    files = args
    if not files:
        # Fail fast rather than loading both models only to process nothing.
        parser.error('no input given; pass file path(s) or - for stdin')

    chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
    tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)

    if options.maca_config != '':
        tagr.maca_config = options.maca_config

    # TODO option not to use chunker
    inputs, outputs = _resolve_paths(options, files)

    tagr.load_model()
    chunkr.load_model()

    for in_path, out_path in zip(inputs, outputs):
        if in_path and options.verbose:
            sys.stderr.write('Processing %s...\n' % in_path)
        # NOTE(review): `tagset` and `disambiguate_sentence` are assumed to be
        # members of the Tagger instance (this loop previously referenced
        # them through an undefined `self`) -- confirm against wcrft.tagger.
        reader = tagger_io.get_reader(
            in_path, tagr.tagset, options.input_format, tagr.maca_config)
        writer = tagger_io.get_writer(
            out_path, tagr.tagset, options.output_format)
        while True:
            par = reader.get_next_chunk()  # here `chunk' denotes paragraph
            if not par:
                break  # end of input
            # process each sentence separately
            for sent in par.sentences():
                tagr.disambiguate_sentence(sent)
                # TODO: chunk it actually
            # save tagged paragraph
            writer.write_chunk(par)


if __name__ == '__main__':
    go()