Skip to content
Snippets Groups Projects
Commit 5a22c1f2 authored by Adam Radziszewski
Browse files

initial (incomplete) iobber_txt script to tag&chunk

parent 5c47d551
Branches
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of IOBBER
# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
# IOBBER is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# IOBBER is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details
import sys
from optparse import OptionParser
from wcrft import tagger
from wcrft import corpio as tagger_io
import chunker
# Format help appended to the usage text below.  The assignment used to be
# commented out, which left `ioformats` undefined and made the module fail
# with a NameError on import.  The commented-out code read it from
# `corpio.format_help`; that module is imported here as `tagger_io`, so look
# it up there and fall back to an empty string if it is not available.
# TODO: once txt/premorph input is settled, restore the extra note:
#   '\nINFO: formats: txt premorph; require installed Maca and Morfeusz'
#   + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
ioformats = getattr(tagger_io, 'format_help', '')
descr = """%prog [options] TODO
""" + ioformats
def lines(pathfilename):
    """Return the non-blank lines of a text file, stripped of whitespace.

    pathfilename -- path of the file to read.

    Each line has leading and trailing whitespace removed; lines that are
    empty after stripping are dropped.  The result is a list in file order.
    """
    with open(pathfilename) as f:
        # Strip once and filter on the stripped value.
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]
def go():
    """Parse the command line and tag every requested input.

    Input/output pairing:
      * --batch: each argument is a file listing input paths (one per
        line); every input is written to '<input>.tag'.
      * exactly one argument: '-' reads stdin and writes stdout; any other
        value is read as a path and written to the -O path (empty by
        default).
      * several arguments: each is an input path written to '<input>.tag'.

    Exits through parser.error() when no argument is given.  Chunking is
    not implemented yet: the chunker model is loaded but sentences are only
    disambiguated by the tagger.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='txt',
                      help='set the input format; default: txt')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
                      dest='out_path', default='',
                      help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunker-config', type='string', action='store',
                      dest='chunker_config', default='kpwr.ini',
                      help='use given chunker config (default: kpwr.ini)')
    parser.add_option('-C', '--chunker-model', type='string', action='store',
                      dest='chunker_dir', default='',
                      help='read chunker trained model from the given dir')
    parser.add_option('-w', '--tagger-config', type='string', action='store',
                      dest='tagger_config', default='nkjp.ini',
                      help='use given tagger (wcrft) config (default: nkjp.ini)')
    parser.add_option('-W', '--tagger-model', type='string', action='store',
                      dest='tagger_dir', default='',
                      help='read tagger (wcrft) trained model from the given dir')
    parser.add_option('-m', '--maca-config', type='string', action='store',
                      dest='maca_config', default='',
                      help='override maca config file specified in tagger config')
    parser.add_option('--batch', action='store_true',
                      dest='batch', default=False,
                      help='treat arguments as lists of paths to files to tag')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='write additional info to stderr')
    (options, args) = parser.parse_args()
    files = args
    if not files:
        # Without this check the models would be loaded only to process
        # nothing; report the problem before doing any expensive work.
        parser.error('no input given (pass input paths, or - to read stdin)')

    chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
    tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)
    if options.maca_config != '':
        tagr.maca_config = options.maca_config
    # TODO option not to use chunker

    # Work out which input goes to which output.
    inputs = []
    outputs = []
    if options.batch:  # read each arg as input path list
        for pathpath in files:
            inputs.extend(lines(pathpath))
        outputs = [path + '.tag' for path in inputs]
    elif len(files) == 1:
        if files[0] == '-':  # stdin to stdout
            inputs.append(None)
            outputs.append(None)
        else:
            inputs.append(files[0])
            outputs.append(options.out_path)
    else:  # multiple paths as args
        inputs = files
        outputs = [path + '.tag' for path in inputs]

    tagr.load_model()
    chunkr.load_model()

    # tag and chunk
    for in_path, out_path in zip(inputs, outputs):
        if in_path and options.verbose:
            sys.stderr.write('Processing %s...\n' % in_path)
        # This loop previously referred to `self`, bare `input_format` /
        # `output_format` and `chunk`, none of which exist in this function;
        # the tagger instance, the parsed options and `par` are used instead.
        # NOTE(review): assumes Tagger exposes `tagset` and `maca_config`
        # attributes and `disambiguate_sentence()` -- confirm against wcrft.
        reader = tagger_io.get_reader(
            in_path, tagr.tagset, options.input_format, tagr.maca_config)
        writer = tagger_io.get_writer(
            out_path, tagr.tagset, options.output_format)
        while True:
            par = reader.get_next_chunk()  # here `chunk' denotes paragraph
            if not par:
                break  # end of input
            # process each sentence separately
            for sent in par.sentences():
                # preserve_ambiguity = False
                tagr.disambiguate_sentence(sent)
                # TODO: chunk it actually
            # save tagged paragraph
            writer.write_chunk(par)
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment