Skip to content
Snippets Groups Projects
Commit 1769ac2b authored by jezozwierzak's avatar jezozwierzak
Browse files

Merge branch 'master' of nlp.pwr.wroc.pl:iobber

parents 26075b92 9fae38f0
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This file is part of IOBBER
# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
# IOBBER is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# IOBBER is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the LICENCE and COPYING files for more details
import sys
from optparse import OptionParser
import corpus2
from wcrft import tagger
from wcrft import corpio as tagger_io
import chunker
#ioformats = corpio.format_help
#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
# Usage/help text handed to OptionParser; %prog is expanded by optparse to the
# script name. The output suffix named here must match the one go() appends
# to each input path ('.tag').
descr = """%prog [options] [input_files]
IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology
Processes input files through the tagger (WCRFT, must be installed) and the chunker.
By default input is assumed to be plain text (UTF-8) and writes output in the CCL format.
Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
The default values for -c and -w are recommended, but you may need to set trained model
directories (-C and -W).
Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.
When processing multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.tag.
"""
def lines(pathfilename):
    """Return the non-blank lines of the file at pathfilename.

    Each line is stripped of leading and trailing whitespace; lines that
    are empty after stripping are left out. Order is preserved.
    """
    collected = []
    with open(pathfilename) as handle:
        for raw in handle:
            entry = raw.strip()
            if entry:
                collected.append(entry)
    return collected
def _build_option_parser():
    """Create the command-line option parser for this script."""
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='txt',
                      help='set the input format; default: txt')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
                      dest='out_path', default='',
                      help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunker-config', type='string', action='store',
                      dest='chunker_config', default='kpwr.ini',
                      help='use given chunker config (default: kpwr.ini)')
    parser.add_option('-C', '--chunker-model', type='string', action='store',
                      dest='chunker_dir', default='',
                      help='read chunker trained model from the given dir')
    parser.add_option('-w', '--tagger-config', type='string', action='store',
                      dest='tagger_config', default='nkjp.ini',
                      help='use given tagger (wcrft) config (default: nkjp.ini)')
    parser.add_option('-W', '--tagger-model', type='string', action='store',
                      dest='tagger_dir', default='',
                      help='read tagger (wcrft) trained model from the given dir')
    parser.add_option('-m', '--maca-config', type='string', action='store',
                      dest='maca_config', default='',
                      help='override maca config file specified in tagger config')
    parser.add_option('--batch', action='store_true',
                      dest='batch', default=False,
                      help='treat arguments as lists of paths to files to tag')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='write additional info to stderr')
    return parser


def _plan_io(options, files):
    """Pair every input path with the path its output should go to.

    Returns (inputs, outputs), two lists of equal length:
      * with --batch, every argument is a file listing input paths (one per
        line) and each output is the input path plus '.tag';
      * a single '-' argument yields (None, None), i.e. stdin to stdout;
      * a single path is paired with options.out_path (empty by default,
        which the usage text describes as writing to stdout);
      * several paths are each paired with the input path plus '.tag'.
    An empty argument list yields two empty lists.
    """
    if options.batch:  # read each arg as input path list
        inputs = []
        for list_path in files:
            inputs.extend(lines(list_path))
        return inputs, [path + '.tag' for path in inputs]
    if len(files) == 1:
        if files[0] == '-':  # stdin to stdout
            return [None], [None]
        return [files[0]], [options.out_path]
    # zero or multiple paths given directly as args
    inputs = list(files)
    return inputs, [path + '.tag' for path in inputs]


def _process_stream(tagr, chunkr, reader, writer):
    """Tag and chunk every paragraph from reader and pass it to writer."""
    while True:
        par = reader.get_next_chunk()  # here `chunk' denotes paragraph
        if not par:
            break  # end of input
        # prepare new paragraph: chunker will need AnnotatedSentence objects
        new_par = corpus2.Chunk()
        for attr_key in par.attributes():
            new_par.set_attr(attr_key, par.get_attribute(attr_key))
        for sent in par.sentences():
            # let it be wrapped and cloned if necessary
            # this will make sure that later wrapping will
            # not cause any further cloning, so references to new_sent
            # and new_asent will be referring to the same underlying obj
            new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
            # preserve_ambiguity = False
            tagr.disambiguate_sentence(new_sent)
            chunkr.tag_sentence(new_sent)
            # create a new paragraph with the new sentence
            new_par.append(new_sent)
        # save tagged paragraph
        writer.write_chunk(new_par)


def go():
    """Command-line entry point: tag and chunk the inputs named in sys.argv.

    Parses the options, builds the chunker and the tagger from their
    configs, decides which inputs map to which outputs and runs every input
    through the tagger and then the chunker. When there is nothing to
    process, a hint is written to stderr instead.

    Raises:
        AssertionError: when the tagger and the chunker configs name
            different tagsets.
    """
    parser = _build_option_parser()
    (options, args) = parser.parse_args()

    chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
    tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)
    if options.maca_config:
        tagr.maca_config = options.maca_config
    # TODO option not to use chunker

    inputs, outputs = _plan_io(options, args)
    if not inputs:
        sys.stderr.write('Nothing to do. See --help\n')
        return

    tagr.load_model()
    chunkr.load_model()
    tagger_tagset = tagr.tagset.name()
    chunker_tagset = chunkr.tagset.name()
    # Raised explicitly rather than via `assert`, so the check is still
    # performed under `python -O`; the exception type stays the same.
    if tagger_tagset != chunker_tagset:
        raise AssertionError(
            'Tagger and chunker config must operate on the same tagset: '
            '%s v. %s' % (tagger_tagset, chunker_tagset))

    # tag and chunk
    for in_path, out_path in zip(inputs, outputs):
        if in_path and options.verbose:
            sys.stderr.write('Processing %s...\n' % in_path)
        reader = tagger_io.get_reader(
            in_path, tagr.tagset, options.input_format, tagr.maca_config)
        writer = tagger_io.get_writer(
            out_path, tagr.tagset, options.output_format)
        _process_stream(tagr, chunkr, reader, writer)
# Run the command-line front end only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    go()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment