Skip to content
Snippets Groups Projects
Commit 9fae38f0 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

working iobber_txt, todo: add possibility to turn off chunker and default model names

parent 5a22c1f2
Branches
No related merge requests found
......@@ -15,6 +15,8 @@
import sys
from optparse import OptionParser
import corpus2
from wcrft import tagger
from wcrft import corpio as tagger_io
......@@ -23,8 +25,28 @@ import chunker
#ioformats = corpio.format_help
#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
descr = """%prog [options] TODO
""" + ioformats
# Usage / help text for the optparse-based CLI (%prog expands to the script
# name).  Fix: "writted" -> "written" in the user-facing batch-mode note.
descr = """%prog [options] [input_files]
IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology
Processes input files through the tagger (WCRFT, must be installed) and the chunker.
By default input is assumed to be plain text (UTF-8) and writes output in the CCL format.
Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
The default values for -c and -w are recommended, but you may need to set trained model
directories (-C and -W).
Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.
When processing multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.chunked.
"""
def lines(pathfilename):
......@@ -90,24 +112,42 @@ def go():
else: # multiple paths as args
inputs = files
outputs = [path + '.tag' for path in inputs]
tagr.load_model()
chunkr.load_model()
for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose:
sys.stderr.write('Processing %s...\n' % in_path)
reader = tagger_io.get_reader(in_path, self.tagset, input_format, self.maca_config)
writer = tagger_io.get_writer(out_path, self.tagset, output_format)
while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph
if not par:
break # end of input
# process each sentence separately
for sent in chunk.sentences():
# preserve_ambiguity = False
self.disambiguate_sentence(sent)
# TODO: chunk it actually
# save tagged paragraph
writer.write_chunk(par)
if inputs:
tagr.load_model()
chunkr.load_model()
assert (tagr.tagset.name()
== chunkr.tagset.name()), ('Tagger and chunker config must'
+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
chunkr.tagset.name()))
for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose:
sys.stderr.write('Processing %s...\n' % in_path)
reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format)
while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph
if not par:
break # end of input
# prepare new paragraph: chunker will need AnnotatedSentence objects
new_par = corpus2.Chunk()
for attr_key in par.attributes():
new_par.set_attr(attr_key, par.get_attribute(attr_key))
for sent in par.sentences():
# let it be wrapped and cloned if necessary
# this will make sure that later wrapping will
# not cause any further cloning, so references to new_sent
# and new_asent will be referring to the same underlying obj
new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
# preserve_ambiguity = False
tagr.disambiguate_sentence(new_sent)
chunkr.tag_sentence(new_sent)
# create a new paragraph with the new sentence
new_par.append(new_sent)
# save tagged paragraph
writer.write_chunk(new_par)
else:
sys.stderr.write('Nothing to do. See --help\n')
# Script entry point: run the full tag-and-chunk pipeline when this file is
# executed directly (no side effects on plain import).
if __name__ == '__main__':
    go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment