Skip to content
Snippets Groups Projects
Commit 9fae38f0 authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

working iobber_txt, todo: add possibility to turn off chunker and default model names

parent 5a22c1f2
No related branches found
No related tags found
No related merge requests found
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
import sys import sys
from optparse import OptionParser from optparse import OptionParser
import corpus2
from wcrft import tagger from wcrft import tagger
from wcrft import corpio as tagger_io from wcrft import corpio as tagger_io
...@@ -23,8 +25,28 @@ import chunker ...@@ -23,8 +25,28 @@ import chunker
#ioformats = corpio.format_help #ioformats = corpio.format_help
#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl') #ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
descr = """%prog [options] TODO descr = """%prog [options] [input_files]
""" + ioformats
IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology
Processes input files through the tagger (WCRFT, must be installed) and the chunker.
By default, input is assumed to be plain text (UTF-8) and output is written in the CCL format.
Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
The default values for -c and -w are recommended, but you may need to set trained model
directories (-C and -W).
Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.
When processing multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.chunked.
"""
def lines(pathfilename): def lines(pathfilename):
...@@ -90,24 +112,42 @@ def go(): ...@@ -90,24 +112,42 @@ def go():
else: # multiple paths as args else: # multiple paths as args
inputs = files inputs = files
outputs = [path + '.tag' for path in inputs] outputs = [path + '.tag' for path in inputs]
if inputs:
tagr.load_model() tagr.load_model()
chunkr.load_model() chunkr.load_model()
assert (tagr.tagset.name()
== chunkr.tagset.name()), ('Tagger and chunker config must'
+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
chunkr.tagset.name()))
for in_path, out_path in zip(inputs, outputs): for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose: if in_path and options.verbose:
sys.stderr.write('Processing %s...\n' % in_path) sys.stderr.write('Processing %s...\n' % in_path)
reader = tagger_io.get_reader(in_path, self.tagset, input_format, self.maca_config) reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, self.tagset, output_format) writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format)
while True: while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph par = reader.get_next_chunk() # here `chunk' denotes paragraph
if not par: if not par:
break # end of input break # end of input
# process each sentence separately # prepare new paragraph: chunker will need AnnotatedSentence objects
for sent in chunk.sentences(): new_par = corpus2.Chunk()
for attr_key in par.attributes():
new_par.set_attr(attr_key, par.get_attribute(attr_key))
for sent in par.sentences():
# let it be wrapped and cloned if necessary
# this will make sure that later wrapping will
# not cause any further cloning, so references to new_sent
# and new_asent will be referring to the same underlying obj
new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
# preserve_ambiguity = False # preserve_ambiguity = False
self.disambiguate_sentence(sent) tagr.disambiguate_sentence(new_sent)
# TODO: chunk it actually chunkr.tag_sentence(new_sent)
# create a new paragraph with the new sentence
new_par.append(new_sent)
# save tagged paragraph # save tagged paragraph
writer.write_chunk(par) writer.write_chunk(new_par)
else:
sys.stderr.write('Nothing to do. See --help\n')
if __name__ == '__main__': if __name__ == '__main__':
go() go()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment