diff --git a/iobber/iobber_txt.py b/iobber/iobber_txt.py old mode 100644 new mode 100755 index b0bc022d10fe30905efb721f4609e694e182289b..19346d80be3718e12f410bb4d19950969b99d169 --- a/iobber/iobber_txt.py +++ b/iobber/iobber_txt.py @@ -15,6 +15,8 @@ import sys from optparse import OptionParser +import corpus2 + from wcrft import tagger from wcrft import corpio as tagger_io @@ -23,8 +25,28 @@ import chunker #ioformats = corpio.format_help #ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl') -descr = """%prog [options] TODO -""" + ioformats descr = """%prog [options] [input_files] + +IOBBER, configurable chunker. +(C) 2012, Wroclaw University of Technology + +Processes input files through the tagger (WCRFT, must be installed) and the chunker. +By default input is assumed to be plain text (UTF-8) and writes output in the CCL format. + +Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model. +Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model. + +The default values for -c and -w are recommended, but you may need to set trained model +directories (-C and -W). + +Use -O to specify output path (by default will write to stdout). +Use - to process stdin to stdout. + +When processing multiple files, either give the filenames directly as arguments, +or use --batch and provide a filename to a list of paths. Either way, each file +will be chunked and the output written to FILENAME.chunked. 
+ +""" def lines(pathfilename): @@ -90,24 +112,42 @@ def go(): else: # multiple paths as args inputs = files outputs = [path + '.tag' for path in inputs] - tagr.load_model() - chunkr.load_model() - for in_path, out_path in zip(inputs, outputs): - if in_path and options.verbose: - sys.stderr.write('Processing %s...\n' % in_path) - reader = tagger_io.get_reader(in_path, self.tagset, input_format, self.maca_config) - writer = tagger_io.get_writer(out_path, self.tagset, output_format) - while True: - par = reader.get_next_chunk() # here `chunk' denotes paragraph - if not par: - break # end of input - # process each sentence separately - for sent in chunk.sentences(): - # preserve_ambiguity = False - self.disambiguate_sentence(sent) - # TODO: chunk it actually - # save tagged paragraph - writer.write_chunk(par) + if inputs: + tagr.load_model() + chunkr.load_model() + assert (tagr.tagset.name() + == chunkr.tagset.name()), ('Tagger and chunker config must ' + + 'operate on the same tagset: %s v. 
%s' % (tagr.tagset.name(), + chunkr.tagset.name())) + for in_path, out_path in zip(inputs, outputs): + if in_path and options.verbose: + sys.stderr.write('Processing %s...\n' % in_path) + reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config) + writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format) + while True: + par = reader.get_next_chunk() # here `chunk' denotes paragraph + if not par: + break # end of input + # prepare new paragraph: chunker will need AnnotatedSentence objects + new_par = corpus2.Chunk() + for attr_key in par.attributes(): + new_par.set_attr(attr_key, par.get_attribute(attr_key)) + for sent in par.sentences(): + # let it be wrapped and cloned if necessary + # this will make sure that later wrapping will + # not cause any further cloning, so references to new_sent + # and new_asent will be referring to the same underlying obj + new_asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent) + # preserve_ambiguity = False + tagr.disambiguate_sentence(new_sent) + chunkr.tag_sentence(new_sent) + # create a new paragraph with the new sentence + new_par.append(new_sent) + # save tagged paragraph + writer.write_chunk(new_par) + else: + sys.stderr.write('Nothing to do. See --help\n') if __name__ == '__main__': go()