#!/usr/bin/python
# -*- coding: utf-8 -*-
# Merges any number of corpus files into a single corpus written to stdout,
# optionally prefixing chunk/sentence ids with the name of the source file.
import sys
import os
from xml.sax.saxutils import escape
from optparse import OptionParser
from collections import defaultdict as dd
from itertools import repeat, izip

import corpus2

descr = """%prog [options] CORPUSFILES
Reads corpus files and outputs everything.
Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())


def sentences(rdr):
    """Yields subsequent sentences from a reader.

    Iteration stops at the first falsy value returned by
    rdr.get_next_sentence().
    """
    while True:
        sent = rdr.get_next_sentence()
        if not sent:
            break
        yield sent


def chunks(rdr):
    """Yields subsequent chunks from a reader.

    Iteration stops at the first falsy value returned by
    rdr.get_next_chunk().
    """
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            break
        yield chunk


def _make_parser():
    """Builds the command-line option parser for this script."""
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option('-C', '--chunks', action='store_true',
        dest='chunks', default=False,
        help='Process chunks (select chunks/sentences, not tokens)')
    parser.add_option('--prefix-chunks', action='store_true',
        dest='prefix_chunks', default=False,
        help='Prefix chunk ids with filename (file:NAME:ORIGID)')
    parser.add_option('--prefix-sentences', action='store_true',
        dest='prefix_sentences', default=False,
        help='Prefix sentences ids with filename (file:NAME:ORIGID)')
    parser.add_option('--documents-as-chunks', action='store_true',
        dest='documents_as_chunks', default=False,
        help='Writes every document into single chunk node')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='verbose mode')
    parser.add_option('--input-list', default=None,
        help='file with a list of input files')
    return parser


def _read_input_list(list_path):
    """Returns the non-empty, whitespace-stripped lines of list_path."""
    paths = []
    with open(list_path) as listfile:
        for line in listfile:
            path = line.strip()
            # A blank (e.g. trailing) line is not an input file; skip it
            # instead of handing an empty path to the reader factory.
            if path:
                paths.append(path)
    return paths


def _file_id(fname, their_id):
    """Formats an id as file:NAME:ORIGID."""
    return 'file:%s:%s' % (fname, their_id)


def _write_chunks(reader, writer, fname, prefix_ids):
    """Copies all chunks from reader to writer, optionally prefixing ids."""
    for chunk_no, chunk in enumerate(chunks(reader), 1):
        if prefix_ids:
            if chunk.has_attribute('id'):
                their_id = chunk.get_attribute('id')
            else:
                # autogen
                their_id = ('auto%03d' % chunk_no)
            chunk.set_attribute('id', _file_id(fname, their_id))
        writer.write_chunk(chunk)


def _write_sentences(reader, writer, fname, prefix_ids, as_single_chunk):
    """Copies all sentences from reader to writer.

    With as_single_chunk set, the sentences are gathered into one new chunk
    with id file:NAME:ch1, written after the whole input has been read;
    otherwise each sentence is written as soon as it is read.
    """
    big_chunk = None
    if as_single_chunk:
        big_chunk = corpus2.Chunk()
        big_chunk.set_attribute('id', _file_id(fname, 'ch1'))
    for sent_no, sent in enumerate(sentences(reader), 1):
        if prefix_ids:
            # Keep the sentence's own id when it has one and autogenerate
            # only when it is missing (mirrors the chunk handling).
            # NOTE(review): assumes sent.id() is falsy (empty) when no id
            # was set -- confirm against the corpus2 bindings.
            if sent.id():
                their_id = sent.id()
            else:
                # autogen
                their_id = ('s%d' % sent_no)
            sent.set_id(_file_id(fname, their_id))
        if big_chunk is not None:
            big_chunk.append(sent)
        else:
            writer.write_sentence(sent)
    if big_chunk is not None:
        writer.write_chunk(big_chunk)


def go():
    """Script entry point: parses options and merges the inputs to stdout.

    Input paths are the positional arguments followed by the entries of the
    --input-list file, if given. Prints a hint and exits with status 1 when
    no input path is available.
    """
    parser = _make_parser()
    (options, args) = parser.parse_args()
    if options.input_list is not None:
        args.extend(_read_input_list(options.input_list))
    if not args:
        print('You need to provide at least one input corpus.')
        print('See %s --help' % sys.argv[0])
        sys.exit(1)
    # load a tagset, create the (single, shared) stdout writer
    tagset = corpus2.get_named_tagset(options.tagset)
    writer = corpus2.TokenWriter.create_stdout_writer(
        options.output_format, tagset)
    for arg in args:
        reader = corpus2.TokenReader.create_path_reader(
            options.input_format, tagset, arg)
        # The id prefix is the file's base name without extension; it is
        # XML-escaped before being placed in an id.
        fname, _ = os.path.splitext(os.path.basename(arg))
        fname = escape(fname)
        if options.chunks:
            _write_chunks(reader, writer, fname, options.prefix_chunks)
        else:
            _write_sentences(reader, writer, fname,
                options.prefix_sentences, options.documents_as_chunks)
        # Drop the reader before opening the next input.
        del reader


if __name__ == '__main__':
    go()