#!/usr/bin/python # -*- coding: utf-8 -*- import sys from optparse import OptionParser from collections import defaultdict as dd from itertools import repeat, izip import corpus2 descr = """%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]] Reads a corpus file and outputs all or some tokens. Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """ """ + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """ Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """ """ + ' '.join(corpus2.TokenWriter.available_writer_types_help()) def parse_range_info(s): """Parses a comma-separated list of numbers that can also be dash-separated ranges""" selection = set() for elem in (x.strip() for x in s.split(',')): try: selection.add(int(elem)) except: split = [int(x.strip()) for x in elem.split('-')] try: if len(split) == 2: split.sort() for x in xrange(split[0], split[1]+1): selection.add(x) else: raise except: print "Fail:", elem return selection def sentences(rdr): """Yields subsequent sentences from a reader. Declared here for demonstration.""" while True: sent = rdr.get_next_sentence() if not sent: break yield sent def chunks(rdr): """Yields subsequent sentences from a reader.""" while True: chunk = rdr.get_next_chunk() if not chunk: break yield chunk def write_selected_sentences(sents, writer, selection, maxsel = None): sid = 0 for sent in sents: if sid in selection: if len(selection[sid]) == 0: writer.write_sentence(sent) else: tid = 0 for tok in sent.tokens(): if tid in selection[sid]: writer.write_token(tok) tid += 1 sid += 1 if maxsel is not None and sid > maxsel: break def go(): parser = OptionParser(usage=descr) parser.add_option('-i', '--input-format', type='string', action='store', dest='input_format', default='xces', help='set the input format; default: xces-fast') parser.add_option('-o', '--output-format', type='string', action='store', dest='output_format', default='xces', help='set the output format; default: xces') parser.add_option('-t', '--tagset', type='string', action='store', dest='tagset', default='kipi', help='set the tagset used in input; default: kipi') parser.add_option('-C', '--chunks', action='store_true', dest='chunks', default=False, help='Process chunks (select chunks/sentences, not tokens)') parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='verbose mode') parser.add_option('--input-list', default=None, help='file with a list of input files') (options, args) = parser.parse_args() if len(args) < 1: print 'You need to provide an input corpus.' print 'See %s --help' % sys.argv[0] sys.exit(1) inpath = args[0] # load a tagset, create a reader tagset = corpus2.get_named_tagset(options.tagset) reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath) writer = corpus2.TokenWriter.create_stdout_writer(options.output_format, tagset) selection = {} for arg in args[1:]: if ':' in arg: sp = arg.split(':') if len(sp) == 2 and options.chunks: selection.update(izip(parse_range_info(sp[0]), repeat(dict(izip(parse_range_info(sp[1]), repeat(())))))) elif len(sp) == 3 and options.chunks: selection.update(izip(parse_range_info(sp[0]), repeat(dict(izip(parse_range_info(sp[1]), repeat(parse_range_info(sp[2]))))))) elif len(sp) == 2: selection.update(izip(parse_range_info(sp[0]), repeat(parse_range_info(sp[1])))) else: print >> sys.stderr, "Invalid argument:", arg return else: selection.update(izip(parse_range_info(arg), repeat(()))) maxsel = max(selection.keys()) if selection.keys() != [] else None if selection == {}: if options.chunks: for chunk in chunks(reader): writer.write_chunk(chunk) else: for sent in sentences(reader): writer.write_sentence(sent) else: if options.chunks: cid = 0 for chunk in chunks(reader): if cid in selection: if len(selection[cid]) == 0: writer.write_chunk(chunk) else: write_selected_sentences(chunk.sentences(), writer, selection[cid]) cid += 1 if maxsel is not None and cid > maxsel: break else: write_selected_sentences(sentences(reader), writer, selection, maxsel) if __name__ == '__main__': go()