#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
from collections import defaultdict as dd

import corpus2

# Usage text handed to OptionParser; optparse substitutes the script name
# for %prog.
descr = """%prog [options] CORPUSFILE

Reads a corpus file and reports some statistics.
This script is a demo of the Python API.
"""


def tokens(rdr):
    """Yield subsequent tokens from a reader.

    Calls ``rdr.get_next_token()`` repeatedly and stops at the first falsy
    result. Declared here for demonstration; ``go`` does not use it.
    """
    while True:
        tok = rdr.get_next_token()
        if not tok:
            break
        yield tok


def sentences(rdr):
    """Yield subsequent sentences from a reader.

    Calls ``rdr.get_next_sentence()`` repeatedly and stops at the first
    falsy result. Declared here for demonstration; ``go`` does not use it.
    """
    while True:
        sent = rdr.get_next_sentence()
        if not sent:
            break
        yield sent


def chunks(rdr):
    """Yield subsequent paragraphs (chunks) from a reader.

    Calls ``rdr.get_next_chunk()`` repeatedly and stops at the first falsy
    result.
    """
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            break
        yield chunk


def go():
    """Parse the command line, read the corpus and print its statistics.

    Reports the number of tokens, sentences and chunks, followed by the most
    frequent tags (at most ``--number-of-tags`` of them). With ``--verbose``
    every token and each of its lexemes is printed while reading.

    Exits with status 1 (message on stderr) unless exactly one positional
    argument, the corpus path, is given.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='xces',
                      help='set the input format; default: xces')
    #parser.add_option('-o', '--output-format', type='string', action='store',
        #dest='output_format', default='xces',
        #help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='kipi',
                      help='set the tagset used in input; default: kipi')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='report each token')
    parser.add_option('-n', '--number-of-tags', type='int', action='store',
                      dest='num_tags', default=10,
                      help='set the max number of tags to report')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # Diagnostics go to stderr so they never mix with the report.
        sys.stderr.write('You need to provide an input corpus.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    inpath = args[0]
    # load a tagset, create a reader
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(options.input_format,
                                                 tagset, inpath)

    # init stats (for this example)
    num_toks, num_sents, num_chunks = 0, 0, 0
    tag_count = dd(int)

    # NOTE: every print below takes exactly one pre-formatted string. That
    # form produces the same output under the Python 2 print statement and
    # the Python 3 print function, so no __future__ import is required.
    for chunk in chunks(rdr):
        for sent in chunk.sentences():
            for tok in sent.tokens():
                if options.verbose:
                    # On Python 2 this is a UTF-8 byte string; if you want a
                    # unicode object, use orth_utf8().decode('utf-8').
                    # NOTE(review): return type under Python 3 bindings is
                    # not visible from this file -- confirm before relying
                    # on it.
                    print(tok.orth_utf8())
                for lex in tok.lexemes():
                    tag_str = tagset.tag_to_string(lex.tag())
                    # Every lexeme counts, whether disambiguated or not.
                    tag_count[tag_str] += 1
                    if options.verbose:
                        lemma = lex.lemma_utf8()
                        # '+' marks lexemes for which is_disamb() is true.
                        mark = '+' if lex.is_disamb() else ' '
                        print('%s %s %s' % (mark, lemma, tag_str))
                num_toks += 1
            num_sents += 1
        num_chunks += 1

    print('Tokens: %d' % num_toks)
    print('Sents:  %d' % num_sents)
    print('Chunks: %d' % num_chunks)
    print('')
    print('Most frequent tags:')
    # A negative limit would make the slice drop tags from the END of the
    # ranking instead of limiting it; treat it as "report nothing".
    limit = max(options.num_tags, 0)
    # Most frequent first; ties are broken alphabetically by tag string.
    ranking = sorted(tag_count.items(),
                     key=lambda item: (-item[1], item[0]))
    for tc in ranking[:limit]:
        print('\t%s\t%d' % tc)


if __name__ == '__main__':
    go()