demo Python script

69967758 · Adam Radziszewski · 2c70f1ac · 69967758
Commit 69967758 authored 13 years ago by Adam Radziszewski
--- a/doc/corpstats.py
+++ b/doc/corpstats.py
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+from optparse import OptionParser
+from collections import defaultdict as dd
+import corpus2
+
+# Usage text handed to OptionParser in go(); optparse substitutes the
+# script name for %prog when it prints the usage message.
+descr = """%prog [options] CORPUSFILE
+Reads a corpus file and reports some statistics.
+This script is a demo of the Python API.
+"""
+
+def tokens(rdr):
+	"""Yields subsequent tokens from a reader.
+	Declared here for demonstration."""
+	while True:
+		tok = rdr.get_next_token()
+		if not tok:
+			break
+		yield tok
+
+def sentences(rdr):
+	"""Yields subsequent sentences from a reader.
+	Declared here for demonstration."""
+	while True:
+		sent = rdr.get_next_sentence()
+		if not sent:
+			break
+		yield sent
+
+def chunks(rdr):
+	"""Yields subsequent sentences from a reader."""
+	while True:
+		chunk = rdr.get_next_chunk()
+		if not chunk:
+			break
+		yield chunk
+
+def go():
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='xces',
+		help='set the input format; default: xces')
+	#parser.add_option('-o', '--output-format', type='string', action='store',
+		#dest='output_format', default='xces',
+		#help='set the output format; default: xces')
+	parser.add_option('-t', '--tagset', type='string', action='store',
+		dest='tagset', default='kipi',
+		help='set the tagset used in input; default: kipi')
+	parser.add_option('-v', '--verbose', action='store_true',
+		dest='verbose', default=False,
+		help='report each token')
+	parser.add_option('-n', '--number-of-tags', type='int', action='store',
+		dest='num_tags', default=10,
+		help='set the max number of tags to report')
+	(options, args) = parser.parse_args()
+	
+	if len(args) != 1:
+		print 'You need to provide an input corpus.'
+		print 'See %s --help' % sys.argv[0]
+		sys.exit(1)
+	
+	inpath = args[0]
+	# load a tagset, create a reader
+	tagset = corpus2.get_named_tagset(options.tagset)
+	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
+	# init stats (for this example)
+	num_toks, num_sents, num_chunks = 0, 0, 0
+	tag_count = dd(int) 
+	
+	for chunk in chunks(rdr):
+		for sent in chunk.sentences():
+			for tok in sent.tokens():
+				if options.verbose:
+					print tok.orth_utf8()
+				
+				for lex in tok.lexemes():
+					tag_str = tagset.tag_to_string(lex.tag())
+					tag_count[tag_str] += 1
+					
+					if options.verbose:
+						lemma = lex.lemma_utf8()
+						print ('+' if lex.is_disamb() else ' '), lemma, tag_str
+						# if you want a unicode object, orth_utf8().decode('utf-8')
+				num_toks += 1
+			num_sents += 1
+		num_chunks += 1
+		
+	
+	print 'Tokens:', num_toks
+	print 'Sents: ', num_sents
+	print 'Chunks:', num_chunks
+	print
+	print 'Most frequent tags:'
+	for tc in sorted(tag_count.items(), key=lambda tc: (-tc[1], tc[0]))[:options.num_tags]:
+		print '\t%s\t%d' % tc
+		
+
+
+# Run the demo only when executed as a script, not when imported.
+if __name__ == '__main__':
+	go()