Skip to content
Snippets Groups Projects
Commit 69967758 authored by Adam Radziszewski
Browse files

demo Python script

parent 2c70f1ac
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
from collections import defaultdict as dd
import corpus2
# Usage/description text passed to OptionParser(usage=...) in go();
# optparse expands "%prog" to the name of the running script.
descr = """%prog [options] CORPUSFILE
Reads a corpus file and reports some statistics.
This script is a demo of the Python API.
"""
def tokens(rdr):
    """Yield subsequent tokens from a reader.

    Declared here for demonstration.

    rdr must provide a get_next_token() method.  It is called once per
    yielded token, and the generator stops at the first falsy value
    returned (the reader is not queried again after that).
    """
    while True:
        tok = rdr.get_next_token()
        if not tok:
            # a falsy result is treated as end of input
            break
        yield tok
def sentences(rdr):
    """Yield subsequent sentences from a reader.

    Declared here for demonstration.

    rdr must provide a get_next_sentence() method.  It is called once per
    yielded sentence, and the generator stops at the first falsy value
    returned (the reader is not queried again after that).
    """
    while True:
        sent = rdr.get_next_sentence()
        if not sent:
            # a falsy result is treated as end of input
            break
        yield sent
def chunks(rdr):
    """Yield subsequent chunks from a reader.

    rdr must provide a get_next_chunk() method.  It is called once per
    yielded chunk, and the generator stops at the first falsy value
    returned (the reader is not queried again after that).
    """
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            # a falsy result is treated as end of input
            break
        yield chunk
def go():
    """Parse the command line, read a corpus and print tag statistics.

    Exactly one positional argument is expected: the path of the corpus
    file.  The corpus is read chunk by chunk through a corpus2 reader
    created for the selected input format and tagset.  Tokens, sentences
    and chunks are counted, and the tag of every lexeme (rendered as a
    string by the tagset) is tallied.  With --verbose, each token's orth
    and each of its lexemes are echoed while reading.  At the end the
    totals and the most frequent tags (at most --number-of-tags of them)
    are printed to stdout.

    Exits with status 1 (message on stderr) unless exactly one positional
    argument is given.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='kipi',
                      help='set the tagset used in input; default: kipi')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='report each token')
    parser.add_option('-n', '--number-of-tags', type='int', action='store',
                      dest='num_tags', default=10,
                      help='set the max number of tags to report')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # usage errors go to stderr so they never mix with the report
        sys.stderr.write('You need to provide an input corpus.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    inpath = args[0]

    # load a tagset, create a reader
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, inpath)

    # init stats (for this example)
    num_toks, num_sents, num_chunks = 0, 0, 0
    tag_count = dd(int)

    # NOTE: print is always called with one parenthesised argument; that
    # form gives the same output as a Python 2 print statement and as the
    # Python 3 print function.
    for chunk in chunks(rdr):
        for sent in chunk.sentences():
            for tok in sent.tokens():
                if options.verbose:
                    # if you want a unicode object,
                    # orth_utf8().decode('utf-8')
                    print(tok.orth_utf8())
                for lex in tok.lexemes():
                    tag_str = tagset.tag_to_string(lex.tag())
                    tag_count[tag_str] += 1
                    if options.verbose:
                        # '+' marks lexemes flagged as disambiguated
                        mark = '+' if lex.is_disamb() else ' '
                        print('%s %s %s' % (mark, lex.lemma_utf8(), tag_str))
                num_toks += 1
            num_sents += 1
        num_chunks += 1

    print('Tokens: %d' % num_toks)
    print('Sents:  %d' % num_sents)
    print('Chunks: %d' % num_chunks)
    print('')
    print('Most frequent tags:')
    # most frequent first; ties are broken alphabetically by tag string
    ranking = sorted(tag_count.items(),
                     key=lambda item: (-item[1], item[0]))
    # a negative limit would silently slice off the tail; treat it as 0
    limit = max(options.num_tags, 0)
    for tag_str, count in ranking[:limit]:
        print('\t%s\t%d' % (tag_str, count))
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    go()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment