Skip to content
Snippets Groups Projects
Commit 69967758 authored by Adam Radziszewski
Browse files

demo Python script

parent 2c70f1ac
Branches
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
from collections import defaultdict as dd
import corpus2
# Usage text handed to OptionParser in go(); optparse replaces %prog with
# the name of the running script.
descr = """%prog [options] CORPUSFILE
Reads a corpus file and reports some statistics.
This script is a demo of the Python API.
"""
def tokens(rdr):
    """Generate the tokens of a reader one at a time.

    Calls rdr.get_next_token() repeatedly and stops at the first falsy
    result. Defined in this script for demonstration.
    """
    current = rdr.get_next_token()
    while current:
        yield current
        current = rdr.get_next_token()
def sentences(rdr):
    """Generate the sentences of a reader one at a time.

    Calls rdr.get_next_sentence() repeatedly and stops at the first falsy
    result. Defined in this script for demonstration.
    """
    current = rdr.get_next_sentence()
    while current:
        yield current
        current = rdr.get_next_sentence()
def chunks(rdr):
    """Yield subsequent chunks from a reader.

    Calls rdr.get_next_chunk() repeatedly and stops at the first falsy
    result, so the generator ends when the reader has nothing more to
    return.
    """
    chunk = rdr.get_next_chunk()
    while chunk:
        yield chunk
        chunk = rdr.get_next_chunk()
def go():
    """Parse the command line, read the corpus and print statistics.

    Expects exactly one positional argument: the path of the corpus file.
    The corpus is walked chunk by chunk, counting chunks, sentences and
    tokens, and counting how many lexemes carry each tag. The totals are
    printed, followed by the most frequent tags (at most --number-of-tags
    of them, ordered by descending count, ties broken by tag string).
    With --verbose every token and each of its lexemes is printed as it
    is read.

    Exits with status 1 when the number of positional arguments is not
    one, and aborts through parser.error() when --number-of-tags is
    negative.

    Every print call takes a single, pre-formatted string so that the
    same code produces the same output whether print is the Python 2
    statement or the Python 3 function.
    """
    parser = OptionParser(usage=descr)
    parser.add_option(
        '-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option(
        '-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option(
        '-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='report each token')
    parser.add_option(
        '-n', '--number-of-tags', type='int', action='store',
        dest='num_tags', default=10,
        help='set the max number of tags to report')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        print('You need to provide an input corpus.')
        print('See %s --help' % sys.argv[0])
        sys.exit(1)
    if options.num_tags < 0:
        # A negative limit would make the slice at the end drop entries
        # from the tail of the ranking instead of capping its length.
        parser.error('--number-of-tags must not be negative')
    inpath = args[0]

    # load a tagset, create a reader
    tagset = corpus2.get_named_tagset(options.tagset)
    rdr = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, inpath)

    # init stats (for this example)
    num_toks, num_sents, num_chunks = 0, 0, 0
    tag_count = dd(int)

    for chunk in chunks(rdr):
        for sent in chunk.sentences():
            for tok in sent.tokens():
                if options.verbose:
                    print(tok.orth_utf8())
                for lex in tok.lexemes():
                    tag_str = tagset.tag_to_string(lex.tag())
                    tag_count[tag_str] += 1
                    if options.verbose:
                        lemma = lex.lemma_utf8()
                        # '+' marks lexemes for which is_disamb() is true
                        mark = '+' if lex.is_disamb() else ' '
                        print('%s %s %s' % (mark, lemma, tag_str))
                # if you want a unicode object, orth_utf8().decode('utf-8')
                num_toks += 1
            num_sents += 1
        num_chunks += 1

    print('Tokens: %d' % num_toks)
    print('Sents:  %d' % num_sents)
    print('Chunks: %d' % num_chunks)
    print('')
    print('Most frequent tags:')
    # Highest count first; equal counts are ordered by tag string.
    ranking = sorted(tag_count.items(),
                     key=lambda item: (-item[1], item[0]))
    for entry in ranking[:options.num_tags]:
        print('\t%s\t%d' % entry)
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment