Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
2b8f6a60
Commit
2b8f6a60
authored
13 years ago
by
ilor
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of nlp.pwr.wroc.pl:corpus2
Conflicts: swig/libcorpustokenreader.i
parents
56a53217
69967758
Branches
Branches containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
doc/corpstats.py
+101
-0
101 additions, 0 deletions
doc/corpstats.py
swig/libcorpustokenreader.i
+12
-3
12 additions, 3 deletions
swig/libcorpustokenreader.i
with
113 additions
and
3 deletions
doc/corpstats.py
0 → 100755
+
101
−
0
View file @
2b8f6a60
#!/usr/bin/python
# -*- coding: utf-8 -*-
import
sys
from
optparse
import
OptionParser
from
collections
import
defaultdict
as
dd
import
corpus2
descr
=
"""
%prog [options] CORPUSFILE
Reads a corpus file and reports some statistics.
This script is a demo of the Python API.
"""
def tokens(rdr):
    """Generator over all tokens produced by a reader.

    Declared here for demonstration of the Python API; repeatedly
    calls get_next_token() until the reader yields a falsy value.
    """
    tok = rdr.get_next_token()
    while tok:
        yield tok
        tok = rdr.get_next_token()
def sentences(rdr):
    """Generator over all sentences produced by a reader.

    Declared here for demonstration of the Python API; repeatedly
    calls get_next_sentence() until the reader yields a falsy value.
    """
    sent = rdr.get_next_sentence()
    while sent:
        yield sent
        sent = rdr.get_next_sentence()
def chunks(rdr):
    """Yields subsequent chunks from a reader.

    NOTE: the original docstring said "sentences", which was a
    copy-paste error — this generator drains get_next_chunk(),
    stopping at the first falsy result.
    """
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            break
        yield chunk
def
go
():
parser
=
OptionParser
(
usage
=
descr
)
parser
.
add_option
(
'
-i
'
,
'
--input-format
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
input_format
'
,
default
=
'
xces
'
,
help
=
'
set the input format; default: xces
'
)
#parser.add_option('-o', '--output-format', type='string', action='store',
#dest='output_format', default='xces',
#help='set the output format; default: xces')
parser
.
add_option
(
'
-t
'
,
'
--tagset
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
tagset
'
,
default
=
'
kipi
'
,
help
=
'
set the tagset used in input; default: kipi
'
)
parser
.
add_option
(
'
-v
'
,
'
--verbose
'
,
action
=
'
store_true
'
,
dest
=
'
verbose
'
,
default
=
False
,
help
=
'
report each token
'
)
parser
.
add_option
(
'
-n
'
,
'
--number-of-tags
'
,
type
=
'
int
'
,
action
=
'
store
'
,
dest
=
'
num_tags
'
,
default
=
10
,
help
=
'
set the max number of tags to report
'
)
(
options
,
args
)
=
parser
.
parse_args
()
if
len
(
args
)
!=
1
:
print
'
You need to provide an input corpus.
'
print
'
See %s --help
'
%
sys
.
argv
[
0
]
sys
.
exit
(
1
)
inpath
=
args
[
0
]
# load a tagset, create a reader
tagset
=
corpus2
.
get_named_tagset
(
options
.
tagset
)
rdr
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
inpath
)
# init stats (for this example)
num_toks
,
num_sents
,
num_chunks
=
0
,
0
,
0
tag_count
=
dd
(
int
)
for
chunk
in
chunks
(
rdr
):
for
sent
in
chunk
.
sentences
():
for
tok
in
sent
.
tokens
():
if
options
.
verbose
:
print
tok
.
orth_utf8
()
for
lex
in
tok
.
lexemes
():
tag_str
=
tagset
.
tag_to_string
(
lex
.
tag
())
tag_count
[
tag_str
]
+=
1
if
options
.
verbose
:
lemma
=
lex
.
lemma_utf8
()
print
(
'
+
'
if
lex
.
is_disamb
()
else
'
'
),
lemma
,
tag_str
# if you want a unicode object, orth_utf8().decode('utf-8')
num_toks
+=
1
num_sents
+=
1
num_chunks
+=
1
print
'
Tokens:
'
,
num_toks
print
'
Sents:
'
,
num_sents
print
'
Chunks:
'
,
num_chunks
print
print
'
Most frequent tags:
'
for
tc
in
sorted
(
tag_count
.
items
(),
key
=
lambda
tc
:
(
-
tc
[
1
],
tc
[
0
]))[:
options
.
num_tags
]:
print
'
\t
%s
\t
%d
'
%
tc
if
__name__
==
'
__main__
'
:
go
()
This diff is collapsed.
Click to expand it.
swig/libcorpustokenreader.i
+
12
−
3
View file @
2b8f6a60
...
@@ -18,13 +18,15 @@
...
@@ -18,13 +18,15 @@
%
nodefaultctor
Corpus2
::
TokenReader
;
%
nodefaultctor
Corpus2
::
TokenReader
;
%
template
(
TokenReaderPtr
)
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
;
%
template
(
TokenReaderPtr
)
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
;
%
template
(
TokenPtr
)
boost
::
shared_ptr
<
Corpus2
::
Token
>
;
// %template(StdStringVector) std::vector<std::string>;
// %template(StdStringVector) std::vector<std::string>;
// %template(ChunkPtr) boost::shared_ptr<Corpus2::Chunk>;
// %template(ChunkPtr) boost::shared_ptr<Corpus2::Chunk>;
typedef
boost
::
shared_ptr
<
Corpus2
::
Token
>
TokenPtr
;
namespace
Corpus2
{
namespace
Corpus2
{
class TokenReader {
class TokenReader {
public:
public:
typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
//typedef boost::shared_ptr<Token> TokenPtr;
/* --------------------------------------------------------------------- */
/* --------------------------------------------------------------------- */
explicit TokenReader(const Tagset& tagset);
explicit TokenReader(const Tagset& tagset);
...
@@ -60,7 +62,7 @@ namespace Corpus2 {
...
@@ -60,7 +62,7 @@ namespace Corpus2 {
std::istream& stream);
std::istream& stream);
/* --------------------------------------------------------------------- */
/* --------------------------------------------------------------------- */
virtual Token* get_next_token() = 0;
/*
virtual Token* get_next_token() = 0;
*/
virtual Sentence::Ptr get_next_sentence() = 0;
virtual Sentence::Ptr get_next_sentence() = 0;
virtual boost::shared_ptr<Chunk> get_next_chunk() = 0;
virtual boost::shared_ptr<Chunk> get_next_chunk() = 0;
...
@@ -77,7 +79,14 @@ namespace Corpus2 {
...
@@ -77,7 +79,14 @@ namespace Corpus2 {
static std::vector<std::string> available_reader_types_help();
static std::vector<std::string> available_reader_types_help();
};
};
%feature("autodoc", "1");
%extend TokenReader {
/* modfify the native get_next_token to wrap the tokens into shared_ptr */
boost::shared_ptr<Corpus2::Token> get_next_token() {
return boost::shared_ptr<Corpus2::Token>(self->get_next_token());
}
}
%feature("autodoc", "1");
std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
const std::string& data, const Tagset& tagset, const std::string& format);
const std::string& data, const Tagset& tagset, const std::string& format);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment