rewrite corpus-get in python

27e5a477 · ilor · 6cc8e28a · 27e5a477 · 27e5a477 · 6cc8e28a
Commit 27e5a477 authored May 24, 2011 by ilor
--- a/corpus2tools/CMakeLists.txt
+++ b/corpus2tools/CMakeLists.txt
@@ -12,15 +12,15 @@ include_directories( ${CMAKE_SOURCE_DIR} )
 add_executable( tagset-tool tagset-tool.cpp )
 target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})

-add_executable( corpus-get corpus-get.cpp )
-target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
-
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})

 if(UNIX)

-	install(TARGETS tagset-tool corpus-get
+	install(TARGETS tagset-tool
 		RUNTIME DESTINATION bin)
-
+	# corpus-get is a script, not a built target, so it is installed as a
+	# program: PROGRAMS applies the executable permission set (rwxr-xr-x).
+	install(PROGRAMS corpus-get
+		DESTINATION bin)
 endif(UNIX)
--- a/corpus2tools/corpus-get
+++ b/corpus2tools/corpus-get
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+from optparse import OptionParser
+from collections import defaultdict as dd
+from itertools import repeat, izip
+import corpus2
+
+# Usage/help text passed to OptionParser (which expands %prog itself).
+# The reader/writer type lists are queried from corpus2 when the module loads.
+descr = '\n'.join([
+	"%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]]",
+	"Reads a corpus file and outputs all or some tokens.",
+	"Available input formats: " + ' '.join(corpus2.TokenReader.available_reader_types()),
+	' '.join(corpus2.TokenReader.available_reader_types_help()),
+	"Available output formats: " + ' '.join(corpus2.TokenWriter.available_writer_types()),
+	' '.join(corpus2.TokenWriter.available_writer_types_help()),
+])
+
+def parse_range_info(s):
+	"""Parse a comma-separated list of indices into a set of ints.
+
+	Each element is either a single integer ("7") or an inclusive,
+	dash-separated range ("3-9"); a reversed range ("9-3") selects the
+	same indices. Whitespace around elements and bounds is ignored.
+
+	Elements that cannot be parsed are reported on stderr as
+	"Fail: <element>" and skipped, so the corpus written to stdout is
+	never mixed with diagnostics.
+
+	Returns the set of all selected indices (possibly empty).
+	"""
+	selection = set()
+	for elem in (x.strip() for x in s.split(',')):
+		# Plain integer first; this also covers negative numbers such as "-3".
+		try:
+			selection.add(int(elem))
+			continue
+		except ValueError:
+			pass
+		bounds = elem.split('-')
+		try:
+			if len(bounds) != 2:
+				raise ValueError(elem)
+			# Convert before ordering: sorting the strings would put "10"
+			# before "9" and silently turn "9-10" into an empty range.
+			first, last = sorted(int(b) for b in bounds)
+		except ValueError:
+			sys.stderr.write("Fail: %s\n" % elem)
+			continue
+		selection.update(range(first, last + 1))
+	return selection
+
+def sentences(rdr):
+	"""Generator over the sentences of a reader.
+
+	Keeps calling rdr.get_next_sentence() and yields each result until
+	the reader hands back a false value. Declared here for demonstration.
+	"""
+	current = rdr.get_next_sentence()
+	while current:
+		yield current
+		current = rdr.get_next_sentence()
+
+
+def chunks(rdr):
+	"""Generator over the chunks of a reader.
+
+	Keeps calling rdr.get_next_chunk() and yields each result until the
+	reader hands back a false value.
+	"""
+	current = rdr.get_next_chunk()
+	while current:
+		yield current
+		current = rdr.get_next_chunk()
+
+
+def write_selected_sentences(sents, writer, selection):
+	"""Write the selected sentences, or selected tokens of them.
+
+	sents     -- iterable of sentences, numbered from 0 in iteration order
+	writer    -- object providing write_sentence() and write_token()
+	selection -- mapping from sentence index to a collection of token
+	             indices; an empty collection selects the whole sentence
+
+	Sentences whose index is not a key of selection are skipped.
+	"""
+	for sent_no, sent in enumerate(sents):
+		if sent_no not in selection:
+			continue
+		wanted = selection[sent_no]
+		if len(wanted) == 0:
+			writer.write_sentence(sent)
+			continue
+		for tok_no, tok in enumerate(sent.tokens()):
+			if tok_no in wanted:
+				writer.write_token(tok)
+
+def go():
+	"""Command-line entry point: parse the options, then copy the selected
+	chunks, sentences or tokens of the input corpus to standard output.
+
+	Positional arguments after CORPUSFILE are selectors made of
+	parse_range_info() ranges joined with ':' :
+	  without -C:  SENTENCES  or  SENTENCES:TOKENS
+	  with -C:     CHUNKS,  CHUNKS:SENTENCES  or  CHUNKS:SENTENCES:TOKENS
+	Indices count from 0. With no (usable) selector the whole corpus is
+	written. A later selector for the same top-level index replaces an
+	earlier one; they are not merged.
+
+	Exits with status 1 when the corpus argument is missing or a selector
+	has the wrong number of ':'-separated parts.
+	"""
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='xces',
+		help='set the input format; default: xces')
+	parser.add_option('-o', '--output-format', type='string', action='store',
+		dest='output_format', default='xces',
+		help='set the output format; default: xces')
+	parser.add_option('-t', '--tagset', type='string', action='store',
+		dest='tagset', default='kipi',
+		help='set the tagset used in input; default: kipi')
+	parser.add_option('-C', '--chunks', action='store_true',
+		dest='chunks', default=False,
+		help='Process chunks (select chunks/sentences, not tokens)')
+	parser.add_option('-v', '--verbose', action='store_true',
+		dest='verbose', default=False,
+		help='verbose mode')
+	(options, args) = parser.parse_args()
+
+	if len(args) < 1:
+		print 'You need to provide an input corpus.'
+		print 'See %s --help' % sys.argv[0]
+		sys.exit(1)
+
+	inpath = args[0]
+	# load a tagset, create a reader
+	tagset = corpus2.get_named_tagset(options.tagset)
+	reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
+	writer = corpus2.TokenWriter.create_stdout_writer(options.output_format, tagset)
+
+	# selection maps an index to what should be kept below it; an empty
+	# value means "everything". Indices produced by one selector share a
+	# single value object, which is only ever read afterwards.
+	selection = {}
+	for arg in args[1:]:
+		parts = arg.split(':')
+		if len(parts) == 1:
+			selection.update(dict.fromkeys(parse_range_info(arg), ()))
+		elif len(parts) == 2 and options.chunks:
+			# CHUNKS:SENTENCES -- whole sentences inside the chosen chunks
+			chunk_ids = parse_range_info(parts[0])
+			whole_sents = dict.fromkeys(parse_range_info(parts[1]), ())
+			selection.update(dict.fromkeys(chunk_ids, whole_sents))
+		elif len(parts) == 3 and options.chunks:
+			# CHUNKS:SENTENCES:TOKENS
+			chunk_ids = parse_range_info(parts[0])
+			sent_ids = parse_range_info(parts[1])
+			tok_ids = parse_range_info(parts[2])
+			selection.update(dict.fromkeys(chunk_ids, dict.fromkeys(sent_ids, tok_ids)))
+		elif len(parts) == 2:
+			# SENTENCES:TOKENS
+			sent_ids = parse_range_info(parts[0])
+			selection.update(dict.fromkeys(sent_ids, parse_range_info(parts[1])))
+		else:
+			print >> sys.stderr, "Invalid argument:", arg
+			# a malformed selector is an error, so do not report success
+			sys.exit(1)
+
+	if not selection:
+		# nothing selected: copy the whole corpus
+		if options.chunks:
+			for chunk in chunks(reader):
+				writer.write_chunk(chunk)
+		else:
+			for sent in sentences(reader):
+				writer.write_sentence(sent)
+	elif options.chunks:
+		for cid, chunk in enumerate(chunks(reader)):
+			if cid not in selection:
+				continue
+			if len(selection[cid]) == 0:
+				writer.write_chunk(chunk)
+			else:
+				write_selected_sentences(chunk.sentences(), writer, selection[cid])
+	else:
+		write_selected_sentences(sentences(reader), writer, selection)
+
+if __name__ == '__main__':
+	go()
--- a/corpus2tools/corpus-get.cpp
+++ b/corpus2tools/corpus-get.cpp
-#include <libcorpus2/tagsetmanager.h>
-#include <libcorpus2/util/ioformat-options.h>
-#include <boost/program_options.hpp>
-#include <boost/algorithm/string.hpp>
-
-// Entry point of the C++ corpus-get tool: reads sentences from FILENAME up to
-// index SENTENCE and writes that sentence, or a single token of it, through
-// the configured writer. Returns 2 on a command-line parse error and 1 after
-// printing the option summary; otherwise control falls off the end of main.
-int main(int argc, char** argv)
-{
-	std::string tagset_name, filename;
-	// input_format/output_format are not referenced again in this function.
-	std::string input_format, output_format;
-	// NOTE(review): only `token` has an initializer here; `sentence` stays
-	// uninitialized unless the option is supplied, yet it bounds the read
-	// loop below.
-	int sentence, token = -1;
-	size_t stats = 0;
-	using boost::program_options::value;
-	boost::program_options::options_description desc("Allowed options");
-	desc.add_options()
-			("filename,F", value(&filename),
-			 "filename")
-			("sentence,S", value(&sentence),
-			 "Sentence idx")
-			("stats,s", value(&stats),
-			 "Stats")
-			("token,T", value(&token),
-			 "Token idx ")
-			("tagset,t", value(&tagset_name)->default_value("kipi"),
-			 "Tagset name")
-			;
-	Corpus2::add_input_options(desc);
-	Corpus2::add_output_options(desc);
-	boost::program_options::variables_map vm;
-	// Positional arguments map, in order, to filename, sentence and token.
-	boost::program_options::positional_options_description p;
-	p.add("filename", 1);
-	p.add("sentence", 1);
-	p.add("token", 1);
-
-	try {
-		boost::program_options::store(
-			boost::program_options::command_line_parser(argc, argv)
-			.options(desc).positional(p).run(), vm);
-	} catch (boost::program_options::error& e) {
-		std::cerr << e.what() << "\n";
-		return 2;
-	}
-	boost::program_options::notify(vm);
-	// NOTE(review): no "help" option is added in the add_options() call above;
-	// unless add_input_options/add_output_options registers one, this branch
-	// cannot trigger -- confirm.
-	if (vm.count("help")) {
-		std::cout << desc << "\n";
-		return 1;
-	}
-	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
-	boost::shared_ptr<Corpus2::TokenReader> reader;
-	reader = Corpus2::create_reader(vm, tagset, filename);
-	Corpus2::Sentence::Ptr s;
-	boost::shared_ptr<Corpus2::TokenWriter> writer;
-	writer = Corpus2::create_writer(vm, tagset);
-	// lens counts how many sentences of each size() were read.
-	std::map<int,int> lens;
-	// Read sentences 0..sentence inclusive; after the loop `s` holds the last
-	// one read (null if the reader ran out). Every sentence whose size()
-	// exceeds `stats` has its index printed to stderr and is written out.
-	// NOTE(review): with the default stats == 0 that is every sentence with
-	// size() > 0, and the last one is then written again by the block below.
-	for (int i = 0; i <= sentence; ++i) {
-		s = reader->get_next_sentence();
-		if (s) {
-			lens[s->size()]++;
-			if (s->size() > stats) {
-				std::cerr << i << "\n";
-				writer->write_sentence(*s);
-			}
-		}
-	}
-	// Emit the requested sentence, or just one token when a token index was
-	// given and is within the sentence; an out-of-range token writes nothing.
-	if (s) {
-		if (token == -1) {
-			writer->write_sentence(*s);
-		} else if (static_cast<size_t>(token) < s->size()) {
-			writer->write_token(*(*s)[token]);
-		}
-	}
-	// With a non-zero stats value, dump "size count" pairs to stderr.
-	if (stats) {
-		typedef std::pair<int,int> pp;
-		// `foreach` is not declared in the visible lines -- presumably a
-		// BOOST_FOREACH-style macro from a project header; confirm.
-		foreach (const pp& p, lens) {
-			std::cerr << p.first << " " << p.second << "\n";
-		}
-	}
-}