From 0c1e9467948965d4032234457e8c96984e9d0187 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Thu, 28 Apr 2011 16:41:17 +0200
Subject: [PATCH] add --input-format/-i to wccl-run

---
 wccl-apps/wccl-run.cpp | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/wccl-apps/wccl-run.cpp b/wccl-apps/wccl-run.cpp
index 9dfd594..daec529 100644
--- a/wccl-apps/wccl-run.cpp
+++ b/wccl-apps/wccl-run.cpp
@@ -65,7 +65,7 @@ public:
 	}
 
 	void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
-	void do_stream(std::istream& is, bool first);
+	void run(boost::shared_ptr<Corpus2::TokenReader> , bool first);
 
 	void output_tabular(const std::vector< std::vector< UnicodeString > > outputs);
 
@@ -225,12 +225,11 @@ void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outpu
 	}
 }
 
-void Runner::do_stream(std::istream& is, bool first)
+void Runner::run(boost::shared_ptr<Corpus2::TokenReader> reader, bool first)
 {
-	Corpus2::XcesReader xr(tagset_, is);
 	Corpus2::Sentence::Ptr s;
 	Corpus2::TokenTimer& timer = Corpus2::global_timer();
-	while ((s = xr.get_next_sentence())) {
+	while ((s = reader->get_next_sentence())) {
 		do_sentence(s);
 		timer.count_sentence(*s);
 		if (progress_) {
@@ -245,9 +244,12 @@ int main(int argc, char** argv)
 {
 	std::string tagset_load = "kipi";
 	bool first = false, progress = false;
+	std::string input_format;
 	std::vector<std::string> corpora_files, files, operator_strings;
 	bool corpus_stdin = false;
 	using boost::program_options::value;
+	std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " ");
+	std::string readers_help = "Input format, any of: " + readers + "\n";
 
 	boost::program_options::options_description desc("Allowed options");
 	desc.add_options()
@@ -263,6 +265,8 @@ int main(int argc, char** argv)
 			 "Read corpus from stdin")
 			("quiet,q", value(&quiet)->zero_tokens(),
 			 "Suppress messages")
+			("input-format,i", value(&input_format)->default_value("xces"),
+			 readers_help.c_str())
 			("first-sentence-only,1", value(&first)->zero_tokens(),
 			 "Only process first sentence")
 			("tabs", value(&tabs)->zero_tokens(),
@@ -336,15 +340,12 @@ int main(int argc, char** argv)
 		}
 		if (!runner.operators().empty()) {
 			foreach (const std::string& f, corpora_files) {
-				std::ifstream ifs(f.c_str());
-				if (ifs.good()) {
-					runner.do_stream(ifs, first);
-				} else {
-					std::cerr << "Error reading corpus from " << f << "\n";
-				}
+				runner.run(Corpus2::TokenReader::create_path_reader(
+						input_format, tagset, f), first);
 			}
 			if (corpus_stdin) {
-				runner.do_stream(std::cin, first);
+				runner.run(Corpus2::TokenReader::create_stream_reader(
+						input_format, tagset, std::cin), first);
 			}
 			if (progress) {
 				Corpus2::TokenTimer& timer = Corpus2::global_timer();
-- 
GitLab