From 0c1e9467948965d4032234457e8c96984e9d0187 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 28 Apr 2011 16:41:17 +0200 Subject: [PATCH] add --input-format/-i to wcclrun --- wccl-apps/wccl-run.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/wccl-apps/wccl-run.cpp b/wccl-apps/wccl-run.cpp index 9dfd594..daec529 100644 --- a/wccl-apps/wccl-run.cpp +++ b/wccl-apps/wccl-run.cpp @@ -65,7 +65,7 @@ public: } void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence); - void do_stream(std::istream& is, bool first); + void run(boost::shared_ptr<Corpus2::TokenReader> , bool first); void output_tabular(const std::vector< std::vector< UnicodeString > > outputs); @@ -225,12 +225,11 @@ void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outpu } } -void Runner::do_stream(std::istream& is, bool first) +void Runner::run(boost::shared_ptr<Corpus2::TokenReader> reader, bool first) { - Corpus2::XcesReader xr(tagset_, is); Corpus2::Sentence::Ptr s; Corpus2::TokenTimer& timer = Corpus2::global_timer(); - while ((s = xr.get_next_sentence())) { + while ((s = reader->get_next_sentence())) { do_sentence(s); timer.count_sentence(*s); if (progress_) { @@ -245,9 +244,12 @@ int main(int argc, char** argv) { std::string tagset_load = "kipi"; bool first = false, progress = false; + std::string input_format; std::vector<std::string> corpora_files, files, operator_strings; bool corpus_stdin = false; using boost::program_options::value; + std::string readers = boost::algorithm::join(Corpus2::TokenReader::available_reader_types_help(), " "); + std::string readers_help = "Input format, any of: " + readers + "\n"; boost::program_options::options_description desc("Allowed options"); desc.add_options() @@ -263,6 +265,8 @@ int main(int argc, char** argv) "Read corpus from stdin") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages") + ("input-format,i", value(&input_format)->default_value("xces"), + readers_help.c_str()) ("first-sentence-only,1", value(&first)->zero_tokens(), "Only process first sentence") ("tabs", value(&tabs)->zero_tokens(), @@ -336,15 +340,12 @@ int main(int argc, char** argv) } if (!runner.operators().empty()) { foreach (const std::string& f, corpora_files) { - std::ifstream ifs(f.c_str()); - if (ifs.good()) { - runner.do_stream(ifs, first); - } else { - std::cerr << "Error reading corpus from " << f << "\n"; - } + runner.run(Corpus2::TokenReader::create_path_reader( + input_format, tagset, f), first); } if (corpus_stdin) { - runner.do_stream(std::cin, first); + runner.run(Corpus2::TokenReader::create_stream_reader( + input_format, tagset, std::cin), first); } if (progress) { Corpus2::TokenTimer& timer = Corpus2::global_timer(); -- GitLab