From 43ff7c3f286decbadd53f4e7d420f88ca0918077 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Mon, 16 May 2011 11:13:29 +0200
Subject: [PATCH] add util/ioformat-options, change tools dir, add corpus-get

---
 CMakeLists.txt                                |  2 +-
 {tagset-tool => corpus2tools}/CMakeLists.txt  |  8 +-
 corpus2tools/corpus-get.cpp                   | 77 +++++++++++++++++++
 .../main.cpp => corpus2tools/tagset-tool.cpp  |  0
 libcorpus2/CMakeLists.txt                     |  1 +
 libcorpus2/util/ioformat-options.cpp          | 66 ++++++++++++++++
 libcorpus2/util/ioformat-options.h            | 34 ++++++++
 7 files changed, 184 insertions(+), 4 deletions(-)
 rename {tagset-tool => corpus2tools}/CMakeLists.txt (68%)
 create mode 100644 corpus2tools/corpus-get.cpp
 rename tagset-tool/main.cpp => corpus2tools/tagset-tool.cpp (100%)
 create mode 100644 libcorpus2/util/ioformat-options.cpp
 create mode 100644 libcorpus2/util/ioformat-options.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55f0fe3..86c2925 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR)
 
 add_subdirectory(libpwrutils)
 add_subdirectory(libcorpus2)
-add_subdirectory(tagset-tool)
+add_subdirectory(corpus2tools)
 add_subdirectory(tests)
 add_subdirectory(swig)
 
diff --git a/tagset-tool/CMakeLists.txt b/corpus2tools/CMakeLists.txt
similarity index 68%
rename from tagset-tool/CMakeLists.txt
rename to corpus2tools/CMakeLists.txt
index c527eda..3e7b198 100644
--- a/tagset-tool/CMakeLists.txt
+++ b/corpus2tools/CMakeLists.txt
@@ -9,16 +9,18 @@ endif (Libedit_FOUND)
 
 include_directories( ${CMAKE_SOURCE_DIR} )
 
-add_executable( tagset-tool main.cpp )
-
+add_executable( tagset-tool tagset-tool.cpp )
 target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
 
+add_executable( corpus-get corpus-get.cpp )
+target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
+
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})
 
 if(UNIX)
 
-	install(TARGETS tagset-tool
+	install(TARGETS tagset-tool corpus-get
 		RUNTIME DESTINATION bin)
 
 endif(UNIX)
diff --git a/corpus2tools/corpus-get.cpp b/corpus2tools/corpus-get.cpp
new file mode 100644
index 0000000..9ea144f
--- /dev/null
+++ b/corpus2tools/corpus-get.cpp
@@ -0,0 +1,77 @@
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+
+/** corpus-get: write one sentence of a corpus, or a single token of it.
+ * Positional arguments, in order: filename, sentence index, token index.
+ * Returns 2 on a command-line error and 1 after printing help. */
+int main(int argc, char** argv)
+{
+	namespace po = boost::program_options;
+	std::string tagset_name, filename;
+	// sentence defaults to 0: the read loop uses it even when not given
+	int sentence = 0, token = -1;
+	size_t stats = 0;
+	po::options_description desc("Allowed options");
+	desc.add_options()
+			("filename,F", po::value(&filename), "filename")
+			("sentence,S", po::value(&sentence), "Sentence idx")
+			("stats,s", po::value(&stats), "Stats")
+			("token,T", po::value(&token), "Token idx ")
+			("tagset,t", po::value(&tagset_name)->default_value("kipi"),
+			 "Tagset name")
+			("help,h", "Show help")
+			;
+	Corpus2::add_input_options(desc);
+	Corpus2::add_output_options(desc);
+	po::variables_map vm;
+	po::positional_options_description p;
+	p.add("filename", 1);
+	p.add("sentence", 1);
+	p.add("token", 1);
+
+	try {
+		po::store(po::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+		// notify() can throw as well, so it is kept inside the try block
+		po::notify(vm);
+	} catch (const po::error& e) {
+		std::cerr << e.what() << "\n";
+		return 2;
+	}
+	if (vm.count("help")) {
+		std::cout << desc << "\n";
+		return 1;
+	}
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
+	boost::shared_ptr<Corpus2::TokenReader> reader =
+		Corpus2::create_reader(vm, tagset, filename);
+	boost::shared_ptr<Corpus2::TokenWriter> writer =
+		Corpus2::create_writer(vm, tagset);
+	Corpus2::Sentence::Ptr s;
+	std::map<int,int> lens; // sentence length -> how many sentences had it
+	for (int i = 0; i <= sentence; ++i) {
+		s = reader->get_next_sentence();
+		if (!s) break; // input ended before the requested sentence
+		lens[s->size()]++;
+		// stats == 0 means "off"; otherwise dump sentences longer than it
+		if (stats > 0 && s->size() > stats) {
+			std::cerr << i << "\n";
+			writer->write_sentence(*s);
+		}
+	}
+	if (s) {
+		if (token == -1) {
+			writer->write_sentence(*s);
+		} else if (static_cast<size_t>(token) < s->size()) {
+			writer->write_token(*(*s)[token]);
+		}
+	}
+	if (stats) {
+		typedef std::pair<int,int> pp;
+		foreach (const pp& p, lens) {
+			std::cerr << p.first << " " << p.second << "\n";
+		}
+	}
+}
diff --git a/tagset-tool/main.cpp b/corpus2tools/tagset-tool.cpp
similarity index 100%
rename from tagset-tool/main.cpp
rename to corpus2tools/tagset-tool.cpp
diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index b225155..2af29e1 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -75,6 +75,7 @@ SET(libcorpus2_STAT_SRC
 	io/xceswriter.cpp
 	io/xmlreader.cpp
 	io/xmlwriter.cpp
+	util/ioformat-options.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp
diff --git a/libcorpus2/util/ioformat-options.cpp b/libcorpus2/util/ioformat-options.cpp
new file mode 100644
index 0000000..0ed6ccf
--- /dev/null
+++ b/libcorpus2/util/ioformat-options.cpp
@@ -0,0 +1,66 @@
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/algorithm/string/join.hpp>
+
+namespace Corpus2 {
+
+/// Register "input-format,i"; its help lists available_reader_types_help().
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format /*= "xces"*/)
+{
+	namespace po = boost::program_options;
+	const std::string reader_names = boost::algorithm::join(
+		Corpus2::TokenReader::available_reader_types_help(), " ");
+	const std::string help_text = "Input format, any of: " + reader_names + "\n";
+	desc.add_options()("input-format,i",
+		po::value<std::string>()->default_value(default_format),
+		help_text.c_str());
+}
+
+/// Register "output-format,o"; its help lists available_writer_types_help().
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format /*= "xces"*/)
+{
+	namespace po = boost::program_options;
+	const std::string writer_names = boost::algorithm::join(
+		Corpus2::TokenWriter::available_writer_types_help(), " ");
+	const std::string help_text = "Output format, any of: " + writer_names + "\n";
+	desc.add_options()("output-format,o",
+		po::value<std::string>()->default_value(default_format),
+		help_text.c_str());
+}
+
+/// Build a reader of the format in vm["input-format"]: a stream reader on
+/// std::cin when filename is empty or "-", else a path reader on filename.
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename /*= "-"*/)
+{
+	std::string fmt = vm["input-format"].as<std::string>();
+	const bool use_stdin = filename.empty() || filename == "-";
+	if (!use_stdin) {
+		return Corpus2::TokenReader::create_path_reader(
+			fmt, tagset, filename);
+	}
+	return Corpus2::TokenReader::create_stream_reader(fmt, tagset, std::cin);
+}
+
+/// Build a writer of the format in vm["output-format"]: a stream writer on
+/// std::cout when filename is empty or "-", else a path writer on filename.
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename /*= "-"*/)
+{
+	std::string fmt = vm["output-format"].as<std::string>();
+	const bool use_stdout = filename.empty() || filename == "-";
+	if (!use_stdout) {
+		return Corpus2::TokenWriter::create_path_writer(
+			fmt, filename, tagset);
+	}
+	return Corpus2::TokenWriter::create_stream_writer(fmt, std::cout, tagset);
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/util/ioformat-options.h b/libcorpus2/util/ioformat-options.h
new file mode 100644
index 0000000..cab0054
--- /dev/null
+++ b/libcorpus2/util/ioformat-options.h
@@ -0,0 +1,34 @@
+#ifndef LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+#define LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+/// Add the "input-format,i" option to desc, defaulting to default_format.
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces");
+
+/// Add the "output-format,o" option to desc, defaulting to default_format.
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces");
+
+/// Create a reader of the format given by vm["input-format"]; an empty
+/// filename or "-" selects standard input instead of a path.
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset, const std::string& filename = "-");
+
+/// Create a writer of the format given by vm["output-format"]; an empty
+/// filename or "-" selects standard output instead of a path.
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset, const std::string& filename = "-");
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
-- 
GitLab