From 70a3893cb079c833bf81dfd8514f693fa497e667 Mon Sep 17 00:00:00 2001
From: unknown <bOBaN@bOBaN-desktop.(none)>
Date: Tue, 25 Sep 2012 19:20:58 +0200
Subject: [PATCH] corpus2get c++ utility

---
 corpus2tools/CMakeLists.txt |   4 +-
 corpus2tools/corpus2get.cpp | 105 ++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 corpus2tools/corpus2get.cpp

diff --git a/corpus2tools/CMakeLists.txt b/corpus2tools/CMakeLists.txt
index cdfc097..1be6155 100644
--- a/corpus2tools/CMakeLists.txt
+++ b/corpus2tools/CMakeLists.txt
@@ -18,10 +18,12 @@ link_directories(${Boost_LIBRARY_DIRS})
 link_directories(${ICU_LIBRARY_DIRS})
 
 add_executable( tagset-tool tagset-tool.cpp )
+add_executable( corpus2get corpus2get.cpp )
 target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
+target_link_libraries ( corpus2get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
 if(UNIX)
 
-	install(TARGETS tagset-tool
+	install(TARGETS tagset-tool corpus2get
 		RUNTIME DESTINATION bin)
 	install(FILES corpus-get corpus-merge
 		DESTINATION bin
diff --git a/corpus2tools/corpus2get.cpp b/corpus2tools/corpus2get.cpp
new file mode 100644
index 0000000..53c5018
--- /dev/null
+++ b/corpus2tools/corpus2get.cpp
@@ -0,0 +1,105 @@
+/*
+    Copyright (C) 2010 Tomasz Åšniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. 
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/util/settings.h>
+#include <libcorpus2/tagsetparser.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/token.h>
+#include <libcorpus2/sentence.h>
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+
+#include <libpwrutils/foreach.h>
+
+#include <boost/program_options.hpp>
+#include <boost/bind.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <algorithm>
+#include <fstream>
+#include <iterator>
+
+#ifdef HAVE_LIBEDIT
+#include <histedit.h>
+#endif
+
+#ifdef HAVE_CONFIG_D_H
+#include <libcorpus2/config_d.h>
+#endif
+
+// not checking for HAVE_VERSION, there is no reason it shouldn't be there
+#include <libcorpus2/version.h>
+
+int main(int argc, char** argv)
+{
+	std::string tagset_load, file;
+	bool quiet = false, internals = false;
+	bool parse = false, validate = false, sort = false;
+	using boost::program_options::value;
+
+	boost::program_options::options_description desc("Allowed options");
+	desc.add_options()
+			("tagset,t", value(&tagset_load)->default_value("kipi"),
+			 "tagset")
+			("file", value(&file),
+			 "file to load")
+			("help,h", "Show help")
+			("version", "print version string")
+			;
+	boost::program_options::variables_map vm;
+	boost::program_options::positional_options_description p;
+	p.add("file", -1);
+
+	try {
+		boost::program_options::store(
+			boost::program_options::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+	} catch (boost::program_options::error& e) {
+		std::cerr << e.what() << "\n";
+		return 2;
+	}
+	boost::program_options::notify(vm);
+
+	if (vm.count("help")) {
+		std::cout << desc << "\n";
+		std::cout << "Available tagsets: ";
+		std::cout << Corpus2::available_tagsets() << "\n";
+		std::cout << "Available readers: ";
+		std::cout << boost::algorithm::join(Corpus2::TokenReader::available_reader_types(), " ") << "\n";
+		std::cout << "Available writers: ";
+		std::cout << boost::algorithm::join(Corpus2::TokenWriter::available_writer_types(), " ") << "\n";
+		return 1;
+	}
+	if (vm.count("version")) {
+		std::cout << "corpus-read (libcorpus2) " << LIBCORPUS2_VERSION << "\n";
+		return 0;
+	}
+
+	try {
+		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
+		const Corpus2::TokenReader::TokenReaderPtr reader = Corpus2::TokenReader::create_path_reader(
+			"xces", tagset, file);
+		const Corpus2::TokenWriter::TokenWriterPtr writer = Corpus2::TokenWriter::create_stream_writer(
+			"xces", std::cout, tagset);
+		while (Corpus2::Sentence::Ptr sentence = reader->get_next_sentence()) {
+			writer->write_sentence(*sentence);
+		}
+	} catch (PwrNlp::PwrNlpError& e) {
+		std::cerr << "Error: " << e.info() << "\n";
+	}
+	return 0;
+}
-- 
GitLab