From a0aee2cc43fc7052381342b9ce4a22817f95bbb3 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Mon, 20 Jun 2011 14:50:02 +0200
Subject: [PATCH] add missing reader for the 'plain' format

---
 libcorpus2/CMakeLists.txt     |  1 +
 libcorpus2/io/plainreader.cpp | 93 +++++++++++++++++++++++++++++++++++
 libcorpus2/io/plainreader.h   | 36 ++++++++++++++
 3 files changed, 130 insertions(+)
 create mode 100644 libcorpus2/io/plainreader.cpp
 create mode 100644 libcorpus2/io/plainreader.h

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index bb21452..d00090a 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -63,6 +63,7 @@ SET(libcorpus2_STAT_SRC
 	io/nonewriter.cpp
 	io/orthwriter.cpp
 	io/pathwriter.cpp
+	io/plainreader.cpp
 	io/plainwriter.cpp
 	io/premorphwriter.cpp
 	io/reader.cpp
diff --git a/libcorpus2/io/plainreader.cpp b/libcorpus2/io/plainreader.cpp
new file mode 100644
index 0000000..2d322cf
--- /dev/null
+++ b/libcorpus2/io/plainreader.cpp
@@ -0,0 +1,93 @@
+#include <libcorpus2/io/plainreader.h>
+#include <libpwrutils/foreach.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/make_shared.hpp>
+#include <fstream>
+
+namespace Corpus2 {
+/// Registers PlainReader with TokenReader. NOTE(review): "plain" is presumably the format id and "ign,loose,strict" the advertised options — confirm against register_reader.
+bool PlainReader::registered = TokenReader::register_reader<PlainReader>("plain",
+	"ign,loose,strict");
+
+/// Reads from the given stream; is_owned_ is left empty, so the reader does not own the stream.
+PlainReader::PlainReader(const Tagset& tagset, std::istream& is)
+	: BufferedSentenceReader(tagset), is_(&is)
+{
+}
+
+/// Opens `filename` for reading and keeps ownership of the resulting stream.
+/// Throws Corpus2Error when the freshly opened stream is not good().
+PlainReader::PlainReader(const Tagset& tagset, const std::string& filename)
+	: BufferedSentenceReader(tagset), is_()
+{
+	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+	if (!is_owned_->good()) {
+		throw Corpus2Error("File not found!");
+	}
+	is_ = is_owned_.get(); // only reached when the open succeeded
+}
+
+/// BufferedSentenceReader override: builds one sentence from the lines up to
+/// the next blank line or end of input; returns an unset pointer if no token
+/// line was read. Lines are tab-separated: "orth[\twhitespace]" starts a token,
+/// "\tlemma\ttag[\tdisamb]" adds a lexeme to the latest token. Throws
+/// Corpus2Error on malformed lexeme lines; line numbers are relative to this call.
+Sentence::Ptr PlainReader::actual_next_sentence()
+{
+	std::string line;
+	Sentence::Ptr s;
+	size_t line_no = 0;
+	while (is().good()) {
+		std::getline(is(), line);
+		++line_no;
+		if (line.empty()) {
+			return s;
+		}
+		std::vector<std::string> fields;
+		boost::algorithm::split(fields, line, boost::is_any_of("\t"));
+		assert(!fields.empty());
+		if (fields[0].empty()) { //lexeme
+			if (!s || s->empty()) { // s stays unset until the first token line, so test it before use
+				throw Corpus2Error("PlainReader lexemes without a token at "
+						+ boost::lexical_cast<std::string>(line_no));
+			}
+			if (fields.size() < 3) {
+				throw Corpus2Error("PlainReader not enough fields at "
+						+ boost::lexical_cast<std::string>(line_no));
+			}
+			Tag tag = parse_tag(fields[2]);
+			Token* last_token = s->tokens().back();
+			last_token->add_lexeme(Lexeme(UnicodeString::fromUTF8(fields[1]), tag));
+			if (fields.size() > 3 && fields[3] == "disamb") {
+				last_token->lexemes().back().set_disamb(true);
+			}
+		} else { // orth-ws: first token of a sentence defaults to Newline, later ones to Space
+			PwrNlp::Whitespace::Enum wa = !s ? PwrNlp::Whitespace::Newline : PwrNlp::Whitespace::Space;
+			if (fields.size() > 1) {
+				wa = PwrNlp::Whitespace::from_string(fields[1]);
+			}
+			if (!s) {
+				s = make_sentence();
+			}
+			Token* t = new Token(); // allocated after the parsing above so a throw there cannot leak it
+			t->set_orth(UnicodeString::fromUTF8(fields[0]));
+			t->set_wa(wa);
+			s->append(t);
+		}
+	}
+	return s;
+}
+
+void PlainReader::set_option(const std::string &option)
+{
+	BufferedSentenceReader::set_option(option); // no handling of its own; everything goes to the base class
+}
+
+std::string PlainReader::get_option(const std::string &option) const
+{
+	return BufferedSentenceReader::get_option(option); // no handling of its own; returns the base class answer
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/plainreader.h b/libcorpus2/io/plainreader.h
new file mode 100644
index 0000000..c17cb9e
--- /dev/null
+++ b/libcorpus2/io/plainreader.h
@@ -0,0 +1,36 @@
+#ifndef LIBCORPUS2_IO_PLAINREADER_H
+#define LIBCORPUS2_IO_PLAINREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <boost/scoped_ptr.hpp>
+
+namespace Corpus2 {
+/// Sentence reader for the tab-separated "plain" format, built on BufferedSentenceReader.
+class PlainReader : public BufferedSentenceReader
+{
+public:
+	PlainReader(const Tagset& tagset, std::istream& is); ///< reads from `is`; the reader does not own it
+	/// Opens `filename` via a std::ifstream owned by the reader; throws Corpus2Error if the stream is not good().
+	PlainReader(const Tagset& tagset, const std::string& filename);
+	/// The stream being read; dereferences is_, which both constructors set on success.
+	std::istream& is() {
+		return *is_;
+	}
+	/// Forwards to BufferedSentenceReader::set_option.
+	void set_option(const std::string& option);
+	/// Forwards to BufferedSentenceReader::get_option.
+	std::string get_option(const std::string& option) const;
+	/// Holds the result of registering this reader with TokenReader (see plainreader.cpp).
+	static bool registered;
+
+protected:
+	/// BufferedSentenceReader override
+	Sentence::Ptr actual_next_sentence();
+	/// Stream to read from; points at the stream held by is_owned_ when the reader opened the file itself.
+	std::istream* is_;
+	boost::scoped_ptr<std::istream> is_owned_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_PLAINREADER_H
-- 
GitLab