From 223523fa57180e14060d62a4c4c076656db1e56b Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Mon, 23 May 2011 17:18:29 +0200
Subject: [PATCH] WIP iob-chan

---
 libcorpus2/CMakeLists.txt  |   1 +
 libcorpus2/ann/channel.cpp |   2 +-
 libcorpus2/ann/channel.h   |   2 +-
 libcorpus2/io/iob-chan.cpp | 180 +++++++++++++++++++++++++++++++++++++
 libcorpus2/io/iob-chan.h   |  86 ++++++++++++++++++
 libcorpus2/io/rft.cpp      |   1 -
 libcorpus2/token.cpp       |  11 +--
 7 files changed, 275 insertions(+), 8 deletions(-)
 create mode 100644 libcorpus2/io/iob-chan.cpp
 create mode 100644 libcorpus2/io/iob-chan.h

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index b225155..9b8fbd4 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -58,6 +58,7 @@ SET(libcorpus2_STAT_SRC
 	io/cclreader.cpp
 	io/cclwriter.cpp
 	io/fastxces.cpp
+	io/iob-chan.cpp
 	io/nonewriter.cpp
 	io/orthwriter.cpp
 	io/pathwriter.cpp
diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp
index 0138795..34e7549 100644
--- a/libcorpus2/ann/channel.cpp
+++ b/libcorpus2/ann/channel.cpp
@@ -107,7 +107,7 @@ void AnnotationChannel::set_segment_at(int token_idx, int segment_idx)
 	}
 }
 
-IOB::Enum AnnotationChannel::get_iob_at(int idx)
+IOB::Enum AnnotationChannel::get_iob_at(int idx) const
 {
 	if (idx >= 0 && idx < static_cast<int>(iobs_.size())) {
 		return iobs_[idx];
diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h
index d4b02bc..204ee2c 100644
--- a/libcorpus2/ann/channel.h
+++ b/libcorpus2/ann/channel.h
@@ -130,7 +130,7 @@ public:
 	/**
 	 * IOB getter, returns IOB::O if idx is out of range.
 	 */
-	IOB::Enum get_iob_at(int idx);
+	IOB::Enum get_iob_at(int idx) const;
 
 	/**
 	 * IOB setter, out of range indices are not processed.
diff --git a/libcorpus2/io/iob-chan.cpp b/libcorpus2/io/iob-chan.cpp
new file mode 100644
index 0000000..2962d83
--- /dev/null
+++ b/libcorpus2/io/iob-chan.cpp
@@ -0,0 +1,180 @@
+/*
+	Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+	Part of the libcorpus2 project
+
+	This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+	This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+	See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/iob-chan.h>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <fstream>
+#include <boost/algorithm/string/split.hpp>
+
+namespace Corpus2 {
+
+bool IobChanWriter::registered = TokenWriter::register_writer<IobChanWriter>(
+		"iob-chan", "nowarn"); // registered under the name "iob-chan"; "nowarn" is presumably the accepted-params list — confirm
+
+// Builds the writer; a "nowarn" entry in params turns the no-lexeme warning off.
+IobChanWriter::IobChanWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params), warn_on_no_lexemes_(true)
+{
+	foreach (const string_range& range, params) {
+		if (boost::copy_range<std::string>(range) == "nowarn") {
+			warn_on_no_lexemes_ = false;
+		}
+	}
+}
+
+// Emits one line: the orth, then (when lexemes exist) a tab and the preferred tag.
+// Tokens without lexemes get the orth only, plus an optional stderr warning.
+void IobChanWriter::write_token(const Token& t)
+{
+	os() << t.orth_utf8();
+	if (!t.lexemes().empty()) {
+		const Lexeme& preferred = t.get_preferred_lexeme(tagset());
+		os() << "\t";
+		write_tag(preferred.tag());
+	} else if (warn_on_no_lexemes_) {
+		std::cerr << "No lexemes for token!";
+	}
+	os() << "\n";
+}
+
+// One line per token: orth, tab, preferred tag; annotated sentences add a tab and "chan-IOB,..." pairs.
+void IobChanWriter::write_sentence(const Sentence& s)
+{
+	const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
+	for (size_t idx = 0; idx < s.size(); ++idx) {
+		const Token* t = s.tokens()[idx];
+		os() << t->orth_utf8();
+		if (!t->lexemes().empty()) {
+			const Lexeme& pref = t->get_preferred_lexeme(tagset());
+			os() << "\t";
+			write_tag(pref.tag());
+		} else if (warn_on_no_lexemes_) {
+			std::cerr << "No lexemes for token!";
+		}
+		if (ann) {
+			// Separate the channel list from what precedes it; previously tag and channel name ran together.
+			os() << "\t";
+			bool first = true;
+			foreach (const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) {
+				if (!first) {
+					os() << ",";
+				}
+				os() << v.first << "-" << Corpus2::IOB::to_string(v.second.get_iob_at(idx));
+				first = false;
+			}
+		}
+		os() << "\n";
+	}
+	os() << "\n";
+}
+
+void IobChanWriter::write_chunk(const Chunk& c) // writes each sentence of c via write_sentence
+{
+	foreach (const Sentence::ConstPtr& s, c.sentences()) {
+		write_sentence(*s);
+	}
+}
+
+void IobChanWriter::write_tag(const Tag& tag) // writes the tagset's string form of tag, nothing else
+{
+	os() << tagset().tag_to_string(tag);
+}
+
+
+bool IobChanReader::registered = TokenReader::register_reader<IobChanReader>("iob-chan",
+	"ign,loose,strict,no_set_disamb"); // presumably the supported option names — confirm against register_reader
+
+
+IobChanReader::IobChanReader(const Tagset& tagset, std::istream& is)
+	: BufferedSentenceReader(tagset), is_(&is), disamb_(true) // uses the caller's stream; is_owned_ is left empty
+{
+}
+
+// Opens filename for reading and keeps ownership of the resulting stream.
+// Throws Corpus2Error when the stream is not in a good state after opening.
+IobChanReader::IobChanReader(const Tagset& tagset, const std::string& filename)
+	: BufferedSentenceReader(tagset), is_(), disamb_(true)
+{
+	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+	if (!is_owned_->good()) {
+		throw Corpus2Error("File not found!");
+	}
+	is_ = is_owned_.get();
+}
+
+// Reads tab-separated token lines up to the next empty line or end of input; returns null if no token was read.
+Sentence::Ptr IobChanReader::actual_next_sentence()
+{
+	std::string line;
+	AnnotatedSentence::Ptr s;
+	typedef boost::split_iterator<std::string::const_iterator> string_split_iterator;
+	while (is().good()) {
+		std::getline(is(), line);
+		if (line.empty()) {
+			return s;
+		}
+		std::vector<std::string> spl;
+		boost::algorithm::split(spl, line, boost::is_any_of("\t"));
+		if (spl.size() != 4) {
+			std::cerr << "Invalid line: " << line << "\n";
+		} else {
+			const std::string& orth = spl[0];
+			const std::string& lemma = spl[0]; // NOTE(review): lemma reuses the orth column — confirm the intended field
+			const std::string& tag_string = spl[1];
+			Tag tag = parse_tag(tag_string);
+			Token* t = new Token();
+			t->set_orth(UnicodeString::fromUTF8(orth));
+			t->set_wa(PwrNlp::Whitespace::Space);
+			t->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
+			if (disamb_) {
+				t->lexemes().back().set_disamb(true);
+			}
+			if (!s) { s = boost::make_shared<AnnotatedSentence>(); }
+			s->append(t); // s is created lazily just above; it used to be dereferenced while still null
+			const std::string& cline = line;
+			for (string_split_iterator value_it = boost::make_split_iterator(
+					cline, boost::token_finder(boost::is_any_of(",")));
+					value_it != string_split_iterator();
+					++value_it) { // TODO: channel annotations are not parsed yet
+			}
+		}
+	}
+	return s;
+}
+
+void IobChanReader::set_option(const std::string &option)
+{
+	if (option != "no_set_disamb") { // not handled here: forward to the base reader
+		BufferedSentenceReader::set_option(option);
+	} else {
+		disamb_ = false; // stop marking incoming lexemes as disamb
+	}
+}
+
+std::string IobChanReader::get_option(const std::string &option) const
+{
+	if (option != "no_set_disamb") { // not handled here: let the base reader answer
+		return BufferedSentenceReader::get_option(option);
+	}
+	return disamb_ ? std::string() : option; // echo the name only when the option is active
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/iob-chan.h b/libcorpus2/io/iob-chan.h
new file mode 100644
index 0000000..ea6bb8a
--- /dev/null
+++ b/libcorpus2/io/iob-chan.h
@@ -0,0 +1,86 @@
+/*
+	Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+	Part of the libcorpus2 project
+
+	This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+	This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+	See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_IOB_CHAN_H
+#define LIBCORPUS2_IO_IOB_CHAN_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+#include <boost/scoped_ptr.hpp>
+
+namespace Corpus2 {
+
+/**
+ * Simple writer class to output tokens together with their IOB channel tags.
+ *
+ * One token per line, token line consists of the orth, followed by the
+ * tag and, for annotated sentences, a comma-separated list of
+ * channel-IOB pairs. Each sentence is followed by a blank line.
+ *
+ * The preferred lexeme is used. No-lexeme tokens trigger a warning unless
+ * nowarn is passed.
+ */
+class IobChanWriter : public TokenWriter
+{
+public:
+	IobChanWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+	/// Writes a single token line (orth and tag only, no channel information).
+	void write_token(const Token& t);
+	/// Writes every token of s, adding channel IOB tags when s is an AnnotatedSentence.
+	void write_sentence(const Sentence& s);
+	/// Writes all sentences of the chunk.
+	void write_chunk(const Chunk &p);
+	/// Writes the tagset's string form of the tag.
+	void write_tag(const Tag& tag);
+
+	static bool registered;
+
+private:
+	bool warn_on_no_lexemes_; // starts true; cleared when "nowarn" is among the params
+};
+
+class IobChanReader : public BufferedSentenceReader // sentence reader registered as "iob-chan"
+{
+public:
+	IobChanReader(const Tagset& tagset, std::istream& is);
+	/// Opens filename itself; throws Corpus2Error if the stream is not good.
+	IobChanReader(const Tagset& tagset, const std::string& filename);
+	/// The stream sentences are read from.
+	std::istream& is() {
+		return *is_;
+	}
+	/// "no_set_disamb" is handled by this class, other options by the base class.
+	void set_option(const std::string& option);
+
+	std::string get_option(const std::string& option) const;
+
+	static bool registered;
+
+protected:
+	/// BufferedSentenceReader override
+	Sentence::Ptr actual_next_sentence();
+
+	std::istream* is_; // stream in use; points at is_owned_ when the filename constructor was used
+	boost::scoped_ptr<std::istream> is_owned_; // set only by the filename constructor
+
+	/// Whether to mark all incoming tags as disambiguated
+	bool disamb_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_IOB_CHAN_H
diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index ead4021..67552df 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -67,7 +67,6 @@ void RftWriter::write_token(const Token& t)
 		const Lexeme& pref = t.get_preferred_lexeme(tagset());
 		os() << "\t";
 		write_tag(pref.tag());
-		std::string tag_str = tagset().tag_to_no_opt_string(pref.tag());
 	}
 	os() << "\n";
 }
diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp
index 38f3bdb..8593675 100644
--- a/libcorpus2/token.cpp
+++ b/libcorpus2/token.cpp
@@ -65,12 +65,13 @@ struct preferred_lexeme_cmp
 	const Tagset* tagset;
 
 	bool operator()(const Lexeme& l1, const Lexeme& l2) const {
-		return (!l1.is_disamb() && l2.is_disamb())
-				|| (l1.is_disamb() == l2.is_disamb()
-				&& (tagset->get_original_pos_index(l1.tag().get_pos_index()) >
+		return
+		 (!l1.is_disamb() && l2.is_disamb())
+		 || (l1.is_disamb() == l2.is_disamb()
+		  && (tagset->get_original_pos_index(l1.tag().get_pos_index()) >
 					tagset->get_original_pos_index(l2.tag().get_pos_index())
-				|| (l1.tag().get_pos() == l2.tag().get_pos()
-				&& l1 < l2)));
+		  || (l1.tag().get_pos() == l2.tag().get_pos()
+			&& l1 < l2)));
 	}
 };
 
-- 
GitLab