From 135010179d51d3c9dfa6be2b833a00234be9c7ce Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 9 Mar 2011 09:47:06 +0100
Subject: [PATCH] ccl format readers and writers

---
 libcorpus2/CMakeLists.txt    |   2 +
 libcorpus2/io/cclreader.cpp  |  15 +++++-
 libcorpus2/io/cclreader.h    |   2 +
 libcorpus2/io/cclwriter.cpp  | 100 +++++++++++++++++++++++++++++++++++
 libcorpus2/io/cclwriter.h    |  34 ++++++++++++
 libcorpus2/io/xceswriter.cpp |  58 +++++++-------------
 libcorpus2/io/xceswriter.h   |  20 +------
 libcorpus2/io/xmlreader.cpp  |   1 +
 libcorpus2/io/xmlwriter.cpp  |  88 ++++++++++++++++++++++++++++++
 libcorpus2/io/xmlwriter.h    |  45 ++++++++++++++++
 10 files changed, 308 insertions(+), 57 deletions(-)
 create mode 100644 libcorpus2/io/cclwriter.cpp
 create mode 100644 libcorpus2/io/cclwriter.h
 create mode 100644 libcorpus2/io/xmlwriter.cpp
 create mode 100644 libcorpus2/io/xmlwriter.h

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 5095f83..bdc9964 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -56,6 +56,7 @@ SET(libcorpus2_STAT_SRC
 	token.cpp
 	tokenmetadata.cpp
 	io/cclreader.cpp
+	io/cclwriter.cpp
 	io/fastxces.cpp
 	io/orthwriter.cpp
 	io/plainwriter.cpp
@@ -70,6 +71,7 @@ SET(libcorpus2_STAT_SRC
 	io/xcesvalidate.cpp
 	io/xceswriter.cpp
 	io/xmlreader.cpp
+	io/xmlwriter.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp
diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp
index 46c1b0d..d0d5d59 100644
--- a/libcorpus2/io/cclreader.cpp
+++ b/libcorpus2/io/cclreader.cpp
@@ -184,7 +184,9 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name)
 		}
 		if (segid > 0) {
 			token_anns_.insert(std::make_pair(ann_chan_, segid));
-			token_ann_heads_.insert(ann_chan_);
+			if (ann_head_) {
+				token_ann_heads_.insert(ann_chan_);
+			}
 		}
 		state_ = STATE_TOK;
 		return true;
@@ -204,4 +206,20 @@ void CclReaderImpl::finish_token()
 	}
 }
 
+/**
+ * Applies one named option to the underlying reader implementation.
+ * "loose" and "strict" call set_loose_tag_parsing(true) and (false);
+ * "no_warn_inconsistent" calls set_warn_on_inconsistent(false).
+ * Any other option string is ignored without notice.
+ */
+void CclReader::set_option(const std::string& option)
+{
+	const bool loose = (option == "loose");
+	if (loose || option == "strict") {
+		impl_->set_loose_tag_parsing(loose);
+	} else if (option == "no_warn_inconsistent") {
+		impl_->set_warn_on_inconsistent(false);
+	}
+}
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h
index 3807ff9..7fe1f98 100644
--- a/libcorpus2/io/cclreader.h
+++ b/libcorpus2/io/cclreader.h
@@ -42,6 +42,8 @@ public:
 		return *is_;
 	}
 
+	void set_option(const std::string& option);
+
 protected:
 	void ensure_more();
 
diff --git a/libcorpus2/io/cclwriter.cpp b/libcorpus2/io/cclwriter.cpp
new file mode 100644
index 0000000..3e1a40b
--- /dev/null
+++ b/libcorpus2/io/cclwriter.cpp
@@ -0,0 +1,142 @@
+#include <libcorpus2/io/cclwriter.h>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <libcorpus2/io/xcescommon.h>
+
+namespace Corpus2 {
+
+// NOTE(review): "chunk", "nochunk" and "split" are advertised here, but in
+// this change only XcesWriter parses them; confirm they belong in this list.
+bool CclWriter::registered = TokenWriter::register_writer<CclWriter>("ccl",
+		"flat,chunk,nochunk,nodisamb,sorttags,split,ws");
+
+/**
+ * Creates the writer and immediately emits the document header.
+ * The params are passed on to XmlWriter, which parses the formatting options.
+ */
+CclWriter::CclWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: XmlWriter(os, tagset, params)
+{
+	do_header();
+}
+
+/** Calls finish() when the writer is destroyed. */
+CclWriter::~CclWriter()
+{
+	finish();
+}
+
+/**
+ * Writes a stand-alone sentence, wrapped in its own paragraph chunk opened
+ * by paragraph_head().
+ */
+void CclWriter::write_sentence(const Sentence& s)
+{
+	paragraph_head();
+	if (use_indent_) indent_more();
+	write_sentence_int(s);
+	if (use_indent_) indent_less();
+	osi() << "</chunk>\n";
+}
+
+/**
+ * Writes only the <sentence> element for s, without an enclosing chunk.
+ *
+ * Kept separate from write_sentence() so that write_chunk() can place the
+ * sentences of a chunk directly inside that chunk's element instead of
+ * nesting an extra auto-generated chunk around every sentence.
+ */
+void CclWriter::write_sentence_int(const Sentence& s)
+{
+	// Annotation channels are read from AnnotatedSentence only; for any other
+	// sentence type ann is NULL and tokens are written without <ann> elements.
+	const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
+	osi() << "<sentence>\n";
+	if (use_indent_) indent_more();
+	for (size_t idx = 0; idx < s.size(); ++idx) {
+		const Token* t = s.tokens()[idx];
+		if (ann) {
+			token_as_xces_xml_head(os(), *t, use_indent_ ? indent_level() : -1, whitespace_info_);
+			if (use_indent_) indent_more();
+			token_as_xces_xml_body(os(), tagset(), *t, use_indent_ ? indent_level() : -1, output_disamb_, sort_tags_);
+			// One <ann> element per channel holding get_segment_at(idx),
+			// with head="1" added when the channel reports is_head_at(idx).
+			foreach (const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) {
+				osi() << "<ann chan=\"" << v.first << "\"";
+				if (v.second.is_head_at(idx)) {
+					os() << " head=\"1\"";
+				}
+				os() << ">";
+				os() << v.second.get_segment_at(idx);
+				os() << "</ann>\n";
+			}
+			if (use_indent_) indent_less();
+			osi() << "</tok>\n";
+		} else {
+			XmlWriter::write_token(*t);
+		}
+	}
+	if (use_indent_) indent_less();
+	osi() << "</sentence>\n";
+}
+
+/**
+ * Writes a chunk element carrying the attributes of c, with the sentences
+ * of c as its direct children.
+ */
+void CclWriter::write_chunk(const Chunk &c)
+{
+	paragraph_head(c);
+	if (use_indent_) indent_more();
+	foreach (const Sentence::ConstPtr& s, c.sentences()) {
+		write_sentence_int(*s);
+	}
+	if (use_indent_) indent_less();
+	osi() << "</chunk>\n";
+}
+
+/**
+ * Emits the XML declaration followed by the DOCTYPE and the opening
+ * <cesAna> and <chunkList> tags.
+ *
+ * NOTE(review): the DOCTYPE and root element are the same ones that
+ * XcesWriter::do_header() writes; confirm this is intended for CCL output.
+ */
+void CclWriter::do_header()
+{
+	XmlWriter::do_header();
+	os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n";
+	os() << "<cesAna";
+	os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
+	os() << " version=\"1.0\" type=\"lex disamb\">\n";
+	os() << "<chunkList>\n";
+	if (use_indent_) indent_more();
+}
+
+/** Closes the <chunkList> and <cesAna> elements opened by do_header(). */
+void CclWriter::do_footer()
+{
+	if (use_indent_) indent_less();
+	os() << "</chunkList>\n";
+	os() << "</cesAna>\n";
+}
+
+/** Opens a paragraph chunk whose id is "ch" plus a running counter. */
+void CclWriter::paragraph_head()
+{
+	osi() << "<chunk id=\"ch" << ++cid_ << "\""
+		<< " type=\"p\">\n";
+}
+
+/** Opens a chunk element with every attribute of c written as name="value". */
+void CclWriter::paragraph_head(const Chunk& c)
+{
+	osi() << "<chunk";
+	foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) {
+		os() << " " << v.first << "=\"" << v.second << "\"";
+	}
+	os() << ">\n";
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/cclwriter.h b/libcorpus2/io/cclwriter.h
new file mode 100644
index 0000000..c9acb11
--- /dev/null
+++ b/libcorpus2/io/cclwriter.h
@@ -0,0 +1,43 @@
+#ifndef LIBCORPUS2_IO_CCLWRITER_H
+#define LIBCORPUS2_IO_CCLWRITER_H
+
+#include <libcorpus2/io/xmlwriter.h>
+
+namespace Corpus2 {
+
+/**
+ * Writer registered under the name "ccl". Tokens of annotated sentences
+ * are written together with one <ann> element per annotation channel.
+ */
+class CclWriter : public XmlWriter
+{
+public:
+	CclWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	~CclWriter();
+
+	/** Writes a single sentence wrapped in an auto-numbered chunk. */
+	void write_sentence(const Sentence &s);
+
+	/** Writes a chunk together with all of its sentences. */
+	void write_chunk(const Chunk &c);
+
+	static bool registered;
+
+protected:
+	/** Writes the bare <sentence> element, without an enclosing chunk. */
+	void write_sentence_int(const Sentence &s);
+
+	void do_header();
+
+	void do_footer();
+
+	void paragraph_head();
+
+	void paragraph_head(const Chunk& c);
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_CCLWRITER_H
diff --git a/libcorpus2/io/xceswriter.cpp b/libcorpus2/io/xceswriter.cpp
index e389d68..33693ce 100644
--- a/libcorpus2/io/xceswriter.cpp
+++ b/libcorpus2/io/xceswriter.cpp
@@ -25,27 +25,18 @@ bool XcesWriter::registered = TokenWriter::register_writer<XcesWriter>("xces",
 
 XcesWriter::XcesWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
-	: TokenWriter(os, tagset, params), cid_(0)
-	, use_indent_(true), force_chunk_(false), output_disamb_(true)
-	, sort_tags_(false), split_chunks_on_newlines_(false)
-	, whitespace_info_(false)
+	: XmlWriter(os, tagset, params)
+	, force_chunk_(false)
+	, split_chunks_on_newlines_(false)
 {
 	foreach (const string_range& param, params) {
 		std::string p = boost::copy_range<std::string>(param);
-		if (p == "flat") {
-			use_indent_ = false;
-		} else if (p == "chunk") {
+		if (p == "chunk") {
 			force_chunk_ = true;
 		} else if (p == "nochunk") {
 			force_chunk_ = false;
-		} else if (p == "nodisamb") {
-			output_disamb_ = false;
-		} else if (p == "sorttags") {
-			sort_tags_ = true;
 		} else if (p == "split") {
 			split_chunks_on_newlines_ = true;
-		} else if (p == "ws") {
-			whitespace_info_ = true;
 		}
 	}
 	do_header();
@@ -56,20 +47,10 @@ XcesWriter::~XcesWriter()
 	finish();
 }
 
-void XcesWriter::write_token(const Token &t)
-{
-	token_as_xces_xml(os(), tagset(), t, use_indent_ ? indent_level() : -1,
-			output_disamb_, sort_tags_, whitespace_info_);
-}
-
 void XcesWriter::write_sentence(const Sentence& s)
 {
 	osi() << "<chunk type=\"s\">\n";
-	if (use_indent_) indent_more();
-	foreach (const Token* t, s.tokens()) {
-		write_token(*t);
-	}
-	if (use_indent_) indent_less();
+	XmlWriter::write_sentence(s);
 	osi() << "</chunk>\n";
 }
 
@@ -96,7 +77,7 @@ void XcesWriter::write_chunk(const Chunk &c)
 
 void XcesWriter::do_header()
 {
-	os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+	XmlWriter::do_header();
 	os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n";
 	os() << "<cesAna";
 	os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
@@ -120,19 +101,4 @@ void XcesWriter::do_footer()
 	os() << "</cesAna>\n";
 }
 
-void XcesWriter::paragraph_head()
-{
-	osi() << "<chunk id=\"ch" << ++cid_ << "\""
-		<< " type=\"p\">\n";
-}
-
-void XcesWriter::paragraph_head(const Chunk& c)
-{
-	osi() << "<chunk";
-	foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) {
-		os() << " " << v.first << "=\"" << v.second << "\"";
-	}
-	os() << ">\n";
-}
-
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/xceswriter.h b/libcorpus2/io/xceswriter.h
index 835ce32..a9c6634 100644
--- a/libcorpus2/io/xceswriter.h
+++ b/libcorpus2/io/xceswriter.h
@@ -17,19 +17,17 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #ifndef LIBCORPUS2_IO_XCESWRITER_H
 #define LIBCORPUS2_IO_XCESWRITER_H
 
-#include <libcorpus2/io/writer.h>
+#include <libcorpus2/io/xmlwriter.h>
 
 namespace Corpus2 {
 
-class XcesWriter : public TokenWriter {
+class XcesWriter : public XmlWriter {
 public:
 	XcesWriter(std::ostream& os, const Tagset& tagset,
 			const string_range_vector& params);
 
 	~XcesWriter();
 
-	void write_token(const Token &t);
-
 	void write_sentence(const Sentence &s);
 
 	void write_chunk(const Chunk &c);
@@ -41,23 +39,9 @@ protected:
 
 	void do_footer();
 
-	void paragraph_head();
-
-	void paragraph_head(const Chunk& c);
-
-	int cid_;
-
-	bool use_indent_;
-
 	bool force_chunk_;
 
-	bool output_disamb_;
-
-	bool sort_tags_;
-
 	bool split_chunks_on_newlines_;
-
-	bool whitespace_info_;
 };
 
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp
index 5ce99f2..a420722 100644
--- a/libcorpus2/io/xmlreader.cpp
+++ b/libcorpus2/io/xmlreader.cpp
@@ -240,4 +240,5 @@ void XmlReader::on_end_element(const Glib::ustring &name)
 	}
 }
 
+
 } /* end ns Corpus2 */
diff --git a/libcorpus2/io/xmlwriter.cpp b/libcorpus2/io/xmlwriter.cpp
new file mode 100644
index 0000000..e540073
--- /dev/null
+++ b/libcorpus2/io/xmlwriter.cpp
@@ -0,0 +1,114 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/xcescommon.h>
+#include <libcorpus2/io/xmlwriter.h>
+#include <libpwrutils/foreach.h>
+
+namespace Corpus2 {
+
+/**
+ * Starts from the defaults (use_indent_ and output_disamb_ on, sort_tags_
+ * and whitespace_info_ off) and then applies the params recognised here:
+ * "flat", "nodisamb", "sorttags" and "ws". Any other param is left alone.
+ */
+XmlWriter::XmlWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params)
+	, cid_(0)
+	, use_indent_(true)
+	, output_disamb_(true)
+	, sort_tags_(false)
+	, whitespace_info_(false)
+{
+	foreach (const string_range& param, params) {
+		const std::string option = boost::copy_range<std::string>(param);
+		if (option == "flat") {
+			use_indent_ = false;
+		} else if (option == "nodisamb") {
+			output_disamb_ = false;
+		} else if (option == "sorttags") {
+			sort_tags_ = true;
+		} else if (option == "ws") {
+			whitespace_info_ = true;
+		}
+	}
+}
+
+/** Calls finish() when the writer is destroyed. */
+XmlWriter::~XmlWriter()
+{
+	finish();
+}
+
+/** Serialises one token with token_as_xces_xml, using the stored options. */
+void XmlWriter::write_token(const Token &t)
+{
+	// -1 is passed as the indent when indentation is switched off
+	const int indent = use_indent_ ? indent_level() : -1;
+	token_as_xces_xml(os(), tagset(), t, indent,
+			output_disamb_, sort_tags_, whitespace_info_);
+}
+
+/**
+ * Writes every token of s, one indent level deeper when indentation is on.
+ * No enclosing element is written here; derived writers add their own.
+ */
+void XmlWriter::write_sentence(const Sentence& s)
+{
+	if (use_indent_) {
+		indent_more();
+	}
+	foreach (const Token* tok, s.tokens()) {
+		write_token(*tok);
+	}
+	if (use_indent_) {
+		indent_less();
+	}
+}
+
+/** Writes the XML declaration line. */
+void XmlWriter::do_header()
+{
+	os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+}
+
+/** Writes nothing in the base class. */
+void XmlWriter::do_footer()
+{
+}
+
+/** Opens a paragraph chunk whose id is "autoch" plus a running counter. */
+void XmlWriter::paragraph_head()
+{
+	osi() << "<chunk id=\"autoch" << ++cid_ << "\" type=\"p\">\n";
+}
+
+/**
+ * Opens a chunk element with every attribute of c written as name="value".
+ * NOTE(review): values are written as stored, with no XML escaping here.
+ */
+void XmlWriter::paragraph_head(const Chunk& c)
+{
+	osi() << "<chunk";
+	foreach (const Chunk::attr_map_t::value_type& attr, c.attributes()) {
+		os() << " " << attr.first;
+		os() << "=\"" << attr.second << "\"";
+	}
+	os() << ">\n";
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/xmlwriter.h b/libcorpus2/io/xmlwriter.h
new file mode 100644
index 0000000..b8f7ccf
--- /dev/null
+++ b/libcorpus2/io/xmlwriter.h
@@ -0,0 +1,60 @@
+#ifndef LIBCORPUS2_IO_XMLWRITER_H
+#define LIBCORPUS2_IO_XMLWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+/**
+ * Base class for xml-ish writers.
+ *
+ * Holds the output options shared by the XML-based writers and provides
+ * token and sentence output plus helpers that open chunk elements.
+ */
+class XmlWriter : public TokenWriter {
+public:
+	/** Options recognised in params: "flat", "nodisamb", "sorttags", "ws". */
+	XmlWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	~XmlWriter();
+
+	/** Writes a single token. */
+	void write_token(const Token &t);
+
+	/** Writes the tokens of a sentence without any enclosing element. */
+	void write_sentence(const Sentence &s);
+
+protected:
+	/** Writes the XML declaration. */
+	void do_header();
+
+	/** Writes nothing in this class. */
+	void do_footer();
+
+	/** Opens a paragraph chunk with an automatically generated id. */
+	virtual void paragraph_head();
+
+	/** Opens a chunk element carrying the attributes of c. */
+	void paragraph_head(const Chunk& c);
+
+	/** Counter used for automatically generated chunk ids */
+	int cid_;
+
+	/** Indentation on/off; switched off by the "flat" option */
+	bool use_indent_;
+
+	/** Handed to token_as_xces_xml; switched off by "nodisamb" */
+	bool output_disamb_;
+
+	/** Handed to token_as_xces_xml; switched on by "sorttags" */
+	bool sort_tags_;
+
+	/** Handed to token_as_xces_xml; switched on by "ws" */
+	bool whitespace_info_;
+};
+
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_XMLWRITER_H
-- 
GitLab