From 135010179d51d3c9dfa6be2b833a00234be9c7ce Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Wed, 9 Mar 2011 09:47:06 +0100 Subject: [PATCH] ccl format readers and writers --- libcorpus2/CMakeLists.txt | 2 + libcorpus2/io/cclreader.cpp | 15 +++++- libcorpus2/io/cclreader.h | 2 + libcorpus2/io/cclwriter.cpp | 100 +++++++++++++++++++++++++++++++++++ libcorpus2/io/cclwriter.h | 34 ++++++++++++ libcorpus2/io/xceswriter.cpp | 58 +++++++------------- libcorpus2/io/xceswriter.h | 20 +------ libcorpus2/io/xmlreader.cpp | 1 + libcorpus2/io/xmlwriter.cpp | 88 ++++++++++++++++++++++++++++++ libcorpus2/io/xmlwriter.h | 45 ++++++++++++++++ 10 files changed, 308 insertions(+), 57 deletions(-) create mode 100644 libcorpus2/io/cclwriter.cpp create mode 100644 libcorpus2/io/cclwriter.h create mode 100644 libcorpus2/io/xmlwriter.cpp create mode 100644 libcorpus2/io/xmlwriter.h diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index 5095f83..bdc9964 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -56,6 +56,7 @@ SET(libcorpus2_STAT_SRC token.cpp tokenmetadata.cpp io/cclreader.cpp + io/cclwriter.cpp io/fastxces.cpp io/orthwriter.cpp io/plainwriter.cpp @@ -70,6 +71,7 @@ SET(libcorpus2_STAT_SRC io/xcesvalidate.cpp io/xceswriter.cpp io/xmlreader.cpp + io/xmlwriter.cpp util/settings.cpp util/symboldictionary.cpp util/tokentimer.cpp diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp index 46c1b0d..d0d5d59 100644 --- a/libcorpus2/io/cclreader.cpp +++ b/libcorpus2/io/cclreader.cpp @@ -184,7 +184,9 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name) } if (segid > 0) { token_anns_.insert(std::make_pair(ann_chan_, segid)); - token_ann_heads_.insert(ann_chan_); + if (ann_head_) { + token_ann_heads_.insert(ann_chan_); + } } state_ = STATE_TOK; return true; @@ -204,4 +206,15 @@ void CclReaderImpl::finish_token() } } +void CclReader::set_option(const std::string& option) +{ + if (option == "loose") { + 
impl_->set_loose_tag_parsing(true); + } else if (option == "strict") { + impl_->set_loose_tag_parsing(false); + } else if (option == "no_warn_inconsistent") { + impl_->set_warn_on_inconsistent(false); + } +} + } /* end ns Corpus2 */ diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h index 3807ff9..7fe1f98 100644 --- a/libcorpus2/io/cclreader.h +++ b/libcorpus2/io/cclreader.h @@ -42,6 +42,8 @@ public: return *is_; } + void set_option(const std::string& option); + protected: void ensure_more(); diff --git a/libcorpus2/io/cclwriter.cpp b/libcorpus2/io/cclwriter.cpp new file mode 100644 index 0000000..3e1a40b --- /dev/null +++ b/libcorpus2/io/cclwriter.cpp @@ -0,0 +1,100 @@ +#include <libcorpus2/io/cclwriter.h> +#include <libpwrutils/foreach.h> +#include <libcorpus2/ann/annotatedsentence.h> +#include <libcorpus2/io/xcescommon.h> + +namespace Corpus2 { + +bool CclWriter::registered = TokenWriter::register_writer<CclWriter>("ccl", + "flat,chunk,nochunk,nodisamb,sorttags,split,ws"); + +CclWriter::CclWriter(std::ostream& os, const Tagset& tagset, + const string_range_vector& params) + : XmlWriter(os, tagset, params) +{ + do_header(); +} + +CclWriter::~CclWriter() +{ + finish(); +} + +void CclWriter::write_sentence(const Sentence& s) +{ + paragraph_head(); + const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s); + if (use_indent_) indent_more(); + osi() << "<sentence>\n"; + if (use_indent_) indent_more(); + for (size_t idx = 0; idx < s.size(); ++idx) { + const Token* t = s.tokens()[idx]; + if (ann) { + token_as_xces_xml_head(os(), *t, use_indent_ ? indent_level() : -1, whitespace_info_); + if (use_indent_) indent_more(); + token_as_xces_xml_body(os(), tagset(), *t, use_indent_ ? 
indent_level() : -1, output_disamb_, sort_tags_); + foreach (const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) { + osi() << "<ann chan=\"" << v.first << "\""; + if (v.second.is_head_at(idx)) { + os() << " head=\"1\""; + } + os() << ">"; + os() << v.second.get_segment_at(idx); + os() << "</ann>\n"; + } + if (use_indent_) indent_less(); + osi() << "</tok>\n"; + } else { + XmlWriter::write_token(*t); + } + } + if (use_indent_) indent_less(); osi() << "</sentence>\n"; + if (use_indent_) indent_less(); + osi() << "</chunk>\n"; +} + +void CclWriter::write_chunk(const Chunk &c) +{ + paragraph_head(c); + if (use_indent_) indent_more(); + foreach (const Sentence::ConstPtr& s, c.sentences()) { + write_sentence(*s); + } + if (use_indent_) indent_less(); + osi() << "</chunk>\n"; +} + +void CclWriter::do_header() +{ + XmlWriter::do_header(); + os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"; + os() << "<cesAna"; + os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\""; + os() << " version=\"1.0\" type=\"lex disamb\">\n"; + os() << "<chunkList>\n"; + if (use_indent_) indent_more(); +} + +void CclWriter::do_footer() +{ + if (use_indent_) indent_less(); + os() << "</chunkList>\n"; + os() << "</cesAna>\n"; +} + +void CclWriter::paragraph_head() +{ + osi() << "<chunk id=\"ch" << ++cid_ << "\"" + << " type=\"p\">\n"; +} + +void CclWriter::paragraph_head(const Chunk& c) +{ + osi() << "<chunk"; + foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) { + os() << " " << v.first << "=\"" << v.second << "\""; + } + os() << ">\n"; +} + +} /* end ns Corpus2 */ diff --git a/libcorpus2/io/cclwriter.h b/libcorpus2/io/cclwriter.h new file mode 100644 index 0000000..c9acb11 --- /dev/null +++ b/libcorpus2/io/cclwriter.h @@ -0,0 +1,34 @@ +#ifndef LIBCORPUS2_IO_CCLWRITER_H +#define LIBCORPUS2_IO_CCLWRITER_H + +#include <libcorpus2/io/xmlwriter.h> + +namespace Corpus2 { + +class CclWriter : public XmlWriter +{ +public: + CclWriter(std::ostream& os, const 
Tagset& tagset, + const string_range_vector& params); + + ~CclWriter(); + + void write_sentence(const Sentence &s); + + void write_chunk(const Chunk &c); + + static bool registered; + +protected: + void do_header(); + + void do_footer(); + + void paragraph_head(); + + void paragraph_head(const Chunk& c); +}; + +} /* end ns Corpus2 */ + +#endif // LIBCORPUS2_IO_CCLWRITER_H diff --git a/libcorpus2/io/xceswriter.cpp b/libcorpus2/io/xceswriter.cpp index e389d68..33693ce 100644 --- a/libcorpus2/io/xceswriter.cpp +++ b/libcorpus2/io/xceswriter.cpp @@ -25,27 +25,18 @@ bool XcesWriter::registered = TokenWriter::register_writer<XcesWriter>("xces", XcesWriter::XcesWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) - : TokenWriter(os, tagset, params), cid_(0) - , use_indent_(true), force_chunk_(false), output_disamb_(true) - , sort_tags_(false), split_chunks_on_newlines_(false) - , whitespace_info_(false) + : XmlWriter(os, tagset, params) + , force_chunk_(false) + , split_chunks_on_newlines_(false) { foreach (const string_range& param, params) { std::string p = boost::copy_range<std::string>(param); - if (p == "flat") { - use_indent_ = false; - } else if (p == "chunk") { + if (p == "chunk") { force_chunk_ = true; } else if (p == "nochunk") { force_chunk_ = false; - } else if (p == "nodisamb") { - output_disamb_ = false; - } else if (p == "sorttags") { - sort_tags_ = true; } else if (p == "split") { split_chunks_on_newlines_ = true; - } else if (p == "ws") { - whitespace_info_ = true; } } do_header(); @@ -56,20 +47,10 @@ XcesWriter::~XcesWriter() finish(); } -void XcesWriter::write_token(const Token &t) -{ - token_as_xces_xml(os(), tagset(), t, use_indent_ ? 
indent_level() : -1, - output_disamb_, sort_tags_, whitespace_info_); -} - void XcesWriter::write_sentence(const Sentence& s) { osi() << "<chunk type=\"s\">\n"; - if (use_indent_) indent_more(); - foreach (const Token* t, s.tokens()) { - write_token(*t); - } - if (use_indent_) indent_less(); + XmlWriter::write_sentence(s); osi() << "</chunk>\n"; } @@ -96,7 +77,7 @@ void XcesWriter::write_chunk(const Chunk &c) void XcesWriter::do_header() { - os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; + XmlWriter::do_header(); os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"; os() << "<cesAna"; os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\""; @@ -120,19 +101,20 @@ void XcesWriter::do_footer() os() << "</cesAna>\n"; } -void XcesWriter::paragraph_head() -{ - osi() << "<chunk id=\"ch" << ++cid_ << "\"" - << " type=\"p\">\n"; -} -void XcesWriter::paragraph_head(const Chunk& c) -{ - osi() << "<chunk"; - foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) { - os() << " " << v.first << "=\"" << v.second << "\""; - } - os() << ">\n"; -} +//void XcesWriter::paragraph_head() +//{ +// osi() << "<chunk id=\"ch" << ++cid_ << "\"" +// << " type=\"p\">\n"; +//} + +//void XcesWriter::paragraph_head(const Chunk& c) +//{ +// osi() << "<chunk"; +// foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) { +// os() << " " << v.first << "=\"" << v.second << "\""; +// } +// os() << ">\n"; +//} } /* end ns Corpus2 */ diff --git a/libcorpus2/io/xceswriter.h b/libcorpus2/io/xceswriter.h index 835ce32..a9c6634 100644 --- a/libcorpus2/io/xceswriter.h +++ b/libcorpus2/io/xceswriter.h @@ -17,19 +17,17 @@ or FITNESS FOR A PARTICULAR PURPOSE. 
#ifndef LIBCORPUS2_IO_XCESWRITER_H #define LIBCORPUS2_IO_XCESWRITER_H -#include <libcorpus2/io/writer.h> +#include <libcorpus2/io/xmlwriter.h> namespace Corpus2 { -class XcesWriter : public TokenWriter { +class XcesWriter : public XmlWriter { public: XcesWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params); ~XcesWriter(); - void write_token(const Token &t); - void write_sentence(const Sentence &s); void write_chunk(const Chunk &c); @@ -41,23 +39,9 @@ protected: void do_footer(); - void paragraph_head(); - - void paragraph_head(const Chunk& c); - - int cid_; - - bool use_indent_; - bool force_chunk_; - bool output_disamb_; - - bool sort_tags_; - bool split_chunks_on_newlines_; - - bool whitespace_info_; }; } /* end ns Corpus2 */ diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp index 5ce99f2..a420722 100644 --- a/libcorpus2/io/xmlreader.cpp +++ b/libcorpus2/io/xmlreader.cpp @@ -240,4 +240,5 @@ void XmlReader::on_end_element(const Glib::ustring &name) } } + } /* end ns Corpus2 */ diff --git a/libcorpus2/io/xmlwriter.cpp b/libcorpus2/io/xmlwriter.cpp new file mode 100644 index 0000000..e540073 --- /dev/null +++ b/libcorpus2/io/xmlwriter.cpp @@ -0,0 +1,88 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/ + +#include <libcorpus2/io/xcescommon.h> +#include <libcorpus2/io/xmlwriter.h> +#include <libpwrutils/foreach.h> + +namespace Corpus2 { + +XmlWriter::XmlWriter(std::ostream& os, const Tagset& tagset, + const string_range_vector& params) + : TokenWriter(os, tagset, params), cid_(0) + , use_indent_(true), output_disamb_(true) + , sort_tags_(false), whitespace_info_(false) +{ + foreach (const string_range& param, params) { + std::string p = boost::copy_range<std::string>(param); + if (p == "flat") { + use_indent_ = false; + } else if (p == "nodisamb") { + output_disamb_ = false; + } else if (p == "sorttags") { + sort_tags_ = true; + } else if (p == "ws") { + whitespace_info_ = true; + } + } +} + +XmlWriter::~XmlWriter() +{ + finish(); +} + +void XmlWriter::write_token(const Token &t) +{ + token_as_xces_xml(os(), tagset(), t, use_indent_ ? indent_level() : -1, + output_disamb_, sort_tags_, whitespace_info_); +} + +void XmlWriter::write_sentence(const Sentence& s) +{ + if (use_indent_) indent_more(); + foreach (const Token* t, s.tokens()) { + write_token(*t); + } + if (use_indent_) indent_less(); +} + + +void XmlWriter::do_header() +{ + os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; +} + +void XmlWriter::do_footer() +{ +} + +void XmlWriter::paragraph_head() +{ + osi() << "<chunk id=\"autoch" << ++cid_ << "\"" + << " type=\"p\">\n"; +} + +void XmlWriter::paragraph_head(const Chunk& c) +{ + osi() << "<chunk"; + foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) { + os() << " " << v.first << "=\"" << v.second << "\""; + } + os() << ">\n"; +} + +} /* end ns Corpus2 */ diff --git a/libcorpus2/io/xmlwriter.h b/libcorpus2/io/xmlwriter.h new file mode 100644 index 0000000..b8f7ccf --- /dev/null +++ b/libcorpus2/io/xmlwriter.h @@ -0,0 +1,45 @@ +#ifndef LIBCORPUS2_IO_XMLWRITER_H +#define LIBCORPUS2_IO_XMLWRITER_H + +#include <libcorpus2/io/writer.h> + +namespace Corpus2 { + +/** + * Base class for xml-ish writers + */ +class XmlWriter : public 
TokenWriter { +public: + XmlWriter(std::ostream& os, const Tagset& tagset, + const string_range_vector& params); + + ~XmlWriter(); + + void write_token(const Token &t); + + void write_sentence(const Sentence &s); + +protected: + void do_header(); + + void do_footer(); + + virtual void paragraph_head(); + + void paragraph_head(const Chunk& c); + + int cid_; + + bool use_indent_; + + bool output_disamb_; + + bool sort_tags_; + + bool whitespace_info_; +}; + + +} /* end ns Corpus2 */ + +#endif // LIBCORPUS2_IO_XMLWRITER_H -- GitLab