Skip to content
Snippets Groups Projects
Commit 13501017 authored by ilor's avatar ilor
Browse files

ccl format readers and writers

parent 95b7f74b
No related branches found
No related tags found
No related merge requests found
...@@ -56,6 +56,7 @@ SET(libcorpus2_STAT_SRC ...@@ -56,6 +56,7 @@ SET(libcorpus2_STAT_SRC
token.cpp token.cpp
tokenmetadata.cpp tokenmetadata.cpp
io/cclreader.cpp io/cclreader.cpp
io/cclwriter.cpp
io/fastxces.cpp io/fastxces.cpp
io/orthwriter.cpp io/orthwriter.cpp
io/plainwriter.cpp io/plainwriter.cpp
...@@ -70,6 +71,7 @@ SET(libcorpus2_STAT_SRC ...@@ -70,6 +71,7 @@ SET(libcorpus2_STAT_SRC
io/xcesvalidate.cpp io/xcesvalidate.cpp
io/xceswriter.cpp io/xceswriter.cpp
io/xmlreader.cpp io/xmlreader.cpp
io/xmlwriter.cpp
util/settings.cpp util/settings.cpp
util/symboldictionary.cpp util/symboldictionary.cpp
util/tokentimer.cpp util/tokentimer.cpp
......
...@@ -184,8 +184,10 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name) ...@@ -184,8 +184,10 @@ bool CclReaderImpl::process_end_element(const Glib::ustring & name)
} }
if (segid > 0) { if (segid > 0) {
token_anns_.insert(std::make_pair(ann_chan_, segid)); token_anns_.insert(std::make_pair(ann_chan_, segid));
if (ann_head_) {
token_ann_heads_.insert(ann_chan_); token_ann_heads_.insert(ann_chan_);
} }
}
state_ = STATE_TOK; state_ = STATE_TOK;
return true; return true;
} else { } else {
...@@ -204,4 +206,15 @@ void CclReaderImpl::finish_token() ...@@ -204,4 +206,15 @@ void CclReaderImpl::finish_token()
} }
} }
/**
 * Apply a single named option to this reader.
 *
 * "loose" and "strict" switch loose tag parsing on and off on the
 * implementation object; "no_warn_inconsistent" switches off the
 * inconsistency warning. Any other option string is ignored by this
 * function.
 */
void CclReader::set_option(const std::string& option)
{
    const bool want_loose = (option == "loose");
    if (want_loose || option == "strict") {
        impl_->set_loose_tag_parsing(want_loose);
        return;
    }
    if (option == "no_warn_inconsistent") {
        impl_->set_warn_on_inconsistent(false);
    }
}
} /* end ns Corpus2 */ } /* end ns Corpus2 */
...@@ -42,6 +42,8 @@ public: ...@@ -42,6 +42,8 @@ public:
return *is_; return *is_;
} }
/**
 * Set a reader option identified by its name.
 * NOTE(review): the enclosing class is not visible in this hunk; presumably
 * this declares CclReader::set_option -- confirm.
 */
void set_option(const std::string& option);
protected: protected:
void ensure_more(); void ensure_more();
......
#include <libcorpus2/io/cclwriter.h>
#include <libpwrutils/foreach.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <libcorpus2/io/xcescommon.h>
namespace Corpus2 {
// Registers CclWriter with TokenWriter under the class id "ccl".
// NOTE(review): the second argument looks like the list of accepted
// parameter names; "chunk", "nochunk" and "split" are not parsed by
// XmlWriter's constructor -- confirm they are meant to be advertised here.
bool CclWriter::registered =
    TokenWriter::register_writer<CclWriter>(
        "ccl",
        "flat,chunk,nochunk,nodisamb,sorttags,split,ws");
/**
 * Construct a CCL writer. All three arguments are forwarded unchanged to
 * the XmlWriter base, then the document header is written immediately.
 */
CclWriter::CclWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params)
: XmlWriter(os, tagset, params)
{
// Emit the header as part of construction.
do_header();
}
/**
 * Destructor; calls finish() while the CclWriter part of the object
 * still exists.
 */
CclWriter::~CclWriter()
{
finish();
}
void CclWriter::write_sentence(const Sentence& s)
{
paragraph_head();
const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
if (use_indent_) indent_more();
osi() << "<sentence>\n";
if (use_indent_) indent_more();
for (size_t idx = 0; idx < s.size(); ++idx) {
const Token* t = s.tokens()[idx];
if (ann) {
token_as_xces_xml_head(os(), *t, use_indent_ ? indent_level() : -1, whitespace_info_);
if (use_indent_) indent_more();
token_as_xces_xml_body(os(), tagset(), *t, use_indent_ ? indent_level() : -1, output_disamb_, sort_tags_);
foreach (const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) {
osi() << "<ann chan=\"" << v.first << "\"";
if (v.second.is_head_at(idx)) {
os() << " head=\"1\"";
}
os() << ">";
os() << v.second.get_segment_at(idx);
os() << "</ann>\n";
}
if (use_indent_) indent_less();
osi() << "</tok>\n";
} else {
XmlWriter::write_token(*t);
}
}
if (use_indent_) indent_less(); osi() << "</sentence>\n";
if (use_indent_) indent_less();
osi() << "</chunk>\n";
}
/**
 * Write a chunk: an opening <chunk> tag carrying the chunk's attributes,
 * every sentence of the chunk, then the closing </chunk>.
 *
 * NOTE(review): write_sentence() opens and closes its own
 * <chunk id="chN" type="p"> element, so every sentence written from here
 * ends up in a chunk nested inside this one -- confirm that is intended.
 */
void CclWriter::write_chunk(const Chunk &c)
{
    paragraph_head(c);
    if (use_indent_) {
        indent_more();
    }
    foreach (const Sentence::ConstPtr& sentence, c.sentences()) {
        write_sentence(*sentence);
    }
    if (use_indent_) {
        indent_less();
    }
    osi() << "</chunk>\n";
}
/**
 * Write the document prologue: whatever XmlWriter::do_header() emits,
 * then the cesAna DOCTYPE, the opening <cesAna> element and the opening
 * <chunkList>. Everything written afterwards is indented one level deeper
 * when indentation is enabled.
 */
void CclWriter::do_header()
{
    XmlWriter::do_header();
    os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
         << "<cesAna"
         << " xmlns:xlink=\"http://www.w3.org/1999/xlink\""
         << " version=\"1.0\" type=\"lex disamb\">\n"
         << "<chunkList>\n";
    if (use_indent_) {
        indent_more();
    }
}
/**
 * Write the document epilogue: undo the indentation step taken by
 * do_header() and close <chunkList> and <cesAna>.
 */
void CclWriter::do_footer()
{
    if (use_indent_) {
        indent_less();
    }
    os() << "</chunkList>\n"
         << "</cesAna>\n";
}
/**
 * Open an automatically numbered paragraph chunk. The chunk counter is
 * incremented first, so the ids produced are "ch1", "ch2", ...
 */
void CclWriter::paragraph_head()
{
    ++cid_;
    osi() << "<chunk id=\"ch" << cid_ << "\""
          << " type=\"p\">\n";
}
/**
 * Open a chunk element whose XML attributes are taken from the given
 * Chunk. Attribute names and values are written exactly as stored; no
 * escaping is applied here.
 */
void CclWriter::paragraph_head(const Chunk& c)
{
    osi() << "<chunk";
    foreach (const Chunk::attr_map_t::value_type& attr, c.attributes()) {
        os() << " " << attr.first << "=\"" << attr.second << "\"";
    }
    os() << ">\n";
}
} /* end ns Corpus2 */
// The guard macro tested here must be the same one that is defined on the
// next line, otherwise the guard never takes effect.
#ifndef LIBCORPUS2_IO_CCLWRITER_H
#define LIBCORPUS2_IO_CCLWRITER_H

#include <libcorpus2/io/xmlwriter.h>

namespace Corpus2 {

/**
 * Token writer registered under the name "ccl", built on top of XmlWriter.
 */
class CclWriter : public XmlWriter
{
public:
    /**
     * Forwards all arguments to XmlWriter and writes the document header.
     */
    CclWriter(std::ostream& os, const Tagset& tagset,
            const string_range_vector& params);

    /** Calls finish(). */
    ~CclWriter();

    /** Write a single sentence, wrapped in its own paragraph chunk. */
    void write_sentence(const Sentence &s);

    /** Write a chunk together with all of its sentences. */
    void write_chunk(const Chunk &c);

    /** Result of registering this writer class with TokenWriter. */
    static bool registered;

protected:
    /** Write the XML prologue and the opening document elements. */
    void do_header();

    /** Close the elements opened by do_header(). */
    void do_footer();

    /** Open an automatically numbered paragraph chunk. */
    void paragraph_head();

    /** Open a chunk carrying the attributes of the given Chunk. */
    void paragraph_head(const Chunk& c);
};

} /* end ns Corpus2 */

#endif // LIBCORPUS2_IO_CCLWRITER_H
...@@ -25,27 +25,18 @@ bool XcesWriter::registered = TokenWriter::register_writer<XcesWriter>("xces", ...@@ -25,27 +25,18 @@ bool XcesWriter::registered = TokenWriter::register_writer<XcesWriter>("xces",
XcesWriter::XcesWriter(std::ostream& os, const Tagset& tagset, XcesWriter::XcesWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params) const string_range_vector& params)
: TokenWriter(os, tagset, params), cid_(0) : XmlWriter(os, tagset, params)
, use_indent_(true), force_chunk_(false), output_disamb_(true) , force_chunk_(false)
, sort_tags_(false), split_chunks_on_newlines_(false) , split_chunks_on_newlines_(false)
, whitespace_info_(false)
{ {
foreach (const string_range& param, params) { foreach (const string_range& param, params) {
std::string p = boost::copy_range<std::string>(param); std::string p = boost::copy_range<std::string>(param);
if (p == "flat") { if (p == "chunk") {
use_indent_ = false;
} else if (p == "chunk") {
force_chunk_ = true; force_chunk_ = true;
} else if (p == "nochunk") { } else if (p == "nochunk") {
force_chunk_ = false; force_chunk_ = false;
} else if (p == "nodisamb") {
output_disamb_ = false;
} else if (p == "sorttags") {
sort_tags_ = true;
} else if (p == "split") { } else if (p == "split") {
split_chunks_on_newlines_ = true; split_chunks_on_newlines_ = true;
} else if (p == "ws") {
whitespace_info_ = true;
} }
} }
do_header(); do_header();
...@@ -56,20 +47,10 @@ XcesWriter::~XcesWriter() ...@@ -56,20 +47,10 @@ XcesWriter::~XcesWriter()
finish(); finish();
} }
void XcesWriter::write_token(const Token &t)
{
token_as_xces_xml(os(), tagset(), t, use_indent_ ? indent_level() : -1,
output_disamb_, sort_tags_, whitespace_info_);
}
void XcesWriter::write_sentence(const Sentence& s) void XcesWriter::write_sentence(const Sentence& s)
{ {
osi() << "<chunk type=\"s\">\n"; osi() << "<chunk type=\"s\">\n";
if (use_indent_) indent_more(); XmlWriter::write_sentence(s);
foreach (const Token* t, s.tokens()) {
write_token(*t);
}
if (use_indent_) indent_less();
osi() << "</chunk>\n"; osi() << "</chunk>\n";
} }
...@@ -96,7 +77,7 @@ void XcesWriter::write_chunk(const Chunk &c) ...@@ -96,7 +77,7 @@ void XcesWriter::write_chunk(const Chunk &c)
void XcesWriter::do_header() void XcesWriter::do_header()
{ {
os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; XmlWriter::do_header();
os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"; os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n";
os() << "<cesAna"; os() << "<cesAna";
os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\""; os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
...@@ -120,19 +101,20 @@ void XcesWriter::do_footer() ...@@ -120,19 +101,20 @@ void XcesWriter::do_footer()
os() << "</cesAna>\n"; os() << "</cesAna>\n";
} }
void XcesWriter::paragraph_head()
{
osi() << "<chunk id=\"ch" << ++cid_ << "\""
<< " type=\"p\">\n";
}
void XcesWriter::paragraph_head(const Chunk& c) //void XcesWriter::paragraph_head()
{ //{
osi() << "<chunk"; // osi() << "<chunk id=\"ch" << ++cid_ << "\""
foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) { // << " type=\"p\">\n";
os() << " " << v.first << "=\"" << v.second << "\""; //}
}
os() << ">\n"; //void XcesWriter::paragraph_head(const Chunk& c)
} //{
// osi() << "<chunk";
// foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) {
// os() << " " << v.first << "=\"" << v.second << "\"";
// }
// os() << ">\n";
//}
} /* end ns Corpus2 */ } /* end ns Corpus2 */
...@@ -17,19 +17,17 @@ or FITNESS FOR A PARTICULAR PURPOSE. ...@@ -17,19 +17,17 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#ifndef LIBCORPUS2_IO_XCESWRITER_H #ifndef LIBCORPUS2_IO_XCESWRITER_H
#define LIBCORPUS2_IO_XCESWRITER_H #define LIBCORPUS2_IO_XCESWRITER_H
#include <libcorpus2/io/writer.h> #include <libcorpus2/io/xmlwriter.h>
namespace Corpus2 { namespace Corpus2 {
class XcesWriter : public TokenWriter { class XcesWriter : public XmlWriter {
public: public:
XcesWriter(std::ostream& os, const Tagset& tagset, XcesWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params); const string_range_vector& params);
~XcesWriter(); ~XcesWriter();
void write_token(const Token &t);
void write_sentence(const Sentence &s); void write_sentence(const Sentence &s);
void write_chunk(const Chunk &c); void write_chunk(const Chunk &c);
...@@ -41,23 +39,9 @@ protected: ...@@ -41,23 +39,9 @@ protected:
void do_footer(); void do_footer();
void paragraph_head();
void paragraph_head(const Chunk& c);
int cid_;
bool use_indent_;
bool force_chunk_; bool force_chunk_;
bool output_disamb_;
bool sort_tags_;
bool split_chunks_on_newlines_; bool split_chunks_on_newlines_;
bool whitespace_info_;
}; };
} /* end ns Corpus2 */ } /* end ns Corpus2 */
......
...@@ -240,4 +240,5 @@ void XmlReader::on_end_element(const Glib::ustring &name) ...@@ -240,4 +240,5 @@ void XmlReader::on_end_element(const Glib::ustring &name)
} }
} }
} /* end ns Corpus2 */ } /* end ns Corpus2 */
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <libcorpus2/io/xcescommon.h>
#include <libcorpus2/io/xmlwriter.h>
#include <libpwrutils/foreach.h>
namespace Corpus2 {
/**
 * Construct the common XML writer state and parse the shared options.
 *
 * Defaults: indentation on, disamb output on, tag sorting off, whitespace
 * info off. Recognised parameters: "flat" (no indentation), "nodisamb",
 * "sorttags" and "ws"; all other parameters are ignored here.
 */
XmlWriter::XmlWriter(std::ostream& os, const Tagset& tagset,
        const string_range_vector& params)
    : TokenWriter(os, tagset, params)
    , cid_(0)
    , use_indent_(true)
    , output_disamb_(true)
    , sort_tags_(false)
    , whitespace_info_(false)
{
    foreach (const string_range& param, params) {
        const std::string name = boost::copy_range<std::string>(param);
        if (name == "flat") {
            use_indent_ = false;
        } else if (name == "nodisamb") {
            output_disamb_ = false;
        } else if (name == "sorttags") {
            sort_tags_ = true;
        } else if (name == "ws") {
            whitespace_info_ = true;
        }
    }
}
/**
 * Destructor; calls finish().
 */
XmlWriter::~XmlWriter()
{
finish();
}
/**
 * Write a single token via token_as_xces_xml, honouring the writer's
 * disamb / sort-tags / whitespace settings. An indent of -1 is passed
 * when indentation is switched off.
 */
void XmlWriter::write_token(const Token &t)
{
    const int indent = use_indent_ ? indent_level() : -1;
    token_as_xces_xml(os(), tagset(), t, indent,
            output_disamb_, sort_tags_, whitespace_info_);
}
/**
 * Write every token of the sentence, one indentation level deeper than
 * the current one when indentation is enabled. No enclosing element is
 * written here.
 */
void XmlWriter::write_sentence(const Sentence& s)
{
    if (use_indent_) {
        indent_more();
    }
    foreach (const Token* token, s.tokens()) {
        write_token(*token);
    }
    if (use_indent_) {
        indent_less();
    }
}
/**
 * Write the XML declaration (version 1.0, UTF-8 encoding).
 */
void XmlWriter::do_header()
{
os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
}
/**
 * Footer hook; this base implementation writes nothing.
 */
void XmlWriter::do_footer()
{
}
/**
 * Open an automatically numbered paragraph chunk. The chunk counter is
 * incremented first, so the ids produced are "autoch1", "autoch2", ...
 */
void XmlWriter::paragraph_head()
{
    ++cid_;
    osi() << "<chunk id=\"autoch" << cid_ << "\""
          << " type=\"p\">\n";
}
/**
 * Open a chunk element whose XML attributes are taken from the given
 * Chunk. Attribute names and values are written exactly as stored; no
 * escaping is applied here.
 */
void XmlWriter::paragraph_head(const Chunk& c)
{
    osi() << "<chunk";
    foreach (const Chunk::attr_map_t::value_type& attr, c.attributes()) {
        os() << " " << attr.first << "=\"" << attr.second << "\"";
    }
    os() << ">\n";
}
} /* end ns Corpus2 */
// The guard macro tested here must be the same one that is defined on the
// next line, otherwise the guard never takes effect.
#ifndef LIBCORPUS2_IO_XMLWRITER_H
#define LIBCORPUS2_IO_XMLWRITER_H

#include <libcorpus2/io/writer.h>

namespace Corpus2 {

/**
 * Base class for xml-ish writers
 */
class XmlWriter : public TokenWriter {
public:
    /**
     * Forwards all arguments to TokenWriter and parses the options shared
     * by the XML writers ("flat", "nodisamb", "sorttags", "ws").
     */
    XmlWriter(std::ostream& os, const Tagset& tagset,
            const string_range_vector& params);

    /** Calls finish(). */
    ~XmlWriter();

    /** Write a single token in XCES XML form. */
    void write_token(const Token &t);

    /** Write all tokens of a sentence, without any enclosing element. */
    void write_sentence(const Sentence &s);

protected:
    /** Write the XML declaration. */
    void do_header();

    /** Footer hook; writes nothing in this class. */
    void do_footer();

    /** Open an automatically numbered paragraph chunk. */
    virtual void paragraph_head();

    /** Open a chunk carrying the attributes of the given Chunk. */
    void paragraph_head(const Chunk& c);

    /// Counter used to number automatically generated chunk ids
    int cid_;

    /// Indent the output (switched off by the "flat" parameter)
    bool use_indent_;

    /// Passed to the token output routine (switched off by "nodisamb")
    bool output_disamb_;

    /// Passed to the token output routine (switched on by "sorttags")
    bool sort_tags_;

    /// Passed to the token output routine (switched on by "ws")
    bool whitespace_info_;
};

} /* end ns Corpus2 */

#endif // LIBCORPUS2_IO_XMLWRITER_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment