/* Copyright (C) 2010 Tomasz Ĺšniatowski, Adam Radziszewski Part of the libcorpus2 project This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details. */ #include <libcorpus2/io/cclwriter.h> #include <boost/foreach.hpp> #include <libcorpus2/ann/annotatedsentence.h> #include <libcorpus2/io/xcescommon.h> #include <libcorpus2/tokenmetadata.h> namespace Corpus2 { bool CclWriter::registered = TokenWriter::register_writer<CclWriter>("ccl", "flat,chunk,nochunk,nodisamb,sorttags,split,ws"); CclWriter::CclWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) : XmlWriter(os, tagset, params) { do_header(); } CclWriter::~CclWriter() { finish(); } void CclWriter::write_sentence(const Sentence& s) { paragraph_head(); if (use_indent_) indent_more(); write_sentence_int(s); if (use_indent_) indent_less(); osi() << "</chunk>\n"; } void CclWriter::write_sentence_int(const Sentence &s) { const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s); std::string id = s.id(); if (id == "") { osi() << "<sentence>\n"; } else { osi() << "<sentence id=\"" << id << "\">\n"; } if (use_indent_) indent_more(); for (size_t idx = 0; idx < s.size(); ++idx) { const Token* t = s.tokens()[idx]; if (ann) { token_as_xces_xml_head(os(), *t, use_indent_ ? indent_level() : -1, whitespace_info_); if (use_indent_) indent_more(); token_as_xces_xml_body(os(), tagset(), *t, use_indent_ ? indent_level() : -1, output_disamb_, sort_tags_); BOOST_FOREACH(const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) { osi() << "<ann chan=\"" << v.first << "\""; if (v.second.is_head_at(idx)) { os() << " head=\"1\""; } os() << ">"; os() << v.second.get_segment_at(idx); os() << "</ann>\n"; } boost::shared_ptr<TokenMetaData> md = t->get_metadata(); if (md) { BOOST_FOREACH(const TokenMetaData::attr_map_t::value_type& v, md->attributes()) { osi() << "<prop key=\"" << v.first << "\"" << ">"; os() << v.second << "</prop>\n"; } } if (use_indent_) indent_less(); osi() << "</tok>\n"; } else { // TODO: currently writing of token metadata is supported only when // we've got an AnnotatedSentence. XmlWriter::write_token(*t); } } if (use_indent_) indent_less(); osi() << "</sentence>\n"; } void CclWriter::write_chunk(const Chunk &c) { paragraph_head(c); if (use_indent_) indent_more(); BOOST_FOREACH(const Sentence::ConstPtr& s, c.sentences()) { write_sentence_int(*s); } if (use_indent_) indent_less(); osi() << "</chunk>\n"; } void CclWriter::do_header() { XmlWriter::do_header(); os() << "<!DOCTYPE chunkList SYSTEM \"ccl.dtd\">\n"; os() << "<chunkList"; //os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\""; os() << ">\n"; if (use_indent_) indent_more(); } void CclWriter::do_footer() { if (use_indent_) indent_less(); os() << "</chunkList>\n"; } void CclWriter::paragraph_head() { osi() << "<chunk id=\"ch" << ++cid_ << "\"" << " type=\"p\">\n"; } void CclWriter::paragraph_head(const Chunk& c) { // in CCL format chunks may have at most two attributes: // id (unique XML-style id) and type (typically p for paragraphs) osi() << "<chunk"; if (c.has_attribute("id")) { const std::string &val = c.get_attribute("id"); if (!val.empty()) { os() << " id=\"" << val << "\""; } } if (c.has_attribute("type")) { const std::string &val = c.get_attribute("type"); if (!val.empty()) { os() << " type=\"" << val << "\""; } } os() << ">\n"; } } /* end ns Corpus2 */