diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 1b19902bdc232c91fa020df55819975dc1ed1b24..aec09322a823e75089ffdfd7002ef29a00ec7292 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -52,6 +52,7 @@ SET(libcorpus2_STAT_SRC
 	token.cpp
 	io/orthwriter.cpp
 	io/plainwriter.cpp
+	io/premorphwriter.cpp
 	io/reader.cpp
 	io/rft.cpp
 	io/sax.cpp
diff --git a/libcorpus2/io/premorphwriter.cpp b/libcorpus2/io/premorphwriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..29b22f364978e4fda4965782c304469343fd337a
--- /dev/null
+++ b/libcorpus2/io/premorphwriter.cpp
@@ -0,0 +1,154 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/premorphwriter.h>
+#include <libpwrutils/foreach.h>
+
+#include <ostream>
+#include <string>
+
+namespace Corpus2 {
+
+namespace {
+
+/**
+ * Write text to out, replacing the characters that are significant in XML
+ * character data and in double-quoted attribute values (&, <, > and ")
+ * with the corresponding predefined entities. Every other byte, including
+ * the bytes of multi-byte UTF-8 sequences, is copied through unchanged.
+ */
+void write_xml_escaped(std::ostream& out, const std::string& text)
+{
+	for (std::string::size_type i = 0; i < text.size(); ++i) {
+		switch (text[i]) {
+		case '&': out << "&amp;"; break;
+		case '<': out << "&lt;"; break;
+		case '>': out << "&gt;"; break;
+		case '"': out << "&quot;"; break;
+		default: out << text[i]; break;
+		}
+	}
+}
+
+} /* end anonymous ns */
+
+/* Registers the writer through TokenWriter::register_writer under the name
+   "premorph"; "chunk" is the one parameter the constructor recognises. */
+bool PremorphWriter::registered = TokenWriter::register_writer<PremorphWriter>(
+	"premorph", "chunk");
+
+/**
+ * Look for the "chunk" parameter (which makes the writer wrap all output in
+ * one generated paragraph chunk) and write the document header immediately.
+ */
+PremorphWriter::PremorphWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params), cid_(0), force_chunk_(false)
+{
+	foreach (const string_range& param, params) {
+		std::string p = boost::copy_range<std::string>(param);
+		if (p == "chunk") {
+			force_chunk_ = true;
+		}
+	}
+	do_header();
+}
+
+/** Calls finish() on destruction. */
+PremorphWriter::~PremorphWriter()
+{
+	finish();
+}
+
+/** Write the token's leading whitespace followed by its XML-escaped orth. */
+void PremorphWriter::write_token(const Token &t)
+{
+	os() << PwrNlp::Whitespace::to_whitespace(t.wa());
+	write_xml_escaped(os(), t.orth_utf8());
+}
+
+/**
+ * Write the sentence as a <chunk type="s"> element. The first token is
+ * written without its leading whitespace, so the sentence text starts right
+ * after the opening tag; an empty sentence yields an empty element.
+ */
+void PremorphWriter::write_sentence(const Sentence &s)
+{
+	os() << "<chunk type=\"s\">";
+	if (!s.tokens().empty()) {
+		write_xml_escaped(os(), s[0]->orth_utf8());
+	}
+	for (size_t i = 1; i < s.tokens().size(); ++i) {
+		write_token(*s[i]);
+	}
+	os() << "</chunk>\n";
+}
+
+/** Write the chunk's opening tag, each of its sentences, and the end tag. */
+void PremorphWriter::write_chunk(const Chunk &c)
+{
+	paragraph_head(c);
+	foreach (const Sentence* s, c.sentences()) {
+		write_sentence(*s);
+	}
+	os() << "</chunk>\n";
+}
+
+/** Write the XML prolog, open cesAna and chunkList, plus a chunk if forced. */
+void PremorphWriter::do_header()
+{
+	os() << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+	os() << "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n";
+	os() << "<cesAna";
+	os() << " xmlns:xlink=\"http://www.w3.org/1999/xlink\"";
+	os() << " version=\"1.0\" type=\"premorph\">\n";
+	os() << "<chunkList>\n";
+	if (force_chunk_) {
+		paragraph_head();
+	}
+}
+
+/** Close the elements opened by do_header(), innermost first. */
+void PremorphWriter::do_footer()
+{
+	if (force_chunk_) {
+		os() << "</chunk>\n";
+	}
+	os() << "</chunkList>\n";
+	os() << "</cesAna>\n";
+}
+
+/** Open a type "p" chunk whose id is "ch" plus the next counter value. */
+void PremorphWriter::paragraph_head()
+{
+	os() << "<chunk id=\"ch" << ++cid_ << "\""
+		<< " type=\"p\">\n";
+}
+
+/** Open a chunk tag with c's attributes; names go out raw, values escaped. */
+void PremorphWriter::paragraph_head(const Chunk& c)
+{
+	os() << "<chunk";
+	foreach (const Chunk::attr_map_t::value_type& v, c.attributes()) {
+		os() << " " << v.first << "=\"";
+		write_xml_escaped(os(), v.second);
+		os() << "\"";
+	}
+	os() << ">\n";
+}
+
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/premorphwriter.h b/libcorpus2/io/premorphwriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e1a6b8588ad513c1bd2d8c076604fc596ea5d74
--- /dev/null
+++ b/libcorpus2/io/premorphwriter.h
@@ -0,0 +1,75 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_PREMORPHWRITER_H
+#define LIBCORPUS2_IO_PREMORPHWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+/**
+ * Token writer producing "premorph" XML: a cesAna/chunkList document whose
+ * sentence chunks hold the tokens' text and whitespace only.
+ *
+ * Passing the "chunk" parameter wraps the entire output in one generated
+ * paragraph chunk.
+ */
+class PremorphWriter : public TokenWriter {
+public:
+	/// Reads params and writes the document header to os.
+	PremorphWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	/// Calls finish().
+	~PremorphWriter();
+
+	/// Writes the token's leading whitespace and its XML-escaped orth.
+	void write_token(const Token &t);
+
+	/// Writes the sentence as a chunk of type "s".
+	void write_sentence(const Sentence &s);
+
+	/// Writes the chunk's opening tag, its sentences and the closing tag.
+	void write_chunk(const Chunk &c);
+
+protected:
+	/// Writes the XML prolog and opens the cesAna and chunkList elements.
+	void do_header();
+
+	/// Closes the elements opened by do_header().
+	void do_footer();
+
+	/// Opens a type "p" chunk with an automatically numbered id.
+	void paragraph_head();
+
+	/// Opens a chunk tag carrying the attributes of c.
+	void paragraph_head(const Chunk& c);
+
+	/// Counter behind the ids generated by paragraph_head().
+	int cid_;
+
+	/// True when the "chunk" parameter was given.
+	bool force_chunk_;
+
+	/// Value returned by TokenWriter::register_writer for this class.
+	static bool registered;
+
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_PREMORPHWRITER_H