From 903e88c7ede46e04e29ec8311d5ccbad14af1f1e Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Tue, 12 Apr 2011 17:16:08 +0200
Subject: [PATCH] add two new writers: a dummy "none" writer that disables output and a "stats" writer that outputs general info only about token counts and annotation info

---
 libcorpus2/CMakeLists.txt     |  2 ++
 libcorpus2/io/nonewriter.cpp  | 27 +++++++++++++++++
 libcorpus2/io/nonewriter.h    | 25 ++++++++++++++++
 libcorpus2/io/plainwriter.cpp |  2 +-
 libcorpus2/io/statwriter.cpp  | 55 +++++++++++++++++++++++++++++++++++
 libcorpus2/io/statwriter.h    | 25 ++++++++++++++++
 6 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 libcorpus2/io/nonewriter.cpp
 create mode 100644 libcorpus2/io/nonewriter.h
 create mode 100644 libcorpus2/io/statwriter.cpp
 create mode 100644 libcorpus2/io/statwriter.h

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 09a03e4..96f80e9 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -58,12 +58,14 @@ SET(libcorpus2_STAT_SRC
 	io/cclreader.cpp
 	io/cclwriter.cpp
 	io/fastxces.cpp
+	io/nonewriter.cpp
 	io/orthwriter.cpp
 	io/plainwriter.cpp
 	io/premorphwriter.cpp
 	io/reader.cpp
 	io/rft.cpp
 	io/sax.cpp
+	io/statwriter.cpp
 	io/writer.cpp
 	io/xces.cpp
 	io/xcescommon.cpp
diff --git a/libcorpus2/io/nonewriter.cpp b/libcorpus2/io/nonewriter.cpp
new file mode 100644
index 0000000..2aaf4ee
--- /dev/null
+++ b/libcorpus2/io/nonewriter.cpp
@@ -0,0 +1,27 @@
+#include <libcorpus2/io/nonewriter.h>
+
+namespace Corpus2 {
+
+bool NoneWriter::registered = TokenWriter::register_writer<NoneWriter>( // registers this class with TokenWriter under the name "none"
+		"none");
+
+NoneWriter::NoneWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params) // all arguments are simply forwarded to the base class
+{
+}
+
+void NoneWriter::write_token(const Token&) // deliberately empty: the token is ignored
+{
+}
+
+void NoneWriter::write_sentence(const Sentence&) // deliberately empty: the sentence is ignored
+{
+}
+
+void NoneWriter::write_chunk(const Chunk&) // deliberately empty: the chunk is ignored
+{
+}
+
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/nonewriter.h b/libcorpus2/io/nonewriter.h
new file mode 100644
index 0000000..8d6a719
--- /dev/null
+++ b/libcorpus2/io/nonewriter.h
@@ -0,0 +1,25 @@
+#ifndef LIBCORPUS2_IO_NONEWRITER_H
+#define LIBCORPUS2_IO_NONEWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+class NoneWriter : public TokenWriter // writer whose write_* methods all have empty bodies
+{
+public:
+	NoneWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params);
+
+	void write_token(const Token& t);
+
+	void write_sentence(const Sentence& t);
+
+	void write_chunk(const Chunk& c);
+
+	static bool registered; // result of TokenWriter::register_writer, set during static initialisation
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_NONEWRITER_H
diff --git a/libcorpus2/io/plainwriter.cpp b/libcorpus2/io/plainwriter.cpp
index 45b156b..225afe4 100644
--- a/libcorpus2/io/plainwriter.cpp
+++ b/libcorpus2/io/plainwriter.cpp
@@ -19,7 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 namespace Corpus2 {
 
-bool PlainWriter::registered = PlainWriter::register_writer<PlainWriter>(
+bool PlainWriter::registered = TokenWriter::register_writer<PlainWriter>(
 		"plain");
 
 PlainWriter::PlainWriter(std::ostream& os, const Tagset& tagset,
diff --git a/libcorpus2/io/statwriter.cpp b/libcorpus2/io/statwriter.cpp
new file mode 100644
index 0000000..b77fbf7
--- /dev/null
+++ b/libcorpus2/io/statwriter.cpp
@@ -0,0 +1,55 @@
+#include <libcorpus2/io/statwriter.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <iomanip>
+#include <libpwrutils/foreach.h>
+
+namespace Corpus2 {
+
+bool StatWriter::registered = TokenWriter::register_writer<StatWriter>( // registers this class with TokenWriter under the name "stat"
+		"stat");
+
+StatWriter::StatWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params) // all arguments are simply forwarded to the base class
+{
+}
+
+void StatWriter::write_token(const Token& t) // writes one symbol per token: "~" for an empty orth, otherwise its first character
+{
+	if (t.orth().length() == 0) {
+		os() << "~";
+	} else {
+		UnicodeString o1(t.orth().char32At(0)); // whole code point: charAt(0) would yield a lone surrogate for non-BMP text
+		os() << PwrNlp::to_utf8(o1);
+	}
+}
+
+
+void StatWriter::write_sentence(const Sentence& s) // writes a "tokens" row, then one row per channel if s is an AnnotatedSentence
+{
+	os() << std::setw(8) << "tokens" << " "; // row label, right-aligned in 8 columns
+	foreach (const Token* t, s.tokens()) {
+		write_token(*t);
+	}
+	os() << "\n";
+	const AnnotatedSentence* as = dynamic_cast<const AnnotatedSentence*>(&s); // null for sentences that are not AnnotatedSentence
+	if (as) {
+		foreach (const AnnotatedSentence::chan_map_t::value_type& vt, as->all_channels()) {
+			os() << std::setw(8) << vt.first << " "; // map key used as the row label
+			os() << vt.second.dump_alpha();
+			os() << "\n";
+		}
+	}
+	os() << "\n"; // blank line after every sentence
+}
+
+void StatWriter::write_chunk(const Chunk& c) // writes every sentence of the chunk, then one extra blank line
+{
+	foreach (const Sentence::Ptr& s, c.sentences()) { // bind by reference: no per-iteration copy of the pointer object
+		write_sentence(*s);
+	}
+	os() << "\n";
+}
+
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/statwriter.h b/libcorpus2/io/statwriter.h
new file mode 100644
index 0000000..a022b9f
--- /dev/null
+++ b/libcorpus2/io/statwriter.h
@@ -0,0 +1,25 @@
+#ifndef LIBCORPUS2_IO_STATWRITER_H
+#define LIBCORPUS2_IO_STATWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+class StatWriter : public TokenWriter // writer that prints one-symbol-per-token rows (see statwriter.cpp)
+{
+public:
+	StatWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params);
+
+	void write_token(const Token& t);
+
+	void write_sentence(const Sentence& t);
+
+	void write_chunk(const Chunk& c);
+
+	static bool registered; // result of TokenWriter::register_writer, set during static initialisation
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_STATWRITER_H
-- 
GitLab