From a7ae417fef47bf63a077cae080769a4d5182c133 Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Sat, 15 Jun 2013 13:27:52 +0200 Subject: [PATCH] new writer: line (simple chunk line drawings) --- CMakeLists.txt | 2 +- libcorpus2/CMakeLists.txt | 1 + libcorpus2/io/linewriter.cpp | 118 +++++++++++++++++++++++++++++++++++ libcorpus2/io/linewriter.h | 44 +++++++++++++ 4 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 libcorpus2/io/linewriter.cpp create mode 100644 libcorpus2/io/linewriter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 09c8d60..f60324a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ PROJECT(Corpus2Library) set(corpus2_ver_major "1") set(corpus2_ver_minor "3") -set(corpus2_ver_patch "3") +set(corpus2_ver_patch "4") cmake_minimum_required(VERSION 2.8.0) diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index dacd420..c1f931c 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -58,6 +58,7 @@ SET(libcorpus2_STAT_SRC io/helpers.cpp io/fastxces.cpp io/iob-chan.cpp + io/linewriter.cpp io/nonewriter.cpp io/orthwriter.cpp io/pathwriter.cpp diff --git a/libcorpus2/io/linewriter.cpp b/libcorpus2/io/linewriter.cpp new file mode 100644 index 0000000..deb8650 --- /dev/null +++ b/libcorpus2/io/linewriter.cpp @@ -0,0 +1,118 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details. 
+*/
+
+#include <libcorpus2/io/linewriter.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+#include <algorithm>
+#include <iomanip>
+#include <string>
+#include <vector>
+#include <boost/foreach.hpp>
+
+namespace Corpus2 {
+
+/// Result of registering this writer class with TokenWriter under the
+/// name "line".
+bool LineWriter::registered = TokenWriter::register_writer<LineWriter>(
+		"line");
+
+/**
+ * Creates a line-drawing writer. All arguments are passed unchanged to the
+ * TokenWriter base class; this class interprets none of them itself.
+ */
+LineWriter::LineWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params)
+{
+}
+
+/**
+ * Writes only the UTF-8 orth of the token, with no separator and no
+ * trailing newline.
+ */
+void LineWriter::write_token(const Token& t)
+{
+	os() << t.orth_utf8();
+}
+
+/**
+ * Writes a sentence as a small text drawing.
+ *
+ * The first row is the label "Tokens" followed by the token orths, each
+ * preceded by a single space. If the sentence is an AnnotatedSentence, one
+ * row per annotation channel follows: the channel name (left-aligned and
+ * padded to the longest label) and, under every token, a run of characters
+ * as long as the token's orth -- a single line for chunk members, a double
+ * line for chunk heads, spaces for tokens outside of any chunk. The column
+ * under the separating space is filled only between two tokens of the same
+ * chunk, so two chunks that directly follow each other stay visually apart.
+ * The drawing is terminated with an empty line.
+ *
+ * NOTE: IOB tags of every channel are recomputed in place with
+ * make_iob_from_segments(), i.e. the sentence is modified although it is
+ * taken by const reference.
+ *
+ * NOTE(review): run lengths come from orth().length(); presumably this
+ * counts code units rather than display columns, so wide or combining
+ * characters may misalign the drawing -- confirm.
+ */
+void LineWriter::write_sentence(const Sentence& s)
+{
+	const std::string tok_name("Tokens");
+	const std::string empty_char(" ");
+	// hardcoded UTF-8 box-drawing characters
+	const std::string chunk_char("\xe2\x94\x80"); // U+2500, light horizontal
+	// alternative head mark: "\xe2\x94\x81" (U+2501, heavy horizontal)
+	const std::string head_char("\xe2\x95\x90"); // U+2550, double horizontal
+
+	const AnnotatedSentence* as = dynamic_cast<const AnnotatedSentence*>(&s);
+
+	// the label column is as wide as the longest of "Tokens" and all
+	// channel names
+	int name_padding = static_cast<int>(tok_name.length());
+	if (as) {
+		BOOST_FOREACH(const AnnotatedSentence::chan_map_t::value_type& vt, as->all_channels()) {
+			name_padding = std::max(name_padding,
+					static_cast<int>(vt.first.length()));
+		}
+	}
+
+	// dump token orths and remember orth lengths
+	std::vector<int> orth_lens;
+	os() << std::left << std::setw(name_padding) << tok_name;
+	BOOST_FOREACH(const Token* t, s.tokens()) {
+		os() << empty_char << t->orth_utf8();
+		orth_lens.push_back(t->orth().length());
+	}
+	os() << "\n";
+
+	// dump channel line representations
+	if (as) {
+		// make_iob_from_segments() is called on a non-const channel,
+		// hence the cast
+		AnnotatedSentence* hax = const_cast<AnnotatedSentence*>(as); // sorry
+		BOOST_FOREACH(const AnnotatedSentence::chan_map_t::value_type& vt, hax->all_channels()) {
+			os() << std::left << std::setw(name_padding) << vt.first;
+			// use IOB2 representation internally
+			AnnotationChannel &chan = hax->get_channel(vt.first);
+			chan.make_iob_from_segments();
+			IOB::Enum last_tag = IOB::O;
+			// write line representation
+			for (int idx = 0; idx < chan.size(); idx++) {
+				const IOB::Enum this_tag = chan.get_iob_at(idx);
+				// Column under the space that precedes the orth: join
+				// only when this token continues the previous token's
+				// chunk. A B tag opens a new chunk, so it must not be
+				// glued to a chunk ending right before it.
+				if (this_tag == IOB::I && last_tag != IOB::O) {
+					os() << chunk_char;
+				}
+				else {
+					os() << empty_char;
+				}
+				// columns under the orth itself
+				const std::string& now = (this_tag == IOB::O)
+						? empty_char
+						: (chan.is_head_at(idx) ? head_char : chunk_char);
+				for (int line_pos = orth_lens[idx]; line_pos > 0; line_pos--) {
+					os() << now;
+				}
+				last_tag = this_tag;
+			}
+			os() << "\n";
+		}
+	}
+	os() << "\n";
+}
+
+/**
+ * Writes every sentence of the chunk with write_sentence(), in order, and
+ * then one additional newline.
+ */
+void LineWriter::write_chunk(const Chunk& c)
+{
+	// bind by reference: no need to copy the smart pointer per sentence
+	BOOST_FOREACH(const Sentence::Ptr& s, c.sentences()) {
+		write_sentence(*s);
+	}
+	os() << "\n";
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/linewriter.h b/libcorpus2/io/linewriter.h
new file mode 100644
index 0000000..a86c2e9
--- /dev/null
+++ b/libcorpus2/io/linewriter.h
@@ -0,0 +1,44 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_LINEWRITER_H
+#define LIBCORPUS2_IO_LINEWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+/**
+ * A writer that produces simple text (UTF-8) line drawing representation
+ * of syntactic annotation in channels.
+ */
+class LineWriter : public TokenWriter
+{
+public:
+	/**
+	 * Creates the writer; the stream, tagset and parameters are handed
+	 * over to the TokenWriter base class.
+	 */
+	LineWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	/// Writes only the UTF-8 orth of the token (no separator, no newline).
+	void write_token(const Token& t);
+
+	/**
+	 * Writes a "Tokens" row with the orths and, for an AnnotatedSentence,
+	 * one line-drawing row per annotation channel, then an empty line.
+	 * Note: the implementation recomputes the IOB tags of the sentence's
+	 * channels, although the sentence is passed by const reference.
+	 */
+	void write_sentence(const Sentence& s);
+
+	/// Writes all sentences of the chunk, followed by an extra newline.
+	void write_chunk(const Chunk& c);
+
+	/// Result of registering this writer under the name "line".
+	static bool registered;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_LINEWRITER_H
-- 
GitLab