From 223523fa57180e14060d62a4c4c076656db1e56b Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Mon, 23 May 2011 17:18:29 +0200
Subject: [PATCH] WIP iob-chan

---
 libcorpus2/CMakeLists.txt  |   1 +
 libcorpus2/ann/channel.cpp |   2 +-
 libcorpus2/ann/channel.h   |   2 +-
 libcorpus2/io/iob-chan.cpp | 190 +++++++++++++++++++++++++++++++++++++
 libcorpus2/io/iob-chan.h   | 108 +++++++++++++++++++++
 libcorpus2/io/rft.cpp      |   1 -
 libcorpus2/token.cpp       |  11 ++-
 7 files changed, 307 insertions(+), 8 deletions(-)
 create mode 100644 libcorpus2/io/iob-chan.cpp
 create mode 100644 libcorpus2/io/iob-chan.h

diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index b225155..9b8fbd4 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -58,6 +58,7 @@ SET(libcorpus2_STAT_SRC
 	io/cclreader.cpp
 	io/cclwriter.cpp
 	io/fastxces.cpp
+	io/iob-chan.cpp
 	io/nonewriter.cpp
 	io/orthwriter.cpp
 	io/pathwriter.cpp
diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp
index 0138795..34e7549 100644
--- a/libcorpus2/ann/channel.cpp
+++ b/libcorpus2/ann/channel.cpp
@@ -107,7 +107,7 @@ void AnnotationChannel::set_segment_at(int token_idx, int segment_idx)
 	}
 }
 
-IOB::Enum AnnotationChannel::get_iob_at(int idx)
+IOB::Enum AnnotationChannel::get_iob_at(int idx) const
 {
 	if (idx >= 0 && idx < static_cast<int>(iobs_.size())) {
 		return iobs_[idx];
diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h
index d4b02bc..204ee2c 100644
--- a/libcorpus2/ann/channel.h
+++ b/libcorpus2/ann/channel.h
@@ -130,7 +130,7 @@ public:
 	/**
 	 * IOB getter, returns IOB::O if idx is out of range.
 	 */
-	IOB::Enum get_iob_at(int idx);
+	IOB::Enum get_iob_at(int idx) const;
 
 	/**
 	 * IOB setter, out of range indices are not processed.
diff --git a/libcorpus2/io/iob-chan.cpp b/libcorpus2/io/iob-chan.cpp
new file mode 100644
index 0000000..2962d83
--- /dev/null
+++ b/libcorpus2/io/iob-chan.cpp
@@ -0,0 +1,190 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/iob-chan.h>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <boost/algorithm/string/split.hpp>
+
+namespace Corpus2 {
+
+bool IobChanWriter::registered = TokenWriter::register_writer<IobChanWriter>(
+		"iob-chan", "nowarn");
+
+IobChanWriter::IobChanWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params), warn_on_no_lexemes_(true)
+{
+	// "nowarn" is the only parameter this constructor acts upon itself
+	foreach (const string_range& param, params) {
+		std::string p = boost::copy_range<std::string>(param);
+		if (p == "nowarn") {
+			warn_on_no_lexemes_ = false;
+		}
+	}
+}
+
+void IobChanWriter::write_token(const Token& t)
+{
+	// Token-only output: the orth, then a tab and the preferred tag if any
+	os() << t.orth_utf8();
+	if (t.lexemes().empty()) {
+		if (warn_on_no_lexemes_) {
+			std::cerr << "No lexemes for token!";
+		}
+	} else {
+		const Lexeme& pref = t.get_preferred_lexeme(tagset());
+		os() << "\t";
+		write_tag(pref.tag());
+	}
+	os() << "\n";
+}
+
+void IobChanWriter::write_sentence(const Sentence& s)
+{
+	// Channels are reached through AnnotatedSentence; for any other kind of
+	// sentence the cast gives a null pointer and no channel data is written.
+	const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
+	for (size_t idx = 0; idx < s.size(); ++idx) {
+		const Token* t = s.tokens()[idx];
+		os() << t->orth_utf8();
+		if (t->lexemes().empty()) {
+			if (warn_on_no_lexemes_) {
+				std::cerr << "No lexemes for token!";
+			}
+		} else {
+			const Lexeme& pref = t->get_preferred_lexeme(tagset());
+			os() << "\t";
+			write_tag(pref.tag());
+		}
+		if (ann) {
+			// The tab sets the channel list apart from the orth/tag before
+			// it, commas separate the channel entries from one another.
+			const char* separator = "\t";
+			foreach (const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) {
+				os() << separator << v.first << "-";
+				os() << Corpus2::IOB::to_string(v.second.get_iob_at(idx));
+				separator = ",";
+			}
+		}
+		os() << "\n";
+	}
+	os() << "\n";
+}
+
+void IobChanWriter::write_chunk(const Chunk& c)
+{
+	foreach (const Sentence::ConstPtr& s, c.sentences()) {
+		write_sentence(*s);
+	}
+}
+
+void IobChanWriter::write_tag(const Tag& tag)
+{
+	os() << tagset().tag_to_string(tag);
+}
+
+
+bool IobChanReader::registered = TokenReader::register_reader<IobChanReader>("iob-chan",
+	"ign,loose,strict,no_set_disamb");
+
+
+IobChanReader::IobChanReader(const Tagset& tagset, std::istream& is)
+	: BufferedSentenceReader(tagset), is_(&is), disamb_(true)
+{
+}
+
+IobChanReader::IobChanReader(const Tagset& tagset, const std::string& filename)
+	: BufferedSentenceReader(tagset), is_(), disamb_(true)
+{
+	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+	if (!is_owned_->good()) {
+		throw Corpus2Error("File not found!");
+	}
+	// is_owned_ holds the stream, is_ is the pointer that is() dereferences
+	is_ = is_owned_.get();
+}
+
+Sentence::Ptr IobChanReader::actual_next_sentence()
+{
+	std::string line;
+	// Created lazily on the first valid token line, so that running out of
+	// input before any token was read yields an empty pointer.
+	AnnotatedSentence::Ptr s;
+
+	while (std::getline(is(), line)) {
+		if (line.empty()) {
+			if (s) {
+				return s;
+			}
+			// blank line ahead of the first token, nothing to hand back yet
+			continue;
+		}
+		std::vector<std::string> spl;
+		boost::algorithm::split(spl, line, boost::is_any_of("\t"));
+		if (spl.size() != 4) {
+			std::cerr << "Invalid line: " << line << "\n";
+		} else {
+			const std::string& orth = spl[0];
+			// NOTE(review): the lemma is taken from the orth column and the
+			// tag from column 1 while columns 2 and 3 go unused -- confirm
+			// the intended column layout.
+			const std::string& lemma = spl[0];
+			const std::string& tag_string = spl[1];
+			Tag tag = parse_tag(tag_string);
+			Token* t = new Token();
+			t->set_orth(UnicodeString::fromUTF8(orth));
+			t->set_wa(PwrNlp::Whitespace::Space);
+			t->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
+			if (disamb_) {
+				t->lexemes().back().set_disamb(true);
+			}
+			if (!s) {
+				// append() below needs an actual sentence object to work on
+				s = boost::make_shared<AnnotatedSentence>();
+			}
+			s->append(t);
+			// TODO: the channel annotations of the line are not interpreted yet
+		}
+	}
+	return s;
+}
+
+void IobChanReader::set_option(const std::string &option)
+{
+	if (option == "no_set_disamb") {
+		disamb_ = false;
+	} else {
+		BufferedSentenceReader::set_option(option);
+	}
+}
+
+std::string IobChanReader::get_option(const std::string &option) const
+{
+	if (option == "no_set_disamb") {
+		return !disamb_ ? option : "";
+	}
+	return BufferedSentenceReader::get_option(option);
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/iob-chan.h b/libcorpus2/io/iob-chan.h
new file mode 100644
index 0000000..ea6bb8a
--- /dev/null
+++ b/libcorpus2/io/iob-chan.h
@@ -0,0 +1,108 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_IOB_CHAN_H
+#define LIBCORPUS2_IO_IOB_CHAN_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+#include <boost/scoped_ptr.hpp>
+#include <istream>
+#include <string>
+
+namespace Corpus2 {
+
+/**
+ * Writer for the "iob-chan" format: tokens with the IOB tags of the
+ * annotation channels of their sentence.
+ *
+ * One token per line. A token line starts with the orth; if the token has
+ * any lexemes, a tab and the tag of the preferred lexeme follow. For an
+ * AnnotatedSentence every channel then adds a name-IOB entry for the token,
+ * the first one preceded by a tab, further ones by a comma. Each sentence
+ * is followed by a blank line.
+ *
+ * No-lexeme tokens trigger a warning unless nowarn is passed.
+ */
+class IobChanWriter : public TokenWriter
+{
+public:
+	IobChanWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	/// Write the orth and, if there are lexemes, the preferred tag (no channels)
+	void write_token(const Token& t);
+
+	/// Write all tokens of the sentence, one per line, then a blank line
+	void write_sentence(const Sentence& s);
+
+	/// Write every sentence of the chunk in turn
+	void write_chunk(const Chunk& c);
+
+	/// Write the tagset's string form of the tag
+	void write_tag(const Tag& tag);
+
+	static bool registered;
+
+private:
+	/// Whether to report lexeme-less tokens on stderr
+	bool warn_on_no_lexemes_;
+};
+
+/**
+ * Reader for the "iob-chan" format (work in progress).
+ *
+ * Token lines are grouped into sentences by blank lines. A token line has
+ * to consist of four tab-separated columns; any other non-blank line is
+ * reported on stderr and skipped. Channel annotations are not read yet.
+ */
+class IobChanReader : public BufferedSentenceReader
+{
+public:
+	/// Read from an existing stream, which the reader does not take over
+	IobChanReader(const Tagset& tagset, std::istream& is);
+
+	/// Open and read the named file, throws Corpus2Error if that fails
+	IobChanReader(const Tagset& tagset, const std::string& filename);
+
+	std::istream& is() {
+		return *is_;
+	}
+
+	/// Handles "no_set_disamb", defers anything else to the base class
+	void set_option(const std::string& option);
+
+	/// Reports "no_set_disamb" when it is on, defers the rest to the base class
+	std::string get_option(const std::string& option) const;
+
+	static bool registered;
+
+protected:
+	/// BufferedSentenceReader override
+	Sentence::Ptr actual_next_sentence();
+
+	/// The stream being read; set to is_owned_'s stream by the filename ctor
+	std::istream* is_;
+	/// Holds the file stream opened by the filename ctor, empty otherwise
+	boost::scoped_ptr<std::istream> is_owned_;
+
+	/// Whether to mark all incoming tags as disambiguated
+	bool disamb_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_IOB_CHAN_H
diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index ead4021..67552df 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -67,7 +67,6 @@ void RftWriter::write_token(const Token& t)
 		const Lexeme& pref = t.get_preferred_lexeme(tagset());
 		os() << "\t";
 		write_tag(pref.tag());
-		std::string tag_str = tagset().tag_to_no_opt_string(pref.tag());
 	}
 	os() << "\n";
 }
diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp
index 38f3bdb..8593675 100644
--- a/libcorpus2/token.cpp
+++ b/libcorpus2/token.cpp
@@ -65,12 +65,13 @@ struct preferred_lexeme_cmp
 	const Tagset* tagset;
 	bool operator()(const Lexeme& l1, const Lexeme& l2) const
 	{
-		return (!l1.is_disamb() && l2.is_disamb())
-				|| (l1.is_disamb() == l2.is_disamb()
-				&& (tagset->get_original_pos_index(l1.tag().get_pos_index()) >
+		return
+			(!l1.is_disamb() && l2.is_disamb())
+			|| (l1.is_disamb() == l2.is_disamb()
+			&& (tagset->get_original_pos_index(l1.tag().get_pos_index()) >
 					tagset->get_original_pos_index(l2.tag().get_pos_index())
-				|| (l1.tag().get_pos() == l2.tag().get_pos()
-				&& l1 < l2)));
+			|| (l1.tag().get_pos() == l2.tag().get_pos()
+			&& l1 < l2)));
 	}
 };
 
-- 
GitLab