diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 279ef42bd24546d1d70b947834d7f4a6977adbb7..d351a6de1f3c64bd047701674e26c1a41e1461b3 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -59,6 +59,7 @@ SET(libcorpus2_STAT_SRC
 	io/cclwriter.cpp
 	io/helpers.cpp
 	io/fastxces.cpp
+	io/iob-chan.cpp
 	io/nonewriter.cpp
 	io/orthwriter.cpp
 	io/pathwriter.cpp
diff --git a/libcorpus2/io/iob-chan.cpp b/libcorpus2/io/iob-chan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd07211fc94d3ef0456ef22d020514c08eb033c2
--- /dev/null
+++ b/libcorpus2/io/iob-chan.cpp
@@ -0,0 +1,279 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <libcorpus2/io/iob-chan.h>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace Corpus2 {
+
+/// Static registration of the writer under the "iob-chan" name.
+bool IobChanWriter::registered = TokenWriter::register_writer<IobChanWriter>(
+		"iob-chan", "nowarn,noforce");
+
+/**
+ * Creates a writer on the given stream. Warnings and forced IOB
+ * regeneration are on by default; the "nowarn" and "noforce" params
+ * switch them off. Other params are not interpreted here.
+ */
+IobChanWriter::IobChanWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params), warn_on_no_lexemes_(true), force_(true)
+{
+	foreach (const string_range& param, params) {
+		std::string p = boost::copy_range<std::string>(param);
+		if (p == "nowarn") {
+			warn_on_no_lexemes_ = false;
+		} else if (p == "noforce") {
+			force_ = false;
+		}
+	}
+}
+
+/**
+ * Writes a single token line: the orth and, when the token has lexemes,
+ * a tab and the tag of the preferred lexeme. No lemma or annotation
+ * columns are written here.
+ */
+void IobChanWriter::write_token(const Token& t)
+{
+	os() << t.orth_utf8();
+	if (t.lexemes().empty()) {
+		if (warn_on_no_lexemes_) {
+			// newline-terminated so that consecutive warnings do not run together
+			std::cerr << "No lexemes for token!\n";
+		}
+	} else {
+		const Lexeme& pref = t.get_preferred_lexeme(tagset());
+		os() << "\t";
+		write_tag(pref.tag());
+	}
+	os() << "\n";
+}
+
+/**
+ * Writes one line per token, followed by a blank line. A token with
+ * lexemes gets its orth, preferred lexeme lemma and tag, each followed by
+ * a tab. For an AnnotatedSentence every line ends with comma-separated
+ * "channel-IOB" entries, one per channel.
+ */
+void IobChanWriter::write_sentence(const Sentence& s)
+{
+	// null when s is not an AnnotatedSentence
+	const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
+	if (force_ && ann) {
+		// make_iob_from_segments() needs non-const channels, hence the
+		// const_cast. ann is checked first: dereferencing it for a plain
+		// Sentence would be a null pointer access.
+		AnnotatedSentence* hax = const_cast<AnnotatedSentence*>(ann);
+		foreach (const AnnotatedSentence::chan_map_t::value_type& v,
+				hax->all_channels()) {
+			hax->get_channel(v.first).make_iob_from_segments();
+		}
+	}
+	for (size_t idx = 0; idx < s.size(); ++idx) {
+		const Token* t = s.tokens()[idx];
+		os() << t->orth_utf8();
+		if (t->lexemes().empty()) {
+			// NOTE(review): no tab is written in this case, so any
+			// annotations directly follow the orth -- confirm intended
+			if (warn_on_no_lexemes_) {
+				std::cerr << "No lexemes for token!\n";
+			}
+		} else {
+			const Lexeme& pref = t->get_preferred_lexeme(tagset());
+			os() << "\t";
+			os() << pref.lemma_utf8();
+			os() << "\t";
+			write_tag(pref.tag());
+			os() << "\t";
+		}
+		if (ann) {
+			bool first = true;
+			foreach (const AnnotatedSentence::chan_map_t::value_type& v,
+					ann->all_channels()) {
+				if (!first) {
+					os() << ",";
+				}
+				os() << v.first << "-";
+				os() << Corpus2::IOB::to_string(v.second.get_iob_at(idx));
+				first = false;
+			}
+		}
+		os() << "\n";
+	}
+	os() << "\n";
+}
+
+/// Writes every sentence of the chunk with write_sentence.
+void IobChanWriter::write_chunk(const Chunk& c)
+{
+	foreach (const Sentence::ConstPtr& s, c.sentences()) {
+		write_sentence(*s);
+	}
+}
+
+/// Writes the string form of the tag as given by the writer's tagset.
+void IobChanWriter::write_tag(const Tag& tag)
+{
+	os() << tagset().tag_to_string(tag);
+}
+
+
+/// Static registration of the reader under the "iob-chan" name.
+bool IobChanReader::registered = TokenReader::register_reader<IobChanReader>("iob-chan",
+	"ign,loose,strict,no_set_disamb");
+
+
+/// Creates a reader on an external stream; only a pointer to it is kept.
+IobChanReader::IobChanReader(const Tagset& tagset, std::istream& is)
+	: BufferedSentenceReader(tagset), is_(&is), disamb_(true)
+{
+}
+
+/**
+ * Creates a reader on a file that is opened and owned by the reader.
+ * Throws Corpus2Error when the opened stream is not in a good state.
+ */
+IobChanReader::IobChanReader(const Tagset& tagset, const std::string& filename)
+	: BufferedSentenceReader(tagset), is_(), disamb_(true)
+{
+	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+	if (!this->is_owned_->good()) {
+		throw Corpus2Error("File not found!");
+	}
+	else {
+		this->is_ = is_owned_.get();
+	}
+}
+
+/**
+ * Reads lines up to an empty line or the end of input and builds an
+ * AnnotatedSentence from them. A valid line has four tab-separated
+ * fields: orth, lemma, tag and comma-separated "channel-IOB" entries.
+ * Malformed lines and entries are reported on stderr and skipped.
+ *
+ * @return the sentence, or a null pointer if no valid line was read
+ */
+Sentence::Ptr IobChanReader::actual_next_sentence()
+{
+	std::string line;
+	boost::shared_ptr<AnnotatedSentence> s;
+
+	while (is().good()) {
+		std::getline(is(), line);
+		// an empty line ends the sentence; a read that extracts nothing
+		// (end of input) leaves line empty as well
+		if (line.empty()) {
+			break;
+		}
+		std::vector<std::string> spl;
+		boost::algorithm::split(spl, line, boost::is_any_of("\t"));
+		if (spl.size() != 4) {
+			std::cerr << "Invalid line: " << line << "(" << spl.size() << ")\n";
+		} else {
+			const std::string& orth = spl[0];
+			const std::string& lemma = spl[1];
+			const std::string& tag_string = spl[2];
+			const std::string& anns = spl[3];
+			Tag tag = parse_tag(tag_string);
+			Token* t = new Token();
+			t->set_orth(UnicodeString::fromUTF8(orth));
+			t->set_wa(PwrNlp::Whitespace::Space);
+			t->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
+			if (disamb_) {
+				t->lexemes().back().set_disamb(true);
+			}
+			if (!s) {
+				// the sentence is created on its first valid token, which
+				// gets Newline whitespace instead of Space
+				s = boost::make_shared<AnnotatedSentence>();
+				t->set_wa(PwrNlp::Whitespace::Newline);
+			}
+			s->append(t);
+			std::vector<std::string> annsplit;
+			boost::algorithm::split(annsplit, anns, boost::is_any_of(","));
+			foreach (const std::string& a, annsplit) {
+				if (a.empty()) {
+					// splitting an empty field yields one empty entry;
+					// the writer produces such a field when a sentence
+					// has no channels, so it is not reported as an error
+					continue;
+				}
+				std::vector<std::string> one_ann_split;
+				boost::algorithm::split(one_ann_split, a, boost::is_any_of("-"));
+				if (one_ann_split.size() != 2) {
+					std::cerr << "Invalid annotation:" << a << "\n";
+				} else {
+					const std::string& aname = one_ann_split[0];
+					const std::string& aiob = one_ann_split[1];
+					Corpus2::IOB::Enum iob = Corpus2::IOB::from_string(aiob);
+					// PostLast is treated as the "unrecognised" result
+					if (iob == Corpus2::IOB::PostLast) {
+						std::cerr << "Invalid IOB tag: " << aiob << "\n";
+					} else {
+						if (!s->has_channel(aname)) {
+							s->create_channel(aname);
+						}
+						s->get_channel(aname).set_iob_at(s->size() - 1, iob);
+					}
+				}
+			}
+		}
+	}
+	if (s) {
+		// derive channel segments from the IOB values read above
+		foreach (const AnnotatedSentence::chan_map_t::value_type& v, s->all_channels()) {
+			s->get_channel(v.first).make_segments_from_iob();
+		}
+	}
+	return s;
+}
+
+/**
+ * Handles the "no_set_disamb" option, which turns off marking read
+ * lexemes as disambiguated; other options go to the base class.
+ */
+void IobChanReader::set_option(const std::string &option)
+{
+	if (option == "no_set_disamb") {
+		disamb_ = false;
+	} else {
+		BufferedSentenceReader::set_option(option);
+	}
+}
+
+/**
+ * Returns the option name if "no_set_disamb" is in effect and an empty
+ * string otherwise; other options are looked up in the base class.
+ */
+std::string IobChanReader::get_option(const std::string &option) const
+{
+	if (option == "no_set_disamb") {
+		return !disamb_ ? option : "";
+	}
+	return BufferedSentenceReader::get_option(option);
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/iob-chan.h b/libcorpus2/io/iob-chan.h
new file mode 100644
index 0000000000000000000000000000000000000000..378e8503d16d3952b35f085c57d2c36b6fb2739d
--- /dev/null
+++ b/libcorpus2/io/iob-chan.h
@@ -0,0 +1,110 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_IOB_CHAN_H
+#define LIBCORPUS2_IO_IOB_CHAN_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+#include <boost/scoped_ptr.hpp>
+
+namespace Corpus2 {
+
+/**
+ * Writer for a tab-separated format with IOB-encoded annotation channels.
+ *
+ * One token per line. Within a sentence, a token line consists of the
+ * orth, the lemma and the tag of the preferred lexeme, and a list of
+ * comma-separated channel-IOB entries (one per annotation channel of the
+ * sentence), separated by tabs. Each sentence is followed by a blank line.
+ *
+ * No-lexeme tokens trigger a warning unless nowarn is passed. Unless
+ * noforce is passed, the IOB data of each channel is regenerated from the
+ * channel segments before an annotated sentence is written.
+ */
+class IobChanWriter : public TokenWriter
+{
+public:
+	IobChanWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	/// Writes the orth and the preferred lexeme tag of a lone token
+	void write_token(const Token& t);
+
+	/// Writes all tokens of the sentence with their channel annotations
+	void write_sentence(const Sentence& s);
+
+	/// Writes all sentences of the chunk
+	void write_chunk(const Chunk &p);
+
+	/// Writes the string representation of the tag
+	void write_tag(const Tag& tag);
+
+	/// Result of registering this writer under the "iob-chan" name
+	static bool registered;
+
+private:
+	/// Whether to warn on stderr about tokens without lexemes
+	bool warn_on_no_lexemes_;
+
+	/// Whether to regenerate channel IOB data before writing a sentence
+	bool force_;
+};
+
+/**
+ * Reader for the format written by IobChanWriter::write_sentence: one
+ * token per line with tab-separated orth, lemma, tag and comma-separated
+ * channel-IOB entries; sentences are separated by blank lines.
+ */
+class IobChanReader : public BufferedSentenceReader
+{
+public:
+	/// Reads from an external stream, which is not owned by the reader
+	IobChanReader(const Tagset& tagset, std::istream& is);
+
+	/// Reads from the named file, which is opened by the reader
+	IobChanReader(const Tagset& tagset, const std::string& filename);
+
+	/// The input stream in use
+	std::istream& is() {
+		return *is_;
+	}
+
+	/// Accepts "no_set_disamb"; other options go to the base class
+	void set_option(const std::string& option);
+
+	/// Counterpart of set_option
+	std::string get_option(const std::string& option) const;
+
+	/// Result of registering this reader under the "iob-chan" name
+	static bool registered;
+
+protected:
+	/// BufferedSentenceReader override
+	Sentence::Ptr actual_next_sentence();
+
+	/// Stream to read from (either external or is_owned_)
+	std::istream* is_;
+	/// Owned file stream, set only by the filename constructor
+	boost::scoped_ptr<std::istream> is_owned_;
+
+	/// Whether to mark all incoming tags as disambiguated
+	bool disamb_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_IOB_CHAN_H