/* Copyright (C) 2010 Tomasz Ĺšniatowski, Adam Radziszewski Part of the libcorpus2 project This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details. */ #include <libcorpus2/io/rft.h> #include <libpwrutils/foreach.h> #include <boost/algorithm/string.hpp> #include <boost/algorithm/string/predicate.hpp> #include <boost/make_shared.hpp> #include <fstream> namespace Corpus2 { bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( "rft", "mbt,nowarn,colon,alltags,opt,latin2"); RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) : TokenWriter(os, tagset, params), warn_on_no_lexemes_(true) , mbt_dialect_(false), colon_(false), opt_(false), alltags_(false) { foreach (const string_range& param, params) { std::string p = boost::copy_range<std::string>(param); if (p == "nowarn") { warn_on_no_lexemes_ = false; } else if (p == "mbt") { mbt_dialect_ = true; colon_ = true; opt_ = false; } else if (p == "alltags") { alltags_ = true; } else if (p == "opt") { opt_ = true; } else if (p == "colon") { colon_ = true; } else if (p == "latin2") { encoding_ = p; } } } void RftWriter::write_token(const Token& t) { if (encoding_.empty()) { os() << t.orth_utf8(); } else { char buf[256]; int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str()); if (len < 256) { os() << buf; } else { std::cerr << "Characetr encoding error in codepage rft output\n"; os() << "???"; } } if (t.lexemes().empty()) { if (warn_on_no_lexemes_) { std::cerr << "No lexemes for token!"; } } else if (alltags_) { foreach (const Lexeme& lex, t.lexemes()) { os() << "\t"; write_tag(lex.tag()); } } else { const Lexeme& pref = t.get_preferred_lexeme(tagset()); os() << "\t"; write_tag(pref.tag()); } os() << "\n"; } void RftWriter::write_sentence(const Sentence& s) { foreach (const Token* t, s.tokens()) { write_token(*t); } if (mbt_dialect_) { os() << "<utt>"; } os() << "\n"; } void RftWriter::write_chunk(const Chunk& c) { foreach (const Sentence::ConstPtr& s, c.sentences()) { write_sentence(*s); } } void RftWriter::write_tag(const Tag& tag) { std::string tag_str = opt_ ? tagset().tag_to_string(tag) : tagset().tag_to_no_opt_string(tag); os() << (colon_ ? tag_str // when MBT-compliant, suppress colon substitution : boost::algorithm::replace_all_copy(tag_str, ":", ".")); } bool RftReader::registered = TokenReader::register_reader<RftReader>("rft", "ign,loose,strict,set_disamb,mbt"); RftReader::RftReader(const Tagset& tagset, std::istream& is, bool disamb, bool mbt_dialect) : BufferedSentenceReader(tagset), is_(&is), disamb_(disamb) , mbt_dialect_(mbt_dialect) { } RftReader::RftReader(const Tagset& tagset, const std::string& filename, bool disamb, bool mbt_dialect) : BufferedSentenceReader(tagset), is_(), disamb_(disamb) , mbt_dialect_(mbt_dialect) { is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); if (!this->is_owned_->good()) { throw Corpus2Error("File not found!"); } else { this->is_ = is_owned_.get(); } } Sentence::Ptr RftReader::actual_next_sentence() { std::string line; Sentence::Ptr s; while (is().good()) { std::getline(is(), line); if (line.empty() || (mbt_dialect_ && boost::starts_with(line, "<utt>"))) { return s; } else { size_t tab = line.find('\t'); if (tab == line.npos || tab == 0 || (tab == line.size() - 1)) { std::cerr << "Invalid line: " << line << "\n"; } else { std::string orth = line.substr(0, tab); std::string tag_string = line.substr(tab + 1); if (!mbt_dialect_) { boost::algorithm::replace_all(tag_string, ".", ":"); } Tag tag = parse_tag(tag_string); Token* t = new Token(); t->set_orth(UnicodeString::fromUTF8(orth)); t->set_wa(PwrNlp::Whitespace::Space); if (!s) { s = make_sentence(); t->set_wa(PwrNlp::Whitespace::Newline); } t->add_lexeme(Lexeme(t->orth(), tag)); if (disamb_) { t->lexemes().back().set_disamb(true); } s->append(t); } } } return s; } void RftReader::set_option(const std::string &option) { if (option == "mbt") { mbt_dialect_ = true; } else if (option == "set_disamb") { disamb_ = true; } else { BufferedSentenceReader::set_option(option); } } std::string RftReader::get_option(const std::string &option) const { if (option == "mbt") { return mbt_dialect_ ? option : ""; } else if (option == "set_disamb") { return disamb_ ? option : ""; } return BufferedSentenceReader::get_option(option); } } /* end ns Corpus2 */