/* Copyright (C) 2010 Tomasz Ĺšniatowski, Adam Radziszewski Part of the libcorpus2 project This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details. */ #include <libcorpus2/io/plainreader.h> #include <libpwrutils/foreach.h> #include <boost/algorithm/string.hpp> #include <boost/lexical_cast.hpp> #include <boost/make_shared.hpp> #include <fstream> namespace Corpus2 { bool PlainReader::registered = TokenReader::register_reader<PlainReader>("plain", "ign,loose,strict"); PlainReader::PlainReader(const Tagset& tagset, std::istream& is) : BufferedSentenceReader(tagset), is_(&is) { } PlainReader::PlainReader(const Tagset& tagset, const std::string& filename) : BufferedSentenceReader(tagset), is_() { is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); if (!this->is_owned_->good()) { throw Corpus2Error("File not found!"); } else { this->is_ = is_owned_.get(); } } Sentence::Ptr PlainReader::actual_next_sentence() { std::string line; Sentence::Ptr s; size_t line_no = 0; while (is().good()) { std::getline(is(), line); ++line_no; if (line.empty()) { return s; } else { std::vector<std::string> fields; boost::algorithm::split(fields, line, boost::is_any_of("\t")); assert(!fields.empty()); if (fields[0].empty()) { //lexeme if (s->empty()) { throw Corpus2Error("PlainReader lexemes without a token at " + boost::lexical_cast<std::string>(line_no)); } if (fields.size() < 3) { throw Corpus2Error("PlainReader not enough fields at " + boost::lexical_cast<std::string>(line_no)); } const std::string& lemma = fields[1]; const std::string& tag_string = fields[2]; Tag tag = parse_tag(tag_string); Token* last_token = s->tokens().back(); last_token->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag)); if (fields.size() > 3 && fields[3] == "disamb") { last_token->lexemes().back().set_disamb(true); } } else { // orth-ws Token* t = new Token(); const std::string& orth = fields[0]; t->set_orth(UnicodeString::fromUTF8(orth)); PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space; if (!s) { s = make_sentence(); wa = PwrNlp::Whitespace::Newline; } if (fields.size() > 1) { wa = PwrNlp::Whitespace::from_string(fields[1]); } t->set_wa(wa); s->append(t); } } } return s; } void PlainReader::set_option(const std::string &option) { BufferedSentenceReader::set_option(option); } std::string PlainReader::get_option(const std::string &option) const { return BufferedSentenceReader::get_option(option); } } /* end ns Corpus2 */