diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index bb21452b1febf8c19b29dad0c705688b6b1748b9..d00090a5dd567ca3db2f9b680198852812060682 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -63,6 +63,7 @@ SET(libcorpus2_STAT_SRC
 	io/nonewriter.cpp
 	io/orthwriter.cpp
 	io/pathwriter.cpp
+	io/plainreader.cpp
 	io/plainwriter.cpp
 	io/premorphwriter.cpp
 	io/reader.cpp
diff --git a/libcorpus2/io/plainreader.cpp b/libcorpus2/io/plainreader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d322cf19ab8e4ba3300feec2208f4beb9bc0a65
--- /dev/null
+++ b/libcorpus2/io/plainreader.cpp
@@ -0,0 +1,113 @@
+#include <libcorpus2/io/plainreader.h>
+#include <libpwrutils/foreach.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+#include <boost/make_shared.hpp>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <vector>
+
+namespace Corpus2 {
+
+bool PlainReader::registered = TokenReader::register_reader<PlainReader>("plain",
+	"ign,loose,strict");
+
+
+PlainReader::PlainReader(const Tagset& tagset, std::istream& is)
+	: BufferedSentenceReader(tagset), is_(&is)
+{
+}
+
+PlainReader::PlainReader(const Tagset& tagset, const std::string& filename)
+	: BufferedSentenceReader(tagset), is_()
+{
+	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+	if (!this->is_owned_->good()) {
+		throw Corpus2Error("File not found!");
+	}
+	else {
+		this->is_ = is_owned_.get();
+	}
+}
+
+/**
+ * Build one sentence from the lines up to the next empty line, or up to
+ * the point where the stream stops being good.
+ *
+ * Lines are split on tabs. A non-empty first field starts a token: field 0
+ * is the orth and an optional field 1 is given to Whitespace::from_string.
+ * An empty first field adds a lexeme to the latest token: field 1 is the
+ * lemma, field 2 the tag, and a field 3 equal to "disamb" marks it disamb.
+ *
+ * The returned pointer is left default-constructed when no token line was
+ * read. Line numbers in error messages count from the start of this call.
+ */
+Sentence::Ptr PlainReader::actual_next_sentence()
+{
+	std::string line;
+	Sentence::Ptr s;
+	size_t line_no = 0;
+	while (is().good()) {
+		std::getline(is(), line);
+		++line_no;
+		if (line.empty()) {
+			return s;
+		} else {
+			std::vector<std::string> fields;
+			boost::algorithm::split(fields, line, boost::is_any_of("\t"));
+			assert(!fields.empty());
+			if (fields[0].empty()) { //lexeme
+				// s is only created by a token line, so it may still be unset
+				// here; check it before dereferencing.
+				if (!s || s->empty()) {
+					throw Corpus2Error("PlainReader lexemes without a token at "
+						+ boost::lexical_cast<std::string>(line_no));
+				}
+				if (fields.size() < 3) {
+					throw Corpus2Error("PlainReader not enough fields at "
+						+ boost::lexical_cast<std::string>(line_no));
+				}
+				const std::string& lemma = fields[1];
+				const std::string& tag_string = fields[2];
+				Tag tag = parse_tag(tag_string);
+				Token* last_token = s->tokens().back();
+				last_token->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
+				if (fields.size() > 3 && fields[3] == "disamb") {
+					last_token->lexemes().back().set_disamb(true);
+				}
+			} else { // orth-ws
+				// The first token of a sentence defaults to Newline, later
+				// ones to Space; an explicit second field overrides either.
+				PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space;
+				if (!s) {
+					s = make_sentence();
+					wa = PwrNlp::Whitespace::Newline;
+				}
+				if (fields.size() > 1) {
+					wa = PwrNlp::Whitespace::from_string(fields[1]);
+				}
+				// Allocate the token only once the whitespace field has been
+				// parsed, so nothing is leaked if from_string throws.
+				Token* t = new Token();
+				t->set_orth(UnicodeString::fromUTF8(fields[0]));
+				t->set_wa(wa);
+				s->append(t);
+			}
+		}
+	}
+	return s;
+}
+
+void PlainReader::set_option(const std::string &option)
+{
+	BufferedSentenceReader::set_option(option);
+}
+
+std::string PlainReader::get_option(const std::string &option) const
+{
+	return BufferedSentenceReader::get_option(option);
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/plainreader.h b/libcorpus2/io/plainreader.h
new file mode 100644
index 0000000000000000000000000000000000000000..c17cb9e3a417c353685bc5e73f80b0ee387981af
--- /dev/null
+++ b/libcorpus2/io/plainreader.h
@@ -0,0 +1,50 @@
+#ifndef LIBCORPUS2_IO_PLAINREADER_H
+#define LIBCORPUS2_IO_PLAINREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <boost/scoped_ptr.hpp>
+#include <istream>
+#include <string>
+
+namespace Corpus2 {
+
+/**
+ * Sentence reader for the tab-separated plain text format, registered
+ * under the reader name "plain".
+ */
+class PlainReader : public BufferedSentenceReader
+{
+public:
+	/// Read from a caller-provided stream; the stream is not owned.
+	PlainReader(const Tagset& tagset, std::istream& is);
+
+	/// Open filename for reading; throws Corpus2Error if the stream is not good.
+	PlainReader(const Tagset& tagset, const std::string& filename);
+
+	/// The stream currently being read.
+	std::istream& is() {
+		return *is_;
+	}
+
+	/// Forwards to BufferedSentenceReader::set_option.
+	void set_option(const std::string& option);
+
+	/// Forwards to BufferedSentenceReader::get_option.
+	std::string get_option(const std::string& option) const;
+
+	/// Result of registering this reader with TokenReader.
+	static bool registered;
+
+protected:
+	/// BufferedSentenceReader override
+	Sentence::Ptr actual_next_sentence();
+
+	/// Stream to read from; points at is_owned_ when a file was opened.
+	std::istream* is_;
+	/// Owned file stream, set only by the filename constructor.
+	boost::scoped_ptr<std::istream> is_owned_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_PLAINREADER_H