Skip to content
Snippets Groups Projects
Commit a0aee2cc authored by ilor's avatar ilor
Browse files

add missing reader for the 'plain' format

parent 2380e8ea
Branches
No related merge requests found
......@@ -63,6 +63,7 @@ SET(libcorpus2_STAT_SRC
io/nonewriter.cpp
io/orthwriter.cpp
io/pathwriter.cpp
io/plainreader.cpp
io/plainwriter.cpp
io/premorphwriter.cpp
io/reader.cpp
......
#include <libcorpus2/io/plainreader.h>
#include <libpwrutils/foreach.h>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/make_shared.hpp>
#include <fstream>
namespace Corpus2 {
// Static registration: calls TokenReader::register_reader for PlainReader with
// the format name "plain" and stores the call's result in `registered`.
// NOTE(review): the second argument ("ign,loose,strict") is presumably the
// list/description of options this reader accepts - confirm against
// TokenReader::register_reader.
bool PlainReader::registered = TokenReader::register_reader<PlainReader>("plain",
"ign,loose,strict");
/**
 * Construct a reader over an already-open input stream.
 *
 * Only the address of @p is is stored (is_owned_ is left empty), so the
 * reader does not take ownership of the stream; the stream has to remain
 * valid for as long as the reader reads from it.
 *
 * @param tagset  tagset passed on to BufferedSentenceReader
 * @param is      stream to read from
 */
PlainReader::PlainReader(const Tagset& tagset, std::istream& is)
: BufferedSentenceReader(tagset), is_(&is)
{
}
/**
 * Construct a reader that opens @p filename itself and owns the resulting
 * file stream for its whole lifetime.
 *
 * @param tagset    tagset passed on to BufferedSentenceReader
 * @param filename  path of the file to open for reading
 * @throws Corpus2Error if the stream is not in a good state right after
 *         being opened
 */
PlainReader::PlainReader(const Tagset& tagset, const std::string& filename)
	: BufferedSentenceReader(tagset), is_()
{
	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
	std::istream* const opened = is_owned_.get();
	if (!opened->good()) {
		throw Corpus2Error("File not found!");
	}
	// Publish the stream through is_ only once it is known to be usable.
	is_ = opened;
}
/**
 * Read one sentence from the underlying stream (BufferedSentenceReader
 * override).
 *
 * Lines are consumed until an empty line is read or the stream stops being
 * good. Every non-empty line is split on tab characters:
 *  - a line whose first field is non-empty starts a new token: field 0 is
 *    the orth and the optional field 1 is the preceding-whitespace name;
 *  - a line whose first field is empty adds a lexeme to the most recent
 *    token: field 1 is the lemma, field 2 the tag string, and an optional
 *    field 3 equal to "disamb" marks the lexeme as disambiguated.
 *
 * @return the sentence that was read, or a null pointer if an empty line or
 *         the end of input was reached before any token line
 * @throws Corpus2Error if a lexeme line appears before any token line, or if
 *         a lexeme line has fewer than three fields
 */
Sentence::Ptr PlainReader::actual_next_sentence()
{
	std::string line;
	Sentence::Ptr s;
	// Counted from the start of this call, so positions reported in error
	// messages are relative to the beginning of the current sentence.
	size_t line_no = 0;
	while (is().good()) {
		std::getline(is(), line);
		++line_no;
		if (line.empty()) {
			return s;
		}
		std::vector<std::string> fields;
		boost::algorithm::split(fields, line, boost::is_any_of("\t"));
		assert(!fields.empty());
		if (fields[0].empty()) { // lexeme line
			// The sentence is only created by the first token line, so s can
			// still be null here; test it before dereferencing so malformed
			// input yields the intended error rather than a null dereference.
			if (!s || s->empty()) {
				throw Corpus2Error("PlainReader lexemes without a token at "
					+ boost::lexical_cast<std::string>(line_no));
			}
			if (fields.size() < 3) {
				throw Corpus2Error("PlainReader not enough fields at "
					+ boost::lexical_cast<std::string>(line_no));
			}
			const std::string& lemma = fields[1];
			const std::string& tag_string = fields[2];
			Tag tag = parse_tag(tag_string);
			Token* last_token = s->tokens().back();
			last_token->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
			if (fields.size() > 3 && fields[3] == "disamb") {
				last_token->lexemes().back().set_disamb(true);
			}
		} else { // orth / whitespace line
			Token* t = new Token();
			const std::string& orth = fields[0];
			t->set_orth(UnicodeString::fromUTF8(orth));
			// Default whitespace is Space, or Newline for the first token of
			// a sentence; an explicit second field overrides either default.
			PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space;
			if (!s) {
				s = make_sentence();
				wa = PwrNlp::Whitespace::Newline;
			}
			if (fields.size() > 1) {
				wa = PwrNlp::Whitespace::from_string(fields[1]);
			}
			t->set_wa(wa);
			s->append(t);
		}
	}
	return s;
}
/**
 * Set a reader option.
 *
 * No option is handled by PlainReader itself here; the call is forwarded
 * unchanged to BufferedSentenceReader::set_option.
 *
 * @param option  option string passed through to the base class
 */
void PlainReader::set_option(const std::string &option)
{
BufferedSentenceReader::set_option(option);
}
/**
 * Query a reader option.
 *
 * The query is forwarded unchanged to BufferedSentenceReader::get_option and
 * its result is returned as-is.
 *
 * @param option  option name passed through to the base class
 * @return whatever BufferedSentenceReader::get_option returns for @p option
 */
std::string PlainReader::get_option(const std::string &option) const
{
return BufferedSentenceReader::get_option(option);
}
} /* end ns Corpus2 */
// The guard must test the same macro it defines; otherwise a second inclusion
// of this header is not suppressed and PlainReader gets redefined.
#ifndef LIBCORPUS2_IO_PLAINREADER_H
#define LIBCORPUS2_IO_PLAINREADER_H

#include <libcorpus2/io/reader.h>
#include <boost/scoped_ptr.hpp>

namespace Corpus2 {

/**
 * Sentence reader for the "plain" text format.
 *
 * Reads from either a caller-supplied stream or a file it opens itself.
 */
class PlainReader : public BufferedSentenceReader
{
public:
	/// Read from an existing stream; only a pointer to @p is is kept, the
	/// stream is not owned by the reader.
	PlainReader(const Tagset& tagset, std::istream& is);

	/// Open @p filename and read from it; the reader owns the opened stream.
	PlainReader(const Tagset& tagset, const std::string& filename);

	/// The stream currently being read from.
	std::istream& is() {
		return *is_;
	}

	/// Set a reader option.
	void set_option(const std::string& option);

	/// Query a reader option.
	std::string get_option(const std::string& option) const;

	/// Holds the result of registering this reader class with TokenReader.
	static bool registered;

protected:
	/// BufferedSentenceReader override
	Sentence::Ptr actual_next_sentence();

	/// Stream in use; points either at a caller's stream or at *is_owned_.
	std::istream* is_;

	/// Stream owned by this reader; set only by the filename constructor.
	boost::scoped_ptr<std::istream> is_owned_;
};

} /* end ns Corpus2 */

#endif // LIBCORPUS2_IO_PLAINREADER_H
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment