-
unknown authored4b961f72
reader.cpp 8.95 KiB
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details.
*/
#include <libcorpus2/io/reader.h>
#include <boost/make_shared.hpp>
#include <boost/algorithm/string.hpp>
#include <libcorpus2/ann/annotatedsentence.h>
#include <sstream>
#include <libpwrutils/plugin.h>
#include <libcorpus2/util/settings.h>
namespace Corpus2 {
namespace detail {
/**
* Declaration of the TokenWriter factory as a singleton Loki object
* factory. The factory instance can be accessed as
* TokenLayerFactory::Instance(). It is assumed that all derived classes
* have the same constructor signature.
*/
typedef Loki::SingletonHolder<
TokenReaderFactory,
Loki::CreateUsingNew, // default, needed to change the item below
Loki::LongevityLifetime::DieAsSmallObjectChild // per libloki docs
>
TokenReaderFactorySingleton;
TokenReaderFactory& token_reader_factory()
{
return TokenReaderFactorySingleton::Instance();
}
} /* ned ns detail */
TokenReader::TokenReader(const Tagset& tagset)
: tagset_(tagset), tag_parse_mode_(Tagset::ParseDefault),
use_annotated_sentences_(false)
{
}
TokenReader::~TokenReader()
{
}
void TokenReader::set_option(const std::string &option)
{
if (option == "ign") {
tag_parse_mode_ = static_cast<Tagset::ParseMode>(
tag_parse_mode_ | Tagset::ParseFailWithIgn);
} else if (option == "loose") {
tag_parse_mode_ = static_cast<Tagset::ParseMode>(
Tagset::ParseLoose | (tag_parse_mode_ & Tagset::ParseFailWithIgn));
} else if (option == "strict") {
tag_parse_mode_ = static_cast<Tagset::ParseMode>(
Tagset::ParseDefault | (tag_parse_mode_ & Tagset::ParseFailWithIgn));
} else if (option == "ann") {
use_annotated_sentences_ = true;
} else {
throw Corpus2Error("Unknown option passed to reader: " + option);
}
}
std::string TokenReader::get_option(const std::string &option) const
{
if (option == "ign") {
return tag_parse_mode_ & Tagset::ParseFailWithIgn ? option : "";
} else if (option == "loose") {
return (tag_parse_mode_ & ~Tagset::ParseFailWithIgn)
== Tagset::ParseLoose ? option : "";
} else if (option == "strict") {
return (tag_parse_mode_ & ~Tagset::ParseFailWithIgn)
== Tagset::ParseDefault ? option : "";
} else if (option == "ann") {
return use_annotated_sentences_ ? option : "";
} else {
return "unknown";
}
}
boost::shared_ptr<Sentence> TokenReader::make_sentence() const
{
if (use_annotated_sentences_) {
return boost::make_shared<AnnotatedSentence>();
} else {
return boost::make_shared<Sentence>();
}
}
namespace {
std::string guess_plugin_name(const std::string& reader_class_id, int idx)
{
switch (idx) {
case 0: return reader_class_id + "reader";
case 1: return reader_class_id;
default: return "";
}
}
}
boost::shared_ptr<TokenReader> TokenReader::create_path_reader(
const std::string& class_id_params,
const Tagset& tagset,
const std::string& path)
{
string_range_vector params;
boost::algorithm::split(params, class_id_params,
boost::is_any_of(std::string(",")));
std::string class_id = boost::copy_range<std::string>(params[0]);
params.erase(params.begin(), params.begin() + 1);
int plugin_name_idx = 0;
while (plugin_name_idx >=0) {
try {
return boost::shared_ptr<TokenReader>(
detail::TokenReaderFactorySingleton::Instance().
path_factory.CreateObject(class_id, tagset, path, params));
} catch (detail::TokenReaderFactoryException&) {
std::string next_plugin = guess_plugin_name(class_id, plugin_name_idx);
if (!next_plugin.empty()) {
PwrNlp::Plugin::load("corpus2", next_plugin, !Path::Instance().get_verbose());
plugin_name_idx++;
} else {
plugin_name_idx = -1;
}
}
}
throw Corpus2Error("Reader class not found: " + class_id);
}
boost::shared_ptr<TokenReader> TokenReader::create_stream_reader(
const std::string& class_id_params,
const Tagset& tagset,
std::istream& stream)
{
string_range_vector params;
boost::algorithm::split(params, class_id_params,
boost::is_any_of(std::string(",")));
std::string class_id = boost::copy_range<std::string>(params[0]);
params.erase(params.begin(), params.begin() + 1);
int plugin_name_idx = 0;
while (plugin_name_idx >=0) {
try {
return boost::shared_ptr<TokenReader>(
detail::TokenReaderFactorySingleton::Instance()
.stream_factory.CreateObject(class_id, tagset, stream, params));
} catch (detail::TokenReaderFactoryException&) {
std::string next_plugin = guess_plugin_name(class_id, plugin_name_idx);
if (!next_plugin.empty()) {
PwrNlp::Plugin::load("corpus2", next_plugin, !Path::Instance().get_verbose());
plugin_name_idx++;
} else {
plugin_name_idx = -1;
}
}
}
std::vector<std::string> ids;
ids = detail::TokenReaderFactorySingleton::Instance().path_factory.RegisteredIds();
if (std::find(ids.begin(), ids.end(), class_id) == ids.end()) {
throw Corpus2Error("Reader class not found: " + class_id);
} else {
throw Corpus2Error("This reader does not support stream mode: " + class_id);
}
}
std::vector<std::string> TokenReader::available_reader_types()
{
return detail::TokenReaderFactorySingleton::Instance().path_factory.RegisteredIds();
}
std::string TokenReader::reader_help(const std::string& class_id)
{
std::map<std::string, std::string>::const_iterator c;
c = detail::TokenReaderFactorySingleton::Instance().help.find(class_id);
if (c != detail::TokenReaderFactorySingleton::Instance().help.end()) {
return c->second;
} else {
return "";
}
}
std::vector<std::string> TokenReader::available_reader_types_help()
{
std::vector<std::string> v = available_reader_types();
BOOST_FOREACH(std::string& id, v) {
std::stringstream ss;
std::map<std::string, std::string>::const_iterator c;
c = detail::TokenReaderFactorySingleton::Instance().help.find(id);
if (c != detail::TokenReaderFactorySingleton::Instance().help.end()) {
ss << id << "[";
ss << c->second;
ss << "]";
}
id = ss.str();
}
return v;
}
BufferedChunkReader::BufferedChunkReader(const Tagset& tagset)
: TokenReader(tagset)
{
}
BufferedChunkReader::~BufferedChunkReader()
{
BOOST_FOREACH(Token* t, token_buf_) {
delete t;
}
}
Token* BufferedChunkReader::get_next_token()
{
bool more = true;
while (token_buf_.empty() && more) {
ensure_more();
Sentence::Ptr s = get_next_sentence();
if (s) {
std::copy(s->tokens().begin(), s->tokens().end(),
std::back_inserter(token_buf_));
s->release_tokens();
} else {
more = false;
}
}
if (token_buf_.empty()) {
return NULL;
} else {
Token* t = token_buf_.front();
token_buf_.pop_front();
return t;
}
}
Sentence::Ptr BufferedChunkReader::get_next_sentence()
{
bool more = true;
while (sentence_buf_.empty() && more) {
ensure_more();
boost::shared_ptr<Chunk> c = get_next_chunk();
if (c) {
std::copy(c->sentences().begin(), c->sentences().end(),
std::back_inserter(sentence_buf_));
} else {
more = false;
}
}
if (sentence_buf_.empty()) {
return Sentence::Ptr();
} else {
Sentence::Ptr s = sentence_buf_.front();
sentence_buf_.pop_front();
return s;
}
}
boost::shared_ptr<Chunk> BufferedChunkReader::get_next_chunk()
{
ensure_more();
if (chunk_buf_.empty()) {
return boost::shared_ptr<Chunk>();
} else {
boost::shared_ptr<Chunk> t = chunk_buf_.front();
chunk_buf_.pop_front();
return t;
}
}
BufferedSentenceReader::BufferedSentenceReader(const Tagset& tagset)
: TokenReader(tagset), chunkify_(true)
, sentence_buf_(), token_buf_()
{
}
Token* BufferedSentenceReader::get_next_token()
{
bool more = true;
while (token_buf_.empty() && more) {
Sentence::Ptr s = get_next_sentence();
if (s) {
std::copy(s->tokens().begin(), s->tokens().end(),
std::back_inserter(token_buf_));
s->release_tokens();
} else {
more = false;
}
}
if (token_buf_.empty()) {
return NULL;
} else {
Token* t = token_buf_.front();
token_buf_.pop_front();
return t;
}
}
Sentence::Ptr BufferedSentenceReader::get_next_sentence()
{
if (sentence_buf_ != NULL) {
Sentence::Ptr s = sentence_buf_;
sentence_buf_.reset();
return s;
} else {
return actual_next_sentence();
}
}
boost::shared_ptr<Chunk> BufferedSentenceReader::get_next_chunk()
{
Sentence::Ptr s = get_next_sentence();
if (!s) {
return boost::shared_ptr<Chunk>();
} else {
boost::shared_ptr<Chunk> c = boost::make_shared<Chunk>();
c->append(s);
s = get_next_sentence();
while (s && (!chunkify_ || s->first_token()->wa() !=
PwrNlp::Whitespace::ManyNewlines)) {
c->append(s);
s = get_next_sentence();
}
if (s) {
sentence_buf_ = s;
}
return c;
}
}
} /* end ns Corpus2 */