Skip to content
Snippets Groups Projects
Commit e2aa981d authored by Paweł Kędzia's avatar Paweł Kędzia
Browse files

Added DocumentCorpusReader

parent 6e7affbe
Branches
No related merge requests found
......@@ -2,70 +2,53 @@
#include <boost/algorithm/string.hpp>
#include <libcorpus2/exception.h>
#include <libcorpus2_whole/io/corpusreader.h>
#include <libcorpus2_whole/io/documentcorpusreader.h>
#include <libcorpus2_whole/io/documentreader.h>
namespace Corpus2 {
namespace whole {
CorpusReader::CorpusReader(const Tagset& tagset, const std::string& corpus_type)
: tagset_(tagset), corpus_type_(corpus_type)
DocumentCorpusReader::DocumentCorpusReader(const Tagset& tagset) : tagset_(tagset)
{
//
}
boost::shared_ptr<Corpus> CorpusReader::read(const std::string& corpus_file_path)
boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_file_path)
{
std::string line;
std::string ann_path, rel_path;
boost::shared_ptr<DocumentReader> doc_reader;
std::ifstream corpus_file(corpus_file_path.c_str());
if (!corpus_file) {
throw Corpus2Error(corpus_file_path + " file not found!");
}
boost::shared_ptr<Corpus> corpus = boost::make_shared<Corpus>(corpus_file_path);
while(getline(corpus_file, line)) {
boost::shared_ptr<DocumentReaderI> reader;
std::string ann_path, rel_path;
// split line by semicolon
std::vector<std::string> splitted_line;
boost::split(splitted_line, line, boost::is_any_of(";"));
if (splitted_line.empty()) {
// maybe exception?
continue;
}
else if (splitted_line.size() == 1) {
ann_path = splitted_line[0];
rel_path = "";
}
else {
ann_path = splitted_line[0];
rel_path = splitted_line[1];
// maybe exception?
continue;
}
reader = this->get_reader_by_type(this->corpus_type_, ann_path, rel_path);
boost::shared_ptr<Document> document = reader->read();
corpus->add_document(document);
}
return corpus;
}
ann_path = splitted_line[0];
rel_path = splitted_line[1];
//
boost::shared_ptr<DocumentReaderI> CorpusReader::get_reader_by_type(
const std::string &type,
const std::string &ann_path,
const std::string &rel_path)
{
/*if (type == "poliqarp") {
static boost::shared_ptr<PoliqarpDocumentReader> pq_doc_reader;
if (!pq_doc_reader) {
pq_doc_reader = boost::shared_ptr<PoliqarpDocumentReader>(
new PoliqarpDocumentReader(this->tagset_, ann_path));
}
return pq_doc_reader;
} else */if (type == "document") {
return boost::shared_ptr<DocumentReader>(
doc_reader = boost::shared_ptr<DocumentReader>(
new DocumentReader(this->tagset_, ann_path, rel_path));
corpus->add_document(doc_reader->read());
}
throw Corpus2Error(type + " is unknown reader type!");
return corpus;
}
} // whole ns
......
#ifndef LIBCORPUS2_WHOLE_CORPUSREADER_H
#define LIBCORPUS2_WHOLE_CORPUSREADER_H
#ifndef LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H
#define LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H
#include <string>
#include <boost/shared_ptr.hpp>
#include <libcorpus2_whole/io/docreaderi.h>
#include <libcorpus2_whole/io/docreader.h>
#include <libcorpus2_whole/corpus.h>
#include <libcorpus2_whole/io/reader_i.h>
namespace Corpus2 {
namespace whole {
/**
* CorpusReader is a corpus-like reader
* Reads a "document-like corpus"
* Reads all documents in given file and returns their Corpus representation
*/
class CorpusReader
class DocumentCorpusReader : public CorpusReaderI
{
public:
/**
* @arg corpus_type may be:
* - document (contains relations)
* - poliqarp
*/
CorpusReader(const Tagset& tagset, const std::string& corpus_type);
DocumentCorpusReader(const Tagset& tagset);
/**
* Reads corpus from given path
* @arg corpus_file Path to a file that contains paths to corpus files.
* Depending on the corpus type, each line in this file should contain either
* the path to a single document from the corpus or (in particular for DocReader)
* the paths to annotations and relations (on one line: the first path is to the
* annotations and the second to the relations, separated by a semicolon).
* Each line in the given corpus file contains two paths, to relations
* and annotations (on one line: the first path is to the annotations
* and the second is to the relations -- these paths should be separated
* by a semicolon)
* @return The corpus that was read
*/
boost::shared_ptr<Corpus> read(const std::string& corpus_file);
protected:
/// Tagset to use; set only once, in the constructor
const Tagset& tagset_;
/// Type of corpus; set only once, in the constructor
const std::string& corpus_type_;
/**
* Sets options for readers (relation reader and/or ccl reader).
* Available options:
* - autogen_sent_id -- for automatic generation of sentence identifiers
*/
void set_option(const std::string& option);
private:
/// Returns reader based on corpus type (poliqarp/document)
boost::shared_ptr<DocumentReaderI> get_reader_by_type(
const std::string &type,
const std::string &ann_path,
const std::string &rel_path = "");
const Tagset& tagset_;
};
} // whole ns
} // Corpus2 ns
#endif // LIBCORPUS2_WHOLE_CORPUSREADER_H
#endif // LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment