Skip to content
Snippets Groups Projects
Commit e2aa981d authored by Paweł Kędzia
Browse files

Added DocumentCorpusReader

parent 6e7affbe
No related branches found
No related tags found
No related merge requests found
......@@ -2,70 +2,53 @@
#include <boost/algorithm/string.hpp>
#include <libcorpus2/exception.h>
#include <libcorpus2_whole/io/corpusreader.h>
#include <libcorpus2_whole/io/documentcorpusreader.h>
#include <libcorpus2_whole/io/documentreader.h>
namespace Corpus2 {
namespace whole {
// NOTE(review): this is a rendered diff with removed and added lines
// interleaved and no +/- markers. The two-argument CorpusReader constructor
// (together with its initializer list on the following line) appears to be the
// removed version and the DocumentCorpusReader constructor the added one --
// confirm against the actual commit.
//
// Both versions only initialize the tagset_ member from the tagset argument
// (the two-argument one also initializes corpus_type_); the body does nothing.
CorpusReader::CorpusReader(const Tagset& tagset, const std::string& corpus_type)
: tagset_(tagset), corpus_type_(corpus_type)
DocumentCorpusReader::DocumentCorpusReader(const Tagset& tagset) : tagset_(tagset)
{
// intentionally empty: all work is done in the initializer list
}
// NOTE(review): this region is a rendered diff with removed and added lines
// interleaved and no +/- markers, so it is not compilable as shown. It appears
// to contain the removed CorpusReader::read, the removed
// CorpusReader::get_reader_by_type and the added DocumentCorpusReader::read --
// confirm against the actual commit before editing.
//
// read(): opens the list file at corpus_file_path and, for each line, splits
// it on ';' into an annotations path (first field) and a relations path
// (second field), reads the document and adds it to a Corpus created from
// corpus_file_path, which is returned.
// Throws Corpus2Error when the list file stream cannot be opened.
boost::shared_ptr<Corpus> CorpusReader::read(const std::string& corpus_file_path)
boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_file_path)
{
std::string line;
std::string ann_path, rel_path;
boost::shared_ptr<DocumentReader> doc_reader;
std::ifstream corpus_file(corpus_file_path.c_str());
// The stream tests false when it could not be opened; the message says
// "file not found" for any such failure.
if (!corpus_file) {
throw Corpus2Error(corpus_file_path + " file not found!");
}
boost::shared_ptr<Corpus> corpus = boost::make_shared<Corpus>(corpus_file_path);
while(getline(corpus_file, line)) {
// NOTE(review): the next two declarations duplicate the ones made before the
// loop; presumably they are the removed per-iteration versions -- confirm.
boost::shared_ptr<DocumentReaderI> reader;
std::string ann_path, rel_path;
// split line by semicolon
std::vector<std::string> splitted_line;
boost::split(splitted_line, line, boost::is_any_of(";"));
// NOTE(review): boost::split normally yields at least one (possibly empty)
// token, so this branch may never be taken -- confirm.
if (splitted_line.empty()) {
// maybe exception?
continue;
}
// Only one field on the line (no ';'). Lines from both versions are shown:
// the assignments use the single field as the annotations path with an empty
// relations path, while the trailing "continue" skips the line instead.
// NOTE(review): presumably the assignments were removed and the skip added
// -- confirm.
else if (splitted_line.size() == 1) {
ann_path = splitted_line[0];
rel_path = "";
// maybe exception?
continue;
}
// Two or more fields: the first two are used, any further fields are ignored.
else {
ann_path = splitted_line[0];
rel_path = splitted_line[1];
}
// NOTE(review): these three statements dispatch through get_reader_by_type
// using corpus_type_; they look like the removed way of reading a document.
reader = this->get_reader_by_type(this->corpus_type_, ann_path, rel_path);
boost::shared_ptr<Document> document = reader->read();
corpus->add_document(document);
}
return corpus;
}
//
// get_reader_by_type(): returns a reader for the given corpus type. Only
// "document" is handled by live code (the "poliqarp" branch below is commented
// out); any other type throws Corpus2Error.
// NOTE(review): appears to be removed by this commit; the tail of the added
// DocumentCorpusReader::read is interleaved into its body -- confirm.
boost::shared_ptr<DocumentReaderI> CorpusReader::get_reader_by_type(
const std::string &type,
const std::string &ann_path,
const std::string &rel_path)
{
/*if (type == "poliqarp") {
static boost::shared_ptr<PoliqarpDocumentReader> pq_doc_reader;
if (!pq_doc_reader) {
pq_doc_reader = boost::shared_ptr<PoliqarpDocumentReader>(
new PoliqarpDocumentReader(this->tagset_, ann_path));
}
return pq_doc_reader;
} else */if (type == "document") {
// NOTE(review): the "return ...(" line looks like the head of the removed
// return expression; the line after it looks like the added statement that
// builds a DocumentReader from tagset_, ann_path and rel_path, whose result is
// then read and added to the corpus.
return boost::shared_ptr<DocumentReader>(
doc_reader = boost::shared_ptr<DocumentReader>(
new DocumentReader(this->tagset_, ann_path, rel_path));
corpus->add_document(doc_reader->read());
}
throw Corpus2Error(type + " is unknown reader type!");
return corpus;
}
} // whole ns
......
#ifndef LIBCORPUS2_WHOLE_CORPUSREADER_H
#define LIBCORPUS2_WHOLE_CORPUSREADER_H
#ifndef LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H
#define LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H
#include <string>
#include <boost/shared_ptr.hpp>
#include <libcorpus2_whole/io/docreaderi.h>
#include <libcorpus2_whole/io/docreader.h>
#include <libcorpus2_whole/corpus.h>
#include <libcorpus2_whole/io/reader_i.h>
namespace Corpus2 {
namespace whole {
/**
 * NOTE(review): this declaration comes from a rendered diff with removed and
 * added lines interleaved and no +/- markers (two class heads, two
 * constructors, two tagset_ members). Confirm the final state against the
 * actual commit.
 *
 * CorpusReader is a corpus-like reader
 * Reads "document-like corpus"
 * Reads all documents in given file and returns their Corpus representation
 */
class CorpusReader
class DocumentCorpusReader : public CorpusReaderI
{
public:
/**
 * @arg tagset Tagset to use; kept by reference in tagset_
 * @arg corpus_type may be:
 * - document (contains relations)
 * - poliqarp
 */
CorpusReader(const Tagset& tagset, const std::string& corpus_type);
DocumentCorpusReader(const Tagset& tagset);
/**
 * Reads corpus from given path
 * @arg corpus_file Path to a file that contains paths to corpus files.
 * Depending on corpus type, each line in this file should contain only
 * the path to one document from the corpus or paths to (in particular
 * DocReader) relations and annotations (in one line, first is path to
 * annotations and second are relations -- these paths should be separated
 * by semicolon)
 * Each line in given corpus file contains two paths, to relations
 * and annotations (in one line, first is path to annotations
 * and second are relations -- these paths should be separated
 * by semicolon)
 * @return The corpus that was read
 */
boost::shared_ptr<Corpus> read(const std::string& corpus_file);
protected:
/// Tagset to use, set only once in constructor
const Tagset& tagset_;
/// Type of corpus, set only once in constructor
/// NOTE(review): held as a reference to the constructor argument, so the
/// referenced string has to outlive this reader.
const std::string& corpus_type_;
/**
 * Sets options for readers (relation reader and/or ccl reader).
 * Available options:
 * - autogen_sent_id -- for automatic generation of sentence identifiers
 * NOTE(review): no definition of set_option is visible in this view.
 */
void set_option(const std::string& option);
private:
/// Returns reader based on corpus type (poliqarp/document)
boost::shared_ptr<DocumentReaderI> get_reader_by_type(
const std::string &type,
const std::string &ann_path,
const std::string &rel_path = "");
/// Tagset to use (second declaration of tagset_; see the note at the top)
const Tagset& tagset_;
};
} // whole ns
} // Corpus2 ns
#endif // LIBCORPUS2_WHOLE_CORPUSREADER_H
#endif // LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment