#include <fstream> #include <boost/algorithm/string.hpp> #include <libcorpus2/exception.h> #include <libcorpus2_whole/io/documentcorpusreader.h> #include <libcorpus2_whole/io/cclrelreader.h> namespace Corpus2 { namespace whole { DocumentCorpusReader::DocumentCorpusReader(const Tagset& tagset) : tagset_(tagset) { // } boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_file_path) { std::string line; std::string ann_path, rel_path; boost::shared_ptr<CclRelReader> doc_reader; std::ifstream corpus_file(corpus_file_path.c_str()); if (!corpus_file) { throw Corpus2Error(corpus_file_path + " file not found!"); } boost::shared_ptr<Corpus> corpus = boost::make_shared<Corpus>(corpus_file_path); while(getline(corpus_file, line)) { // split line by semicolon std::vector<std::string> splitted_line; boost::split(splitted_line, line, boost::is_any_of(";")); if (splitted_line.empty()) { // maybe exception? continue; } else if (splitted_line.size() == 1) { throw Corpus2Error("DocumentReader requires both paths to relations and annotations"); } ann_path = splitted_line[0]; rel_path = splitted_line[1]; doc_reader = boost::shared_ptr<CclRelReader>( new CclRelReader(this->tagset_, ann_path, rel_path)); corpus->add_document(doc_reader->read()); } return corpus; } } // whole ns } // Corpus2 ns