From e2aa981d010452540bacfdccf9aa67eb7e75599f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20K=C4=99dzia?= <Pawel.Kedzia@pwr.wroc.pl> Date: Thu, 29 Dec 2011 12:21:49 +0100 Subject: [PATCH] Added DocumentCorpusReader --- libcorpus2_whole/io/documentcorpusreader.cpp | 53 +++++++------------- libcorpus2_whole/io/documentcorpusreader.h | 48 +++++++----------- 2 files changed, 37 insertions(+), 64 deletions(-) diff --git a/libcorpus2_whole/io/documentcorpusreader.cpp b/libcorpus2_whole/io/documentcorpusreader.cpp index 0548556..12d0359 100644 --- a/libcorpus2_whole/io/documentcorpusreader.cpp +++ b/libcorpus2_whole/io/documentcorpusreader.cpp @@ -2,70 +2,53 @@ #include <boost/algorithm/string.hpp> #include <libcorpus2/exception.h> -#include <libcorpus2_whole/io/corpusreader.h> +#include <libcorpus2_whole/io/documentcorpusreader.h> +#include <libcorpus2_whole/io/documentreader.h> namespace Corpus2 { namespace whole { -CorpusReader::CorpusReader(const Tagset& tagset, const std::string& corpus_type) - : tagset_(tagset), corpus_type_(corpus_type) +DocumentCorpusReader::DocumentCorpusReader(const Tagset& tagset) : tagset_(tagset) { // } -boost::shared_ptr<Corpus> CorpusReader::read(const std::string& corpus_file_path) +boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_file_path) { std::string line; + std::string ann_path, rel_path; + boost::shared_ptr<DocumentReader> doc_reader; + std::ifstream corpus_file(corpus_file_path.c_str()); if (!corpus_file) { throw Corpus2Error(corpus_file_path + " file not found!"); } boost::shared_ptr<Corpus> corpus = boost::make_shared<Corpus>(corpus_file_path); - while(getline(corpus_file, line)) { - boost::shared_ptr<DocumentReaderI> reader; - std::string ann_path, rel_path; // split line by semicolon std::vector<std::string> splitted_line; boost::split(splitted_line, line, boost::is_any_of(";")); + if (splitted_line.empty()) { + // maybe exception? 
continue; } else if (splitted_line.size() == 1) { - ann_path = splitted_line[0]; - rel_path = ""; - } - else { - ann_path = splitted_line[0]; - rel_path = splitted_line[1]; + // maybe exception? + continue; } - reader = this->get_reader_by_type(this->corpus_type_, ann_path, rel_path); - boost::shared_ptr<Document> document = reader->read(); - corpus->add_document(document); - } - return corpus; -} + ann_path = splitted_line[0]; + rel_path = splitted_line[1]; -// -boost::shared_ptr<DocumentReaderI> CorpusReader::get_reader_by_type( - const std::string &type, - const std::string &ann_path, - const std::string &rel_path) -{ - /*if (type == "poliqarp") { - static boost::shared_ptr<PoliqarpDocumentReader> pq_doc_reader; - if (!pq_doc_reader) { - pq_doc_reader = boost::shared_ptr<PoliqarpDocumentReader>( - new PoliqarpDocumentReader(this->tagset_, ann_path)); - } - return pq_doc_reader; - } else */if (type == "document") { - return boost::shared_ptr<DocumentReader>( + doc_reader = boost::shared_ptr<DocumentReader>( new DocumentReader(this->tagset_, ann_path, rel_path)); + + corpus->add_document(doc_reader->read()); } - throw Corpus2Error(type + " is unknown reader type!"); + + return corpus; } } // whole ns diff --git a/libcorpus2_whole/io/documentcorpusreader.h b/libcorpus2_whole/io/documentcorpusreader.h index fcf0779..642639a 100644 --- a/libcorpus2_whole/io/documentcorpusreader.h +++ b/libcorpus2_whole/io/documentcorpusreader.h @@ -1,56 +1,46 @@ -#ifndef LIBCORPUS2_WHOLE_CORPUSREADER_H -#define LIBCORPUS2_WHOLE_CORPUSREADER_H +#ifndef LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H +#define LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H #include <string> #include <boost/shared_ptr.hpp> - -#include <libcorpus2_whole/io/docreaderi.h> -#include <libcorpus2_whole/io/docreader.h> #include <libcorpus2_whole/corpus.h> +#include <libcorpus2_whole/io/reader_i.h> namespace Corpus2 { namespace whole { /** - * CorpusReader is a corpus-like reader + * Reads "document-like corpus" + * Reads all
documents in given file and returns their Corpus representation */ -class CorpusReader +class DocumentCorpusReader : public CorpusReaderI { public: - /** - * @arg corpus_type may be: - * - document (contains relations) - * - poliqarp - */ - CorpusReader(const Tagset& tagset, const std::string& corpus_type); + DocumentCorpusReader(const Tagset& tagset); /** * Reads corpus from given path * @arg corpus_file Path to file contains paths to corpus files. - * Depend on corpus type, each line in this file should contains only - * path to one document from corpus or path to (in particular DocReader) - * relations and annotatons (in one line, first is path to annotations - * and second are relations -- these paths, should be separated by semicolon) + * Each line in the given corpus file contains two paths, to relations + * and annotations (in one line, first is path to annotations + * and second is path to relations -- these paths should be separated + * by semicolon) * @return Readed corpus */ boost::shared_ptr<Corpus> read(const std::string& corpus_file); -protected: - /// Tagset to use, sets only onece in constructor - const Tagset& tagset_; - - /// Type of corpus, sets only once in constructor - const std::string& corpus_type_; + /** + * Sets options for readers (relation reader and/or ccl reader). + * Available options: + * - autogen_sent_id -- for automatic generation of sentence identifiers + */ + void set_option(const std::string& option); private: - /// Returns reader based on corpus type (poliqarp/document) - boost::shared_ptr<DocumentReaderI> get_reader_by_type( - const std::string &type, - const std::string &ann_path, - const std::string &rel_path = ""); + const Tagset& tagset_; }; } // whole ns } // Corpus2 ns -#endif // LIBCORPUS2_WHOLE_CORPUSREADER_H +#endif // LIBCORPUS2_WHOLE_DOCUMENTCORPUSREADER_H -- GitLab