diff --git a/libcorpus2_whole/io/corpusreader.cpp b/libcorpus2_whole/io/corpusreader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e53a7a88484daa129017074e08ffc75814a60c5
--- /dev/null
+++ b/libcorpus2_whole/io/corpusreader.cpp
@@ -0,0 +1,38 @@
+#include <libcorpus2_whole/io/corpusreader.h>
+#include <libcorpus2_whole/io/poliqarpcorpusreader.h>
+#include <libcorpus2_whole/io/documentcorpusreader.h>
+
+namespace Corpus2 {
+namespace whole{
+
+// Stores a copy of the corpus type name and a reference to the tagset;
+// the concrete reader is created later, on each read() call.
+CorpusReader::CorpusReader(const Tagset& tagset, const std::string& corpus_type)
+    : corpus_type_(corpus_type), tagset_(tagset)
+{
+}
+
+// Delegates to the reader selected by corpus_type_; throws Corpus2Error
+// (from get_corpus_reader_by_type) when the type is not recognised.
+boost::shared_ptr<Corpus> CorpusReader::read(const std::string& corpus_file_path)
+{
+    boost::shared_ptr<CorpusReaderI> reader = this->get_corpus_reader_by_type();
+    return reader->read(corpus_file_path);
+}
+
+// Maps the corpus type name ("poliqarp" or "document") onto a freshly
+// allocated reader; any other name is reported via Corpus2Error.
+boost::shared_ptr<CorpusReaderI> CorpusReader::get_corpus_reader_by_type()
+{
+    if (corpus_type_ == "poliqarp") {
+        return boost::shared_ptr<PoliqarpCorpusReader>(
+            new PoliqarpCorpusReader(tagset_));
+    } else if (corpus_type_ == "document") {
+        return boost::shared_ptr<DocumentCorpusReader>(
+            new DocumentCorpusReader(tagset_));
+    }
+    throw Corpus2Error(corpus_type_ + " is unknown reader type!");
+}
+
+} // whole ns
+} // Corpus2 ns
diff --git a/libcorpus2_whole/io/corpusreader.h b/libcorpus2_whole/io/corpusreader.h
new file mode 100644
index 0000000000000000000000000000000000000000..633ecbb83a241fe44fc302d13b619a55cd4975f0
--- /dev/null
+++ b/libcorpus2_whole/io/corpusreader.h
@@ -0,0 +1,58 @@
+#ifndef LIBCORPUS2_WHOLE_CORPUSREADER_H
+#define LIBCORPUS2_WHOLE_CORPUSREADER_H
+
+#include <string>
+#include <libcorpus2_whole/corpus.h>
+#include <libcorpus2_whole/io/reader_i.h>
+
+namespace Corpus2 {
+namespace whole {
+
+/**
+ * Front-end that picks a concrete corpus reader by type name and
+ * forwards read requests to it.
+ */
+class CorpusReader
+{
+public:
+    /**
+     * @arg tagset Tagset handed to the concrete reader. Only a reference
+     *      is kept, so the tagset must outlive this object.
+     * @arg corpus_type may be:
+     *  - document (contains relations)
+     *  - poliqarp
+     * The name is copied; it is validated on read(), not here.
+     */
+    CorpusReader(const Tagset& tagset, const std::string& corpus_type);
+
+    /**
+     * Reads corpus from given path
+     * @arg corpus_file Path to a file that contains paths to corpus files.
+     * Depending on corpus type, each line in this file should contain only
+     * the path to one document from the corpus or (in particular DocReader)
+     * paths to annotations and relations (in one line, first is the path to
+     * annotations and second to relations -- these paths should be
+     * separated by a semicolon)
+     * @return The corpus that was read
+     * @throw Corpus2Error if the corpus type is not a known reader type
+     */
+    boost::shared_ptr<Corpus> read(const std::string& corpus_file);
+
+private:
+    /// Returns reader based on corpus type (poliqarp/document)
+    boost::shared_ptr<CorpusReaderI> get_corpus_reader_by_type();
+
+private:
+    /// Type of corpus, set only once in constructor. Held by value: a
+    /// reference member would dangle when the constructor is given a
+    /// temporary (e.g. a string literal converted to std::string).
+    const std::string corpus_type_;
+
+    /// Tagset to use, set only once in constructor (not owned)
+    const Tagset& tagset_;
+};
+
+} // whole ns
+} // Corpus2 ns
+
+#endif // LIBCORPUS2_WHOLE_CORPUSREADER_H