diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index 4c2be81282c0caa11820b360d45ce41ca479e024..bc866c345cac128be6d971f5dde97c6a34ce1431 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -41,6 +41,7 @@ SET(libcorpus2_STAT_SRC ann/iob.cpp ann/view.cpp chunk.cpp + document.cpp exception.cpp lexeme.cpp sentence.cpp @@ -53,6 +54,7 @@ SET(libcorpus2_STAT_SRC tokenmetadata.cpp io/cclreader.cpp io/cclwriter.cpp + io/docreader.cpp io/helpers.cpp io/fastxces.cpp io/iob-chan.cpp diff --git a/libcorpus2/document.cpp b/libcorpus2/document.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6cc6372cb821c0b142154652bb78036224e9243a --- /dev/null +++ b/libcorpus2/document.cpp @@ -0,0 +1,31 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/ + +#include <libcorpus2/document.h> +#include <boost/make_shared.hpp> + +namespace Corpus2 { + +Document::Document() + : paragraphs_() +{ +} + +Document::~Document() +{ +} + +} /* end ns Corpus2 */ diff --git a/libcorpus2/document.h b/libcorpus2/document.h new file mode 100644 index 0000000000000000000000000000000000000000..777284a427944e68c744b60f8868d95e37aa5b2d --- /dev/null +++ b/libcorpus2/document.h @@ -0,0 +1,51 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. +*/ + +#ifndef LIBCORPUS2_DOCUMENT_H +#define LIBCORPUS2_DOCUMENT_H + +#include <libcorpus2/chunk.h> +#include <boost/shared_ptr.hpp> + +namespace Corpus2 { + +/** + * A whole document, consisting of consecutive paragraphs ("chunks"), being + * sequences of sentences. + * Usage of this class assumes that a whole document is read into memory before + * any further processing takes place. 
+ */ +class Document +{ +public: + Document(); + ~Document(); + + void add_paragraph(const boost::shared_ptr<Chunk> para) { + paragraphs_.push_back(para); + } + + const std::vector< boost::shared_ptr<Chunk> >& paragraphs() const { + return paragraphs_; + } + +protected: + std::vector< boost::shared_ptr<Chunk> > paragraphs_; +}; + +} /* end ns Corpus2 */ + +#endif // LIBCORPUS2_DOCUMENT_H diff --git a/libcorpus2/io/docreader.cpp b/libcorpus2/io/docreader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..597190c760b465e9f07fb12b8e5772c2c20f73a0 --- /dev/null +++ b/libcorpus2/io/docreader.cpp @@ -0,0 +1,22 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. +*/ + +#include <libcorpus2/io/docreader.h> +#include <boost/make_shared.hpp> + +namespace Corpus2 { + +} /* end ns Corpus2 */ diff --git a/libcorpus2/io/docreader.h b/libcorpus2/io/docreader.h new file mode 100644 index 0000000000000000000000000000000000000000..632def65ca2d630301f04ffab5aa5c07b618d9ef --- /dev/null +++ b/libcorpus2/io/docreader.h @@ -0,0 +1,43 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. 
+ + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. +*/ + +#ifndef LIBCORPUS2_DOCREADER_H +#define LIBCORPUS2_DOCREADER_H + +#include <libcorpus2/io/reader.h> + +namespace Corpus2 { + +/** + * A reader for whole documents. Note that a whole document is read into memory + * before any processing may take place. + */ +class DocumentReader { +public: + /** + * Reads a whole document, using the two given paths: the morphosyntax and + * chunk-style annotations are read from annot_path, while relations + * between chunk-style annotations are read from rela_path. + * Both paths may in particular point to the same file. + * TODO! + */ + DocumentReader(const std::string &annot_path, + const std::string &rela_path, + const std::string &rdr_class_id = "ccl"); +}; +} /* end ns Corpus2 */ + +#endif // LIBCORPUS2_DOCREADER_H