From cadbb954c79ba10db6fea4f7ad309e3bc9ec7886 Mon Sep 17 00:00:00 2001 From: Lukasz Bilenkij <lukasz.bilenkij@gmail.com> Date: Tue, 7 Feb 2012 15:26:30 +0100 Subject: [PATCH] new DocumentReader --- libcorpus2_whole/CMakeLists.txt | 4 +- libcorpus2_whole/io/cclrelreader.cpp | 93 +++++++++++++ libcorpus2_whole/io/cclrelreader.h | 102 ++++++++++++++ libcorpus2_whole/io/documentcorpusreader.cpp | 8 +- libcorpus2_whole/io/documentreader.cpp | 130 ++++++++---------- libcorpus2_whole/io/documentreader.h | 100 +++----------- .../io/poliqarpdocumentreader.cpp | 4 + swig/cclrelreader.i | 50 +++++++ swig/corpus2.i | 1 + swig/documentreader.i | 44 ++---- 10 files changed, 345 insertions(+), 191 deletions(-) create mode 100644 libcorpus2_whole/io/cclrelreader.cpp create mode 100644 libcorpus2_whole/io/cclrelreader.h create mode 100644 swig/cclrelreader.i diff --git a/libcorpus2_whole/CMakeLists.txt b/libcorpus2_whole/CMakeLists.txt index 8312adf..0e07fa9 100644 --- a/libcorpus2_whole/CMakeLists.txt +++ b/libcorpus2_whole/CMakeLists.txt @@ -11,9 +11,11 @@ SET(libcorpus2_whole_SRC relation.cpp io/reader_i.h io/relreader.cpp - io/documentreader.cpp + io/cclrelreader.cpp io/documentcorpusreader.cpp io/corpusreader.cpp + io/documentreader.cpp + ) if(CORPUS2_BUILD_POLIQARP) diff --git a/libcorpus2_whole/io/cclrelreader.cpp b/libcorpus2_whole/io/cclrelreader.cpp new file mode 100644 index 0000000..ae69a8b --- /dev/null +++ b/libcorpus2_whole/io/cclrelreader.cpp @@ -0,0 +1,93 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. 
+ + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. +*/ + +#include <boost/make_shared.hpp> +#include <libcorpus2_whole/io/cclrelreader.h> + +namespace Corpus2 { +namespace whole { + CclRelReader::CclRelReader(const Tagset& tagset, + const std::string &annot_path, const std::string &rela_path) + : DocumentReaderI("document") + { + make_readers(tagset, annot_path, rela_path); + make_id_doc(annot_path, rela_path); + } + + void CclRelReader::make_readers(const Tagset& tagset, + const std::string &annot_path, const std::string &rela_path) + { + ccl_reader_ = boost::make_shared<CclReader>(tagset, annot_path); + // prevent the underlying CCL reader from complaining about + // relation XML tags unknown to the reader itself + // (in case annot_path and rela_path point to the same file) + ccl_reader_->set_option("no_warn_unexpected_xml"); + rel_reader_ = boost::make_shared<RelationReader>(rela_path); + } + + void CclRelReader::make_id_doc(const std::string &annot_path, + const std::string &rela_path) + { + id_ = (annot_path + ";" + rela_path); + } + + boost::shared_ptr<Document> CclRelReader::read() + { + boost::shared_ptr<Chunk> chunk; + boost::shared_ptr<Document> document = boost::make_shared<Document>(id_); + + // Read the CCL input chunk by chunk; every chunk becomes a paragraph + while (1) { + chunk = ccl_reader_->get_next_chunk(); + if (!chunk) { + break; + } + else { + document->add_paragraph(chunk); + } + } + + // Read the relations and add them to the document + const std::vector< boost::shared_ptr<Relation> > relations = + rel_reader_->relations(); + for (unsigned int i = 0; i < relations.size(); i++) { + document->add_relation(relations[i]); + } + + return document; + } + + void CclRelReader::set_option(const std::string& option) + { + if (option == "autogen_sent_id") { + 
ccl_reader_->set_option("autogen_sent_id"); + } else if (option == "autogen_chunk_id") { + ccl_reader_->set_option("autogen_chunk_id"); + } + } + + std::string CclRelReader::get_option(const std::string& option) const { + if (option == "autogen_sent_id") { + return ccl_reader_->get_option("autogen_sent_id"); + } + else if (option == "autogen_chunk_id") { + return ccl_reader_->get_option("autogen_chunk_id"); + } + return ""; + } + +} // whole ns +} // Corpus2 ns diff --git a/libcorpus2_whole/io/cclrelreader.h b/libcorpus2_whole/io/cclrelreader.h new file mode 100644 index 0000000..cd19f68 --- /dev/null +++ b/libcorpus2_whole/io/cclrelreader.h @@ -0,0 +1,102 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. +*/ + +#ifndef LIBCORPUS2_WHOLE_DOCREADER_H +#define LIBCORPUS2_WHOLE_DOCREADER_H + +#include <libcorpus2/io/cclreader.h> +#include <libcorpus2_whole/relation.h> +#include <libcorpus2_whole/document.h> +#include <libcorpus2_whole/io/relreader.h> +#include <libcorpus2_whole/io/reader_i.h> + +#include <boost/shared_ptr.hpp> + +namespace Corpus2 { +namespace whole { + +/** + * A reader for whole documents. Note that a whole document is read into memory + * before any processing may take place. 
+ */ +class CclRelReader : public DocumentReaderI { +public: + /** + * Reads a whole document, using the two given paths: the morphosyntax and + * chunk-style annotations are read from annot_path, while relations + * between chunk-style annotations are read from rela_path. + * Both paths may in particular point to the same file. + * @param tagset Tagset to use + * @param annot_path Path to file with morphosyntax and chunk-style annotations + * @param rela_path path to file with relations + */ + CclRelReader(const Tagset& tagset, + const std::string &annot_path, + const std::string &rela_path); + + /** + * Reads document stored in given file(s), in file with morphosyntax and + * chunk-style annotations and from file with relations. + * @return Pointer to the Document that was read + */ + boost::shared_ptr<Document> read(); + + /** + * Sets an option on the underlying CCL reader; other options are ignored. + * Recognised options: autogen_sent_id (automatic generation of sentence + * identifiers) and autogen_chunk_id. + */ + void set_option(const std::string& option); + + /** + * @return the CCL reader's value for autogen_sent_id / autogen_chunk_id, "" for any other option + */ + std::string get_option(const std::string& option) const; + +private: + /** + * Makes CclReader and RelationReader for given paths to files. 
+ * @param annot_path Path to file with morphosyntax and chunk-style annotations + * @param tagset Tagset to use in CclReader + * @param rela_path path to file with relations + */ + void make_readers( + const Tagset& tagset, + const std::string &annot_path, + const std::string &rela_path); + + /** + * Based on given paths (annotations and relations) makes document identifier. + * The identifier (annot_path + ";" + rela_path) is stored in the id_ member. + */ + void make_id_doc(const std::string &annot_path, + const std::string &rela_path); + + // ------------------------------------------------------------------------- + /// Pointer to CclReader + boost::shared_ptr<CclReader> ccl_reader_; + + /// Pointer to RelationReader + boost::shared_ptr<RelationReader> rel_reader_; + + /// Future document identifier + std::string id_; +}; + +} // whole ns +} // Corpus2 ns + +#endif // LIBCORPUS2_WHOLE_DOCREADER_H diff --git a/libcorpus2_whole/io/documentcorpusreader.cpp b/libcorpus2_whole/io/documentcorpusreader.cpp index a85e97c..4a7b0d8 100644 --- a/libcorpus2_whole/io/documentcorpusreader.cpp +++ b/libcorpus2_whole/io/documentcorpusreader.cpp @@ -3,7 +3,7 @@ #include <libcorpus2/exception.h> #include <libcorpus2_whole/io/documentcorpusreader.h> -#include <libcorpus2_whole/io/documentreader.h> +#include <libcorpus2_whole/io/cclrelreader.h> namespace Corpus2 { namespace whole { @@ -17,7 +17,7 @@ boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_f { std::string line; std::string ann_path, rel_path; - boost::shared_ptr<DocumentReader> doc_reader; + boost::shared_ptr<CclRelReader> doc_reader; std::ifstream corpus_file(corpus_file_path.c_str()); if (!corpus_file) { @@ -41,8 +41,8 @@ boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_f ann_path = splitted_line[0]; rel_path = splitted_line[1]; - doc_reader = boost::shared_ptr<DocumentReader>( - new DocumentReader(this->tagset_, ann_path, rel_path)); + doc_reader = 
boost::shared_ptr<CclRelReader>( + new 
CclRelReader(this->tagset_, ann_path, rel_path)); corpus->add_document(doc_reader->read()); } diff --git a/libcorpus2_whole/io/documentreader.cpp b/libcorpus2_whole/io/documentreader.cpp index c30a4fb..a3ebce5 100644 --- a/libcorpus2_whole/io/documentreader.cpp +++ b/libcorpus2_whole/io/documentreader.cpp @@ -1,93 +1,81 @@ -/* - Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia - Part of the libcorpus2 project - This program is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation; either version 3 of the License, or (at your option) -any later version. +#include <boost/algorithm/string.hpp> +#include <libcorpus2_whole/io/documentreader.h> +#include <libcorpus2_whole/io/cclrelreader.h> - This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. - See the LICENSE and COPYING files for more details. 
-*/ +#include <libcorpus2_whole/io/poliqarpdocumentreader.h> +#include <libcorpus2/exception.h> #include <boost/make_shared.hpp> -#include <libcorpus2_whole/io/documentreader.h> namespace Corpus2 { -namespace whole { - DocumentReader::DocumentReader(const Tagset& tagset, - const std::string &annot_path, const std::string &rela_path) - : DocumentReaderI("document") - { - make_readers(tagset, annot_path, rela_path); - make_id_doc(annot_path, rela_path); - } +namespace whole{ - void DocumentReader::make_readers(const Tagset& tagset, - const std::string &annot_path, const std::string &rela_path) +DocumentReader::DocumentReader(const Tagset& tagset, const std::string& corpus_type, const std::string& corpus_file_path) + : corpus_type_(corpus_type), tagset_(tagset), corpus_path_(corpus_file_path) +{ /* corpus_type selects the backend: "poliqarp" wraps a PoliqarpDocumentReader, "document" opens a list file read line by line; anything else throws Corpus2Error */ + if (corpus_type_ == "poliqarp") { - ccl_reader_ = boost::make_shared<CclReader>(tagset, annot_path); - // prevent the underlying CCL reader from complaining about - // relation XML tags unknown to the reader itself - // (in case annot_path and rela_path poin to the same file) - ccl_reader_->set_option("no_warn_unexpected_xml"); - rel_reader_ = boost::make_shared<RelationReader>(rela_path); + reader = boost::shared_ptr<PoliqarpDocumentReader>( + new PoliqarpDocumentReader(tagset_, corpus_path_)); } - - void DocumentReader::make_id_doc(const std::string &annot_path, - const std::string &rela_path) - { - id_ = (annot_path + ";" + rela_path); + else if (corpus_type_ == "document") + { + corpus_file.open(corpus_file_path.c_str()); + if (!corpus_file) { /* fail early: an unopened stream would otherwise make the first read() return the "End" document as if the corpus were empty */ + throw Corpus2Error("Could not open corpus file: " + corpus_file_path); + } } + else + throw Corpus2Error(corpus_type_ + " is an unknown 
reader type!"); - boost::shared_ptr<Document> DocumentReader::read() - { - boost::shared_ptr<Chunk> chunk; - boost::shared_ptr<Document> document = boost::make_shared<Document>(id_); - - // Read ccl document and makes document - while (1) { - chunk = ccl_reader_->get_next_chunk(); - if (!chunk) { - break; - } - else { - document->add_paragraph(chunk); - } - } +} - // Read relations and adds them to the document - const 
std::vector< boost::shared_ptr<Relation> > relations = - rel_reader_->relations(); - for (unsigned int i = 0; i < relations.size(); i++) { - document->add_relation(relations[i]); - } +boost::shared_ptr<Document> DocumentReader::read() +{ /* "poliqarp": delegates to the wrapped reader; "document": consumes one list-file line per call and returns a Document with id "End" once the list is exhausted */ - return document; + std::string line; + if (corpus_type_ == "poliqarp") + { + return this->reader->read(); } - - void DocumentReader::set_option(const std::string& option) + if (corpus_type_ == "document") { - if (option == "autogen_sent_id") { - ccl_reader_->set_option("autogen_sent_id"); - } else if (option == "autogen_chunk_id") { - ccl_reader_->set_option("autogen_chunk_id"); + if (std::getline(corpus_file, line)) + { + return get_cclrel_reader(line)->read(); + } + else + { + return boost::make_shared<Document>("End"); } } + throw Corpus2Error(corpus_type_ + " is an unknown reader type!"); /* not reachable after constructor validation; guarantees no path falls off the end of a non-void function */ +} - std::string DocumentReader::get_option(const std::string& option) const { - if (option == "autogen_sent_id") { - return ccl_reader_->get_option("autogen_sent_id"); - } - else if (option == "autogen_chunk_id") { - return ccl_reader_->get_option("autogen_chunk_id"); - } - return ""; +/* Builds a CclRelReader from one list-file line of the form "annot_path;rela_path"; throws Corpus2Error when the line has no ';' separator. 
*/ +boost::shared_ptr<DocumentReaderI> DocumentReader::get_cclrel_reader(std::string& line) +{ + std::string ann_path, rel_path; + + // split line by semicolon + std::vector<std::string> splitted_line; + boost::split(splitted_line, line, boost::is_any_of(";")); + + if (splitted_line.empty()) { + throw Corpus2Error("Empty line in corpus file!"); + } + else if (splitted_line.size() == 1) { + throw Corpus2Error("CclRelReader requires both paths to relations and annotations"); } + ann_path = splitted_line[0]; + rel_path = splitted_line[1]; + + return boost::shared_ptr<CclRelReader>( + new CclRelReader(tagset_, ann_path, rel_path)); +} + + } // whole ns } // Corpus2 ns diff --git a/libcorpus2_whole/io/documentreader.h b/libcorpus2_whole/io/documentreader.h index 8b092de..9f3753e 100644 --- a/libcorpus2_whole/io/documentreader.h +++ b/libcorpus2_whole/io/documentreader.h @@ -1,102 +1,34 @@ -/* - 
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia - Part of the libcorpus2 project - - This program is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation; either version 3 of the License, or (at your option) -any later version. - - This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. - - See the LICENSE and COPYING files for more details. -*/ - -#ifndef LIBCORPUS2_WHOLE_DOCREADER_H -#define LIBCORPUS2_WHOLE_DOCREADER_H - -#include <libcorpus2/io/cclreader.h> -#include <libcorpus2_whole/relation.h> -#include <libcorpus2_whole/document.h> -#include <libcorpus2_whole/io/relreader.h> +#ifndef LIBCORPUS2_WHOLE_DOCUMENTREADER_H +#define LIBCORPUS2_WHOLE_DOCUMENTREADER_H +#include <fstream> +#include <string> +#include <libcorpus2_whole/corpus.h> #include <libcorpus2_whole/io/reader_i.h> #include <boost/shared_ptr.hpp> - namespace Corpus2 { namespace whole { -/** - * A reader for whole documents. Note that a whole document is read into memory - * before any processing may take place. - */ -class DocumentReader : public DocumentReaderI { +class DocumentReader /* Front end that picks a document source by corpus type and hands out one Document per read() call. */ +{ public: - /** - * Reads a whole document, using the two given path: the morphosyntax and - * chunk-style annotations are read from annot_path, while relations - * between chunk-style annotations are read from rela_path. - * Both path may in particular point to the same path. 
- * @param tagset Tagset to use - * @param annot_path Path to file with morphosyntax and chunk-style annotations - * @param rela_path path to file with relations - */ - DocumentReader(const Tagset& tagset, - const std::string &annot_path, - const std::string &rela_path); + DocumentReader(const Tagset& tagset, const std::string& corpus_type, const std::string& corpus_file_path); /* corpus_type is "poliqarp" or "document"; any other value throws Corpus2Error */ - /** - * Reads document stored in given file(s), in file with morphosyntax and - * chunk-style annotations and from file with relations. - * @return Pointer to readed Document - */ boost::shared_ptr<Document> read(); - /** - * Sets options for readers (relation reader and/or ccl reader). - * Available options: - * - autogen_sent_id -- for automatically generation identifiers of sentences - */ - void set_option(const std::string& option); - - /** - * @return option - */ - std::string get_option(const std::string& option) const; - private: - /** - * Makes CclReader and RelationReader for given paths to files. 
- * @param annot_path Path to file with morphosyntax and chunk-style annotations - * @param tagset Tagset to use in CclReader - * @param rela_path path to file with relations - */ - void make_readers( - const Tagset& tagset, - const std::string &annot_path, - const std::string &rela_path); - /** - * Based on given paths (annotations and relations) makes document identifier - * Document identifier is set to id_ class-state - */ - void make_id_doc(const std::string &annot_path, - const std::string &rela_path); - - // ------------------------------------------------------------------------- - /// Pointer to CclReader - boost::shared_ptr<CclReader> ccl_reader_; + boost::shared_ptr<DocumentReaderI> get_cclrel_reader(std::string& line); /* builds a CclRelReader from one "annot_path;rela_path" line */ + boost::shared_ptr<DocumentReaderI> reader; /* set only for the "poliqarp" corpus type */ + std::ifstream corpus_file; /* opened only for the "document" corpus type */ + const std::string corpus_type_; + const Tagset& tagset_; /* stored by reference: the Tagset must outlive this reader */ + const std::string corpus_path_; - /// Pointer to RelationReader - boost::shared_ptr<RelationReader> rel_reader_; - 
/// Future document identifier - std::string id_; }; } // whole ns } // Corpus2 ns -#endif // LIBCORPUS2_WHOLE_DOCREADER_H +#endif // LIBCORPUS2_WHOLE_DOCUMENTREADER_H diff --git a/libcorpus2_whole/io/poliqarpdocumentreader.cpp b/libcorpus2_whole/io/poliqarpdocumentreader.cpp index 8e41832..b52ddac 100644 --- a/libcorpus2_whole/io/poliqarpdocumentreader.cpp +++ b/libcorpus2_whole/io/poliqarpdocumentreader.cpp @@ -19,6 +19,10 @@ boost::shared_ptr<Document> PoliqarpDocumentReader::read() document = boost::make_shared<Document>(); document->add_paragraph(chunk); } + else + { + document = boost::make_shared<Document>("End"); + } return document; } diff --git a/swig/cclrelreader.i b/swig/cclrelreader.i new file mode 100644 index 0000000..d18d751 --- /dev/null +++ b/swig/cclrelreader.i @@ -0,0 +1,50 @@ +#ifndef SWIG_LIBCORPUS2_CCLRELREADER_I +#define SWIG_LIBCORPUS2_CCLRELREADER_I + +%module libcorpuscclrelreader +%{ + #include <libcorpus2_whole/io/cclrelreader.h> +%} + +%include "exception.i" +%include "document.i" +%include "boost_shared_ptr.i" + +namespace Corpus2 { +namespace whole { + class CclRelReader { + public: + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + CclRelReader(const Tagset&, const std::string&, const std::string &); + + %exception { + try { + $action + } catch (PwrNlp::PwrNlpError &e) { + PyErr_SetString(PyExc_IndexError, e.info().c_str()); + return NULL; + } + } + boost::shared_ptr<Document> read(); + + void set_option(const std::string& option); + std::string get_option(const std::string& option) const; + + /* --------------------------------------------------------------------- */ + ~CclRelReader(); + }; +} // whole ns +} // Corpus2 ns + +using namespace std; +using namespace Corpus2; +using namespace Corpus2::whole; + +#endif /* SWIG_LIBCORPUS2_CCLRELREADER_I */ diff --git a/swig/corpus2.i b/swig/corpus2.i index 40b5f85..cf47ba4 100644 --- 
a/swig/corpus2.i +++ b/swig/corpus2.i @@ -27,6 +27,7 @@ %include "document.i" %include "corpus.i" %include "relationreader.i" +%include "cclrelreader.i" %include "documentreader.i" %include "corpusreader.i" diff --git a/swig/documentreader.i b/swig/documentreader.i index 2d1258b..bafc3ea 100644 --- a/swig/documentreader.i +++ b/swig/documentreader.i @@ -1,44 +1,34 @@ -#ifndef SWIG_LIBCORPUS2_DOCUMENTREADER_I -#define SWIG_LIBCORPUS2_DOCUMENTREADER_I +#ifndef SWIG_LIBCORPUS2_WHOLE_DOCUMENTREADER_I +#define SWIG_LIBCORPUS2_WHOLE_DOCUMENTREADER_I -%module libcorpusdocumentreader +%module libcorpusdocument %{ #include <libcorpus2_whole/io/documentreader.h> %} -%include "exception.i" -%include "document.i" -%include "boost_shared_ptr.i" +%include "std_defs.i" +%include "tagset.i" +%include "corpus.i" + +%template(DocumentReaderPtr) boost::shared_ptr<Corpus2::whole::DocumentReader>; +%template(ConstDocumentReaderPtr) boost::shared_ptr<const Corpus2::whole::DocumentReader>; + +%template(DocumentReaderPtrVector) std::vector<boost::shared_ptr<Corpus2::whole::DocumentReader> >; namespace Corpus2 { namespace whole { class DocumentReader { public: %exception { try { $action } catch (PwrNlp::PwrNlpError &e) { PyErr_SetString(PyExc_IndexError, e.info().c_str()); return NULL; } } - DocumentReader(const Tagset&, const std::string&, const std::string &); - - %exception { - try { - $action - } catch (PwrNlp::PwrNlpError &e) { - PyErr_SetString(PyExc_IndexError, e.info().c_str()); - return NULL; - } - } + DocumentReader(const Tagset& tagset, const std::string& corpus_type,const std::string& corpus_file); /* throws for an unknown corpus_type; the %exception handler kept above stays in force for this constructor and for read() */ boost::shared_ptr<Document> read(); - - void set_option(const std::string& option); - std::string get_option(const std::string& option) const; - - /* --------------------------------------------------------------------- */ - ~DocumentReader(); }; } // whole ns } // 
Corpus2 ns @@ -47,4 +37,4 @@ using namespace std; using namespace Corpus2; using namespace Corpus2::whole; -#endif /* 
SWIG_LIBCORPUS2_DOCUMENTREADER_I */ +#endif /* SWIG_LIBCORPUS2_WHOLE_DOCUMENTREADER_I */ -- GitLab