Skip to content
Snippets Groups Projects
Commit 28b6cb24 authored by Paweł Kędzia
Browse files

Added module libcorpus2_whole

parent 58365c06
No related branches found
No related tags found
No related merge requests found
Showing
with 390 additions and 28 deletions
......@@ -41,10 +41,8 @@ SET(libcorpus2_STAT_SRC
ann/iob.cpp
ann/view.cpp
chunk.cpp
document.cpp
exception.cpp
lexeme.cpp
relation.cpp
sentence.cpp
tag.cpp
tagging.cpp
......@@ -55,7 +53,6 @@ SET(libcorpus2_STAT_SRC
tokenmetadata.cpp
io/cclreader.cpp
io/cclwriter.cpp
io/docreader.cpp
io/helpers.cpp
io/fastxces.cpp
io/iob-chan.cpp
......@@ -66,7 +63,6 @@ SET(libcorpus2_STAT_SRC
io/plainwriter.cpp
io/premorphwriter.cpp
io/reader.cpp
io/relreader.cpp
io/rft.cpp
io/sax.cpp
io/statwriter.cpp
......
# Build and install rules for the corpus2_whole library, which is linked
# against corpus2 and libxml++.
project(corpus2_whole)

# libxml++ include/link directories are registered for this directory and the
# libraries are appended to the LIBS list used at link time.
find_package(LibXML++ REQUIRED QUIET)
include_directories(${LibXML++_INCLUDE_DIRS})
link_directories(${LibXML++_LIBRARY_DIRS})
set(LIBS ${LIBS} ${LibXML++_LIBRARIES})

# Sources compiled into the library.
set(libcorpus2_whole_SRC
    corpus.cpp
    corpusreader.cpp
    docreader.cpp
    document.cpp
    poliqarpdocumentreader.cpp
    relation.cpp
    relreader.cpp
    docreaderi.h
)

# Every header below this directory is attached to the target as well.
file(GLOB_RECURSE INCS "*.h")

# Static library on Windows, shared library everywhere else.
if(WIN32)
    set(corpus2_whole_LIB_TYPE STATIC)
else()
    set(corpus2_whole_LIB_TYPE SHARED)
endif()
add_library(corpus2_whole ${corpus2_whole_LIB_TYPE}
    ${libcorpus2_whole_SRC}
    ${INCS}
)
target_link_libraries(corpus2_whole corpus2 ${LIBS})

# Install rules are only defined on UNIX platforms.
if(UNIX)
    # The library itself.
    install(TARGETS corpus2_whole
        LIBRARY DESTINATION lib)

    # Headers from the source tree, skipping VCS and build directories.
    install(
        DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        DESTINATION include
        FILES_MATCHING PATTERN "*.h"
        PATTERN ".svn" EXCLUDE
        PATTERN "bin" EXCLUDE
        PATTERN "build" EXCLUDE
        PATTERN "CMake*" EXCLUDE
    )

    # version.h and config_d.h from the build tree's include directory.
    install(
        DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/
        DESTINATION include
        FILES_MATCHING PATTERN "version.h"
        PATTERN "config_d.h"
    )

    # *.ini and *.tagset data files, leaving out .svn and local/ content.
    install(
        DIRECTORY ${LIBCORPUS2_SRC_DATA_DIR}/
        DESTINATION ${LIBCORPUS2_INSTALL_DATA_DIR}
        FILES_MATCHING PATTERN "*.ini"
        PATTERN "*.tagset"
        PATTERN ".svn" EXCLUDE
        PATTERN "local/*" EXCLUDE
    )
endif()
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <libcorpus2_whole/corpus.h>
namespace Corpus2 {
namespace whole{
/// Creates an empty corpus called `name`; the document cursor starts at the
/// beginning of the (still empty) document list.
Corpus::Corpus(const std::string name)
	: name_(name),
	  documents_(),
	  current_document_(documents_.begin())
{
}
} // whole ns
} // Corpus2 ns
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef LIBCORPUS2_WHOLE_CORPUS_H
#define LIBCORPUS2_WHOLE_CORPUS_H
#include <vector>
#include <boost/shared_ptr.hpp>
#include <libcorpus2_whole/document.h>
namespace Corpus2 {
namespace whole {
/**
* Represents Corpus
*/
class Corpus
{
public:
/**
* Corpus constructor takes one parameter:
* @arg name Name of the corpus - default is empty
*/
Corpus(const std::string name = "");
/**
* Adds one (readed) document to corpus
*/
void add_document(boost::shared_ptr<Document> document) {
this->documents_.push_back(document);
}
/// Returns list of the documents from corpus
const std::vector<boost::shared_ptr<Document> > documents() const {
return this->documents_;
}
/// Next document in corpus
/// @todo I don't know if it'll be working... It should be tested!
boost::shared_ptr<Document> next_document() {
return *(current_document_++);
}
private:
/// Corpus name
const std::string name_;
/// List of the documents in corpus
std::vector<boost::shared_ptr<Document> > documents_;
/// Current document
std::vector<boost::shared_ptr<Document> >::iterator current_document_;
};
} // whole ns
} // Corpus2 ns
#endif // LIBCORPUS2_WHOLE_CORPUS_H
#include <fstream>
#include <boost/algorithm/string.hpp>
#include <libcorpus2/exception.h>
#include <libcorpus2_whole/corpusreader.h>
namespace Corpus2 {
namespace whole {
/// Remembers the tagset and the corpus type; the constructor itself reads
/// nothing, all work happens in read().
CorpusReader::CorpusReader(const Tagset& tagset, const std::string& corpus_type)
	: tagset_(tagset)
	, corpus_type_(corpus_type)
{
}
/**
 * Reads the corpus described by the index file at corpus_file_path.
 *
 * Every non-blank line of the index file describes one document:
 * "annotations_path" or "annotations_path;relations_path". Surrounding
 * whitespace (including a trailing '\r') is stripped from the line and from
 * both paths; fields after the second one are ignored.
 *
 * @return corpus named after corpus_file_path, holding the documents in the
 * order in which they are listed
 * @throws Corpus2Error when the index file cannot be opened, or when the
 * configured corpus type has no reader (see get_reader_by_type())
 */
boost::shared_ptr<Corpus> CorpusReader::read(const std::string& corpus_file_path)
{
	std::ifstream corpus_file(corpus_file_path.c_str());
	if (!corpus_file) {
		throw Corpus2Error(corpus_file_path + " file not found!");
	}

	// Built the same way as in get_reader_by_type(), so this function does
	// not depend on <boost/make_shared.hpp> being included by another header.
	boost::shared_ptr<Corpus> corpus(new Corpus(corpus_file_path));

	std::string line;
	while (std::getline(corpus_file, line)) {
		// boost::split() yields one empty field for an empty string, so a
		// blank line has to be detected before splitting.
		boost::trim(line);
		if (line.empty()) {
			continue;
		}

		// split line by semicolon
		std::vector<std::string> fields;
		boost::split(fields, line, boost::is_any_of(";"));

		std::string ann_path = fields[0];
		std::string rel_path = (fields.size() > 1) ? fields[1] : std::string();
		boost::trim(ann_path);
		boost::trim(rel_path);

		boost::shared_ptr<DocumentReaderI> reader =
			this->get_reader_by_type(this->corpus_type_, ann_path, rel_path);
		corpus->add_document(reader->read());
	}
	return corpus;
}
//
/**
 * Creates the document reader matching the given type name.
 *
 * Only "document" is handled. A "poliqarp" branch (one shared
 * PoliqarpDocumentReader built from tagset_ and ann_path) is not enabled,
 * so that type is rejected like any other unknown name.
 *
 * @throws Corpus2Error for every type other than "document"
 */
boost::shared_ptr<DocumentReaderI> CorpusReader::get_reader_by_type(
	const std::string &type,
	const std::string &ann_path,
	const std::string &rel_path)
{
	if (type != "document") {
		throw Corpus2Error(type + " is unknown reader type!");
	}

	boost::shared_ptr<DocumentReader> doc_reader(
		new DocumentReader(this->tagset_, ann_path, rel_path));
	return doc_reader;
}
} // whole ns
} // Corpus2 ns
#ifndef LIBCORPUS2_WHOLE_CORPUSREADER_H
#define LIBCORPUS2_WHOLE_CORPUSREADER_H
#include <string>
#include <boost/shared_ptr.hpp>
#include <libcorpus2_whole/docreaderi.h>
#include <libcorpus2_whole/corpus.h>
#include <libcorpus2_whole/docreader.h>
namespace Corpus2 {
namespace whole {
/**
 * Reads a whole corpus described by an index file in which every line names
 * one document to load.
 */
class CorpusReader
{
public:
	/**
	 * @arg tagset Tagset handed to the document readers; it is kept by
	 * reference, so it has to outlive this object.
	 * @arg corpus_type Name of the reader used for each document:
	 * - document (annotations plus optional relations)
	 * - poliqarp (listed here, but currently rejected by
	 *   get_reader_by_type())
	 */
	CorpusReader(const Tagset& tagset, const std::string& corpus_type);

	/**
	 * Reads a corpus from the given path.
	 * @arg corpus_file Path to a file that lists the corpus documents, one
	 * per line. Depending on the corpus type a line holds either a single
	 * document path, or (for DocumentReader) the annotations path followed
	 * by the relations path, separated by a semicolon.
	 * @return The corpus that was read
	 */
	boost::shared_ptr<Corpus> read(const std::string& corpus_file);

protected:
	/// Tagset to use, set only once in the constructor
	const Tagset& tagset_;

	/// Type of corpus, set only once in the constructor. Stored by value:
	/// a reference member would dangle whenever the constructor is given a
	/// temporary (for example a string literal).
	const std::string corpus_type_;

private:
	/// Returns a reader based on corpus type (poliqarp/document)
	boost::shared_ptr<DocumentReaderI> get_reader_by_type(
		const std::string &type,
		const std::string &ann_path,
		const std::string &rel_path = "");
};
} // whole ns
} // Corpus2 ns
#endif // LIBCORPUS2_WHOLE_CORPUSREADER_H
......@@ -15,11 +15,12 @@ or FITNESS FOR A PARTICULAR PURPOSE.
*/
#include <boost/make_shared.hpp>
#include <libcorpus2/io/docreader.h>
#include <libcorpus2_whole/docreader.h>
namespace Corpus2 {
/**
 * Creates a reader that reports the type name "document" through its
 * DocumentReaderI base and passes the tagset and both paths on to
 * make_readers().
 * NOTE(review): make_readers() is defined outside this excerpt; presumably it
 * sets up the annotation and relation readers -- confirm.
 */
DocumentReader::DocumentReader(const Tagset& tagset,
const std::string &annot_path, const std::string &rela_path)
: DocumentReaderI("document")
{
make_readers(tagset, annot_path, rela_path);
}
......
......@@ -14,13 +14,14 @@ or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef LIBCORPUS2_DOCREADER_H
#define LIBCORPUS2_DOCREADER_H
// Include guard. Spelled with single underscores only: identifiers containing
// a double underscore are reserved in C++, and this spelling matches the
// comment on the closing #endif.
#ifndef LIBCORPUS2_WHOLE_DOCREADER_H
#define LIBCORPUS2_WHOLE_DOCREADER_H
#include <libcorpus2/relation.h>
#include <libcorpus2/document.h>
#include <libcorpus2_whole/relation.h>
#include <libcorpus2_whole/document.h>
#include <libcorpus2_whole/docreaderi.h>
#include <libcorpus2/io/cclreader.h>
#include <libcorpus2/io/relreader.h>
#include <libcorpus2_whole/relreader.h>
#include <boost/shared_ptr.hpp>
......@@ -30,7 +31,7 @@ namespace Corpus2 {
* A reader for whole documents. Note that a whole document is read into memory
* before any processing may take place.
*/
class DocumentReader {
class DocumentReader : public DocumentReaderI {
public:
/**
* Reads a whole document, using the two given path: the morphosyntax and
......@@ -85,4 +86,4 @@ private:
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_DOCREADER_H
#endif // LIBCORPUS2_WHOLE_DOCREADER_H
#ifndef READERI_H
#define READERI_H
#include <string>
#include <libcorpus2_whole/document.h>
#include <boost/shared_ptr.hpp>
namespace Corpus2 {
/**
 * Reader interface.
 * Stores the reader type name (such as "document" or "poliqarp") and
 * declares read(), which every concrete reader must implement.
 */
class DocumentReaderI
{
public:
	/// @arg type Name identifying the kind of the concrete reader
	DocumentReaderI(const std::string type) : type_(type) {}

	/// Virtual, so that destroying a reader through a DocumentReaderI
	/// pointer runs the destructor of the concrete class.
	virtual ~DocumentReaderI() {}

	/// Returns the type name given at construction
	const std::string& type() const { return type_; }

	/// Reads and returns a document; implemented by the concrete readers
	virtual boost::shared_ptr<Document> read() = 0;

protected:
	/// Reader type name, fixed at construction
	const std::string type_;
};
}
#endif // READERI_H
......@@ -14,7 +14,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <libcorpus2/document.h>
#include <libcorpus2_whole/document.h>
#include <boost/make_shared.hpp>
namespace Corpus2 {
......
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski, Paweł Kędzia
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
......@@ -14,11 +14,11 @@ or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef LIBCORPUS2_DOCUMENT_H
#define LIBCORPUS2_DOCUMENT_H
#ifndef LIBCORPUS2_WHOLE_DOCUMENT_H
#define LIBCORPUS2_WHOLE_DOCUMENT_H
#include <libcorpus2/chunk.h>
#include <libcorpus2/relation.h>
#include <libcorpus2_whole/relation.h>
#include <boost/shared_ptr.hpp>
namespace Corpus2 {
......@@ -65,4 +65,4 @@ protected:
} /* end ns Corpus2 */
#endif // LIBCORPUS2_DOCUMENT_H
#endif // LIBCORPUS2_WHOLE_DOCUMENT_H
#include <libcorpus2_whole/poliqarpdocumentreader.h>
namespace Corpus2 {
namespace whole {
/// Registers the "poliqarp" type name with the base class, stores the corpus
/// path and creates the underlying PoliqarpReader for that path.
PoliqarpDocumentReader::PoliqarpDocumentReader(const Tagset& tagset, const std::string& corpus_path)
	: DocumentReaderI("poliqarp"),
	  corpus_path_(corpus_path),
	  pqr_(new PoliqarpReader(tagset, corpus_path))
{
}
/// Returns a newly created Document to which nothing has been added.
boost::shared_ptr<Document> PoliqarpDocumentReader::read()
{
	// Pulling sentences from pqr_ (get_next_sentence() followed by
	// add_paragraph() on the document) is not enabled yet, so the document
	// is handed back exactly as it was constructed.
	return boost::make_shared<Document>();
}
} // whole ns
} // Corpus2 ns
#ifndef LIBCORPUS2_WHOLE_POLIQARPDOCUMENTREADER_H
#define LIBCORPUS2_WHOLE_POLIQARPDOCUMENTREADER_H
#include <poliqarp/pqreader.h>
#include <libcorpus2_whole/docreaderi.h>
#include <libcorpus2_whole/document.h>
namespace Corpus2 {
namespace whole {
/**
 * Document reader for Poliqarp corpora; identifies itself with the type
 * name "poliqarp" through its DocumentReaderI base.
 */
class PoliqarpDocumentReader : public DocumentReaderI
{
public:
/// Creates the reader for the corpus at corpus_path; a PoliqarpReader for
/// that path is created right away.
PoliqarpDocumentReader(const Tagset& tagset, const std::string& corpus_path);
/// The semantics of this method are meant to be those of get_next_document.
/// NOTE(review): the visible implementation returns a newly created
/// Document without reading anything into it.
boost::shared_ptr<Document> read();
private:
/// Path of the corpus given to the constructor
const std::string corpus_path_;
/// Underlying Poliqarp reader created in the constructor
boost::shared_ptr<PoliqarpReader> pqr_;
};
} // whole ns
} // Corpus2 ns
#endif // LIBCORPUS2_WHOLE_POLIQARPDOCUMENTREADER_H
......@@ -15,7 +15,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
*/
#include <boost/make_shared.hpp>
#include <libcorpus2/relation.h>
#include <libcorpus2_whole/relation.h>
namespace Corpus2 {
......
File moved
......@@ -16,7 +16,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <libpwrutils/foreach.h>
#include <libcorpus2/exception.h>
#include <libcorpus2/io/relreader.h>
#include <libcorpus2_whole/relreader.h>
#include <fstream>
#include <boost/make_shared.hpp>
......
......@@ -14,15 +14,15 @@ or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef LIBCORPUS2_RELREADER_H
#define LIBCORPUS2_RELREADER_H
#ifndef LIBCORPUS2_WHOLE_RELREADER_H
#define LIBCORPUS2_WHOLE_RELREADER_H
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/scoped_ptr.hpp>
#include <libxml++/parsers/saxparser.h>
#include <libcorpus2/relation.h>
#include <libcorpus2_whole/relation.h>
#include <iostream>
......@@ -120,4 +120,4 @@ private:
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_RELREADER_H
#endif // LIBCORPUS2_WHOLE_RELREADER_H
PROJECT(Corpus2Poliqarp)
# cmake_minimum_required() has to come before project(), so that the policy
# settings it establishes are already in effect when project() is processed.
cmake_minimum_required(VERSION 2.8.0)
PROJECT(corpus2_poliqarpreader)
set(c2pq_ver_major 1)
set(c2pq_ver_minor 0)
......
......@@ -2,7 +2,7 @@
PROJECT(corpus2SwigWrap)
set(LIBS "corpus2" "pwrutils")
set(LIBS "corpus2" "corpus2_whole" "corpus2_poliqarpreader" "pwrutils")
include_directories (${corpus2_SOURCE_DIR})
include_directories (${pwrutils_SOURCE_DIR})
......
......@@ -3,7 +3,7 @@
%module libcorpusdocument
%{
#include <libcorpus2/document.h>
#include <libcorpus2_whole/document.h>
%}
%include "std_defs.i"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment