Skip to content
Snippets Groups Projects
Commit dc7c28eb authored by ilor's avatar ilor
Browse files

PoliqarpReader

parent 50043d34
No related merge requests found
......@@ -68,7 +68,6 @@ add_subdirectory(libpwrutils)
add_subdirectory(libcorpus2)
add_subdirectory(corpus2tools)
add_subdirectory(tests)
if(CORPUS2_BUILD_POLIQARP)
add_subdirectory(poliqarp-library)
add_subdirectory(poliqarp)
......
......@@ -5,8 +5,8 @@
namespace Corpus2 {
bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>("xces-fast",
"ign,loose,strict");
bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>(
"xces-fast","ign,loose,strict");
class FastXcesReaderImpl
{
......
project (PoliqarpLibrary)
cmake_minimum_required(VERSION 2.8.0)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts)
set(pqlib_ver_major 1)
set(pqlib_ver_minor 0)
INCLUDE (CheckIncludeFiles)
INCLUDE (CheckLibraryExists)
......@@ -136,14 +138,18 @@ set(poliqarpd_SRC
poliqarpd/utils.c
)
add_library(libpoliqarp SHARED ${foostring_SRC} ${progress_SRC} ${unibits_SRC} ${common_SRC} ${sakura_SRC} ${BF_SOURCES} ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h)
add_dependencies(libpoliqarp ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.ha)
add_library(poliqarpc2 SHARED ${foostring_SRC} ${progress_SRC} ${unibits_SRC} ${common_SRC} ${sakura_SRC} ${BF_SOURCES} ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h)
add_dependencies(poliqarpc2 ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.ha)
set_target_properties(poliqarpc2 PROPERTIES
VERSION "${pqlib_ver_major}.${pqlib_ver_minor}"
SOVERSION ${pqlib_ver_major})
add_executable(poliqarpc-shared utils/poliqarpc.c)
target_link_libraries(poliqarpc-shared libpoliqarp pthread)
target_link_libraries(poliqarpc-shared poliqarpc2 pthread)
add_executable(poliqarpd-shared ${poliqarpd_SRC})
target_link_libraries(poliqarpd-shared libpoliqarp pthread)
target_link_libraries(poliqarpd-shared poliqarpc2 pthread)
if(UNIX)
install(TARGETS libpoliqarp LIBRARY DESTINATION lib)
install(TARGETS poliqarpc2 LIBRARY DESTINATION lib)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/sakura DESTINATION include
FILES_MATCHING PATTERN "*.h")
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/progress DESTINATION include/sakura
......
# Build rules for the corpus2 <-> Poliqarp bridge: the corpus2poliqarp shared
# library and the c2pqtest command-line test program.
#
# Each target is defined exactly once. A second add_library()/add_executable()
# with the same name is a configure-time error, so the stale definitions that
# linked against the old `libpoliqarp` target name are dropped in favour of the
# ones that build pqreader.cpp and link against `poliqarpc2`.
cmake_minimum_required(VERSION 2.8.0)
project(Corpus2Poliqarp)

# Version of the corpus2poliqarp shared library (VERSION = major.minor,
# SOVERSION = major).
set(c2pq_ver_major 1)
set(c2pq_ver_minor 0)

# Headers of the bundled Poliqarp library, from both its source tree and its
# build tree.
include_directories(${PoliqarpLibrary_SOURCE_DIR}/sakura)
include_directories(${PoliqarpLibrary_SOURCE_DIR})
include_directories(${PoliqarpLibrary_BINARY_DIR}/sakura)
include_directories(${PoliqarpLibrary_BINARY_DIR})

add_library(corpus2poliqarp SHARED pqclient.cpp pqreader.cpp)
set_target_properties(corpus2poliqarp PROPERTIES
	VERSION "${c2pq_ver_major}.${c2pq_ver_minor}"
	SOVERSION ${c2pq_ver_major})
target_link_libraries(corpus2poliqarp poliqarpc2 corpus2)

add_executable(c2pqtest c2pqtest.cpp)
target_link_libraries(c2pqtest poliqarpc2 corpus2poliqarp corpus2 pwrutils)

if(UNIX)
	install(TARGETS corpus2poliqarp LIBRARY DESTINATION lib)
	install(TARGETS c2pqtest RUNTIME DESTINATION bin)
endif()
......@@ -15,7 +15,10 @@ int main(int argc, char** argv)
pqc.execute_query();
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer = Corpus2::TokenWriter::create_stream_writer("plain", std::cout, tagset);
while (Corpus2::Token* t = pqc.get_next_focus_token()) {
writer->write_token_dispose(t);
//while (Corpus2::Token* t = pqc.get_next_focus_token()) {
// writer->write_token_dispose(t);
//}
while (Corpus2::Sentence::Ptr s = pqc.get_next_match_sequence()) {
writer->write_sentence(*s);
}
}
#include "pqclient.h"
#include <boost/make_shared.hpp>
#include <boost/lexical_cast.hpp>
extern "C" {
void async_notify_new_results(void* session)
......@@ -136,7 +137,7 @@ Token* PoliqarpClient::get_token(size_t pos)
poliqarp_get_interpretation_set_info(&set, &sinfo);
std::auto_ptr<Token> res(new Token());
if (!info.space_before) {
if (info.space_before) {
res->set_wa(PwrNlp::Whitespace::Space);
}
res->set_orth_utf8(info.text);
......@@ -151,25 +152,25 @@ Token* PoliqarpClient::get_token(size_t pos)
return res.release();
}
Sentence::Ptr PoliqarpClient::get_next_sequence(bool whole_sentence)
boost::shared_ptr<Chunk> PoliqarpClient::get_next_document()
{
Sentence::Ptr sentence;
if (info_.used > 0) {
if (buffer_pos_ < info_.used) {
struct poliqarp_match poli_match;
poliqarp_get_match(&buffer_, &poli_match, buffer_pos_++);
curr_chunk_doc_id_ = poli_match.document;
if (whole_sentence) {
//sentence = get_token_range(poli_match.withinStart, poli_match.withinEnd);
poliqarp_match match;
boost::shared_ptr<Chunk> chunk;
if (next_match(match)) {
chunk = boost::make_shared<Chunk>();
size_t document_id = match.document;
chunk->set_attribute("id", "ch" + boost::lexical_cast<std::string>(document_id));
chunk->append(get_token_range(match.start, match.end));
while (next_match(match)) {
if (match.document == document_id) {
chunk->append(get_token_range(match.start, match.end));
} else {
sentence = get_token_range(poli_match.start, poli_match.end);
buffer_pos_--;
break;
}
} else {
execute_query();
sentence = get_next_sequence(whole_sentence);
}
}
return sentence;
return chunk;
}
Sentence::Ptr PoliqarpClient::get_token_range(size_t from, size_t to)
......
......@@ -32,12 +32,11 @@ public:
Token* get_next_focus_token();
Sentence::Ptr get_next_match_sequence();
boost::shared_ptr<Chunk> get_next_document();
Token* get_token(size_t pos);
Sentence::Ptr get_token_range(size_t from, size_t to);
Sentence::Ptr get_next_sequence(bool whole_sentence);
size_t get_count_of_matches_so_far();
size_t only_count_results();
size_t get_corpus_size() const;
......
#include "pqreader.h"
#include "pqclient.h"
namespace Corpus2 {
// Registers PoliqarpReader with TokenReader as a path-based reader under the
// name "poliqarp". The second string lists the option names this reader
// advertises (presumably used for help output -- confirm against
// TokenReader::register_path_reader).
bool PoliqarpReader::registered =
    TokenReader::register_path_reader<PoliqarpReader>(
        "poliqarp",
        "token,chunk,sentence");
// Creates a reader over a PoliqarpClient constructed from `tagset` and
// `filename`. The reader starts in sentence mode: the "[]+ within s" query is
// compiled immediately, but it is not executed until the first read.
PoliqarpReader::PoliqarpReader(const Tagset &tagset, const std::string &filename)
    : TokenReader(tagset),
      pq_(new PoliqarpClient(tagset, filename)),
      executed_(false),
      mode_(PQ_SENTENCES)
{
    pq_->compile_query("[]+ within s");
}
// Intentionally empty. It is defined out of line in this translation unit,
// where pqclient.h is included, so that the scoped_ptr<PoliqarpClient> member
// is destroyed with PoliqarpClient as a complete type.
PoliqarpReader::~PoliqarpReader()
{
}
// Replaces the current query with a caller-supplied one. The reader switches
// to PQ_MANUAL mode and is marked as not executed, so the new query runs on
// the next read (or on an explicit execute()).
void PoliqarpReader::set_query(const std::string &query)
{
    // Compile first: the state below is only touched once this call returns.
    pq_->compile_query(query);

    mode_ = PQ_MANUAL;
    executed_ = false;
}
// Runs the currently compiled query on the underlying client and records that
// it has been executed, so later reads do not trigger it again.
void PoliqarpReader::execute()
{
    pq_->execute_query();

    executed_ = true;
}
// Returns the client's next focus token, executing the compiled query first
// if that has not happened yet.
Token* PoliqarpReader::get_next_token()
{
    if (!executed_) {
        execute();
    }
    return pq_->get_next_focus_token();
}
// Returns the client's next match sequence as a sentence, executing the
// compiled query first if that has not happened yet.
Sentence::Ptr PoliqarpReader::get_next_sentence()
{
    if (!executed_) {
        execute();
    }
    return pq_->get_next_match_sequence();
}
// Returns the client's next document as a chunk, executing the compiled query
// first if that has not happened yet.
boost::shared_ptr<Chunk> PoliqarpReader::get_next_chunk()
{
    if (!executed_) {
        execute();
    }
    return pq_->get_next_document();
}
// Handles the reader-specific options, each of which selects a built-in query
// and the matching mode:
//   "chunk"    -> "[]+ within p", PQ_CHUNKS
//   "sentence" -> "[]+ within s", PQ_SENTENCES
//   "token"    -> "[]",           PQ_TOKENS
// Choosing any of them marks the reader as not executed, so the new query
// runs on the next read. Every other option is forwarded to
// TokenReader::set_option.
void PoliqarpReader::set_option(const std::string &option)
{
    if (option == "chunk") {
        pq_->compile_query("[]+ within p");
        mode_ = PQ_CHUNKS;
        executed_ = false;
    } else if (option == "sentence") {
        // This branch used to test "token" as well, which made the PQ_TOKENS
        // branch below unreachable and left "sentence" unhandled.
        pq_->compile_query("[]+ within s");
        mode_ = PQ_SENTENCES;
        executed_ = false;
    } else if (option == "token") {
        pq_->compile_query("[]");
        mode_ = PQ_TOKENS;
        executed_ = false;
    } else {
        TokenReader::set_option(option);
    }
}
// This reader exposes no readable options of its own; the lookup is delegated
// unchanged to the TokenReader base class.
std::string PoliqarpReader::get_option(const std::string& option) const
{
    const std::string value = TokenReader::get_option(option);
    return value;
}
} /* end ns Corpus2 */
// The guard macro tested here must be the same one that is defined on the
// next line; otherwise a second inclusion redefines PoliqarpReader.
#ifndef LIBCORPUS2_PQREADER_H
#define LIBCORPUS2_PQREADER_H
#include <libcorpus2/io/reader.h>
#include <deque>
#include <boost/scoped_ptr.hpp>
namespace Corpus2 {
// fwd decl
class PoliqarpClient;

/**
 * TokenReader implementation that obtains its data from a PoliqarpClient.
 *
 * A query is compiled up front and executed lazily on the first
 * get_next_token() / get_next_sentence() / get_next_chunk() call, or
 * explicitly through execute().
 */
class PoliqarpReader: public TokenReader
{
public:
    /**
     * Creates the reader and its underlying PoliqarpClient.
     * @param tagset   tagset passed to both TokenReader and the client
     * @param filename forwarded to the PoliqarpClient constructor
     */
    PoliqarpReader(const Tagset& tagset, const std::string& filename);

    /// Which query the reader is currently set up to run.
    enum PQ_MODE {
        PQ_TOKENS,    ///< built-in single-token query
        PQ_SENTENCES, ///< built-in sentence query (the initial mode)
        PQ_CHUNKS,    ///< built-in paragraph/chunk query
        PQ_MANUAL     ///< query supplied through set_query()
    };

    ~PoliqarpReader();

    /// Compiles a caller-supplied query and switches to PQ_MANUAL mode.
    void set_query(const std::string& query);

    /// Executes the currently compiled query.
    void execute();

    /// Next focus token from the client; executes the query if needed.
    Token* get_next_token();

    /// Next match sequence from the client; executes the query if needed.
    Sentence::Ptr get_next_sentence();

    /// Next document from the client; executes the query if needed.
    boost::shared_ptr<Chunk> get_next_chunk();

    /// Applies a reader-specific option or forwards it to TokenReader.
    void set_option(const std::string& option);

    /// Forwards the lookup to TokenReader::get_option.
    std::string get_option(const std::string& option) const;

    /// Result of registering this reader with TokenReader.
    static bool registered;

protected:
    // NOTE(review): no definition of ensure_more() is visible in
    // pqreader.cpp -- confirm it is defined elsewhere or remove it.
    void ensure_more();

    /// Owned client; PoliqarpClient is only forward-declared in this header.
    boost::scoped_ptr<PoliqarpClient> pq_;

    /// True once the compiled query has been executed.
    bool executed_;

    /// Mode selected by the constructor, set_option() or set_query().
    PQ_MODE mode_;
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_PQREADER_H
......@@ -2,9 +2,10 @@
PROJECT(corpus2SwigWrap)
set(CORPUS2_LIBS corpus2)
set(PWRUTILS_LIBS pwrutils)
set(CORPUS2_PWR_LIBS ${CORPUS2_LIBS} ${PWRUTILS_LIBS})
set(LIBS "corpus2" "pwrutils")
if (CORPUS2_BUILD_POLIQARP)
set(LIBS ${LIBS} "corpus2poliqarp")
endif (CORPUS2_BUILD_POLIQARP)
include_directories (${corpus2_SOURCE_DIR})
include_directories (${pwrutils_SOURCE_DIR})
......@@ -48,8 +49,7 @@ SET_SOURCE_FILES_PROPERTIES(corpus2.i PROPERTIES CPLUSPLUS ON)
SET_SOURCE_FILES_PROPERTIES(corpus2.i PROPERTIES SWIG_FLAGS "-includeall" )
set(SWIG_MODULE_corpus2_EXTRA_DEPS ${I_FILES})
SWIG_ADD_MODULE(corpus2 python corpus2.i )
SWIG_LINK_LIBRARIES(corpus2 ${PYTHON_LIBRARIES} ${PWRUTILS_LIBS})
SWIG_LINK_LIBRARIES(corpus2 ${PYTHON_LIBRARIES} ${CORPUS2_PWR_LIBS})
SWIG_LINK_LIBRARIES(corpus2 ${PYTHON_LIBRARIES} ${LIBS})
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment