From dc7c28ebbd2c1df4fcf9367c3bfd41d7e313b5ea Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 2 Jun 2011 17:18:37 +0200 Subject: [PATCH] PoliqarpReader --- CMakeLists.txt | 1 - libcorpus2/io/fastxces.cpp | 4 +- poliqarp-library/CMakeLists.txt | 16 ++++--- poliqarp/CMakeLists.txt | 19 ++++++--- poliqarp/c2pqtest.cpp | 7 ++- poliqarp/pqclient.cpp | 31 +++++++------- poliqarp/pqclient.h | 3 +- poliqarp/pqreader.cpp | 76 +++++++++++++++++++++++++++++++++ poliqarp/pqreader.h | 56 ++++++++++++++++++++++++ swig/CMakeLists.txt | 10 ++--- 10 files changed, 185 insertions(+), 38 deletions(-) create mode 100644 poliqarp/pqreader.cpp create mode 100644 poliqarp/pqreader.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 096fa86..e15adab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,6 @@ add_subdirectory(libpwrutils) add_subdirectory(libcorpus2) add_subdirectory(corpus2tools) add_subdirectory(tests) - if(CORPUS2_BUILD_POLIQARP) add_subdirectory(poliqarp-library) add_subdirectory(poliqarp) diff --git a/libcorpus2/io/fastxces.cpp b/libcorpus2/io/fastxces.cpp index 6eaedaf..f2e0f30 100644 --- a/libcorpus2/io/fastxces.cpp +++ b/libcorpus2/io/fastxces.cpp @@ -5,8 +5,8 @@ namespace Corpus2 { -bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>("xces-fast", - "ign,loose,strict"); +bool FastXcesReader::registered = TokenReader::register_reader<FastXcesReader>( + "xces-fast","ign,loose,strict"); class FastXcesReaderImpl { diff --git a/poliqarp-library/CMakeLists.txt b/poliqarp-library/CMakeLists.txt index 5f07bb2..21c4ee0 100644 --- a/poliqarp-library/CMakeLists.txt +++ b/poliqarp-library/CMakeLists.txt @@ -1,6 +1,8 @@ project (PoliqarpLibrary) cmake_minimum_required(VERSION 2.8.0) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts) +set(pqlib_ver_major 1) +set(pqlib_ver_minor 0) INCLUDE (CheckIncludeFiles) INCLUDE (CheckLibraryExists) @@ -136,14 +138,18 @@ set(poliqarpd_SRC poliqarpd/utils.c ) 
-add_library(libpoliqarp SHARED ${foostring_SRC} ${progress_SRC} ${unibits_SRC} ${common_SRC} ${sakura_SRC} ${BF_SOURCES} ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h) -add_dependencies(libpoliqarp ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.ha) +add_library(poliqarpc2 SHARED ${foostring_SRC} ${progress_SRC} ${unibits_SRC} ${common_SRC} ${sakura_SRC} ${BF_SOURCES} ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h) +add_dependencies(poliqarpc2 ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.ha) +set_target_properties(poliqarpc2 PROPERTIES + VERSION "${pqlib_ver_major}.${pqlib_ver_minor}" + SOVERSION ${pqlib_ver_major}) + add_executable(poliqarpc-shared utils/poliqarpc.c) -target_link_libraries(poliqarpc-shared libpoliqarp pthread) +target_link_libraries(poliqarpc-shared poliqarpc2 pthread) add_executable(poliqarpd-shared ${poliqarpd_SRC}) -target_link_libraries(poliqarpd-shared libpoliqarp pthread) +target_link_libraries(poliqarpd-shared poliqarpc2 pthread) if(UNIX) - install(TARGETS libpoliqarp LIBRARY DESTINATION lib) + install(TARGETS poliqarpc2 LIBRARY DESTINATION lib) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/sakura DESTINATION include FILES_MATCHING PATTERN "*.h") install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/progress DESTINATION include/sakura diff --git a/poliqarp/CMakeLists.txt b/poliqarp/CMakeLists.txt index a02fa5f..b1a51a2 100644 --- a/poliqarp/CMakeLists.txt +++ b/poliqarp/CMakeLists.txt @@ -1,15 +1,22 @@ PROJECT(Corpus2Poliqarp) cmake_minimum_required(VERSION 2.8.0) +set(c2pq_ver_major 1) +set(c2pq_ver_minor 0) include_directories(${PoliqarpLibrary_SOURCE_DIR}/sakura) include_directories(${PoliqarpLibrary_SOURCE_DIR}) include_directories(${PoliqarpLibrary_BINARY_DIR}/sakura) include_directories(${PoliqarpLibrary_BINARY_DIR}) -add_library(corpus2poliqarp SHARED pqclient.cpp) - -add_executable(c2pqtest c2pqtest.cpp) -target_link_libraries(corpus2poliqarp libpoliqarp corpus2) -target_link_libraries(c2pqtest libpoliqarp corpus2poliqarp corpus2 pwrutils ) - 
+add_library(corpus2poliqarp SHARED pqclient.cpp pqreader.cpp) +set_target_properties(corpus2poliqarp PROPERTIES + VERSION "${c2pq_ver_major}.${c2pq_ver_minor}" + SOVERSION ${c2pq_ver_major}) +target_link_libraries(corpus2poliqarp poliqarpc2 corpus2) +add_executable(c2pqtest c2pqtest.cpp) +target_link_libraries(c2pqtest poliqarpc2 corpus2poliqarp corpus2 pwrutils ) +if(UNIX) + install(TARGETS corpus2poliqarp LIBRARY DESTINATION lib) + install(TARGETS c2pqtest RUNTIME DESTINATION bin) +endif(UNIX) diff --git a/poliqarp/c2pqtest.cpp b/poliqarp/c2pqtest.cpp index bd78881..29a1114 100644 --- a/poliqarp/c2pqtest.cpp +++ b/poliqarp/c2pqtest.cpp @@ -15,7 +15,10 @@ int main(int argc, char** argv) pqc.execute_query(); boost::shared_ptr<Corpus2::TokenWriter> writer; writer = Corpus2::TokenWriter::create_stream_writer("plain", std::cout, tagset); - while (Corpus2::Token* t = pqc.get_next_focus_token()) { - writer->write_token_dispose(t); + //while (Corpus2::Token* t = pqc.get_next_focus_token()) { + // writer->write_token_dispose(t); + //} + while (Corpus2::Sentence::Ptr s = pqc.get_next_match_sequence()) { + writer->write_sentence(*s); } } diff --git a/poliqarp/pqclient.cpp b/poliqarp/pqclient.cpp index e2f1f66..3d7115c 100644 --- a/poliqarp/pqclient.cpp +++ b/poliqarp/pqclient.cpp @@ -1,5 +1,6 @@ #include "pqclient.h" #include <boost/make_shared.hpp> +#include <boost/lexical_cast.hpp> extern "C" { void async_notify_new_results(void* session) @@ -136,7 +137,7 @@ Token* PoliqarpClient::get_token(size_t pos) poliqarp_get_interpretation_set_info(&set, &sinfo); std::auto_ptr<Token> res(new Token()); - if (!info.space_before) { + if (info.space_before) { res->set_wa(PwrNlp::Whitespace::Space); } res->set_orth_utf8(info.text); @@ -151,25 +152,25 @@ Token* PoliqarpClient::get_token(size_t pos) return res.release(); } -Sentence::Ptr PoliqarpClient::get_next_sequence(bool whole_sentence) +boost::shared_ptr<Chunk> PoliqarpClient::get_next_document() { - Sentence::Ptr sentence; - if 
(info_.used > 0) { - if (buffer_pos_ < info_.used) { - struct poliqarp_match poli_match; - poliqarp_get_match(&buffer_, &poli_match, buffer_pos_++); - curr_chunk_doc_id_ = poli_match.document; - if (whole_sentence) { - //sentence = get_token_range(poli_match.withinStart, poli_match.withinEnd); + poliqarp_match match; + boost::shared_ptr<Chunk> chunk; + if (next_match(match)) { + chunk = boost::make_shared<Chunk>(); + size_t document_id = match.document; + chunk->set_attribute("id", "ch" + boost::lexical_cast<std::string>(document_id)); + chunk->append(get_token_range(match.start, match.end)); + while (next_match(match)) { + if (match.document == document_id) { + chunk->append(get_token_range(match.start, match.end)); } else { - sentence = get_token_range(poli_match.start, poli_match.end); + buffer_pos_--; + break; } - } else { - execute_query(); - sentence = get_next_sequence(whole_sentence); } } - return sentence; + return chunk; } Sentence::Ptr PoliqarpClient::get_token_range(size_t from, size_t to) diff --git a/poliqarp/pqclient.h b/poliqarp/pqclient.h index 11437cb..0420849 100644 --- a/poliqarp/pqclient.h +++ b/poliqarp/pqclient.h @@ -32,12 +32,11 @@ public: Token* get_next_focus_token(); Sentence::Ptr get_next_match_sequence(); + boost::shared_ptr<Chunk> get_next_document(); Token* get_token(size_t pos); Sentence::Ptr get_token_range(size_t from, size_t to); - Sentence::Ptr get_next_sequence(bool whole_sentence); - size_t get_count_of_matches_so_far(); size_t only_count_results(); size_t get_corpus_size() const; diff --git a/poliqarp/pqreader.cpp b/poliqarp/pqreader.cpp new file mode 100644 index 0000000..400e9b5 --- /dev/null +++ b/poliqarp/pqreader.cpp @@ -0,0 +1,76 @@ +#include "pqreader.h" +#include "pqclient.h" + +namespace Corpus2 { + +bool PoliqarpReader::registered = TokenReader::register_path_reader<PoliqarpReader>( + "poliqarp","token,chunk,sentence"); + + +PoliqarpReader::PoliqarpReader(const Tagset &tagset, const std::string &filename) + : 
TokenReader(tagset), pq_(new PoliqarpClient(tagset, filename)),
+	executed_(false), mode_(PQ_SENTENCES)
+{
+	pq_->compile_query("[]+ within s"); // same query the "sentence" option installs, matching the initial PQ_SENTENCES mode
+}
+
+PoliqarpReader::~PoliqarpReader()
+{
+}
+
+void PoliqarpReader::set_query(const std::string &query) // install a caller-supplied query instead of a preset
+{
+	pq_->compile_query(query);
+	mode_ = PQ_MANUAL;
+	executed_ = false; // the get_next_* methods re-run execute() while this is false
+}
+
+void PoliqarpReader::execute() // run the currently compiled query and mark it as executed
+{
+	pq_->execute_query();
+	executed_ = true;
+}
+
+Token* PoliqarpReader::get_next_token()
+{
+	if (!executed_) execute(); // lazy execution: first read after a (re)compile runs the query
+	return pq_->get_next_focus_token();
+}
+
+Sentence::Ptr PoliqarpReader::get_next_sentence()
+{
+	if (!executed_) execute(); // lazy execution, as in get_next_token()
+	return pq_->get_next_match_sequence();
+}
+
+boost::shared_ptr<Chunk> PoliqarpReader::get_next_chunk()
+{
+	if (!executed_) execute(); // lazy execution, as in get_next_token()
+	return pq_->get_next_document();
+}
+
+void PoliqarpReader::set_option(const std::string &option) // presets "chunk", "sentence", "token"; anything else is forwarded to TokenReader
+{
+	if (option == "chunk") {
+		pq_->compile_query("[]+ within p");
+		mode_ = PQ_CHUNKS;
+		executed_ = false;
+	} else if (option == "sentence") { // distinct from the "token" test below, so every registered preset is reachable
+		pq_->compile_query("[]+ within s");
+		mode_ = PQ_SENTENCES;
+		executed_ = false;
+	} else if (option == "token") {
+		pq_->compile_query("[]");
+		mode_ = PQ_TOKENS;
+		executed_ = false;
+	} else {
+		TokenReader::set_option(option);
+	}
+}
+
+std::string PoliqarpReader::get_option(const std::string& option) const // no reader-specific options to report; defers to TokenReader
+{
+	return TokenReader::get_option(option);
+}
+
+} /* end ns Corpus2 */
diff --git a/poliqarp/pqreader.h b/poliqarp/pqreader.h
new file mode 100644
index 0000000..35acf22
--- /dev/null
+++ b/poliqarp/pqreader.h
@@ -0,0 +1,56 @@
+#ifndef LIBCORPUS2_PQREADER_H
+#define LIBCORPUS2_PQREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+
+
+namespace Corpus2 {
+
+// fwd decl
+class PoliqarpClient;
+
+class PoliqarpReader: public TokenReader // TokenReader that serves results of a query run through a PoliqarpClient
+{
+public:
+	PoliqarpReader(const Tagset& tagset, const std::string& filename);
+
+	enum PQ_MODE {
+		PQ_TOKENS, // selected by option "token" (query "[]")
+		PQ_SENTENCES, // selected by option "sentence" (query "[]+ within s"); the constructor default
+		PQ_CHUNKS, // selected by option "chunk" (query "[]+ within p")
+		PQ_MANUAL // selected by set_query() with a caller-supplied query
+	};
+
+	~PoliqarpReader();
+
+	void
set_query(const std::string& query); + + void execute(); + + Token* get_next_token(); + + Sentence::Ptr get_next_sentence(); + + boost::shared_ptr<Chunk> get_next_chunk(); + + void set_option(const std::string& option); + + std::string get_option(const std::string& option) const; + + static bool registered; + +protected: + void ensure_more(); + + boost::scoped_ptr<PoliqarpClient> pq_; + + bool executed_; + + PQ_MODE mode_; +}; + +} /* end ns Corpus2 */ + +#endif // LIBCORPUS2_PQREADER_H diff --git a/swig/CMakeLists.txt b/swig/CMakeLists.txt index 52bf215..bf78211 100644 --- a/swig/CMakeLists.txt +++ b/swig/CMakeLists.txt @@ -2,9 +2,10 @@ PROJECT(corpus2SwigWrap) -set(CORPUS2_LIBS corpus2) -set(PWRUTILS_LIBS pwrutils) -set(CORPUS2_PWR_LIBS ${CORPUS2_LIBS} ${PWRUTILS_LIBS}) +set(LIBS "corpus2" "pwrutils") +if (CORPUS2_BUILD_POLIQARP) + set(LIBS ${LIBS} "corpus2poliqarp") +endif (CORPUS2_BUILD_POLIQARP) include_directories (${corpus2_SOURCE_DIR}) include_directories (${pwrutils_SOURCE_DIR}) @@ -48,8 +49,7 @@ SET_SOURCE_FILES_PROPERTIES(corpus2.i PROPERTIES CPLUSPLUS ON) SET_SOURCE_FILES_PROPERTIES(corpus2.i PROPERTIES SWIG_FLAGS "-includeall" ) set(SWIG_MODULE_corpus2_EXTRA_DEPS ${I_FILES}) SWIG_ADD_MODULE(corpus2 python corpus2.i ) -SWIG_LINK_LIBRARIES(corpus2 ${PYTHON_LIBRARIES} ${PWRUTILS_LIBS}) -SWIG_LINK_LIBRARIES(corpus2 ${PYTHON_LIBRARIES} ${CORPUS2_PWR_LIBS}) +SWIG_LINK_LIBRARIES(corpus2 ${PYTHON_LIBRARIES} ${LIBS}) # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- -- GitLab