From 5082e27d76aec073917647cf2bba799af228f6f9 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 1 Jun 2011 17:34:51 +0200
Subject: [PATCH] simple corpus2 pqlib wrapper and test executable, wip

---
 poliqarp/CMakeLists.txt |   8 +-
 poliqarp/c2pqtest.cpp   |  34 ++++
 poliqarp/pqclient.cpp   | 314 ++++++++++++++++++++++++++++++++++++++++
 poliqarp/pqclient.h     |  83 ++++++++++-
 4 files changed, 433 insertions(+), 6 deletions(-)
 create mode 100644 poliqarp/c2pqtest.cpp

diff --git a/poliqarp/CMakeLists.txt b/poliqarp/CMakeLists.txt
index 769cb66..a02fa5f 100644
--- a/poliqarp/CMakeLists.txt
+++ b/poliqarp/CMakeLists.txt
@@ -1,9 +1,15 @@
 PROJECT(Corpus2Poliqarp)
 
 cmake_minimum_required(VERSION 2.8.0)
 
-include_directories("/usr/local/include/sakura/")
+include_directories(${PoliqarpLibrary_SOURCE_DIR}/sakura)
+include_directories(${PoliqarpLibrary_SOURCE_DIR})
+include_directories(${PoliqarpLibrary_BINARY_DIR}/sakura)
+include_directories(${PoliqarpLibrary_BINARY_DIR})
 
 add_library(corpus2poliqarp SHARED pqclient.cpp)
+add_executable(c2pqtest c2pqtest.cpp)
 
 target_link_libraries(corpus2poliqarp libpoliqarp corpus2)
+target_link_libraries(c2pqtest libpoliqarp corpus2poliqarp corpus2 pwrutils )
+
diff --git a/poliqarp/c2pqtest.cpp b/poliqarp/c2pqtest.cpp
new file mode 100644
index 0000000..bd78881
--- /dev/null
+++ b/poliqarp/c2pqtest.cpp
@@ -0,0 +1,34 @@
+#include "pqclient.h"
+#include <libcorpus2/tagsetmanager.h>
+#include <iostream>
+#include <libcorpus2/io/writer.h>
+
+/**
+ * Manual test driver for Corpus2::PoliqarpClient.
+ *
+ * Usage: c2pqtest CORPUS_PATH QUERY
+ * Runs QUERY on the corpus at CORPUS_PATH and writes the focus token of
+ * every match to standard output using the "plain" writer with the "kipi"
+ * tagset. Returns 2 when an argument is missing.
+ */
+int main(int argc, char** argv)
+{
+	std::cerr << "C2PQTEST\n";
+	if (argc < 3) {
+		// Explain the failure instead of exiting silently.
+		std::cerr << "Usage: c2pqtest CORPUS_PATH QUERY\n";
+		return 2;
+	}
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	std::cerr << argv[1] << "\n";
+	std::cerr << argv[2] << "\n";
+	Corpus2::PoliqarpClient pqc(tagset, argv[1]);
+	pqc.compile_query(argv[2]);
+	pqc.execute_query();
+	boost::shared_ptr<Corpus2::TokenWriter> writer;
+	writer = Corpus2::TokenWriter::create_stream_writer("plain", std::cout, tagset);
+	// get_next_focus_token() returns NULL once the matches run out.
+	while (Corpus2::Token* t = pqc.get_next_focus_token()) {
+		writer->write_token_dispose(t);
+	}
+}
diff --git a/poliqarp/pqclient.cpp b/poliqarp/pqclient.cpp
index 24d3423..e2f1f66 100644
--- a/poliqarp/pqclient.cpp
+++ b/poliqarp/pqclient.cpp
@@ -1 +1,315 @@
 #include "pqclient.h"
+#include <boost/make_shared.hpp>
+
+extern "C" {
+	/**
+	 * Notification hook declared with C linkage in pqclient.h.
+	 * This wrapper does nothing when it is invoked.
+	 */
+	void async_notify_new_results(void* session)
+	{
+	}
+}
+
+namespace Corpus2
+{
+
+/**
+ * Initialise the Poliqarp library, open the corpus at path and create the
+ * match buffer.
+ *
+ * @param tagset tagset used to parse interpretation tags; held by
+ *               reference, so it must outlive this object
+ * @param path   corpus path handed to poliqarp_open_corpus
+ * @throws Corpus2Error if the library cannot be initialised or the corpus
+ *                      cannot be opened
+ */
+PoliqarpClient::PoliqarpClient(const Tagset& tagset, const std::string path)
+	: tagset_(tagset), buffer_pos_(0), count_so_far_(0), err_(0),
+	quiet_(false), corpusname_(NULL), querytext_(NULL),
+	tags_context_(false), tags_match_(false), query_compiled_(false),
+	corpus_size_(0), curr_chunk_doc_id_(0), info_(), exception_data_(NULL)
+{
+	poliqarp_error error = poliqarp_error_none;
+	if (poliqarp_create("", &error) != 0) {
+		throw Corpus2Error(poliqarp_error_message_get(&error));
+	}
+	progress_init(&progress_);
+	if (poliqarp_open_corpus(&corpus_, path.c_str(), &progress_, &error) == -1) {
+		// A throwing constructor never reaches the destructor, so the
+		// library must be shut down here. Copy the message first in case
+		// it does not survive poliqarp_destroy().
+		const std::string message(poliqarp_error_message_get(&error));
+		poliqarp_destroy();
+		throw Corpus2Error(message);
+	}
+	poliqarp_create_match_buffer(&buffer_, 1000);
+	poliqarp_corpus_info cinfo;
+	poliqarp_get_corpus_info(&corpus_, &cinfo);
+	corpus_size_ = cinfo.num_segments;
+}
+
+/**
+ * Destroy the compiled query (if any) and the match buffer, close the
+ * corpus and shut the Poliqarp library down.
+ */
+PoliqarpClient::~PoliqarpClient()
+{
+	// Release in reverse order of creation: the query was created against
+	// corpus_, so it is destroyed before the corpus is closed.
+	if (query_compiled_) {
+		poliqarp_destroy_query(&query_);
+		query_compiled_ = false;
+	}
+	poliqarp_destroy_match_buffer(&buffer_);
+	poliqarp_close_corpus(&corpus_);
+	poliqarp_destroy();
+}
+
+/**
+ * Compile q against the open corpus. Any previously compiled query is
+ * destroyed first and the running match count is reset to zero.
+ *
+ * @param q query text; must not be empty
+ * @throws Corpus2Error if q is empty or poliqarp_create_query fails
+ */
+void PoliqarpClient::compile_query(const std::string & q)
+{
+	count_so_far_ = 0;
+	last_query_ = q;
+	if (query_compiled_) {
+		poliqarp_destroy_query(&query_);
+		query_compiled_ = false;
+	}
+	if (q.empty()) {
+		throw Corpus2Error("EmptyQuery");
+	}
+	poliqarp_error error = poliqarp_error_none;
+	if (poliqarp_create_query(&query_, q.c_str(), &corpus_,
+			0, NULL, NULL, &error) == -1) {
+		throw Corpus2Error(std::string("QueryFailed: ") + poliqarp_error_message_get(&error));
+	}
+	query_compiled_ = true;
+}
+
+/** Recompile the query text most recently passed to compile_query(). */
+void PoliqarpClient::reset_query()
+{
+	compile_query(last_query_);
+}
+
+/**
+ * Empty the match buffer, ask Poliqarp for the next batch of matches of the
+ * compiled query and rewind the read position to the start of the buffer.
+ * The number of matches obtained is added to the running match count.
+ *
+ * @throws Corpus2Error if no query is compiled, if producing matches fails
+ *                      or if the buffer state cannot be read
+ */
+void PoliqarpClient::execute_query()
+{
+	if (!query_compiled_) {
+		throw Corpus2Error("Query not compiled");
+	}
+	poliqarp_forget(&buffer_);
+	if (poliqarp_produce(&buffer_, 1000, &query_, &progress_,
+			NULL, 0, 1000)) {
+		throw Corpus2Error("query execution error");
+	}
+	if (poliqarp_get_match_buffer_info(&buffer_, &info_)) {
+		throw Corpus2Error("buffer read error");
+	}
+	// Use the freshly read snapshot, like every other reader of the count.
+	count_so_far_ += info_.used;
+	buffer_pos_ = 0;
+}
+
+
+/**
+ * Copy the next unread match into match. When the current batch has been
+ * read completely and it filled the whole buffer, another batch is fetched
+ * with execute_query().
+ *
+ * @param match receives the match on success
+ * @return true if match was filled in, false if there are no more matches
+ * @throws Corpus2Error if fetching another batch fails
+ */
+bool PoliqarpClient::next_match(poliqarp_match& match)
+{
+	if (info_.used > 0) {
+		if (buffer_pos_ < info_.used) {
+			poliqarp_get_match(&buffer_, &match, buffer_pos_++);
+			return true;
+		} else if (info_.used == buffer_.capacity) {
+			// execute_query() empties the buffer itself before refilling.
+			execute_query();
+			if (info_.used > 0) {
+				poliqarp_get_match(&buffer_, &match, buffer_pos_++);
+				return true;
+			}
+		}
+	}
+	return false;
+}
+
+/**
+ * Advance to the next match and build its focus token.
+ *
+ * @return a newly allocated Token owned by the caller, or NULL when there
+ *         are no more matches
+ */
+Token* PoliqarpClient::get_next_focus_token()
+{
+	poliqarp_match match;
+	if (next_match(match)) {
+		return get_token(match.focus);
+	} else {
+		return NULL;
+	}
+}
+
+/**
+ * Advance to the next match and build the tokens it spans.
+ *
+ * @return the tokens in [match.start, match.end) as a Sentence, or a null
+ *         pointer when there are no more matches
+ */
+Sentence::Ptr PoliqarpClient::get_next_match_sequence()
+{
+	poliqarp_match match;
+	if (next_match(match)) {
+		return get_token_range(match.start, match.end);
+	} else {
+		return Sentence::Ptr();
+	}
+}
+
+/**
+ * Build a Token from the corpus segment at position pos: its orth, its
+ * preceding whitespace and one lexeme per disambiguated interpretation.
+ *
+ * @param pos segment position in the corpus
+ * @return a newly allocated Token owned by the caller
+ */
+Token* PoliqarpClient::get_token(size_t pos)
+{
+	poliqarp_segment segment;
+	poliqarp_segment_info info;
+	poliqarp_interpretation_set set;
+	poliqarp_interpretation_set_info sinfo;
+	poliqarp_get_segment(&segment, &corpus_, pos);
+	poliqarp_get_segment_info(&segment, &info);
+	poliqarp_get_disambiguated_interpretations(&segment, &set);
+	poliqarp_get_interpretation_set_info(&set, &sinfo);
+
+	std::auto_ptr<Token> res(new Token());
+	// Mark the token as preceded by a space only when Poliqarp says the
+	// segment has one; otherwise the Token's default is left untouched.
+	if (info.space_before) {
+		res->set_wa(PwrNlp::Whitespace::Space);
+	}
+	res->set_orth_utf8(info.text);
+	for (size_t i = 0; i < sinfo.size; i++) {
+		poliqarp_interpretation interp;
+		poliqarp_interpretation_info iinfo;
+		poliqarp_get_interpretation(&set, &interp, i);
+		poliqarp_get_interpretation_info(&interp, &iinfo);
+		Tag tag = tagset_.parse_simple_tag(iinfo.tag);
+		res->add_lexeme(Lexeme(UnicodeString::fromUTF8(iinfo.base), tag));
+	}
+	return res.release();
+}
+
+/**
+ * Return the tokens of the next match in the buffer, fetching another
+ * batch when the current one has been read completely. The document id
+ * of the match is remembered in curr_chunk_doc_id_.
+ *
+ * @param whole_sentence retrieval of the enclosing sentence is not
+ *                       implemented yet: when true the match is consumed
+ *                       and a null pointer is returned
+ * @return the tokens in [start, end) of the match, or a null pointer when
+ *         there are no more matches
+ */
+Sentence::Ptr PoliqarpClient::get_next_sequence(bool whole_sentence)
+{
+	Sentence::Ptr sentence;
+	if (info_.used > 0) {
+		if (buffer_pos_ < info_.used) {
+			struct poliqarp_match poli_match;
+			poliqarp_get_match(&buffer_, &poli_match, buffer_pos_++);
+			curr_chunk_doc_id_ = poli_match.document;
+			if (whole_sentence) {
+				//sentence = get_token_range(poli_match.withinStart, poli_match.withinEnd);
+			} else {
+				sentence = get_token_range(poli_match.start, poli_match.end);
+			}
+		} else {
+			execute_query();
+			sentence = get_next_sequence(whole_sentence);
+		}
+	}
+	return sentence;
+}
+
+/**
+ * Build a Sentence from the corpus segments at positions [from, to).
+ */
+Sentence::Ptr PoliqarpClient::get_token_range(size_t from, size_t to)
+{
+	Sentence::Ptr s = boost::make_shared<Sentence>();
+	for (size_t j = from; j < to; j++) {
+		s->append(get_token(j));
+	}
+	return s;
+}
+
+/** Return the number of matches counted since the query was compiled. */
+size_t PoliqarpClient::get_count_of_matches_so_far()
+{
+	return count_so_far_;
+}
+
+/**
+ * Run the compiled query to exhaustion, counting matches without building
+ * any tokens. Does nothing when no query is compiled.
+ *
+ * @return the running match count, including matches counted earlier
+ */
+size_t PoliqarpClient::only_count_results()
+{
+	if (query_compiled_) {
+		// Matches already sitting in the buffer were counted by
+		// execute_query(); drop them so they are not counted twice.
+		poliqarp_forget(&buffer_);
+		// Both calls return 0 on success (execute_query() treats non-zero
+		// as failure), so loop while they succeed and matches keep coming.
+		while (poliqarp_produce(&buffer_, 1000, &query_, &progress_, NULL, 0, 1000) == 0 &&
+				poliqarp_get_match_buffer_info(&buffer_, &info_) == 0 &&
+				info_.used > 0) {
+			count_so_far_ += info_.used;
+			poliqarp_forget(&buffer_);
+		}
+	}
+	return count_so_far_;
+}
+
+/** Return the number of segments in the corpus, as read when it was opened. */
+size_t PoliqarpClient::get_corpus_size() const
+{
+	return corpus_size_;
+}
+
+/**
+ * Return query_.last_context.index of the compiled query (presumably how
+ * far into the corpus the search has got), or 0 when no query is compiled.
+ */
+size_t PoliqarpClient::get_corpus_pos() const
+{
+	if (query_compiled_) {
+		return query_.last_context.index;
+	} else {
+		return 0;
+	}
+}
+}
diff --git a/poliqarp/pqclient.h b/poliqarp/pqclient.h
index 3ee0e1b..11437cb 100644
--- a/poliqarp/pqclient.h
+++ b/poliqarp/pqclient.h
@@ -3,22 +3,95 @@
 
 #include <boost/utility.hpp>
 
+// poliqarp.h is a C header; "this" is renamed while it is included,
+// presumably because the header uses it as an ordinary identifier.
+extern "C" {
+	#define this this_
+	#include <poliqarp.h>
+	#undef this
+	void async_notify_new_results(void* session);
+}
+
+#include <libcorpus2/chunk.h>
+
 namespace Corpus2
 {
 
+/**
+ * Thin wrapper around libpoliqarp: opens one corpus, compiles and runs
+ * queries on it and converts the matches into corpus2 tokens and sentences.
+ *
+ * Typical use: compile_query(), execute_query(), then call one of the
+ * get_next_*() functions until it reports that no matches remain.
+ */
 class PoliqarpClient : boost::noncopyable
 {
 public:
-	PoliqarpClient(const std::string path);
+	/// Opens the corpus at path; throws Corpus2Error on failure.
+	PoliqarpClient(const Tagset& tagset, const std::string path);
 	~PoliqarpClient();
 
-	void reload_corpus(const std::string& path);
-	void restart();
-	int execute_query();
-	int compile_query(const std::string& q);
+	/// Compiles q, replacing any previous query; throws Corpus2Error.
+	void compile_query(const std::string& q);
+	/// Recompiles the query last passed to compile_query().
+	void reset_query();
+	/// Fetches the next batch of matches into the internal buffer.
+	void execute_query();
+
+	/// NOTE: declared only; pqclient.cpp does not define this yet.
+	int has_error();
+
+	/// Copies the next match into match; false when none remain.
+	bool next_match(poliqarp_match& match);
+
+	/// Focus token of the next match (caller owns it), or NULL.
+	Token* get_next_focus_token();
+	/// Tokens spanned by the next match, or a null pointer.
+	Sentence::Ptr get_next_match_sequence();
+
+	/// Token built from the segment at pos; caller owns it.
+	Token* get_token(size_t pos);
+	/// Sentence built from the segments at positions [from, to).
+	Sentence::Ptr get_token_range(size_t from, size_t to);
+
+	/// Like get_next_match_sequence(); whole_sentence is not implemented.
+	Sentence::Ptr get_next_sequence(bool whole_sentence);
+
+	/// Matches counted since the query was compiled.
+	size_t get_count_of_matches_so_far();
+	/// Runs the query to exhaustion and returns the match count.
+	size_t only_count_results();
 
 	size_t get_corpus_size() const;
+	/// Value of query_.last_context.index, or 0 with no compiled query.
+	size_t get_corpus_pos() const;
+
 private:
+	// NOTE: match_, document_, last_error_, quiet_, corpusname_,
+	// querytext_, tags_context_, tags_match_ and exception_data_ are not
+	// used by pqclient.cpp yet.
+	const Tagset& tagset_;
+	boost::shared_ptr<Sentence> match_;
+	boost::shared_ptr<Chunk> document_;
+	size_t buffer_pos_;
+	size_t count_so_far_;
+	int err_;
+	std::string last_error_;
+	std::string last_query_;
+	bool quiet_;
+	char *corpusname_;
+	char *querytext_;
+	bool tags_context_;
+	bool tags_match_;
+	bool query_compiled_;
+	size_t corpus_size_;
+	size_t curr_chunk_doc_id_;
+	struct poliqarp_corpus corpus_;
+	struct poliqarp_query query_;
+	struct poliqarp_match_buffer buffer_;
+	struct poliqarp_match_buffer_info info_;
+	progress_t progress_;
+	void *exception_data_;
 };
 
 }
-- 
GitLab