Skip to content
Snippets Groups Projects
Commit 5082e27d authored by ilor's avatar ilor
Browse files

Simple corpus2 Poliqarp library wrapper and a test executable (work in progress)

parent 6aed7643
No related branches found
No related tags found
No related merge requests found
# Build configuration for the corpus2 Poliqarp wrapper library and its
# small command-line test driver.
#
# Fix: cmake_minimum_required() must be called before project() so that
# version-dependent policies are in effect for the project() call itself.
cmake_minimum_required(VERSION 2.8.0)
PROJECT(Corpus2Poliqarp)

# Poliqarp/sakura headers: in-tree source and binary dirs, plus a
# system-wide install location as a fallback.
include_directories("/usr/local/include/sakura/")
include_directories(${PoliqarpLibrary_SOURCE_DIR}/sakura)
include_directories(${PoliqarpLibrary_SOURCE_DIR})
include_directories(${PoliqarpLibrary_BINARY_DIR}/sakura)
include_directories(${PoliqarpLibrary_BINARY_DIR})

# Shared wrapper library around libpoliqarp exposing corpus2 types.
add_library(corpus2poliqarp SHARED pqclient.cpp)
target_link_libraries(corpus2poliqarp libpoliqarp corpus2)

# Test executable: c2pqtest <corpus-path> <query>
add_executable(c2pqtest c2pqtest.cpp)
target_link_libraries(c2pqtest libpoliqarp corpus2poliqarp corpus2 pwrutils)
#include "pqclient.h"

#include <exception>
#include <iostream>

#include <libcorpus2/io/writer.h>
#include <libcorpus2/tagsetmanager.h>
/**
 * Minimal driver for PoliqarpClient: opens the corpus given as argv[1],
 * runs the Poliqarp query given as argv[2], and writes the focus token
 * of every match to stdout using the "plain" corpus2 writer.
 * Returns 0 on success, 1 on a runtime error, 2 on bad usage.
 */
int main(int argc, char** argv)
{
	std::cerr << "C2PQTEST\n";
	if (argc < 3) {
		// Fix: previously exited silently with code 2; tell the user
		// what the expected arguments are.
		std::cerr << "Usage: " << argv[0] << " <corpus-path> <query>\n";
		return 2;
	}
	try {
		// The wrapper currently hard-codes the KIPI tagset (wip).
		const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
		std::cerr << argv[1] << "\n";
		std::cerr << argv[2] << "\n";
		Corpus2::PoliqarpClient pqc(tagset, argv[1]);
		pqc.compile_query(argv[2]);
		pqc.execute_query();
		boost::shared_ptr<Corpus2::TokenWriter> writer;
		writer = Corpus2::TokenWriter::create_stream_writer("plain", std::cout, tagset);
		// write_token_dispose takes ownership of the raw Token*.
		while (Corpus2::Token* t = pqc.get_next_focus_token()) {
			writer->write_token_dispose(t);
		}
	} catch (const std::exception& e) {
		// Fix: PoliqarpClient throws on open/compile/execute failures;
		// report the message instead of letting the exception abort.
		std::cerr << "Error: " << e.what() << "\n";
		return 1;
	}
	return 0;
}
#include "pqclient.h"
#include <boost/make_shared.hpp>
// C-linkage callback that libpoliqarp invokes when new results become
// available asynchronously (declared in pqclient.h). This wrapper
// drives the match buffer synchronously through poliqarp_produce(), so
// the notification is deliberately a no-op.
extern "C" {
void async_notify_new_results(void* session)
{
}
}
namespace Corpus2
{
/**
 * Open the Poliqarp corpus at @a path and prepare a reusable match
 * buffer. Tags parsed from interpretations will use @a tagset.
 *
 * @throws Corpus2Error if the poliqarp library cannot be initialised,
 *         the corpus cannot be opened, or the buffer cannot be created.
 */
PoliqarpClient::PoliqarpClient(const Tagset& tagset, const std::string path)
	: tagset_(tagset)
{
	query_compiled_ = false;
	count_so_far_ = 0;
	err_ = 0;
	curr_chunk_doc_id_ = 0;
	poliqarp_error error = poliqarp_error_none;
	// Global library initialisation; "" selects the default config.
	if (poliqarp_create("", &error) != 0) {
		throw Corpus2Error(poliqarp_error_message_get(&error));
	}
	progress_init(&progress_);
	if (poliqarp_open_corpus(&corpus_, path.c_str(), &progress_, &error) == -1) {
		// Fix: the destructor never runs for a throwing constructor, so
		// the global state acquired by poliqarp_create() above was
		// leaked. Copy the message, tear down, then throw.
		std::string msg = poliqarp_error_message_get(&error);
		poliqarp_destroy();
		throw Corpus2Error(msg);
	}
	// Capacity of 1000 matches; execute_query() refills in 1000-chunks.
	// Fix: the return value was previously ignored.
	if (poliqarp_create_match_buffer(&buffer_, 1000) == -1) {
		poliqarp_close_corpus(&corpus_);
		poliqarp_destroy();
		throw Corpus2Error("failed to create match buffer");
	}
	poliqarp_corpus_info cinfo;
	poliqarp_get_corpus_info(&corpus_, &cinfo);
	corpus_size_ = cinfo.num_segments;
}
/**
 * Release all poliqarp resources in reverse order of acquisition: the
 * compiled query first (it was created against corpus_ and may still
 * reference it), then the match buffer, the corpus, and finally the
 * global library state.
 *
 * Fix: the original closed the corpus *before* destroying the query
 * that was compiled against it (compile_query passes &corpus_ into
 * poliqarp_create_query), risking use of freed corpus data. Also drops
 * a stray ';' after the function body.
 */
PoliqarpClient::~PoliqarpClient()
{
	if (query_compiled_) {
		poliqarp_destroy_query(&query_);
		query_compiled_ = false;
	}
	poliqarp_destroy_match_buffer(&buffer_);
	poliqarp_close_corpus(&corpus_);
	poliqarp_destroy();
}
void PoliqarpClient::compile_query(const std::string & q)
{
count_so_far_ = 0;
last_query_ = q;
if (query_compiled_) {
poliqarp_destroy_query(&query_);
query_compiled_ = false;
}
poliqarp_error error = poliqarp_error_none;
if (q.empty()) {
throw Corpus2Error("EmptyQuery");
} else if (poliqarp_create_query(&query_, q.c_str(), &corpus_,
0, NULL, NULL, &error) == -1) {
throw Corpus2Error(std::string("QueryFailed: ") + poliqarp_error_message_get(&error));
} else {
query_compiled_ = true;
}
}
// Re-compile the last query string passed to compile_query(), which
// also zeroes count_so_far_ -- effectively rewinding result iteration.
// Throws Corpus2Error ("EmptyQuery") if no query was compiled before.
void PoliqarpClient::reset_query()
{
compile_query(last_query_);
}
/**
 * Run the compiled query, (re)filling the internal match buffer with up
 * to 1000 matches, refreshing info_ and resetting the buffer cursor.
 *
 * @throws Corpus2Error if no query is compiled, or if the library
 *         reports an error while producing matches or reading the
 *         buffer info.
 */
void PoliqarpClient::execute_query()
{
	if (!query_compiled_) {
		throw Corpus2Error("Query not compiled");
	}
	poliqarp_forget(&buffer_);
	if (poliqarp_produce(&buffer_, 1000, &query_, &progress_, NULL, 0, 1000)) {
		throw Corpus2Error("query execution error");
	}
	if (poliqarp_get_match_buffer_info(&buffer_, &info_)) {
		throw Corpus2Error("buffer read error");
	}
	count_so_far_ += buffer_.used;
	buffer_pos_ = 0;
}
// Fetch the next query match into @a match, transparently refilling the
// buffer from the query when the current chunk has been consumed.
// Returns true and fills @a match on success, false when no matches
// remain.
bool PoliqarpClient::next_match(poliqarp_match& match)
{
if (info_.used > 0) {
if (buffer_pos_ < info_.used) {
// An unconsumed match is still in the buffer.
poliqarp_get_match(&buffer_, &match, buffer_pos_++);
return true;
} else if (info_.used == buffer_.capacity) {
// The buffer was completely full, so more matches may exist:
// refill it. A partially filled buffer means the query was
// exhausted, and we fall through to return false instead.
poliqarp_forget(&buffer_);
execute_query();
if (info_.used > 0) {
// execute_query() refreshed info_ and reset buffer_pos_ to 0.
poliqarp_get_match(&buffer_, &match, buffer_pos_++);
return true;
}
}
}
return false;
}
/**
 * Return the focus token of the next match, or NULL when the matches
 * are exhausted. Ownership of the returned Token passes to the caller.
 * (Also drops a stray ';' the original had after the function body.)
 */
Token* PoliqarpClient::get_next_focus_token()
{
	poliqarp_match m;
	return next_match(m) ? get_token(m.focus) : NULL;
}
/**
 * Return the token range [start, end) of the next match as a Sentence,
 * or a null Sentence::Ptr when the matches are exhausted.
 */
Sentence::Ptr PoliqarpClient::get_next_match_sequence()
{
	poliqarp_match m;
	if (!next_match(m)) {
		return Sentence::Ptr();
	}
	return get_token_range(m.start, m.end);
}
// Build a corpus2 Token from corpus segment @a pos: the orth comes from
// the segment text and one Lexeme is added per disambiguated
// interpretation. Ownership of the returned Token passes to the caller.
Token* PoliqarpClient::get_token(size_t pos)
{
poliqarp_segment segment;
poliqarp_segment_info info;
poliqarp_interpretation_set set;
poliqarp_interpretation_set_info sinfo;
poliqarp_get_segment(&segment, &corpus_, pos);
poliqarp_get_segment_info(&segment, &info);
// Only disambiguated interpretations are kept, not all readings.
poliqarp_get_disambiguated_interpretations(&segment, &set);
poliqarp_get_interpretation_set_info(&set, &sinfo);
// NOTE(review): std::auto_ptr is deprecated (removed in C++17); it is
// used here only for exception safety while lexemes are added --
// migrate to std::unique_ptr when the toolchain allows.
std::auto_ptr<Token> res(new Token());
// NOTE(review): the Space whitespace attribute is set when
// space_before is *false*, which looks inverted -- confirm against
// the Token's default whitespace value before changing.
if (!info.space_before) {
res->set_wa(PwrNlp::Whitespace::Space);
}
res->set_orth_utf8(info.text);
for (size_t i = 0; i < sinfo.size; i++) {
poliqarp_interpretation interp;
poliqarp_interpretation_info iinfo;
poliqarp_get_interpretation(&set, &interp, i);
poliqarp_get_interpretation_info(&interp, &iinfo);
// parse_simple_tag may throw on a malformed tag; the auto_ptr then
// frees the partially built token.
Tag tag = tagset_.parse_simple_tag(iinfo.tag);
res->add_lexeme(Lexeme(UnicodeString::fromUTF8(iinfo.base), tag));
}
return res.release();
}
// Return the token sequence of the next match as a Sentence, refilling
// the buffer via execute_query() when the current chunk is consumed,
// and record the match's document id in curr_chunk_doc_id_.
// Returns a null Ptr when no matches remain.
Sentence::Ptr PoliqarpClient::get_next_sequence(bool whole_sentence)
{
Sentence::Ptr sentence;
if (info_.used > 0) {
if (buffer_pos_ < info_.used) {
struct poliqarp_match poli_match;
poliqarp_get_match(&buffer_, &poli_match, buffer_pos_++);
curr_chunk_doc_id_ = poli_match.document;
if (whole_sentence) {
// NOTE(review): whole-sentence extraction is disabled (wip);
// with whole_sentence == true a match is consumed but a null
// sentence is returned. Confirm this is intended.
//sentence = get_token_range(poli_match.withinStart, poli_match.withinEnd);
} else {
sentence = get_token_range(poli_match.start, poli_match.end);
}
} else {
// Buffer consumed: fetch the next chunk and recurse. The
// recursion terminates because an empty refill leaves
// info_.used == 0, so the outer if is skipped next time.
execute_query();
sentence = get_next_sequence(whole_sentence);
}
}
return sentence;
}
/**
 * Build a Sentence from the corpus segments in the half-open range
 * [from, to); the sentence takes ownership of the new tokens.
 */
Sentence::Ptr PoliqarpClient::get_token_range(size_t from, size_t to)
{
	Sentence::Ptr sent = boost::make_shared<Sentence>();
	for (size_t idx = from; idx < to; ++idx) {
		sent->append(get_token(idx));
	}
	return sent;
}
// Number of matches accumulated by execute_query() /
// only_count_results() since the current query was compiled.
size_t PoliqarpClient::get_count_of_matches_so_far()
{
return count_so_far_;
}
// Drain the remaining matches of the compiled query, counting them
// without building any tokens; returns the accumulated total in
// count_so_far_ (to which execute_query() may already have added).
// NOTE(review): the loop continues while poliqarp_produce() returns
// non-zero, but execute_query() treats a non-zero return as an error.
// The two call sites interpret the return convention in opposite ways;
// verify against the poliqarp API before relying on this count.
size_t PoliqarpClient::only_count_results()
{
//countSoFar = 0;
if (query_compiled_) {
while (poliqarp_produce(&buffer_, 1000, &query_, &progress_, NULL, 0, 1000) &&
poliqarp_get_match_buffer_info(&buffer_, &info_)==0 &&
info_.used > 0) {
count_so_far_ += info_.used;
poliqarp_forget(&buffer_);
}
}
return count_so_far_;
}
// Total number of segments (tokens) in the open corpus, cached from
// poliqarp_get_corpus_info() at construction time.
size_t PoliqarpClient::get_corpus_size() const
{
return corpus_size_;
};
/**
 * Current scan position (query_.last_context.index) of the compiled
 * query within the corpus; 0 when no query is compiled.
 * (Also drops a stray ';' the original had after the function body.)
 */
size_t PoliqarpClient::get_corpus_pos() const
{
	return query_compiled_ ? query_.last_context.index : 0;
}
}
......@@ -3,22 +3,69 @@
#include <boost/utility.hpp>
extern "C" {
// The poliqarp C headers use `this` as a struct member name, which is a
// reserved keyword in C++; temporarily rename it for the include.
#define this this_
#include <poliqarp.h>
#undef this
// Async-results callback required by libpoliqarp; defined (as a no-op)
// in pqclient.cpp.
void async_notify_new_results(void* session);
}
#include <libcorpus2/chunk.h>
namespace Corpus2
{
/**
 * Thin C++/corpus2 wrapper around the libpoliqarp C API: owns one open
 * corpus, at most one compiled query and a reusable match buffer, and
 * converts raw poliqarp segments into corpus2 Token/Sentence objects.
 * Non-copyable because it holds raw poliqarp C handles.
 *
 * NOTE(review): this listing is a unified-diff view without +/- markers,
 * so some declarations appear in both their old and new form (e.g. the
 * one- vs two-argument constructor, int- vs void-returning
 * execute_query/compile_query). The pqclient.cpp definitions match the
 * void-returning, two-argument variants.
 */
class PoliqarpClient : boost::noncopyable
{
public:
PoliqarpClient(const std::string path);
// Opens the corpus at `path`; throws Corpus2Error on failure.
PoliqarpClient(const Tagset& tagset, const std::string path);
~PoliqarpClient();
void reload_corpus(const std::string& path);
void restart();
int execute_query();
int compile_query(const std::string& q);
// Compiles `q`, replacing any previous query; throws on empty/invalid.
void compile_query(const std::string& q);
// Re-compiles the last query, rewinding iteration.
void reset_query();
// Fills the match buffer with up to 1000 matches; throws on error.
void execute_query();
int has_error();
// Low-level iteration: fills `match`, refilling the buffer as needed.
bool next_match(poliqarp_match& match);
// Caller owns the returned Token; NULL when matches are exhausted.
Token* get_next_focus_token();
Sentence::Ptr get_next_match_sequence();
// Builds a Token from corpus segment `pos`; caller owns it.
Token* get_token(size_t pos);
// Builds a Sentence from segments in [from, to).
Sentence::Ptr get_token_range(size_t from, size_t to);
Sentence::Ptr get_next_sequence(bool whole_sentence);
size_t get_count_of_matches_so_far();
// Drains and counts remaining matches without building tokens.
size_t only_count_results();
// Segment count cached at construction.
size_t get_corpus_size() const;
size_t get_corpus_pos() const;
private:
// Tagset used to parse interpretation tags into corpus2 Tags.
const Tagset& tagset_;
boost::shared_ptr<Sentence> match_;
boost::shared_ptr<Chunk> document_;
// Cursor into the current match buffer chunk.
size_t buffer_pos_;
// Matches produced since the current query was compiled.
size_t count_so_far_;
int err_;
std::string last_error_;
// Last string passed to compile_query(), used by reset_query().
std::string last_query_;
// NOTE(review): the following members are not used by the visible
// pqclient.cpp -- presumably leftovers from the code this was
// adapted from; confirm before removing.
bool quiet_;
char *corpusname_;
char *querytext_;
bool tags_context_;
bool tags_match_;
// True while query_ holds a live compiled query.
bool query_compiled_;
size_t corpus_size_;
// Document id of the most recently consumed match.
size_t curr_chunk_doc_id_;
struct poliqarp_corpus corpus_;
struct poliqarp_query query_;
struct poliqarp_match_buffer buffer_;
struct poliqarp_match_buffer_info info_;
progress_t progress_;
void *exception_data_;
};
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment