Skip to content
Snippets Groups Projects
Commit 97b13d27 authored by ilor's avatar ilor
Browse files

Upgrade data rule tests to optionally check annotations and use match rules (see README)

parent 4dbd3b95
Branches
No related merge requests found
......@@ -8,6 +8,8 @@
#include <libcorpus2/util/settings.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/cclreader.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <libwccl/sentencecontext.h>
#include <libwccl/parser/Parser.h>
......@@ -56,8 +58,16 @@ boost::shared_ptr<Corpus2::Chunk> get_corpus(const std::string& path, const Corp
if (i != corpus_cache.end()) {
return i->second;
} else {
Corpus2::XcesReader xr(tagset, path);
boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
boost::shared_ptr<Corpus2::Chunk> chunk;
boost::filesystem::path fp(path);
std::string fn = fp.filename();
if (fn.find("ccl") == fn.npos) {
Corpus2::XcesReader xr(tagset, path);
chunk = xr.get_next_chunk();
} else {
Corpus2::CclReader cr(tagset, path);
chunk = cr.get_next_chunk();
}
corpus_cache.insert(std::make_pair(path, chunk));
return chunk;
}
......@@ -94,6 +104,45 @@ void dump_lexemes(const std::set<Corpus2::Lexeme>& lex, std::set<Corpus2::Lexeme
}
}
/**
 * Compare a processed sentence with the expected one and report any
 * differences through Boost.Test.
 *
 * Both sentences must have the same number of tokens (a mismatch aborts the
 * test case). For every token position the orth is compared, and the lexemes
 * are compared as sets, so lexeme order and duplicates are ignored.
 *
 * If the expected sentence is an AnnotatedSentence, the processed sentence
 * must be one too, and every channel of the expected sentence must exist in
 * the processed sentence with an identical dump_alpha() rendering. Channels
 * present only in the processed sentence are not inspected.
 *
 * @param sentence_i        sentence index, used only in failure messages
 * @param tagset            tagset passed on to dump_lexemes on a mismatch
 * @param sentence          the sentence actually produced
 * @param expected_sentence the reference sentence
 */
void check_sentences(int sentence_i, const Corpus2::Tagset& tagset,
	const Corpus2::Sentence::Ptr& sentence,
	const Corpus2::Sentence::Ptr& expected_sentence)
{
	typedef std::set<Corpus2::Lexeme> lexeme_set;
	typedef boost::shared_ptr<Corpus2::AnnotatedSentence> annotated_ptr;

	// Token-by-token comparison below indexes both sentences in lockstep.
	BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());

	const size_t token_count = sentence->size();
	for (size_t ti = 0; ti < token_count; ++ti) {
		Corpus2::Token& actual_tok = *sentence->tokens()[ti];
		Corpus2::Token& wanted_tok = *expected_sentence->tokens()[ti];
		BOOST_CHECK_EQUAL(actual_tok.orth_utf8(), wanted_tok.orth_utf8());

		// Sets make the comparison insensitive to order and duplicates.
		lexeme_set actual_lex(actual_tok.lexemes().begin(),
			actual_tok.lexemes().end());
		lexeme_set wanted_lex(wanted_tok.lexemes().begin(),
			wanted_tok.lexemes().end());
		if (actual_lex == wanted_lex) {
			continue;
		}
		BOOST_ERROR("Lexeme mismatch in sentence " << sentence_i
			<< ", token " << ti << " [" << wanted_tok.orth_utf8() << "]"
			<< " (" << actual_lex.size() << ", expected " << wanted_lex.size() << ")");
		dump_lexemes(actual_lex, wanted_lex, tagset);
	}

	// Annotations are checked only when the reference sentence carries them.
	annotated_ptr wanted_ann =
		boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(expected_sentence);
	annotated_ptr actual_ann =
		boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
	if (!wanted_ann) {
		return;
	}
	BOOST_REQUIRE(actual_ann);

	foreach (const Corpus2::AnnotatedSentence::chan_map_t::value_type& entry, wanted_ann->all_channels()) {
		const std::string& name = entry.first;
		BOOST_REQUIRE(actual_ann->has_channel(name));
		const Corpus2::AnnotationChannel& wanted_chan = entry.second;
		Corpus2::AnnotationChannel& actual_chan = actual_ann->get_channel(name);
		// Prefix with the channel name so a failure message identifies the channel.
		const std::string wanted_dump = name + ":" + wanted_chan.dump_alpha();
		const std::string actual_dump = name + ":" + actual_chan.dump_alpha();
		BOOST_CHECK_EQUAL(actual_dump, wanted_dump);
	}
}
void test_one_rule_item_actual(const rule_compare_test& c)
{
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(c.tagset);
......@@ -109,33 +158,36 @@ void test_one_rule_item_actual(const rule_compare_test& c)
std::string rf = c.rule_file.string();
std::ifstream is(rf.c_str());
BOOST_REQUIRE(is.good());
boost::shared_ptr<Wccl::TagRuleSequence> rules = parser.parseTagRuleSequence(is);
boost::shared_ptr<Wccl::TagRuleSequence> rules;
boost::shared_ptr<Wccl::MatchRule> matchr;
std::string fn = c.rule_file.filename();
if (fn.find("match") == fn.npos) {
rules = parser.parseTagRuleSequence(is);
} else {
matchr = parser.parseMatchRule(is);
}
for (size_t i = 0; i < chunk->sentences().size(); ++i) {
Corpus2::Sentence::Ptr sentence = chunk->sentences()[i]->clone_shared();
Corpus2::Sentence::Ptr expected_sentence = expected->sentences()[i];
BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
rules->execute_once(sentence);
BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
for (size_t ti = 0; ti < sentence->size(); ++ti) {
Corpus2::Token& token = *sentence->tokens()[ti];
Corpus2::Token& expected_token = *expected_sentence->tokens()[ti];
BOOST_CHECK_EQUAL(token.orth_utf8(), expected_token.orth_utf8());
std::set<Corpus2::Lexeme> lex;
std::copy(token.lexemes().begin(), token.lexemes().end(),
std::inserter(lex, lex.begin()));
std::set<Corpus2::Lexeme> expected_lex;
std::copy(expected_token.lexemes().begin(), expected_token.lexemes().end(),
std::inserter(expected_lex, expected_lex.begin()));
if (lex != expected_lex) {
BOOST_ERROR("Lexeme mismatch in sentence " << i
<< ", token " << ti << " [" << expected_token.orth_utf8() << "]"
<< " (" << lex.size() << ", expected " << expected_lex.size() << ")");
dump_lexemes(lex, expected_lex, tagset);
}
if (rules) {
rules->execute_once(sentence);
} else if (matchr) {
boost::shared_ptr<Corpus2::AnnotatedSentence> annotated;
annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
BOOST_REQUIRE(annotated);
matchr->apply(annotated);
} else {
BOOST_ERROR("Dark forces");
}
check_sentences(i, tagset, sentence, expected_sentence);
}
}
struct init_status
{
path input;
......
Test cases are defined by .ccl files, one .ccl file is one testcase.
Test cases are defined by .ccl files, one .ccl file is one testcase.
A test case loads a corpus from an .xml file that is not an .out.xml file from the test case directory, or directories above if there are none. Behavior is undefined if there is more than one .xml file. Only the first chunk is processed (all sentences from the chunk).
If the test case filename contains the string "match", it is treated as a match rule, otherwise a disambiguation rule. Match rules require CCL input files as described below.
A test case loads a corpus from an .xml file that is not an .out.xml file from the test case directory, or directories above if there are none.
Behavior is undefined if there is more than one .xml file in a directory. Only the first chunk is processed (all sentences from the chunk).
If the xml filename contains the string "ccl", it will be read as a CCL format xml as opposed to the default XCES.
A foo.ccl file should be accompanied by a foo.out.xml file defining the expected output. Output is compared intelligently: lexeme order and duplicates do not matter.
If the output filename contains the string "ccl", it will be read as a CCL file, as with the input file. Additionally, annotations will be compared.
A magic file with the extension .is-the-tagset defines the tagset for testcases in this directory and all subdirectories, unless overridden by another .is-the-tagset file.
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment