From 97b13d271e040ac679d0ff10c819a1be4b9a765a Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 27 Apr 2011 13:25:06 +0200
Subject: [PATCH] Upgrade data rule tests to optionally check annotations and
 use match rules (see README)

---
 tests/datarule.cpp      | 94 ++++++++++++++++++++++++++++++++---------
 tests/rules-data/README |  9 +++-
 2 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/tests/datarule.cpp b/tests/datarule.cpp
index b85e858..871f2a5 100644
--- a/tests/datarule.cpp
+++ b/tests/datarule.cpp
@@ -8,6 +8,8 @@
 #include <libcorpus2/util/settings.h>
 #include <libcorpus2/tagsetmanager.h>
 #include <libcorpus2/io/xcesreader.h>
+#include <libcorpus2/io/cclreader.h>
+#include <libcorpus2/ann/annotatedsentence.h>
 
 #include <libwccl/sentencecontext.h>
 #include <libwccl/parser/Parser.h>
@@ -56,8 +58,16 @@ boost::shared_ptr<Corpus2::Chunk> get_corpus(const std::string& path, const Corp
 	if (i != corpus_cache.end()) {
 		return i->second;
 	} else {
-		Corpus2::XcesReader xr(tagset, path);
-		boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+		boost::shared_ptr<Corpus2::Chunk> chunk;
+		boost::filesystem::path fp(path);
+		std::string fn = fp.filename();
+		if (fn.find("ccl") == fn.npos) {
+			Corpus2::XcesReader xr(tagset, path);
+			chunk = xr.get_next_chunk();
+		} else {
+			Corpus2::CclReader cr(tagset, path);
+			chunk = cr.get_next_chunk();
+		}
 		corpus_cache.insert(std::make_pair(path, chunk));
 		return chunk;
 	}
@@ -94,6 +104,45 @@ void dump_lexemes(const std::set<Corpus2::Lexeme>& lex, std::set<Corpus2::Lexeme
 	}
 }
 
+void check_sentences(int sentence_i, const Corpus2::Tagset& tagset,
+	const Corpus2::Sentence::Ptr& sentence,
+	const Corpus2::Sentence::Ptr& expected_sentence)
+{
+	BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
+	for (size_t ti = 0; ti < sentence->size(); ++ti) {
+		Corpus2::Token& token = *sentence->tokens()[ti];
+		Corpus2::Token& expected_token = *expected_sentence->tokens()[ti];
+		BOOST_CHECK_EQUAL(token.orth_utf8(), expected_token.orth_utf8());
+		std::set<Corpus2::Lexeme> lex;
+		std::copy(token.lexemes().begin(), token.lexemes().end(),
+			std::inserter(lex, lex.begin()));
+		std::set<Corpus2::Lexeme> expected_lex;
+		std::copy(expected_token.lexemes().begin(), expected_token.lexemes().end(),
+			std::inserter(expected_lex, expected_lex.begin()));
+		if (lex != expected_lex) {
+			BOOST_ERROR("Lexeme mismatch in sentence " << sentence_i
+				<< ", token " << ti << " [" << expected_token.orth_utf8() << "]"
+				<< " (" << lex.size() << ", expected " << expected_lex.size() << ")");
+			dump_lexemes(lex, expected_lex, tagset);
+		}
+	}
+	boost::shared_ptr<Corpus2::AnnotatedSentence> annotated, expected_annotated;
+	annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
+	expected_annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(expected_sentence);
+	if (expected_annotated) {
+		BOOST_REQUIRE(annotated);
+		foreach (const Corpus2::AnnotatedSentence::chan_map_t::value_type& v, expected_annotated->all_channels()) {
+			std::string channel_name = v.first;
+			BOOST_REQUIRE(annotated->has_channel(channel_name));
+			const Corpus2::AnnotationChannel& expected_channel = v.second;
+			std::string expected_channel_data = channel_name + ":" + expected_channel.dump_alpha();
+			Corpus2::AnnotationChannel& channel = annotated->get_channel(channel_name);
+			std::string channel_data = channel_name + ":" + channel.dump_alpha();
+			BOOST_CHECK_EQUAL(channel_data, expected_channel_data);
+		}
+	}
+}
+
 void test_one_rule_item_actual(const rule_compare_test& c)
 {
 	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(c.tagset);
@@ -109,33 +158,36 @@ void test_one_rule_item_actual(const rule_compare_test& c)
 	std::string rf = c.rule_file.string();
 	std::ifstream is(rf.c_str());
 	BOOST_REQUIRE(is.good());
-	boost::shared_ptr<Wccl::TagRuleSequence> rules = parser.parseTagRuleSequence(is);
+
+
+	boost::shared_ptr<Wccl::TagRuleSequence> rules;
+	boost::shared_ptr<Wccl::MatchRule> matchr;
+
+	std::string fn = c.rule_file.filename();
+	if (fn.find("match") == fn.npos) {
+		rules = parser.parseTagRuleSequence(is);
+	} else {
+		matchr = parser.parseMatchRule(is);
+	}
 	for (size_t i = 0; i < chunk->sentences().size(); ++i) {
 		Corpus2::Sentence::Ptr sentence = chunk->sentences()[i]->clone_shared();
 		Corpus2::Sentence::Ptr expected_sentence = expected->sentences()[i];
 		BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
-		rules->execute_once(sentence);
-		BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
-		for (size_t ti = 0; ti < sentence->size(); ++ti) {
-			Corpus2::Token& token = *sentence->tokens()[ti];
-			Corpus2::Token& expected_token = *expected_sentence->tokens()[ti];
-			BOOST_CHECK_EQUAL(token.orth_utf8(), expected_token.orth_utf8());
-			std::set<Corpus2::Lexeme> lex;
-			std::copy(token.lexemes().begin(), token.lexemes().end(),
-				std::inserter(lex, lex.begin()));
-			std::set<Corpus2::Lexeme> expected_lex;
-			std::copy(expected_token.lexemes().begin(), expected_token.lexemes().end(),
-				std::inserter(expected_lex, expected_lex.begin()));
-			if (lex != expected_lex) {
-				BOOST_ERROR("Lexeme mismatch in sentence " << i
-					<< ", token " << ti << " [" << expected_token.orth_utf8() << "]"
-					<< " (" << lex.size() << ", expected " << expected_lex.size() << ")");
-				dump_lexemes(lex, expected_lex, tagset);
-			}
+		if (rules) {
+			rules->execute_once(sentence);
+		} else if (matchr) {
+			boost::shared_ptr<Corpus2::AnnotatedSentence> annotated;
+			annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
+			BOOST_REQUIRE(annotated);
+			matchr->apply(annotated);
+		} else {
+			BOOST_ERROR("Dark forces");
 		}
+		check_sentences(i, tagset, sentence, expected_sentence);
 	}
 }
 
+
 struct init_status {
 	path input;
diff --git a/tests/rules-data/README b/tests/rules-data/README
index fcb85ff..b753c05 100644
--- a/tests/rules-data/README
+++ b/tests/rules-data/README
@@ -1,7 +1,12 @@
-Test cases are defined by .ccl files, one .ccl file is one testcase. 
+Test cases are defined by .ccl files; one .ccl file is one testcase.
 
-A test case loads a corpus from an .xml file thet is not an .out.xml file from the test case directory, or directories above if there are none. Behavior is undefined if there is more than one .xml file. Only the first chunk is processed (all sentences from the chunk).
+If the test case filename contains the string "match", it is treated as a match rule, otherwise as a disambiguation rule. Match rules require CCL input files as described below.
+
+A test case loads a corpus from an .xml file (one that is not an .out.xml file) in the test case directory, or in directories above it if none is found there.
+Behavior is undefined if there is more than one .xml file in a directory. Only the first chunk is processed (all sentences from the chunk).
+If the xml filename contains the string "ccl", it will be read as CCL-format XML instead of the default XCES.
 
 A foo.ccl file should be accompanied by a foo.out.xml file defining the expected output. Output is compared intelligently, lexeme order / duplicates does not matter.
+If the output filename contains the string "ccl", it will be read as a CCL file, just like the input file. Additionally, annotations will be compared.
 
 A magic file with the extension .is-the-tagset defines the tagset for testcases in this directory and all subdirectories, unless overrden by another .is-the-tagset file.
--
GitLab
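
Illustration only, not part of the patch: under the naming conventions described in the README hunk above, a test case directory could look roughly like the sketch below. All names here are made up for the example.

    tests/rules-data/some-match-tests/
        kipi.is-the-tagset        magic file selecting the tagset for this directory (see README)
        corpus-ccl.xml            input corpus; "ccl" in the name means it is read with CclReader
        ccl-ann-match.ccl         testcase; "match" in the name means it is parsed as a match rule
        ccl-ann-match.out.xml     expected output; "ccl" in the name means it is read as CCL and
                                  annotation channels are compared as well as lexemes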