diff --git a/tests/datarule.cpp b/tests/datarule.cpp index b85e858fec5051ac770d2119bf34ae049fb53acb..871f2a548c11410f198babda3ea1074268164ba2 100644 --- a/tests/datarule.cpp +++ b/tests/datarule.cpp @@ -8,6 +8,8 @@ #include <libcorpus2/util/settings.h> #include <libcorpus2/tagsetmanager.h> #include <libcorpus2/io/xcesreader.h> +#include <libcorpus2/io/cclreader.h> +#include <libcorpus2/ann/annotatedsentence.h> #include <libwccl/sentencecontext.h> #include <libwccl/parser/Parser.h> @@ -56,8 +58,16 @@ boost::shared_ptr<Corpus2::Chunk> get_corpus(const std::string& path, const Corp if (i != corpus_cache.end()) { return i->second; } else { - Corpus2::XcesReader xr(tagset, path); - boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk(); + boost::shared_ptr<Corpus2::Chunk> chunk; + boost::filesystem::path fp(path); + std::string fn = fp.filename(); + if (fn.find("ccl") == fn.npos) { + Corpus2::XcesReader xr(tagset, path); + chunk = xr.get_next_chunk(); + } else { + Corpus2::CclReader cr(tagset, path); + chunk = cr.get_next_chunk(); + } corpus_cache.insert(std::make_pair(path, chunk)); return chunk; } @@ -94,6 +104,45 @@ void dump_lexemes(const std::set<Corpus2::Lexeme>& lex, std::set<Corpus2::Lexeme } } +void check_sentences(int sentence_i, const Corpus2::Tagset& tagset, + const Corpus2::Sentence::Ptr& sentence, + const Corpus2::Sentence::Ptr& expected_sentence) +{ + BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size()); + for (size_t ti = 0; ti < sentence->size(); ++ti) { + Corpus2::Token& token = *sentence->tokens()[ti]; + Corpus2::Token& expected_token = *expected_sentence->tokens()[ti]; + BOOST_CHECK_EQUAL(token.orth_utf8(), expected_token.orth_utf8()); + std::set<Corpus2::Lexeme> lex; + std::copy(token.lexemes().begin(), token.lexemes().end(), + std::inserter(lex, lex.begin())); + std::set<Corpus2::Lexeme> expected_lex; + std::copy(expected_token.lexemes().begin(), expected_token.lexemes().end(), + std::inserter(expected_lex, 
expected_lex.begin())); + if (lex != expected_lex) { + BOOST_ERROR("Lexeme mismatch in sentence " << sentence_i + << ", token " << ti << " [" << expected_token.orth_utf8() << "]" + << " (" << lex.size() << ", expected " << expected_lex.size() << ")"); + dump_lexemes(lex, expected_lex, tagset); + } + } + boost::shared_ptr<Corpus2::AnnotatedSentence> annotated, expected_annotated; + annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence); + expected_annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(expected_sentence); + if (expected_annotated) { + BOOST_REQUIRE(annotated); + foreach (const Corpus2::AnnotatedSentence::chan_map_t::value_type& v, expected_annotated->all_channels()) { + std::string channel_name = v.first; + BOOST_REQUIRE(annotated->has_channel(channel_name)); + const Corpus2::AnnotationChannel& expected_channel = v.second; + std::string expected_channel_data = channel_name + ":" + expected_channel.dump_alpha(); + Corpus2::AnnotationChannel& channel = annotated->get_channel(channel_name); + std::string channel_data = channel_name + ":" + channel.dump_alpha(); + BOOST_CHECK_EQUAL(channel_data, expected_channel_data); + } + } +} + void test_one_rule_item_actual(const rule_compare_test& c) { const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(c.tagset); @@ -109,33 +158,36 @@ void test_one_rule_item_actual(const rule_compare_test& c) std::string rf = c.rule_file.string(); std::ifstream is(rf.c_str()); BOOST_REQUIRE(is.good()); - boost::shared_ptr<Wccl::TagRuleSequence> rules = parser.parseTagRuleSequence(is); + + + boost::shared_ptr<Wccl::TagRuleSequence> rules; + boost::shared_ptr<Wccl::MatchRule> matchr; + + std::string fn = c.rule_file.filename(); + if (fn.find("match") == fn.npos) { + rules = parser.parseTagRuleSequence(is); + } else { + matchr = parser.parseMatchRule(is); + } for (size_t i = 0; i < chunk->sentences().size(); ++i) { Corpus2::Sentence::Ptr sentence = chunk->sentences()[i]->clone_shared(); 
Corpus2::Sentence::Ptr expected_sentence = expected->sentences()[i]; BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size()); - rules->execute_once(sentence); - BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size()); - for (size_t ti = 0; ti < sentence->size(); ++ti) { - Corpus2::Token& token = *sentence->tokens()[ti]; - Corpus2::Token& expected_token = *expected_sentence->tokens()[ti]; - BOOST_CHECK_EQUAL(token.orth_utf8(), expected_token.orth_utf8()); - std::set<Corpus2::Lexeme> lex; - std::copy(token.lexemes().begin(), token.lexemes().end(), - std::inserter(lex, lex.begin())); - std::set<Corpus2::Lexeme> expected_lex; - std::copy(expected_token.lexemes().begin(), expected_token.lexemes().end(), - std::inserter(expected_lex, expected_lex.begin())); - if (lex != expected_lex) { - BOOST_ERROR("Lexeme mismatch in sentence " << i - << ", token " << ti << " [" << expected_token.orth_utf8() << "]" - << " (" << lex.size() << ", expected " << expected_lex.size() << ")"); - dump_lexemes(lex, expected_lex, tagset); - } + if (rules) { + rules->execute_once(sentence); + } else if (matchr) { + boost::shared_ptr<Corpus2::AnnotatedSentence> annotated; + annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence); + BOOST_REQUIRE(annotated); + matchr->apply(annotated); + } else { + BOOST_ERROR("Dark forces"); } + check_sentences(i, tagset, sentence, expected_sentence); } } + struct init_status { path input; diff --git a/tests/rules-data/README b/tests/rules-data/README index fcb85ff94fdbf8eec66d413b81e437a60ef73d90..b753c05b9b2b57bac7a6b9ba755a57c85ba3b7a5 100644 --- a/tests/rules-data/README +++ b/tests/rules-data/README @@ -1,7 +1,12 @@ -Test cases are defined by .ccl files, one .ccl file is one testcase. +Test cases are defined by .ccl files, one .ccl file is one testcase. -A test case loads a corpus from an .xml file thet is not an .out.xml file from the test case directory, or directories above if there are none. 
Behavior is undefined if there is more than one .xml file. Only the first chunk is processed (all sentences from the chunk). +If the test case filename contains the string "match", it is treated as a match rule, otherwise a disambiguation rule. Match rules require CCL input files as described below. + +A test case loads a corpus from an .xml file that is not an .out.xml file from the test case directory, or directories above if there are none. +Behavior is undefined if there is more than one .xml file in a directory. Only the first chunk is processed (all sentences from the chunk). +If the xml filename contains the string "ccl", it will be read as a CCL format xml as opposed to the default XCES. A foo.ccl file should be accompanied by a foo.out.xml file defining the expected output. Output is compared intelligently, lexeme order / duplicates does not matter. +If the output filename contains the string "ccl", it will be read as a CCL file as in the input file. Additionally, annotations will be compared. A magic file with the extension .is-the-tagset defines the tagset for testcases in this directory and all subdirectories, unless overrden by another .is-the-tagset file.