#include "datadriven.h"

#include <antlr/RecognitionException.hpp>
#include <libpwrutils/util.h>
#include <libpwrutils/foreach.h>
#include <libpwrutils/pathsearch.h>
#include <libcorpus2/util/settings.h>
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/io/xcesreader.h>
#include <libcorpus2/io/cclreader.h>
#include <libcorpus2/ann/annotatedsentence.h>

#include <libwccl/sentencecontext.h>
#include <libwccl/parser/Parser.h>
#include <libwccl/ops/funexeccontext.h>

#include <algorithm>
#include <fstream>
#include <boost/filesystem/fstream.hpp>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <boost/bind.hpp>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/test/unit_test.hpp>
#include <boost/test/parameterized_test.hpp>

using boost::filesystem::directory_iterator;
using boost::filesystem::exists;
using boost::filesystem::is_directory;
using boost::filesystem::path;
using boost::filesystem::ifstream;

namespace {

/**
 * Description of a single data-driven rule test: the rules in rule_file are
 * applied to the sentences of corpus_file and the result is compared with
 * the sentences read from output_corpus.
 */
struct rule_compare_test
{
	/// Tagset name, resolved through Corpus2::get_named_tagset.
	std::string tagset;
	/// Input corpus the rules are run on.
	path corpus_file;
	/// WCCL file with the rules under test.
	path rule_file;
	/// Corpus holding the expected result.
	path output_corpus;
};

/// Input corpora already loaded, keyed by the path string they were read from.
typedef std::map<std::string, boost::shared_ptr<Corpus2::Chunk> > corpus_cache_t;
corpus_cache_t corpus_cache;

/**
 * Return the first chunk of the corpus stored at the given path, loading it
 * on first use and serving it from corpus_cache afterwards.
 *
 * The reader is chosen from the file name: names containing "ccl" are read
 * with Corpus2::CclReader, everything else with Corpus2::XcesReader.
 *
 * @param path   corpus file path (also the cache key)
 * @param tagset tagset handed to the reader
 * @return whatever the reader's first get_next_chunk() call returned; this
 *         value is cached as-is, including a null pointer
 */
boost::shared_ptr<Corpus2::Chunk> get_corpus(const std::string& path,
		const Corpus2::Tagset& tagset)
{
	corpus_cache_t::const_iterator cached = corpus_cache.find(path);
	if (cached != corpus_cache.end()) {
		return cached->second;
	}

	boost::shared_ptr<Corpus2::Chunk> chunk;
	boost::filesystem::path fp(path);
	std::string fn = fp.filename();
	if (fn.find("ccl") == fn.npos) {
		Corpus2::XcesReader xr(tagset, path);
		chunk = xr.get_next_chunk();
	} else {
		Corpus2::CclReader cr(tagset, path);
		chunk = cr.get_next_chunk();
	}
	corpus_cache.insert(std::make_pair(path, chunk));
	return chunk;
}

/**
 * Print a merge-style diff of two lexeme sets to std::cerr.
 *
 * Both sets are walked in their sorted order; each lexeme is reported as
 * "EXTRA" (only in lex), "MISSING" (only in expected_lex) or "OK" (in both).
 *
 * @param lex          lexemes actually obtained
 * @param expected_lex lexemes that were expected
 * @param tagset       tagset used to turn tags into strings
 */
void dump_lexemes(const std::set<Corpus2::Lexeme>& lex,
		const std::set<Corpus2::Lexeme>& expected_lex,
		const Corpus2::Tagset& tagset)
{
	std::set<Corpus2::Lexeme>::const_iterator i = lex.begin();
	std::set<Corpus2::Lexeme>::const_iterator ei = expected_lex.begin();
	while (i != lex.end() && ei != expected_lex.end()) {
		if (*i < *ei) {
			std::cerr << "EXTRA: " << i->lemma_utf8() << "\t"
				<< tagset.tag_to_string(i->tag()) << "\n";
			++i;
		} else if (*i > *ei) {
			std::cerr << "MISSING: " << ei->lemma_utf8() << "\t"
				<< tagset.tag_to_string(ei->tag()) << "\n";
			++ei;
		} else if (*i == *ei) {
			std::cerr << "OK: " << i->lemma_utf8() << "\t"
				<< tagset.tag_to_string(i->tag()) << "\n";
			++i;
			++ei;
		} else {
			// Neither <, > nor == holds: the comparison operators disagree.
			std::cerr << "DARK FORCES AT PLAY: " << i->lemma_utf8() << "\t"
				<< tagset.tag_to_string(i->tag()) << "\t"
				<< ei->lemma_utf8() << "\t"
				<< tagset.tag_to_string(ei->tag()) << "\n";
			// Step past the offending pair; without this the loop would
			// never terminate once this branch is reached.
			++i;
			++ei;
		}
	}
	while (i != lex.end()) {
		std::cerr << "EXTRA: " << i->lemma_utf8() << "\t"
			<< tagset.tag_to_string(i->tag()) << "\n";
		++i;
	}
	while (ei != expected_lex.end()) {
		std::cerr << "MISSING: " << ei->lemma_utf8() << "\t"
			<< tagset.tag_to_string(ei->tag()) << "\n";
		++ei;
	}
}

/**
 * Compare a processed sentence with the expected one.
 *
 * Requires equal token counts, then checks token by token that the orths
 * match and that the sets of lexemes are equal (a mismatch is reported and
 * the difference dumped to std::cerr). If the expected sentence is an
 * AnnotatedSentence, the processed one must be as well, and every channel of
 * the expected sentence must exist in the processed one with an identical
 * dump_alpha() representation.
 *
 * @param sentence_i        sentence index, used only in error messages
 * @param tagset            tagset used when dumping lexemes
 * @param sentence          sentence obtained by running the rules
 * @param expected_sentence sentence read from the expected-output corpus
 */
void check_sentences(int sentence_i, const Corpus2::Tagset& tagset,
		const Corpus2::Sentence::Ptr& sentence,
		const Corpus2::Sentence::Ptr& expected_sentence)
{
	BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
	for (size_t ti = 0; ti < sentence->size(); ++ti) {
		Corpus2::Token& token = *sentence->tokens()[ti];
		Corpus2::Token& expected_token = *expected_sentence->tokens()[ti];
		BOOST_CHECK_EQUAL(token.orth_utf8(), expected_token.orth_utf8());

		// Compare as sets so that lexeme order and duplicates do not matter.
		std::set<Corpus2::Lexeme> lex;
		std::copy(token.lexemes().begin(), token.lexemes().end(),
			std::inserter(lex, lex.begin()));
		std::set<Corpus2::Lexeme> expected_lex;
		std::copy(expected_token.lexemes().begin(),
			expected_token.lexemes().end(),
			std::inserter(expected_lex, expected_lex.begin()));
		if (lex != expected_lex) {
			BOOST_ERROR("Lexeme mismatch in sentence " << sentence_i
				<< ", token " << ti << " [" << expected_token.orth_utf8() << "]"
				<< " (" << lex.size() << ", expected " << expected_lex.size() << ")");
			dump_lexemes(lex, expected_lex, tagset);
		}
	}

	boost::shared_ptr<Corpus2::AnnotatedSentence> annotated, expected_annotated;
	annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
	expected_annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(expected_sentence);
	if (expected_annotated) {
		BOOST_REQUIRE(annotated);
		foreach (const Corpus2::AnnotatedSentence::chan_map_t::value_type& v,
				expected_annotated->all_channels()) {
			std::string channel_name = v.first;
			BOOST_REQUIRE_MESSAGE(annotated->has_channel(channel_name),
				"Expected channel " + channel_name);
			const Corpus2::AnnotationChannel& expected_channel = v.second;
			std::string expected_channel_data =
				channel_name + ":" + expected_channel.dump_alpha();
			Corpus2::AnnotationChannel& channel =
				annotated->get_channel(channel_name);
			std::string channel_data =
				channel_name + ":" + channel.dump_alpha();
			BOOST_CHECK_EQUAL(channel_data, expected_channel_data);
		}
	}
}

/**
 * Run one rule test: load the input corpus and the expected output, parse
 * the rule file, apply its tag rules and match rules to a copy of every
 * input sentence and compare the result with the expected sentence.
 *
 * Exceptions thrown by the readers or the parser propagate to the caller.
 *
 * @param c the test case to run
 */
void test_one_rule_item_actual(const rule_compare_test& c)
{
	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(c.tagset);
	boost::shared_ptr<Corpus2::Chunk> chunk =
		get_corpus(c.corpus_file.string(), tagset);
	if (!chunk) {
		// Must abort the test case here: chunk is dereferenced below.
		BOOST_FAIL("Empty chunk loaded");
	}

	// The expected output is read as CCL when its file name mentions "ccl"
	// or "match", and as XCES otherwise.
	boost::shared_ptr<Corpus2::TokenReader> reader;
	std::string ofn = c.output_corpus.filename();
	if (ofn.find("ccl") != ofn.npos || ofn.find("match") != ofn.npos) {
		reader = Corpus2::TokenReader::create_path_reader("ccl", tagset,
			c.output_corpus.string());
	} else {
		reader = Corpus2::TokenReader::create_path_reader("xces", tagset,
			c.output_corpus.string());
	}
	reader->set_option("loose");
	boost::shared_ptr<Corpus2::Chunk> expected = reader->get_next_chunk();
	if (!expected) {
		// Same reasoning as above: expected is dereferenced right away.
		BOOST_FAIL("Empty expected chunk loaded");
	}
	BOOST_REQUIRE_EQUAL(chunk->sentences().size(), expected->sentences().size());

	Wccl::Parser parser(tagset);
	std::string rf = c.rule_file.string();
	std::ifstream is(rf.c_str());
	BOOST_REQUIRE(is.good());

	boost::shared_ptr<Wccl::WcclFile> parsed;
	parsed = parser.parseWcclFile(is, LIBWCCL_TEST_DATA_DIR);
	BOOST_REQUIRE(parsed);

	for (size_t i = 0; i < chunk->sentences().size(); ++i) {
		// Work on a clone so the cached input corpus stays untouched for
		// other test cases sharing it.
		Corpus2::Sentence::Ptr sentence = chunk->sentences()[i]->clone_shared();
		Corpus2::Sentence::Ptr expected_sentence = expected->sentences()[i];
		BOOST_REQUIRE_EQUAL(sentence->size(), expected_sentence->size());
		if (parsed->has_tag_rules()) {
			parsed->get_tag_rules_ptr()->execute_once(sentence);
		}
		if (parsed->has_match_rules()) {
			boost::shared_ptr<Corpus2::AnnotatedSentence> annotated;
			annotated = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
			BOOST_REQUIRE(annotated);
			parsed->get_match_rules_ptr()->apply_all(annotated);
		}
		check_sentences(i, tagset, sentence, expected_sentence);
	}
}

/**
 * Settings collected while descending the test data tree. Passed by value to
 * init_subdir so that a subdirectory starts from its parent's values and can
 * override them without affecting its siblings.
 */
struct init_status
{
	/// Input corpus: the most recently seen *.xml file that is not *.out.xml.
	path input;
	/// Tagset name: the stem of the most recently seen *.is-the-tagset file.
	std::string tagset;
};

/**
 * Recursively collect rule test cases from a directory.
 *
 * In each directory: a "*.xml" file whose stem does not end in ".out" becomes
 * the input corpus, a "*.is-the-tagset" file names the tagset via its stem,
 * and every "*.ccl" file is a rule file. A rule file yields a test case only
 * if a regular file with the same name but the ".out.xml" extension exists;
 * otherwise a note is written to std::cerr.
 *
 * @param dir    directory to scan
 * @param ps     accumulated directory prefix, used only in the log message
 * @param tests  output vector the discovered test cases are appended to
 * @param status input corpus and tagset inherited from the parent directory
 * @return number of test cases added from dir and all of its subdirectories
 */
int init_subdir(const path& dir, std::string ps,
		std::vector<rule_compare_test>& tests,
		init_status status = init_status())
{
	int count = 0;
	ps += dir.string();
	if (!ps.empty()) {
		ps += Corpus2::Path::Instance().get_path_separator();
	}

	// default-constructed is past-the-end
	directory_iterator end_itr;

	// Collected into sets first so they are processed in sorted order.
	std::set<path> txt_tests;
	std::set<path> subdirs;

	for (directory_iterator itr(dir); itr != end_itr; ++itr) {
		if (is_directory(itr->status())) {
			subdirs.insert(itr->path());
		} else {
			if (itr->path().extension() == ".ccl") {
				txt_tests.insert(itr->path());
			} else if (itr->path().extension() == ".xml") {
				if (!boost::algorithm::ends_with(itr->path().stem(), ".out")) {
					status.input = itr->path();
				}
			} else if (itr->path().extension() == ".is-the-tagset") {
				status.tagset = itr->path().stem();
			}
		}
	}

	foreach (const path& s, txt_tests) {
		path o = s;
		o.replace_extension(".out.xml");
		if (boost::filesystem::is_regular_file(o)) {
			rule_compare_test c;
			c.corpus_file = status.input;
			c.rule_file = s;
			c.output_corpus = o;
			c.tagset = status.tagset;
			tests.push_back(c);
			++count;
		} else {
			std::cerr << "No output file: " << o.string() << "\n";
		}
	}
	// Plural for every count except exactly one (including zero).
	BOOST_TEST_MESSAGE("Found " << count << " valid data test case"
		<< (count != 1 ? "s" : "")
		<< " in " << dir
		<< " [" << ps << "]"
		);

	foreach (const path& s, subdirs) {
		count += init_subdir(s, ps, tests, status);
	}
	return count;
}

/**
 * Test case entry point: runs test_one_rule_item_actual and turns PwrNlp and
 * ANTLR exceptions into Boost.Test errors carrying the exception details.
 *
 * @param c the test case to run
 */
void test_one_rule_item(const rule_compare_test& c)
{
	try {
		test_one_rule_item_actual(c);
	} catch (PwrNlp::PwrNlpError& e) {
		BOOST_ERROR("Caught " << e.scope() << " exception: \n" << e.info());
	} catch (antlr::RecognitionException &e) {
		BOOST_ERROR("Caught " << "ANTLR" << " exception: \n"
			<< e.getFileLineColumnString() << " " << e.getMessage());
	} catch (antlr::ANTLRException &e) {
		BOOST_ERROR("Caught " << "ANTLR" << " exception: \n"
			<< " " << e.getMessage());
	}
}

} // namespace

/**
 * Register one Boost test case per rule test found under the data directory.
 *
 * @param ts   test suite the cases are added to
 * @param path data directory to scan; when empty, LIBWCCL_TEST_DATA_DIR
 *             "rules-data" is used instead
 *
 * If the directory does not exist nothing is registered.
 */
void init_data_rule_suite(boost::unit_test::test_suite *ts, const std::string& path)
{
	std::string subdir_name = LIBWCCL_TEST_DATA_DIR "rules-data";
	if (!path.empty()) {
		subdir_name = path;
	}
	if (!exists(subdir_name)) {
		BOOST_TEST_MESSAGE("Rules test data subdir does not exist");
		std::cerr << "Rules test data subdir does not exist: "
			<< subdir_name << "\n";
		// Scanning a missing directory would throw from directory_iterator
		// while the suite is still being set up, so stop here.
		return;
	}

	std::vector<rule_compare_test> compares;
	init_subdir(subdir_name, "", compares);
	foreach (const rule_compare_test& ci, compares) {
		// Name each case after its rule file path relative to the data dir.
		std::string rel_path = boost::algorithm::replace_first_copy(
			ci.rule_file.string(), subdir_name, "");
		std::string name = "rule_data_test:" + rel_path;
		//std::cout << name << "\n";
		ts->add(boost::unit_test::make_test_case(
			boost::bind(test_one_rule_item, ci), name));
	}
}