Note: the new-file hunks below were edited by hand (added includes, comments,
const-reference parameters, removed stray semicolons). The blob IDs on their
"index" lines are carried over from the original patch and no longer match the
new contents. Whitespace of the CMakeLists.txt context lines was reconstructed.

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1f22c9307483ede881cf2bebe70bd064ed1f68ff..fa3aa8bc2a254f83057f3c00546b28f1097d8cb1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,6 +6,9 @@ add_definitions(-DLIBCORPUS2_TEST_DATA_DIR="${PROJECT_SOURCE_DIR}/")
 
 add_executable( tests
 	main.cpp
+	basic.cpp
+	tag_split.cpp
+	tagset_parse.cpp
 )
 
 target_link_libraries ( tests maca ${Boost_LIBRARIES})
@@ -13,6 +16,4 @@ target_link_libraries ( tests maca ${Boost_LIBRARIES})
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})
 
-#configure_file(test-sanity.sh ${CMAKE_CURRENT_BINARY_DIR})
-#add_custom_target(test tests COMMAND ./test-sanity.sh small)
-#add_custom_target(test-large ./test-sanity.sh large)
+add_custom_target(test tests)
diff --git a/tests/basic.cpp b/tests/basic.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d9651327aca94202a2fe8232d1f6a8c7e6081ec5
--- /dev/null
+++ b/tests/basic.cpp
@@ -0,0 +1,52 @@
+// Basic checks for Corpus2::Token: construction and duplicate-lexeme handling.
+#include <boost/test/unit_test.hpp>
+
+#include <libcorpus2/token.h>
+
+// Sample tagset definition. Currently unused: its only reference is the
+// commented-out Tagset construction in token_dup_lexemes below.
+const char tagsetstr1[] = "[ATTR]\n"
+	"A tag tog other a3 \n"
+	"B data thing tag-thing thang\n"
+	"C a b c \n"
+	"[POS]\n some A B [C]\n";
+
+// A newly constructed token reports the orth and whitespace it was given
+// and starts with no lexemes.
+BOOST_AUTO_TEST_CASE( token )
+{
+	Corpus2::Token t(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces);
+	BOOST_CHECK_EQUAL(t.orth_utf8(), "ZZ");
+	BOOST_CHECK_EQUAL(t.wa(), PwrNlp::Whitespace::ManySpaces);
+	BOOST_CHECK(t.lexemes().empty());
+}
+
+// Duplicates are reported only once the same lexeme has been added twice;
+// after removal the token compares equal to the copy taken beforehand.
+BOOST_AUTO_TEST_CASE( token_dup_lexemes )
+{
+	Corpus2::Token t(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces);
+	//Corpus2::Tagset tagset(tagsetstr1);
+	Corpus2::Tag t1(Corpus2::tagset_idx_t(0), Corpus2::pos_idx_t(0));
+	Corpus2::Lexeme l1(UnicodeString::fromUTF8("aaa"), t1);
+	Corpus2::Lexeme l2(UnicodeString::fromUTF8("bbb"), t1);
+	// Empty token: nothing to detect or remove.
+	BOOST_CHECK(!t.check_duplicate_lexemes());
+	BOOST_CHECK(!t.remove_duplicate_lexemes());
+	t.add_lexeme(l1);
+	BOOST_CHECK(!t.check_duplicate_lexemes());
+	BOOST_CHECK(!t.remove_duplicate_lexemes());
+	t.add_lexeme(l2);
+	// Two distinct lexemes are not duplicates.
+	BOOST_CHECK(!t.check_duplicate_lexemes());
+	BOOST_CHECK(!t.remove_duplicate_lexemes());
+	// Snapshot before introducing a duplicate of l1.
+	Corpus2::Token tt(t);
+	t.add_lexeme(l1);
+	BOOST_CHECK(t != tt);
+	BOOST_CHECK(t.check_duplicate_lexemes());
+	BOOST_CHECK(t.remove_duplicate_lexemes());
+	BOOST_CHECK(!t.check_duplicate_lexemes());
+	BOOST_CHECK(!t.remove_duplicate_lexemes());
+	BOOST_CHECK(t == tt);
+}
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c0d93c35bf5f825c8eb08497991a724f8ae517b0
--- /dev/null
+++ b/tests/tag_split.cpp
@@ -0,0 +1,176 @@
+// Tests of how Tagset::lexemes_into_token splits tag strings using '+', '.'
+// and '_' into individual tags.
+#include <boost/test/unit_test.hpp>
+#include <boost/shared_ptr.hpp>
+#include <set>
+#include <string>
+#include <vector>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/tagset.h>
+#include <libcorpus2/token.h>
+
+BOOST_AUTO_TEST_SUITE( tag_split )
+
+// Fixture: owns a small tagset and the helper used by every case in this suite.
+struct F {
+	F() {
+		const char tagset_string[] = "[ATTR]\n"
+			"A tag tog other a3 \n"
+			"B data thing tag-thing thang\n"
+			"C a b c \n"
+			"[POS]\n some A B [C]\n";
+		tagset.reset(new Corpus2::Tagset(tagset_string));
+	}
+	boost::shared_ptr<Corpus2::Tagset> tagset;
+
+	// Feeds tag string s to the tagset, checks that the set of resulting tag
+	// strings equals expect, and returns the tags in lexeme order.
+	// Arguments are taken by const reference to avoid copying them per call.
+	std::vector<Corpus2::Tag> check_split(const std::string& s, const std::set<std::string>& expect)
+	{
+		std::set<std::string> actual;
+		std::vector<Corpus2::Tag> tags;
+		Corpus2::Token t;
+		tagset->lexemes_into_token(t, UnicodeString(), s);
+		foreach (const Corpus2::Lexeme& lex, t.lexemes()) {
+			const Corpus2::Tag& tag = lex.tag();
+			// Only a warning: an invalid tag does not fail the test by itself.
+			BOOST_WARN(tagset->validate_tag(tag, false));
+			actual.insert(tagset->tag_to_string(tag));
+			tags.push_back(tag);
+		}
+		BOOST_CHECK_EQUAL_COLLECTIONS(actual.begin(), actual.end(), expect.begin(), expect.end());
+		return tags;
+	}
+};
+
+
+BOOST_FIXTURE_TEST_CASE( plain, F )
+{
+	const char tag[] = "some:tag:data";
+	std::set<std::string> r;
+	r.insert(tag);
+	check_split(tag, r);
+}
+
+// '+' is expected to separate complete alternative tags.
+BOOST_FIXTURE_TEST_CASE( plus, F )
+{
+	const char tag[] = "some:tag:data+some:other:tag-thing";
+	std::set<std::string> result;
+	result.insert("some:tag:data");
+	result.insert("some:other:tag-thing");
+	check_split(tag, result);
+}
+
+// '.' is expected to list alternative values within one attribute position.
+BOOST_FIXTURE_TEST_CASE( dot, F )
+{
+	const char tag[] = "some:tag.tog:data";
+	std::set<std::string> result;
+	result.insert("some:tag:data");
+	result.insert("some:tog:data");
+	check_split(tag, result);
+}
+
+BOOST_FIXTURE_TEST_CASE( dots, F )
+{
+	const char tag[] = "some:tag.tog:data:a.b.c";
+	std::set<std::string> result;
+	result.insert("some:tag:data:a");
+	result.insert("some:tog:data:a");
+	result.insert("some:tag:data:b");
+	result.insert("some:tog:data:b");
+	result.insert("some:tag:data:c");
+	result.insert("some:tog:data:c");
+	check_split(tag, result);
+}
+
+BOOST_FIXTURE_TEST_CASE( dots_plus, F )
+{
+	const char tag[] = "some:tag.tog:data:a.b+some:other:thing.thang";
+	std::set<std::string> result;
+	result.insert("some:tag:data:a");
+	result.insert("some:tog:data:a");
+	result.insert("some:tag:data:b");
+	result.insert("some:tog:data:b");
+	result.insert("some:other:thing");
+	result.insert("some:other:thang");
+	check_split(tag, result);
+}
+
+// An omitted attribute value is expected to be rendered as an empty field.
+BOOST_FIXTURE_TEST_CASE( missing, F )
+{
+	const char tag[] = "some:data";
+	std::set<std::string> r;
+	r.insert("some::data");
+	check_split(tag, r);
+}
+
+// Unknown value and POS names are expected to raise Corpus2::TagParseError.
+BOOST_FIXTURE_TEST_CASE( bad_value, F )
+{
+	const char tag[] = "some:bad:data";
+	std::set<std::string> r;
+	BOOST_CHECK_THROW(
+		check_split(tag, r), Corpus2::TagParseError
+	);
+}
+
+BOOST_FIXTURE_TEST_CASE( bad_pos, F )
+{
+	const char tag[] = "something:data";
+	std::set<std::string> r;
+	BOOST_CHECK_THROW(
+		check_split(tag, r), Corpus2::TagParseError
+	);
+}
+
+// '_' is expected to stand for every value of the attribute in that position.
+BOOST_FIXTURE_TEST_CASE( underscore, F )
+{
+	const char tag[] = "some:_:data";
+	std::set<std::string> r;
+	r.insert("some:tag:data");
+	r.insert("some:tog:data");
+	r.insert("some:other:data");
+	r.insert("some:a3:data");
+	check_split(tag, r);
+}
+
+BOOST_FIXTURE_TEST_CASE( underscores, F )
+{
+	const char tag[] = "some:_:data:_";
+	std::set<std::string> r0;
+	r0.insert("some:tag:data");
+	r0.insert("some:tog:data");
+	r0.insert("some:other:data");
+	r0.insert("some:a3:data");
+	std::set<std::string> r;
+	foreach (const std::string& s, r0) {
+		r.insert(s + ":a");
+		r.insert(s + ":b");
+		r.insert(s + ":c");
+	}
+
+	check_split(tag, r);
+}
+
+BOOST_FIXTURE_TEST_CASE( underscore_dots, F )
+{
+	const char tag[] = "some:_:data:c.a";
+	std::set<std::string> r0;
+	r0.insert("some:tag:data");
+	r0.insert("some:tog:data");
+	r0.insert("some:other:data");
+	r0.insert("some:a3:data");
+	std::set<std::string> r;
+	foreach (const std::string& s, r0) {
+		r.insert(s + ":a");
+		r.insert(s + ":c");
+	}
+
+	check_split(tag, r);
+}
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/tagset_parse.cpp b/tests/tagset_parse.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4fd78c795220a098c7c78d24102a3731a0c55940
--- /dev/null
+++ b/tests/tagset_parse.cpp
@@ -0,0 +1,155 @@
+// Tests for parsing tagset definitions with Corpus2::TagsetParser::load_ini
+// and for loading tagsets by name.
+#include <boost/test/unit_test.hpp>
+#include <set>
+#include <sstream>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/tagsetparser.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <iostream>
+
+BOOST_AUTO_TEST_SUITE( tagset_parse )
+
+// Parses the tagset definition in s via an in-memory stream; exceptions
+// thrown by the parser propagate to the caller.
+Corpus2::Tagset parse(const char* s)
+{
+	std::stringstream ss;
+	ss << s;
+	return Corpus2::TagsetParser::load_ini(ss);
+}
+
+// Building blocks for test inputs: PRE is an [ATTR] header with no entries,
+// POSA is a [POS] header followed by the single entry POS1.
+#define PRE "[ATTR]\n"
+#define POSA "[POS]\n POS1\n"
+
+BOOST_AUTO_TEST_CASE( empty )
+{
+	BOOST_CHECK_THROW(
+		parse(""), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( minimal )
+{
+	try {
+		parse(PRE POSA);
+	} catch (Corpus2::TagsetParseError& e) {
+		BOOST_FAIL(e.info());
+	}
+}
+
+BOOST_AUTO_TEST_CASE( minimal_nonewline )
+{
+	try {
+		parse(PRE "[POS]\n POS1");
+	} catch (Corpus2::TagsetParseError& e) {
+		BOOST_FAIL(e.info());
+	}
+}
+
+// NOTE(review): in several dupe_* inputs no "\n" precedes POSA, so "[POS]"
+// is appended to the last attribute line; confirm each case throws for the
+// reason its name states. dupe_attr's input also shows no repeated name.
+BOOST_AUTO_TEST_CASE( dupe_val )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a a " POSA), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_val2 )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a b\nB c d\n C x a" POSA), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_sym )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a b\nB c d\n a x z" POSA), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_sym2 )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a b\nB c d" POSA "A B"), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_attr )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a b\nB c d\n C x z" POSA), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_pos )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a b\n" POSA "P1 A\n P1\n"), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( bad_pos_attr )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "A a b\n" POSA "P1 A\n P2 ZZ\n"), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( bad_value_name )
+{
+	BOOST_CHECK_THROW(
+		parse(PRE "@@ a b\n" POSA "P1 A\n P2"), Corpus2::TagsetParseError
+	);
+}
+
+BOOST_AUTO_TEST_CASE( size1 )
+{
+	Corpus2::Tagset t = parse(PRE POSA);
+	BOOST_CHECK_EQUAL(t.size(), 1);
+	BOOST_CHECK_EQUAL(t.size_extra(), 1);
+}
+
+BOOST_AUTO_TEST_CASE( size2 )
+{
+	Corpus2::Tagset t = parse(PRE POSA " POS2\n");
+	BOOST_CHECK_EQUAL(t.size(), 2);
+	BOOST_CHECK_EQUAL(t.size_extra(), 2);
+}
+
+BOOST_AUTO_TEST_CASE( size3 )
+{
+	Corpus2::Tagset t = parse(PRE "A a b\n" POSA "POS2 A");
+	BOOST_CHECK_EQUAL(t.size(), 3);
+	BOOST_CHECK_EQUAL(t.size_extra(), 6);
+}
+
+BOOST_AUTO_TEST_CASE( size6 )
+{
+	Corpus2::Tagset t = parse(PRE "A a b \n B c d\n" POSA "POS2 A\n POS3 [A]\n");
+	BOOST_CHECK_EQUAL(t.size(), 6);
+	BOOST_CHECK_EQUAL(t.size_extra(), 27);
+}
+
+// The inner try/catch prints the error's info() before rethrowing it to the check.
+BOOST_AUTO_TEST_CASE( load_named )
+{
+	BOOST_CHECK_NO_THROW(
+		try {
+			Corpus2::get_named_tagset("test");
+		}catch(Corpus2::Error& e) {
+			std::cerr << e.info();
+			throw;
+		}
+	);
+	BOOST_CHECK_THROW(
+		Corpus2::get_named_tagset("__nonexistant_9867s8t"),
+		Corpus2::FileNotFound);
+}
+
+BOOST_AUTO_TEST_SUITE_END()