From 51e69c41b08874310a04dd37e2ca1157e5229ddf Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Wed, 22 Sep 2010 11:06:21 +0200
Subject: [PATCH] move some tests from maca that belong in libcorpus2

---
 tests/CMakeLists.txt   |   7 +-
 tests/basic.cpp        |  42 +++++++++++
 tests/tag_split.cpp    | 161 +++++++++++++++++++++++++++++++++++++++++
 tests/tagset_parse.cpp | 143 ++++++++++++++++++++++++++++++++++++
 4 files changed, 350 insertions(+), 3 deletions(-)
 create mode 100644 tests/basic.cpp
 create mode 100644 tests/tag_split.cpp
 create mode 100644 tests/tagset_parse.cpp

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 1f22c93..fa3aa8b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,6 +6,9 @@ add_definitions(-DLIBCORPUS2_TEST_DATA_DIR="${PROJECT_SOURCE_DIR}/")
 
 add_executable( tests
 	main.cpp
+	basic.cpp
+	tag_split.cpp
+	tagset_parse.cpp
 )
 
 target_link_libraries ( tests maca ${Boost_LIBRARIES})
@@ -13,6 +16,4 @@ target_link_libraries ( tests maca ${Boost_LIBRARIES})
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})
 
-#configure_file(test-sanity.sh ${CMAKE_CURRENT_BINARY_DIR})
-#add_custom_target(test tests COMMAND ./test-sanity.sh small)
-#add_custom_target(test-large ./test-sanity.sh large)
+add_custom_target(test tests) # "test" target whose command is the tests executable declared above
diff --git a/tests/basic.cpp b/tests/basic.cpp
new file mode 100644
index 0000000..d965132
--- /dev/null
+++ b/tests/basic.cpp
@@ -0,0 +1,42 @@
+#include <boost/test/unit_test.hpp>
+
+#include <libcorpus2/token.h>
+
+const char tagsetstr1[] = "[ATTR]\n" // sample tagset text; only referenced by the commented-out Tagset line in token_dup_lexemes
+	"A tag tog other a3 \n"
+	"B data thing tag-thing thang\n"
+	"C a b c \n"
+	"[POS]\n some A B [C]\n";
+
+BOOST_AUTO_TEST_CASE( token )
+{ // a token built from orth + whitespace reports both back and starts with no lexemes
+	Corpus2::Token tok(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces);
+	BOOST_CHECK_EQUAL(tok.orth_utf8(), "ZZ");
+	BOOST_CHECK_EQUAL(tok.wa(), PwrNlp::Whitespace::ManySpaces);
+	BOOST_CHECK(tok.lexemes().empty());
+}
+
+BOOST_AUTO_TEST_CASE( token_dup_lexemes )
+{ // duplicates are reported and removed only once the same lexeme has been added twice
+	Corpus2::Token tok(UnicodeString::fromUTF8("ZZ"), PwrNlp::Whitespace::ManySpaces);
+	//Corpus2::Tagset tagset(tagsetstr1);
+	Corpus2::Tag shared_tag(Corpus2::tagset_idx_t(0), Corpus2::pos_idx_t(0));
+	Corpus2::Lexeme lex_a(UnicodeString::fromUTF8("aaa"), shared_tag);
+	Corpus2::Lexeme lex_b(UnicodeString::fromUTF8("bbb"), shared_tag);
+	BOOST_CHECK(!tok.check_duplicate_lexemes());
+	BOOST_CHECK(!tok.remove_duplicate_lexemes());
+	tok.add_lexeme(lex_a);
+	BOOST_CHECK(!tok.check_duplicate_lexemes());
+	BOOST_CHECK(!tok.remove_duplicate_lexemes());
+	tok.add_lexeme(lex_b);
+	BOOST_CHECK(!tok.check_duplicate_lexemes());
+	BOOST_CHECK(!tok.remove_duplicate_lexemes());
+	Corpus2::Token before_dup(tok);
+	tok.add_lexeme(lex_a);
+	BOOST_CHECK(tok != before_dup);
+	BOOST_CHECK(tok.check_duplicate_lexemes());
+	BOOST_CHECK(tok.remove_duplicate_lexemes());
+	BOOST_CHECK(!tok.check_duplicate_lexemes());
+	BOOST_CHECK(!tok.remove_duplicate_lexemes());
+	BOOST_CHECK(tok == before_dup);
+}
diff --git a/tests/tag_split.cpp b/tests/tag_split.cpp
new file mode 100644
index 0000000..c0d93c3
--- /dev/null
+++ b/tests/tag_split.cpp
@@ -0,0 +1,161 @@
+#include <boost/test/unit_test.hpp>
+#include <set>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/tagset.h>
+#include <libcorpus2/token.h>
+
+BOOST_AUTO_TEST_SUITE( tag_split )
+
+struct F { // fixture: a small in-memory tagset plus a split-and-compare helper
+	F() {
+		const char tagset_string[] = "[ATTR]\n"
+			"A tag tog other a3 \n"
+			"B data thing tag-thing thang\n"
+			"C a b c \n"
+			"[POS]\n some A B [C]\n";
+		tagset.reset(new Corpus2::Tagset(tagset_string));
+	}
+	boost::shared_ptr<Corpus2::Tagset> tagset;
+	// Parses s into a token's lexemes, checks their tag strings equal expect, returns the tags.
+	std::vector<Corpus2::Tag> check_split(const std::string& s, const std::set<std::string>& expect)
+	{
+		std::set<std::string> actual;
+		std::vector<Corpus2::Tag> tags;
+		Corpus2::Token t;
+		tagset->lexemes_into_token(t, UnicodeString(), s);
+		foreach (const Corpus2::Lexeme& lex, t.lexemes()) {
+			const Corpus2::Tag& tag = lex.tag();
+			BOOST_WARN(tagset->validate_tag(tag, false)); // WARN level: logged, does not fail the test
+			actual.insert(tagset->tag_to_string(tag));
+			tags.push_back(tag);
+		}
+		BOOST_CHECK_EQUAL_COLLECTIONS(actual.begin(), actual.end(), expect.begin(), expect.end());
+		return tags;
+	}
+};
+
+
+BOOST_FIXTURE_TEST_CASE( plain, F )
+{ // a tag without '+', '.' or '_' is expected to come back as exactly itself
+	const std::string input("some:tag:data");
+	std::set<std::string> expected;
+	expected.insert(input);
+	check_split(input, expected);
+}
+
+BOOST_FIXTURE_TEST_CASE( plus, F )
+{
+	// '+' joins two complete tags; each side is expected as a separate result.
+	const char input[] = "some:tag:data+some:other:tag-thing";
+	const char* const parts[] = { "some:tag:data", "some:other:tag-thing" };
+	const std::set<std::string> expected(parts, parts + 2);
+	check_split(input, expected);
+}
+
+BOOST_FIXTURE_TEST_CASE( dot, F )
+{
+	// '.' lists alternative values in one slot; one tag per alternative is expected.
+	const char input[] = "some:tag.tog:data";
+	const char* const variants[] = { "some:tag:data", "some:tog:data" };
+	const std::set<std::string> expected(variants, variants + 2);
+	check_split(input, expected);
+}
+
+BOOST_FIXTURE_TEST_CASE( dots, F )
+{
+	// Dotted alternatives in two slots are expected to expand to the full
+	// cross product: {tag, tog} x {a, b, c}.
+	const char input[] = "some:tag.tog:data:a.b.c";
+	const char* const combos[] = {
+		"some:tag:data:a", "some:tog:data:a",
+		"some:tag:data:b", "some:tog:data:b",
+		"some:tag:data:c", "some:tog:data:c",
+	};
+	check_split(input, std::set<std::string>(combos, combos + 6));
+}
+
+BOOST_FIXTURE_TEST_CASE( dots_plus, F )
+{
+	// '+' separates two tags and the '.' alternatives are expanded on each
+	// side independently: 2 x 2 results from the left part, 2 from the right.
+	const char input[] = "some:tag.tog:data:a.b+some:other:thing.thang";
+	const char* const combos[] = {
+		"some:tag:data:a", "some:tog:data:a",
+		"some:tag:data:b", "some:tog:data:b",
+		"some:other:thing", "some:other:thang",
+	};
+	check_split(input, std::set<std::string>(combos, combos + 6));
+}
+
+BOOST_FIXTURE_TEST_CASE( missing, F )
+{ // no A value is given in the input; the expected string form keeps an empty slot for it
+	std::set<std::string> expected;
+	expected.insert("some::data");
+	const std::string input("some:data");
+	check_split(input, expected);
+}
+
+BOOST_FIXTURE_TEST_CASE( bad_value, F )
+{
+	// "bad" is not a value of any attribute in the fixture tagset, so the
+	// test requires the split to throw TagParseError.
+	const std::set<std::string> nothing_expected;
+	const char input[] = "some:bad:data";
+	BOOST_CHECK_THROW(check_split(input, nothing_expected), Corpus2::TagParseError);
+}
+
+BOOST_FIXTURE_TEST_CASE( bad_pos, F )
+{
+	// "something" is not a POS of the fixture tagset (its only POS is "some"),
+	// so the test requires the split to throw TagParseError.
+	const std::set<std::string> nothing_expected;
+	const char input[] = "something:data";
+	BOOST_CHECK_THROW(check_split(input, nothing_expected), Corpus2::TagParseError);
+}
+
+BOOST_FIXTURE_TEST_CASE( underscore, F )
+{
+	// '_' in the A slot is expected to expand to every value of attribute A.
+	const char input[] = "some:_:data";
+	const char* const expansions[] = {
+		"some:tag:data", "some:tog:data",
+		"some:other:data", "some:a3:data",
+	};
+	check_split(input, std::set<std::string>(expansions, expansions + 4));
+}
+
+BOOST_FIXTURE_TEST_CASE( underscores, F )
+{
+	// '_' in both the A and the C slot: every A expansion is expected to be
+	// paired with each value of C (a, b, c).
+	const char input[] = "some:_:data:_";
+	const char* const a_expansions[] = {
+		"some:tag:data", "some:tog:data", "some:other:data", "some:a3:data"
+	};
+	const char* const c_suffixes[] = { ":a", ":b", ":c" };
+	std::set<std::string> expected;
+	for (int i = 0; i < 4; ++i) {
+		for (int j = 0; j < 3; ++j) {
+			expected.insert(std::string(a_expansions[i]) + c_suffixes[j]);
+		}
+	}
+	check_split(input, expected);
+}
+
+BOOST_FIXTURE_TEST_CASE( underscore_dots, F )
+{
+	// '_' in the A slot combined with the dotted alternatives c and a in the C slot.
+	const char input[] = "some:_:data:c.a";
+	const char* const a_expansions[] = {
+		"some:tag:data", "some:tog:data", "some:other:data", "some:a3:data"
+	};
+	const char* const c_suffixes[] = { ":a", ":c" };
+	std::set<std::string> expected;
+	for (int i = 0; i < 4; ++i) {
+		for (int j = 0; j < 2; ++j) {
+			expected.insert(std::string(a_expansions[i]) + c_suffixes[j]);
+		}
+	}
+	check_split(input, expected);
+}
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/tagset_parse.cpp b/tests/tagset_parse.cpp
new file mode 100644
index 0000000..4fd78c7
--- /dev/null
+++ b/tests/tagset_parse.cpp
@@ -0,0 +1,143 @@
+#include <boost/test/unit_test.hpp>
+#include <set>
+#include <libpwrutils/foreach.h>
+#include <libcorpus2/tagsetparser.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <iostream>
+
+BOOST_AUTO_TEST_SUITE( tagset_parse ) // no trailing ';': the macro is not a statement
+
+// Parses the tagset definition text s via Corpus2::TagsetParser::load_ini.
+Corpus2::Tagset parse(const char* s)
+{
+	std::stringstream ini_text(s);
+	return Corpus2::TagsetParser::load_ini(ini_text);
+}
+
+#define PRE "[ATTR]\n" // header line that opens the attribute section of a test tagset
+#define POSA "[POS]\n POS1\n" // POS section header followed by a single entry, POS1, with no attributes
+
+BOOST_AUTO_TEST_CASE( empty )
+{
+	// An empty definition contains neither an [ATTR] nor a [POS] section;
+	// the test requires the parser to reject it.
+	BOOST_CHECK_THROW(parse(""), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( minimal )
+{ // an empty [ATTR] section plus one POS must parse without a TagsetParseError
+	try {
+		parse(PRE POSA);
+	} catch (Corpus2::TagsetParseError& e) {
+		BOOST_FAIL(e.info()); // fail the test, reporting the error's info() text
+	}
+}
+BOOST_AUTO_TEST_CASE( minimal_nonewline )
+{ // same input as the minimal case, but without a newline after the last POS entry
+	try {
+		parse(PRE "[POS]\n POS1");
+	} catch (Corpus2::TagsetParseError& e) {
+		BOOST_FAIL(e.info()); // fail the test, reporting the error's info() text
+	}
+}
+
+BOOST_AUTO_TEST_CASE( dupe_val )
+{
+	// Value "a" is listed twice for attribute A. The attribute line ends with
+	// "\n" so [POS] starts its own line and the duplicate is what gets tested.
+	BOOST_CHECK_THROW(parse(PRE "A a a \n" POSA), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_val2 )
+{
+	// Value "a" appears under both attribute A and attribute C. The last line
+	// ends with "\n" so [POS] starts its own line and only the duplicate is at fault.
+	BOOST_CHECK_THROW(parse(PRE "A a b\nB c d\n C x a\n" POSA), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_sym )
+{
+	// The third line names an attribute "a", the same symbol as value "a" of A.
+	// It ends with "\n" so [POS] starts its own line and only the clash is at fault.
+	BOOST_CHECK_THROW(parse(PRE "A a b\nB c d\n a x z\n" POSA), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_sym2 )
+{
+	// A POS entry named "A" reuses the name of attribute A. The attribute text
+	// ends with "\n" so [POS] starts its own line and only the clash is at fault.
+	BOOST_CHECK_THROW(parse(PRE "A a b\nB c d\n" POSA "A B"), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_attr )
+{
+	// NOTE(review): A, B and C are all distinct here and no "\n" precedes [POS];
+	// confirm which duplicate this is meant to exercise (a second "A" line?).
+	BOOST_CHECK_THROW(parse(PRE "A a b\nB c d\n C x z" POSA), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( dupe_pos )
+{
+	// POS name P1 is declared twice in the [POS] section;
+	// the test requires the parser to reject that.
+	BOOST_CHECK_THROW(parse(PRE "A a b\n" POSA "P1 A\n P1\n"), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( bad_pos_attr )
+{
+	// POS entry P2 refers to ZZ, which is not declared in the [ATTR] section;
+	// the test requires the parser to reject that.
+	BOOST_CHECK_THROW(parse(PRE "A a b\n" POSA "P1 A\n P2 ZZ\n"), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( bad_value_name )
+{
+	// "@@" is used as a name on the [ATTR] line. NOTE(review): P1 also refers to
+	// A, which is not declared here, so either problem could cause the throw.
+	BOOST_CHECK_THROW(parse(PRE "@@ a b\n" POSA "P1 A\n P2"), Corpus2::TagsetParseError);
+}
+
+BOOST_AUTO_TEST_CASE( size1 )
+{ // a single POS with no attributes: size() and size_extra() are both expected to be 1
+	Corpus2::Tagset parsed(parse(PRE POSA));
+	BOOST_CHECK_EQUAL(parsed.size(), 1);
+	BOOST_CHECK_EQUAL(parsed.size_extra(), 1);
+}
+
+BOOST_AUTO_TEST_CASE( size2 )
+{ // two POS entries with no attributes: size() and size_extra() are both expected to be 2
+	Corpus2::Tagset parsed(parse(PRE POSA " POS2\n"));
+	BOOST_CHECK_EQUAL(parsed.size(), 2);
+	BOOST_CHECK_EQUAL(parsed.size_extra(), 2);
+}
+
+BOOST_AUTO_TEST_CASE( size3 )
+{ // POS1 plus POS2 carrying the two-valued attribute A: expects size() 3, size_extra() 6
+	Corpus2::Tagset parsed(parse(PRE "A a b\n" POSA "POS2 A"));
+	BOOST_CHECK_EQUAL(parsed.size(), 3);
+	BOOST_CHECK_EQUAL(parsed.size_extra(), 6);
+}
+
+BOOST_AUTO_TEST_CASE( size6 )
+{ // POS1, POS2 with A, POS3 with bracketed [A]; attributes A and B: expects size() 6, size_extra() 27
+	Corpus2::Tagset parsed(parse(PRE "A a b \n B c d\n" POSA "POS2 A\n POS3 [A]\n"));
+	BOOST_CHECK_EQUAL(parsed.size(), 6);
+	BOOST_CHECK_EQUAL(parsed.size_extra(), 27);
+}
+
+BOOST_AUTO_TEST_CASE( load_named )
+{ // loading the tagset named "test" must not throw; an unknown name must throw FileNotFound
+	BOOST_CHECK_NO_THROW(
+		try {
+			Corpus2::get_named_tagset("test");
+		}catch(Corpus2::Error& e) {
+			std::cerr << e.info(); // print the error's info() before rethrowing to the check
+			throw;
+		}
+	);
+	BOOST_CHECK_THROW(
+		Corpus2::get_named_tagset("__nonexistant_9867s8t"),
+		Corpus2::FileNotFound);
+}
+
+BOOST_AUTO_TEST_SUITE_END() // no trailing ';': the macro is not a statement
-- 
GitLab