Merge branch 'master' of nlp.pwr.wroc.pl:corpus2

7e52f5e3 · Pawel Orlowicz · 89e3a78c · ff0a41e5 · 7e52f5e3 · 7e52f5e3
Commit 7e52f5e3 authored 12 years ago by Pawel Orlowicz
--- a/libcorpus2/io/conllwriter.cpp
+++ b/libcorpus2/io/conllwriter.cpp
@@ -7,12 +7,52 @@
 namespace Corpus2 {

 bool ConllWriter::registered = TokenWriter::register_writer<ConllWriter>("conll");
+const std::string ConllWriter::SUPERPOS_ATTR("superpos");

 ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
 	: TokenWriter(os, tagset, params)
 {
 	myTagset=tagset;
+	// check if the tagset contains 'superpos' attribute
+
+	idx_t superpos_attr = myTagset.get_attribute_index(SUPERPOS_ATTR);
+	if (superpos_attr == -1)
+	{
+		throw Corpus2Error("Tagset " + myTagset.name() +
+						   " contains no 'superpos' attribute"
+						   " (required by CONLL format)");
+	}
+	// ensure that the 'superpos' attribute is obligatory and first
+	// for each of the gram. classes defined
+
+	for (idx_t pos = 0; pos < myTagset.pos_count(); ++pos) {
+		const std::vector<bool> req_attrs = myTagset.get_pos_required_attributes(pos);
+		// superpos_attr is the index of 'superpos' attr
+		// this index should be within range of required attributes for pos
+		// the attribute should be marked as required
+		if ((idx_t)req_attrs.size() <= superpos_attr)
+		{
+			throw Corpus2Error("Tagset " + myTagset.name() +
+							   " should define 'superpos' attribute for each"
+							   " grammatical class (req. by CONLL writer)");
+		}
+		if (!req_attrs[superpos_attr])
+		{
+			throw Corpus2Error("Tagset " + myTagset.name() +
+							   " should define 'superpos' attribute"
+							   " as REQUIRED for each class"
+							   " (req. by CONLL writer)");
+		}
+		// ensure that no attribute comes before superpos
+		if (tagset.get_pos_attributes(pos)[0] != superpos_attr)
+		{
+			throw Corpus2Error("Tagset " + myTagset.name() +
+							   " should define 'superpos' attribute"
+							   " as the FIRST one for each class"
+							   " (req. by CONLL writer)");
+		}
+	}
 }

 ConllWriter::~ConllWriter()
@@ -22,25 +62,36 @@ ConllWriter::~ConllWriter()

 void ConllWriter::write_token(const Token &t)
 {
-	os()<<t.orth_utf8()<<"\t";
-	Lexeme lex = t.get_preferred_lexeme(myTagset);
-	os()<<lex.lemma_utf8()+"\t";
-	std::string tag = myTagset.tag_to_string(lex.tag());
-	std::vector<std::string> strs;
-	std::transform(tag.begin(), tag.end(), tag.begin(), ::tolower);
-	boost::split(strs, tag, boost::is_any_of(":"));
-	os()<<strs[1]<<"\t"<<strs[0]<<"\t";
-	if(strs.size()>2)
+	const Lexeme &lex = t.get_preferred_lexeme(myTagset);
+	os() << t.orth_utf8() << "\t" << lex.lemma_utf8() << "\t";
+
+	// get lower-case tag representation
+	std::string tagstr = myTagset.tag_to_string(lex.tag());
+	std::transform(tagstr.begin(), tagstr.end(), tagstr.begin(), ::tolower);
+
+	// ugly, but should work: split the lower tag repr on colons
+	std::vector<std::string> segs;
+	boost::split(segs, tagstr, boost::is_any_of(":"));
+
+	// now write each part of the split string and pad the non-existent
+	// attributes with _
+	// (the constructor has verified that the obligatory gram. class is
+	// followed by the 'superpos' attribute, so it is safe to assume
+	// there are always at least 2 segments)
+	os() << segs[1] << "\t" << segs[0] << "\t";
+	if(segs.size() > 2)
 	{
 		size_t i;
-		for(i=2;i<strs.size()-1;i++)
+		for(i = 2; i < segs.size() - 1; i++)
 		{
-			os()<<strs[i]<<"|";
+			os() << segs[i] <<"|";
 		}
-		os()<<strs[i]<<"\t_\t_\t_\t_";
+		os() << segs[i] << "\t_\t_\t_\t_";
 	}
 	else
-		os()<<"_\t_\t_\t_\t_";
+	{
+		os()<< "_\t_\t_\t_\t_";
+	}
 }

 void ConllWriter::write_sentence(const Sentence& s)

--- a/libcorpus2/io/conllwriter.h
+++ b/libcorpus2/io/conllwriter.h
@@ -5,6 +5,14 @@

 namespace Corpus2 {

+/**
+ * Writer in the CONLL format (as required by MALT parser). The writer
+ * assumes that the tagset used employs an attribute named 'superpos'
+ * (this naming is obligatory) and the attribute is defined as first
+ * and required for each grammatical class. This attribute is used to
+ * designate a more general POS category for each token (e.g. all verb
+ * classes could be marked as VERB there).
+ */
 class ConllWriter : public TokenWriter
 {
 public:
@@ -19,6 +27,7 @@ public:

 	void write_chunk(const Chunk &c);

+	const static std::string SUPERPOS_ATTR;
 	static bool registered;

 protected:

--- a/utils/tagger-eval.py
+++ b/utils/tagger-eval.py
@@ -135,6 +135,7 @@ class Metric:
 	# as above but metric for POS hits
 	POS_WC = ([Feat.WEAK_POS_HIT], None)
 	POS_SC = ([Feat.STRONG_POS_HIT], None)
+	POS_WC_LOWER = ([Feat.WEAK_POS_HIT, Feat.SEG_NOCHANGE], None) # lower bound for POS WC
 	# separate stats for known and unknown forms
 	KN_WC = ([Feat.WEAK_TAG_HIT, Feat.KNOWN], [Feat.KNOWN])
 	UNK_WC = ([Feat.WEAK_TAG_HIT, Feat.UNKNOWN], [Feat.UNKNOWN])