Skip to content
Snippets Groups Projects
Commit 9404d1fd authored by Adam Radziszewski's avatar Adam Radziszewski
Browse files

make conllwriter check its tagset for compliance with 'superpos' assumptions

parent d110e799
No related branches found
No related tags found
No related merge requests found
......@@ -7,12 +7,52 @@
namespace Corpus2 {
// Registers ConllWriter with TokenWriter under the format name "conll";
// the value returned by register_writer is stored in this static flag.
bool ConllWriter::registered = TokenWriter::register_writer<ConllWriter>("conll");
// Name of the tagset attribute that the constructor looks up and requires
// to be present in the tagset.
const std::string ConllWriter::SUPERPOS_ATTR("superpos");
/**
 * Construct a CONLL writer and verify that the tagset is usable for it.
 *
 * The arguments are forwarded unchanged to the TokenWriter base class and
 * the tagset is additionally stored in myTagset.
 *
 * The tagset must satisfy the following, otherwise construction fails:
 *  - it defines an attribute named SUPERPOS_ATTR ('superpos'),
 *  - for every grammatical class that attribute is marked as required,
 *  - for every grammatical class that attribute is the first attribute.
 *
 * @param os     output stream passed to TokenWriter
 * @param tagset tagset passed to TokenWriter and checked for compliance
 * @param params writer parameters passed to TokenWriter
 * @throws Corpus2Error when any of the above tagset conditions is violated
 */
ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
		const string_range_vector& params)
	: TokenWriter(os, tagset, params)
{
	myTagset = tagset;

	// check if the tagset contains 'superpos' attribute
	const idx_t superpos_attr = myTagset.get_attribute_index(SUPERPOS_ATTR);
	if (superpos_attr == -1) {
		throw Corpus2Error("Tagset " + myTagset.name() +
				" contains no 'superpos' attribute"
				" (required by CONLL format)");
	}

	// ensure that the 'superpos' attribute is obligatory and first
	// for each of the gram. classes defined
	for (idx_t pos = 0; pos < myTagset.pos_count(); ++pos) {
		// bind by const reference: no need to copy the flags for every class
		const std::vector<bool>& req_attrs =
				myTagset.get_pos_required_attributes(pos);

		// superpos_attr is the index of the 'superpos' attribute;
		// this index must be within the range of the required-attribute
		// flags for this class...
		if (static_cast<idx_t>(req_attrs.size()) <= superpos_attr) {
			throw Corpus2Error("Tagset " + myTagset.name() +
					" should define 'superpos' attribute for each"
					" grammatical class (req. by CONLL writer)");
		}
		// ...and the attribute must be marked as required
		if (!req_attrs[superpos_attr]) {
			throw Corpus2Error("Tagset " + myTagset.name() +
					" should define 'superpos' attribute"
					" as REQUIRED for each class"
					" (req. by CONLL writer)");
		}
		// ensure that no attribute comes before superpos; an empty
		// attribute list is rejected as well instead of being indexed
		if (myTagset.get_pos_attributes(pos).empty() ||
				myTagset.get_pos_attributes(pos)[0] != superpos_attr) {
			throw Corpus2Error("Tagset " + myTagset.name() +
					" should define 'superpos' attribute"
					" as the FIRST one for each class"
					" (req. by CONLL writer)");
		}
	}
}
ConllWriter::~ConllWriter()
......@@ -22,26 +62,37 @@ ConllWriter::~ConllWriter()
void ConllWriter::write_token(const Token &t)
{
os()<<t.orth_utf8()<<"\t";
Lexeme lex = t.get_preferred_lexeme(myTagset);
os()<<lex.lemma_utf8()+"\t";
std::string tag = myTagset.tag_to_string(lex.tag());
std::vector<std::string> strs;
std::transform(tag.begin(), tag.end(), tag.begin(), ::tolower);
boost::split(strs, tag, boost::is_any_of(":"));
os()<<strs[1]<<"\t"<<strs[0]<<"\t";
if(strs.size()>2)
const Lexeme &lex = t.get_preferred_lexeme(myTagset);
os() << t.orth_utf8() << "\t" << lex.lemma_utf8() << "\t";
// get lower-case tag representation
std::string tagstr = myTagset.tag_to_string(lex.tag());
std::transform(tagstr.begin(), tagstr.end(), tagstr.begin(), ::tolower);
// ugly, but should work: split the lower tag repr on colons
std::vector<std::string> segs;
boost::split(segs, tagstr, boost::is_any_of(":"));
// now write each part of the split string and pad the non-existent
// attributes with _
// (ctr has asserted that after the obligatory gram. class comes
// 'superpos' attribute, so it is safe to assume there are always
// at least 2 segments)
os() << segs[1] << "\t" << segs[0] << "\t";
if(segs.size() > 2)
{
size_t i;
for(i=2;i<strs.size()-1;i++)
for(i = 2; i < segs.size() - 1; i++)
{
os()<<strs[i]<<"|";
os() << segs[i] <<"|";
}
os()<<strs[i]<<"\t_\t_\t_\t_";
os() << segs[i] << "\t_\t_\t_\t_";
}
else
{
os()<< "_\t_\t_\t_\t_";
}
}
void ConllWriter::write_sentence(const Sentence& s)
{
......
......@@ -19,6 +19,7 @@ public:
void write_chunk(const Chunk &c);
const static std::string SUPERPOS_ATTR;
static bool registered;
protected:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment