CONLL writer: require an obligatory, leading 'superpos' attribute in the
tagset (checked once in the constructor) and make write_token() reject a
tag that carries no 'superpos' value instead of indexing past the end of
the split tag string. Also declares ConllWriter::SUPERPOS_ATTR.

diff --git a/libcorpus2/io/conllwriter.cpp b/libcorpus2/io/conllwriter.cpp
index dcaacfc85f38ea91ceacbdbfe7cc4c7394f7b3d4..eea52b36246d08a40e9d8e52c5712b91bca66a6a 100644
--- a/libcorpus2/io/conllwriter.cpp
+++ b/libcorpus2/io/conllwriter.cpp
@@ -7,12 +7,54 @@ namespace Corpus2 {
 
 bool ConllWriter::registered =
 		TokenWriter::register_writer<ConllWriter>("conll");
+const std::string ConllWriter::SUPERPOS_ATTR("superpos");
 
 ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
 		const string_range_vector& params)
 	: TokenWriter(os, tagset, params)
 {
 	myTagset=tagset;
+	// check if the tagset contains 'superpos' attribute
+
+	idx_t superpos_attr = myTagset.get_attribute_index(SUPERPOS_ATTR);
+	if (superpos_attr == -1)
+	{
+		throw Corpus2Error("Tagset " + myTagset.name() +
+			" contains no 'superpos' attribute"
+			" (required by CONLL format)");
+	}
+	// ensure that the 'superpos' attribute is obligatory and first
+	// for each of the gram. classes defined
+
+	for (idx_t pos = 0; pos < myTagset.pos_count(); ++pos) {
+		const std::vector<bool>& req_attrs =
+			myTagset.get_pos_required_attributes(pos);
+		// superpos_attr is the index of the 'superpos' attribute;
+		// this index should be within range of required attributes for pos
+		// and the attribute should be marked as required
+		// (compare as size_t so that the size is never narrowed to idx_t)
+		if (req_attrs.size() <= static_cast<size_t>(superpos_attr))
+		{
+			throw Corpus2Error("Tagset " + myTagset.name() +
+				" should define 'superpos' attribute for each"
+				" grammatical class (req. by CONLL writer)");
+		}
+		if (!req_attrs[superpos_attr])
+		{
+			throw Corpus2Error("Tagset " + myTagset.name() +
+				" should define 'superpos' attribute"
+				" as REQUIRED for each class"
+				" (req. by CONLL writer)");
+		}
+		// ensure that no attribute comes before superpos
+		if (myTagset.get_pos_attributes(pos)[0] != superpos_attr)
+		{
+			throw Corpus2Error("Tagset " + myTagset.name() +
+				" should define 'superpos' attribute"
+				" as the FIRST one for each class"
+				" (req. by CONLL writer)");
+		}
+	}
 }
 
 ConllWriter::~ConllWriter()
@@ -22,25 +64,46 @@ ConllWriter::~ConllWriter()
 
 void ConllWriter::write_token(const Token &t)
 {
-	os()<<t.orth_utf8()<<"\t";
-	Lexeme lex = t.get_preferred_lexeme(myTagset);
-	os()<<lex.lemma_utf8()+"\t";
-	std::string tag = myTagset.tag_to_string(lex.tag());
-	std::vector<std::string> strs;
-	std::transform(tag.begin(), tag.end(), tag.begin(), ::tolower);
-	boost::split(strs, tag, boost::is_any_of(":"));
-	os()<<strs[1]<<"\t"<<strs[0]<<"\t";
-	if(strs.size()>2)
+	const Lexeme &lex = t.get_preferred_lexeme(myTagset);
+
+	// get lower-case tag representation
+	std::string tagstr = myTagset.tag_to_string(lex.tag());
+	std::transform(tagstr.begin(), tagstr.end(), tagstr.begin(), ::tolower);
+
+	// split the lower-case tag representation on colons: segs[0] is the
+	// grammatical class, segs[1] the 'superpos' value, the rest (if any)
+	// are the remaining attribute values
+	std::vector<std::string> segs;
+	boost::split(segs, tagstr, boost::is_any_of(":"));
+
+	// the ctor has only checked the tagset definition, not this tag;
+	// refuse a tag with no 'superpos' segment before writing anything
+	// instead of indexing segs[1] out of range
+	if (segs.size() < 2)
+	{
+		throw Corpus2Error("Tag " + tagstr +
+			" has no 'superpos' value"
+			" (required by CONLL format)");
+	}
+
+	// columns: orth, lemma, superpos, gram. class, then the remaining
+	// attribute values joined with | (or _ when there are none) and
+	// four _ placeholders
+	os() << t.orth_utf8() << "\t" << lex.lemma_utf8() << "\t";
+	os() << segs[1] << "\t" << segs[0] << "\t";
+	if(segs.size() > 2)
 	{
 		size_t i;
-		for(i=2;i<strs.size()-1;i++)
+		for(i = 2; i < segs.size() - 1; i++)
 		{
-			os()<<strs[i]<<"|";
+			os() << segs[i] << "|";
 		}
-		os()<<strs[i]<<"\t_\t_\t_\t_";
+		os() << segs[i] << "\t_\t_\t_\t_";
 	}
 	else
-		os()<<"_\t_\t_\t_\t_";
+	{
+		os() << "_\t_\t_\t_\t_";
+	}
 }
 
 void ConllWriter::write_sentence(const Sentence& s)
diff --git a/libcorpus2/io/conllwriter.h b/libcorpus2/io/conllwriter.h
index c7a7d7b3c490ac79c80d28a4046019f1e9145f0c..40b6b12c56eff833dbef4eb6b19fee2662e71141 100644
--- a/libcorpus2/io/conllwriter.h
+++ b/libcorpus2/io/conllwriter.h
@@ -19,6 +19,7 @@ public:
 
 	void write_chunk(const Chunk &c);
 
+	const static std::string SUPERPOS_ATTR;
 	static bool registered;
 
 protected: