/*
    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
    Part of the libcorpus2 project

    This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

    This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.

    See the LICENSE.CORPUS2, LICENSE.POLIQARP, COPYING.LESSER and COPYING files for more details.
*/

#include "conllwriter.h"
#include <libpwrutils/foreach.h>
#include <boost/algorithm/string.hpp>
#include <algorithm>
#include <cctype>
#include <string>
#include <vector>

namespace {

/**
 * Lower-case a single character.
 *
 * The value is routed through unsigned char because passing a negative
 * plain char to tolower is undefined behaviour.
 */
char to_lower_char(char c)
{
	return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
}

} /* end anonymous ns */

namespace Corpus2 {

/// Registers this writer under the name "conll" during static initialisation.
bool ConllWriter::registered = TokenWriter::register_writer<ConllWriter>("conll");

/// Name of the tagset attribute this writer requires for every grammatical class.
const std::string ConllWriter::SUPERPOS_ATTR("superpos");

/**
 * Create a writer and verify that the tagset can be used with it.
 *
 * The tagset must define the 'superpos' attribute, and for every
 * grammatical class that attribute must be marked as required and be the
 * first attribute of the class.
 *
 * @param os      output stream, passed on to TokenWriter
 * @param tagset  tagset used to interpret the tags; stored in myTagset
 * @param params  writer parameters, passed on to TokenWriter
 * @throws Corpus2Error when the tagset violates any of the conditions above
 */
ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
		const string_range_vector& params)
	: TokenWriter(os, tagset, params)
{
	myTagset = tagset;

	// check if the tagset contains 'superpos' attribute
	const idx_t superpos_attr = myTagset.get_attribute_index(SUPERPOS_ATTR);
	if (superpos_attr == -1) {
		throw Corpus2Error("Tagset " + myTagset.name()
				+ " contains no 'superpos' attribute"
				" (required by CONLL format)");
	}
	// The range comparison below is done in size_t so that a large
	// container size is never truncated to the (possibly narrow) idx_t.
	const size_t superpos_slot = static_cast<size_t>(superpos_attr);

	// ensure that the 'superpos' attribute is obligatory and first
	// for each of the gram. classes defined
	for (idx_t pos = 0; pos < myTagset.pos_count(); ++pos) {
		// bound by reference: no copy if the tagset hands out a reference
		const std::vector<bool>& req_attrs =
				myTagset.get_pos_required_attributes(pos);

		// superpos_attr is the index of 'superpos' attr
		// this index should be within range of required attributes for pos
		if (superpos_slot >= req_attrs.size()) {
			throw Corpus2Error("Tagset " + myTagset.name()
					+ " should define 'superpos' attribute for each"
					" grammatical class (req. by CONLL writer)");
		}
		// the attribute should be marked as required
		if (!req_attrs[superpos_slot]) {
			throw Corpus2Error("Tagset " + myTagset.name()
					+ " should define 'superpos' attribute"
					" as REQUIRED for each class"
					" (req. by CONLL writer)");
		}
		// ensure that no attribute comes before superpos; an empty
		// attribute list is rejected instead of being indexed
		if (myTagset.get_pos_attributes(pos).empty()
				|| myTagset.get_pos_attributes(pos)[0] != superpos_attr) {
			throw Corpus2Error("Tagset " + myTagset.name()
					+ " should define 'superpos' attribute"
					" as the FIRST one for each class"
					" (req. by CONLL writer)");
		}
	}
}

/// Calls finish() before the writer is destroyed.
ConllWriter::~ConllWriter()
{
	finish();
}

/**
 * Write the tab-separated columns describing one token (no leading index,
 * no trailing newline).
 *
 * Columns, in order: orth, lemma of the preferred lexeme, the second
 * segment of the lower-cased tag string (the 'superpos' value), the first
 * segment (the grammatical class), the remaining segments joined with '|'
 * (or '_' when there are none), and four '_' placeholders.
 *
 * @param t  token to write
 * @throws Corpus2Error when the tag string has fewer than two
 *         colon-separated segments
 */
void ConllWriter::write_token(const Token &t)
{
	const Lexeme &lex = t.get_preferred_lexeme(myTagset);

	// get lower-case tag representation
	std::string tagstr = myTagset.tag_to_string(lex.tag());
	std::transform(tagstr.begin(), tagstr.end(), tagstr.begin(),
			to_lower_char);

	// ugly, but should work: split the lower tag repr on colons
	std::vector<std::string> segs;
	boost::split(segs, tagstr, boost::is_any_of(":"));

	// The ctor has asserted that after the obligatory gram. class comes
	// the 'superpos' attribute, so two segments are expected; check it
	// anyway, before anything is written, rather than read out of bounds.
	if (segs.size() < 2) {
		throw Corpus2Error("Tag '" + tagstr + "' has fewer than two"
				" segments (CONLL writer needs class and superpos)");
	}

	os() << t.orth_utf8() << "\t" << lex.lemma_utf8() << "\t";
	os() << segs[1] << "\t" << segs[0] << "\t";

	// now write the remaining parts of the split string, '|'-separated,
	// or a single _ when there are none
	if (segs.size() > 2) {
		for (size_t i = 2; i < segs.size(); ++i) {
			if (i > 2) {
				os() << "|";
			}
			os() << segs[i];
		}
	} else {
		os() << "_";
	}
	// pad the non-existent trailing columns with _
	os() << "\t_\t_\t_\t_";
}

/**
 * Write a sentence: one line per token, each prefixed with its 1-based
 * position and a tab, followed by an empty line.
 *
 * @param s  sentence to write
 */
void ConllWriter::write_sentence(const Sentence& s)
{
	int i = 1;
	foreach (const Token* t, s.tokens()) {
		os() << i << "\t";
		write_token(*t);
		os() << "\n";
		i++;
	}
	os() << "\n";
}

/**
 * Write every sentence of a chunk, in order, using write_sentence().
 *
 * @param c  chunk to write
 */
void ConllWriter::write_chunk(const Chunk &c)
{
	foreach (const Sentence::ConstPtr& s, c.sentences()) {
		write_sentence(*s);
	}
}

/// No header is emitted by this writer.
void ConllWriter::do_header()
{
}

/// No footer is emitted by this writer.
void ConllWriter::do_footer()
{
}

} /* end ns Corpus2 */