diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index d5ccdc18c16c25e2b67dfb00daa582e70c72f0be..ad7900137f2e7ea296900b654c6ab5bdf12300e0 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -53,6 +53,7 @@ SET(libcorpus2_STAT_SRC
 	tokenmetadata.cpp
 	io/cclreader.cpp
 	io/cclwriter.cpp
+	io/conllwriter.cpp
 	io/helpers.cpp
 	io/fastxces.cpp
 	io/iob-chan.cpp
diff --git a/libcorpus2/io/conllwriter.cpp b/libcorpus2/io/conllwriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5df551c5cc0a6aef6ab716ce271ac28711f42c8
--- /dev/null
+++ b/libcorpus2/io/conllwriter.cpp
@@ -0,0 +1,100 @@
+#include "conllwriter.h"
+#include <libpwrutils/foreach.h>
+#include <boost/algorithm/string.hpp>
+
+#include <string>
+#include <vector>
+
+namespace Corpus2 {
+
+// Registers ConllWriter with TokenWriter under the format name "conll".
+bool ConllWriter::registered = TokenWriter::register_writer<ConllWriter>("conll");
+
+/**
+ * Passes the stream, tagset and params on to TokenWriter and keeps its own
+ * copy of the tagset, which write_token() uses to turn tags into strings.
+ */
+ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
+		const string_range_vector& params)
+	: TokenWriter(os, tagset, params), myTagset(tagset)
+{
+}
+
+/// Calls finish() when the writer is destroyed.
+ConllWriter::~ConllWriter()
+{
+	finish();
+}
+
+/**
+ * Writes the columns of one row that follow the token index: orth, lemma,
+ * the first tag segment twice (the CPOSTAG and POSTAG columns of CoNLL-X),
+ * the remaining tag segments joined with '|' (or "_" when there are none)
+ * and four "_" placeholders. Neither the leading index nor the trailing
+ * newline is written here; write_sentence() adds both.
+ */
+void ConllWriter::write_token(const Token &t)
+{
+	os() << t.orth_utf8() << "\t";
+	// Bind by const reference so the preferred lexeme is not copied.
+	const Lexeme& lex = t.get_preferred_lexeme(myTagset);
+	os() << lex.lemma_utf8() << "\t";
+
+	// Split the tag string on ':'. boost::split always yields at least one
+	// segment, so parts[0] is valid; the remaining segments become features.
+	const std::string tag = myTagset.tag_to_string(lex.tag());
+	std::vector<std::string> parts;
+	boost::split(parts, tag, boost::is_any_of(":"));
+	os() << parts[0] << "\t" << parts[0] << "\t";
+
+	if (parts.size() > 1) {
+		for (size_t i = 1; i < parts.size(); ++i) {
+			if (i > 1) {
+				os() << "|";
+			}
+			os() << parts[i];
+		}
+	} else {
+		os() << "_";
+	}
+	os() << "\t_\t_\t_\t_";
+}
+
+/**
+ * Writes one row per token, numbered from 1 within the sentence, and ends
+ * a non-empty sentence with a blank line, the CoNLL sentence separator.
+ */
+void ConllWriter::write_sentence(const Sentence& s)
+{
+	int index = 1;
+	foreach (const Token* t, s.tokens()) {
+		os() << index << "\t";
+		write_token(*t);
+		os() << "\n";
+		++index;
+	}
+	// Without the separator consecutive sentences run together in the output.
+	if (index > 1) {
+		os() << "\n";
+	}
+}
+
+/// Writes every sentence of the chunk in order.
+void ConllWriter::write_chunk(const Chunk &c)
+{
+	foreach (const Sentence::ConstPtr& s, c.sentences()) {
+		write_sentence(*s);
+	}
+}
+
+/// Emits no header.
+void ConllWriter::do_header()
+{
+}
+
+/// Emits no footer.
+void ConllWriter::do_footer()
+{
+}
+
+} /* end ns Corpus2 */
diff
--git a/libcorpus2/io/conllwriter.h b/libcorpus2/io/conllwriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7a7d7b3c490ac79c80d28a4046019f1e9145f0c
--- /dev/null
+++ b/libcorpus2/io/conllwriter.h
@@ -0,0 +1,53 @@
+#ifndef CONLLWRITER_H
+#define CONLLWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+/**
+ * TokenWriter that emits tokens as tab-separated rows, one token per line.
+ * The implementation registers it under the format name "conll".
+ */
+class ConllWriter : public TokenWriter
+{
+public:
+	/**
+	 * @param os      output stream, passed on to TokenWriter
+	 * @param tagset  tagset used to render tags as strings; a copy is kept
+	 * @param params  writer parameters, passed on to TokenWriter unchanged
+	 */
+	ConllWriter(std::ostream& os, const Tagset& tagset,
+			const string_range_vector& params);
+
+	/// Calls finish() when the writer is destroyed.
+	~ConllWriter();
+
+	/// Writes the columns of a single token; no index and no newline.
+	void write_token(const Token &t);
+
+	/// Writes every token of the sentence on its own line, prefixed with
+	/// its 1-based position in the sentence.
+	void write_sentence(const Sentence &s);
+
+	/// Writes all sentences of the chunk in order.
+	void write_chunk(const Chunk &c);
+
+	/// Result of registering this class with TokenWriter.
+	static bool registered;
+
+protected:
+	/// Writes nothing.
+	void do_header();
+
+	/// Writes nothing.
+	void do_footer();
+
+private:
+	/// Own copy of the tagset passed to the constructor.
+	Tagset myTagset;
+};
+
+} /* end ns Corpus2 */
+
+#endif // CONLLWRITER_H