From ae9581dd5db68fef1714cf9f3f447cf4406042dc Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Tue, 24 May 2011 11:19:00 +0200 Subject: [PATCH] iob-chan WIP cont. --- libcorpus2/io/iob-chan.cpp | 53 ++++++++++++++++++++++++++++++-------- libcorpus2/io/iob-chan.h | 2 ++ 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/libcorpus2/io/iob-chan.cpp b/libcorpus2/io/iob-chan.cpp index 2962d83..d231b0f 100644 --- a/libcorpus2/io/iob-chan.cpp +++ b/libcorpus2/io/iob-chan.cpp @@ -26,16 +26,18 @@ or FITNESS FOR A PARTICULAR PURPOSE. namespace Corpus2 { bool IobChanWriter::registered = TokenWriter::register_writer<IobChanWriter>( - "iob-chan", "nowarn"); + "iob-chan", "nowarn,noforce"); IobChanWriter::IobChanWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) - : TokenWriter(os, tagset, params), warn_on_no_lexemes_(true) + : TokenWriter(os, tagset, params), warn_on_no_lexemes_(true), force_(true) { foreach (const string_range& param, params) { std::string p = boost::copy_range<std::string>(param); if (p == "nowarn") { warn_on_no_lexemes_ = false; + } else if (p == "noforce") { + force_ = false; } } } @@ -58,6 +60,13 @@ void IobChanWriter::write_token(const Token& t) void IobChanWriter::write_sentence(const Sentence& s) { const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s); + if (force_) { + // I sincerely apologize + AnnotatedSentence* hax = const_cast<AnnotatedSentence*>(ann); + foreach(const AnnotatedSentence::chan_map_t::value_type& v, hax->all_channels()) { + hax->get_channel(v.first).make_iob_from_segments(); + } + } for (size_t idx = 0; idx < s.size(); ++idx) { const Token* t = s.tokens()[idx]; os() << t->orth_utf8(); @@ -68,7 +77,10 @@ void IobChanWriter::write_sentence(const Sentence& s) } else { const Lexeme& pref = t->get_preferred_lexeme(tagset()); os() << "\t"; + os() << pref.lemma_utf8(); + os() << "\t"; write_tag(pref.tag()); + os() << "\t"; } if (ann) { bool first = true; @@ -123,7 +135,7 @@ IobChanReader::IobChanReader(const Tagset& tagset, const std::string& filename) Sentence::Ptr IobChanReader::actual_next_sentence() { std::string line; - AnnotatedSentence::Ptr s; + boost::shared_ptr<AnnotatedSentence> s; typedef boost::split_iterator<std::string::const_iterator> string_split_iterator; while (is().good()) { @@ -137,8 +149,9 @@ Sentence::Ptr IobChanReader::actual_next_sentence() std::cerr << "Invalid line: " << line << "\n"; } else { const std::string& orth = spl[0]; - const std::string& lemma = spl[0]; - const std::string& tag_string = spl[1]; + const std::string& lemma = spl[1]; + const std::string& tag_string = spl[2]; + const std::string& anns = spl[3]; Tag tag = parse_tag(tag_string); Token* t = new Token(); t->set_orth(UnicodeString::fromUTF8(orth)); @@ -148,15 +161,33 @@ Sentence::Ptr IobChanReader::actual_next_sentence() t->lexemes().back().set_disamb(true); } s->append(t); - const std::string& cline = line; - for (string_split_iterator value_it = boost::make_split_iterator( - cline, boost::token_finder(boost::is_any_of(","))); - value_it != string_split_iterator(); - ++value_it) { - + std::vector<std::string> annsplit; + boost::algorithm::split(annsplit, anns, boost::is_any_of(",")); + foreach (const std::string& a, annsplit) { + std::vector<std::string> one_ann_split; + boost::algorithm::split(one_ann_split, a, boost::is_any_of("-")); + if (one_ann_split.size() != 2) { + std::cerr << "Invalid annotation:" << a << "\n"; + } else { + const std::string& aname = one_ann_split[0]; + const std::string& aiob = one_ann_split[1]; + Corpus2::IOB::Enum iob = Corpus2::IOB::from_string(aiob); + if (iob == Corpus2::IOB::PostLast) { + std::cerr << "Invalid IOB tag: " << aiob << "\n"; + } else { + if (!s->has_channel(aname)) { + s->create_channel(aname); + } + s->get_channel(aname).set_iob_at(s->size() - 1, iob); + } + } } } } + foreach (const AnnotatedSentence::chan_map_t::value_type& v, s->all_channels()) { + s->get_channel(v.first).make_segments_from_iob(); + } + return s; } diff --git a/libcorpus2/io/iob-chan.h b/libcorpus2/io/iob-chan.h index ea6bb8a..378e850 100644 --- a/libcorpus2/io/iob-chan.h +++ b/libcorpus2/io/iob-chan.h @@ -51,6 +51,8 @@ public: private: bool warn_on_no_lexemes_; + + bool force_; }; class IobChanReader : public BufferedSentenceReader -- GitLab