diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt index 0994d5fcef88f610f68941223339adc84748b180..aa066f935f3e062ffa7310725030968159db976b 100644 --- a/libcorpus2/CMakeLists.txt +++ b/libcorpus2/CMakeLists.txt @@ -54,6 +54,7 @@ SET(libcorpus2_STAT_SRC tagsetmanager.cpp tagsetparser.cpp token.cpp + io/cclreader.cpp io/orthwriter.cpp io/plainwriter.cpp io/premorphwriter.cpp diff --git a/libcorpus2/ann/annotatedsentence.cpp b/libcorpus2/ann/annotatedsentence.cpp index 68b561f92d1af65a3412bb6dbf420cf5c1bca184..327e9bad2be2d574c84d35e077ff1a088a0a861e 100644 --- a/libcorpus2/ann/annotatedsentence.cpp +++ b/libcorpus2/ann/annotatedsentence.cpp @@ -1,6 +1,8 @@ #include <libcorpus2/ann/annotatedsentence.h> #include <libcorpus2/ann/view.h> #include <boost/make_shared.hpp> +#include <sstream> +#include <libpwrutils/plural.h> namespace Corpus2 { @@ -56,7 +58,7 @@ boost::shared_ptr<AnnotationView> create_view( const std::string& ann_name) { const AnnotationChannel& chan = s->get_channel(ann_name); - std::vector<Annotation> ann = chan.make_annotation_vector(); + std::vector<Annotation> ann = chan.make_annotation_vector(AnnotationChannel::O_INCLUSIVE); boost::shared_ptr<AnnotationView> view; view = boost::make_shared<AnnotationView>(s, ann_name); foreach (const Annotation& a, ann) { @@ -76,4 +78,29 @@ boost::shared_ptr<AnnotationView> create_view( return view; } +void AnnotatedSentence::append(Token *t) +{ + Sentence::append(t); + foreach (chan_map_t::value_type& v, channels_) { + v.second.resize(size()); + } +} + +std::string AnnotatedSentence::annotation_info() const +{ + std::stringstream ss; + foreach (const chan_map_t::value_type& v, channels_) { + ss << "Channel " << v.first << ": \t"; + int ann, disj, un; + v.second.do_counts(ann, disj, un); + ss << PwrNlp::enpln(ann, "annotation") << ", "; + ss << disj << " disjoint, "; + int a = size() - un; + double r = (double)a / size(); + ss << "annotations span: " << a << "/" << size() << " tokens (" << r*100 << 
"%)"; + ss << "\n"; + } + return ss.str(); +} + } /* end ns Corpus2 */ diff --git a/libcorpus2/ann/annotatedsentence.h b/libcorpus2/ann/annotatedsentence.h index db2d4eade8c42a946a8db84071a096f057eba027..c69203bfc8a7101061052f1ed486cd4bd3159b77 100644 --- a/libcorpus2/ann/annotatedsentence.h +++ b/libcorpus2/ann/annotatedsentence.h @@ -4,11 +4,10 @@ #include <libcorpus2/sentence.h> #include <libcorpus2/exception.h> #include <libcorpus2/ann/channel.h> +#include <libcorpus2/ann/view.h> namespace Corpus2 { -class AnnotationView; - /** * Exception class for use when a requested annotation channel does not exist */ @@ -44,6 +43,9 @@ public: Sentence::Ptr clone_shared() const; + /// typedef for the channels + typedef std::map<std::string, AnnotationChannel> chan_map_t; + /** * Create an AnnotatedSentence from a Sentence, grabing all the tokens * directly (afterwards the source Sentence has no tokens). @@ -94,10 +96,16 @@ public: return i->second; } -private: - /// typedef for tha channels - typedef std::map<std::string, AnnotationChannel> chan_map_t; + const chan_map_t& all_channels() const { + return channels_; + } + + /// Sentence override, extends annotation objects + void append(Token *t); + std::string annotation_info() const; + +private: /// the actual channels chan_map_t channels_; }; diff --git a/libcorpus2/ann/channel.cpp b/libcorpus2/ann/channel.cpp index cb5fa318467103fa11fb509c4a8c958fe97c865e..eae9000dec58dd8fa89a80875c3554ef7ad2a683 100644 --- a/libcorpus2/ann/channel.cpp +++ b/libcorpus2/ann/channel.cpp @@ -3,6 +3,7 @@ #include <algorithm> #include <boost/bind.hpp> #include <sstream> +#include <set> namespace Corpus2 { @@ -16,6 +17,13 @@ AnnotationChannel::AnnotationChannel(int size) { } +void AnnotationChannel::resize(int size) +{ + segments_.resize(size); + iobs_.resize(size); + heads_.resize(size); +} + void AnnotationChannel::make_iob_from_segments() { int prev_seg = 0; @@ -78,6 +86,13 @@ int AnnotationChannel::get_segment_at(int idx) const } } +void 
AnnotationChannel::set_segment_at(int token_idx, int segment_idx) +{ + if (token_idx >= 0 && token_idx < static_cast<int>(segments_.size())) { + segments_[token_idx] = segment_idx; + } +} + IOB::Enum AnnotationChannel::get_iob_at(int idx) { if (idx >= 0 && idx < static_cast<int>(iobs_.size())) { @@ -110,9 +125,11 @@ void AnnotationChannel::set_head_at(int idx, bool v) } } -std::vector<Annotation> AnnotationChannel::make_annotation_vector() const +std::vector<Annotation> AnnotationChannel::make_annotation_vector( + AnnotationVectorMode mode) const { std::vector<Annotation> rv; + std::vector<int> not_annotated; int smax = 0; for (size_t i = 0; i < segments_.size(); ++i) { int s = segments_[i]; @@ -124,8 +141,15 @@ std::vector<Annotation> AnnotationChannel::make_annotation_vector() const if (heads_[i]) { rv[s - 1].head_index = i; } + } else if (mode & AnnotationChannel::O_INCLUSIVE) { + not_annotated.push_back(i); } } + foreach (int na, not_annotated) { + rv.push_back(Annotation()); + rv.back().indices.push_back(na); + rv.back().head_index = na; + } rv.erase(std::remove_if(rv.begin(), rv.end(), boost::bind(&Annotation::empty, _1)), rv.end()); foreach (Annotation& a, rv) { @@ -146,4 +170,26 @@ std::string AnnotationChannel::dump_iob() const return ss.str(); } +void AnnotationChannel::do_counts(int& annotations, int& disjoint, int& unannotated) const +{ + std::set<int> used_sids; + std::set<int> disjoint_sids; + int last_sid = 0; + annotations = 0; + disjoint = 0; + unannotated = 0; + foreach (int sid, segments_) { + if (sid == 0) { + ++unannotated; + } else if (!used_sids.insert(sid).second) { //was already there + if (last_sid != sid) { + disjoint_sids.insert(sid); + } + } + last_sid = sid; + } + annotations = used_sids.size(); + disjoint = disjoint_sids.size(); +} + } /* end ns Corpus2 */ diff --git a/libcorpus2/ann/channel.h b/libcorpus2/ann/channel.h index cea905e31a57e52a3c59fa438b4d3a398ff05ac6..e6765d36eb12f3d4e3f6aaeda19765fd4f6da4b5 100644 --- 
a/libcorpus2/ann/channel.h +++ b/libcorpus2/ann/channel.h @@ -52,6 +52,12 @@ public: */ explicit AnnotationChannel(int size); + int size() const { + return segments_.size(); + } + + void resize(int size); + /** * Discard IOB annotation information, regenerate it from the segment info. */ @@ -68,18 +74,22 @@ public: */ int renumber_segments(); - /** - * Create a vector of AnnotationSegment objects, each corresponding to - * an annotation, with the annotations possibly being disjoint. - */ - std::vector<Annotation> make_annotation_vector() const; + enum AnnotationVectorMode + { + O_DISJOINT_EXCLUSIVE = 0, + O_CONTINUOUS = 1, + O_INCLUSIVE = 2, + O_CONTINUOUS_INCLUSIVE = 3, + }; /** * Create a vector of AnnotationSegment objects, each corresponding to - * an annotation, forcing the annotations to be continous (disjoint - * annotations are split) + * an annotation, with the annotations possibly being disjoint unless + * O_CONTINUOUS is specified in mode, and omiting unanottated tokens unless + * O_INCLUSIVE is specified. */ - std::vector<Annotation> make_continuous_annotation_vector() const; + std::vector<Annotation> make_annotation_vector( + AnnotationVectorMode mode = O_DISJOINT_EXCLUSIVE) const; /** * The segment-index array accesor @@ -100,6 +110,11 @@ public: */ int get_segment_at(int idx) const; + /** + * Segment index setter, out of range indices are not processed. 
+ */ + void set_segment_at(int token_idx, int segment_idx); + /** * The IOB data vector */ @@ -132,6 +147,8 @@ public: */ std::string dump_iob() const; + void do_counts(int& annotations, int& disjoint, int& unannotated) const; + private: /// segment indices std::vector<int> segments_; diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..23179f9237e892f277d6ec5a057bef8da9357013 --- /dev/null +++ b/libcorpus2/io/cclreader.cpp @@ -0,0 +1,289 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/ + +#include <libcorpus2/io/cclreader.h> +#include <libcorpus2/io/sax.h> +#include <libpwrutils/foreach.h> +#include <libxml++/libxml++.h> +#include <libxml2/libxml/parser.h> +#include <boost/make_shared.hpp> +#include <libcorpus2/ann/annotatedsentence.h> +#include <cstdlib> +#include <fstream> + +namespace Corpus2 { + +class CclReaderImpl : public BasicSaxParser +{ +public: + CclReaderImpl(const Tagset& tagset, + std::deque< boost::shared_ptr<Chunk> >& obuf, + bool disamb_only, bool disamb_sh); + + ~CclReaderImpl(); + +protected: + void on_start_element(const Glib::ustring & name, + const AttributeList& attributes); + void on_end_element(const Glib::ustring & name); + + void finish_sentence(); + + const Tagset& tagset_; + + enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ANN, XS_ORTH, XS_LEX, + XS_LEMMA, XS_TAG, XS_REL }; + state_t state_; + + bool chunkless_; + + bool out_of_chunk_; + + PwrNlp::Whitespace::Enum wa_; + + Glib::ustring sbuf_; + + Token* tok_; + + boost::shared_ptr<AnnotatedSentence> sent_; + + std::string ann_chan_; + + bool ann_head_; + + typedef std::map<std::string, int> token_ann_t; + + token_ann_t token_anns_; + + std::set<std::string> token_ann_heads_; + + boost::shared_ptr<Chunk> chunk_; + + std::deque< boost::shared_ptr<Chunk> >& obuf_; + + bool disamb_only_; + + bool disamb_sh_; +}; + +CclReader::CclReader(const Tagset& tagset, std::istream& is, + bool disamb_only, bool disamb_sh) + : BufferedChunkReader(tagset), + impl_(new CclReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) +{ + this->is_ = &is; +} + +CclReader::CclReader(const Tagset& tagset, const std::string& filename, bool disamb_only, bool disamb_sh) + : BufferedChunkReader(tagset), + impl_(new CclReaderImpl(tagset, chunk_buf_, disamb_only, disamb_sh)) +{ + this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in)); + + if (this->is_owned_->bad()) { + throw Corpus2Error("File not found!"); + } + else { + this->is_ = is_owned_.get(); + } +} + 
+CclReader::~CclReader() +{ +} + +void CclReader::ensure_more() +{ + static const int BUFSIZE=1024; + while (chunk_buf_.empty() && is().good()) { + unsigned char buf[BUFSIZE+1]; + is().read(reinterpret_cast<char*>(buf), BUFSIZE); + impl_->parse_chunk_raw(buf, is().gcount()); + if (is().eof()) { + impl_->finish_chunk_parsing(); + } + } +} + +CclReaderImpl::CclReaderImpl(const Tagset& tagset, + std::deque< boost::shared_ptr<Chunk> >& obuf, + bool disamb_only, bool disamb_sh) + : BasicSaxParser() + , tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false) + , wa_(PwrNlp::Whitespace::Newline) + , sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf) + , disamb_only_(disamb_only), disamb_sh_(disamb_sh) +{ +} + +CclReaderImpl::~CclReaderImpl() +{ + delete tok_; +} + +void CclReaderImpl::on_start_element(const Glib::ustring &name, + const AttributeList& attributes) +{ + if (name == "chunk") { + std::string type; + foreach (const Attribute& a, attributes) { + if (a.name == "type") { + type = a.value; + } + } + if (type == "s") { + throw XcesError("Trying to parse XCES as CCL (<chunk type=\"s\">)"); + } else if (state_ == XS_NONE) { + chunk_ = boost::make_shared<Chunk>(); + state_ = XS_CHUNK; + foreach (const Attribute& a, attributes) { + chunk_->set_attribute(a.name, a.value); + } + } else if (state_ == XS_CHUNK) { + throw XcesError("Nested <chunk>"); + } else { + throw XcesError("Unexpected <chunk>"); + } + } else if (state_ == XS_CHUNK && name == "sentence") { + state_ = XS_SENTENCE; + sent_ = boost::make_shared<AnnotatedSentence>(); + } else if (state_ == XS_SENTENCE && name == "tok") { + state_ = XS_TOK; + tok_ = new Token(); + tok_->set_wa(wa_); + wa_ = PwrNlp::Whitespace::Space; + token_anns_.clear(); + token_ann_heads_.clear(); + } else if (state_ == XS_TOK && name == "orth") { + state_ = XS_ORTH; + grab_characters_ = true; + clear_buf(); + } else if (state_ == XS_TOK && name == "ann") { + state_ = XS_ANN; + grab_characters_ = true; + clear_buf(); + 
ann_chan_ = ""; + ann_head_ = false; + foreach (const Attribute& a, attributes) { + if (a.name == "chan") { + ann_chan_ = a.value; + } else if (a.name == "head" && a.value == "1") { + ann_head_ = true; + } + } + if (ann_chan_.empty()) { + throw XcesError("<ann> with no channel name"); + } + } else if (state_ == XS_TOK && name == "lex") { + assert(tok_ != NULL); + bool is_disamb = false; + foreach (const Attribute& a, attributes) { + if (a.name == "disamb" && a.value == "1") { + is_disamb = true; + } + } + if (!disamb_only_ || is_disamb) { + tok_->add_lexeme(Lexeme()); + tok_->lexemes().back().set_disamb(is_disamb); + state_ = XS_LEX; + } + } else if (state_ == XS_LEX && name == "base") { + state_ = XS_LEMMA; + grab_characters_ = true; + clear_buf(); + } else if (state_ == XS_LEX && name == "ctag") { + state_ = XS_TAG; + grab_characters_ = true; + clear_buf(); + } else if (name == "ns") { + wa_ = PwrNlp::Whitespace::None; + } else if (name == "tok" && state_ == XS_NONE) { + std::cerr << "Warning: out-of-chunk token, assuming sentence start on line "; + std::cerr << this->context_->input->line << "\n"; + chunkless_ = true; + out_of_chunk_ = true; + chunk_ = boost::make_shared<Chunk>(); + sent_ = boost::make_shared<AnnotatedSentence>(); + state_ = XS_TOK; + tok_ = new Token(); + tok_->set_wa(wa_); + wa_ = PwrNlp::Whitespace::Space; + } +} + +void CclReaderImpl::finish_sentence() +{ + chunk_->append(sent_); + sent_.reset(); + if (chunkless_) { + obuf_.push_back(chunk_); + chunk_.reset(); + state_ = XS_NONE; + chunkless_ = false; + } else { + state_ = XS_CHUNK; + } +} + +void CclReaderImpl::on_end_element(const Glib::ustring &name) +{ + if (state_ == XS_ORTH && name == "orth") { + tok_->set_orth(UnicodeString::fromUTF8(get_buf())); + grab_characters_ = false; + state_ = XS_TOK; + } else if (state_ == XS_ANN && name == "ann") { + std::string buf = get_buf(); + grab_characters_ = false; + int segid = atoi(buf.c_str()); + if (!sent_->has_channel(ann_chan_)) { + 
sent_->create_channel(ann_chan_); + } + if (segid > 0) { + token_anns_.insert(std::make_pair(ann_chan_, segid)); + token_ann_heads_.insert(ann_chan_); + } + state_ = XS_TOK; + } else if (state_ == XS_LEMMA && name == "base") { + tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf())); + grab_characters_ = false; + state_ = XS_LEX; + } else if (state_ == XS_TAG && name == "ctag") { + Tag tag = tagset_.parse_simple_tag(get_buf(), true); + tok_->lexemes().back().set_tag(tag); + grab_characters_ = false; + state_ = XS_LEX; + } else if (state_ == XS_LEX && name == "lex") { + state_ = XS_TOK; + } else if (state_ == XS_TOK && name == "tok") { + sent_->append(tok_); + tok_ = NULL; + state_ = XS_SENTENCE; + foreach (const token_ann_t::value_type& v, token_anns_) { + sent_->get_channel(v.first).set_segment_at(sent_->size() - 1, v.second); + if (token_ann_heads_.find(v.first) != token_ann_heads_.end()) { + sent_->get_channel(v.first).set_head_at(sent_->size() - 1, true); + } + } + } else if (state_ == XS_SENTENCE && name == "sentence") { + finish_sentence(); + } else if (state_ == XS_CHUNK && name == "chunk") { + obuf_.push_back(chunk_); + chunk_.reset(); + state_ = XS_NONE; + } +} + +} /* end ns Corpus2 */ diff --git a/libcorpus2/io/cclreader.h b/libcorpus2/io/cclreader.h new file mode 100644 index 0000000000000000000000000000000000000000..3807ff9d5cd8462fe1a48cf5dd43255b8159e60c --- /dev/null +++ b/libcorpus2/io/cclreader.h @@ -0,0 +1,57 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#ifndef LIBCORPUS2_IO_CCLREADER_H
+#define LIBCORPUS2_IO_CCLREADER_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/xces.h>
+#include <libcorpus2/chunk.h>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+
+namespace Corpus2 {
+
+class CclReaderImpl;
+/// Chunk reader for CCL XML input; the parsing itself is done by CclReaderImpl
+class CclReader : public BufferedChunkReader
+{
+public:
+	CclReader(const Tagset& tagset, std::istream& is,
+			bool disamb_only = false, bool disamb_sh = false);
+	/// Opens filename for reading; the reader owns the resulting stream
+	CclReader(const Tagset& tagset, const std::string& filename,
+			bool disamb_only = false, bool disamb_sh = false);
+
+	~CclReader();
+	/// The stream being read (dereferences is_)
+	std::istream& is() {
+		return *is_;
+	}
+
+protected:
+	void ensure_more();
+
+	/// stream being read: the caller's stream, or is_owned_ when a file was opened
+	std::istream* is_;
+	boost::scoped_ptr<std::istream> is_owned_;
+	/// SAX handler, constructed with chunk_buf_ as its output buffer
+	boost::scoped_ptr<CclReaderImpl> impl_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_CCLREADER_H
diff --git a/libcorpus2/io/xcesreader.h b/libcorpus2/io/xcesreader.h
index f742f036e3125f229dee3415bfa6382c7f51160d..004b62fd8f3fd112e0110d766039fbd89d7b2bbc 100644
--- a/libcorpus2/io/xcesreader.h
+++ b/libcorpus2/io/xcesreader.h
@@ -52,7 +52,6 @@ protected:
 	boost::scoped_ptr<XcesReaderImpl> impl_;
 };
 
-
 } /* end ns Corpus2 */
 
 #endif // LIBCORPUS2_IO_XCESREADER_H
diff --git a/libcorpus2/sentence.h b/libcorpus2/sentence.h
index 2a08e84674c047a897e18b396bb4f63d837a2311..6eb8522ec744579dd18460c569218ff2d3177f6e 100644
--- a/libcorpus2/sentence.h
+++ b/libcorpus2/sentence.h
@@ -69,7 +69,9 @@ public:
 	}
 
 	/// Helper function for appending tokens
-	void append(Token* t) {
+	/// Might be overridden in a child class to make adding a token keep
+	/// extra invariants
+	virtual void append(Token* t) {
 		tokens_.push_back(t);
 	}
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2d7d8bbee77ddf04ab6910c9d18af7d240e93c7a..e2b53b044182c393a80620bb129624158172586b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -9,6 +9,7 @@ add_executable( tests
 	ann_basic.cpp
 	basic.cpp
 	io.cpp
+	ioann.cpp
 	tag_split.cpp
 	tagset_parse.cpp
 )
diff --git a/tests/ioann.cpp b/tests/ioann.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8334bf32effb64027a96cf93eb1604ae23484147
--- /dev/null
+++ b/tests/ioann.cpp
@@ -0,0 +1,133 @@
+/*
+    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
+    Part of the libcorpus2 project
+
+    This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+    This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.
+
+    See the LICENSE and COPYING files for more details.
+*/
+
+#include <boost/test/unit_test.hpp>
+#include <set>
+#include <libpwrutils/foreach.h>
+#include <libpwrutils/bitset.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/io/cclreader.h>
+#include <libcorpus2/io/writer.h>
+#include <libcorpus2/ann/annotatedsentence.h>
+// Fixtures: the expected "xces,flat" writer output, then the annotated CCL input
+namespace {
+static char swiatopoglad[] =
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
+"<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
+"<chunkList>\n"
+"<chunk id=\"ch51\" type=\"tok\">\n"
+"<chunk type=\"s\">\n"
+"<tok>\n"
+"<orth>Uważam</orth>\n"
+"<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
+"</tok>\n"
+"<ns/>\n"
+"<tok>\n"
+"<orth>,</orth>\n"
+"<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>że</orth>\n"
+"<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>światopogląd</orth>\n"
+"<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
+"<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
+"</tok>\n"
+"</chunk>\n"
+"</chunk>\n"
+"</chunkList>\n"
+"</cesAna>\n"
+;
+
+static char swiatopoglad_ann[] =
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<chunkList>\n"
+"<chunk id=\"ch51\" type=\"tok\">\n"
+"<sentence>\n"
+"<tok>\n"
+"<ann chan=\"cute\">1</ann>\n"
+"<ann chan=\"meh\">1</ann>\n"
+"<orth>Uważam</orth>\n"
+"<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
+"</tok>\n"
+"<ns/>\n"
+"<tok>\n"
+"<ann chan=\"cute\">1</ann>\n"
+"<ann chan=\"meh\">2</ann>\n"
+"<orth>,</orth>\n"
+"<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<ann chan=\"meh\" head=\"1\">1</ann>\n"
+"<orth>że</orth>\n"
+"<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>światopogląd</orth>\n"
+"<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
+"<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
+"<ann chan=\"cute\">2</ann>\n"
+"</tok>\n"
+"</sentence>\n"
+"</chunk>\n"
+"</chunkList>\n"
+;
+}
+
+BOOST_AUTO_TEST_SUITE( ioann )
+
+BOOST_AUTO_TEST_CASE( iobase )
+{
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	std::stringstream ssin;
+	ssin << swiatopoglad_ann;
+	Corpus2::CclReader xr(tagset, ssin);
+	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+	std::stringstream ss;
+	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
+	w->write_chunk(*chunk);
+	w->finish();
+	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad);
+	BOOST_REQUIRE(!chunk->sentences().empty());
+	boost::shared_ptr<Corpus2::AnnotatedSentence> as;
+	as = boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(chunk->sentences()[0]);
+	BOOST_REQUIRE(as);
+	// "cute": tokens 0-1 are segment 1, token 2 is unannotated, token 3 is segment 2
+	BOOST_REQUIRE(as->has_channel("cute"));
+	as->get_channel("cute").make_iob_from_segments();
+	BOOST_CHECK_EQUAL(as->get_channel("cute").dump_iob(), "BIOB");
+	Corpus2::Sentence::Ptr cute = Corpus2::create_view(as, "cute");
+	BOOST_REQUIRE_EQUAL(cute->size(), 3);
+	BOOST_CHECK_EQUAL(cute->tokens()[0]->orth_utf8(), "Uważam,");
+	BOOST_CHECK_EQUAL(cute->tokens()[1]->orth_utf8(), "że");
+	BOOST_CHECK_EQUAL(cute->tokens()[2]->orth_utf8(), "światopogląd");
+	// "meh": segment 1 is disjoint (tokens 0 and 2, head on token 2), token 1 is segment 2
+	BOOST_REQUIRE(as->has_channel("meh"));
+	as->get_channel("meh").make_iob_from_segments();
+	BOOST_CHECK_EQUAL(as->get_channel("meh").dump_iob(), "BBBO");
+	Corpus2::Sentence::Ptr meh = Corpus2::create_view(as, "meh");
+	BOOST_REQUIRE_EQUAL(meh->size(), 3);
+	BOOST_CHECK_EQUAL(meh->tokens()[0]->orth_utf8(), ",");
+	BOOST_CHECK_EQUAL(meh->tokens()[1]->orth_utf8(), "Uważam że");
+	BOOST_CHECK_EQUAL(meh->tokens()[2]->orth_utf8(), "światopogląd");
+
+	std::cerr << as->annotation_info();
+}
+
+BOOST_AUTO_TEST_SUITE_END();