diff --git a/libcorpus2/io/reader.cpp b/libcorpus2/io/reader.cpp index 7dacd101ae706d63955bddaf14b303d8038b7d4f..46a158a4eceaf78eee265785497ca1a4c4811b76 100644 --- a/libcorpus2/io/reader.cpp +++ b/libcorpus2/io/reader.cpp @@ -222,22 +222,6 @@ boost::shared_ptr<Chunk> BufferedChunkReader::get_next_chunk() } } -bool BufferedChunkReader::has_more() -{ - ensure_more(); - return !chunk_buf_.empty(); -} - -bool BufferedSentenceReader::has_more() -{ - if (sentence_buf_ != NULL) { - return true; - } - sentence_buf_ = actual_next_sentence(); - return (sentence_buf_ != NULL); -} - - BufferedSentenceReader::BufferedSentenceReader(const Tagset& tagset) : TokenReader(tagset), chunkify_(true) , sentence_buf_(), token_buf_() diff --git a/libcorpus2/io/reader.h b/libcorpus2/io/reader.h index e4e242fe20306cf24bb4c82006aef1788d5a1a38..417497d637d8600f37d9476730e2323bd10802d4 100644 --- a/libcorpus2/io/reader.h +++ b/libcorpus2/io/reader.h @@ -90,13 +90,6 @@ public: */ virtual boost::shared_ptr<Chunk> get_next_chunk() = 0; - - /** - * Checks if there is anything left to be returned. Non-const because it - * might read ahead and fill the buffer. - */ - virtual bool has_more() = 0; - /** * General option setter. */ @@ -304,8 +297,6 @@ public: boost::shared_ptr<Chunk> get_next_chunk(); - bool has_more(); - void set_option(const std::string& option) { TokenReader::set_option(option); } @@ -341,8 +332,6 @@ public: Sentence::Ptr get_next_sentence(); - bool has_more(); - boost::shared_ptr<Chunk> get_next_chunk(); void set_option(const std::string& option) { diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp index ead40217893a217749ec7e8b62a945c5dfdb36e6..1624404a32ad7cff11e7578433939d86e7cc0bb0 100644 --- a/libcorpus2/io/rft.cpp +++ b/libcorpus2/io/rft.cpp @@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE. 
namespace Corpus2 { bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( - "rft", "mbt,nowarn,colon,alltags,opt"); + "rft", "mbt,nowarn,colon,alltags,opt,latin2"); RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) @@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, opt_ = true; } else if (p == "colon") { colon_ = true; + } else if (p == "latin2") { + encoding_ = p; } } @@ -53,7 +55,20 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, void RftWriter::write_token(const Token& t) { - os() << t.orth_utf8(); + if (encoding_.empty()) { + os() << t.orth_utf8(); + } else { + char buf[256]; + // extract() reports the full converted length and NUL-terminates buf + // only when that length is below the capacity it was given. + int len = t.orth().extract(0, t.orth().length(), buf, sizeof(buf), encoding_.c_str()); + if (len < static_cast<int>(sizeof(buf))) { + os() << buf; + } else { + std::cerr << "Character encoding error in codepage rft output\n"; + os() << "???"; + } + } if (t.lexemes().empty()) { if (warn_on_no_lexemes_) { std::cerr << "No lexemes for token!"; diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h index 394df9720346576e5ddf394a7ccdfaf61029f122..b87b5dd616b9df8ef614965cfdc994ea2f40c193 100644 --- a/libcorpus2/io/rft.h +++ b/libcorpus2/io/rft.h @@ -64,6 +64,9 @@ private: /// Dialect flag: output all lexemes, not just the preferred one bool alltags_; + + /// Dialect option: output codepage name; empty means UTF-8 + std::string encoding_; }; class RftReader : public BufferedSentenceReader diff --git a/swig/boost_shared_ptr.i b/swig/boost_shared_ptr.i index 7803b22154c79cba53773442e3387ca0e466f7da..6f3ec42a0061504e01d2802c46156964fa3928fe 100644 --- a/swig/boost_shared_ptr.i +++ b/swig/boost_shared_ptr.i @@ -14,6 +14,14 @@ namespace boost { shared_ptr(); shared_ptr(T * p); T* operator->(); + T* get(); + +%pythoncode %{ + def __bool__(self): + return self.get() is not None + __nonzero__=__bool__ +%} + private: T * px; int pn; diff --git a/swig/libcorpustokenreader.i b/swig/libcorpustokenreader.i index 
901b259f2f5c1aebaeb838b8e66bd1e15cbd3798..991d0028ef839e649f023d2a6a9fa94ae660c402 100644 --- a/swig/libcorpustokenreader.i +++ b/swig/libcorpustokenreader.i @@ -63,7 +63,6 @@ namespace Corpus2 { /* virtual Token* get_next_token() = 0; */ virtual Sentence::Ptr get_next_sentence() = 0; virtual boost::shared_ptr<Chunk> get_next_chunk() = 0; - virtual bool has_more() = 0; /* --------------------------------------------------------------------- */ virtual void set_option(const std::string& option);