diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 6dcd0017dc514687dda6b2ff82e14b472d3235b0..5095f8329b0178254963b79b49b0323b488e5400 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -56,6 +56,7 @@ SET(libcorpus2_STAT_SRC
 	token.cpp
 	tokenmetadata.cpp
 	io/cclreader.cpp
+	io/fastxces.cpp
 	io/orthwriter.cpp
 	io/plainwriter.cpp
 	io/premorphwriter.cpp
diff --git a/libcorpus2/io/fastxces.cpp b/libcorpus2/io/fastxces.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f809c7b5174f99d25146ef862c897b98bf35f281
--- /dev/null
+++ b/libcorpus2/io/fastxces.cpp
@@ -0,0 +1,218 @@
+#include <libcorpus2/io/fastxces.h>
+#include <boost/make_shared.hpp>
+#include <boost/regex.hpp>
+#include <cstring>
+#include <fstream>
+
+namespace Corpus2 {
+
+/**
+ * Regex-driven XCES parser backing FastXcesReader. Scans the input for a
+ * handful of tag patterns instead of doing a full XML parse and pushes
+ * every finished chunk onto the output buffer supplied at construction.
+ */
+class FastXcesReaderImpl
+{
+public:
+	FastXcesReaderImpl(const Tagset& tagset,
+			std::deque< boost::shared_ptr<Chunk> >& obuf)
+		: tagset_(tagset), obuf_(obuf)
+	{
+		tok_ = new Token();
+		sent_ = boost::make_shared<Sentence>();
+		chunk_ = boost::make_shared<Chunk>();
+	}
+
+	~FastXcesReaderImpl()
+	{
+		delete tok_;
+	}
+
+	/**
+	 * Consume the stream until it stops being readable, feeding every
+	 * recognised element to the builder callbacks below, then flush
+	 * whatever sentence / chunk is still open.
+	 * @throws Corpus2Error when a single element does not fit in the
+	 *         read buffer
+	 */
+	void parse_stream(std::istream& is)
+	{
+		boost::regex parsing_regex = boost::regex(
+			"(<ns\\>)|"
+			"(?:<chunk[^>]*\\<type=[\"']?([a-zA-Z]*)[\"']?[^>]*>)|"
+			"(?:<orth>\\s*(.*?)\\s*</orth>)|"
+			"(?:<lex\\>([^>]*\\<disamb=[\"']?1[\"']?)?[^>]*>\\s*"
+			"<base>\\s*(.*?)\\s*</base>\\s*<ctag>\\s*(.*?)\\s*</ctag>"
+			"\\s*</lex>)|"
+			"(</tok>)");
+		enum {
+			MATCH_NS = 1,
+			MATCH_CHUNK_START = 2,
+			MATCH_ORTH = 3,
+			MATCH_DISAMB = 4,
+			MATCH_BASE = 5,
+			MATCH_CTAG = 6,
+			MATCH_ETOK = 7
+		};
+		// This code is heavily based on example from Boost.Regex
+		// (http://www.boost.org/doc/libs/1_41_0/libs/regex/doc/html/boost_regex/partial_matches.html)
+		char buf[4096];
+		const char* next_pos = buf + sizeof(buf);
+		// Stop on any stream failure, not only EOF, so that a stream in
+		// a bad state cannot spin this loop forever.
+		while (is.good()) {
+			std::streamsize leftover = (buf + sizeof(buf)) - next_pos;
+			std::streamsize size = next_pos - buf;
+			if (size == 0) {
+				// The pending partial match already fills the whole
+				// buffer, so no further input could ever be read.
+				throw Corpus2Error("XCES element too long");
+			}
+			// Source and destination may overlap when more than half of
+			// the buffer is carried over, hence memmove and not memcpy.
+			std::memmove(buf, next_pos, leftover);
+			is.read(buf + leftover, size);
+			std::streamsize read = is.gcount();
+			next_pos = buf + sizeof(buf);
+
+			boost::cregex_iterator i(buf, buf + read + leftover, parsing_regex,
+				boost::match_default | boost::match_partial);
+			boost::cregex_iterator end;
+			for (; i != end; ++i) {
+				if ((*i)[0].matched == false) {
+					// Partial match, save position and break:
+					next_pos = (*i)[0].first;
+					break;
+				}
+				if ((*i)[MATCH_ORTH].matched) {
+					orth(i->str(MATCH_ORTH));
+				} else if ((*i)[MATCH_CTAG].matched) {
+					ctag(i->str(MATCH_BASE),
+						i->str(MATCH_CTAG),
+						(*i)[MATCH_DISAMB].matched);
+				} else if ((*i)[MATCH_ETOK].matched) {
+					token_end();
+				} else if ((*i)[MATCH_NS].matched) {
+					no_space();
+				} else if ((*i)[MATCH_CHUNK_START].matched) {
+					chunk_start(i->str(MATCH_CHUNK_START));
+				}
+			}
+		}
+
+		// Finally close all remaining chunks.
+		finish_all();
+	}
+private:
+	const Tagset& tagset_;
+
+	/// Token being constructed
+	Token* tok_;
+
+	/// Sentence being constructed
+	Sentence::Ptr sent_;
+
+	/// Chunk being constructed
+	boost::shared_ptr<Chunk> chunk_;
+
+	/// Output chunk buffer
+	std::deque< boost::shared_ptr<Chunk> >& obuf_;
+
+	/// Handle a typed <chunk> tag: "s" opens a new sentence, any other
+	/// type moves the current non-empty chunk to the output buffer.
+	void chunk_start(const std::string& type) {
+		if (type == "s") {
+			if (!sent_->empty()) {
+				chunk_->append(sent_);
+			}
+			sent_ = boost::make_shared<Sentence>();
+			tok_->set_wa(PwrNlp::Whitespace::Newline);
+		} else {
+			if (!chunk_->empty()) {
+				obuf_.push_back(chunk_);
+				chunk_ = boost::make_shared<Chunk>();
+			}
+			return;
+		}
+	}
+
+	/// Flush the open sentence and chunk, when non-empty, to the output.
+	void finish_all() {
+		if (!sent_->empty()) {
+			chunk_->append(sent_);
+			sent_ = boost::make_shared<Sentence>();
+		}
+		if (!chunk_->empty()) {
+			obuf_.push_back(chunk_);
+			chunk_ = boost::make_shared<Chunk>();
+		}
+	}
+
+	/// Handle <ns>: set the pending token's whitespace to None.
+	void no_space() {
+		tok_->set_wa(PwrNlp::Whitespace::None);
+	}
+
+	/// Set the orth of the pending token from a UTF-8 string.
+	void orth(const std::string& orth) {
+		tok_->set_orth(UnicodeString::fromUTF8(orth));
+	}
+
+	/// Add a lexeme built from a <lex> element to the pending token.
+	void ctag(const std::string& base, const std::string& ctag, bool disamb) {
+		Tag tag = tagset_.parse_simple_tag(ctag);
+		Lexeme lex(UnicodeString::fromUTF8(base), tag);
+		lex.set_disamb(disamb);
+		tok_->add_lexeme(lex);
+	}
+
+	/// Handle </tok>: append the pending token to the sentence and start
+	/// a fresh token whose whitespace defaults to Space.
+	void token_end() {
+		sent_->append(tok_);
+		tok_ = new Token();
+		tok_->set_wa(PwrNlp::Whitespace::Space);
+	}
+};
+
+FastXcesReader::FastXcesReader(const Tagset &tagset, std::istream &is)
+	: BufferedChunkReader(tagset),
+	impl_(new FastXcesReaderImpl(tagset, chunk_buf_))
+{
+	this->is_ = &is;
+}
+
+FastXcesReader::~FastXcesReader()
+{
+}
+
+
+void FastXcesReader::set_option(const std::string& /*option*/)
+{
+}
+
+FastXcesReader::FastXcesReader(const Tagset &tagset, const std::string &filename)
+	: BufferedChunkReader(tagset),
+	impl_(new FastXcesReaderImpl(tagset, chunk_buf_))
+{
+	this->is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
+
+	// A failed open sets failbit, which bad() does not report.
+	if (!this->is_owned_->good()) {
+		throw Corpus2Error("File not found!");
+	}
+	else {
+		this->is_ = is_owned_.get();
+	}
+}
+
+void FastXcesReader::ensure_more()
+{
+	if (is_->good()) {
+		impl_->parse_stream(*is_);
+	}
+}
+
+
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/fastxces.h b/libcorpus2/io/fastxces.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed822b0698f1a84f6312512573d5f34fd3345feb
--- /dev/null
+++ b/libcorpus2/io/fastxces.h
@@ -0,0 +1,45 @@
+#ifndef LIBCORPUS2_IO_FASTXCES_H
+#define LIBCORPUS2_IO_FASTXCES_H
+
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/xces.h>
+#include <libcorpus2/chunk.h>
+#include <deque>
+#include <boost/scoped_ptr.hpp>
+
+namespace Corpus2 {
+
+class FastXcesReaderImpl;
+
+/**
+ * XCES reader that recognises corpus elements with regular expressions
+ * (see FastXcesReaderImpl) rather than a full XML parser.
+ */
+class FastXcesReader : public BufferedChunkReader
+{
+public:
+	FastXcesReader(const Tagset& tagset, std::istream& is);
+
+	FastXcesReader(const Tagset& tagset, const std::string& filename);
+
+	~FastXcesReader();
+
+	std::istream& is() {
+		return *is_;
+	}
+
+	void set_option(const std::string& option);
+
+protected:
+	void ensure_more();
+
+	// std::istream& is_;
+	std::istream* is_;
+	boost::scoped_ptr<std::istream> is_owned_;
+
+	boost::scoped_ptr<FastXcesReaderImpl> impl_;
+};
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_FASTXCES_H
diff --git a/tests/io.cpp b/tests/io.cpp
index c4661d975b16ae2f4817283bbfb077e762eac11e..3a65f0ded997c2ba3da183d22b28584752994d3e 100644
--- a/tests/io.cpp
+++ b/tests/io.cpp
@@ -20,6 +20,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <libpwrutils/bitset.h>
 #include <libcorpus2/tagsetmanager.h>
 #include <libcorpus2/io/xcesreader.h>
+#include <libcorpus2/io/fastxces.h>
 #include <libcorpus2/io/writer.h>
 
 namespace {
@@ -54,6 +55,37 @@ static char swiatopoglad[] =
 "</cesAna>\n"
 ;
 
+static char swiatopoglad_noid[] =
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
+"<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
+"<chunkList>\n"
+"<chunk>\n"
+"<chunk type=\"s\">\n"
+"<tok>\n"
+"<orth>Uważam</orth>\n"
+"<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
+"</tok>\n"
+"<ns/>\n"
+"<tok>\n"
+"<orth>,</orth>\n"
+"<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>że</orth>\n"
+"<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>światopogląd</orth>\n"
+"<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
+"<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
+"</tok>\n"
+"</chunk>\n"
+"</chunk>\n"
+"</chunkList>\n"
+"</cesAna>\n"
+;
+
 static char swiatopoglad_broken[] =
 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
 "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
@@ -113,6 +145,21 @@ BOOST_AUTO_TEST_CASE( iobase )
 	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad);
 }
 
+
+BOOST_AUTO_TEST_CASE( fast )
+{
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	std::stringstream ssin;
+	ssin << swiatopoglad;
+	Corpus2::FastXcesReader xr(tagset, ssin);
+	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+	std::stringstream ss;
+	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
+	w->write_chunk(*chunk);
+	w->finish();
+	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad_noid);
+}
+
 BOOST_AUTO_TEST_CASE( io_oo )
 {
 	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");