diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 2af29e1d34104fca7cc965091d68724b3c56597d..279ef42bd24546d1d70b947834d7f4a6977adbb7 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -57,6 +57,7 @@ SET(libcorpus2_STAT_SRC
 	tokenmetadata.cpp
 	io/cclreader.cpp
 	io/cclwriter.cpp
+	io/helpers.cpp
 	io/fastxces.cpp
 	io/nonewriter.cpp
 	io/orthwriter.cpp
diff --git a/libcorpus2/io/helpers.cpp b/libcorpus2/io/helpers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aac87844ee18597100b350a75af290b724aed34
--- /dev/null
+++ b/libcorpus2/io/helpers.cpp
@@ -0,0 +1,24 @@
+#include <libcorpus2/io/helpers.h>
+#include <libcorpus2/io/reader.h>
+#include <sstream>
+namespace Corpus2 {
+
+std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+	const std::string& data,
+	const Tagset& tagset,
+	const std::string& format)
+{
+	// Put the input into a stream, since the reader factory below takes one.
+	std::stringstream ss;
+	ss << data;
+	boost::shared_ptr<TokenReader> reader = TokenReader::create_stream_reader(
+		format, tagset, ss);
+	std::vector<boost::shared_ptr<Chunk> > chunks;
+	// Collect chunks until the reader hands back a null pointer.
+	while (boost::shared_ptr<Chunk> c = reader->get_next_chunk()) {
+		chunks.push_back(c);
+	}
+	return chunks;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/helpers.h b/libcorpus2/io/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..92d8a896e4342572b399f91c9ccf82f2387d10ac
--- /dev/null
+++ b/libcorpus2/io/helpers.h
@@ -0,0 +1,33 @@
+#ifndef LIBCORPUS2_IO_HELPERS_H
+#define LIBCORPUS2_IO_HELPERS_H
+
+#include <libcorpus2/chunk.h>
+#include <libcorpus2/tagset.h>
+#include <boost/shared_ptr.hpp>
+#include <string>
+#include <vector>
+
+namespace Corpus2 {
+
+/**
+ * Read every chunk from a string held in memory.
+ *
+ * The string is copied into a stream, a reader for the given format is
+ * created over that stream, and chunks are collected until the reader
+ * returns a null pointer.
+ *
+ * @param data   text to read chunks from; the function name indicates
+ *               UTF-8, but no encoding check is done in this function
+ * @param tagset tagset passed on to the reader
+ * @param format reader identifier passed to
+ *               TokenReader::create_stream_reader
+ * @return the chunks in the order the reader returned them
+ */
+std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+	const std::string& data,
+	const Tagset& tagset,
+	const std::string& format);
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_HELPERS_H
diff --git a/swig/libcorpuschunk.i b/swig/libcorpuschunk.i
index baa3fc1e715e4a2aa2b401ce545ce300432bf1c9..df798640ec75a6ac7f0ddfd113c4aeb3050f82ce 100644
--- a/swig/libcorpuschunk.i
+++ b/swig/libcorpuschunk.i
@@ -46,6 +46,7 @@ namespace Corpus2 {
   };
 }
 
+%template(ChunkPtrVector) std::vector<boost::shared_ptr<Chunk> >;
 
 using namespace std;
 using namespace Corpus2;
diff --git a/swig/libcorpustokenreader.i b/swig/libcorpustokenreader.i
index 93043ff8446bdb1adc53001c4da9781f2399dae1..0a367d531e933538db5866133b8a9ed2a9ab2a6f 100644
--- a/swig/libcorpustokenreader.i
+++ b/swig/libcorpustokenreader.i
@@ -4,6 +4,7 @@
 %module libcorpustokenreader
 %{
   #include <libcorpus2/io/reader.h>
+  #include <libcorpus2/io/helpers.h>
 %}
 
 %include "libcorpustag.i"
@@ -73,6 +74,10 @@ namespace Corpus2 {
     static std::string reader_help(const std::string& class_id);
     static std::vector<std::string> available_reader_types_help();
   };
+
+  std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+    const std::string& data, const Tagset& tagset, const std::string& format);
+
 }
 
 using namespace std;