diff --git a/CMakeLists.txt b/CMakeLists.txt
index 952c9deab6259b08f73a09e4bdea9aaf9ac8172e..86c2925c6867f3d1323290c893f180621b7bcc9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR)
 
 add_subdirectory(libpwrutils)
 add_subdirectory(libcorpus2)
-add_subdirectory(tagset-tool)
+add_subdirectory(corpus2tools)
 add_subdirectory(tests)
 
-# add_subdirectory(swig)
+add_subdirectory(swig)
diff --git a/tagset-tool/CMakeLists.txt b/corpus2tools/CMakeLists.txt
similarity index 68%
rename from tagset-tool/CMakeLists.txt
rename to corpus2tools/CMakeLists.txt
index c527edaab7ccfb8a7e2130c0c56fc6e6ef1e49cb..3e7b198ba60e00065d0545eabb59dda56df2a145 100644
--- a/tagset-tool/CMakeLists.txt
+++ b/corpus2tools/CMakeLists.txt
@@ -9,16 +9,18 @@ endif (Libedit_FOUND)
 
 include_directories( ${CMAKE_SOURCE_DIR} )
 
-add_executable( tagset-tool main.cpp )
-
+add_executable( tagset-tool tagset-tool.cpp )
 target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
 
+add_executable( corpus-get corpus-get.cpp )
+target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
+
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})
 
 if(UNIX)
 
-	install(TARGETS tagset-tool
+	install(TARGETS tagset-tool corpus-get
 		RUNTIME DESTINATION bin)
 
 endif(UNIX)
diff --git a/corpus2tools/corpus-get.cpp b/corpus2tools/corpus-get.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ea144f6a296670d209c069c527d471dbfe07ddc
--- /dev/null
+++ b/corpus2tools/corpus-get.cpp
@@ -0,0 +1,104 @@
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+#include <iostream>
+#include <map>
+#include <string>
+
+/**
+ * corpus-get: reads sentences from a corpus and writes the sentence with the
+ * requested index (or one token of it) using the selected output format.
+ *
+ * Positional arguments are, in order: filename, sentence index, token index.
+ * With a non-zero --stats threshold, every sentence read that is longer than
+ * the threshold is also written, its index is printed to stderr, and a
+ * sentence-length histogram is printed to stderr at the end.
+ *
+ * Exit codes: 0 on success, 1 after printing help, 2 on bad options.
+ */
+int main(int argc, char** argv)
+{
+	std::string tagset_name, filename;
+	// Index of the sentence to fetch. Defaults to the first sentence so the
+	// read loop below never runs on an uninitialised bound.
+	int sentence = 0;
+	// Index of the token to fetch; -1 means "write the whole sentence".
+	int token = -1;
+	// Sentence-length threshold of the statistics mode; 0 turns it off.
+	size_t stats = 0;
+	using boost::program_options::value;
+
+	boost::program_options::options_description desc("Allowed options");
+	desc.add_options()
+			("filename,F", value(&filename),
+			 "filename")
+			("sentence,S", value(&sentence),
+			 "Sentence idx")
+			("stats,s", value(&stats),
+			 "Stats")
+			("token,T", value(&token),
+			 "Token idx ")
+			("tagset,t", value(&tagset_name)->default_value("kipi"),
+			 "Tagset name")
+			// Registered here so that the vm.count("help") check can fire.
+			("help,h", "Show help")
+			;
+	Corpus2::add_input_options(desc);
+	Corpus2::add_output_options(desc);
+	boost::program_options::variables_map vm;
+	boost::program_options::positional_options_description p;
+	p.add("filename", 1);
+	p.add("sentence", 1);
+	p.add("token", 1);
+
+	try {
+		boost::program_options::store(
+			boost::program_options::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+		// notify() can throw too, so it is kept inside the guarded region.
+		boost::program_options::notify(vm);
+	} catch (const boost::program_options::error& e) {
+		std::cerr << e.what() << "\n";
+		return 2;
+	}
+	if (vm.count("help")) {
+		std::cout << desc << "\n";
+		return 1;
+	}
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
+	boost::shared_ptr<Corpus2::TokenReader> reader;
+	reader = Corpus2::create_reader(vm, tagset, filename);
+	Corpus2::Sentence::Ptr s;
+	boost::shared_ptr<Corpus2::TokenWriter> writer;
+	writer = Corpus2::create_writer(vm, tagset);
+	// Histogram: sentence length -> number of sentences read with that length.
+	std::map<int,int> lens;
+	for (int i = 0; i <= sentence; ++i) {
+		s = reader->get_next_sentence();
+		if (!s) {
+			// Input ended before the requested sentence; nothing to write.
+			break;
+		}
+		lens[s->size()]++;
+		// Only in statistics mode: report sentences above the threshold.
+		if (stats > 0 && s->size() > stats) {
+			std::cerr << i << "\n";
+			writer->write_sentence(*s);
+		}
+	}
+	if (s) {
+		if (token == -1) {
+			writer->write_sentence(*s);
+		} else if (static_cast<size_t>(token) < s->size()) {
+			writer->write_token(*(*s)[token]);
+		}
+	}
+	if (stats) {
+		for (std::map<int,int>::const_iterator it = lens.begin();
+				it != lens.end(); ++it) {
+			std::cerr << it->first << " " << it->second << "\n";
+		}
+	}
+	return 0;
+}
diff --git a/tagset-tool/main.cpp b/corpus2tools/tagset-tool.cpp
similarity index 100%
rename from tagset-tool/main.cpp
rename to corpus2tools/tagset-tool.cpp
diff --git a/doc/corpstats.py b/doc/corpstats.py
new file mode 100755
index 0000000000000000000000000000000000000000..bbfc6dbe2fd547a0c20a28a3cd1cf33fafc11b71
--- /dev/null
+++ b/doc/corpstats.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+from optparse import OptionParser
+from collections import defaultdict as dd
+import corpus2
+
+descr = """%prog [options] CORPUSFILE
+Reads a corpus file and reports some statistics.
+This script is a demo of the Python API.
+"""
+
+def tokens(rdr):
+    """Yields subsequent tokens from a reader.
+    Declared here for demonstration."""
+    while True:
+        tok = rdr.get_next_token()
+        if not tok:
+            break
+        yield tok
+
+def sentences(rdr):
+    """Yields subsequent sentences from a reader.
+    Declared here for demonstration."""
+    while True:
+        sent = rdr.get_next_sentence()
+        if not sent:
+            break
+        yield sent
+
+def chunks(rdr):
+    """Yields subsequent chunks from a reader."""
+    while True:
+        chunk = rdr.get_next_chunk()
+        if not chunk:
+            break
+        yield chunk
+
+def go():
+    """Parses the command line, reads the corpus chunk by chunk and prints
+    token, sentence and chunk counts plus the most frequent tags."""
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+        dest='input_format', default='xces',
+        help='set the input format; default: xces')
+    #parser.add_option('-o', '--output-format', type='string', action='store',
+        #dest='output_format', default='xces',
+        #help='set the output format; default: xces')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+        dest='tagset', default='kipi',
+        help='set the tagset used in input; default: kipi')
+    parser.add_option('-v', '--verbose', action='store_true',
+        dest='verbose', default=False,
+        help='report each token')
+    parser.add_option('-n', '--number-of-tags', type='int', action='store',
+        dest='num_tags', default=10,
+        help='set the max number of tags to report')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 1:
+        # usage errors go to stderr so they do not mix with the report
+        print >>sys.stderr, 'You need to provide an input corpus.'
+        print >>sys.stderr, 'See %s --help' % sys.argv[0]
+        sys.exit(1)
+
+    inpath = args[0]
+    # load a tagset, create a reader
+    tagset = corpus2.get_named_tagset(options.tagset)
+    rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
+    # init stats (for this example)
+    num_toks, num_sents, num_chunks = 0, 0, 0
+    tag_count = dd(int)
+
+    for chunk in chunks(rdr):
+        for sent in chunk.sentences():
+            for tok in sent.tokens():
+                if options.verbose:
+                    print tok.orth_utf8()
+
+                for lex in tok.lexemes():
+                    tag_str = tagset.tag_to_string(lex.tag())
+                    tag_count[tag_str] += 1
+
+                    if options.verbose:
+                        lemma = lex.lemma_utf8()
+                        print ('+' if lex.is_disamb() else ' '), lemma, tag_str
+                        # if you want a unicode object, orth_utf8().decode('utf-8')
+                num_toks += 1
+            num_sents += 1
+        num_chunks += 1
+
+
+    print 'Tokens:', num_toks
+    print 'Sents: ', num_sents
+    print 'Chunks:', num_chunks
+    print
+    print 'Most frequent tags:'
+    for tc in sorted(tag_count.items(), key=lambda tc: (-tc[1], tc[0]))[:options.num_tags]:
+        print '\t%s\t%d' % tc
+
+
+
+if __name__ == '__main__':
+    go()
diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index 9b8fbd46064246325a2a2b8fe53d37261f137275..d351a6de1f3c64bd047701674e26c1a41e1461b3 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -57,6 +57,7 @@ SET(libcorpus2_STAT_SRC
 	tokenmetadata.cpp
 	io/cclreader.cpp
 	io/cclwriter.cpp
+	io/helpers.cpp
 	io/fastxces.cpp
 	io/iob-chan.cpp
 	io/nonewriter.cpp
@@ -76,6 +77,7 @@ SET(libcorpus2_STAT_SRC
 	io/xceswriter.cpp
 	io/xmlreader.cpp
 	io/xmlwriter.cpp
+	util/ioformat-options.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp
diff --git a/libcorpus2/io/helpers.cpp b/libcorpus2/io/helpers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9aac87844ee18597100b350a75af290b724aed34
--- /dev/null
+++ b/libcorpus2/io/helpers.cpp
@@ -0,0 +1,22 @@
+#include <libcorpus2/io/helpers.h>
+#include <libcorpus2/io/reader.h>
+#include <sstream>
+namespace Corpus2 {
+
+std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+	const std::string& data,
+	const Tagset& tagset,
+	const std::string& format)
+{
+	std::stringstream ss;
+	ss << data;
+	boost::shared_ptr<TokenReader> reader = TokenReader::create_stream_reader(
+		format, tagset, ss);
+	std::vector<boost::shared_ptr<Chunk> > chunks;
+	while (boost::shared_ptr<Chunk> c = reader->get_next_chunk()) {
+		chunks.push_back(c);
+	}
+	return chunks;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/helpers.h b/libcorpus2/io/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..92d8a896e4342572b399f91c9ccf82f2387d10ac
--- /dev/null
+++ b/libcorpus2/io/helpers.h
@@ -0,0 +1,31 @@
+#ifndef LIBCORPUS2_IO_HELPERS_H
+#define LIBCORPUS2_IO_HELPERS_H
+
+#include <libcorpus2/chunk.h>
+#include <libcorpus2/tagset.h>
+#include <boost/shared_ptr.hpp>
+#include <string>
+#include <vector>
+
+namespace Corpus2 {
+
+/**
+ * Read all chunks contained in a string.
+ *
+ * The data is fed to a stream reader created for the given format and
+ * tagset; chunks are returned in the order the reader yields them.
+ *
+ * @param data   corpus contents (UTF-8 is expected, as the name suggests)
+ * @param tagset tagset handed to the reader
+ * @param format reader class identifier, as accepted by
+ *               TokenReader::create_stream_reader
+ * @return all chunks read until the reader reports no more
+ */
+std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+	const std::string& data,
+	const Tagset& tagset,
+	const std::string& format);
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_IO_HELPERS_H
diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index 67552dff647ab71e176e2b233e6d003383a63580..8f28008dafccacd006d0656fa87a81844df67257 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -25,7 +25,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
namespace Corpus2 { bool RftWriter::registered = TokenWriter::register_writer<RftWriter>( - "rft", "mbt,nowarn,colon,alltags,opt"); + "rft", "mbt,nowarn,colon,alltags,opt,latin2"); RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, const string_range_vector& params) @@ -46,6 +46,8 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, opt_ = true; } else if (p == "colon") { colon_ = true; + } else if (p == "latin2") { + encoding_ = p; } } @@ -53,7 +55,18 @@ RftWriter::RftWriter(std::ostream& os, const Tagset& tagset, void RftWriter::write_token(const Token& t) { - os() << t.orth_utf8(); + if (encoding_.empty()) { + os() << t.orth_utf8(); + } else { + char buf[256]; + int len = t.orth().extract(0, t.orth().length(), buf, 255, encoding_.c_str()); + if (len < 256) { + os() << buf; + } else { + std::cerr << "Characetr encoding error in codepage rft output\n"; + os() << "???"; + } + } if (t.lexemes().empty()) { if (warn_on_no_lexemes_) { std::cerr << "No lexemes for token!"; diff --git a/libcorpus2/io/rft.h b/libcorpus2/io/rft.h index 394df9720346576e5ddf394a7ccdfaf61029f122..b87b5dd616b9df8ef614965cfdc994ea2f40c193 100644 --- a/libcorpus2/io/rft.h +++ b/libcorpus2/io/rft.h @@ -64,6 +64,9 @@ private: /// Dialect flag: output all lexemes, not just the preferred one bool alltags_; + + /// Dialect flag: use non-utf8 encoding + std::string encoding_; }; class RftReader : public BufferedSentenceReader diff --git a/libcorpus2/util/ioformat-options.cpp b/libcorpus2/util/ioformat-options.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0ed6ccf7e71c50004231cc9cab3c488da1b87691 --- /dev/null +++ b/libcorpus2/util/ioformat-options.cpp @@ -0,0 +1,66 @@ +#include <libcorpus2/util/ioformat-options.h> +#include <boost/algorithm/string/join.hpp> + +namespace Corpus2 { + +void add_input_options( + boost::program_options::options_description& desc, + const std::string& default_format /*= "xces"*/ + ) +{ + std::string readers = 
boost::algorithm::join( + Corpus2::TokenReader::available_reader_types_help(), " "); + std::string readers_help = "Input format, any of: " + readers + "\n"; + desc.add_options() + ("input-format,i", + boost::program_options::value<std::string>()->default_value(default_format), + readers_help.c_str()); +} + +void add_output_options( + boost::program_options::options_description& desc, + const std::string& default_format /*= "xces"*/ + ) +{ + std::string writers = boost::algorithm::join( + Corpus2::TokenWriter::available_writer_types_help(), " "); + std::string writers_help = "Output format, any of: " + writers + "\n"; + desc.add_options() + ("output-format,o", + boost::program_options::value<std::string>()->default_value(default_format), + writers_help.c_str()); +} + +boost::shared_ptr<Corpus2::TokenReader> create_reader( + boost::program_options::variables_map& vm, + const Corpus2::Tagset& tagset, + const std::string& filename /*= "-"*/ + ) +{ + std::string format = vm["input-format"].as<std::string>(); + if (filename.empty() || filename == "-") { + return Corpus2::TokenReader::create_stream_reader( + format, tagset, std::cin); + } else { + return Corpus2::TokenReader::create_path_reader( + format, tagset, filename); + } +} + +boost::shared_ptr<Corpus2::TokenWriter> create_writer( + boost::program_options::variables_map& vm, + const Corpus2::Tagset& tagset, + const std::string& filename /*= "-"*/ + ) +{ + std::string format = vm["output-format"].as<std::string>(); + if (filename.empty() || filename == "-") { + return Corpus2::TokenWriter::create_stream_writer( + format, std::cout, tagset); + } else { + return Corpus2::TokenWriter::create_path_writer( + format, filename, tagset); + } +} + +} /* end ns Corpus2 */ diff --git a/libcorpus2/util/ioformat-options.h b/libcorpus2/util/ioformat-options.h new file mode 100644 index 0000000000000000000000000000000000000000..cab00546d311f2db27d3b2924f579702b2fa90fc --- /dev/null +++ b/libcorpus2/util/ioformat-options.h @@ -0,0 
+1,52 @@
+#ifndef LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+#define LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+/**
+ * Add the "input-format" (-i) option to desc. Its help text lists the
+ * reader types reported by TokenReader::available_reader_types_help().
+ */
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces"
+	);
+
+/**
+ * Add the "output-format" (-o) option to desc. Its help text lists the
+ * writer types reported by TokenWriter::available_writer_types_help().
+ */
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces"
+	);
+
+/**
+ * Create a reader of the format stored in vm under "input-format".
+ * An empty filename or "-" reads from standard input; any other value is
+ * passed to TokenReader::create_path_reader.
+ */
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename = "-"
+	);
+
+/**
+ * Create a writer of the format stored in vm under "output-format".
+ * An empty filename or "-" writes to standard output; any other value is
+ * passed to TokenWriter::create_path_writer.
+ */
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename = "-"
+	);
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
diff --git a/swig/CMakeLists.txt b/swig/CMakeLists.txt
index 978ece174dff98284913c1d162a075b38422948d..b61084c9d56e15d6dec10f99ce8b0280b6b6e35e 100644
--- a/swig/CMakeLists.txt
+++ b/swig/CMakeLists.txt
@@ -2,15 +2,12 @@
 
 PROJECT(corpus2SwigWrap)
 
-find_package(Corpus2 1.0.8 REQUIRED)
-set(CORPUS2_LIBS ${Corpus2_LIBRARY})
-
-find_package(PwrUtils 1.0.1 REQUIRED)
-set(PWRUTILS_LIBS ${PwrUtils_LIBRARY})
+set(CORPUS2_LIBS corpus2)
+set(PWRUTILS_LIBS pwrutils)
 set(CORPUS2_PWR_LIBS ${CORPUS2_LIBS} ${PWRUTILS_LIBS})
 
-include_directories (${Libcorpus2_SOURCE_DIR} "../libcorpus2")
-include_directories (${Libpwrutils_SOURCE_DIR} "../libpwrutils")
+include_directories (${corpus2_SOURCE_DIR})
+include_directories (${pwrutils_SOURCE_DIR})
 link_directories(${Libcorpus2_BINARY_DIR})
 
 
@@ -31,6 +28,12 @@ message(STATUS "INFO: " "python lib: ${PYTHON_INSTDIR}" )
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_PATH})
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 
+if(CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-ansi $ENV{CXXFLAGS}")
+	set(CMAKE_CXX_FLAGS_DEBUG "-O0 -DDEBUG -ggdb3 -ansi $ENV{CXXFLAGS}")
+	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -fno-omit-frame-pointer")
+endif(CMAKE_COMPILER_IS_GNUCXX)
+
 # -----------------------------------------------------------------------------
 # -----------------------------------------------------------------------------
 
diff --git a/swig/boost_shared_ptr.i b/swig/boost_shared_ptr.i
index 7803b22154c79cba53773442e3387ca0e466f7da..6f3ec42a0061504e01d2802c46156964fa3928fe 100644
--- a/swig/boost_shared_ptr.i
+++ b/swig/boost_shared_ptr.i
@@ -14,6 +14,14 @@ namespace boost {
     shared_ptr();
     shared_ptr(T * p);
     T* operator->();
+    T* get();
+
+%pythoncode %{
+    def __bool__(self):
+        return self.get() is not None
+    __nonzero__=__bool__
+%}
+
   private:
     T * px;
     int pn;
diff --git a/swig/libcorpuschunk.i b/swig/libcorpuschunk.i
index baa3fc1e715e4a2aa2b401ce545ce300432bf1c9..df798640ec75a6ac7f0ddfd113c4aeb3050f82ce 100644
--- a/swig/libcorpuschunk.i
+++ b/swig/libcorpuschunk.i
@@ -46,6 +46,7 @@ namespace Corpus2 {
   };
 }
 
+%template(ChunkPtrVector) std::vector<boost::shared_ptr<Corpus2::Chunk> >;
 using namespace std;
 using namespace Corpus2;
 
diff --git a/swig/libcorpustokenreader.i b/swig/libcorpustokenreader.i
index 93043ff8446bdb1adc53001c4da9781f2399dae1..ef2f16cb1e7fd19f01560be26fa64b1a518d5e33 100644
--- a/swig/libcorpustokenreader.i
+++ b/swig/libcorpustokenreader.i
@@ -4,6 +4,7 @@
 %module libcorpustokenreader
 %{
   #include <libcorpus2/io/reader.h>
+  #include <libcorpus2/io/helpers.h>
 %}
 
 %include "libcorpustag.i"
@@ -17,13 +18,15 @@
 
 %nodefaultctor Corpus2::TokenReader;
 %template(TokenReaderPtr) boost::shared_ptr<Corpus2::TokenReader>;
+%template(TokenPtr) boost::shared_ptr<Corpus2::Token>;
 // %template(StdStringVector) std::vector<std::string>;
 // %template(ChunkPtr) boost::shared_ptr<Corpus2::Chunk>;
-
+typedef boost::shared_ptr<Corpus2::Token> TokenPtr;
 namespace Corpus2 {
   class TokenReader {
   public:
     typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
+    //typedef boost::shared_ptr<Token> TokenPtr;
 
     /* --------------------------------------------------------------------- */
     explicit TokenReader(const Tagset& tagset);
@@ -38,6 +41,7 @@ namespace Corpus2 {
         return NULL;
       }
     }
+    %feature("autodoc", "1");
     static TokenReaderPtr create_path_reader(
       const std::string& class_id,
       const Tagset& tagset,
@@ -51,13 +55,14 @@ namespace Corpus2 {
         return NULL;
       }
     }
+    %feature("autodoc", "1");
     static TokenReaderPtr create_stream_reader(
       const std::string& class_id,
       const Tagset& tagset,
       std::istream& stream);
 
     /* --------------------------------------------------------------------- */
-    virtual Token* get_next_token() = 0;
+    /* virtual Token* get_next_token() = 0; */
     virtual Sentence::Ptr get_next_sentence() = 0;
     virtual boost::shared_ptr<Chunk> get_next_chunk() = 0;
 
@@ -73,6 +78,18 @@ namespace Corpus2 {
     static std::string reader_help(const std::string& class_id);
     static std::vector<std::string> available_reader_types_help();
   };
+
+  %extend TokenReader {
+    /* modify the native get_next_token to wrap the tokens into shared_ptr */
+    boost::shared_ptr<Corpus2::Token> get_next_token() {
+      return boost::shared_ptr<Corpus2::Token>(self->get_next_token());
+    }
+  }
+
+%feature("autodoc", "1");
+  std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
+    const std::string& data, const Tagset& tagset, const std::string& format);
+
 }
 
 using namespace std;