From 43ff7c3f286decbadd53f4e7d420f88ca0918077 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Mon, 16 May 2011 11:13:29 +0200
Subject: [PATCH] add util/ioformat-options, change tools dir, add corpus-get

---
 CMakeLists.txt                                |  2 +-
 {tagset-tool => corpus2tools}/CMakeLists.txt  |  8 +-
 corpus2tools/corpus-get.cpp                   | 94 +++++++++++++++++++
 .../main.cpp => corpus2tools/tagset-tool.cpp  |  0
 libcorpus2/CMakeLists.txt                     |  1 +
 libcorpus2/util/ioformat-options.cpp          | 69 ++++++++++++++
 libcorpus2/util/ioformat-options.h            | 61 ++++++++++++
 7 files changed, 231 insertions(+), 4 deletions(-)
 rename {tagset-tool => corpus2tools}/CMakeLists.txt (68%)
 create mode 100644 corpus2tools/corpus-get.cpp
 rename tagset-tool/main.cpp => corpus2tools/tagset-tool.cpp (100%)
 create mode 100644 libcorpus2/util/ioformat-options.cpp
 create mode 100644 libcorpus2/util/ioformat-options.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55f0fe3..86c2925 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR)
 
 add_subdirectory(libpwrutils)
 add_subdirectory(libcorpus2)
-add_subdirectory(tagset-tool)
+add_subdirectory(corpus2tools)
 add_subdirectory(tests)
 add_subdirectory(swig)
 
diff --git a/tagset-tool/CMakeLists.txt b/corpus2tools/CMakeLists.txt
similarity index 68%
rename from tagset-tool/CMakeLists.txt
rename to corpus2tools/CMakeLists.txt
index c527eda..3e7b198 100644
--- a/tagset-tool/CMakeLists.txt
+++ b/corpus2tools/CMakeLists.txt
@@ -9,16 +9,18 @@ endif (Libedit_FOUND)
 
 include_directories( ${CMAKE_SOURCE_DIR} )
 
-add_executable( tagset-tool main.cpp )
-
+add_executable( tagset-tool tagset-tool.cpp )
 target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
 
+add_executable( corpus-get corpus-get.cpp )
+target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
+
 include_directories(${Boost_INCLUDE_DIR})
 link_directories(${Boost_LIBRARY_DIRS})
 
 
 if(UNIX)
-	install(TARGETS tagset-tool
+	install(TARGETS tagset-tool corpus-get
 		RUNTIME DESTINATION bin)
 endif(UNIX)
 
diff --git a/corpus2tools/corpus-get.cpp b/corpus2tools/corpus-get.cpp
new file mode 100644
index 0000000..9ea144f
--- /dev/null
+++ b/corpus2tools/corpus-get.cpp
@@ -0,0 +1,94 @@
+/**
+ * corpus-get: reads a corpus and writes out one sentence, or one token of
+ * that sentence, selected by zero-based index.
+ *
+ * Positional arguments, in order: filename, sentence index, token index.
+ */
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/program_options.hpp>
+#include <boost/algorithm/string.hpp>
+#include <iostream>
+#include <map>
+#include <string>
+
+int main(int argc, char** argv)
+{
+	std::string tagset_name, filename;
+	// Defaults matter: both indices are read below even when the user
+	// does not supply them.
+	int sentence = 0, token = -1;
+	size_t stats = 0;
+	using boost::program_options::value;
+	boost::program_options::options_description desc("Allowed options");
+	desc.add_options()
+		("filename,F", value(&filename),
+		 "filename")
+		("sentence,S", value(&sentence),
+		 "Sentence idx")
+		("stats,s", value(&stats),
+		 "Stats")
+		("token,T", value(&token),
+		 "Token idx ")
+		("tagset,t", value(&tagset_name)->default_value("kipi"),
+		 "Tagset name")
+		("help,h", "Show help")
+		;
+	Corpus2::add_input_options(desc);
+	Corpus2::add_output_options(desc);
+	boost::program_options::variables_map vm;
+	boost::program_options::positional_options_description p;
+	p.add("filename", 1);
+	p.add("sentence", 1);
+	p.add("token", 1);
+
+	try {
+		boost::program_options::store(
+			boost::program_options::command_line_parser(argc, argv)
+			.options(desc).positional(p).run(), vm);
+	} catch (const boost::program_options::error& e) {
+		std::cerr << e.what() << "\n";
+		return 2;
+	}
+	boost::program_options::notify(vm);
+	if (vm.count("help")) {
+		std::cout << desc << "\n";
+		return 1;
+	}
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
+	boost::shared_ptr<Corpus2::TokenReader> reader;
+	reader = Corpus2::create_reader(vm, tagset, filename);
+	Corpus2::Sentence::Ptr s;
+	boost::shared_ptr<Corpus2::TokenWriter> writer;
+	writer = Corpus2::create_writer(vm, tagset);
+	// Histogram: sentence length -> number of sentences of that length.
+	std::map<int,int> lens;
+	// Read up to and including the requested sentence; s holds the last read.
+	for (int i = 0; i <= sentence; ++i) {
+		s = reader->get_next_sentence();
+		if (s) {
+			lens[s->size()]++;
+			// Stats mode only (0 means off, as in the report below):
+			// echo every sentence longer than the threshold.
+			if (stats > 0 && s->size() > stats) {
+				std::cerr << i << "\n";
+				writer->write_sentence(*s);
+			}
+		}
+	}
+	if (s) {
+		if (token == -1) {
+			// No token requested: output the whole sentence.
+			writer->write_sentence(*s);
+		} else if (static_cast<size_t>(token) < s->size()) {
+			writer->write_token(*(*s)[token]);
+		}
+	}
+	if (stats) {
+		typedef std::map<int,int>::const_iterator LenIter;
+		for (LenIter it = lens.begin(); it != lens.end(); ++it) {
+			std::cerr << it->first << " " << it->second << "\n";
+		}
+	}
+	return 0;
+}
diff --git a/tagset-tool/main.cpp b/corpus2tools/tagset-tool.cpp
similarity index 100%
rename from tagset-tool/main.cpp
rename to corpus2tools/tagset-tool.cpp
diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index b225155..2af29e1 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -75,6 +75,7 @@ SET(libcorpus2_STAT_SRC
 	io/xceswriter.cpp
 	io/xmlreader.cpp
 	io/xmlwriter.cpp
+	util/ioformat-options.cpp
 	util/settings.cpp
 	util/symboldictionary.cpp
 	util/tokentimer.cpp
diff --git a/libcorpus2/util/ioformat-options.cpp b/libcorpus2/util/ioformat-options.cpp
new file mode 100644
index 0000000..0ed6ccf
--- /dev/null
+++ b/libcorpus2/util/ioformat-options.cpp
@@ -0,0 +1,69 @@
+#include <libcorpus2/util/ioformat-options.h>
+#include <boost/algorithm/string/join.hpp>
+#include <iostream>
+
+namespace Corpus2 {
+
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format /*= "xces"*/
+	)
+{
+	std::string readers = boost::algorithm::join(
+		Corpus2::TokenReader::available_reader_types_help(), " ");
+	std::string readers_help = "Input format, any of: " + readers + "\n";
+	desc.add_options()
+		("input-format,i",
+		 boost::program_options::value<std::string>()->default_value(default_format),
+		 readers_help.c_str());
+}
+
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format /*= "xces"*/
+	)
+{
+	std::string writers = boost::algorithm::join(
+		Corpus2::TokenWriter::available_writer_types_help(), " ");
+	std::string writers_help = "Output format, any of: " + writers + "\n";
+	desc.add_options()
+		("output-format,o",
+		 boost::program_options::value<std::string>()->default_value(default_format),
+		 writers_help.c_str());
+}
+
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename /*= "-"*/
+	)
+{
+	std::string format = vm["input-format"].as<std::string>();
+	// An empty name or "-" selects standard input instead of a file.
+	if (filename.empty() || filename == "-") {
+		return Corpus2::TokenReader::create_stream_reader(
+			format, tagset, std::cin);
+	} else {
+		return Corpus2::TokenReader::create_path_reader(
+			format, tagset, filename);
+	}
+}
+
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename /*= "-"*/
+	)
+{
+	std::string format = vm["output-format"].as<std::string>();
+	// An empty name or "-" selects standard output instead of a file.
+	if (filename.empty() || filename == "-") {
+		return Corpus2::TokenWriter::create_stream_writer(
+			format, std::cout, tagset);
+	} else {
+		return Corpus2::TokenWriter::create_path_writer(
+			format, filename, tagset);
+	}
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/util/ioformat-options.h b/libcorpus2/util/ioformat-options.h
new file mode 100644
index 0000000..cab0054
--- /dev/null
+++ b/libcorpus2/util/ioformat-options.h
@@ -0,0 +1,61 @@
+#ifndef LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+#define LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
+
+#include <boost/program_options.hpp>
+#include <libcorpus2/io/reader.h>
+#include <libcorpus2/io/writer.h>
+#include <string>
+
+namespace Corpus2 {
+
+/**
+ * Register the "input-format" (-i) option on desc.
+ * The help text lists the reader types reported by TokenReader.
+ * @param desc options description to extend
+ * @param default_format value used when the option is not given
+ */
+void add_input_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces"
+	);
+
+/**
+ * Register the "output-format" (-o) option on desc.
+ * The help text lists the writer types reported by TokenWriter.
+ * @param desc options description to extend
+ * @param default_format value used when the option is not given
+ */
+void add_output_options(
+	boost::program_options::options_description& desc,
+	const std::string& default_format = "xces"
+	);
+
+/**
+ * Create a reader of the format stored in vm["input-format"].
+ * @param vm parsed options, expected to hold "input-format" as a string
+ * @param tagset tagset handed to the reader
+ * @param filename path to read; empty or "-" reads from std::cin
+ * @return the reader created by the TokenReader factory
+ */
+boost::shared_ptr<Corpus2::TokenReader> create_reader(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename = "-"
+	);
+
+/**
+ * Create a writer of the format stored in vm["output-format"].
+ * @param vm parsed options, expected to hold "output-format" as a string
+ * @param tagset tagset handed to the writer
+ * @param filename path to write; empty or "-" writes to std::cout
+ * @return the writer created by the TokenWriter factory
+ */
+boost::shared_ptr<Corpus2::TokenWriter> create_writer(
+	boost::program_options::variables_map& vm,
+	const Corpus2::Tagset& tagset,
+	const std::string& filename = "-"
+	);
+
+} /* end ns Corpus2 */
+
+#endif // LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
-- 
GitLab