Skip to content
Snippets Groups Projects
Commit 43ff7c3f authored by ilor's avatar ilor
Browse files

add util/ioformat-options, change tools dir, add corpus-get

parent 8b8983e9
Branches
No related merge requests found
...@@ -65,7 +65,7 @@ MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR) ...@@ -65,7 +65,7 @@ MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR)
add_subdirectory(libpwrutils) add_subdirectory(libpwrutils)
add_subdirectory(libcorpus2) add_subdirectory(libcorpus2)
add_subdirectory(tagset-tool) add_subdirectory(corpus2tools)
add_subdirectory(tests) add_subdirectory(tests)
add_subdirectory(swig) add_subdirectory(swig)
...@@ -9,16 +9,18 @@ endif (Libedit_FOUND) ...@@ -9,16 +9,18 @@ endif (Libedit_FOUND)
include_directories( ${CMAKE_SOURCE_DIR} ) include_directories( ${CMAKE_SOURCE_DIR} )
add_executable( tagset-tool main.cpp ) add_executable( tagset-tool tagset-tool.cpp )
target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS}) target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
add_executable( corpus-get corpus-get.cpp )
target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
include_directories(${Boost_INCLUDE_DIR}) include_directories(${Boost_INCLUDE_DIR})
link_directories(${Boost_LIBRARY_DIRS}) link_directories(${Boost_LIBRARY_DIRS})
if(UNIX) if(UNIX)
install(TARGETS tagset-tool install(TARGETS tagset-tool corpus-get
RUNTIME DESTINATION bin) RUNTIME DESTINATION bin)
endif(UNIX) endif(UNIX)
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/ioformat-options.h>
#include <boost/algorithm/string.hpp>
#include <boost/program_options.hpp>
#include <iostream>
#include <map>
#include <string>
int main(int argc, char** argv)
{
std::string tagset_name, filename;
std::string input_format, output_format;
int sentence, token = -1;
size_t stats = 0;
using boost::program_options::value;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("filename,F", value(&filename),
"filename")
("sentence,S", value(&sentence),
"Sentence idx")
("stats,s", value(&stats),
"Stats")
("token,T", value(&token),
"Token idx ")
("tagset,t", value(&tagset_name)->default_value("kipi"),
"Tagset name")
;
Corpus2::add_input_options(desc);
Corpus2::add_output_options(desc);
boost::program_options::variables_map vm;
boost::program_options::positional_options_description p;
p.add("filename", 1);
p.add("sentence", 1);
p.add("token", 1);
try {
boost::program_options::store(
boost::program_options::command_line_parser(argc, argv)
.options(desc).positional(p).run(), vm);
} catch (boost::program_options::error& e) {
std::cerr << e.what() << "\n";
return 2;
}
boost::program_options::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
return 1;
}
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
boost::shared_ptr<Corpus2::TokenReader> reader;
reader = Corpus2::create_reader(vm, tagset, filename);
Corpus2::Sentence::Ptr s;
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer = Corpus2::create_writer(vm, tagset);
std::map<int,int> lens;
for (int i = 0; i <= sentence; ++i) {
s = reader->get_next_sentence();
if (s) {
lens[s->size()]++;
if (s->size() > stats) {
std::cerr << i << "\n";
writer->write_sentence(*s);
}
}
}
if (s) {
if (token == -1) {
writer->write_sentence(*s);
} else if (static_cast<size_t>(token) < s->size()) {
writer->write_token(*(*s)[token]);
}
}
if (stats) {
typedef std::pair<int,int> pp;
foreach (const pp& p, lens) {
std::cerr << p.first << " " << p.second << "\n";
}
}
}
File moved
...@@ -75,6 +75,7 @@ SET(libcorpus2_STAT_SRC ...@@ -75,6 +75,7 @@ SET(libcorpus2_STAT_SRC
io/xceswriter.cpp io/xceswriter.cpp
io/xmlreader.cpp io/xmlreader.cpp
io/xmlwriter.cpp io/xmlwriter.cpp
util/ioformat-options.cpp
util/settings.cpp util/settings.cpp
util/symboldictionary.cpp util/symboldictionary.cpp
util/tokentimer.cpp util/tokentimer.cpp
......
#include <libcorpus2/util/ioformat-options.h>
#include <boost/algorithm/string/join.hpp>
namespace Corpus2 {
/**
 * Register the "input-format" option (short form -i) in the given options
 * description. The help text lists the reader types reported by
 * TokenReader::available_reader_types_help(), separated by spaces.
 *
 * @param desc            options description to extend
 * @param default_format  value used when the option is not given
 */
void add_input_options(
	boost::program_options::options_description& desc,
	const std::string& default_format /*= "xces"*/
	)
{
	namespace po = boost::program_options;

	std::string help_text = "Input format, any of: ";
	help_text += boost::algorithm::join(
		Corpus2::TokenReader::available_reader_types_help(), " ");
	help_text += "\n";

	desc.add_options()(
		"input-format,i",
		po::value<std::string>()->default_value(default_format),
		help_text.c_str());
}
/**
 * Register the "output-format" option (short form -o) in the given options
 * description. The help text lists the writer types reported by
 * TokenWriter::available_writer_types_help(), separated by spaces.
 *
 * @param desc            options description to extend
 * @param default_format  value used when the option is not given
 */
void add_output_options(
	boost::program_options::options_description& desc,
	const std::string& default_format /*= "xces"*/
	)
{
	namespace po = boost::program_options;

	std::string help_text = "Output format, any of: ";
	help_text += boost::algorithm::join(
		Corpus2::TokenWriter::available_writer_types_help(), " ");
	help_text += "\n";

	desc.add_options()(
		"output-format,o",
		po::value<std::string>()->default_value(default_format),
		help_text.c_str());
}
/**
 * Create a token reader of the type stored under "input-format" in vm.
 *
 * An empty filename or "-" selects a stream reader on std::cin; any other
 * value is passed to the path reader factory.
 *
 * @param vm        parsed options; "input-format" is read as a std::string
 * @param tagset    tagset handed to the reader factory
 * @param filename  input path, or "-" / empty for standard input
 * @return the reader produced by the TokenReader factory
 */
boost::shared_ptr<Corpus2::TokenReader> create_reader(
	boost::program_options::variables_map& vm,
	const Corpus2::Tagset& tagset,
	const std::string& filename /*= "-"*/
	)
{
	std::string reader_type = vm["input-format"].as<std::string>();
	const bool from_stdin = filename.empty() || filename == "-";
	if (!from_stdin) {
		return Corpus2::TokenReader::create_path_reader(
			reader_type, tagset, filename);
	}
	return Corpus2::TokenReader::create_stream_reader(
		reader_type, tagset, std::cin);
}
/**
 * Create a token writer of the type stored under "output-format" in vm.
 *
 * An empty filename or "-" selects a stream writer on std::cout; any other
 * value is passed to the path writer factory.
 *
 * @param vm        parsed options; "output-format" is read as a std::string
 * @param tagset    tagset handed to the writer factory
 * @param filename  output path, or "-" / empty for standard output
 * @return the writer produced by the TokenWriter factory
 */
boost::shared_ptr<Corpus2::TokenWriter> create_writer(
	boost::program_options::variables_map& vm,
	const Corpus2::Tagset& tagset,
	const std::string& filename /*= "-"*/
	)
{
	std::string writer_type = vm["output-format"].as<std::string>();
	const bool to_stdout = filename.empty() || filename == "-";
	if (!to_stdout) {
		return Corpus2::TokenWriter::create_path_writer(
			writer_type, filename, tagset);
	}
	return Corpus2::TokenWriter::create_stream_writer(
		writer_type, std::cout, tagset);
}
} /* end ns Corpus2 */
// The guard macro tested here must be the same one that is defined below,
// otherwise a second inclusion would re-declare the default arguments.
#ifndef LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
#define LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H

#include <boost/program_options.hpp>
#include <libcorpus2/io/reader.h>
#include <libcorpus2/io/writer.h>

namespace Corpus2 {

/**
 * Add the "input-format" option (short form -i) to an options description.
 *
 * @param desc            options description to extend
 * @param default_format  value used when the option is not given
 */
void add_input_options(
	boost::program_options::options_description& desc,
	const std::string& default_format = "xces"
	);

/**
 * Add the "output-format" option (short form -o) to an options description.
 *
 * @param desc            options description to extend
 * @param default_format  value used when the option is not given
 */
void add_output_options(
	boost::program_options::options_description& desc,
	const std::string& default_format = "xces"
	);

/**
 * Create a reader of the type selected by the "input-format" option.
 *
 * @param vm        parsed options; must contain "input-format"
 *                  (see add_input_options)
 * @param tagset    tagset passed on to the reader
 * @param filename  input path; "-" or an empty string means standard input
 */
boost::shared_ptr<Corpus2::TokenReader> create_reader(
	boost::program_options::variables_map& vm,
	const Corpus2::Tagset& tagset,
	const std::string& filename = "-"
	);

/**
 * Create a writer of the type selected by the "output-format" option.
 *
 * @param vm        parsed options; must contain "output-format"
 *                  (see add_output_options)
 * @param tagset    tagset passed on to the writer
 * @param filename  output path; "-" or an empty string means standard output
 */
boost::shared_ptr<Corpus2::TokenWriter> create_writer(
	boost::program_options::variables_map& vm,
	const Corpus2::Tagset& tagset,
	const std::string& filename = "-"
	);

} /* end ns Corpus2 */

#endif // LIBCORPUS2_UTIL_IOFORMAT_OPTIONS_H
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment