diff --git a/CMakeScripts/FindLoki.cmake b/CMakeScripts/FindLoki.cmake new file mode 100644 index 0000000000000000000000000000000000000000..27fa48b7ec9519b8cefd698efdf33f73401e9b3b --- /dev/null +++ b/CMakeScripts/FindLoki.cmake @@ -0,0 +1,24 @@ +FIND_PATH(LOKI_INCLUDE_DIR loki/LokiExport.h /usr/include /usr/local/include) + +FIND_LIBRARY(LOKI_LIBRARY NAMES loki PATHS /usr/lib /usr/local/lib) + +MARK_AS_ADVANCED(LOKI_LIBRARY) +MARK_AS_ADVANCED(LOKI_INCLUDE_DIR) + + +IF (LOKI_INCLUDE_DIR AND LOKI_LIBRARY) + SET(LOKI_FOUND TRUE) +ENDIF (LOKI_INCLUDE_DIR AND LOKI_LIBRARY) + + +IF (LOKI_FOUND) + IF (NOT LOKI_FIND_QUIETLY) + MESSAGE(STATUS "Found LOKI: ${LOKI_LIBRARY}") + ENDIF (NOT LOKI_FIND_QUIETLY) +ELSE (LOKI_FOUND) + IF (Loki_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find Loki-lib") + ELSE (Loki_FIND_REQUIRED) + MESSAGE(STATUS "Loki not found") + ENDIF (Loki_FIND_REQUIRED) +ENDIF (LOKI_FOUND) diff --git a/wcclrules/CMakeLists.txt b/wcclrules/CMakeLists.txt index 46eb5096a4dd68dad81531ef1f59754214bacb90..ab54ea158dd5ab444437215de73c7807fec3f770 100644 --- a/wcclrules/CMakeLists.txt +++ b/wcclrules/CMakeLists.txt @@ -5,6 +5,9 @@ include_directories(${LibXML++_INCLUDE_DIRS}) link_directories(${LibXML++_LIBRARY_DIRS}) set(LIBS ${LIBS} ${LibXML++_LIBRARIES}) +find_package(Loki REQUIRED QUIET) +set(LIBS ${LIBS} loki) + include_directories( ${CMAKE_SOURCE_DIR} ) add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/") diff --git a/wcclrules/main.cpp b/wcclrules/main.cpp index da88578e7b00b24529041fb04cfbe54cc3f57cb2..40936af13b68f04d2ef9736d706bd5e90c5d9482 100644 --- a/wcclrules/main.cpp +++ b/wcclrules/main.cpp @@ -7,6 +7,8 @@ #include <libwccl/parser/Parser.h> #include <libwccl/ops/rulesequence.h> #include <libcorpus2/tagsetmanager.h> +#include <libcorpus2/util/tokentimer.h> + #include <boost/bind.hpp> #include <boost/algorithm/string.hpp> @@ -20,6 +22,7 @@ namespace { bool quiet = false; + bool progress = false; struct options { bool first; @@ -39,7 +42,10 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru ret = parser.parseRuleSequence(is); if (ret) { - std::cerr << ret->size() << "\n"; + if (!quiet) { + std::cerr << "Loaded " << ret->size() << " rule(s) from " + << filename << "\n"; + } std::copy(ret->begin(), ret->end(), std::back_inserter(rules)); return true; } else { @@ -69,13 +75,27 @@ bool load_more_rules(Wccl::Parser& parser, const std::string& filename, Wccl::Ru void do_stream(boost::shared_ptr<Corpus2::TokenWriter> writer, const Corpus2::Tagset& tagset, Wccl::RuleSequence& rules, std::istream& is, const options& opts) { - Corpus2::XcesReader xr(tagset, is); - Corpus2::Sentence::Ptr s; - while ((s = xr.get_next_sentence())) { - rules.execute_once(s); - writer->write_sentence(*s); + Corpus2::XcesReader reader(tagset, is); + Corpus2::TokenTimer& timer = Corpus2::global_timer(); + while (boost::shared_ptr<Corpus2::Chunk> c = reader.get_next_chunk()) { + foreach (boost::shared_ptr<Corpus2::Sentence>& s, c->sentences()) { + if (opts.until_done) { + rules.execute_until_done(s, opts.until_done_iterations); + } else { + rules.execute_once(s); + } + timer.count_sentence(*s); + if (progress) { + timer.check_slice(); + } + if (opts.first) break; + } + writer->write_chunk(*c); if (opts.first) break; } + if (progress) { + timer.stats(); + } } @@ -106,6 +126,8 @@ int main(int argc, char** argv) "Files to load, looking at the extension to determine type\n") ("output-format,o", value(&output_format)->default_value("xces"), writers_help.c_str()) + ("progress,p", value(&progress)->zero_tokens(), + "Show progress info") ("quiet,q", value(&quiet)->zero_tokens(), "Suppress messages\n") ("until-done,u", value(&opts.until_done)->zero_tokens(), @@ -161,8 +183,11 @@ int main(int argc, char** argv) } } if (!rules.empty()) { + Corpus2::TokenTimer& timer = Corpus2::global_timer(); + timer.register_signal_handler(); boost::shared_ptr<Corpus2::TokenWriter> writer; - writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset)); foreach (const std::string& f, corpora_files) { + writer.reset(Corpus2::TokenWriter::create(output_format, std::cout, tagset)); + foreach (const std::string& f, corpora_files) { std::ifstream ifs(f.c_str()); if (ifs.good()) { do_stream(writer, tagset, rules, ifs, opts);