diff --git a/CMakeLists.txt b/CMakeLists.txt index 44d45bd8211488d0f5624523d437532beb6d0cf7..3d1aa505da8a55e96fe607720b9085cc5b0518cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,4 +59,5 @@ endif(MSVC OR BORLAND) add_subdirectory(libwccl) add_subdirectory(wcclparser) +add_subdirectory(wcclrun) add_subdirectory(tests) diff --git a/wcclrun/CMakeLists.txt b/wcclrun/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d321bb66f148ad024455531421dca46b92869c7 --- /dev/null +++ b/wcclrun/CMakeLists.txt @@ -0,0 +1,31 @@ +PROJECT( wcclrun ) + +find_package(Libedit) +if (Libedit_FOUND) + message(STATUS "Building with libedit") + add_definitions( -DHAVE_LIBEDIT ) + set(LIBS ${LIBS} ${Libedit_LIBRARIES}) +endif (Libedit_FOUND) + +find_package(LibXML++ REQUIRED) +include_directories(${LibXML++_INCLUDE_DIRS}) +link_directories(${LibXML++_LIBRARY_DIRS}) +set(LIBS ${LIBS} ${LibXML++_LIBRARIES}) + +include_directories( ${CMAKE_SOURCE_DIR} ) + +add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/") + +add_executable(wcclrun + main.cpp +) +target_link_libraries (wcclrun wccl ${Boost_LIBRARIES} antlr ${LIBS}) + +include_directories(${Boost_INCLUDE_DIR}) +link_directories(${Boost_LIBRARY_DIRS}) + +if(UNIX) + install(TARGETS wcclrun + RUNTIME DESTINATION bin + ) +endif(UNIX) diff --git a/wcclrun/main.cpp b/wcclrun/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9181232a72fd4b21919da35ef2c2dd6233a98d9 --- /dev/null +++ b/wcclrun/main.cpp @@ -0,0 +1,203 @@ +#include <cstdlib> +#include <fstream> +#include <iomanip> + + +#include <libwccl/values/strset.h> +#include <libwccl/parser/Parser.h> +#include <libcorpus2/tagsetmanager.h> + +#include <boost/bind.hpp> +#include <boost/algorithm/string.hpp> +#include <boost/make_shared.hpp> +#include <boost/program_options.hpp> +#include <libcorpus2/io/xcesreader.h> + +#include <antlr/NoViableAltException.hpp> +#include <antlr/MismatchedTokenException.hpp> + + +void load_more_operators(const std::string& filename, Wccl::Parser& parser, + std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) +{ + + boost::shared_ptr<const Wccl::Value> retVal; + boost::shared_ptr<Wccl::FunctionalOperator> retOp; + try { + std::ifstream is(filename.c_str()); + if (!is.good()) { + throw Wccl::FileNotFound(filename, "", __FUNCTION__); + } + + retOp = parser.parseAnyOperator(is); + if (retOp) { + ops.push_back(retOp); + } else { + std::cerr << "Problem while parsing -- " + << "parser returned NULL!" << std::endl; + } + } catch (antlr::MismatchedTokenException &e) { + std::cerr << e.getFileLineColumnString() + << " " << e.getMessage() << std::endl; + } catch(antlr::NoViableAltException &e) { + std::cerr << e.getFileLineColumnString() + << " " << e.getMessage() << std::endl; + } catch (Wccl::InvalidVariableName &e) { + std::cerr << "Wccl::InvalidVariableName " << e.info() << std::endl; + } catch (Wccl::VariableTypeMismatch &e) { + std::cerr << "Wccl::VariableTypeMismatch " << e.info() << std::endl; + } catch (Wccl::WcclError& e) { + std::cerr << "Wccl::WcclError:" << e.info() << std::endl; + } catch (PwrNlp::PwrNlpError& e) { + std::cerr << "PwrNlp::PwrNlpError " << e.info() << std::endl; + } catch (antlr::ANTLRException& e) { + std::cerr << "Antlr error " << e.getMessage() << std::endl; + } +} + +class streamsave +{ +public: + streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {} + ~streamsave() { os_.flags(flags_); } +private: + std::ostream& os_; + std::ios_base::fmtflags flags_; +}; + +void do_head(const Corpus2::Tagset& tagset, + const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) +{ + streamsave sv(std::cout); + std::cout << "## "; + std::cout << std::setw(20) << "orth"; + int i = 0; + foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { + ++i; + std::cout << " "; + std::cout.setf(std::ios::right); + std::cout << std::setw(15) << "operator "; + std::cout.setf(std::ios::left); + std::cout << std::setw(5) << i; + } + std::cout << "\n"; +} + +void do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence, + const Corpus2::Tagset& tagset, + const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops) +{ + Wccl::SentenceContext sc(sentence); + std::vector< std::vector< UnicodeString > > outputs(sentence->size()); + std::vector<int> lengths(ops.size(), 0); + streamsave sv(std::cout); + for (size_t i = 0; i < sentence->size(); ++i) { + sc.set_position(i); + UnicodeString orth = sentence->tokens()[i]->orth(); + outputs[i].push_back(orth); + lengths[0] = std::max(lengths[0], orth.length()); + int li = 1; + foreach (const boost::shared_ptr<Wccl::FunctionalOperator>& o, ops) { + boost::shared_ptr<const Wccl::Value> v = o->base_apply(sc); + UnicodeString vstr = v->to_string_u(tagset); + lengths[li] = std::max(lengths[li], vstr.length()); + ++li; + outputs[i].push_back(vstr); + } + } + for (size_t i = 0; i < sentence->size(); ++i) { + std::cout << std::setw(2) << (i + 1) << " "; + for (size_t oi = 0; oi < outputs[i].size(); ++oi) { + UnicodeString u = outputs[i][oi]; + u.padTrailing(lengths[oi]); + std::cout << PwrNlp::to_utf8(u) << " "; + } + std::cout << "\n"; + } +} + +void do_file(const std::string& filename, const Corpus2::Tagset& tagset, + const std::vector< boost::shared_ptr<Wccl::FunctionalOperator> >& ops, + bool first) +{ + Corpus2::XcesReader xr(tagset, filename); + Corpus2::Sentence::Ptr s; + //do_head(tagset, ops); + while ((s = xr.get_next_sentence())) { + do_sentence(s, tagset, ops); + std::cout << "\n"; + if (first) break; + } +} + +int main(int argc, char** argv) +{ + std::string tagset_load = "kipi"; + bool first = false; + std::vector<std::string> corpora_files, ccl_files, files; + bool quiet = false; + bool dump_variables = false; + using boost::program_options::value; + + boost::program_options::options_description desc("Allowed options"); + desc.add_options() + ("tagset,t", value(&tagset_load), + "Tagset to use\n") + ("corpus,c", value(&corpora_files), + "Corpus file to load (XCES)\n") + ("ccl-file,C", value(&ccl_files), + "CCL query file\n") + ("files,f", value(&files), + "Files to load, look at extecion to determine type\n") + ("quiet,q", value(&quiet)->zero_tokens(), + "Suppress messages\n") + ("first-sentence-only,1", value(&first)->zero_tokens(), + "Only process first sentence\n") + ("help,h", "Show help") + ; + boost::program_options::variables_map vm; + boost::program_options::positional_options_description p; + p.add("files", -1); + + try { + boost::program_options::store( + boost::program_options::command_line_parser(argc, argv) + .options(desc).positional(p).run(), vm); + } catch (boost::program_options::error& e) { + std::cerr << e.what() << std::endl; + return 2; + } + boost::program_options::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return 1; + } + + foreach (const std::string& f, files) { + if (boost::algorithm::ends_with(f, ".xml")) { + corpora_files.push_back(f); + } else { + ccl_files.push_back(f); + } + } + + try { + const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); + std::vector< boost::shared_ptr<Wccl::FunctionalOperator> > operators; + Wccl::Parser parser(tagset); + foreach (const std::string& f, ccl_files) { + load_more_operators(f, parser, operators); + } + if (!operators.empty()) { + foreach (const std::string& f, corpora_files) { + do_file(f, tagset, operators, first); + } + } + } catch (PwrNlp::PwrNlpError& e) { + std::cerr << e.info() << std::endl; + return 2; + } + + return 0; +}