From c79eaf5349cacfffccf0ab4adaee2327add847b7 Mon Sep 17 00:00:00 2001
From: ilor <kailoran@gmail.com>
Date: Sun, 17 Apr 2011 17:21:07 +0200
Subject: [PATCH] wccl-features

---
 CMakeLists.txt               |   1 +
 wccl-features/CMakeLists.txt |  33 +++
 wccl-features/main.cpp       | 440 +++++++++++++++++++++++++++++++++++
 3 files changed, 474 insertions(+)
 create mode 100644 wccl-features/CMakeLists.txt
 create mode 100644 wccl-features/main.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27b8ca0..51abf19 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,4 +61,5 @@ add_subdirectory(libwccl)
 add_subdirectory(wcclparser)
 add_subdirectory(wcclrun)
 add_subdirectory(wcclrules)
+add_subdirectory(wccl-features)
 add_subdirectory(tests)
diff --git a/wccl-features/CMakeLists.txt b/wccl-features/CMakeLists.txt
new file mode 100644
index 0000000..0cd2499
--- /dev/null
+++ b/wccl-features/CMakeLists.txt
@@ -0,0 +1,33 @@
+PROJECT( wccl-features )
+
+find_package(Libedit)
+if (Libedit_FOUND)
+    message(STATUS "Building with libedit")
+    add_definitions( -DHAVE_LIBEDIT )
+    set(LIBS ${LIBS} ${Libedit_LIBRARIES})
+endif (Libedit_FOUND)
+
+find_package(LibXML++ REQUIRED)
+include_directories(${LibXML++_INCLUDE_DIRS})
+link_directories(${LibXML++_LIBRARY_DIRS})
+set(LIBS ${LIBS} ${LibXML++_LIBRARIES})
+
+include_directories( ${CMAKE_SOURCE_DIR} )
+
+# link_directories() only affects targets created after it is called,
+# so the Boost directories must be set up before add_executable().
+include_directories(${Boost_INCLUDE_DIR})
+link_directories(${Boost_LIBRARY_DIRS})
+
+add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/")
+
+add_executable(wccl-features
+    main.cpp
+)
+target_link_libraries (wccl-features wccl ${Boost_LIBRARIES} antlr ${LIBS})
+
+if(UNIX)
+    install(TARGETS wccl-features
+        RUNTIME DESTINATION bin
+    )
+endif(UNIX)
diff --git a/wccl-features/main.cpp b/wccl-features/main.cpp
new file mode 100644
index 0000000..eae2464
--- /dev/null
+++ b/wccl-features/main.cpp
@@ -0,0 +1,440 @@
+#include <cassert>
+#include <cstdlib>
+#include <cstdio>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <libwccl/values/strset.h>
+#include <libwccl/parser/Parser.h>
+#include <libcorpus2/tagsetmanager.h>
+
+#include <boost/bind.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/make_shared.hpp>
+#include <boost/program_options.hpp>
+#include <boost/filesystem.hpp>
+#include <libcorpus2/io/xcesreader.h>
+#include <boost/lexical_cast.hpp>
+#include <boost/regex.hpp>
+
+#include <antlr/NoViableAltException.hpp>
+#include <antlr/MismatchedTokenException.hpp>
+
+// Command-line switches filled in by main(). quiet, tabs, output_variables,
+// global_numbering and in_sentence_numbering are parsed but not consulted
+// anywhere else in this file.
+namespace {
+    bool quiet = false;
+    bool tabs = false;
+    bool output_orths = true;
+    bool output_variables = false;
+    bool global_numbering = false;
+    bool output_header = true;
+    bool in_sentence_numbering = true;
+}
+
+/// Remembers a stream's format flags on construction and restores them on
+/// destruction.
+class streamsave
+{
+public:
+    explicit streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {}
+    ~streamsave() { os_.flags(flags_); }
+private:
+    std::ostream& os_;
+    std::ios_base::fmtflags flags_;
+};
+
+/**
+ * Holds named WCCL operators and evaluates them at every token of a corpus,
+ * printing the results to standard output under an @RELATION / @ATTRIBUTE /
+ * @DATA header.
+ */
+class Runner
+{
+public:
+    explicit Runner(const Corpus2::Tagset& tagset)
+        : tagset_(tagset), parser_(tagset_), token_idx(0),
+          header_printed_(false)
+    {
+    }
+
+    /// Loads operator definitions from a file, one per line. Lines shorter
+    /// than three characters or starting with '#' are skipped.
+    /// @return the number of operators loaded
+    /// @throws Wccl::FileNotFound if the file cannot be opened
+    int load_more_operators(const std::string &filename);
+
+    /// Parses a single operator definition line.
+    /// @return the number of operators loaded; 0 if the line did not match
+    ///         or parsing failed before anything was loaded
+    int load_operator_string(const std::string &line);
+
+    void print_header_head();
+    void print_header_body(const std::string &attribute_prefix);
+    void print_header_foot();
+
+    /// Prints one comma-separated line per feature vector.
+    void print_data(const std::vector< std::vector<std::string> >& data);
+
+    /// Applies every loaded operator at each position of the sentence.
+    /// @return one feature vector per position
+    std::vector< std::vector<std::string> > do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence);
+
+    /// Reads XCES sentences from the stream and prints their features.
+    /// The header is written at most once per Runner, and only when the
+    /// output_header switch is set.
+    /// @param first stop after the first sentence
+    void do_stream(std::istream& is, bool first);
+
+    /// @return true when no operator of any kind has been loaded
+    bool empty() const {
+        return bool_ops_.empty() && str_ops_.empty() && tset_ops_.empty();
+    }
+
+private:
+    const Corpus2::Tagset& tagset_;
+    Wccl::Parser parser_;
+
+    typedef std::map<
+        std::string,
+        boost::shared_ptr<Wccl::Operator<Wccl::Bool> >
+    > bool_ops_map_t;
+    bool_ops_map_t bool_ops_;
+    typedef std::map<
+        std::string,
+        boost::shared_ptr<Wccl::Operator<Wccl::StrSet> >
+    > str_ops_map_t;
+    str_ops_map_t str_ops_;
+
+    /// Maps a name to (one tag per mask symbol, the operator itself).
+    typedef std::map<
+        std::string,
+        std::pair<std::set<Corpus2::Tag>, boost::shared_ptr<Wccl::Operator<Wccl::TSet> > >
+    > tset_ops_map_t;
+    tset_ops_map_t tset_ops_;
+
+    int token_idx;
+    /// Set once the header has been written so later streams skip it.
+    bool header_printed_;
+};
+
+int Runner::load_more_operators(const std::string& filename)
+{
+    int ops_parsed = 0;
+
+    std::ifstream is(filename.c_str());
+    if (!is.good()) {
+        throw Wccl::FileNotFound(filename, "", __FUNCTION__);
+    }
+    std::string line;
+    int line_no = 0;
+    while (std::getline(is, line)) {
+        ++line_no;
+        if (line.size() < 3) continue;
+        if (line[0] == '#') continue;
+        int loaded = load_operator_string(line);
+        if (loaded > 0) {
+            ops_parsed += loaded;
+        } else {
+            std::cerr << "Line " << line_no << " did not match: " << line << "\n";
+        }
+    }
+    return ops_parsed;
+}
+
+int Runner::load_operator_string(const std::string &line)
+{
+    int ops_loaded = 0;
+    // Groups: 1 = kind, 2 = MASK symbols, 3 = name, 4 and 5 = range bounds,
+    // 6 = operator text. Built once since the pattern never changes.
+    static const boost::regex operator_re(
+        "(STRING|BOOL|MASK\\h([a-z@,]+))\\h+"
+        "(?:name:([a-zA-Z0-9_-]+)\\h)?"
+        "(?:range:([0-9-]+):([0-9-]+)\\h)?"
+        "(.*)");
+    boost::smatch what;
+    if (!boost::regex_match(line, what, operator_re, boost::match_extra)) {
+        return ops_loaded;
+    }
+    try {
+        // Without an explicit name the operator text doubles as the name.
+        const std::string& orig_name = what[3].matched ? what[3] : what[6];
+        const std::string& orig_op_string = what[6];
+        std::vector<std::string> op_strings;
+        std::vector<std::string> names;
+        if (what[4].matched) {
+            // One operator per position, with _R_ replaced by the position.
+            // The pattern also admits bounds such as "1-2"; lexical_cast
+            // throws on those, which is handled below.
+            int rfrom = boost::lexical_cast<int>(what[4]);
+            int rto = boost::lexical_cast<int>(what[5]);
+            for (int i = rfrom; i <= rto; ++i) {
+                std::string pos = boost::lexical_cast<std::string>(i);
+                op_strings.push_back(boost::algorithm::replace_all_copy(
+                    orig_op_string, "_R_", pos));
+                names.push_back(orig_name + pos);
+            }
+        } else {
+            op_strings.push_back(orig_op_string);
+            names.push_back(orig_name);
+        }
+        for (size_t opi = 0; opi < op_strings.size(); ++opi) {
+            const std::string& name = names[opi];
+            const std::string& op_string = op_strings[opi];
+            if (what[1] == "STRING") {
+                str_ops_.insert(std::make_pair(name,
+                    parser_.parseStringOperator(op_string)));
+                ++ops_loaded;
+            } else if (what[1] == "BOOL") {
+                bool_ops_.insert(std::make_pair(name,
+                    parser_.parseBoolOperator(op_string)));
+                ++ops_loaded;
+            } else {
+                // MASK: keep one tag per symbol of the mask.
+                Corpus2::Tag tag = tagset_.parse_symbol_string(what[2]);
+                std::vector<std::string> sym = tagset_.tag_to_symbol_string_vector(tag, false);
+                std::set<Corpus2::Tag> t;
+                foreach (const std::string& s, sym) {
+                    t.insert(tagset_.parse_symbol(s));
+                }
+                tset_ops_.insert(std::make_pair(name, std::make_pair(t,
+                    parser_.parseSymSetOperator(op_string))));
+                ++ops_loaded;
+            }
+        }
+    } catch (PwrNlp::PwrNlpError& e) {
+        std::cerr << e.scope() << " error: " << e.info() << std::endl;
+    } catch (const boost::bad_lexical_cast& e) {
+        std::cerr << "Invalid range: " << e.what() << std::endl;
+    } catch (const antlr::ANTLRException& e) {
+        std::cerr << "Parse error: " << e.toString() << std::endl;
+    }
+    return ops_loaded;
+}
+
+void Runner::print_header_head()
+{
+    std::cout << "% Generated by wccl-features\n";
+    std::cout << "@RELATION wccl\n";
+    std::cout << "\n";
+}
+
+void Runner::print_header_body(const std::string& attribute_prefix)
+{
+    // Same order as the columns built by do_sentence(): strings, booleans,
+    // then one column per mask tag.
+    foreach (const str_ops_map_t::value_type& v, str_ops_) {
+        std::cout << "@ATTRIBUTE "
+            << attribute_prefix << v.first << " string\n";
+    }
+    foreach (const bool_ops_map_t::value_type& v, bool_ops_) {
+        std::cout << "@ATTRIBUTE "
+            << attribute_prefix << v.first << " class {0,1}\n";
+    }
+    foreach (const tset_ops_map_t::value_type& v, tset_ops_) {
+        foreach (const Corpus2::Tag& tag, v.second.first) {
+            std::cout << "@ATTRIBUTE "
+                << attribute_prefix << v.first << "_"
+                << tagset_.tag_to_symbol_string(tag) << " class {0,1}\n";
+        }
+    }
+}
+
+void Runner::print_header_foot()
+{
+    std::cout << "\n@DATA\n";
+}
+
+void Runner::print_data(const std::vector<std::vector<std::string> > &data)
+{
+    foreach (const std::vector<std::string>& feats, data) {
+        std::cout << boost::algorithm::join(feats, ",") << "\n";
+    }
+}
+
+std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
+{
+    Wccl::SentenceContext sc(sentence);
+
+    std::vector< std::vector<std::string> > sfeats;
+    while (sc.is_current_inside()) {
+        sfeats.resize(sfeats.size() + 1);
+        std::vector<std::string>& feats = sfeats.back();
+        // Entries are taken by reference: a by-value loop variable would
+        // copy the name, the shared_ptr and the mask tag set at every token.
+        foreach (const str_ops_map_t::value_type& v, str_ops_) {
+            boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
+            assert(s);
+            if (s->contents().empty()) {
+                feats.push_back("\"\"");
+            } else {
+                // Only the first element of the set is output.
+                feats.push_back("\"" + PwrNlp::to_utf8(*s->contents().begin()) + "\"");
+            }
+        }
+        foreach (const bool_ops_map_t::value_type& v, bool_ops_) {
+            boost::shared_ptr<const Wccl::Bool> b = v.second->apply(sc);
+            assert(b);
+            if (*b) {
+                feats.push_back("1");
+            } else {
+                feats.push_back("0");
+            }
+        }
+        foreach (const tset_ops_map_t::value_type& v, tset_ops_) {
+            boost::shared_ptr<const Wccl::TSet> t = v.second.second->apply(sc);
+            assert(t);
+            foreach (const Corpus2::Tag& tag, v.second.first) {
+                if (!tag.get_masked(t->contents()).is_null()) {
+                    feats.push_back("1");
+                } else {
+                    feats.push_back("0");
+                }
+            }
+        }
+        sc.advance();
+    }
+    return sfeats;
+}
+
+
+void Runner::do_stream(std::istream& is, bool first)
+{
+    Corpus2::XcesReader xr(tagset_, is);
+    Corpus2::Sentence::Ptr s;
+    // Several inputs share one output, so the header goes out only once.
+    if (output_header && !header_printed_) {
+        print_header_head();
+        print_header_body("");
+        print_header_foot();
+        header_printed_ = true;
+    }
+    while ((s = xr.get_next_sentence())) {
+        print_data(do_sentence(s));
+        std::cout << "\n";
+        if (first) break;
+    }
+}
+
+//void Runner::do_files(std::istream& is, bool first)
+
+
+int main(int argc, char** argv)
+{
+    std::string tagset_load = "kipi";
+    bool first = false;
+    std::vector<std::string> corpora_files, files, operator_strings;
+    bool corpus_stdin = false;
+    using boost::program_options::value;
+
+    boost::program_options::options_description desc("Allowed options");
+    desc.add_options()
+            ("tagset,t", value(&tagset_load),
+             "Tagset to use")
+            ("corpus,c", value(&corpora_files),
+             "Corpus file to load (XCES)")
+            ("ccl-operator,C", value(&operator_strings),
+             "CCL operator file or string")
+            ("files,f", value(&files),
+             "Files to load, looking at the extension to determine type")
+            ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(),
+             "Read corpus from stdin")
+            ("quiet,q", value(&quiet)->zero_tokens(),
+             "Suppress messages")
+            ("first-sentence-only,1", value(&first)->zero_tokens(),
+             "Only process first sentence")
+            ("tabs", value(&tabs)->zero_tokens(),
+             "Output a tab-separated file")
+            ("local-counts,l", value(&in_sentence_numbering),
+             "Output in-sentence token counts")
+            ("global-counts,g", value(&global_numbering),
+             "Output global counts")
+            ("output-orths,O", value(&output_orths),
+             "Output token orths")
+            ("output-variables,V", value(&output_variables),
+             "Output operator variables")
+            ("output-header,H", value(&output_header),
+             "Output table header")
+            ("help,h", "Show help")
+            ;
+    boost::program_options::variables_map vm;
+    boost::program_options::positional_options_description p;
+    p.add("files", -1);
+
+    try {
+        boost::program_options::store(
+            boost::program_options::command_line_parser(argc, argv)
+            .options(desc).positional(p).run(), vm);
+        // notify() can throw too, so it belongs inside the handler.
+        boost::program_options::notify(vm);
+    } catch (boost::program_options::error& e) {
+        std::cerr << e.what() << std::endl;
+        return 2;
+    }
+
+    if (vm.count("help")) {
+        std::cerr << "Usage " << argv[0] << " [OPTIONS] FILES\n"
+            << "Files ending with .xml are treated as corpora, otherwise \n"
+            << "as CCL files. Use - to read corpus from stdin (as with -I)\n"
+            << "Files not ending with an extension are treated as raw operator strings\n";
+        std::cout << desc << "\n";
+        return 1;
+    }
+
+    foreach (const std::string& f, files) {
+        if (f == "-") {
+            corpus_stdin = true;
+        } else if (boost::algorithm::ends_with(f, ".xml")) {
+            corpora_files.push_back(f);
+        } else {
+            operator_strings.push_back(f);
+        }
+    }
+    if ((corpora_files.empty() && !corpus_stdin) || (operator_strings.empty() && !output_orths)) {
+        std::cerr << "Nothing to do, try " << argv[0] << " -h\n";
+        return 2;
+    }
+    try {
+        const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load);
+        Runner runner(tagset);
+        foreach (const std::string& f, operator_strings) {
+            if (boost::algorithm::ends_with(f, ".ccl")) {
+                if (!runner.load_more_operators(f)) {
+                    std::cerr << "Warning: error while parsing " << f << "\n";
+                }
+            } else if (!runner.load_operator_string(f)) {
+                // Anything that is not a .ccl file is a raw operator string,
+                // as the option help promises.
+                std::cerr << "Warning: could not load operator: " << f << "\n";
+            }
+        }
+        if (runner.empty()) {
+            // Nothing can be printed without operators; say so and fail.
+            std::cerr << "No operators loaded, nothing to output\n";
+            return 2;
+        }
+        foreach (const std::string& f, corpora_files) {
+            std::ifstream ifs(f.c_str());
+            if (ifs.good()) {
+                runner.do_stream(ifs, first);
+            } else {
+                std::cerr << "Error reading corpus from " << f << "\n";
+            }
+        }
+        if (corpus_stdin) {
+            runner.do_stream(std::cin, first);
+        }
+    } catch (PwrNlp::PwrNlpError& e) {
+        std::cerr << e.info() << std::endl;
+        return 2;
+    }
+
+    return 0;
+}
-- 
GitLab