From f91daedfd63c61d37ff059a65443f3bc8280ba92 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 5 May 2011 15:08:06 +0200 Subject: [PATCH] lexicon path searching in wcclfile parsing, use in wccl-run (-P) --- libwccl/lexicon/lexiconparser.cpp | 8 +++++--- libwccl/lexicon/lexiconparser.h | 4 +++- libwccl/parser/Parser.cpp | 9 +++++---- libwccl/parser/Parser.h | 6 ++++-- libwccl/parser/grammar.g | 5 +++-- libwccl/wcclfile.h | 15 ++++++++++++--- wccl-apps/wccl-run.cpp | 17 +++++++++++++++-- 7 files changed, 47 insertions(+), 17 deletions(-) diff --git a/libwccl/lexicon/lexiconparser.cpp b/libwccl/lexicon/lexiconparser.cpp index 77f7561..acb54cc 100644 --- a/libwccl/lexicon/lexiconparser.cpp +++ b/libwccl/lexicon/lexiconparser.cpp @@ -10,12 +10,14 @@ namespace Wccl { boost::shared_ptr<Lexicon> LexiconParser::parse_lexicon( + const PwrNlp::PathSearcherBase& search_path, const std::string& lexicon_name, - const std::string& path) + const std::string& filename) { - std::ifstream is(path.c_str()); + std::ifstream is; + search_path.open_stream_or_throw(filename, is, "lexicon"); if (!is.good()) { - throw Wccl::FileNotFound(path, "", __FUNCTION__); + throw Wccl::FileNotFound(filename, "", __FUNCTION__); } ANTLRLexiconLexer lexer(is); diff --git a/libwccl/lexicon/lexiconparser.h b/libwccl/lexicon/lexiconparser.h index 521d19a..51cf73a 100644 --- a/libwccl/lexicon/lexiconparser.h +++ b/libwccl/lexicon/lexiconparser.h @@ -2,6 +2,7 @@ #define LIBWCCL_LEXICON_LEXICONPARSER_H #include <boost/shared_ptr.hpp> +#include <libpwrutils/pathsearch.h> namespace Wccl { @@ -11,8 +12,9 @@ class LexiconParser { public: static boost::shared_ptr<Lexicon> parse_lexicon( + const PwrNlp::PathSearcherBase& search_path, const std::string& lexicon_name, - const std::string& path); + const std::string& filename); }; } /* end ns Wccl */ diff --git a/libwccl/parser/Parser.cpp b/libwccl/parser/Parser.cpp index 7a5703c..8cf342d 100644 --- a/libwccl/parser/Parser.cpp +++ b/libwccl/parser/Parser.cpp @@ -563,13 +563,13 @@ boost::shared_ptr<MatchRule> Parser::parseMatchRule(std::istream& istr) const * @return the parsed file via a shared pointer */ boost::shared_ptr<WcclFile> Parser::parseWcclFile( - const std::string& str) const + const std::string& str, const std::string& search_path /*= "."*/) const { std::stringstream ss (std::stringstream::in | std::stringstream::out); ss << str; try { - return this->parseWcclFile(ss); + return this->parseWcclFile(ss, search_path); } catch (ParserException&) { throw; @@ -581,14 +581,15 @@ boost::shared_ptr<WcclFile> Parser::parseWcclFile( * @arg istr input stream with writed rule * @return the parsed file via a shared pointer */ -boost::shared_ptr<WcclFile> Parser::parseWcclFile(std::istream& istr) const +boost::shared_ptr<WcclFile> Parser::parseWcclFile(std::istream& istr, + const std::string& search_path /*= "."*/) const { ANTLRLexer lexer(istr); ANTLRParser parser(lexer); boost::shared_ptr<WcclFile> res; try { - res = parser.parse_wccl_file(tagset_); + res = parser.parse_wccl_file(tagset_, search_path); } catch (antlr::MismatchedTokenException &e) { throw ParserException( e.getFileLineColumnString() + " " + e.getMessage() diff --git a/libwccl/parser/Parser.h b/libwccl/parser/Parser.h index 6873365..3d642ea 100644 --- a/libwccl/parser/Parser.h +++ b/libwccl/parser/Parser.h @@ -94,9 +94,11 @@ public: // --------------------------------------------------------------------------- // WCCL file parsing boost::shared_ptr<WcclFile> - parseWcclFile(const std::string& file_contents_string) const; + parseWcclFile(const std::string& file_contents_string, + const std::string& search_path = ".") const; boost::shared_ptr<WcclFile> - parseWcclFile(std::istream& is) const; + parseWcclFile(std::istream& is, + const std::string& search_path = ".") const; // --------------------------------------------------------------------------- const Corpus2::Tagset& tagset() const { diff --git a/libwccl/parser/grammar.g b/libwccl/parser/grammar.g index 5071cda..074020e 100644 --- a/libwccl/parser/grammar.g +++ b/libwccl/parser/grammar.g @@ -291,10 +291,10 @@ parse_match_rule // ---------------------------------------------------------------------------- // Rule for parsing wccl files parse_wccl_file - [const Corpus2::Tagset& tagset] + [const Corpus2::Tagset& tagset, const std::string search_path] returns [boost::shared_ptr<WcclFile> wccl_file] { - wccl_file = boost::make_shared<WcclFile>(tagset); + wccl_file = boost::make_shared<WcclFile>(tagset, search_path); boost::shared_ptr<TagRuleSequence> rule_seq; } : (imports_section [*wccl_file])? @@ -1894,6 +1894,7 @@ import [WcclFile& wccl_file] : "import" LPAREN file_path : STRING COMMA lexicon_name : STRING RPAREN { wccl_file.import_lexicon( LexiconParser::parse_lexicon( + wccl_file.path(), token_ref_to_std_string(lexicon_name), token_ref_to_std_string(file_path))); } diff --git a/libwccl/wcclfile.h b/libwccl/wcclfile.h index c77d5be..2919b49 100644 --- a/libwccl/wcclfile.h +++ b/libwccl/wcclfile.h @@ -9,6 +9,8 @@ #include <libwccl/wcclfileopsections.h> #include <libwccl/ops/tagrulesequence.h> #include <libwccl/lexicon/lexicons.h> +#include <libwccl/exception.h> +#include <libpwrutils/pathsearch.h> namespace Wccl { @@ -21,7 +23,7 @@ class WcclFile WcclFileOpSections<OpSequence<Match> > { public: - explicit WcclFile(const Corpus2::Tagset& tagset); + WcclFile(const Corpus2::Tagset& tagset, const std::string& search_path); const std::vector<boost::shared_ptr<UntypedOpSequence> >& untyped_sections(); template<class T> @@ -102,12 +104,16 @@ public: std::string to_string() const; const Corpus2::Tagset& tagset() const; + const PwrNlp::PathSearcher<Wccl::FileNotFound> path() const { return path_; } + PwrNlp::PathSearcher<Wccl::FileNotFound> path() { return path_; } + private: std::ostream& write_to(std::ostream& ostream) const; std::vector<boost::shared_ptr<FunctionalOpSequence> > all_sections_; boost::shared_ptr<TagRuleSequence> tag_rules_; boost::shared_ptr<Lexicons> lexicons_; const Corpus2::Tagset& tagset_; + PwrNlp::PathSearcher<Wccl::FileNotFound> path_; }; } /* end ns Wccl */ @@ -118,9 +124,12 @@ private: namespace Wccl { inline -WcclFile::WcclFile(const Corpus2::Tagset& tagset) - : lexicons_(boost::make_shared<Lexicons>()), tagset_(tagset) +WcclFile::WcclFile(const Corpus2::Tagset& tagset, const std::string& search_path) + : lexicons_(boost::make_shared<Lexicons>()), tagset_(tagset), + path_(":") { + path_.set_search_path(search_path); + path_.set_verbose(true); } inline diff --git a/wccl-apps/wccl-run.cpp b/wccl-apps/wccl-run.cpp index 4b23e30..2a20c37 100644 --- a/wccl-apps/wccl-run.cpp +++ b/wccl-apps/wccl-run.cpp @@ -44,7 +44,8 @@ class Runner { public: Runner(const Corpus2::Tagset& tagset) - : tagset_(tagset), parser_(tagset_), token_idx(0), progress_(false) + : tagset_(tagset), parser_(tagset_), token_idx(0), progress_(false), + search_path_(".") { } @@ -69,6 +70,10 @@ public: void output_tabular(const std::vector< std::vector< UnicodeString > > outputs); + void set_search_path(const std::string& path) { + search_path_ = path; + } + private: const Corpus2::Tagset& tagset_; Wccl::Parser parser_; @@ -76,6 +81,7 @@ private: std::vector< std::string > op_names_; int token_idx; bool progress_; + std::string search_path_; }; bool Runner::load_more_operators(const std::string& filename) @@ -87,7 +93,7 @@ bool Runner::load_more_operators(const std::string& filename) throw Wccl::FileNotFound(filename, "", __FUNCTION__); } - retOp = parser_.parseWcclFile(is); + retOp = parser_.parseWcclFile(is, search_path_); if (retOp) { boost::filesystem::path p(filename); std::string prefix = p.stem() + ":"; @@ -144,6 +150,7 @@ bool Runner::load_operator_string(const std::string& op_string) void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence) { + std::cerr << "dos"; Wccl::SentenceContext sc(sentence); std::vector< std::vector< UnicodeString > > outputs; @@ -249,6 +256,7 @@ int main(int argc, char** argv) std::string tagset_load = "kipi"; bool first = false, progress = false; std::string input_format; + std::string search_path; std::vector<std::string> corpora_files, files, operator_strings; bool corpus_stdin = false; using boost::program_options::value; @@ -265,6 +273,8 @@ int main(int argc, char** argv) "CCL operator file or string") ("files,f", value(&files), "Files to load, looking at the extension to determine type") + ("search-path,P", value(&search_path), + "WCCL resources (lexicons) search path") ("corpus-from-stdin,I", value(&corpus_stdin)->zero_tokens(), "Read corpus from stdin") ("quiet,q", value(&quiet)->zero_tokens(), @@ -329,6 +339,9 @@ int main(int argc, char** argv) const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_load); Runner runner(tagset); runner.use_progress(progress); + if (!search_path.empty()) { + runner.set_search_path(search_path); + } foreach (const std::string& f, operator_strings) { if (boost::algorithm::ends_with(f, ".ccl")) { size_t sz = runner.operators().size(); -- GitLab