Skip to content
Snippets Groups Projects
Commit 66d3eb6d authored by Bartosz Broda
Browse files

first working version of mwereader (for sentences only)

parent d35e12c3
Branches
No related tags found
No related merge requests found
...@@ -24,7 +24,7 @@ LexicalUnit::LexicalUnit(const std::string &base, ...@@ -24,7 +24,7 @@ LexicalUnit::LexicalUnit(const std::string &base,
} }
bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc, bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
std::set<size_t> &out_position, int &head_pos) const std::set<int> &out_position, int &head_pos) const
{ {
// set variables // set variables
for(variables_map::const_iterator ivars = variables_.begin(); for(variables_map::const_iterator ivars = variables_.begin();
......
...@@ -35,7 +35,7 @@ public: ...@@ -35,7 +35,7 @@ public:
* \returns true if this lexical unit was found here * \returns true if this lexical unit was found here
*/ */
virtual bool IsHere(const Wccl::SentenceContext& sc, virtual bool IsHere(const Wccl::SentenceContext& sc,
std::set<size_t> &out_positions, int &head_pos) const; std::set<int> &out_positions, int &head_pos) const;
const std::string & get_base() const{ return base_;} const std::string & get_base() const{ return base_;}
const variables_map & get_variables() const{ return variables_;} const variables_map & get_variables() const{ return variables_;}
......
...@@ -42,6 +42,8 @@ namespace Corpus2 { ...@@ -42,6 +42,8 @@ namespace Corpus2 {
return op; return op;
} }
//////////////////////////////////////////////////////////////////////
MWEParser::MWEParser(MWEIndex &index) MWEParser::MWEParser(MWEIndex &index)
: BasicSaxParser(), state_(NONE), mwe_index_(index) : BasicSaxParser(), state_(NONE), mwe_index_(index)
{ {
...@@ -71,7 +73,7 @@ namespace Corpus2 { ...@@ -71,7 +73,7 @@ namespace Corpus2 {
+ group_type_); + group_type_);
} }
variables_.clear();
} }
std::string MWEParser::get_attribute(const AttributeList& attributes, std::string MWEParser::get_attribute(const AttributeList& attributes,
......
...@@ -30,33 +30,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>( ...@@ -30,33 +30,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
{ {
// TODO MWE stuff // TODO MWE stuff
Sentence::Ptr pSentence = inner_reader_->get_next_sentence(); Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
Wccl::SentenceContext sc(pSentence); Wccl::SentenceContext sc(pSentence);
return process_sentence(sc);
}
/**
 * Finds multi-word expressions (MWEs) in the sentence held by \p sc and
 * collapses every match into a single token.
 *
 * For each token, the lemma of every disambiguated lexeme is looked up in
 * the MWE index. The first candidate lexical unit whose IsHere() check
 * succeeds is merged: the orths of all matched tokens are joined with
 * single spaces into the head token, the head's disambiguated lexemes get
 * the unit's base as lemma, and the remaining matched tokens are deleted
 * and removed from the sentence.
 *
 * \param sc context wrapping the sentence to process; the sentence is
 *           modified in place
 * \returns the sentence pointer held by \p sc
 */
Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
{
	// NOTE(review): assumes sc.size() reflects tokens removed from the
	// underlying sentence below -- confirm against Wccl::SentenceContext.
	for(int i = 0; i < sc.size(); ++i){
		sc.set_position(i);
		Corpus2::Token *pToken = sc.at(i);
		std::vector<Lexeme>& lexemes = pToken->lexemes();
		bool merged = false;
		foreach(const Lexeme& lex, lexemes){
			if(!lex.is_disamb())
				continue;
			std::string base = lex.lemma_utf8();
			const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
			foreach(LexicalUnit::Ptr pLU, potential){
				std::set<int> positions;
				int head = -1; // stays invalid unless IsHere() fills it in
				if(!pLU->IsHere(sc, positions, head))
					continue;

				Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();
				std::vector<Token*> &tokens = sent->tokens();
				const int token_count = static_cast<int>(tokens.size());

				// Refuse matches that cannot be applied safely: nothing to
				// merge, or indices pointing outside the sentence.
				// (std::set is ordered, so checking both ends is enough.)
				if(positions.empty()
						|| head < 0 || head >= token_count
						|| *positions.begin() < 0
						|| *positions.rbegin() >= token_count)
					continue;

				std::string new_orth_utf8;
				bool first_part = true;
				int removed_up_to_current = 0;
				foreach(const int &pos, positions){
					Token* part = tokens[pos];
					// Join with single spaces, without a trailing one.
					if(!first_part)
						new_orth_utf8 += " ";
					first_part = false;
					new_orth_utf8 += part->orth_utf8();
					if(pos != head){
						delete part;
						tokens[pos] = NULL;
						// Only removals at or before the cursor shift the
						// tokens that still have to be visited.
						if(pos <= i)
							++removed_up_to_current;
					}
				}

				// Nothing has been erased yet, so 'head' still indexes the
				// head token.
				Corpus2::Token *head_tok = (*sent)[head];
				head_tok->set_orth_utf8(new_orth_utf8);
				foreach(Lexeme& head_lex, head_tok->lexemes())
					if(head_lex.is_disamb())
						head_lex.set_lemma_utf8(pLU->get_base());

				// Compact the token vector in one pass, dropping the slots
				// that were cleared above.
				std::vector<Token*>::size_type kept = 0;
				for(std::vector<Token*>::size_type src = 0; src < tokens.size(); ++src){
					if(tokens[src] != NULL)
						tokens[kept++] = tokens[src];
				}
				tokens.resize(kept);

				// Keep the cursor on the same logical place: after the ++i
				// of the outer loop it points at the first token not yet
				// visited (or at the merged head if that now sits there).
				i -= removed_up_to_current;
				merged = true;
				break;
			}
			// The current token may have been deleted by the merge, so its
			// lexemes must not be touched any more.
			if(merged)
				break;
		}
	}
	return sc.get_sentence_ptr();
}
boost::shared_ptr<Chunk> MWEReader::get_next_chunk() boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
......
...@@ -46,6 +46,9 @@ public: ...@@ -46,6 +46,9 @@ public:
static bool registered; static bool registered;
protected:
/**
 * Looks for multi-word expressions in the sentence held by \p sc and
 * merges the tokens of each match into the match's head token.
 * The sentence is modified in place.
 * \param sc context wrapping the sentence to process
 * \returns the sentence pointer held by \p sc
 */
Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);
private: private:
void load_mwes(const std::string& filename); void load_mwes(const std::string& filename);
......
...@@ -11,4 +11,6 @@ int main(int ac, char**av) ...@@ -11,4 +11,6 @@ int main(int ac, char**av)
MWEParser parser(temp_index); MWEParser parser(temp_index);
parser.parse_file(av[1]); parser.parse_file(av[1]);
} }
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
setvar($Pos2, 1), setvar($Pos2, 1),
inter(class[0],{subst,ger,depr}), inter(class[0],{subst,ger,depr}),
inter(class[1],{subst,ger,depr}), inter(class[1],{subst,ger,depr})//,
inter(cas[0], cas[1]) //inter(cas[0], cas[1]) - głupie ograniczenie, np. Debatowali nad ceną netto
) )
</condition> </condition>
<instances> <instances>
...@@ -24,6 +24,11 @@ ...@@ -24,6 +24,11 @@
<var name="Subst2">netto</var> <var name="Subst2">netto</var>
<head>inter(base[0], "waga")</head> <head>inter(base[0], "waga")</head>
</MWE> </MWE>
<MWE base="ratyfikacja traktatu">
<var name="Subst1">ratyfikacja</var>
<var name="Subst2">traktat</var>
<!-- NOTE(review): head changed from "traktat" to "ratyfikacja". In the base
     form "ratyfikacja traktatu" the second noun is a genitive dependent, so
     the merged token should keep the lexemes of the governing noun, as the
     neighbouring entry does with head "waga". Confirm against the
     dictionary conventions. -->
<head>inter(base[0], "ratyfikacja")</head>
</MWE>
</instances> </instances>
</mwegroup> </mwegroup>
<mwegroup name="AdjSubstFix" type="fix" class="subst"> <mwegroup name="AdjSubstFix" type="fix" class="subst">
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment