Skip to content
Snippets Groups Projects
Commit 66d3eb6d authored by Bartosz Broda's avatar Bartosz Broda
Browse files

first working version of mwereader (for sentences only)

parent d35e12c3
No related merge requests found
......@@ -24,7 +24,7 @@ LexicalUnit::LexicalUnit(const std::string &base,
}
bool LexicalUnit::IsHere(const Wccl::SentenceContext &sc,
std::set<size_t> &out_position, int &head_pos) const
std::set<int> &out_position, int &head_pos) const
{
// set variables
for(variables_map::const_iterator ivars = variables_.begin();
......
......@@ -35,7 +35,7 @@ public:
* \returns true if this lexical unit was found here
*/
virtual bool IsHere(const Wccl::SentenceContext& sc,
std::set<size_t> &out_positions, int &head_pos) const;
std::set<int> &out_positions, int &head_pos) const;
const std::string & get_base() const{ return base_;}
const variables_map & get_variables() const{ return variables_;}
......
......@@ -42,6 +42,8 @@ namespace Corpus2 {
return op;
}
//////////////////////////////////////////////////////////////////////
MWEParser::MWEParser(MWEIndex &index)
: BasicSaxParser(), state_(NONE), mwe_index_(index)
{
......@@ -71,7 +73,7 @@ namespace Corpus2 {
+ group_type_);
}
variables_.clear();
}
std::string MWEParser::get_attribute(const AttributeList& attributes,
......
......@@ -30,33 +30,65 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
{
// TODO MWE stuff
Sentence::Ptr pSentence = inner_reader_->get_next_sentence();
Wccl::SentenceContext sc(pSentence);
return process_sentence(sc);
}
Sentence::Ptr MWEReader::process_sentence(Wccl::SentenceContext & sc)
{
for(int i = 0; i < sc.size(); ++i){
sc.set_position(i);
Corpus2::Token *pToken = (*pSentence)[i];
std::cout << pToken->orth_utf8() << " ";
Corpus2::Token *pToken = sc.at(i);
//std::cout << pToken->orth_utf8() << " ";
std::vector<Lexeme>& lexemes = pToken->lexemes();
foreach(const Lexeme& lex, lexemes){
if(lex.is_disamb()){
std::string base = lex.lemma_utf8();
const MWEIndex::luvec& potential = mwe_index_.get_potential_lu(base);
if(potential.size())
std::cout << "# ";
foreach(LexicalUnit::Ptr pLU, potential){
std::set<size_t> positions;
std::set<int> positions;
int head;
bool is_here = pLU->IsHere(sc, positions, head);
if(is_here)
std::cout << "** " << pLU->get_base() << "** ";
if(is_here){
std::string new_orth_utf8;
Corpus2::Sentence::Ptr sent = sc.get_sentence_ptr();
std::vector<Token*> &tokens = sent->tokens();
foreach(const int &pos, positions){
Token* tok = tokens [pos];
new_orth_utf8 += tok->orth_utf8() + " ";
if(pos != head){
delete tok;
tokens[pos] = NULL;
if(pos >= i)
i--;
}
}
Corpus2::Token *tok = (*sent)[head];
tok->set_orth_utf8(new_orth_utf8);
foreach(Lexeme& lex, tok->lexemes())
if(lex.is_disamb())
lex.set_lemma_utf8(pLU->get_base());
std::vector <Token*>::iterator del_iter = tokens.begin();
while (del_iter != tokens.end()) {
if( (*del_iter) == NULL)
del_iter = tokens.erase(del_iter);
else
++del_iter;
}
}
}
}
}
}
std::cout << "ENDL\n";
return pSentence;
return sc.get_sentence_ptr();
}
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
......
......@@ -46,6 +46,9 @@ public:
static bool registered;
protected:
/**
 * Scans the sentence held by \p sc for multi-word expressions known to the
 * MWE index and merges the tokens of each expression found into its head
 * token (joined orth, MWE base as the lemma of disamb lexemes).
 * \param sc context wrapping the sentence to process
 * \returns the sentence pointer obtained from \p sc
 */
Sentence::Ptr process_sentence(Wccl::SentenceContext & sc);
private:
void load_mwes(const std::string& filename);
......
......@@ -11,4 +11,6 @@ int main(int ac, char**av)
MWEParser parser(temp_index);
parser.parse_file(av[1]);
}
......@@ -9,8 +9,8 @@
setvar($Pos2, 1),
inter(class[0],{subst,ger,depr}),
inter(class[1],{subst,ger,depr}),
inter(cas[0], cas[1])
inter(class[1],{subst,ger,depr})//,
//inter(cas[0], cas[1]) - silly restriction, e.g. "Debatowali nad ceną netto" ("They debated the net price")
)
</condition>
<instances>
......@@ -24,6 +24,11 @@
<var name="Subst2">netto</var>
<head>inter(base[0], "waga")</head>
</MWE>
<MWE base="ratyfikacja traktatu">
<var name="Subst1">ratyfikacja</var>
<var name="Subst2">traktat</var>
<head>inter(base[0], "traktat")</head>
</MWE>
</instances>
</mwegroup>
<mwegroup name="AdjSubstFix" type="fix" class="subst">
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment