Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
1a8eb2ba
Commit
1a8eb2ba
authored
13 years ago
by
Lukasz Bilenkij
Browse files
Options
Downloads
Patches
Plain Diff
Add processing in get_next_token/chunk/sentence methods
parent
9ac9fb0d
Branches
Branches containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
libmwereader/mwereader.cpp
+46
-12
46 additions, 12 deletions
libmwereader/mwereader.cpp
libmwereader/mwereader.h
+3
-0
3 additions, 0 deletions
libmwereader/mwereader.h
libmwereader/tests/mwefunctional.cpp
+74
-0
74 additions, 0 deletions
libmwereader/tests/mwefunctional.cpp
with
123 additions
and
12 deletions
libmwereader/mwereader.cpp
+
46
−
12
View file @
1a8eb2ba
...
...
@@ -21,20 +21,40 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Return the next token of the MWE-processed stream.
 *
 * Tokens are served one by one out of currentSentence; when that sentence is
 * exhausted (or empty) the next sentence is fetched through
 * get_next_sentence(), which runs the MWE processing.
 *
 * @return pointer to the next token, or NULL once no further sentence is
 *         available (repeated calls after the end keep returning NULL).
 *
 * NOTE(review): the returned pointer refers to a token stored inside
 * currentSentence -- confirm whether callers expect to own (and delete) it.
 */
Token* MWEReader::get_next_token()
{
	// Refill until a sentence with an unread token is found. Looping (rather
	// than refilling once) skips empty sentences instead of indexing past
	// their end, and the null check stops cleanly at end of input.
	while (currentSentence && token_index >= currentSentence->tokens().size()) {
		currentSentence = get_next_sentence();
		token_index = 0;
	}
	if (!currentSentence) {
		return NULL;
	}

	// Bind by reference: no need to copy the whole token vector per call.
	const std::vector<Token*>& tokens = currentSentence->tokens();
	return tokens.at(token_index++);
}
/**
 * Read one sentence from the inner reader and run MWE processing on it.
 *
 * The raw sentence is remembered in currentSentence and token_index is reset
 * so that token-wise reading restarts at the beginning of the new sentence.
 *
 * @return the result of process_sentence() for the sentence read, or a null
 *         pointer when the inner reader has no more sentences.
 */
Sentence::Ptr MWEReader::get_next_sentence()
{
	// Exactly one read per call: asking the inner reader twice would silently
	// drop every other sentence.
	currentSentence = inner_reader_->get_next_sentence();
	if (!currentSentence) {
		return Sentence::Ptr();
	}

	token_index = 0;
	Wccl::SentenceContext sc(currentSentence);
	return process_sentence(sc);
}
Sentence
::
Ptr
MWEReader
::
process_sentence
(
Wccl
::
SentenceContext
&
sc
)
...
...
@@ -108,9 +128,21 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
/**
 * Read one chunk from the inner reader and run MWE processing on each of its
 * sentences.
 *
 * Side effects: currentChunk holds the chunk read, currentSentence is set to
 * the chunk's first sentence (left untouched if the chunk has none) and
 * token_index is reset to 0.
 *
 * @return the chunk read, or a null pointer when the inner reader is
 *         exhausted.
 *
 * NOTE(review): the value returned by process_sentence() is discarded here;
 * this assumes it updates the sentence in place -- TODO confirm.
 */
boost::shared_ptr<Chunk> MWEReader::get_next_chunk()
{
	currentChunk = inner_reader_->get_next_chunk();
	if (!currentChunk) {
		return currentChunk;
	}

	typedef std::vector< boost::shared_ptr<Corpus2::Sentence> > SentenceVec;
	// Reference instead of a copy of the chunk's sentence list.
	const SentenceVec& sentences = currentChunk->sentences();

	if (!sentences.empty()) {
		currentSentence = sentences.front();
	}
	for (SentenceVec::const_iterator it = sentences.begin(); it != sentences.end(); ++it) {
		Wccl::SentenceContext sc(*it);
		process_sentence(sc);
	}

	token_index = 0;
	return currentChunk;
}
void
MWEReader
::
set_option
(
const
std
::
string
&
option
)
...
...
@@ -120,6 +152,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
std
::
string
inner
=
option
.
substr
(
6
);
inner_reader_
=
create_path_reader
(
inner
,
this
->
tagset
(),
inner_filename_
);
token_index
=
0
;
currentSentence
=
boost
::
make_shared
<
Sentence
>
();
}
if
(
boost
::
algorithm
::
starts_with
(
option
,
"mwefile:"
))
{
std
::
string
mwefile
=
option
.
substr
(
8
);
...
...
This diff is collapsed.
Click to expand it.
libmwereader/mwereader.h
+
3
−
0
View file @
1a8eb2ba
...
...
@@ -62,6 +62,9 @@ private:
/// path for inner reader
std
::
string
inner_filename_
;
/// position of the next token of currentSentence that get_next_token() hands out
size_t token_index;
/// sentence get_next_token() reads tokens from; also assigned by
/// get_next_sentence(), get_next_chunk() and set_option()
Sentence::Ptr currentSentence;
/// chunk most recently obtained from the inner reader by get_next_chunk()
boost::shared_ptr<Chunk> currentChunk;
};
}
// ns Corpus2
...
...
This diff is collapsed.
Click to expand it.
libmwereader/tests/mwefunctional.cpp
+
74
−
0
View file @
1a8eb2ba
...
...
@@ -33,6 +33,7 @@ struct Fixture{
BOOST_FIXTURE_TEST_CASE
(
preferred_lexeme
,
Fixture
)
{
BOOST_MESSAGE
(
"test: finding preferred lexeme"
);
...
...
@@ -47,6 +48,7 @@ BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture)
}
BOOST_FIXTURE_TEST_CASE
(
lexeme_no_white_spaces
,
Fixture
)
{
BOOST_MESSAGE
(
"=====================
\n
test: no white space after or before"
);
...
...
@@ -162,6 +164,78 @@ BOOST_FIXTURE_TEST_CASE( flex_gap_noun , Fixture)
}
/// Chunk-wise reading: after seven chunks are consumed, the eighth call must
/// still yield a chunk and the ninth must report end of input.
BOOST_FIXTURE_TEST_CASE(get_next_chunk, Fixture)
{
	const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
	Corpus2::MWEReader mwr1(tset, test_corpus.string());
	mwr1.set_option("inner:xces");
	mwr1.set_option("mwefile:" + (data_dir / "fix_mwe.xml").string());

	// Consume and discard the first seven chunks.
	for (int skipped = 0; skipped < 7; ++skipped) {
		mwr1.get_next_chunk();
	}

	BOOST_CHECK(mwr1.get_next_chunk() != NULL);
	BOOST_CHECK(mwr1.get_next_chunk() == NULL);
}
/// Sentence-wise reading: after seven sentences are consumed, the eighth call
/// must still yield a sentence and the ninth must report end of input.
BOOST_FIXTURE_TEST_CASE(get_next_sentence, Fixture)
{
	const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
	Corpus2::MWEReader mwr1(tset, test_corpus.string());
	mwr1.set_option("inner:xces");
	mwr1.set_option("mwefile:" + (data_dir / "fix_mwe.xml").string());

	// Consume and discard the first seven sentences.
	for (int skipped = 0; skipped < 7; ++skipped) {
		mwr1.get_next_sentence();
	}

	BOOST_CHECK(mwr1.get_next_sentence() != NULL);
	BOOST_CHECK(mwr1.get_next_sentence() == NULL);
}
/// Token-wise reading with the flex MWE dictionary: reads 120 tokens and
/// checks that the tokens at positions 26 and 48 are the merged units
/// "dzień dobry" / "dobry dzień", both lemmatised as "dzień dobry".
BOOST_FIXTURE_TEST_CASE(flex_no_gap_new, Fixture)
{
	BOOST_MESSAGE("=====================\ntest: finding flex mwe");
	const Corpus2::Tagset& tset = Corpus2::get_named_tagset("kipi");
	Corpus2::MWEReader mwr(tset, test_corpus.string());
	mwr.set_option("inner:xces");
	mwr.set_option("mwefile:" + (data_dir / "flex_mwe.xml").string());

	for (int i = 0; i < 120; i++) {
		// Exactly one token is read per iteration, checked or not.
		Corpus2::Token* mwu = mwr.get_next_token();
		if (i != 26 && i != 48) {
			continue;
		}

		// Abort this test case (instead of crashing the whole test binary)
		// if the stream ended before the token under inspection.
		BOOST_REQUIRE(mwu != NULL);

		const char* expected_orth = (i == 26) ? "dzień dobry" : "dobry dzień";
		BOOST_CHECK(mwu->orth_utf8() == expected_orth);
		BOOST_CHECK(mwu->get_preferred_lexeme(tset).lemma_utf8() == "dzień dobry");
	}
}
//Check NULL
BOOST_AUTO_TEST_SUITE_END
()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment