Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
1a8eb2ba
There was an error fetching the commit references. Please try again later.
Commit
1a8eb2ba
authored
13 years ago
by
Lukasz Bilenkij
Browse files
Options
Downloads
Patches
Plain Diff
Add processing in get_next_token/chunk/sentence methods
parent
9ac9fb0d
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
libmwereader/mwereader.cpp
+46
-12
46 additions, 12 deletions
libmwereader/mwereader.cpp
libmwereader/mwereader.h
+3
-0
3 additions, 0 deletions
libmwereader/mwereader.h
libmwereader/tests/mwefunctional.cpp
+74
-0
74 additions, 0 deletions
libmwereader/tests/mwefunctional.cpp
with
123 additions
and
12 deletions
libmwereader/mwereader.cpp
+
46
−
12
View file @
1a8eb2ba
...
@@ -21,20 +21,40 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
...
@@ -21,20 +21,40 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
Token
*
MWEReader
::
get_next_token
()
Token
*
MWEReader
::
get_next_token
()
{
{
// TODO MWE stuff
if
(
currentSentence
->
empty
())
// get whole sentence -> process it -> return token by token
currentSentence
=
get_next_sentence
();
return
inner_reader_
->
get_next_token
();
std
::
vector
<
Token
*>
tokens
=
currentSentence
->
tokens
();
if
(
token_index
<
tokens
.
size
())
{
return
tokens
.
at
(
token_index
++
);
}
else
{
currentSentence
=
get_next_sentence
();
if
(
currentSentence
==
NULL
)
{
return
NULL
;
}
tokens
=
currentSentence
->
tokens
();
token_index
=
0
;
return
tokens
.
at
(
token_index
++
);
}
}
}
Sentence
::
Ptr
MWEReader
::
get_next_sentence
()
Sentence
::
Ptr
MWEReader
::
get_next_sentence
()
{
{
// TODO MWE stuff
currentSentence
=
inner_reader_
->
get_next_sentence
();
Sentence
::
Ptr
pSentence
=
inner_reader_
->
get_next_sentence
();
if
(
currentSentence
==
0
)
if
(
pSentence
==
NULL
)
{
return
Sentence
::
Ptr
();
return
currentSentence
;
Wccl
::
SentenceContext
sc
(
pSentence
);
}
Wccl
::
SentenceContext
sc
(
currentSentence
);
token_index
=
0
;
return
process_sentence
(
sc
);
return
process_sentence
(
sc
);
}
}
Sentence
::
Ptr
MWEReader
::
process_sentence
(
Wccl
::
SentenceContext
&
sc
)
Sentence
::
Ptr
MWEReader
::
process_sentence
(
Wccl
::
SentenceContext
&
sc
)
...
@@ -108,9 +128,21 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
...
@@ -108,9 +128,21 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
boost
::
shared_ptr
<
Chunk
>
MWEReader
::
get_next_chunk
()
boost
::
shared_ptr
<
Chunk
>
MWEReader
::
get_next_chunk
()
{
{
// TODO MWE stuff
currentChunk
=
inner_reader_
->
get_next_chunk
();
// get whole chunk -> process sentences -> return processed chunk
if
(
currentChunk
==
NULL
)
return
inner_reader_
->
get_next_chunk
();
return
currentChunk
;
std
::
vector
<
boost
::
shared_ptr
<
Corpus2
::
Sentence
>
>
s1
=
currentChunk
->
sentences
();
std
::
vector
<
boost
::
shared_ptr
<
Corpus2
::
Sentence
>
>::
iterator
it
;
for
(
it
=
s1
.
begin
();
it
!=
s1
.
end
();
it
++
)
{
if
(
it
==
s1
.
begin
())
currentSentence
=*
it
;
Wccl
::
SentenceContext
sc
(
*
it
);
process_sentence
(
sc
);
}
token_index
=
0
;
return
currentChunk
;
}
}
void
MWEReader
::
set_option
(
const
std
::
string
&
option
)
void
MWEReader
::
set_option
(
const
std
::
string
&
option
)
...
@@ -120,6 +152,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
...
@@ -120,6 +152,8 @@ bool MWEReader::registered = TokenReader::register_path_reader<MWEReader>(
std
::
string
inner
=
option
.
substr
(
6
);
std
::
string
inner
=
option
.
substr
(
6
);
inner_reader_
=
create_path_reader
(
inner
,
this
->
tagset
(),
inner_reader_
=
create_path_reader
(
inner
,
this
->
tagset
(),
inner_filename_
);
inner_filename_
);
token_index
=
0
;
currentSentence
=
boost
::
make_shared
<
Sentence
>
();
}
}
if
(
boost
::
algorithm
::
starts_with
(
option
,
"mwefile:"
))
{
if
(
boost
::
algorithm
::
starts_with
(
option
,
"mwefile:"
))
{
std
::
string
mwefile
=
option
.
substr
(
8
);
std
::
string
mwefile
=
option
.
substr
(
8
);
...
...
This diff is collapsed.
Click to expand it.
libmwereader/mwereader.h
+
3
−
0
View file @
1a8eb2ba
...
@@ -62,6 +62,9 @@ private:
...
@@ -62,6 +62,9 @@ private:
/// path for inner reader
/// path for inner reader
std
::
string
inner_filename_
;
std
::
string
inner_filename_
;
/// inner reader option
/// inner reader option
size_t
token_index
;
Sentence
::
Ptr
currentSentence
;
boost
::
shared_ptr
<
Chunk
>
currentChunk
;
};
};
}
// ns Corpus2
}
// ns Corpus2
...
...
This diff is collapsed.
Click to expand it.
libmwereader/tests/mwefunctional.cpp
+
74
−
0
View file @
1a8eb2ba
...
@@ -33,6 +33,7 @@ struct Fixture{
...
@@ -33,6 +33,7 @@ struct Fixture{
BOOST_FIXTURE_TEST_CASE
(
preferred_lexeme
,
Fixture
)
BOOST_FIXTURE_TEST_CASE
(
preferred_lexeme
,
Fixture
)
{
{
BOOST_MESSAGE
(
"test: finding preferred lexeme"
);
BOOST_MESSAGE
(
"test: finding preferred lexeme"
);
...
@@ -47,6 +48,7 @@ BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture)
...
@@ -47,6 +48,7 @@ BOOST_FIXTURE_TEST_CASE( preferred_lexeme, Fixture)
}
}
BOOST_FIXTURE_TEST_CASE
(
lexeme_no_white_spaces
,
Fixture
)
BOOST_FIXTURE_TEST_CASE
(
lexeme_no_white_spaces
,
Fixture
)
{
{
BOOST_MESSAGE
(
"=====================
\n
test: no white space after or before"
);
BOOST_MESSAGE
(
"=====================
\n
test: no white space after or before"
);
...
@@ -162,6 +164,78 @@ BOOST_FIXTURE_TEST_CASE( flex_gap_noun , Fixture)
...
@@ -162,6 +164,78 @@ BOOST_FIXTURE_TEST_CASE( flex_gap_noun , Fixture)
}
}
BOOST_FIXTURE_TEST_CASE
(
get_next_chunk
,
Fixture
)
{
const
Corpus2
::
Tagset
&
tset
=
Corpus2
::
get_named_tagset
(
"kipi"
);
Corpus2
::
MWEReader
mwr1
(
tset
,
test_corpus
.
string
());
mwr1
.
set_option
(
"inner:xces"
);
mwr1
.
set_option
(
"mwefile:"
+
(
data_dir
/
"fix_mwe.xml"
).
string
());
mwr1
.
get_next_chunk
();
mwr1
.
get_next_chunk
();
mwr1
.
get_next_chunk
();
mwr1
.
get_next_chunk
();
mwr1
.
get_next_chunk
();
mwr1
.
get_next_chunk
();
mwr1
.
get_next_chunk
();
BOOST_CHECK
(
mwr1
.
get_next_chunk
()
!=
NULL
);
BOOST_CHECK
(
mwr1
.
get_next_chunk
()
==
NULL
);
}
BOOST_FIXTURE_TEST_CASE
(
get_next_sentence
,
Fixture
)
{
const
Corpus2
::
Tagset
&
tset
=
Corpus2
::
get_named_tagset
(
"kipi"
);
Corpus2
::
MWEReader
mwr1
(
tset
,
test_corpus
.
string
());
mwr1
.
set_option
(
"inner:xces"
);
mwr1
.
set_option
(
"mwefile:"
+
(
data_dir
/
"fix_mwe.xml"
).
string
());
mwr1
.
get_next_sentence
();
mwr1
.
get_next_sentence
();
mwr1
.
get_next_sentence
();
mwr1
.
get_next_sentence
();
mwr1
.
get_next_sentence
();
mwr1
.
get_next_sentence
();
mwr1
.
get_next_sentence
();
BOOST_CHECK
(
mwr1
.
get_next_sentence
()
!=
NULL
);
BOOST_CHECK
(
mwr1
.
get_next_sentence
()
==
NULL
);
}
BOOST_FIXTURE_TEST_CASE
(
flex_no_gap_new
,
Fixture
)
{
BOOST_MESSAGE
(
"=====================
\n
test: finding flex mwe"
);
const
Corpus2
::
Tagset
&
tset
=
Corpus2
::
get_named_tagset
(
"kipi"
);
Corpus2
::
MWEReader
mwr
(
tset
,
test_corpus
.
string
());
mwr
.
set_option
(
"inner:xces"
);
mwr
.
set_option
(
"mwefile:"
+
(
data_dir
/
"flex_mwe.xml"
).
string
());
Corpus2
::
Token
*
mwu
;
for
(
int
i
=
0
;
i
<
120
;
i
++
)
{
switch
(
i
)
{
case
26
:
mwu
=
mwr
.
get_next_token
();
BOOST_CHECK
(
mwu
->
orth_utf8
()
==
"dzień dobry"
);
BOOST_CHECK
(
mwu
->
get_preferred_lexeme
(
tset
).
lemma_utf8
()
==
"dzień dobry"
);
break
;
case
48
:
mwu
=
mwr
.
get_next_token
();
BOOST_CHECK
(
mwu
->
orth_utf8
()
==
"dobry dzień"
);
BOOST_CHECK
(
mwu
->
get_preferred_lexeme
(
tset
).
lemma_utf8
()
==
"dzień dobry"
);
break
;
default:
mwr
.
get_next_token
();
}
}
}
//Check NULL
BOOST_AUTO_TEST_SUITE_END
()
BOOST_AUTO_TEST_SUITE_END
()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment