Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
967dd4d1
Commit
967dd4d1
authored
May 12, 2011
by
ilor
Browse files
Options
Downloads
Patches
Plain Diff
wccl-run upgrade: simple filtering, sentence # output, output aggregation
parent
4a9bf953
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
wccl-apps/wccl-run.cpp
+113
-25
113 additions, 25 deletions
wccl-apps/wccl-run.cpp
with
113 additions
and
25 deletions
wccl-apps/wccl-run.cpp
+
113
−
25
View file @
967dd4d1
...
...
@@ -26,6 +26,7 @@ namespace {
bool
output_orths
=
true
;
bool
output_variables
=
false
;
bool
global_numbering
=
false
;
bool
sentence_indices
=
true
;
bool
output_header
=
true
;
bool
in_sentence_numbering
=
true
;
}
...
...
@@ -45,7 +46,7 @@ class Runner
public:
Runner
(
const
Corpus2
::
Tagset
&
tagset
)
:
tagset_
(
tagset
),
parser_
(
tagset_
),
token_idx
(
0
),
progress_
(
false
),
search_path_
(
"."
)
search_path_
(
"."
)
,
want_header_
(
true
),
aggregate_output_
(
false
)
{
}
...
...
@@ -57,31 +58,52 @@ public:
}
}
void
set_aggregate_output
(
bool
v
)
{
aggregate_output_
=
v
;
}
bool
load_more_operators
(
const
std
::
string
&
filename
);
bool
load_operator_string
(
const
std
::
string
&
op_string
);
size_t
size
()
const
{
return
ops_
.
size
()
+
(
filter_op_
?
1
:
0
);
}
const
std
::
vector
<
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>
>&
operators
()
const
{
return
ops_
;
}
void
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
);
void
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
,
std
::
vector
<
std
::
vector
<
UnicodeString
>
>&
outputs
,
int
sidx
);
void
run
(
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
,
bool
first
);
void
output_tabular
(
const
std
::
vector
<
std
::
vector
<
UnicodeString
>
>
outputs
);
void
output_tabular
(
const
std
::
vector
<
std
::
vector
<
UnicodeString
>
>
&
outputs
);
void
set_search_path
(
const
std
::
string
&
path
)
{
search_path_
=
path
;
}
void
set_filter_op
(
const
std
::
string
op_name
,
const
std
::
string
&
op_value
)
{
filter_op_name_
=
op_name
;
filter_op_value_
=
op_value
;
}
private
:
void
do_operator_variables
(
const
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>&
op
,
std
::
vector
<
UnicodeString
>&
out
,
bool
variables
);
const
Corpus2
::
Tagset
&
tagset_
;
Wccl
::
Parser
parser_
;
std
::
vector
<
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>
>
ops_
;
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>
filter_op_
;
std
::
vector
<
std
::
string
>
op_names_
;
int
token_idx
;
bool
progress_
;
std
::
string
search_path_
;
std
::
string
filter_op_name_
;
std
::
string
filter_op_value_
;
bool
want_header_
;
bool
aggregate_output_
;
};
bool
Runner
::
load_more_operators
(
const
std
::
string
&
filename
)
...
...
@@ -96,12 +118,17 @@ bool Runner::load_more_operators(const std::string& filename)
retOp
=
parser_
.
parseWcclFile
(
is
,
search_path_
);
if
(
retOp
)
{
boost
::
filesystem
::
path
p
(
filename
);
std
::
string
prefix
=
p
.
stem
()
+
":"
;
std
::
string
prefix
=
""
;
//
p.stem() + ":";
Wccl
::
UntypedOpSequence
::
name_op_v_t
pairs
=
retOp
->
gen_all_op_pairs
();
foreach
(
const
Wccl
::
UntypedOpSequence
::
name_op_pair_t
v
,
pairs
)
{
op_names_
.
push_back
(
prefix
+
v
.
first
);
std
::
string
opname
=
v
.
first
;
if
(
opname
==
filter_op_name_
)
{
filter_op_
=
v
.
second
;
}
else
{
op_names_
.
push_back
(
opname
);
ops_
.
push_back
(
v
.
second
);
}
}
return
true
;
}
else
{
std
::
cerr
<<
"Problem while parsing -- "
...
...
@@ -148,26 +175,48 @@ bool Runner::load_operator_string(const std::string& op_string)
return
false
;
}
void
Runner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
)
void
Runner
::
do_operator_variables
(
const
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>&
op
,
std
::
vector
<
UnicodeString
>&
out
,
bool
variables
)
{
std
::
cerr
<<
"dos"
;
Wccl
::
SentenceContext
sc
(
sentence
);
std
::
vector
<
std
::
vector
<
UnicodeString
>
>
outputs
;
if
(
variables
)
{
foreach
(
const
std
::
string
&
varname
,
op
->
valid_variable_names
())
{
out
.
push_back
((
*
op
)[
varname
].
to_string_u
(
tagset_
));
}
}
}
void
Runner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
,
std
::
vector
<
std
::
vector
<
UnicodeString
>
>&
outputs
,
int
sidx
)
{
Wccl
::
SentenceContext
sc
(
sentence
);
streamsave
sv
(
std
::
cout
);
if
(
output_header
)
{
if
(
output_header
&&
want_header_
)
{
outputs
.
resize
(
outputs
.
size
()
+
1
);
std
::
vector
<
UnicodeString
>&
out
=
outputs
.
back
();
if
(
global_numbering
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
"##"
));
}
if
(
sentence_indices
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
"S#"
));
}
if
(
in_sentence_numbering
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
"#"
));
}
if
(
output_orths
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
"orth"
));
}
if
(
filter_op_
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
filter_op_name_
));
if
(
output_variables
)
{
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>
o
=
filter_op_
;
foreach
(
const
std
::
string
&
varname
,
o
->
valid_variable_names
())
{
const
Wccl
::
Value
&
value
=
(
*
o
)[
varname
];
std
::
string
label
=
"("
+
filter_op_name_
+
")"
+
value
.
make_var_repr
(
varname
);
out
.
push_back
(
UnicodeString
::
fromUTF8
(
label
));
}
}
}
for
(
size_t
i
=
0
;
i
<
op_names_
.
size
();
++
i
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
op_names_
[
i
]));
if
(
output_variables
)
{
...
...
@@ -181,36 +230,48 @@ void Runner::do_sentence(const boost::shared_ptr<Corpus2::Sentence>& sentence)
}
}
for
(
size_t
i
=
0
;
i
<
sentence
->
size
();
++
i
)
{
++
token_idx
;
sc
.
set_position
(
i
);
UnicodeString
vstr
;
if
(
filter_op_
)
{
boost
::
shared_ptr
<
const
Wccl
::
Value
>
v
=
filter_op_
->
base_apply
(
sc
);
vstr
=
v
->
to_string_u
(
tagset_
);
std
::
string
uvstr
=
PwrNlp
::
to_utf8
(
vstr
);
if
(
uvstr
!=
filter_op_value_
)
{
continue
;
}
else
{
}
}
outputs
.
resize
(
outputs
.
size
()
+
1
);
std
::
vector
<
UnicodeString
>&
out
=
outputs
.
back
();
++
token_idx
;
if
(
global_numbering
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
boost
::
lexical_cast
<
std
::
string
>
(
token_idx
)));
}
if
(
sentence_indices
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
boost
::
lexical_cast
<
std
::
string
>
(
sidx
)));
}
if
(
in_sentence_numbering
)
{
out
.
push_back
(
UnicodeString
::
fromUTF8
(
boost
::
lexical_cast
<
std
::
string
>
(
i
+
1
)));
}
if
(
output_orths
)
{
out
.
push_back
(
sentence
->
tokens
()[
i
]
->
orth
());
}
if
(
filter_op_
)
{
out
.
push_back
(
vstr
);
do_operator_variables
(
filter_op_
,
out
,
output_variables
);
}
sc
.
set_position
(
i
);
foreach
(
const
boost
::
shared_ptr
<
Wccl
::
FunctionalOperator
>&
o
,
ops_
)
{
boost
::
shared_ptr
<
const
Wccl
::
Value
>
v
=
o
->
base_apply
(
sc
);
UnicodeString
vstr
=
v
->
to_string_u
(
tagset_
);
out
.
push_back
(
vstr
);
if
(
output_variables
)
{
foreach
(
const
std
::
string
&
varname
,
o
->
valid_variable_names
())
{
out
.
push_back
((
*
o
)[
varname
].
to_string_u
(
tagset_
));
do_operator_variables
(
o
,
out
,
output_variables
);
}
}
}
}
output_tabular
(
outputs
);
}
void
Runner
::
output_tabular
(
const
std
::
vector
<
std
::
vector
<
UnicodeString
>
>
outputs
)
void
Runner
::
output_tabular
(
const
std
::
vector
<
std
::
vector
<
UnicodeString
>
>
&
outputs
)
{
std
::
vector
<
int
>
lengths
(
outputs
[
0
].
size
());
foreach
(
const
std
::
vector
<
UnicodeString
>&
line
,
outputs
)
{
...
...
@@ -238,27 +299,42 @@ void Runner::output_tabular(const std::vector<std::vector<UnicodeString> > outpu
void
Runner
::
run
(
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
,
bool
first
)
{
std
::
vector
<
std
::
vector
<
UnicodeString
>
>
outputs
;
Corpus2
::
Sentence
::
Ptr
s
;
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
int
sidx
=
0
;
while
((
s
=
reader
->
get_next_sentence
()))
{
do_sentence
(
s
);
do_sentence
(
s
,
outputs
,
sidx
++
);
timer
.
count_sentence
(
*
s
);
if
(
aggregate_output_
)
{
want_header_
=
false
;
}
if
(
!
outputs
.
empty
()
&&
!
aggregate_output_
)
{
output_tabular
(
outputs
);
}
if
(
progress_
)
{
timer
.
check_slice
();
}
if
(
!
filter_op_
)
{
std
::
cout
<<
"
\n
"
;
}
if
(
first
)
break
;
}
if
(
!
outputs
.
empty
()
&&
aggregate_output_
)
{
output_tabular
(
outputs
);
}
}
int
main
(
int
argc
,
char
**
argv
)
{
std
::
string
tagset_load
=
"kipi"
;
std
::
string
filter_op_name
,
filter_op_value
;
bool
first
=
false
,
progress
=
false
;
std
::
string
input_format
;
std
::
string
search_path
;
std
::
vector
<
std
::
string
>
corpora_files
,
files
,
operator_strings
;
bool
corpus_stdin
=
false
;
bool
aggregate
=
false
;
using
boost
::
program_options
::
value
;
std
::
string
readers
=
boost
::
algorithm
::
join
(
Corpus2
::
TokenReader
::
available_reader_types_help
(),
" "
);
std
::
string
readers_help
=
"Input format, any of: "
+
readers
+
"
\n
"
;
...
...
@@ -289,12 +365,20 @@ int main(int argc, char** argv)
"Output in-sentence token counts"
)
(
"global-counts,g"
,
value
(
&
global_numbering
),
"Output global counts"
)
(
"output-sentence-indices,H"
,
value
(
&
sentence_indices
),
"Output sentence indices"
)
(
"output-orths,O"
,
value
(
&
output_orths
),
"Output token orths"
)
(
"output-variables,V"
,
value
(
&
output_variables
),
"Output operator variables"
)
(
"output-header,H"
,
value
(
&
output_header
),
"Output table header"
)
(
"filter-operator,F"
,
value
(
&
filter_op_name
),
"Filter operator name"
)
(
"filter-value"
,
value
(
&
filter_op_value
)
->
default_value
(
"True"
),
"Filter operator expected valye"
)
(
"aggregate-output,A"
,
value
(
&
aggregate
),
"Aggregate output (prettier, slower)"
)
(
"progress,p"
,
value
(
&
progress
)
->
zero_tokens
(),
"Show progress info"
)
(
"help,h"
,
"Show help"
)
...
...
@@ -342,20 +426,24 @@ int main(int argc, char** argv)
if
(
!
search_path
.
empty
())
{
runner
.
set_search_path
(
search_path
);
}
if
(
!
filter_op_name
.
empty
())
{
runner
.
set_filter_op
(
filter_op_name
,
filter_op_value
);
}
runner
.
set_aggregate_output
(
aggregate
);
foreach
(
const
std
::
string
&
f
,
operator_strings
)
{
if
(
boost
::
algorithm
::
ends_with
(
f
,
".ccl"
))
{
size_t
sz
=
runner
.
operators
().
size
();
size_t
sz
=
runner
.
size
();
if
(
!
runner
.
load_more_operators
(
f
))
{
std
::
cerr
<<
"Warning: error while parsing "
<<
f
<<
"
\n
"
;
}
if
(
runner
.
operators
().
size
()
==
sz
)
{
if
(
runner
.
size
()
==
sz
)
{
std
::
cerr
<<
"Warning: no operators loaded from "
<<
f
<<
"
\n
"
;
}
}
else
{
runner
.
load_operator_string
(
f
);
}
}
if
(
!
runner
.
operators
().
empty
()
)
{
if
(
runner
.
size
()
>
0
)
{
foreach
(
const
std
::
string
&
f
,
corpora_files
)
{
runner
.
run
(
Corpus2
::
TokenReader
::
create_path_reader
(
input_format
,
tagset
,
f
),
first
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment