Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
c5fe1b46
Commit
c5fe1b46
authored
May 9, 2011
by
Adam Wardynski
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of nlp.pwr.wroc.pl:wccl
parents
61cb697b
1923cc70
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
libwccl/wcclfile.cpp
+8
-0
8 additions, 0 deletions
libwccl/wcclfile.cpp
wccl-apps/CMakeLists.txt
+1
-3
1 addition, 3 deletions
wccl-apps/CMakeLists.txt
wccl-apps/wccl-match.cpp
+0
-243
0 additions, 243 deletions
wccl-apps/wccl-match.cpp
wccl-apps/wccl-rules.cpp
+107
-80
107 additions, 80 deletions
wccl-apps/wccl-rules.cpp
with
116 additions
and
326 deletions
libwccl/wcclfile.cpp
+
8
−
0
View file @
c5fe1b46
...
@@ -36,6 +36,14 @@ boost::shared_ptr<const TagRuleSequence> WcclFile::get_tag_rules_ptr() const
...
@@ -36,6 +36,14 @@ boost::shared_ptr<const TagRuleSequence> WcclFile::get_tag_rules_ptr() const
return
tag_rules_
;
return
tag_rules_
;
}
}
/**
 * Non-const access to the match rule sequence stored in this file.
 *
 * @return the stored shared pointer to the match rules
 * @throws WcclError when has_match_rules() reports that there are none
 */
boost::shared_ptr<MatchRuleSequence> WcclFile::get_match_rules_ptr()
{
	if (has_match_rules()) {
		return match_rules_;
	}
	throw WcclError("There are no match rules.");
}
boost
::
shared_ptr
<
const
MatchRuleSequence
>
WcclFile
::
get_match_rules_ptr
()
const
boost
::
shared_ptr
<
const
MatchRuleSequence
>
WcclFile
::
get_match_rules_ptr
()
const
{
{
if
(
!
has_match_rules
())
{
if
(
!
has_match_rules
())
{
...
...
This diff is collapsed.
Click to expand it.
wccl-apps/CMakeLists.txt
+
1
−
3
View file @
c5fe1b46
...
@@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp)
...
@@ -27,11 +27,9 @@ add_executable(wccl-rules wccl-rules.cpp)
target_link_libraries
(
wccl-rules wccl
${
Boost_LIBRARIES
}
antlr
${
LIBS
}
)
target_link_libraries
(
wccl-rules wccl
${
Boost_LIBRARIES
}
antlr
${
LIBS
}
)
add_executable
(
wccl-parser wccl-parser.cpp
)
add_executable
(
wccl-parser wccl-parser.cpp
)
target_link_libraries
(
wccl-parser wccl
${
Boost_LIBRARIES
}
antlr
${
LIBS
}
)
target_link_libraries
(
wccl-parser wccl
${
Boost_LIBRARIES
}
antlr
${
LIBS
}
)
add_executable
(
wccl-match wccl-match.cpp
)
target_link_libraries
(
wccl-match wccl
${
Boost_LIBRARIES
}
antlr
${
LIBS
}
)
if
(
UNIX
)
if
(
UNIX
)
install
(
TARGETS wccl-features wccl-run wccl-rules wccl-parser
wccl-match
install
(
TARGETS wccl-features wccl-run wccl-rules wccl-parser
RUNTIME DESTINATION bin
RUNTIME DESTINATION bin
)
)
endif
(
UNIX
)
endif
(
UNIX
)
This diff is collapsed.
Click to expand it.
wccl-apps/wccl-match.cpp
deleted
100644 → 0
+
0
−
243
View file @
61cb697b
#include
<cstdlib>
#include
<fstream>
#include
<iomanip>
#include
<libwccl/values/strset.h>
#include
<libwccl/parser/Parser.h>
#include
<libcorpus2/tagsetmanager.h>
#include
<libcorpus2/util/tokentimer.h>
#include
<boost/bind.hpp>
#include
<boost/algorithm/string.hpp>
#include
<boost/make_shared.hpp>
#include
<boost/filesystem.hpp>
#include
<boost/program_options.hpp>
#include
<libcorpus2/io/reader.h>
#include
<libcorpus2/io/writer.h>
namespace {
	/// Processing switches filled in from the command line in main().
	struct options
	{
		bool first;                 ///< bound to -1 / --first-sentence-only
		bool until_done;            ///< bound to -u / --until-done
		int until_done_iterations;  ///< bound to --until-done-iterations
	};

	/// Set by the -q / --quiet command line option.
	bool quiet = false;
}
class
MatchRunner
{
public:
MatchRunner
(
const
Corpus2
::
Tagset
&
tagset
)
:
tagset_
(
tagset
),
parser_
(
tagset_
),
progress_
(
false
)
{
}
void
use_progress
(
bool
use
)
{
progress_
=
use
;
if
(
use
)
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
timer
.
register_signal_handler
();
}
}
bool
load_more_rules
(
const
std
::
string
&
filename
);
bool
load_operator_string
(
const
std
::
string
&
op_string
);
void
apply_rules
(
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
,
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
);
bool
empty
()
{
return
rules_
.
empty
();
}
private
:
const
Corpus2
::
Tagset
&
tagset_
;
Wccl
::
Parser
parser_
;
std
::
vector
<
std
::
string
>
rule_names_
;
std
::
vector
<
boost
::
shared_ptr
<
Wccl
::
MatchRule
>
>
rules_
;
bool
progress_
;
};
bool
MatchRunner
::
load_more_rules
(
const
std
::
string
&
filename
)
{
boost
::
shared_ptr
<
Wccl
::
MatchRule
>
retOp
;
try
{
std
::
ifstream
is
(
filename
.
c_str
());
if
(
!
is
.
good
())
{
throw
Wccl
::
FileNotFound
(
filename
,
""
,
__FUNCTION__
);
}
retOp
=
parser_
.
parseMatchRule
(
is
);
if
(
retOp
)
{
boost
::
filesystem
::
path
p
(
filename
);
rule_names_
.
push_back
(
p
.
stem
());
rules_
.
push_back
(
retOp
);
return
true
;
}
else
{
std
::
cerr
<<
"Problem while parsing -- "
<<
"parser returned NULL!"
<<
std
::
endl
;
}
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
std
::
cerr
<<
e
.
scope
()
<<
" Error: "
<<
e
.
info
()
<<
std
::
endl
;
}
return
false
;
}
/**
 * Pulls chunks from the reader until it returns a null chunk, applies every
 * stored rule to each sentence of a chunk, and then writes the chunk out.
 *
 * Every sentence must be a Corpus2::AnnotatedSentence. If one is not, a
 * message is printed to stderr and the function returns immediately; the
 * chunk being processed at that point is not written.
 *
 * @param reader source of chunks
 * @param writer destination for processed chunks
 */
void MatchRunner::apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
		boost::shared_ptr<Corpus2::TokenWriter> writer)
{
	Corpus2::TokenTimer& timer = Corpus2::global_timer();

	for (;;) {
		boost::shared_ptr<Corpus2::Chunk> chunk = reader->get_next_chunk();
		if (!chunk) {
			break;
		}

		foreach (boost::shared_ptr<Corpus2::Sentence>& sentence, chunk->sentences()) {
			boost::shared_ptr<Corpus2::AnnotatedSentence> annotated =
				boost::dynamic_pointer_cast<Corpus2::AnnotatedSentence>(sentence);
			if (!annotated) {
				std::cerr << "Did not get an AnnotatedSentence from reader,"
					"'ann'' option broken?\n";
				return;
			}

			foreach (const boost::shared_ptr<Wccl::MatchRule>& rule, rules_) {
				rule->apply(annotated);
			}

			timer.count_sentence(*annotated);
			if (progress_) {
				timer.check_slice();
			}
			// Sentences are not written one by one; the whole chunk is
			// written once all of its sentences have been processed.
		}

		writer->write_chunk(*chunk);
	}
}
/**
 * Prints a short description of the program and its command line to stderr.
 *
 * The name is only read, so it is taken as a pointer to const; this accepts
 * argv[0] as before and additionally string literals and other const strings.
 *
 * @param name program name to show in the usage line
 */
void usage(const char* name)
{
	std::cerr << "This program runs WCCL match rules.\n";
	std::cerr << "Usage " << name << " [OPTIONS] FILES\n"
		<< "Files ending with .xml are treated as corpora, otherwise\n"
		<< "as CCL files. Use - to read corpus from stdin (as with -I)\n"
		<< "Note: the ann option is implied on all input formats\n";
}
int
main
(
int
argc
,
char
**
argv
)
{
std
::
string
tagset_load
=
"kipi"
;
std
::
string
input_format
;
std
::
string
output_format
;
bool
progress
=
false
;
options
opts
;
opts
.
first
=
false
;
opts
.
until_done
=
false
;
opts
.
until_done_iterations
=
1000
;
std
::
vector
<
std
::
string
>
corpora_files
,
ccl_files
,
files
;
bool
corpus_stdin
=
true
;
using
boost
::
program_options
::
value
;
std
::
string
readers
=
boost
::
algorithm
::
join
(
Corpus2
::
TokenReader
::
available_reader_types_help
(),
" "
);
std
::
string
readers_help
=
"Input format, any of: "
+
readers
+
"
\n
"
;
std
::
string
writers
=
boost
::
algorithm
::
join
(
Corpus2
::
TokenWriter
::
available_writer_types_help
(),
" "
);
std
::
string
writers_help
=
"Output format, any of: "
+
writers
+
"
\n
"
;;
boost
::
program_options
::
options_description
desc
(
"Allowed options"
);
desc
.
add_options
()
(
"tagset,t"
,
value
(
&
tagset_load
),
"Tagset to use
\n
"
)
(
"corpus,c"
,
value
(
&
corpora_files
),
"Corpus file to load (XCES), do not load from stdin
\n
"
)
(
"ccl-file,C"
,
value
(
&
ccl_files
),
"CCL rule files
\n
"
)
(
"files,f"
,
value
(
&
files
),
"Files to load, looking at the extension to determine type
\n
"
)
(
"corpus-from-stdin,I"
,
value
(
&
corpus_stdin
)
->
zero_tokens
(),
"Read corpus from stdin"
)
(
"input-format,i"
,
value
(
&
input_format
)
->
default_value
(
"xces"
),
readers_help
.
c_str
())
(
"output-format,o"
,
value
(
&
output_format
)
->
default_value
(
"ccl"
),
writers_help
.
c_str
())
(
"progress,p"
,
value
(
&
progress
)
->
zero_tokens
(),
"Show progress info"
)
(
"quiet,q"
,
value
(
&
quiet
)
->
zero_tokens
(),
"Suppress messages
\n
"
)
(
"until-done,u"
,
value
(
&
opts
.
until_done
)
->
zero_tokens
(),
"Until-done mode
\n
"
)
(
"until-done-iterations"
,
value
(
&
opts
.
until_done_iterations
),
"Until-done iteration limit
\n
"
)
(
"first-sentence-only,1"
,
value
(
&
opts
.
first
)
->
zero_tokens
(),
"Only process first sentence
\n
"
)
(
"help,h"
,
"Show help"
)
;
boost
::
program_options
::
variables_map
vm
;
boost
::
program_options
::
positional_options_description
p
;
p
.
add
(
"files"
,
-
1
);
try
{
boost
::
program_options
::
store
(
boost
::
program_options
::
command_line_parser
(
argc
,
argv
)
.
options
(
desc
).
positional
(
p
).
run
(),
vm
);
}
catch
(
boost
::
program_options
::
error
&
e
)
{
std
::
cerr
<<
e
.
what
()
<<
std
::
endl
;
return
2
;
}
boost
::
program_options
::
notify
(
vm
);
if
(
vm
.
count
(
"help"
))
{
usage
(
argv
[
0
]);
std
::
cout
<<
desc
<<
"
\n
"
;
return
1
;
}
foreach
(
const
std
::
string
&
f
,
files
)
{
if
(
f
==
"-"
)
{
corpus_stdin
=
true
;
}
else
if
(
boost
::
algorithm
::
ends_with
(
f
,
".xml"
))
{
corpora_files
.
push_back
(
f
);
}
else
{
ccl_files
.
push_back
(
f
);
}
}
// consider stdin only when no corpus files given
corpus_stdin
=
corpus_stdin
&&
corpora_files
.
empty
();
if
(
ccl_files
.
empty
()
||
(
corpora_files
.
empty
()
&&
!
corpus_stdin
))
{
usage
(
argv
[
0
]);
return
2
;
}
try
{
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_load
);
MatchRunner
runner
(
tagset
);
runner
.
use_progress
(
progress
);
foreach
(
const
std
::
string
&
file
,
ccl_files
)
{
runner
.
load_more_rules
(
file
);
}
if
(
!
runner
.
empty
())
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
timer
.
register_signal_handler
();
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
;
writer
=
Corpus2
::
TokenWriter
::
create_stream_writer
(
output_format
,
std
::
cout
,
tagset
);
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
;
foreach
(
std
::
string
cf
,
corpora_files
)
{
reader
=
Corpus2
::
TokenReader
::
create_path_reader
(
input_format
,
tagset
,
cf
);
reader
->
set_option
(
"ann"
);
runner
.
apply_rules
(
reader
,
writer
);
}
if
(
corpus_stdin
)
{
reader
=
Corpus2
::
TokenReader
::
create_stream_reader
(
input_format
,
tagset
,
std
::
cin
);
reader
->
set_option
(
"ann"
);
runner
.
apply_rules
(
reader
,
writer
);
}
if
(
progress
)
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
timer
.
stats
();
}
}
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
std
::
cerr
<<
e
.
info
()
<<
std
::
endl
;
return
2
;
}
return
0
;
}
This diff is collapsed.
Click to expand it.
wccl-apps/wccl-rules.cpp
+
107
−
80
View file @
c5fe1b46
...
@@ -2,79 +2,109 @@
...
@@ -2,79 +2,109 @@
#include
<fstream>
#include
<fstream>
#include
<iomanip>
#include
<iomanip>
#include
<libwccl/values/strset.h>
#include
<libwccl/values/strset.h>
#include
<libwccl/parser/Parser.h>
#include
<libwccl/parser/Parser.h>
#include
<libwccl/ops/tagrulesequence.h>
#include
<libcorpus2/tagsetmanager.h>
#include
<libcorpus2/tagsetmanager.h>
#include
<libcorpus2/util/tokentimer.h>
#include
<libcorpus2/util/tokentimer.h>
#include
<boost/bind.hpp>
#include
<boost/bind.hpp>
#include
<boost/algorithm/string.hpp>
#include
<boost/algorithm/string.hpp>
#include
<boost/make_shared.hpp>
#include
<boost/make_shared.hpp>
#include
<boost/filesystem.hpp>
#include
<boost/program_options.hpp>
#include
<boost/program_options.hpp>
#include
<libcorpus2/io/xcesreader.h>
#include
<libcorpus2/io/reader.h>
#include
<libcorpus2/io/xceswriter.h>
#include
<libcorpus2/io/writer.h>
#include
<antlr/NoViableAltException.hpp>
#include
<antlr/MismatchedTokenException.hpp>
namespace
{
namespace
{
bool
quiet
=
false
;
bool
quiet
=
false
;
bool
progress
=
false
;
struct
options
{
struct
options
{
bool
first
;
bool
first
;
bool
until_done
;
int
until_done_iterations
;
};
};
}
}
bool
load_more_rules
(
Wccl
::
Parser
&
parser
,
const
std
::
string
&
filename
,
Wccl
::
TagRuleSequence
&
rules
)
class
RuleRunner
{
public:
RuleRunner
(
const
Corpus2
::
Tagset
&
tagset
)
:
tagset_
(
tagset
),
parser_
(
tagset_
),
progress_
(
false
)
,
tag_rule_iterations_
(
0
),
total_match_rules_
(
0
),
total_tag_rules_
(
0
)
{
{
boost
::
shared_ptr
<
Wccl
::
TagRuleSequence
>
ret
;
}
void
use_progress
(
bool
use
)
{
progress_
=
use
;
if
(
use
)
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
timer
.
register_signal_handler
();
}
}
void
set_tag_rule_iterations
(
int
i
)
{
tag_rule_iterations_
=
i
;
}
std
::
pair
<
int
,
int
>
load_more_rules
(
const
std
::
string
&
filename
);
void
apply_rules
(
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
,
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
);
/// True when no rules at all (neither match rules nor tag rules) have been
/// loaded, i.e. when size() is zero. The comparison used to be inverted
/// (size() > 0), which made callers testing !empty() skip all processing
/// exactly when rules were present.
bool empty() const {
	return size() == 0;
}
size_t
size
()
const
{
return
total_match_rules_
+
total_tag_rules_
;
}
size_t
total_match_rules
()
const
{
return
total_match_rules_
;
}
size_t
total_tag_rules
()
const
{
return
total_tag_rules_
;
}
private
:
const
Corpus2
::
Tagset
&
tagset_
;
Wccl
::
Parser
parser_
;
std
::
vector
<
std
::
string
>
file_names_
;
std
::
vector
<
boost
::
shared_ptr
<
Wccl
::
WcclFile
>
>
parsed_files_
;
bool
progress_
;
int
tag_rule_iterations_
;
size_t
total_match_rules_
,
total_tag_rules_
;
};
std
::
pair
<
int
,
int
>
RuleRunner
::
load_more_rules
(
const
std
::
string
&
filename
)
{
boost
::
shared_ptr
<
Wccl
::
WcclFile
>
parsed_file
;
try
{
try
{
std
::
ifstream
is
(
filename
.
c_str
());
std
::
ifstream
is
(
filename
.
c_str
());
if
(
!
is
.
good
())
{
if
(
!
is
.
good
())
{
throw
Wccl
::
FileNotFound
(
filename
,
""
,
__FUNCTION__
);
throw
Wccl
::
FileNotFound
(
filename
,
""
,
__FUNCTION__
);
}
}
parsed_file
=
parser_
.
parseWcclFile
(
is
,
"."
);
ret
=
parser
.
parseTagRuleSequence
(
is
);
if
(
parsed_file
)
{
if
(
ret
)
{
boost
::
filesystem
::
path
p
(
filename
);
if
(
!
quiet
)
{
file_names_
.
push_back
(
p
.
stem
());
std
::
cerr
<<
"Loaded "
<<
ret
->
size
()
<<
" rule(s) from "
size_t
match_rules
=
parsed_file
->
get_match_rules
().
size
();
<<
filename
<<
"
\n
"
;
size_t
tag_rules
=
parsed_file
->
get_tag_rules
().
size
();
}
total_match_rules_
+=
match_rules
;
std
::
copy
(
ret
->
begin
(),
ret
->
end
(),
std
::
back_inserter
(
rules
));
total_tag_rules_
+=
tag_rules
;
return
true
;
parsed_files_
.
push_back
(
parsed_file
);
return
std
::
make_pair
(
tag_rules
,
match_rules
);
}
else
{
}
else
{
std
::
cerr
<<
"Problem while parsing -- "
std
::
cerr
<<
"Problem while parsing -- "
<<
"parser returned NULL!"
<<
std
::
endl
;
<<
"parser returned NULL!"
<<
std
::
endl
;
}
}
}
catch
(
antlr
::
MismatchedTokenException
&
e
)
{
std
::
cerr
<<
e
.
getFileLineColumnString
()
<<
" "
<<
e
.
getMessage
()
<<
std
::
endl
;
}
catch
(
antlr
::
NoViableAltException
&
e
)
{
std
::
cerr
<<
e
.
getFileLineColumnString
()
<<
" "
<<
e
.
getMessage
()
<<
std
::
endl
;
}
catch
(
Wccl
::
InvalidVariableName
&
e
)
{
std
::
cerr
<<
"Wccl::InvalidVariableName "
<<
e
.
info
()
<<
std
::
endl
;
}
catch
(
Wccl
::
VariableTypeMismatch
&
e
)
{
std
::
cerr
<<
"Wccl::VariableTypeMismatch "
<<
e
.
info
()
<<
std
::
endl
;
}
catch
(
Wccl
::
WcclError
&
e
)
{
std
::
cerr
<<
"Wccl::WcclError:"
<<
e
.
info
()
<<
std
::
endl
;
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
std
::
cerr
<<
"PwrNlp::PwrNlpError "
<<
e
.
info
()
<<
std
::
endl
;
std
::
cerr
<<
e
.
scope
()
<<
" Error: "
<<
e
.
info
()
<<
std
::
endl
;
}
catch
(
antlr
::
ANTLRException
&
e
)
{
std
::
cerr
<<
"Antlr error "
<<
e
.
getMessage
()
<<
std
::
endl
;
}
}
return
false
;
return
std
::
make_pair
(
0
,
0
)
;
}
}
void
apply_rules
(
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
,
void
RuleRunner
::
apply_rules
(
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
,
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
,
Wccl
::
TagRuleSequence
&
rules
,
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
)
const
options
&
opts
)
{
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
while
(
boost
::
shared_ptr
<
Corpus2
::
Chunk
>
c
=
reader
->
get_next_chunk
())
{
while
(
boost
::
shared_ptr
<
Corpus2
::
Chunk
>
c
=
reader
->
get_next_chunk
())
{
...
@@ -86,45 +116,45 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
...
@@ -86,45 +116,45 @@ void apply_rules(boost::shared_ptr<Corpus2::TokenReader> reader,
"'ann'' option broken?
\n
"
;
"'ann'' option broken?
\n
"
;
return
;
return
;
}
}
if
(
opts
.
until_done
)
{
rules
.
execute_until_done
(
as
,
opts
.
until_done_iterations
);
foreach
(
boost
::
shared_ptr
<
Wccl
::
WcclFile
>&
f
,
parsed_files_
)
{
if
(
tag_rule_iterations_
==
0
)
{
f
->
get_tag_rules_ptr
()
->
execute_once
(
as
);
}
else
if
(
tag_rule_iterations_
<
0
)
{
f
->
get_tag_rules_ptr
()
->
execute_until_done
(
as
);
}
else
{
}
else
{
rules
.
execute_once
(
as
);
f
->
get_tag_rules_ptr
()
->
execute_until_done
(
as
,
tag_rule_iterations_
);
}
f
->
get_match_rules_ptr
()
->
apply_all
(
as
);
}
}
timer
.
count_sentence
(
*
as
);
timer
.
count_sentence
(
*
as
);
if
(
progress
)
{
if
(
progress
_
)
{
timer
.
check_slice
();
timer
.
check_slice
();
}
}
if
(
opts
.
first
)
break
;
//writer->write_sentence(*as);
//writer->write_sentence(*as);
}
}
writer
->
write_chunk
(
*
c
);
writer
->
write_chunk
(
*
c
);
if
(
opts
.
first
)
break
;
}
if
(
progress
)
{
timer
.
stats
();
}
}
}
}
void
usage
(
char
*
name
)
void
usage
(
char
*
name
)
{
{
std
::
cerr
<<
"This program runs WCCL
disambiguation rules
.
\n
"
;
std
::
cerr
<<
"This program runs WCCL
match and/or tag rules. Tag rules are applied first
.
\n
"
;
std
::
cerr
<<
"Usage "
<<
name
<<
" [OPTIONS] FILES
\n
"
std
::
cerr
<<
"Usage "
<<
name
<<
" [OPTIONS] FILES
\n
"
<<
"Files ending with .xml are treated as corpora, otherwise
\n
"
<<
"Files ending with .xml are treated as corpora, otherwise "
<<
"as CCL files. Use - to read corpus from stdin (as with -I)
\n
"
<<
"as
W
CCL files. Use - to read corpus from stdin (as with -I)
\n
"
<<
"Note: the ann option is implied on all input formats
\n
"
;
<<
"Note: the
,
ann option is implied on all input formats
\n
"
;
}
}
int
main
(
int
argc
,
char
**
argv
)
int
main
(
int
argc
,
char
**
argv
)
{
{
std
::
string
tagset_load
=
"kipi"
;
std
::
string
tagset_load
=
"kipi"
;
std
::
string
input_format
;
std
::
string
input_format
;
std
::
string
output_format
;
std
::
string
output_format
;
bool
progress
=
false
;
options
opts
;
options
opts
;
opts
.
first
=
false
;
opts
.
first
=
false
;
opts
.
until_done
=
false
;
opts
.
until_done_iterations
=
1000
;
std
::
vector
<
std
::
string
>
corpora_files
,
ccl_files
,
files
;
std
::
vector
<
std
::
string
>
corpora_files
,
ccl_files
,
files
;
bool
corpus_stdin
=
true
;
bool
corpus_stdin
=
true
;
using
boost
::
program_options
::
value
;
using
boost
::
program_options
::
value
;
...
@@ -145,19 +175,17 @@ int main(int argc, char** argv)
...
@@ -145,19 +175,17 @@ int main(int argc, char** argv)
(
"files,f"
,
value
(
&
files
),
(
"files,f"
,
value
(
&
files
),
"Files to load, looking at the extension to determine type
\n
"
)
"Files to load, looking at the extension to determine type
\n
"
)
(
"corpus-from-stdin,I"
,
value
(
&
corpus_stdin
)
->
zero_tokens
(),
(
"corpus-from-stdin,I"
,
value
(
&
corpus_stdin
)
->
zero_tokens
(),
"Read corpus from stdin"
)
"Read corpus from stdin
(requires that no corpora filenames are passed)
"
)
(
"input-format,i"
,
value
(
&
input_format
)
->
default_value
(
"xces"
),
(
"input-format,i"
,
value
(
&
input_format
)
->
default_value
(
"xces"
),
readers_help
.
c_str
())
readers_help
.
c_str
())
(
"output-format,o"
,
value
(
&
output_format
)
->
default_value
(
"
xces
"
),
(
"output-format,o"
,
value
(
&
output_format
)
->
default_value
(
"
ccl
"
),
writers_help
.
c_str
())
writers_help
.
c_str
())
(
"progress,p"
,
value
(
&
progress
)
->
zero_tokens
(),
(
"progress,p"
,
value
(
&
progress
)
->
zero_tokens
(),
"Show progress info"
)
"Show progress info"
)
(
"quiet,q"
,
value
(
&
quiet
)
->
zero_tokens
(),
(
"quiet,q"
,
value
(
&
quiet
)
->
zero_tokens
(),
"Suppress messages
\n
"
)
"Suppress messages
\n
"
)
(
"until-done,u"
,
value
(
&
opts
.
until_done
)
->
zero_tokens
(),
(
"until-done-iterations,u"
,
value
<
int
>
()
->
implicit_value
(
1000
),
"Until-done mode
\n
"
)
"Until-done iteration limit, no arg for default limit(1000)
\n
"
)
(
"until-done-iterations"
,
value
(
&
opts
.
until_done_iterations
),
"Until-done iteration limit
\n
"
)
(
"first-sentence-only,1"
,
value
(
&
opts
.
first
)
->
zero_tokens
(),
(
"first-sentence-only,1"
,
value
(
&
opts
.
first
)
->
zero_tokens
(),
"Only process first sentence
\n
"
)
"Only process first sentence
\n
"
)
(
"help,h"
,
"Show help"
)
(
"help,h"
,
"Show help"
)
...
@@ -202,32 +230,31 @@ int main(int argc, char** argv)
...
@@ -202,32 +230,31 @@ int main(int argc, char** argv)
try
{
try
{
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_load
);
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_load
);
Wccl
::
Parser
parser
(
tagset
);
RuleRunner
runner
(
tagset
);
Wccl
::
TagRuleSequence
rules
;
runner
.
set_tag_rule_iterations
(
vm
[
"until-done-iterations"
].
as
<
int
>
());
foreach
(
const
std
::
string
&
f
,
ccl_files
)
{
runner
.
use_progress
(
progress
);
size_t
sz
=
rules
.
size
();
foreach
(
const
std
::
string
&
file
,
ccl_files
)
{
if
(
!
load_more_rules
(
parser
,
f
,
rules
))
{
std
::
pair
<
int
,
int
>
res
=
runner
.
load_more_rules
(
file
);
std
::
cerr
<<
"Warning: error while parsing "
<<
f
<<
"
\n
"
;
if
(
res
.
first
==
0
&&
res
.
second
==
0
)
{
}
std
::
cerr
<<
"Warning: no rules loaded from "
<<
file
<<
"
\n
"
;
if
(
rules
.
size
()
==
sz
)
{
}
else
if
(
!
quiet
)
{
std
::
cerr
<<
"Warning: no rules loaded from "
<<
f
<<
"
\n
"
;
std
::
cerr
<<
"Loaded "
<<
res
.
first
<<
" tag rule(s) and "
<<
res
.
second
<<
" match rule(s) from "
<<
file
<<
"
\n
"
;
}
}
}
}
if
(
!
rules
.
empty
())
{
if
(
!
runner
.
empty
())
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
timer
.
register_signal_handler
();
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
;
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
;
writer
=
Corpus2
::
TokenWriter
::
create_stream_writer
(
output_format
,
std
::
cout
,
tagset
);
writer
=
Corpus2
::
TokenWriter
::
create_stream_writer
(
output_format
,
std
::
cout
,
tagset
);
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
;
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
;
foreach
(
const
std
::
string
&
f
,
corpora_files
)
{
foreach
(
std
::
string
c
f
,
corpora_files
)
{
reader
=
Corpus2
::
TokenReader
::
create_path_reader
(
input_format
,
tagset
,
f
);
reader
=
Corpus2
::
TokenReader
::
create_path_reader
(
input_format
,
tagset
,
c
f
);
reader
->
set_option
(
"ann"
);
reader
->
set_option
(
"ann"
);
apply_rules
(
reader
,
writer
,
rules
,
opts
);
runner
.
apply_rules
(
reader
,
writer
);
}
}
if
(
corpus_stdin
)
{
if
(
corpus_stdin
)
{
reader
=
Corpus2
::
TokenReader
::
create_stream_reader
(
input_format
,
tagset
,
std
::
cin
);
reader
=
Corpus2
::
TokenReader
::
create_stream_reader
(
input_format
,
tagset
,
std
::
cin
);
reader
->
set_option
(
"ann"
);
reader
->
set_option
(
"ann"
);
apply_rules
(
reader
,
writer
,
rules
,
opts
);
runner
.
apply_rules
(
reader
,
writer
);
}
}
if
(
progress
)
{
if
(
progress
)
{
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
Corpus2
::
TokenTimer
&
timer
=
Corpus2
::
global_timer
();
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment