Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
c79eaf53
Commit
c79eaf53
authored
14 years ago
by
ilor
Browse files
Options
Downloads
Patches
Plain Diff
wccl-features
parent
8ef00260
Branches
Branches containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
CMakeLists.txt
+1
-0
1 addition, 0 deletions
CMakeLists.txt
wccl-features/CMakeLists.txt
+31
-0
31 additions, 0 deletions
wccl-features/CMakeLists.txt
wccl-features/main.cpp
+375
-0
375 additions, 0 deletions
wccl-features/main.cpp
with
407 additions
and
0 deletions
CMakeLists.txt
+
1
−
0
View file @
c79eaf53
...
...
@@ -61,4 +61,5 @@ add_subdirectory(libwccl)
# Sub-projects of the build; each add_subdirectory() call pulls in the
# CMakeLists.txt of the named directory. wccl-features is listed here
# together with the other subdirectories, before tests.
add_subdirectory(wcclparser)
add_subdirectory(wcclrun)
add_subdirectory(wcclrules)
add_subdirectory(wccl-features)
add_subdirectory(tests)
This diff is collapsed.
Click to expand it.
wccl-features/CMakeLists.txt
0 → 100644
+
31
−
0
View file @
c79eaf53
# Build configuration for the wccl-features executable (single source file:
# main.cpp), linked against wccl, Boost, antlr and the libraries collected
# in LIBS below.
PROJECT(wccl-features)

# libedit is optional: when found, HAVE_LIBEDIT is defined and its libraries
# are appended to LIBS.
find_package(Libedit)
if (Libedit_FOUND)
	message(STATUS "Building with libedit")
	add_definitions(-DHAVE_LIBEDIT)
	set(LIBS ${LIBS} ${Libedit_LIBRARIES})
endif (Libedit_FOUND)

# LibXML++ is mandatory.
find_package(LibXML++ REQUIRED)
include_directories(${LibXML++_INCLUDE_DIRS})
link_directories(${LibXML++_LIBRARY_DIRS})
set(LIBS ${LIBS} ${LibXML++_LIBRARIES})

include_directories(${CMAKE_SOURCE_DIR})

# Boost search paths have to be registered before the target is created:
# link_directories() only applies to targets created after the call, so
# placing it behind add_executable() leaves it without effect.
include_directories(${Boost_INCLUDE_DIR})
link_directories(${Boost_LIBRARY_DIRS})

add_definitions(-DLIBWCCL_WCCLRUN_DATA_DIR="${PROJECT_SOURCE_DIR}/")

add_executable(wccl-features main.cpp)
target_link_libraries(wccl-features wccl ${Boost_LIBRARIES} antlr ${LIBS})

# The binary is only installed on UNIX platforms.
if (UNIX)
	install(TARGETS wccl-features
		RUNTIME DESTINATION bin
	)
endif (UNIX)
This diff is collapsed.
Click to expand it.
wccl-features/main.cpp
0 → 100644
+
375
−
0
View file @
c79eaf53
#include
<cstdlib>
#include
<cstdio>
#include
<fstream>
#include
<iomanip>
#include
<libwccl/values/strset.h>
#include
<libwccl/parser/Parser.h>
#include
<libcorpus2/tagsetmanager.h>
#include
<boost/bind.hpp>
#include
<boost/algorithm/string.hpp>
#include
<boost/make_shared.hpp>
#include
<boost/program_options.hpp>
#include
<boost/filesystem.hpp>
#include
<libcorpus2/io/xcesreader.h>
#include
<boost/lexical_cast.hpp>
#include
<boost/regex.hpp>
#include
<antlr/NoViableAltException.hpp>
#include
<antlr/MismatchedTokenException.hpp>
// File-local option flags. main() binds each of them to a command-line
// option; the quoted text is the description given to that option.
namespace {

// Flags that are off unless requested.
bool quiet(false);             // --quiet, -q: "Suppress messages"
bool tabs(false);              // --tabs: "Output a tab-separated file"
bool output_variables(false);  // --output-variables, -V: "Output operator variables"
bool global_numbering(false);  // --global-counts, -g: "Output global counts"

// Flags that are on by default.
bool output_orths(true);           // --output-orths, -O: "Output token orths"
bool output_header(true);          // --output-header, -H: "Output table header"
bool in_sentence_numbering(true);  // --local-counts, -l: "Output in-sentence token counts"

} // anonymous namespace
/**
 * Scope guard for the formatting flags of an output stream.
 *
 * The constructor records the flags of the stream it is given, the
 * destructor writes the recorded flags back. Only the value returned by
 * flags() is saved and restored.
 */
class streamsave
{
public:
	/// Record the current format flags of @p os; @p os must outlive the
	/// guard because only a reference to it is kept.
	explicit streamsave(std::ostream& os) : os_(os), flags_(os.flags()) {}

	/// Put the recorded flags back on the stream.
	~streamsave() { os_.flags(flags_); }

private:
	// Not copyable: a copy would hold the same stream reference and restore
	// the same flags a second time. Declared but intentionally not defined.
	streamsave(const streamsave&);
	streamsave& operator=(const streamsave&);

	std::ostream& os_;
	std::ios_base::fmtflags flags_;
};
class
Runner
{
public:
Runner
(
const
Corpus2
::
Tagset
&
tagset
)
:
tagset_
(
tagset
),
parser_
(
tagset_
),
token_idx
(
0
)
{
}
int
load_more_operators
(
const
std
::
string
&
filename
);
int
load_operator_string
(
const
std
::
string
&
line
);
void
print_header_head
();
void
print_header_body
(
const
std
::
string
&
attribute_prefix
);
void
print_header_foot
();
void
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
data
);
std
::
vector
<
std
::
vector
<
std
::
string
>
>
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
);
void
do_stream
(
std
::
istream
&
is
,
bool
first
);
bool
empty
()
{
return
bool_ops_
.
empty
()
&&
str_ops_
.
empty
()
&&
tset_ops_
.
empty
();
}
private
:
const
Corpus2
::
Tagset
&
tagset_
;
Wccl
::
Parser
parser_
;
typedef
std
::
map
<
std
::
string
,
boost
::
shared_ptr
<
Wccl
::
Operator
<
Wccl
::
Bool
>
>
>
bool_ops_map_t
;
bool_ops_map_t
bool_ops_
;
typedef
std
::
map
<
std
::
string
,
boost
::
shared_ptr
<
Wccl
::
Operator
<
Wccl
::
StrSet
>
>
>
str_ops_map_t
;
str_ops_map_t
str_ops_
;
typedef
std
::
map
<
std
::
string
,
std
::
pair
<
std
::
set
<
Corpus2
::
Tag
>
,
boost
::
shared_ptr
<
Wccl
::
Operator
<
Wccl
::
TSet
>
>
>
>
tset_ops_map_t
;
tset_ops_map_t
tset_ops_
;
int
token_idx
;
};
/**
 * Read operator definitions from a file, one definition per line.
 *
 * Lines shorter than three characters and lines starting with '#' are
 * skipped. Every other line is handed to load_operator_string(); lines that
 * yield no operator are reported on stderr together with their line number.
 *
 * @param filename path of the operator file
 * @return total number of operators loaded from the file
 * @throws Wccl::FileNotFound when the file cannot be opened for reading
 */
int Runner::load_more_operators(const std::string& filename)
{
	std::ifstream input(filename.c_str());
	if (!input.good()) {
		throw Wccl::FileNotFound(filename, "", __FUNCTION__);
	}

	int total = 0;
	std::string text;
	// lineno is 1-based and counts skipped lines as well
	for (int lineno = 1; std::getline(input, text); ++lineno) {
		const bool skipped = text.size() < 3 || text[0] == '#';
		if (skipped) {
			continue;
		}
		const int count = load_operator_string(text);
		if (count <= 0) {
			std::cerr << "Line " << lineno << " did not match: " << text << "\n";
			continue;
		}
		total += count;
	}
	return total;
}
int
Runner
::
load_operator_string
(
const
std
::
string
&
line
)
{
int
ops_loaded
=
0
;
boost
::
regex
e
(
"(STRING|BOOL|MASK
\\
h([a-z@,]+))
\\
h+"
"(?:name:([a-zA-Z0-9_-]+)
\\
h)?"
"(?:range:([0-9-]+):([0-9-]+)
\\
h)?"
"(.*)"
);
boost
::
smatch
what
;
if
(
boost
::
regex_match
(
line
,
what
,
e
,
boost
::
match_extra
))
{
try
{
const
std
::
string
&
orig_name
=
what
[
3
].
matched
?
what
[
3
]
:
what
[
6
];
const
std
::
string
&
orig_op_string
=
what
[
6
];
std
::
vector
<
std
::
string
>
op_strings
;
std
::
vector
<
std
::
string
>
names
;
if
(
what
[
4
].
matched
)
{
int
rfrom
=
boost
::
lexical_cast
<
int
>
(
what
[
4
]);
int
rto
=
boost
::
lexical_cast
<
int
>
(
what
[
5
]);
for
(
int
i
=
rfrom
;
i
<=
rto
;
++
i
)
{
std
::
string
pos
=
boost
::
lexical_cast
<
std
::
string
>
(
i
);
op_strings
.
push_back
(
boost
::
algorithm
::
replace_all_copy
(
orig_op_string
,
"_R_"
,
pos
));
names
.
push_back
(
orig_name
+
pos
);
}
}
else
{
op_strings
.
push_back
(
orig_op_string
);
names
.
push_back
(
orig_name
);
}
for
(
size_t
opi
=
0
;
opi
<
op_strings
.
size
();
++
opi
)
{
const
std
::
string
&
name
=
names
[
opi
];
const
std
::
string
&
op_string
=
op_strings
[
opi
];
if
(
what
[
1
]
==
"STRING"
)
{
str_ops_
.
insert
(
std
::
make_pair
(
name
,
parser_
.
parseStringOperator
(
op_string
)));
++
ops_loaded
;
}
else
if
(
what
[
1
]
==
"BOOL"
)
{
bool_ops_
.
insert
(
std
::
make_pair
(
name
,
parser_
.
parseBoolOperator
(
op_string
)));
++
ops_loaded
;
}
else
{
Corpus2
::
Tag
tag
=
tagset_
.
parse_symbol_string
(
what
[
2
]);
std
::
vector
<
std
::
string
>
sym
=
tagset_
.
tag_to_symbol_string_vector
(
tag
,
false
);
std
::
set
<
Corpus2
::
Tag
>
t
;
foreach
(
const
std
::
string
&
s
,
sym
)
{
t
.
insert
(
tagset_
.
parse_symbol
(
s
));
}
tset_ops_
.
insert
(
std
::
make_pair
(
name
,
std
::
make_pair
(
t
,
parser_
.
parseSymSetOperator
(
op_string
))));
++
ops_loaded
;
}
}
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
std
::
cerr
<<
e
.
scope
()
<<
" error: "
<<
e
.
info
()
<<
std
::
endl
;
}
}
return
ops_loaded
;
}
/**
 * Write the opening of the header to stdout: a "%" comment line naming the
 * generator, the @RELATION line and an empty line.
 */
void Runner::print_header_head()
{
	std::cout << "% Generated by wccl-features\n"
	          << "@RELATION wccl\n"
	          << "\n";
}
/**
 * Write one @ATTRIBUTE line per feature column to stdout.
 *
 * Columns are emitted in this order: all string operators (type "string"),
 * all boolean operators (type "class {0,1}"), then for every tag-set
 * operator one "class {0,1}" column per tag, named "<operator>_<tag>".
 * This is the same order in which do_sentence() fills a row.
 *
 * @param attribute_prefix text put in front of every attribute name
 */
void Runner::print_header_body(const std::string& attribute_prefix)
{
	// Map entries are bound by const reference so that iterating does not
	// copy the key string, the shared_ptr or the tag set of each entry.
	foreach (const str_ops_map_t::value_type& v, str_ops_) {
		std::cout << "@ATTRIBUTE "
			<< attribute_prefix << v.first << " string\n";
	}
	foreach (const bool_ops_map_t::value_type& v, bool_ops_) {
		std::cout << "@ATTRIBUTE "
			<< attribute_prefix << v.first << " class {0,1}\n";
	}
	foreach (const tset_ops_map_t::value_type& v, tset_ops_) {
		foreach (const Corpus2::Tag& tag, v.second.first) {
			std::cout << "@ATTRIBUTE "
				<< attribute_prefix << v.first << "_"
				<< tagset_.tag_to_symbol_string(tag) << " class {0,1}\n";
		}
	}
}
/**
 * Write the end of the header to stdout: an empty line followed by the
 * @DATA line.
 */
void Runner::print_header_foot()
{
	static const char footer[] = "\n@DATA\n";
	std::cout << footer;
}
void
Runner
::
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
)
{
foreach
(
const
std
::
vector
<
std
::
string
>&
feats
,
data
)
{
std
::
cout
<<
boost
::
algorithm
::
join
(
feats
,
","
)
<<
"
\n
"
;
}
}
/**
 * Evaluate every loaded operator at each token position of a sentence.
 *
 * A row is produced for each position the sentence context reports as
 * inside the sentence. The values in a row follow the column order of
 * print_header_body():
 *   - string operators: the first element of the resulting string set in
 *     double quotes, or "" (an empty quoted string) when the set is empty
 *   - boolean operators: "1" or "0"
 *   - tag-set operators: for every tag stored with the operator, "1" when
 *     masking the tag with the operator's result is not null, else "0"
 *
 * @param sentence the sentence to process
 * @return one row of feature values per token position
 */
std::vector<std::vector<std::string> > Runner::do_sentence(
	const boost::shared_ptr<Corpus2::Sentence>& sentence)
{
	Wccl::SentenceContext sc(sentence);
	std::vector<std::vector<std::string> > sfeats;
	while (sc.is_current_inside()) {
		sfeats.resize(sfeats.size() + 1);
		std::vector<std::string>& feats = sfeats.back();

		// Map entries are bound by const reference: this loop runs once per
		// token, and iterating by value would copy every key string,
		// shared_ptr and tag set on each pass.
		foreach (const str_ops_map_t::value_type& v, str_ops_) {
			boost::shared_ptr<const Wccl::StrSet> s = v.second->apply(sc);
			assert(s);
			if (s->contents().empty()) {
				feats.push_back("\"\"");
			} else {
				// only the first element of the set is written out
				feats.push_back("\""
					+ PwrNlp::to_utf8(*s->contents().begin()) + "\"");
			}
		}

		foreach (const bool_ops_map_t::value_type& v, bool_ops_) {
			boost::shared_ptr<const Wccl::Bool> b = v.second->apply(sc);
			assert(b);
			if (*b) {
				feats.push_back("1");
			} else {
				feats.push_back("0");
			}
		}

		foreach (const tset_ops_map_t::value_type& v, tset_ops_) {
			boost::shared_ptr<const Wccl::TSet> t = v.second.second->apply(sc);
			assert(t);
			foreach (const Corpus2::Tag& tag, v.second.first) {
				if (!tag.get_masked(t->contents()).is_null()) {
					feats.push_back("1");
				} else {
					feats.push_back("0");
				}
			}
		}

		sc.advance();
	}
	return sfeats;
}
void
Runner
::
do_stream
(
std
::
istream
&
is
,
bool
first
)
{
Corpus2
::
XcesReader
xr
(
tagset_
,
is
);
Corpus2
::
Sentence
::
Ptr
s
;
print_header_head
();
print_header_body
(
""
);
print_header_foot
();
while
((
s
=
xr
.
get_next_sentence
()))
{
print_data
(
do_sentence
(
s
));
std
::
cout
<<
"
\n
"
;
if
(
first
)
break
;
}
}
//void Runner::do_files(std::istream& is, bool first)
int
main
(
int
argc
,
char
**
argv
)
{
std
::
string
tagset_load
=
"kipi"
;
bool
first
=
false
;
std
::
vector
<
std
::
string
>
corpora_files
,
files
,
operator_strings
;
bool
corpus_stdin
=
false
;
using
boost
::
program_options
::
value
;
boost
::
program_options
::
options_description
desc
(
"Allowed options"
);
desc
.
add_options
()
(
"tagset,t"
,
value
(
&
tagset_load
),
"Tagset to use"
)
(
"corpus,c"
,
value
(
&
corpora_files
),
"Corpus file to load (XCES)"
)
(
"ccl-operator,C"
,
value
(
&
operator_strings
),
"CCL operator file or string"
)
(
"files,f"
,
value
(
&
files
),
"Files to load, looking at the extension to determine type"
)
(
"corpus-from-stdin,I"
,
value
(
&
corpus_stdin
)
->
zero_tokens
(),
"Read corpus from stdin"
)
(
"quiet,q"
,
value
(
&
quiet
)
->
zero_tokens
(),
"Suppress messages"
)
(
"first-sentence-only,1"
,
value
(
&
first
)
->
zero_tokens
(),
"Only process first sentence"
)
(
"tabs"
,
value
(
&
tabs
)
->
zero_tokens
(),
"Output a tab-separated file"
)
(
"local-counts,l"
,
value
(
&
in_sentence_numbering
),
"Output in-sentence token counts"
)
(
"global-counts,g"
,
value
(
&
global_numbering
),
"Output global counts"
)
(
"output-orths,O"
,
value
(
&
output_orths
),
"Output token orths"
)
(
"output-variables,V"
,
value
(
&
output_variables
),
"Output operator variables"
)
(
"output-header,H"
,
value
(
&
output_header
),
"Output table header"
)
(
"help,h"
,
"Show help"
)
;
boost
::
program_options
::
variables_map
vm
;
boost
::
program_options
::
positional_options_description
p
;
p
.
add
(
"files"
,
-
1
);
try
{
boost
::
program_options
::
store
(
boost
::
program_options
::
command_line_parser
(
argc
,
argv
)
.
options
(
desc
).
positional
(
p
).
run
(),
vm
);
}
catch
(
boost
::
program_options
::
error
&
e
)
{
std
::
cerr
<<
e
.
what
()
<<
std
::
endl
;
return
2
;
}
boost
::
program_options
::
notify
(
vm
);
if
(
vm
.
count
(
"help"
))
{
std
::
cerr
<<
"Usage "
<<
argv
[
0
]
<<
" [OPTIONS] FILES
\n
"
<<
"Files ending with .xml are treated as corpora, otherwise
\n
"
<<
"as CCL files. Use - to read corpus from stdin (as with -I)
\n
"
<<
"Files not ending with an extension are treated as raw operator strings
\n
"
;
std
::
cout
<<
desc
<<
"
\n
"
;
return
1
;
}
foreach
(
const
std
::
string
&
f
,
files
)
{
if
(
f
==
"-"
)
{
corpus_stdin
=
true
;
}
else
if
(
boost
::
algorithm
::
ends_with
(
f
,
".xml"
))
{
corpora_files
.
push_back
(
f
);
}
else
{
operator_strings
.
push_back
(
f
);
}
}
if
((
corpora_files
.
empty
()
&&
!
corpus_stdin
)
||
(
operator_strings
.
empty
()
&&
!
output_orths
))
{
std
::
cerr
<<
"Nothing to do, try "
<<
argv
[
0
]
<<
" -h
\n
"
;
return
2
;
}
try
{
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_load
);
Runner
runner
(
tagset
);
foreach
(
const
std
::
string
&
f
,
operator_strings
)
{
if
(
boost
::
algorithm
::
ends_with
(
f
,
".ccl"
))
{
if
(
!
runner
.
load_more_operators
(
f
))
{
std
::
cerr
<<
"Warning: error while parsing "
<<
f
<<
"
\n
"
;
}
}
}
if
(
!
runner
.
empty
())
{
foreach
(
const
std
::
string
&
f
,
corpora_files
)
{
std
::
ifstream
ifs
(
f
.
c_str
());
if
(
ifs
.
good
())
{
runner
.
do_stream
(
ifs
,
first
);
}
else
{
std
::
cerr
<<
"Error reading corpus from "
<<
f
<<
"
\n
"
;
}
}
if
(
corpus_stdin
)
{
runner
.
do_stream
(
std
::
cin
,
first
);
}
}
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
std
::
cerr
<<
e
.
info
()
<<
std
::
endl
;
return
2
;
}
return
0
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment