Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
9c7dabf1
Commit
9c7dabf1
authored
13 years ago
by
ilor
Browse files
Options
Downloads
Patches
Plain Diff
wccl-features update
parent
4d705f86
Branches
Branches containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
wccl-features/main.cpp
+171
-36
171 additions, 36 deletions
wccl-features/main.cpp
with
171 additions
and
36 deletions
wccl-features/main.cpp
+
171
−
36
View file @
9c7dabf1
...
...
@@ -43,10 +43,10 @@ private:
std
::
ios_base
::
fmtflags
flags_
;
};
class
Runner
class
Feature
Runner
{
public:
Runner
(
const
Corpus2
::
Tagset
&
tagset
)
Feature
Runner
(
const
Corpus2
::
Tagset
&
tagset
)
:
tagset_
(
tagset
),
parser_
(
tagset_
),
token_idx
(
0
)
{
}
...
...
@@ -56,15 +56,26 @@ public:
int
load_operator_string
(
const
std
::
string
&
line
);
void
print_header_head
();
void
print_header_body
(
const
std
::
string
&
attribute_prefix
);
void
print_header_body
(
const
std
::
string
&
attribute_prefix
,
bool
nos
=
false
);
void
print_header_foot
();
void
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
data
);
void
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
,
const
std
::
vector
<
bool
>
rowmask
);
std
::
vector
<
std
::
vector
<
std
::
string
>
>
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
);
std
::
vector
<
std
::
vector
<
std
::
string
>
>
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
);
void
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
,
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
sfeats
,
bool
nos
=
false
);
void
do_stream
(
std
::
istream
&
is
,
bool
first
);
void
do_files
(
std
::
vector
<
std
::
string
>&
files
,
bool
first
);
bool
empty
()
{
return
bool_ops_
.
empty
()
&&
str_ops_
.
empty
()
&&
tset_ops_
.
empty
();
}
...
...
@@ -93,7 +104,7 @@ private:
int
token_idx
;
};
int
Runner
::
load_more_operators
(
const
std
::
string
&
filename
)
int
Feature
Runner
::
load_more_operators
(
const
std
::
string
&
filename
)
{
int
ops_parsed
=
0
;
...
...
@@ -117,7 +128,7 @@ int Runner::load_more_operators(const std::string& filename)
return
ops_parsed
;
}
int
Runner
::
load_operator_string
(
const
std
::
string
&
line
)
int
Feature
Runner
::
load_operator_string
(
const
std
::
string
&
line
)
{
int
ops_loaded
=
0
;
boost
::
regex
e
(
"(STRING|BOOL|MASK
\\
h([a-z@,]+))
\\
h+"
...
...
@@ -174,59 +185,93 @@ int Runner::load_operator_string(const std::string &line)
return
ops_loaded
;
}
void
Runner
::
print_header_head
()
void
Feature
Runner
::
print_header_head
()
{
std
::
cout
<<
"% Generated by wccl-features
\n
"
;
std
::
cout
<<
"@RELATION wccl
\n
"
;
std
::
cout
<<
"
\n
"
;
}
void
Runner
::
print_header_body
(
const
std
::
string
&
attribute_prefix
)
void
FeatureRunner
::
print_header_body
(
const
std
::
string
&
attribute_prefix
,
bool
nos
/*=false*/
)
{
foreach
(
const
str_ops_map_t
::
value_type
v
,
str_ops_
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
" string
\n
"
;
if
(
!
nos
)
{
foreach
(
const
str_ops_map_t
::
value_type
v
,
str_ops_
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
" string
\n
"
;
}
}
foreach
(
const
bool_ops_map_t
::
value_type
v
,
bool_ops_
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
"
class
{0,1}
\n
"
;
<<
attribute_prefix
<<
v
.
first
<<
" {0,1}
\n
"
;
}
foreach
(
const
tset_ops_map_t
::
value_type
v
,
tset_ops_
)
{
foreach
(
const
Corpus2
::
Tag
&
tag
,
v
.
second
.
first
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
"_"
<<
tagset_
.
tag_to_symbol_string
(
tag
)
<<
"
class
{0,1}
\n
"
;
<<
tagset_
.
tag_to_symbol_string
(
tag
)
<<
" {0,1}
\n
"
;
}
}
}
void
Runner
::
print_header_foot
()
void
Feature
Runner
::
print_header_foot
()
{
std
::
cout
<<
"
\n
@DATA
\n
"
;
}
void
Runner
::
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
)
void
Feature
Runner
::
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
)
{
foreach
(
const
std
::
vector
<
std
::
string
>&
feats
,
data
)
{
std
::
cout
<<
boost
::
algorithm
::
join
(
feats
,
","
)
<<
"
\n
"
;
}
}
std
::
vector
<
std
::
vector
<
std
::
string
>
>
Runner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
)
void
FeatureRunner
::
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
,
const
std
::
vector
<
bool
>
rowmask
)
{
Wccl
::
SentenceContext
sc
(
sentence
);
assert
(
data
.
size
()
==
rowmask
.
size
());
for
(
size_t
i
=
0
;
i
<
data
.
size
();
++
i
)
{
if
(
rowmask
[
i
])
{
std
::
cout
<<
boost
::
algorithm
::
join
(
data
[
i
],
","
)
<<
"
\n
"
;
}
}
}
std
::
vector
<
std
::
vector
<
std
::
string
>
>
FeatureRunner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
)
{
std
::
vector
<
std
::
vector
<
std
::
string
>
>
sfeats
;
do_sentence
(
sentence
,
sfeats
);
return
sfeats
;
}
void
FeatureRunner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
,
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
sfeats
,
bool
nos
/*=false*/
)
{
Wccl
::
SentenceContext
sc
(
sentence
);
while
(
sc
.
is_current_inside
())
{
sfeats
.
resize
(
sfeats
.
size
()
+
1
);
std
::
vector
<
std
::
string
>&
feats
=
sfeats
.
back
();
foreach
(
const
str_ops_map_t
::
value_type
v
,
str_ops_
)
{
boost
::
shared_ptr
<
const
Wccl
::
StrSet
>
s
=
v
.
second
->
apply
(
sc
);
assert
(
s
);
if
(
s
->
contents
().
empty
())
{
feats
.
push_back
(
"
\"\"
"
);
}
else
{
feats
.
push_back
(
"
\"
"
+
PwrNlp
::
to_utf8
(
*
s
->
contents
().
begin
())
+
"
\"
"
);
if
(
sfeats
.
size
()
<
static_cast
<
size_t
>
(
sc
.
get_position
()
+
1
))
{
sfeats
.
resize
(
sc
.
get_position
()
+
1
);
}
assert
(
!
sfeats
.
empty
());
std
::
vector
<
std
::
string
>&
feats
=
sfeats
[
sc
.
get_position
()];
if
(
!
nos
)
{
foreach
(
const
str_ops_map_t
::
value_type
v
,
str_ops_
)
{
boost
::
shared_ptr
<
const
Wccl
::
StrSet
>
s
=
v
.
second
->
apply
(
sc
);
assert
(
s
);
if
(
s
->
contents
().
empty
())
{
feats
.
push_back
(
"
\"\"
"
);
}
else
{
feats
.
push_back
(
"
\"
"
+
boost
::
algorithm
::
replace_all_copy
(
PwrNlp
::
to_utf8
(
*
s
->
contents
().
begin
()),
"
\"
"
,
"
\\\"
"
)
+
"
\"
"
);
}
}
}
foreach
(
const
bool_ops_map_t
::
value_type
v
,
bool_ops_
)
{
...
...
@@ -251,11 +296,10 @@ std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_
}
sc
.
advance
();
}
return
sfeats
;
}
void
Runner
::
do_stream
(
std
::
istream
&
is
,
bool
first
)
void
Feature
Runner
::
do_stream
(
std
::
istream
&
is
,
bool
first
)
{
Corpus2
::
XcesReader
xr
(
tagset_
,
is
);
Corpus2
::
Sentence
::
Ptr
s
;
...
...
@@ -269,7 +313,98 @@ void Runner::do_stream(std::istream& is, bool first)
}
}
//void Runner::do_files(std::istream& is, bool first)
void
FeatureRunner
::
do_files
(
std
::
vector
<
std
::
string
>&
files
,
bool
first
)
{
std
::
vector
<
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
>
readers
;
if
(
files
.
size
()
<
2
)
return
;
readers
.
push_back
(
Corpus2
::
TokenReader
::
create_path_reader
(
"xces"
,
tagset_
,
files
[
0
]));
for
(
size_t
i
=
1
;
i
<
files
.
size
();
++
i
)
{
readers
.
push_back
(
Corpus2
::
TokenReader
::
create_path_reader
(
"xces,disamb_only"
,
tagset_
,
files
[
i
]));
}
print_header_head
();
for
(
size_t
i
=
0
;
i
<
files
.
size
();
++
i
)
{
print_header_body
(
"T"
+
boost
::
lexical_cast
<
std
::
string
>
(
i
)
+
"_"
);
}
std
::
cout
<<
"@ATTRIBUTE correct {0"
;
for
(
size_t
si
=
1
;
si
<
files
.
size
();
++
si
)
{
//std::cout << "@ATTRIBUTE tag" << si << "ok " << "{0,1}" << "\n";
std
::
cout
<<
","
<<
si
;
}
std
::
cout
<<
"}
\n
"
;
print_header_foot
();
bool
more
=
!
first
;
int
processed
=
0
;
do
{
std
::
vector
<
Corpus2
::
Sentence
::
Ptr
>
sentences
;
foreach
(
const
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>&
r
,
readers
)
{
Corpus2
::
Sentence
::
Ptr
s
=
r
->
get_next_sentence
();
if
(
s
)
{
sentences
.
push_back
(
s
);
}
}
if
(
sentences
.
size
()
==
readers
.
size
())
{
std
::
vector
<
std
::
vector
<
std
::
string
>
>
data
;
std
::
vector
<
bool
>
rowmask
;
size_t
gold_size
=
sentences
[
0
]
->
size
();
for
(
size_t
si
=
1
;
si
<
sentences
.
size
();
++
si
)
{
if
(
sentences
[
si
]
->
size
()
!=
gold_size
)
{
std
::
cerr
<<
"Sentence size mismatch at "
<<
processed
<<
" "
<<
si
<<
"
\n
"
;
return
;
}
}
data
.
resize
(
gold_size
);
rowmask
.
resize
(
gold_size
);
do_sentence
(
sentences
[
0
],
data
,
false
);
for
(
size_t
si
=
1
;
si
<
sentences
.
size
();
++
si
)
{
do_sentence
(
sentences
[
si
],
data
,
false
);
}
int
maxv
=
0
;
for
(
size_t
i
=
0
;
i
<
gold_size
;
++
i
)
{
std
::
set
<
Corpus2
::
Tag
>
gold_tags
;
const
Corpus2
::
Token
&
gold_token
=
*
(
*
sentences
[
0
])[
i
];
foreach
(
const
Corpus2
::
Lexeme
&
gl
,
gold_token
.
disamb_lexemes
())
{
gold_tags
.
insert
(
gl
.
tag
());
}
int
wci
=
0
;
std
::
map
<
Corpus2
::
Tag
,
int
>
v
;
for
(
size_t
si
=
1
;
si
<
sentences
.
size
();
++
si
)
{
const
Corpus2
::
Token
&
token
=
*
(
*
sentences
[
si
])[
i
];
bool
wc
=
false
;
foreach
(
const
Corpus2
::
Lexeme
&
gl
,
token
.
lexemes
())
{
if
(
gold_tags
.
find
(
gl
.
tag
())
!=
gold_tags
.
end
())
{
wc
=
true
;
wci
=
si
;
}
v
[
gl
.
tag
()]
++
;
maxv
=
std
::
max
(
maxv
,
v
[
gl
.
tag
()]);
}
//data[i].push_back(wc ? "1" : "0");
}
typedef
std
::
pair
<
Corpus2
::
Tag
,
int
>
pp
;
int
mv
=
0
;
bool
tie
=
false
;
foreach
(
const
pp
&
p
,
v
)
{
if
(
p
.
second
==
mv
)
{
tie
=
true
;
}
else
if
(
p
.
second
>
mv
)
{
tie
=
false
;
mv
=
p
.
second
;
}
}
data
[
i
].
push_back
(
boost
::
lexical_cast
<
std
::
string
>
(
wci
));
rowmask
[
i
]
=
tie
;
}
print_data
(
data
,
rowmask
);
++
processed
;
}
else
{
more
=
false
;
}
}
while
(
more
);
}
int
main
(
int
argc
,
char
**
argv
)
...
...
@@ -348,7 +483,7 @@ int main(int argc, char** argv)
}
try
{
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_load
);
Runner
runner
(
tagset
);
Feature
Runner
runner
(
tagset
);
foreach
(
const
std
::
string
&
f
,
operator_strings
)
{
if
(
boost
::
algorithm
::
ends_with
(
f
,
".ccl"
))
{
if
(
!
runner
.
load_more_operators
(
f
))
{
...
...
@@ -357,16 +492,16 @@ int main(int argc, char** argv)
}
}
if
(
!
runner
.
empty
())
{
foreach
(
const
std
::
string
&
f
,
corpora_files
)
{
std
::
ifstream
ifs
(
f
.
c_str
());
if
(
corpora_files
.
size
()
==
1
)
{
std
::
ifstream
ifs
(
corpora_files
[
0
]
.
c_str
());
if
(
ifs
.
good
())
{
runner
.
do_stream
(
ifs
,
first
);
}
else
{
std
::
cerr
<<
"Error reading corpus from "
<<
f
<<
"
\n
"
;
std
::
cerr
<<
"Error reading corpus from "
<<
corpora_files
[
0
]
<<
"
\n
"
;
}
}
if
(
corpus_stdin
)
{
runner
.
do_stream
(
std
::
cin
,
first
);
}
else
{
runner
.
do_files
(
corpora_files
,
first
);
}
}
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment