Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
W
WCCL
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
WCCL
Commits
9c7dabf1
Commit
9c7dabf1
authored
Apr 19, 2011
by
ilor
Browse files
Options
Downloads
Patches
Plain Diff
wccl-features update
parent
4d705f86
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
wccl-features/main.cpp
+171
-36
171 additions, 36 deletions
wccl-features/main.cpp
with
171 additions
and
36 deletions
wccl-features/main.cpp
+
171
−
36
View file @
9c7dabf1
...
...
@@ -43,10 +43,10 @@ private:
std
::
ios_base
::
fmtflags
flags_
;
};
class
Runner
class
Feature
Runner
{
public:
Runner
(
const
Corpus2
::
Tagset
&
tagset
)
Feature
Runner
(
const
Corpus2
::
Tagset
&
tagset
)
:
tagset_
(
tagset
),
parser_
(
tagset_
),
token_idx
(
0
)
{
}
...
...
@@ -56,15 +56,26 @@ public:
int
load_operator_string
(
const
std
::
string
&
line
);
void
print_header_head
();
void
print_header_body
(
const
std
::
string
&
attribute_prefix
);
void
print_header_body
(
const
std
::
string
&
attribute_prefix
,
bool
nos
=
false
);
void
print_header_foot
();
void
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
data
);
void
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
,
const
std
::
vector
<
bool
>
rowmask
);
std
::
vector
<
std
::
vector
<
std
::
string
>
>
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
);
std
::
vector
<
std
::
vector
<
std
::
string
>
>
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
);
void
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
,
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
sfeats
,
bool
nos
=
false
);
void
do_stream
(
std
::
istream
&
is
,
bool
first
);
void
do_files
(
std
::
vector
<
std
::
string
>&
files
,
bool
first
);
bool
empty
()
{
return
bool_ops_
.
empty
()
&&
str_ops_
.
empty
()
&&
tset_ops_
.
empty
();
}
...
...
@@ -93,7 +104,7 @@ private:
int
token_idx
;
};
int
Runner
::
load_more_operators
(
const
std
::
string
&
filename
)
int
Feature
Runner
::
load_more_operators
(
const
std
::
string
&
filename
)
{
int
ops_parsed
=
0
;
...
...
@@ -117,7 +128,7 @@ int Runner::load_more_operators(const std::string& filename)
return
ops_parsed
;
}
int
Runner
::
load_operator_string
(
const
std
::
string
&
line
)
int
Feature
Runner
::
load_operator_string
(
const
std
::
string
&
line
)
{
int
ops_loaded
=
0
;
boost
::
regex
e
(
"(STRING|BOOL|MASK
\\
h([a-z@,]+))
\\
h+"
...
...
@@ -174,59 +185,93 @@ int Runner::load_operator_string(const std::string &line)
return
ops_loaded
;
}
void
Runner
::
print_header_head
()
void
Feature
Runner
::
print_header_head
()
{
std
::
cout
<<
"% Generated by wccl-features
\n
"
;
std
::
cout
<<
"@RELATION wccl
\n
"
;
std
::
cout
<<
"
\n
"
;
}
void
Runner
::
print_header_body
(
const
std
::
string
&
attribute_prefix
)
void
FeatureRunner
::
print_header_body
(
const
std
::
string
&
attribute_prefix
,
bool
nos
/*=false*/
)
{
if
(
!
nos
)
{
foreach
(
const
str_ops_map_t
::
value_type
v
,
str_ops_
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
" string
\n
"
;
}
}
foreach
(
const
bool_ops_map_t
::
value_type
v
,
bool_ops_
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
"
class
{0,1}
\n
"
;
<<
attribute_prefix
<<
v
.
first
<<
" {0,1}
\n
"
;
}
foreach
(
const
tset_ops_map_t
::
value_type
v
,
tset_ops_
)
{
foreach
(
const
Corpus2
::
Tag
&
tag
,
v
.
second
.
first
)
{
std
::
cout
<<
"@ATTRIBUTE "
<<
attribute_prefix
<<
v
.
first
<<
"_"
<<
tagset_
.
tag_to_symbol_string
(
tag
)
<<
"
class
{0,1}
\n
"
;
<<
tagset_
.
tag_to_symbol_string
(
tag
)
<<
" {0,1}
\n
"
;
}
}
}
void
Runner
::
print_header_foot
()
void
Feature
Runner
::
print_header_foot
()
{
std
::
cout
<<
"
\n
@DATA
\n
"
;
}
void
Runner
::
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
)
void
Feature
Runner
::
print_data
(
const
std
::
vector
<
std
::
vector
<
std
::
string
>
>
&
data
)
{
foreach
(
const
std
::
vector
<
std
::
string
>&
feats
,
data
)
{
std
::
cout
<<
boost
::
algorithm
::
join
(
feats
,
","
)
<<
"
\n
"
;
}
}
std
::
vector
<
std
::
vector
<
std
::
string
>
>
Runner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
)
/**
 * Writes the selected rows of data to std::cout, one comma-joined row per
 * line. Row i is written only when rowmask[i] is true.
 *
 * @param data    rows of already-formatted feature values
 * @param rowmask per-row selection flags; asserted to have the same number
 *                of entries as data
 */
void FeatureRunner::print_data(const std::vector<std::vector<std::string> >& data,
		const std::vector<bool> rowmask)
{
	assert(data.size() == rowmask.size());
	const size_t row_count = data.size();
	for (size_t row = 0; row < row_count; ++row) {
		if (!rowmask[row]) {
			continue; // masked out
		}
		std::cout << boost::algorithm::join(data[row], ",") << "\n";
	}
}
/**
 * Convenience overload: runs the three-argument do_sentence() on a fresh,
 * empty feature table (with its default nos value) and returns that table.
 *
 * @param sentence sentence to compute features for
 * @return the rows filled in by the three-argument overload
 */
std::vector< std::vector<std::string> > FeatureRunner::do_sentence(
		const boost::shared_ptr<Corpus2::Sentence>& sentence)
{
	typedef std::vector< std::vector<std::string> > feature_rows_t;
	feature_rows_t rows;
	do_sentence(sentence, rows);
	return rows;
}
void
FeatureRunner
::
do_sentence
(
const
boost
::
shared_ptr
<
Corpus2
::
Sentence
>&
sentence
,
std
::
vector
<
std
::
vector
<
std
::
string
>
>&
sfeats
,
bool
nos
/*=false*/
)
{
Wccl
::
SentenceContext
sc
(
sentence
);
while
(
sc
.
is_current_inside
())
{
sfeats
.
resize
(
sfeats
.
size
()
+
1
);
std
::
vector
<
std
::
string
>&
feats
=
sfeats
.
back
();
if
(
sfeats
.
size
()
<
static_cast
<
size_t
>
(
sc
.
get_position
()
+
1
))
{
sfeats
.
resize
(
sc
.
get_position
()
+
1
);
}
assert
(
!
sfeats
.
empty
());
std
::
vector
<
std
::
string
>&
feats
=
sfeats
[
sc
.
get_position
()];
if
(
!
nos
)
{
foreach
(
const
str_ops_map_t
::
value_type
v
,
str_ops_
)
{
boost
::
shared_ptr
<
const
Wccl
::
StrSet
>
s
=
v
.
second
->
apply
(
sc
);
assert
(
s
);
if
(
s
->
contents
().
empty
())
{
feats
.
push_back
(
"
\"\"
"
);
}
else
{
feats
.
push_back
(
"
\"
"
+
PwrNlp
::
to_utf8
(
*
s
->
contents
().
begin
())
+
"
\"
"
);
feats
.
push_back
(
"
\"
"
+
boost
::
algorithm
::
replace_all_copy
(
PwrNlp
::
to_utf8
(
*
s
->
contents
().
begin
()),
"
\"
"
,
"
\\\"
"
)
+
"
\"
"
);
}
}
}
foreach
(
const
bool_ops_map_t
::
value_type
v
,
bool_ops_
)
{
...
...
@@ -251,11 +296,10 @@ std::vector< std::vector<std::string> > Runner::do_sentence(const boost::shared_
}
sc
.
advance
();
}
return
sfeats
;
}
void
Runner
::
do_stream
(
std
::
istream
&
is
,
bool
first
)
void
Feature
Runner
::
do_stream
(
std
::
istream
&
is
,
bool
first
)
{
Corpus2
::
XcesReader
xr
(
tagset_
,
is
);
Corpus2
::
Sentence
::
Ptr
s
;
...
...
@@ -269,7 +313,98 @@ void Runner::do_stream(std::istream& is, bool first)
}
}
//void Runner::do_files(std::istream& is, bool first)
void
FeatureRunner
::
do_files
(
std
::
vector
<
std
::
string
>&
files
,
bool
first
)
{
std
::
vector
<
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
>
readers
;
if
(
files
.
size
()
<
2
)
return
;
readers
.
push_back
(
Corpus2
::
TokenReader
::
create_path_reader
(
"xces"
,
tagset_
,
files
[
0
]));
for
(
size_t
i
=
1
;
i
<
files
.
size
();
++
i
)
{
readers
.
push_back
(
Corpus2
::
TokenReader
::
create_path_reader
(
"xces,disamb_only"
,
tagset_
,
files
[
i
]));
}
print_header_head
();
for
(
size_t
i
=
0
;
i
<
files
.
size
();
++
i
)
{
print_header_body
(
"T"
+
boost
::
lexical_cast
<
std
::
string
>
(
i
)
+
"_"
);
}
std
::
cout
<<
"@ATTRIBUTE correct {0"
;
for
(
size_t
si
=
1
;
si
<
files
.
size
();
++
si
)
{
//std::cout << "@ATTRIBUTE tag" << si << "ok " << "{0,1}" << "\n";
std
::
cout
<<
","
<<
si
;
}
std
::
cout
<<
"}
\n
"
;
print_header_foot
();
bool
more
=
!
first
;
int
processed
=
0
;
do
{
std
::
vector
<
Corpus2
::
Sentence
::
Ptr
>
sentences
;
foreach
(
const
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>&
r
,
readers
)
{
Corpus2
::
Sentence
::
Ptr
s
=
r
->
get_next_sentence
();
if
(
s
)
{
sentences
.
push_back
(
s
);
}
}
if
(
sentences
.
size
()
==
readers
.
size
())
{
std
::
vector
<
std
::
vector
<
std
::
string
>
>
data
;
std
::
vector
<
bool
>
rowmask
;
size_t
gold_size
=
sentences
[
0
]
->
size
();
for
(
size_t
si
=
1
;
si
<
sentences
.
size
();
++
si
)
{
if
(
sentences
[
si
]
->
size
()
!=
gold_size
)
{
std
::
cerr
<<
"Sentence size mismatch at "
<<
processed
<<
" "
<<
si
<<
"
\n
"
;
return
;
}
}
data
.
resize
(
gold_size
);
rowmask
.
resize
(
gold_size
);
do_sentence
(
sentences
[
0
],
data
,
false
);
for
(
size_t
si
=
1
;
si
<
sentences
.
size
();
++
si
)
{
do_sentence
(
sentences
[
si
],
data
,
false
);
}
int
maxv
=
0
;
for
(
size_t
i
=
0
;
i
<
gold_size
;
++
i
)
{
std
::
set
<
Corpus2
::
Tag
>
gold_tags
;
const
Corpus2
::
Token
&
gold_token
=
*
(
*
sentences
[
0
])[
i
];
foreach
(
const
Corpus2
::
Lexeme
&
gl
,
gold_token
.
disamb_lexemes
())
{
gold_tags
.
insert
(
gl
.
tag
());
}
int
wci
=
0
;
std
::
map
<
Corpus2
::
Tag
,
int
>
v
;
for
(
size_t
si
=
1
;
si
<
sentences
.
size
();
++
si
)
{
const
Corpus2
::
Token
&
token
=
*
(
*
sentences
[
si
])[
i
];
bool
wc
=
false
;
foreach
(
const
Corpus2
::
Lexeme
&
gl
,
token
.
lexemes
())
{
if
(
gold_tags
.
find
(
gl
.
tag
())
!=
gold_tags
.
end
())
{
wc
=
true
;
wci
=
si
;
}
v
[
gl
.
tag
()]
++
;
maxv
=
std
::
max
(
maxv
,
v
[
gl
.
tag
()]);
}
//data[i].push_back(wc ? "1" : "0");
}
typedef
std
::
pair
<
Corpus2
::
Tag
,
int
>
pp
;
int
mv
=
0
;
bool
tie
=
false
;
foreach
(
const
pp
&
p
,
v
)
{
if
(
p
.
second
==
mv
)
{
tie
=
true
;
}
else
if
(
p
.
second
>
mv
)
{
tie
=
false
;
mv
=
p
.
second
;
}
}
data
[
i
].
push_back
(
boost
::
lexical_cast
<
std
::
string
>
(
wci
));
rowmask
[
i
]
=
tie
;
}
print_data
(
data
,
rowmask
);
++
processed
;
}
else
{
more
=
false
;
}
}
while
(
more
);
}
int
main
(
int
argc
,
char
**
argv
)
...
...
@@ -348,7 +483,7 @@ int main(int argc, char** argv)
}
try
{
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_load
);
Runner
runner
(
tagset
);
Feature
Runner
runner
(
tagset
);
foreach
(
const
std
::
string
&
f
,
operator_strings
)
{
if
(
boost
::
algorithm
::
ends_with
(
f
,
".ccl"
))
{
if
(
!
runner
.
load_more_operators
(
f
))
{
...
...
@@ -357,16 +492,16 @@ int main(int argc, char** argv)
}
}
if
(
!
runner
.
empty
())
{
foreach
(
const
std
::
string
&
f
,
corpora_files
)
{
std
::
ifstream
ifs
(
f
.
c_str
());
if
(
corpora_files
.
size
()
==
1
)
{
std
::
ifstream
ifs
(
corpora_files
[
0
]
.
c_str
());
if
(
ifs
.
good
())
{
runner
.
do_stream
(
ifs
,
first
);
}
else
{
std
::
cerr
<<
"Error reading corpus from "
<<
f
<<
"
\n
"
;
}
std
::
cerr
<<
"Error reading corpus from "
<<
corpora_files
[
0
]
<<
"
\n
"
;
}
if
(
corpus_stdin
)
{
runner
.
do_
stream
(
std
::
cin
,
first
);
}
else
{
runner
.
do_
files
(
corpora_files
,
first
);
}
}
}
catch
(
PwrNlp
::
PwrNlpError
&
e
)
{
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment