Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
493c914d
Commit
493c914d
authored
Aug 7, 2012
by
jezozwierzak
Browse files
Options
Downloads
Patches
Plain Diff
Modified chunk_eval for folds
parent
31efb090
No related branches found
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
utils/CSVWriter.py
+192
-0
192 additions, 0 deletions
utils/CSVWriter.py
utils/chunk_eval.py
+125
-103
125 additions, 103 deletions
utils/chunk_eval.py
with
317 additions
and
103 deletions
utils/CSVWriter.py
0 → 100644
+
192
−
0
View file @
493c914d
#!/usr/bin/python
#-*- coding: utf-8 -*-
'''
Created on 03-08-2012
@author: jezozwierzak
'''
class CSVWriter(object):
    '''
    Collects a table column by column and renders it as aligned,
    separator-delimited text.

    Data layout of ``self.list`` (one entry per top-level column):

    * plain column:        ``[name, value_1, value_2, ...]``
    * column with
      sub-columns:         ``[name, [sub_name, v_1, ...], [sub_name, v_1, ...]]``

    A column is treated as having sub-columns when its second element is a
    list, so a plain column must not store a list as its first data value.

    ``self.widths`` holds one minimal printed width per top-level column.
    All column names have to be registered before the first row is added.
    '''

    def __init__(self, separator=';'):
        '''
        Create an empty table.

        separator -- text written after every printed cell (default ';').
        '''
        # Minimal printed width of each top-level column.
        self.widths = []
        # Column storage, see the class docstring for the layout.
        self.list = []
        # Number of data rows added so far (the header is not counted).
        self.rows = 0
        # Incremented for every plain column and for every sub-column after
        # the first one of a parent.
        self.columns = 0
        self.separator = separator

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _has_sub_columns(column):
        '''Return True when the given column entry stores sub-columns.'''
        return len(column) > 1 and isinstance(column[1], list)

    @staticmethod
    def _is_number(value):
        '''Return True for int/float values; bool is deliberately excluded.'''
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    @classmethod
    def _cell_text(cls, value):
        '''Return the unpadded text printed for a single value.'''
        if isinstance(value, float):
            return format(value, '.4f')
        if cls._is_number(value):
            return format(value, 'd')
        # Everything else is printed through str(); this also covers values
        # (bool, None, ...) that reject the 's' format code.
        return value if isinstance(value, str) else str(value)

    @classmethod
    def _format_cell(cls, value, width):
        '''Return the printed text of value padded to at least width.

        Numbers are right-aligned, everything else is left-aligned, which is
        what the 'd' / '.4f' / 's' format codes do by default.
        '''
        text = cls._cell_text(value)
        if cls._is_number(value):
            return text.rjust(width)
        return text.ljust(width)

    @classmethod
    def _sub_cell_estimate(cls, value):
        '''Return the text used to estimate the width of a sub-column value.

        Numbers (ints included) are measured in '.4f' form; other values
        are measured as printed.
        '''
        if cls._is_number(value):
            return format(value, '.4f')
        return cls._cell_text(value)

    @staticmethod
    def _mean(values):
        '''Return the arithmetic mean of values as a float.

        Raises ZeroDivisionError when values is empty.
        '''
        # float() keeps integer columns from being truncated on Python 2.
        return float(sum(values)) / len(values)

    # ------------------------------------------------------------------
    # building the header
    # ------------------------------------------------------------------
    def addSubColumn(self, parentIndex, name):
        '''
        Append the sub-column ``name`` to the column at ``parentIndex`` and
        widen the parent if its sub-column names need more room.
        '''
        parentColumn = self.list[parentIndex]
        parentColumn.append([name])
        subColumns = parentColumn[1:]
        # Length of all sub-column names, each followed by the separator.
        namesWidth = sum(len(sub[0]) + len(self.separator)
                         for sub in subColumns)
        if namesWidth > self.widths[parentIndex]:
            # The separator after the last name is not part of the width.
            self.widths[parentIndex] = namesWidth - 1
        # The first sub-column replaces its parent in the count; every
        # further one adds a column.
        if len(subColumns) > 1:
            self.columns += 1

    def addSubColumnByName(self, parentName, name):
        '''
        Append the sub-column ``name`` to the column called ``parentName``.
        Only allowed while the table has no rows.
        '''
        assert self.rows == 0, 'You have to add all Column names before adding rows'
        parentIndex = self.columnIndex(parentName)
        self.addSubColumn(parentIndex, name)

    def addSubColumnsByName(self, parentName, names=()):
        '''Append every name in ``names`` as a sub-column of ``parentName``.'''
        for name in names:
            self.addSubColumnByName(parentName, name)

    def addSubColumns(self, parentIndex, names=()):
        '''Append every name in ``names`` as a sub-column of the column at
        ``parentIndex``.'''
        for name in names:
            self.addSubColumn(parentIndex, name)

    def addColumn(self, name):
        '''
        Append a new top-level column. Only allowed while the table has no
        rows.
        '''
        assert self.rows == 0, 'You have to add all Column names before adding rows'
        self.list.append([name])
        self.widths.append(len(name))
        self.columns += 1

    def addColumns(self, names=()):
        '''Append every name in ``names`` as a top-level column.'''
        for name in names:
            self.addColumn(name)

    # ------------------------------------------------------------------
    # adding data
    # ------------------------------------------------------------------
    def addRow(self, row=()):
        '''
        Append one data row.

        ``row`` needs one entry per top-level column; the entry for a
        column with sub-columns is itself a sequence with one value per
        sub-column. The row is validated completely before anything is
        stored, so a rejected row leaves the table untouched.
        '''
        assert len(row) == len(self.list), 'Wrong number of columns in row'
        for column, cell in zip(self.list, row):
            if self._has_sub_columns(column):
                assert len(cell) == len(column) - 1, \
                    'Wrong number of subColumns in column ' + str(column[0])

        for i, column in enumerate(self.list):
            if self._has_sub_columns(column):
                # Adding data to sub-columns
                estimate = 0
                for subColumn, value in zip(column[1:], row[i]):
                    subColumn.append(value)
                    # value + separator + one character of padding
                    estimate += (len(self._sub_cell_estimate(value))
                                 + len(self.separator) + 1)
                if estimate > self.widths[i]:
                    self.widths[i] = estimate - 1
            else:
                # Adding data to a plain column; measure the text that is
                # actually printed (floats are printed with 4 decimals).
                column.append(row[i])
                textWidth = len(self._cell_text(row[i]))
                if textWidth > self.widths[i]:
                    self.widths[i] = textWidth
        self.rows += 1

    # ------------------------------------------------------------------
    # queries
    # ------------------------------------------------------------------
    def allWidth(self):
        '''Return the sum of the widths of all top-level columns.'''
        return sum(self.widths)

    def columnIndex(self, name):
        '''Return the index of the first top-level column called ``name``,
        or None when there is no such column.'''
        for index, column in enumerate(self.list):
            if column[0] == name:
                return index
        return None

    def hasSubColumns(self):
        '''Return True when at least one column has sub-columns.'''
        return any(self._has_sub_columns(column) for column in self.list)

    def hasColumnSubColumns(self, index):
        '''Return True when the column at ``index`` has sub-columns.'''
        return self._has_sub_columns(self.list[index])

    def repeat_to_length(self, string_to_expand, length):
        '''Return ``string_to_expand`` repeated and cut to exactly ``length``
        characters; an empty input yields an empty string.'''
        if not string_to_expand:
            return ''
        # Floor division keeps the repeat count an int on Python 3 as well.
        repeats = (length // len(string_to_expand)) + 1
        return (string_to_expand * repeats)[:length]

    def count_avg(self):
        '''
        Append a row holding the average of every column.

        The first column is treated as the row label: it is not averaged
        and receives the text 'AVG'. All rows present at the time of the
        call are averaged, so a second call also averages the first 'AVG'
        row. Raises ZeroDivisionError when a column has to be averaged but
        the table has no rows.
        '''
        results = ['AVG']
        lastRow = 1 + self.rows
        # Iterating over columns (the label column is skipped)
        for column in self.list[1:]:
            if self._has_sub_columns(column):
                results.append([self._mean(subColumn[1:lastRow])
                                for subColumn in column[1:]])
            else:
                results.append(self._mean(column[1:lastRow]))
        self.addRow(results)

    # ------------------------------------------------------------------
    # rendering
    # ------------------------------------------------------------------
    def __str__(self):
        '''
        Render the table, one line per row, every cell followed by the
        separator and every line terminated by a newline.

        Without sub-columns the output is the header line followed by the
        data lines. With sub-columns there are two header lines: the
        top-level names, then the sub-column names.
        '''
        separator = self.separator
        columns = list(zip(self.list, self.widths))
        lines = []

        if not self.hasSubColumns():
            # Iterating over rows (row 0 is the header), then over columns
            for j in range(self.rows + 1):
                lines.append(''.join(
                    self._format_cell(column[j], width) + separator
                    for column, width in columns))
            return ''.join(line + '\n' for line in lines)

        # Printing the header and the sub-header
        header = []
        subHeader = []
        for column, width in columns:
            if self._has_sub_columns(column):
                subColumns = column[1:]
                count = len(subColumns)
                # The parent name spans all of its sub-columns; one
                # separator is emitted per sub-column to keep the number
                # of cells equal on every line.
                header.append(self._format_cell(column[0], width - count + 1)
                              + separator * count)
                subWidth = width // count
                for subColumn in subColumns:
                    subHeader.append(
                        self._format_cell(subColumn[0], subWidth) + separator)
            else:
                header.append(self._format_cell(column[0], width) + separator)
                subHeader.append(' ' * width + separator)
        lines.append(''.join(header))
        lines.append(''.join(subHeader))

        # Printing data: iterating over rows, then over columns
        for j in range(1, 1 + self.rows):
            cells = []
            for column, width in columns:
                if self._has_sub_columns(column):
                    subColumns = column[1:]
                    # Computed per column: parents may differ in their
                    # number of sub-columns.
                    subWidth = width // len(subColumns)
                    for subColumn in subColumns:
                        cells.append(
                            self._format_cell(subColumn[j], subWidth)
                            + separator)
                else:
                    cells.append(
                        self._format_cell(column[j], width) + separator)
            lines.append(''.join(cells))
        return ''.join(line + '\n' for line in lines)
This diff is collapsed.
Click to expand it.
utils/chunk_eval.py
+
125
−
103
View file @
493c914d
#!/usr/bin/python
#!/usr/bin/python
#-*- coding: utf-8 -*-
#-*- coding: utf-8 -*-
'''
# Copyright (C) 2012 Adam Radziszewski.
Created on 01-08-2012
'''
# Copyright (C) 2012 Adam Pawlaczek.
# This program is free software; you can redistribute and/or modify it
# This program is free software; you can redistribute and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# Software Foundation; either version 3 of the License, or (at your option)
...
@@ -13,7 +15,7 @@
...
@@ -13,7 +15,7 @@
#
#
# See the LICENCE and COPYING files for more details
# See the LICENCE and COPYING files for more details
descr
=
"""
%prog [options] CHUNKED REF
CHAN_NAME
descr
=
"""
%prog [options] CHUNKED REF
Reads the two chunk-annotated corpora: CHUNKED (chunker output) and REF
Reads the two chunk-annotated corpora: CHUNKED (chunker output) and REF
(reference annotation / gold standard). Outputs precision and recall values
(reference annotation / gold standard). Outputs precision and recall values
...
@@ -24,11 +26,10 @@ for the following settings:
...
@@ -24,11 +26,10 @@ for the following settings:
NOTE: this script treats discontinuous chunks as whole annotations.
NOTE: this script treats discontinuous chunks as whole annotations.
"""
"""
from
optparse
import
OptionParser
from
optparse
import
OptionParser
import
sys
import
corpus2
import
corpus2
import
sys
,
os
from
CSVWriter
import
CSVWriter
class
Stats
:
class
Stats
:
def
__init__
(
self
):
def
__init__
(
self
):
...
@@ -56,24 +57,14 @@ class Stats:
...
@@ -56,24 +57,14 @@ class Stats:
ref
=
set
(
ann
.
head_index
for
ann
in
ref_annots
)
ref
=
set
(
ann
.
head_index
for
ann
in
ref_annots
)
self
.
head_hits
+=
len
(
ch
.
intersection
(
ref
))
self
.
head_hits
+=
len
(
ch
.
intersection
(
ref
))
def
dump_prf
(
self
,
name
,
hits
):
def
getPRF
(
self
,
hits
):
p
=
0.0
if
self
.
ch_chunks
==
0
else
100.0
*
hits
/
self
.
ch_chunks
p
=
0.0
if
self
.
ch_chunks
==
0
else
100.0
*
hits
/
self
.
ch_chunks
r
=
0.0
if
self
.
ref_chunks
==
0
else
100.0
*
hits
/
self
.
ref_chunks
r
=
0.0
if
self
.
ref_chunks
==
0
else
100.0
*
hits
/
self
.
ref_chunks
f
=
0.0
if
p
+
r
==
0.0
else
2.0
*
p
*
r
/
(
p
+
r
)
f
=
0.0
if
p
+
r
==
0.0
else
2.0
*
p
*
r
/
(
p
+
r
)
print
'
%s
\t
%.2f
\t
%.2f
\t
%.2f
'
%
(
name
,
p
,
r
,
f
)
return
[
p
,
r
,
f
]
def
dump
(
self
,
verbosity
=
2
):
if
verbosity
>
1
:
print
'
CHU chunks
\t
%d
'
%
self
.
ch_chunks
print
'
REF chunks
\t
%d
'
%
self
.
ref_chunks
print
'
Chunk hits
\t
%d
'
%
self
.
chunk_hits
print
'
Head hits
\t
%d
'
%
self
.
head_hits
print
'
Ch+Hd hits
\t
%d
'
%
self
.
both_hits
if
verbosity
>
0
:
self
.
dump_prf
(
'
Chunk P,R,F
'
,
self
.
chunk_hits
)
self
.
dump_prf
(
'
Heads P,R,F
'
,
self
.
head_hits
)
self
.
dump_prf
(
'
Ch+Hd P,R,F
'
,
self
.
both_hits
)
def
getStats
(
self
):
return
[
self
.
getPRF
(
self
.
chunk_hits
)]
def
get_annots
(
sent
,
chan_name
):
def
get_annots
(
sent
,
chan_name
):
# wrap the sentence as an AnnotatedSentence
# wrap the sentence as an AnnotatedSentence
...
@@ -87,29 +78,55 @@ def get_annots(sent, chan_name):
...
@@ -87,29 +78,55 @@ def get_annots(sent, chan_name):
annots
.
append
(
ann
)
annots
.
append
(
ann
)
return
annots
return
annots
def
go
():
def
go
():
parser
=
OptionParser
(
usage
=
descr
)
parser
=
OptionParser
(
usage
=
descr
)
parser
.
add_option
(
'
-i
'
,
'
--input-format
'
,
type
=
'
string
'
,
action
=
'
store
'
,
parser
.
add_option
(
'
-i
'
,
'
--input-format
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
input_format
'
,
default
=
'
ccl
'
,
dest
=
'
input_format
'
,
default
=
'
ccl
'
,
help
=
'
set the input format; default: ccl
'
)
help
=
'
set the input format; default: ccl
'
)
parser
.
add_option
(
'
-O
'
,
'
--output-file
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
out_path
'
,
default
=
''
,
help
=
'
set output filename (do not write to stdout)
'
)
parser
.
add_option
(
'
-t
'
,
'
--tagset
'
,
type
=
'
string
'
,
action
=
'
store
'
,
parser
.
add_option
(
'
-t
'
,
'
--tagset
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
tagset
'
,
default
=
'
nkjp
'
,
dest
=
'
tagset
'
,
default
=
'
nkjp
'
,
help
=
'
set the tagset used in input; default: nkjp
'
)
help
=
'
set the tagset used in input; default: nkjp
'
)
parser
.
add_option
(
'
-c
'
,
'
--chunk-names
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
chunk_names
'
,
default
=
''
,
help
=
'
set chunk_names to eval
'
)
parser
.
add_option
(
'
-f
'
,
'
--folds
'
,
type
=
"
int
"
,
action
=
'
store
'
,
dest
=
'
folds
'
,
default
=
1
,
help
=
'
Number of folds
'
)
parser
.
add_option
(
'
-q
'
,
'
--quiet
'
,
action
=
'
store_false
'
,
parser
.
add_option
(
'
-q
'
,
'
--quiet
'
,
action
=
'
store_false
'
,
default
=
True
,
dest
=
'
verbose
'
)
default
=
True
,
dest
=
'
verbose
'
)
(
options
,
args
)
=
parser
.
parse_args
()
(
options
,
args
)
=
parser
.
parse_args
()
if
len
(
args
)
!=
3
:
if
len
(
args
)
!=
2
:
sys
.
stderr
.
write
(
'
No args. See --help
\n
'
)
sys
.
stderr
.
write
(
'
No args. See --help
\n
'
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
ch_path
,
ref_path
,
chan_name
=
args
ch_path
,
ref_path
=
args
tagset
=
corpus2
.
get_named_tagset
(
options
.
tagset
)
main
(
ch_path
,
ref_path
,
options
.
chunk_names
,
options
.
input_format
,
options
.
out_path
,
options
.
tagset
,
options
.
verbose
,
options
.
folds
)
def
main
(
ch_path
,
ref_path
,
chan_name
,
input_format
,
out_path
,
tagset
,
verbose
,
folds
):
csvWriter
=
CSVWriter
(
"
,
"
)
csvWriter
.
addColumns
([
"
Nr
"
,
"
Chunk
"
])
csvWriter
.
addSubColumnsByName
(
"
Chunk
"
,
[
"
P
"
,
"
R
"
,
"
F
"
])
tagset
=
corpus2
.
get_named_tagset
(
tagset
)
for
fold
in
range
(
1
,
folds
+
1
):
if
folds
>
1
:
ch_path_fold
=
os
.
path
.
join
(
ch_path
,
'
ccl-test
'
+
str
(
fold
).
zfill
(
2
)
+
'
.xml
'
)
ref_path_fold
=
os
.
path
.
join
(
ref_path
,
'
ccl-test
'
+
str
(
fold
).
zfill
(
2
)
+
'
.xml
'
)
else
:
ch_path_fold
=
ch_path
ref_path_fold
=
ref_path
ch_rdr
=
corpus2
.
TokenReader
.
create_path_reader
(
ch_rdr
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
ch_path
)
input_format
,
tagset
,
ch_path
_fold
)
ref_rdr
=
corpus2
.
TokenReader
.
create_path_reader
(
ref_rdr
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
ref_path
)
input_format
,
tagset
,
ref_path
_fold
)
stats
=
Stats
()
stats
=
Stats
()
...
@@ -128,7 +145,12 @@ def go():
...
@@ -128,7 +145,12 @@ def go():
ch_annots
=
get_annots
(
ch_sent
,
chan_name
)
ch_annots
=
get_annots
(
ch_sent
,
chan_name
)
ref_annots
=
get_annots
(
ref_sent
,
chan_name
)
ref_annots
=
get_annots
(
ref_sent
,
chan_name
)
stats
.
update
(
ch_annots
,
ref_annots
)
stats
.
update
(
ch_annots
,
ref_annots
)
stats
.
dump
(
int
(
options
.
verbose
)
+
1
)
results
=
stats
.
getStats
()
results
[:
0
]
=
[
fold
]
csvWriter
.
addRow
(
results
)
csvWriter
.
count_avg
()
print
csvWriter
if
__name__
==
'
__main__
'
:
if
__name__
==
'
__main__
'
:
go
()
go
()
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment