Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
I
Iobber
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Syntactic Tools
Chunking
Iobber
Commits
5c0f41a9
Commit
5c0f41a9
authored
12 years ago
by
jezozwierzak
Browse files
Options
Downloads
Patches
Plain Diff
Added count_not_cont.py script
parent
869cd580
Branches
Branches containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
scripts/stats/count_not_cont.py
+140
-0
140 additions, 0 deletions
scripts/stats/count_not_cont.py
with
140 additions
and
0 deletions
scripts/stats/count_not_cont.py
0 → 100644
+
140
−
0
View file @
5c0f41a9
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on Mar 25, 2013
@author: Adam Pawlaczek
'''
from
optparse
import
OptionParser
import
sys
,
os
from
csv_table2.csv_table
import
CsvTable
import
corpus2
descr
=
"""
%prog [options] in_dir out_dir
The purpose of this program is to count number of not continous phases all types (chunk_np, chunk_vp)
You can ofcource change the list of chunks checked by script.
It will print to stdout number of all chunks, not_cont_chunks, and percent of non-contiguous phrases
"""
def go():
    """Parse command-line options, validate arguments and dispatch to main().

    Expects exactly one positional argument: the corpus file/directory.
    Exits with status 1 (after printing usage hints to stderr) otherwise.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='ccl',
                      help='set the input format; default: ccl')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
                      dest='out_path', default='',
                      help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunk-names', type='string', action='store',
                      dest='chunk_names',
                      default='chunk_np,chunk_vp,chunk_adjp,chunk_agp',
                      help='set chunk_names to count')
    parser.add_option('-f', '--folds', type="int", action='store',
                      dest='folds', default=1,
                      help='Number of folds')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        # BUG FIX: the old message asked for "corpus_dir and out_dir", but the
        # script actually requires exactly one positional argument (args[0]).
        sys.stderr.write('You need to provide corpus_dir.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    in_path = args[0]
    # NOTE(review): options.out_path is parsed but never consumed; main() does
    # not take an output path — confirm whether -O was meant to be wired in.
    main(in_path, options.input_format, options.output_format,
         options.chunk_names, options.folds, options.tagset)
def is_continous(inds):
    """Return True if *inds* (a sorted list of ints) forms a gap-free run.

    A run is contiguous when it equals the full integer range from its first
    to its last element, e.g. [3, 4, 5] -> True, [1, 3] -> False.
    """
    # BUG FIX: the original crashed with IndexError on an empty list; an empty
    # annotation is treated as trivially contiguous.
    if not inds:
        return True
    # Compare against an explicit list so the check also works under Python 3,
    # where range() no longer returns a list (the original relied on Py2 range).
    return inds == list(range(inds[0], inds[-1] + 1))
def get_input_paths(in_path, folds, input_format):
    """Collect the list of corpus files to process.

    With folds > 1, builds per-fold test-file names inside *in_path*
    ('ccl-testNN.xml' for ccl input, 'testNN.xml' for xces; other formats
    yield no paths). Otherwise walks *in_path* if it is a directory, or
    returns it as the single input file.
    """
    paths = []
    if folds > 1:
        # Fold files are numbered 01..NN; the prefix depends on the format.
        if input_format == "ccl":
            prefix = 'ccl-test'
        elif input_format == "xces":
            prefix = 'test'
        else:
            prefix = None
        if prefix is not None:
            for fold_no in range(1, folds + 1):
                name = prefix + str(fold_no).zfill(2) + '.xml'
                paths.append(os.path.join(in_path, name))
    elif os.path.isdir(in_path):
        for dirpath, _subdirs, filenames in os.walk(in_path):
            paths.extend(os.path.join(dirpath, fname) for fname in filenames)
    else:
        paths.append(in_path)
    return paths
def get_writer(out_path, output_format, tagset):
    """Build a corpus2 token writer: file-backed when *out_path* is non-empty,
    otherwise writing to stdout."""
    if not out_path:
        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
    return corpus2.TokenWriter.create_path_writer(output_format, out_path,
                                                  tagset)
def get_reader(in_path, input_format, tagset):
    """Build a corpus2 token reader: file-backed when *in_path* is non-empty,
    otherwise reading from stdin."""
    if not in_path:
        return corpus2.TokenReader.create_stdin_reader(input_format, tagset)
    return corpus2.TokenReader.create_path_reader(input_format, tagset,
                                                  in_path)
def get_output_path(out_path, basename=None):
    """Return *out_path* itself, or *out_path* joined with *basename* when a
    basename is given."""
    # BUG FIX: identity comparison with None should use 'is', not '=='
    # (PEP 8; '==' invokes __eq__ and can misbehave for odd operand types).
    if basename is None:
        return out_path
    return os.path.join(out_path, basename)
def
main
(
in_path
,
input_format
,
output_format
,
chunk_names
,
folds
,
tagset
):
tagset
=
corpus2
.
get_named_tagset
(
tagset
)
chunk_names
=
chunk_names
.
split
(
"
,
"
)
input_paths
=
get_input_paths
(
in_path
,
folds
,
input_format
)
results
=
[]
i
=
0
for
input_path
in
input_paths
:
reader
=
get_reader
(
input_path
,
input_format
,
tagset
)
print
input_path
fold_results
=
{
'
all
'
:
0
,
'
n_c
'
:
0
,
'
%
'
:
0
}
sent
=
reader
.
get_next_sentence
()
while
sent
:
asent
=
corpus2
.
AnnotatedSentence
.
wrap_sentence
(
sent
)
for
chan_name
in
asent
.
all_channels
():
if
chan_name
==
"
chunk_np
"
:
chan
=
asent
.
get_channel
(
chan_name
)
ann_vec
=
chan
.
make_annotation_vector
()
for
ann
in
ann_vec
:
inds
=
sorted
(
ann
.
indices
)
if
not
is_continous
(
inds
):
fold_results
[
'
n_c
'
]
+=
1
fold_results
[
'
all
'
]
+=
1
sent
=
reader
.
get_next_sentence
()
fold_results
[
'
%
'
]
=
fold_results
[
'
n_c
'
]
*
100
/
float
(
fold_results
[
'
all
'
])
results
.
append
(
fold_results
)
i
+=
1
all_count
=
0
n_c_count
=
0
print
'
all
'
,
'
n_c
'
,
'
%
'
for
fold
in
range
(
len
(
results
)):
all_count
+=
results
[
fold
][
'
all
'
]
n_c_count
+=
results
[
fold
][
'
n_c
'
]
print
results
[
fold
][
'
all
'
],
results
[
fold
][
'
n_c
'
],
results
[
fold
][
'
%
'
]
print
'
------------------------------------------------------------------
'
print
all_count
/
i
,
n_c_count
/
i
,
(
n_c_count
/
i
)
*
100
/
float
(
all_count
/
i
)
avg_results
=
{
'
all
'
:
all_count
/
float
(
i
),
'
n_c
'
:
n_c_count
/
float
(
i
),
'
%
'
:(
n_c_count
/
i
)
*
100
/
float
(
all_count
/
i
)}
results
.
append
(
avg_results
)
# Script entry point: parse CLI options and run the counting pipeline.
if __name__ == '__main__':
    go()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment