Commit 5c0f41a9, authored Apr 8, 2013 by jezozwierzak
Added count_not_cont.py script
Parent: 869cd580
Changes: 1 changed file

scripts/stats/count_not_cont.py (new file, mode 100644): 140 additions, 0 deletions
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on Mar 25, 2013

@author: Adam Pawlaczek
'''
from optparse import OptionParser
import sys, os
from csv_table2.csv_table import CsvTable
import corpus2

descr = """%prog [options] in_dir out_dir
The purpose of this program is to count the number of non-contiguous phrases of all types (chunk_np, chunk_vp).
You can of course change the list of chunks checked by the script.
It will print to stdout the number of all chunks, not_cont_chunks, and the percent of non-contiguous phrases.
"""
def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='ccl',
                      help='set the input format; default: ccl')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
                      dest='out_path', default='',
                      help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunk-names', type='string', action='store',
                      dest='chunk_names', default='chunk_np,chunk_vp,chunk_adjp,chunk_agp',
                      help='set chunk_names to count')
    parser.add_option('-f', '--folds', type="int", action='store',
                      dest='folds', default=1,
                      help='Number of folds')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        sys.stderr.write('You need to provide corpus_dir and out_dir.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    in_path = args[0]

    main(in_path, options.input_format, options.output_format,
         options.chunk_names, options.folds, options.tagset)

def is_continous(inds):
    l2 = range(inds[0], inds[-1] + 1)
    return inds == l2

def get_input_paths(in_path, folds, input_format):
    input_paths = []
    if folds > 1:
        for fold in range(1, folds + 1):
            if input_format == "ccl":
                input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml'))
            elif input_format == "xces":
                input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml'))
    else:
        if os.path.isdir(in_path):
            for (path, dirs, files) in os.walk(in_path):
                for file in files:
                    input_paths.append(os.path.join(path, file))
        else:
            input_paths.append(in_path)
    return input_paths

def get_writer(out_path, output_format, tagset):
    if out_path:
        return corpus2.TokenWriter.create_path_writer(output_format, out_path, tagset)
    else:
        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)

def get_reader(in_path, input_format, tagset):
    if in_path:
        return corpus2.TokenReader.create_path_reader(input_format, tagset, in_path)
    else:
        return corpus2.TokenReader.create_stdin_reader(input_format, tagset)

def get_output_path(out_path, basename=None):
    if basename == None:
        return out_path
    else:
        return os.path.join(out_path, basename)

def main(in_path, input_format, output_format, chunk_names, folds, tagset):
    tagset = corpus2.get_named_tagset(tagset)
    chunk_names = chunk_names.split(",")
    input_paths = get_input_paths(in_path, folds, input_format)

    results = []
    i = 0

    for input_path in input_paths:
        reader = get_reader(input_path, input_format, tagset)
        print input_path
        fold_results = {'all': 0, 'n_c': 0, '%': 0}
        sent = reader.get_next_sentence()
        while sent:
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            for chan_name in asent.all_channels():
                if chan_name == "chunk_np":
                    chan = asent.get_channel(chan_name)
                    ann_vec = chan.make_annotation_vector()
                    for ann in ann_vec:
                        inds = sorted(ann.indices)
                        if not is_continous(inds):
                            fold_results['n_c'] += 1
                        fold_results['all'] += 1
            sent = reader.get_next_sentence()
        fold_results['%'] = fold_results['n_c'] * 100 / float(fold_results['all'])
        results.append(fold_results)
        i += 1

    all_count = 0
    n_c_count = 0

    print 'all', 'n_c', '%'
    for fold in range(len(results)):
        all_count += results[fold]['all']
        n_c_count += results[fold]['n_c']
        print results[fold]['all'], results[fold]['n_c'], results[fold]['%']
    print '------------------------------------------------------------------'
    print all_count / i, n_c_count / i, (n_c_count / i) * 100 / float(all_count / i)

    avg_results = {'all': all_count / float(i), 'n_c': n_c_count / float(i),
                   '%': (n_c_count / i) * 100 / float(all_count / i)}
    results.append(avg_results)

if __name__ == '__main__':
    go()
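For context, here is a minimal standalone sketch of the contiguity test the script hinges on (Python 2, matching the print statements above; the index lists are made-up examples, not data from the commit). is_continous treats an annotation as contiguous exactly when its sorted token indices form an unbroken run from the first index to the last.

def is_continous(inds):
    # In Python 2, range() returns a list, so this compares the sorted
    # indices against the full run from the first index to the last.
    return inds == range(inds[0], inds[-1] + 1)

print is_continous([3, 4, 5])   # True: 3, 4, 5 is an unbroken run
print is_continous([3, 5, 6])   # False: index 4 is missing, so the chunk is discontinuous

As the option parser shows, the script itself takes a single corpus path; with -f above 1 it expects fold files named ccl-test01.xml, ccl-test02.xml, and so on (or test01.xml for xces input) inside that directory, and it prints one "all n_c %" row per fold followed by their averages.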