Commit 94f441d3, authored 12 years ago by jezozwierzak (project: Iobber)

Added is_ok_liner_after_iobber script

Parent: 301b7263
No related merge requests found.

Changes: 1 changed file
scripts/chunker_scripts/stats/is_ok_liner_after_iobber.py (new file, mode 100755): +100 −0
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 25-04-2013

@author: Adam Pawlaczek
'''
from optparse import OptionParser
import sys, os
import corpus2
from chunker_scripts import tools

descr = """%prog [options] liner_dir iobber_dir
Checks whether all chunks are still OK after running liner2.
liner_dir and iobber_dir have to contain the same file names.
"""


def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='ccl',
                      help='set the input format; default: ccl')
    parser.add_option('-f', '--folds', type='int', action='store',
                      dest='folds', default=1,
                      help='number of folds; default: 1')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        sys.stderr.write('You need to provide liner_dir and iobber_dir.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    liner_dir, iobber_dir = args
    main(liner_dir, iobber_dir, options.input_format, options.tagset, options.folds)


def get_input_paths(in_path, folds, input_format):
    # Collect input files: per-fold test files when folds > 1,
    # otherwise every file under in_path (or in_path itself if it is a single file).
    input_paths = []
    if folds > 1:
        for fold in range(1, folds + 1):
            if input_format == "ccl":
                input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml'))
            elif input_format == "xces":
                input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml'))
    else:
        if os.path.isdir(in_path):
            for (path, dirs, files) in os.walk(in_path):
                for file in files:
                    input_paths.append(os.path.join(path, file))
        else:
            input_paths.append(in_path)
    return input_paths


def main(liner_dir, iobber_dir, input_format, tagset, folds):
    tg = corpus2.get_named_tagset(tagset)

    liner_paths = []
    for (path, dirs, files) in tools.walklevel(liner_dir):
        for file in files:
            liner_paths.append(os.path.join(path, file))

    iobber_paths = []
    for (path, dirs, files) in tools.walklevel(iobber_dir):
        for file in files:
            iobber_paths.append(os.path.join(path, file))

    # Compare the corresponding liner2 and iobber outputs sentence by sentence.
    for liner_path, iobber_path in zip(liner_paths, iobber_paths):
        print liner_path, iobber_path
        reader_l = tools.get_reader(liner_path, input_format, tg)
        reader_i = tools.get_reader(iobber_path, input_format, tg)

        sent_l = reader_l.get_next_sentence()
        sent_i = reader_i.get_next_sentence()
        while sent_l and sent_i:
            asent_l = corpus2.AnnotatedSentence.wrap_sentence(sent_l)
            asent_i = corpus2.AnnotatedSentence.wrap_sentence(sent_i)
            for chan_name in asent_i.all_channels():
                if chan_name in ["chunk_np", "chunk_agp", "chunk_adjp", "chunk_vp", "chunk_qp"]:
                    if asent_l.has_channel(chan_name):
                        chan_l = asent_l.get_channel(chan_name)
                        chan_i = asent_i.get_channel(chan_name)
                        ann_vec_l = chan_l.make_annotation_vector()
                        ann_vec_i = chan_i.make_annotation_vector()
                        ann_vec_l_indices = [list(ann.indices) for ann in ann_vec_l]
                        ann_vec_i_indices = [list(ann.indices) for ann in ann_vec_i]
                        # Both runs must produce identical chunk boundaries for this channel.
                        assert ann_vec_l_indices == ann_vec_i_indices, \
                            "Different chunk " + chan_name + " in files " + liner_path + " " + iobber_path + \
                            " in sentence: " + " ".join(token.orth_utf8() for token in sent_l.tokens()) + \
                            "\nVECTOR: " + str(ann_vec_l_indices) + \
                            "\nVECTOR: " + str(ann_vec_i_indices)
            sent_l = reader_l.get_next_sentence()
            sent_i = reader_i.get_next_sentence()


if __name__ == '__main__':
    go()
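For reference, a minimal sketch of how the entry point above might be driven directly from Python instead of through the command line. The directory paths and the sys.path entry are hypothetical examples, and the sketch assumes the corpus2 bindings and the chunker_scripts package are importable, as the script itself already requires.

# Hypothetical driver for the script above; paths below are examples only.
import sys
sys.path.insert(0, 'scripts/chunker_scripts/stats')  # assumed location within this repository

from is_ok_liner_after_iobber import main

# liner_dir and iobber_dir must contain the same file names, as the usage string states.
main('out/liner', 'out/iobber', 'ccl', 'nkjp', 1)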