Commit f6bb7711 authored 12 years ago by Adam Pawlaczek

Added oracle

parent afca5936

Showing 1 changed file with 166 additions and 0 deletions:
scripts/chunker_scripts/experiments/oracle.py (new file, mode 0 → 100755, +166 −0)
#!/usr/bin/python
#-*- coding: utf-8 -*-
'''
Created on 19-02-2013
@author: jezozwierzak
'''
from optparse import OptionParser
import sys, os
import corpus2
from chunker_scripts import tools

descr = """%prog [options] in_dir ref_dir out_dir
in_dir has to contain one subdirectory per chunker, named after the chunker
that produced the chunked folds inside it; ref_dir contains the reference folds."""

def go():
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='ccl',
                      help='set the input format; default: ccl')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-c', '--chunk-names', type='string', action='store',
                      dest='chunk_names', default='',
                      help='set chunk_names to eval')
    parser.add_option('--chunkers', type='string', action='store',
                      dest='chunkers', default='',
                      help='set chunkers to eval')
    parser.add_option('-f', '--folds', type="int", action='store',
                      dest='folds', default=1,
                      help='Number of folds')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    (options, args) = parser.parse_args()

    # All three directories, the chunk names and the chunker list are required.
    if len(args) != 3 or options.chunk_names == '' or options.chunkers == '':
        sys.stderr.write('You need to provide in_dir, ref_dir and out_dir as well as chunk_names and chunkers.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    in_path, ref_path, out_path = args
    main(in_path, ref_path, out_path, options.input_format, options.output_format,
         options.chunk_names, options.chunkers, options.folds, options.tagset)

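# A minimal sketch of the directory layout that get_ref_paths/get_input_paths
# below expect for the default 'ccl' input format (directory, chunker and
# channel names here are made-up examples, not part of this commit):
#
#   in_dir/
#       iobber/ccl-test01.xml ... ccl-test10.xml
#       spejd/ccl-test01.xml ... ccl-test10.xml
#   ref_dir/ccl-test01.xml ... ccl-test10.xml
#
# A hypothetical invocation for that layout:
#   ./oracle.py -c chunk_np,chunk_vp --chunkers iobber,spejd -f 10 in_dir ref_dir out_dir
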
def get_ref_paths(in_path, folds, input_format):
    input_paths = []
    if folds > 1:
        for fold in range(1, folds + 1):
            if input_format == "ccl":
                input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml'))
            elif input_format == "xces":
                input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml'))
    else:
        if (os.path.isdir(in_path)):
            for (path, dirs, files) in os.walk(in_path):
                for file in files:
                    input_paths.append(os.path.join(path, file))
        else:
            input_paths.append(in_path)
    return input_paths

def get_input_paths(in_path, folds, input_format, chunkers):
    input_paths = []
    for fold in range(1, folds + 1):
        fold_inputs = {}
        for chunker in chunkers:
            if os.path.isdir(os.path.join(in_path, chunker)):
                if input_format == "ccl":
                    fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml')
                elif input_format == "xces":
                    fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml')
            else:
                print os.path.join(in_path, chunker), "dir doesn't exist"
        input_paths.append(fold_inputs)
    return input_paths

def get_writer(out_path, output_format, tagset, fold):
    out_path = get_output_path(out_path, fold, output_format)
    return corpus2.TokenWriter.create_path_writer(output_format, out_path, tagset)

def get_output_path(out_path, fold, output_format):
    if output_format == "ccl":
        return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml')
    elif output_format == "xces":
        return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml')

def get_readers(in_paths, input_format, tagset):
    readers = {}
    for chunker, in_path in in_paths.iteritems():
        readers[chunker] = tools.get_reader(in_path, input_format, tagset)
    return readers

def get_next_sents(readers):
    result = {}
    for chunker, reader in readers.iteritems():
        result[chunker] = reader.get_next_sentence()
    return result

def clone_sent(sent):
    # Clone the sentence, keeping only tokens that carry a disambiguated lexeme.
    new_sent = corpus2.Sentence.create_sent(sent.id())
    for tok_idx, tok in enumerate(sent.tokens()):
        tok = sent.tokens()[tok_idx]
        if any(lex.is_disamb() for lex in tok.lexemes()):
            new_sent.append(tok.clone())
    return new_sent

def main(in_path, ref_path, out_path, input_format, output_format, chunk_names, chunkers, folds, tagset):
    tagset = corpus2.get_named_tagset(tagset)
    chunk_names = chunk_names.split(",")
    chunkers = chunkers.split(",")
    ref_paths = get_ref_paths(ref_path, folds, input_format)
    input_paths = get_input_paths(in_path, folds, input_format, chunkers)

    # For each fold, write an "oracle" corpus: a reference chunk is copied to the
    # output only if at least one chunker reproduced exactly the same token span.
    for fold in range(1, folds + 1):
        writer = get_writer(out_path, output_format, tagset, fold)
        readers = get_readers(input_paths[fold - 1], input_format, tagset)
        sents = get_next_sents(readers)
        ref_reader = tools.get_reader(ref_paths[fold - 1], input_format, tagset)
        while sents.itervalues().next():
            ref_sent = ref_reader.get_next_sentence()
            ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent)
            result_sent = clone_sent(ref_asent)
            result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent)
            for chunk_name in ref_asent.all_channels():
                if chunk_name in chunk_names:
                    right_annots = []
                    # Index reference annotations by their first token index.
                    ref_annots = ref_asent.get_channel(chunk_name).make_annotation_vector()
                    ref = dict([(min(ann.indices), ann) for ann in ref_annots])
                    for chunker in chunkers:
                        ch_asent = corpus2.AnnotatedSentence.wrap_sentence(sents[chunker])
                        if ch_asent.has_channel(chunk_name):
                            ch_annots = ch_asent.get_channel(chunk_name).make_annotation_vector()
                            ch = dict([(min(ann.indices), ann) for ann in ch_annots])
                            # Keep chunker annotations that exactly match a reference annotation.
                            maybe_hits = set(ch).intersection(ref)
                            for idx in maybe_hits:
                                if list(ch[idx].indices) == list(ref[idx].indices) and [i for i in ch[idx].indices] not in right_annots:
                                    right_annots.append([i for i in ch[idx].indices])
                    # add right chunks
                    result_asent.create_channel(chunk_name)
                    chan = result_asent.get_channel(chunk_name)
                    for ann in right_annots:
                        seg_no = chan.get_new_segment_index()
                        for idx in ann:
                            chan.set_segment_at(idx, seg_no)
            result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent)
            writer.write_sentence(result_sent)
            sents = get_next_sents(readers)

if __name__ == '__main__':
    go()
\ No newline at end of file
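For orientation, the selection rule main() applies per sentence and channel can be sketched without corpus2: a reference annotation survives only if at least one chunker produced an annotation covering exactly the same token indices. The snippet below is a minimal, illustrative stand-in, not part of this commit; the function name and sample data are made up, and annotations are plain lists of token indices.

def oracle_select(ref_annots, chunker_annots):
    """ref_annots: list of token-index lists from the reference channel.
    chunker_annots: dict mapping chunker name -> list of token-index lists.
    Returns the reference spans reproduced exactly by at least one chunker."""
    ref = dict((min(ann), ann) for ann in ref_annots)
    right_annots = []
    for annots in chunker_annots.itervalues():
        ch = dict((min(ann), ann) for ann in annots)
        for idx in set(ch).intersection(ref):
            if list(ch[idx]) == list(ref[idx]) and list(ch[idx]) not in right_annots:
                right_annots.append(list(ch[idx]))
    return right_annots

# Reference chunks cover tokens [0, 1] and [4, 5, 6]; only [0, 1] is matched
# exactly by a chunker, so only it is kept -> prints [[0, 1]]
print oracle_select([[0, 1], [4, 5, 6]],
                    {'iobber': [[0, 1], [4, 5]], 'spejd': [[2, 3]]})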