Skip to content
Snippets Groups Projects
Commit 94f441d3 authored by jezozwierzak's avatar jezozwierzak
Browse files

Added is_ok_liner_after_iobber script

parent 301b7263
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 25-04-2013
@author: Adam Pawlaczek
'''
from optparse import OptionParser
import sys, os
import corpus2
from chunker_scripts import tools
# Usage text handed to OptionParser in go(); optparse substitutes %prog with
# the name of the running script when it prints the usage/help message.
descr="""%prog [options] liner_dir iobber_dir
Program is checking if all chunks are ok after runing liner2.
liner_dir and iobber_dir have to contain the same file names"""
def go():
    """Parse the command line and run the chunk comparison.

    Exactly two positional arguments are required: liner_dir and
    iobber_dir.  When a different number is given, a short message is
    written to stderr and the process exits with status 1.  Otherwise the
    parsed options and both directories are forwarded to main().
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='ccl',
                      help='set the input format; default: ccl')
    # The advertised default must agree with the `default` value below.
    parser.add_option('-f', '--folds', type="int", action='store',
                      dest='folds', default=1,
                      help='Number of folds default: 1')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        sys.stderr.write('You need to provide liner_dir and iobber_dir.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    liner_dir, iobber_dir = args
    main(liner_dir, iobber_dir, options.input_format, options.tagset,
         options.folds)
def get_input_paths(in_path, folds, input_format):
    """Build the list of input file paths for a corpus location.

    in_path      -- a directory or a single file path
    folds        -- number of folds; values above 1 switch to fold mode
    input_format -- "ccl" or "xces"; only consulted in fold mode

    In fold mode (folds > 1) the result holds one path per fold, named
    'ccl-testNN.xml' for "ccl" or 'testNN.xml' for "xces" (NN is the
    zero-padded fold number starting at 01) inside in_path.  Any other
    format yields an empty list.  The paths are built without checking
    that the files exist.

    Otherwise, when in_path is a directory, every file found by walking it
    recursively is returned; when it is not a directory, in_path itself is
    returned as the only element.
    """
    if folds > 1:
        # The format does not change between folds, so pick the file name
        # prefix once instead of re-testing it on every iteration.
        prefix = {"ccl": 'ccl-test', "xces": 'test'}.get(input_format)
        if prefix is None:
            return []
        return [os.path.join(in_path, prefix + str(fold).zfill(2) + '.xml')
                for fold in range(1, folds + 1)]

    if not os.path.isdir(in_path):
        return [in_path]

    input_paths = []
    for (path, dirs, files) in os.walk(in_path):
        # os.walk reports entries in arbitrary order; sorting both the
        # sub-directories (in place, which steers the walk) and the files
        # makes the result reproducible across runs and file systems.
        dirs.sort()
        for file_name in sorted(files):
            input_paths.append(os.path.join(path, file_name))
    return input_paths
# Annotation channels whose chunks are compared between the two corpora.
_CHECKED_CHANNELS = ("chunk_np", "chunk_agp", "chunk_adjp", "chunk_vp",
                     "chunk_qp")


def _collect_paths(directory):
    """Return the sorted paths of all files reported by tools.walklevel."""
    found = []
    for (path, dirs, files) in tools.walklevel(directory):
        for file_name in files:
            found.append(os.path.join(path, file_name))
    # Sorting makes the pairing in main() independent of listing order.
    found.sort()
    return found


def _chunk_indices(channel):
    """Return the token indices of each annotation in a channel as lists."""
    return [list(ann.indices) for ann in channel.make_annotation_vector()]


def main(liner_dir, iobber_dir, input_format, tagset, folds):
    """Check that chunk annotations agree between two directories.

    Files from liner_dir and iobber_dir are paired position by position
    after sorting each listing, and each pair is read sentence by sentence.
    For every checked chunk channel present in the iobber sentence that the
    liner sentence also has, the annotation index vectors must be equal.

    liner_dir    -- directory with the files produced after running liner2
    iobber_dir   -- directory with the reference iobber files
    input_format -- reader format name passed to tools.get_reader
    tagset       -- tagset name passed to corpus2.get_named_tagset
    folds        -- accepted for interface compatibility; not used here

    Raises AssertionError on the first differing channel.  Surplus files in
    either directory, and surplus sentences in either file of a pair, are
    not examined.
    """
    tg = corpus2.get_named_tagset(tagset)
    liner_paths = _collect_paths(liner_dir)
    iobber_paths = _collect_paths(iobber_dir)

    for liner_path, iobber_path in zip(liner_paths, iobber_paths):
        # Single-argument form: prints the same text under Python 2 and
        # is also valid syntax under Python 3.
        print("%s %s" % (liner_path, iobber_path))
        reader_l = tools.get_reader(liner_path, input_format, tg)
        reader_i = tools.get_reader(iobber_path, input_format, tg)
        sent_l = reader_l.get_next_sentence()
        sent_i = reader_i.get_next_sentence()
        while sent_l and sent_i:
            asent_l = corpus2.AnnotatedSentence.wrap_sentence(sent_l)
            asent_i = corpus2.AnnotatedSentence.wrap_sentence(sent_i)
            for chan_name in asent_i.all_channels():
                if chan_name not in _CHECKED_CHANNELS:
                    continue
                # A channel missing on the liner side is skipped, not
                # reported.
                if not asent_l.has_channel(chan_name):
                    continue
                chan_l = asent_l.get_channel(chan_name)
                chan_i = asent_i.get_channel(chan_name)
                ann_vec_l_indices = _chunk_indices(chan_l)
                ann_vec_i_indices = _chunk_indices(chan_i)
                # Raise explicitly: a bare assert is removed under
                # `python -O`, which would disable the whole check.
                if ann_vec_l_indices != ann_vec_i_indices:
                    raise AssertionError(
                        "Diffrent chunk " + chan_name + " in files "
                        + liner_path + " " + iobber_path + " in sentence: "
                        + " ".join(token.orth_utf8()
                                   for token in sent_l.tokens())
                        + "\n VECTOR:" + str(ann_vec_l_indices)
                        + "\n VECTOR:" + str(ann_vec_i_indices))
            sent_l = reader_l.get_next_sentence()
            sent_i = reader_i.get_next_sentence()
# Run the command-line entry point only when executed as a script.
if "__main__" == __name__:
    go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment