Skip to content
Snippets Groups Projects
Commit 5c0f41a9 authored by jezozwierzak's avatar jezozwierzak
Browse files

Added count_not_cont.py script

parent 869cd580
Branches
No related merge requests found
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on Mar 25, 2013
@author: Adam Pawlaczek
'''
from optparse import OptionParser
import sys, os
from csv_table2.csv_table import CsvTable
import corpus2
# Usage/help text passed to OptionParser. The script takes a single positional
# argument (the input corpus path), so the usage line lists only that one.
descr = """%prog [options] in_dir
The purpose of this program is to count the number of non-contiguous phrases of all types (chunk_np, chunk_vp)
You can of course change the list of chunks checked by the script.
It will print to stdout number of all chunks, not_cont_chunks, and percent of non-contiguous phrases"""
def go():
    """Parse the command line and run the chunk counting.

    Exactly one positional argument is expected: the input corpus path
    (a file or a directory).  With any other number of arguments a hint is
    written to stderr and the process exits with status 1.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='ccl',
                      help='set the input format; default: ccl')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='ccl',
                      help='set the output format; default: ccl')
    parser.add_option('-O', '--output-file', type='string', action='store',
                      dest='out_path', default='',
                      help='set output filename (do not write to stdout)')
    parser.add_option('-c', '--chunk-names', type='string', action='store',
                      dest='chunk_names',
                      default='chunk_np,chunk_vp,chunk_adjp,chunk_agp',
                      help='set chunk_names to count')
    parser.add_option('-f', '--folds', type="int", action='store',
                      dest='folds', default=1,
                      help='Number of folds')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='nkjp',
                      help='set the tagset used in input; default: nkjp')
    (options, args) = parser.parse_args()
    if len(args) != 1:
        # Only the input path is consumed below, so the message must not
        # ask for an output directory as well.
        sys.stderr.write('You need to provide exactly one argument: corpus_dir.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    in_path = args[0]
    # NOTE(review): -O/--output-file is parsed, but main() has no parameter
    # for it, so options.out_path is not forwarded.
    main(in_path, options.input_format, options.output_format,
         options.chunk_names, options.folds, options.tagset)
def is_continous(inds):
    """Tell whether ascending indices form a single unbroken run.

    ``inds`` is expected to be sorted in ascending order.  The result is
    True when it contains every integer from its first to its last element
    exactly once, and False otherwise.  An empty sequence has no gap and is
    therefore reported as contiguous.
    """
    if not inds:
        # Nothing to index into; avoid IndexError on inds[0].
        return True
    # Materialise both sides as lists so the comparison also works when
    # range() is lazy (Python 3) or when inds is a tuple.
    return list(inds) == list(range(inds[0], inds[-1] + 1))
def get_input_paths(in_path, folds, input_format):
    """Collect the corpus files that should be processed.

    With more than one fold, one file name per fold number (1..folds,
    zero-padded to two digits) is built inside ``in_path``:
    ``ccl-testNN.xml`` for the ``ccl`` format and ``testNN.xml`` for
    ``xces``; any other format yields no paths.

    Otherwise, if ``in_path`` is a directory every file found while walking
    it is returned; if not, ``in_path`` itself is the only entry.
    """
    if folds > 1:
        prefix_by_format = {"ccl": "ccl-test", "xces": "test"}
        prefix = prefix_by_format.get(input_format)
        if prefix is None:
            return []
        return [
            os.path.join(in_path, prefix + str(number).zfill(2) + '.xml')
            for number in range(1, folds + 1)
        ]

    if not os.path.isdir(in_path):
        return [in_path]

    collected = []
    for root, _subdirs, names in os.walk(in_path):
        collected.extend(os.path.join(root, name) for name in names)
    return collected
def get_writer(out_path, output_format, tagset):
    """Create a corpus2 token writer.

    A non-empty ``out_path`` selects a writer created for that path; an
    empty one selects a writer created for standard output.
    """
    if not out_path:
        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
    return corpus2.TokenWriter.create_path_writer(
        output_format, out_path, tagset)
def get_reader(in_path, input_format, tagset):
    """Create a corpus2 token reader.

    A non-empty ``in_path`` selects a reader created for that path; an
    empty one selects a reader created for standard input.
    """
    if not in_path:
        return corpus2.TokenReader.create_stdin_reader(input_format, tagset)
    return corpus2.TokenReader.create_path_reader(
        input_format, tagset, in_path)
def get_output_path(out_path, basename=None):
    """Return the output location for a file.

    Without ``basename`` the given ``out_path`` is returned unchanged;
    otherwise ``basename`` is joined onto ``out_path``.
    """
    # Identity check: None is a singleton, and ``==`` could be hijacked by
    # an object with a custom __eq__.
    if basename is None:
        return out_path
    return os.path.join(out_path, basename)
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    if not whole:
        return 0.0
    return part * 100 / float(whole)


def main(in_path, input_format, output_format, chunk_names, folds, tagset):
    """Count all and non-contiguous chunk annotations per input file.

    For every input file the annotations in the channels listed in
    ``chunk_names`` (a comma-separated string) are counted.  An annotation
    is non-contiguous when its sorted token indices have a gap.  One line
    per file and a final line of averages are printed to stdout.

    ``output_format`` is accepted for interface compatibility but is not
    used here.

    Returns the list of per-file result dicts (keys ``'all'``, ``'n_c'``,
    ``'%'``) followed by one dict holding the averages.
    """
    tagset = corpus2.get_named_tagset(tagset)
    # Honour the requested channels instead of a single hard-coded name.
    wanted = set(chunk_names.split(","))
    results = []
    for input_path in get_input_paths(in_path, folds, input_format):
        reader = get_reader(input_path, input_format, tagset)
        print(input_path)
        fold_results = {'all': 0, 'n_c': 0, '%': 0}
        sent = reader.get_next_sentence()
        while sent:
            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
            for chan_name in asent.all_channels():
                if chan_name not in wanted:
                    continue
                chan = asent.get_channel(chan_name)
                for ann in chan.make_annotation_vector():
                    inds = sorted(ann.indices)
                    if not is_continous(inds):
                        fold_results['n_c'] += 1
                    fold_results['all'] += 1
            sent = reader.get_next_sentence()
        # A file without any matching chunk must not abort the whole run.
        fold_results['%'] = _percent(fold_results['n_c'], fold_results['all'])
        results.append(fold_results)

    fold_count = len(results)
    all_count = sum(r['all'] for r in results)
    n_c_count = sum(r['n_c'] for r in results)

    print('all n_c %')
    for r in results:
        print('%s %s %s' % (r['all'], r['n_c'], r['%']))
    print('------------------------------------------------------------------')

    if fold_count:
        avg_all = all_count / float(fold_count)
        avg_n_c = n_c_count / float(fold_count)
    else:
        avg_all = avg_n_c = 0.0
    # The overall percentage comes from the totals; dividing truncated
    # per-file averages would distort (or zero out) the ratio.
    avg_results = {'all': avg_all, 'n_c': avg_n_c,
                   '%': _percent(n_c_count, all_count)}
    print('%s %s %s' % (avg_results['all'], avg_results['n_c'],
                        avg_results['%']))
    results.append(avg_results)
    return results
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script.
    go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment