Added count_not_cont.py script

5c0f41a9 · jezozwierzak · 869cd580 · 5c0f41a9
Commit 5c0f41a9 authored 12 years ago by jezozwierzak
--- a/scripts/stats/count_not_cont.py
+++ b/scripts/stats/count_not_cont.py
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Created on Mar 25, 2013
+
+@author: Adam Pawlaczek
+'''
+from optparse import OptionParser
+import sys, os
+
+from csv_table2.csv_table import CsvTable
+
+import corpus2
+# Usage/help text handed to OptionParser. The synopsis names a single
+# positional argument because go() accepts exactly one (the corpus path).
+descr="""%prog [options] in_path
+
+The purpose of this program is to count number of not continous phases all types (chunk_np, chunk_vp)
+You can ofcource change the list of chunks checked by script. 
+It will print to stdout number of all chunks, not_cont_chunks, and percent of non-contiguous phrases"""
+
+def go():
+    '''
+    Command-line entry point: parse options and run main().
+
+    Exactly one positional argument is required: the corpus path handed to
+    main() as in_path. With any other number of positional arguments an
+    error is written to stderr and the process exits with status 1.
+    '''
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+        dest='input_format', default='ccl',
+        help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+        dest='output_format', default='ccl',
+        help='set the output format; default: ccl')
+    # NOTE: -O is accepted for command-line compatibility, but its value is
+    # not passed on to main().
+    parser.add_option('-O', '--output-file', type='string', action='store',
+        dest='out_path', default='',
+        help='set output filename (do not write to stdout)')
+    parser.add_option('-c', '--chunk-names', type='string', action='store',
+        dest='chunk_names', default='chunk_np,chunk_vp,chunk_adjp,chunk_agp',
+        help='set chunk_names to count')
+    parser.add_option('-f', '--folds', type="int", action='store',
+        dest='folds', default=1,
+        help='Number of folds')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+        dest='tagset', default='nkjp',
+        help='set the tagset used in input; default: nkjp')
+    (options, args) = parser.parse_args()
+
+    # Only one positional argument is consumed below, so the message must
+    # not ask for a second one.
+    if len(args) != 1:
+        sys.stderr.write('You need to provide exactly one corpus path.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    in_path = args[0]
+    main(in_path, options.input_format, options.output_format, options.chunk_names, options.folds, options.tagset)
+
+def is_continous(inds):
+    '''
+    Tell whether inds is an unbroken run of consecutive integers.
+
+    inds is expected to be sorted in ascending order. It is compared
+    against every integer from its first to its last element, so gaps and
+    duplicates both yield False. An empty sequence has no gap and is
+    reported as continuous.
+    '''
+    inds = list(inds)
+    if not inds:
+        return True
+    # list(...) keeps the comparison valid where range() is not a list.
+    return inds == list(range(inds[0], inds[-1] + 1))
+
+def get_input_paths(in_path, folds, input_format):
+    '''
+    Build the list of corpus files to process.
+
+    With folds > 1 the result holds one path per fold inside in_path:
+    'ccl-testNN.xml' for the "ccl" format and 'testNN.xml' for "xces"
+    (NN is the zero-padded fold number starting at 01); any other format
+    gives an empty list. Otherwise, if in_path is a directory, every file
+    found under it by os.walk is returned; if not, in_path itself is the
+    only entry.
+    '''
+    if folds > 1:
+        if input_format == "ccl":
+            prefix = 'ccl-test'
+        elif input_format == "xces":
+            prefix = 'test'
+        else:
+            return []
+        return [os.path.join(in_path, prefix + str(number).zfill(2) + '.xml')
+                for number in range(1, folds + 1)]
+
+    if not os.path.isdir(in_path):
+        return [in_path]
+
+    collected = []
+    for (dirpath, _subdirs, filenames) in os.walk(in_path):
+        collected.extend(os.path.join(dirpath, name) for name in filenames)
+    return collected
+
+def get_writer(out_path, output_format, tagset):
+    '''
+    Return a corpus2 token writer for the given format and tagset.
+
+    An empty out_path selects the stdout writer; otherwise the writer is
+    created for the file at out_path.
+    '''
+    if not out_path:
+        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+    return corpus2.TokenWriter.create_path_writer(
+        output_format, out_path, tagset)
+
+def get_reader(in_path, input_format, tagset):
+    '''
+    Return a corpus2 token reader for the given format and tagset.
+
+    An empty in_path selects the stdin reader; otherwise the reader is
+    created for the file at in_path.
+    '''
+    if not in_path:
+        return corpus2.TokenReader.create_stdin_reader(input_format, tagset)
+    return corpus2.TokenReader.create_path_reader(input_format, tagset, in_path)
+
+def get_output_path(out_path, basename = None):
+    '''
+    Return out_path, or out_path joined with basename when one is given.
+    '''
+    # Identity test: None is a singleton, so "is" is the reliable check.
+    if basename is None:
+        return out_path
+    return os.path.join(out_path, basename)
+    
+def _percent(part, whole):
+    '''Return part as a percentage of whole, or 0.0 when whole is zero.'''
+    if not whole:
+        return 0.0
+    return part * 100 / float(whole)
+
+def main(in_path, input_format, output_format, chunk_names, folds, tagset):
+    '''
+    Count annotations and non-continuous annotations in the input files.
+
+    in_path, folds and input_format select the files (see get_input_paths).
+    chunk_names is a comma-separated list of annotation channel names;
+    only annotations in those channels are counted. tagset is a tagset
+    name resolved through corpus2.get_named_tagset. output_format is
+    accepted but not used here.
+
+    For every file the path is printed, and after all files a table with
+    one "all n_c %" row per file followed by a row of per-file averages.
+    The percentage in the averages row is computed from the totals over
+    all files. A file without any counted annotation gets 0.0 percent.
+
+    Returns the list of per-file result dicts (keys 'all', 'n_c', '%')
+    with the averages dict appended as the last element.
+    '''
+    tagset = corpus2.get_named_tagset(tagset)
+    # Honour the requested channels instead of a single fixed one.
+    wanted_channels = set(
+        name.strip() for name in chunk_names.split(",") if name.strip())
+
+    input_paths = get_input_paths(in_path, folds, input_format)
+
+    results = []
+    for input_path in input_paths:
+        reader = get_reader(input_path, input_format, tagset)
+        print(input_path)
+        total = 0
+        not_cont = 0
+
+        sent = reader.get_next_sentence()
+        while sent:
+            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+
+            for chan_name in asent.all_channels():
+                if chan_name not in wanted_channels:
+                    continue
+                chan = asent.get_channel(chan_name)
+                for ann in chan.make_annotation_vector():
+                    if not is_continous(sorted(ann.indices)):
+                        not_cont += 1
+                    total += 1
+
+            sent = reader.get_next_sentence()
+
+        # _percent guards against files with no counted annotations.
+        results.append({'all': total, 'n_c': not_cont,
+                        '%': _percent(not_cont, total)})
+
+    file_count = len(results)
+    all_count = sum(fold['all'] for fold in results)
+    n_c_count = sum(fold['n_c'] for fold in results)
+
+    print('all n_c %')
+    for fold in results:
+        print('%s %s %s' % (fold['all'], fold['n_c'], fold['%']))
+    print('------------------------------------------------------------------')
+
+    if file_count:
+        # Float division throughout: truncating the averages before taking
+        # the ratio would distort the percentage.
+        avg_results = {'all': all_count / float(file_count),
+                       'n_c': n_c_count / float(file_count),
+                       '%': _percent(n_c_count, all_count)}
+    else:
+        avg_results = {'all': 0.0, 'n_c': 0.0, '%': 0.0}
+    print('%s %s %s' % (avg_results['all'], avg_results['n_c'], avg_results['%']))
+    results.append(avg_results)
+    return results
+    
+                 
+# Run the command-line interface only when executed as a script.
+if "__main__" == __name__:
+    go()