Added is_ok_liner_after_iobber script

94f441d3 · jezozwierzak · 301b7263 · 94f441d3
Commit 94f441d3 authored 12 years ago by jezozwierzak
--- a/scripts/chunker_scripts/stats/is_ok_liner_after_iobber.py
+++ b/scripts/chunker_scripts/stats/is_ok_liner_after_iobber.py
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Created on 25-04-2013
+
+@author: Adam Pawlaczek
+'''
+from optparse import OptionParser
+import sys, os
+import corpus2
+from chunker_scripts import tools
+# Usage text given to OptionParser in go(); kept as adjacent literals so each
+# output line of the help message is visible on its own source line.
+descr = (
+    "%prog [options] liner_dir iobber_dir\n"
+    "\n"
+    "Program is checking if all chunks are ok after runing liner2.\n"
+    "liner_dir and iobber_dir have to contain the same file names"
+)
+
+def go():
+    """Parse the command line and run the comparison.
+
+    Exactly two positional arguments are expected: liner_dir and iobber_dir.
+    With any other number of arguments a short message is written to stderr
+    and the process exits with status 1. Otherwise both directories and the
+    parsed option values are handed to main().
+    """
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+        dest='input_format', default='ccl',
+        help='set the input format; default: ccl')
+    # The help text used to advertise a default of 10 although the real
+    # default is 1; the message now matches the actual value.
+    parser.add_option('-f', '--folds', type="int", action='store',
+        dest='folds', default=1,
+        help='Number of folds; default: 1')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+        dest='tagset', default='nkjp',
+        help='set the tagset used in input; default: nkjp')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 2:
+        sys.stderr.write('You need to provide liner_dir and iobber_dir.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    liner_dir, iobber_dir = args
+    main(liner_dir, iobber_dir, options.input_format, options.tagset, options.folds)
+
+def get_input_paths(in_path, folds, input_format):
+    """Build the list of input file paths for in_path.
+
+    With folds > 1, one path per fold (1..folds) is generated inside
+    in_path: 'ccl-testNN.xml' for the "ccl" format and 'testNN.xml' for
+    "xces", where NN is the fold number zero-padded to two digits. Any
+    other input_format yields an empty list.
+
+    With folds <= 1, in_path is walked recursively when it is a directory
+    and the path of every file found is returned; otherwise in_path itself
+    is returned as the only element.
+
+    Directory contents are returned in sorted order, so the result does not
+    depend on the order in which the file system happens to list entries.
+    """
+    if folds > 1:
+        # File-name prefix of the per-fold test files for each known format.
+        prefixes = {'ccl': 'ccl-test', 'xces': 'test'}
+        prefix = prefixes.get(input_format)
+        if prefix is None:
+            # Unknown format: no fold files are produced.
+            return []
+        return [os.path.join(in_path, prefix + str(fold).zfill(2) + '.xml')
+                for fold in range(1, folds + 1)]
+
+    if not os.path.isdir(in_path):
+        return [in_path]
+
+    input_paths = []
+    for (path, dirs, files) in os.walk(in_path):
+        # Sorting dirs in place makes os.walk descend in a stable order.
+        dirs.sort()
+        for file_name in sorted(files):
+            input_paths.append(os.path.join(path, file_name))
+    return input_paths
+    
+# Annotation channels whose chunks are compared between the two outputs.
+_CHUNK_CHANNELS = ("chunk_np", "chunk_agp", "chunk_adjp", "chunk_vp", "chunk_qp")
+
+
+def _collect_paths(root_dir):
+    """Return the sorted paths of all files yielded by tools.walklevel(root_dir)."""
+    paths = []
+    for (path, dirs, files) in tools.walklevel(root_dir):
+        for file_name in files:
+            paths.append(os.path.join(path, file_name))
+    # Sorted so that both directories are paired file-by-file by name rather
+    # than by whatever order the directory listing happens to return.
+    paths.sort()
+    return paths
+
+
+def _chunk_indices(asent, chan_name):
+    """Return the token-index list of every annotation in one channel."""
+    annotations = asent.get_channel(chan_name).make_annotation_vector()
+    return [list(ann.indices) for ann in annotations]
+
+
+def main(liner_dir, iobber_dir, input_format, tagset, folds):
+    """Check that liner output carries the same chunks as iobber output.
+
+    Files found under liner_dir and iobber_dir are paired in sorted path
+    order (both directories are expected to contain the same file names).
+    Each pair is announced on stdout and then read sentence by sentence in
+    lockstep. For every chunk channel of the iobber sentence that the liner
+    sentence also has, the token indices of all annotations must be equal.
+
+    Parameters:
+        liner_dir, iobber_dir -- directories with the files to compare
+        input_format -- format name passed to tools.get_reader
+        tagset -- tagset name resolved with corpus2.get_named_tagset
+        folds -- accepted for interface compatibility; not used here
+
+    Raises AssertionError on the first sentence whose chunks differ.
+    A differing number of files or sentences is reported on stderr as a
+    warning; the surplus items are not compared.
+    """
+    tg = corpus2.get_named_tagset(tagset)
+
+    liner_paths = _collect_paths(liner_dir)
+    iobber_paths = _collect_paths(iobber_dir)
+
+    # zip() stops at the shorter list, so say so instead of silently
+    # skipping the files that have no partner.
+    if len(liner_paths) != len(iobber_paths):
+        sys.stderr.write(
+            'Warning: %d files in %s but %d files in %s; '
+            'unpaired files are not checked.\n'
+            % (len(liner_paths), liner_dir, len(iobber_paths), iobber_dir))
+
+    for liner_path, iobber_path in zip(liner_paths, iobber_paths):
+        # Single formatted argument: prints the same under Python 2 and 3.
+        print('%s %s' % (liner_path, iobber_path))
+        reader_l = tools.get_reader(liner_path, input_format, tg)
+        reader_i = tools.get_reader(iobber_path, input_format, tg)
+
+        sent_l = reader_l.get_next_sentence()
+        sent_i = reader_i.get_next_sentence()
+
+        while sent_l and sent_i:
+            asent_l = corpus2.AnnotatedSentence.wrap_sentence(sent_l)
+            asent_i = corpus2.AnnotatedSentence.wrap_sentence(sent_i)
+
+            for chan_name in asent_i.all_channels():
+                if chan_name not in _CHUNK_CHANNELS:
+                    continue
+                # A channel missing on the liner side is skipped, not reported.
+                if not asent_l.has_channel(chan_name):
+                    continue
+
+                indices_l = _chunk_indices(asent_l, chan_name)
+                indices_i = _chunk_indices(asent_i, chan_name)
+
+                if indices_l != indices_i:
+                    # Raised explicitly (not via an assert statement) so the
+                    # check still runs when Python is started with -O.
+                    raise AssertionError(
+                        "Diffrent chunk " + chan_name + " in files "
+                        + liner_path + " " + iobber_path + " in sentence: "
+                        + " ".join(token.orth_utf8() for token in sent_l.tokens())
+                        + "\n VECTOR:" + str(indices_l)
+                        + "\n VECTOR:" + str(indices_i))
+
+            sent_l = reader_l.get_next_sentence()
+            sent_i = reader_i.get_next_sentence()
+
+        # Exactly one reader still has a sentence: the files differ in length.
+        if sent_l or sent_i:
+            sys.stderr.write(
+                'Warning: different number of sentences in %s and %s; '
+                'surplus sentences are not checked.\n'
+                % (liner_path, iobber_path))
+
+# Run the command-line entry point only when executed as a script,
+# not when the module is imported.
+if __name__ == "__main__":
+    go()