diff --git a/.gitignore b/.gitignore index 346a2a7c7498d9b6da36a537e49d8d04f70d4db2..ad2f035596b5886a8f64df710b536a26736aa513 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ # Compiled source # ################### *.pyc +build +iobber.egg-info +.project +.settings +dist +scripts/chunker_scripts.egg-info diff --git a/scripts/chunker_scripts/chunk_eval/chunk_eval.py b/scripts/chunker_scripts/chunk_eval/chunk_eval.py index cda686412eb97e17136401e155876bcd03d503b2..b26072e8d143866d7ef88fe0542d2642089cb3c2 100755 --- a/scripts/chunker_scripts/chunk_eval/chunk_eval.py +++ b/scripts/chunker_scripts/chunk_eval/chunk_eval.py @@ -19,6 +19,7 @@ Created on 01-08-2012 from optparse import OptionParser import corpus2 +import wccl import sys, os from chunker_scripts.csv_table.CSVTable import CSVTable import codecs @@ -128,7 +129,8 @@ def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, global all_hits global all_chunked - chan_names = chan_names.split(",") + if not isinstance(chan_names, list): + chan_names = chan_names.split(",") chunkTable = CSVTable(";") chunkTable.addColumn('Nr') @@ -150,8 +152,9 @@ def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, bothTable.addSubColumn(chan_name, "P", type="float") bothTable.addSubColumn(chan_name, "R", type="float") bothTable.addSubColumn(chan_name, "F", type="float") - - tagset = corpus2.get_named_tagset(tagset) + + if not isinstance(tagset, wccl.Tagset): + tagset = corpus2.get_named_tagset(tagset) for fold in range(1, folds+1): if folds > 1: diff --git a/scripts/chunker_scripts/chunker_checkers/super_classifier.py b/scripts/chunker_scripts/chunker_checkers/super_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..ad38aeb662753064dbc873f9cb90e70df715af28 --- /dev/null +++ b/scripts/chunker_scripts/chunker_checkers/super_classifier.py @@ -0,0 +1,108 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 02-05-2013 + +@author: Adam Pawlaczek +''' +import codecs + +from optparse import OptionParser +import sys, os +import corpus2 + +from chunker_scripts import tools +from chunker_checker import ChunkerChecker +from chunker_scripts.chunk_eval.chunk_eval import main as chunk_eval +import iobber.iobber as iobber + +descr = """%prog [options] [in_dir] [out_dir] +in_dir has to contain subdirs with folds chunked by individual chunkers. +Each subdir should be named after the chunker that produced the files in it.
+""" + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='ccl', + help='set the output format; default: ccl') + parser.add_option('--config', type='string', action='store', + dest='config', default='kpwr.ini', + help='set iobber config; default: kpwr.ini') + parser.add_option('-c', '--chunk-names', type='string', action='store', + dest='chunk_names', default='chunk_np', + help='set chunk_names to eval') + parser.add_option('--chunkers', type='string', action='store', + dest='chunkers', default='', + help='set chunkers to eval') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1, + help='Number of folds') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('--file-prefix', type='string', action='store', + dest='file_prefix', default='ccl-', + help='set the file prefix; default: ccl-') + parser.add_option('--file-ext', type='string', action='store', + dest='file_ext', default='.xml', + help='set the file extention; default: .xml') + parser.add_option('-v', '--verbose', action='store_true', + default=False, dest='verbose') + (options, args) = parser.parse_args() + + if len(args) != 2: + sys.stderr.write('You need to provide a in_dir, out_dir and chunk_names and chunkers.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_path, out_path = args + main(in_path, out_path, options.input_format, options.output_format, + options.chunk_names, options.chunkers, options.folds, + options.tagset, options.file_prefix, options.file_ext, + options.verbose, options.config) + +class SuperClassifier(ChunkerChecker): + + def __init__(self, input_format, output_format, chunk_names, chunkers, folds, tagset, file_prefix, file_ext, verbose, config): + super(Iobber_v1, self).__init__(input_format, output_format, chunk_names, chunkers, folds, tagset, file_prefix, file_ext, verbose) + self.config = config + self.chunkers = chunkers.split(",") + + def create_directories(self): + self.dirs['models_path'] = os.path.join(self.dirs['out_dir'], 'models') + self.dirs['chunked_path'] = os.path.join(self.dirs['out_dir'], 'chunked') + self.dirs['nochann_path'] = os.path.join(self.dirs['out_dir'], 'nochann') + tools.mkdir_p(self.dirs['models_path']) + tools.mkdir_p(self.dirs['chunked_path']) + tools.mkdir_p(self.dirs['nochann_path']) + + def process_fold(self, fold): + num = str(fold).zfill(2) + self.train_fold(os.path.join(self.dirs['in_dir'], self.file_prefix + 'train' + num + self.file_ext), + os.path.join(self.dirs['models_path'], num)) + + def train_fold(self, in_path, model_path): + tr_file = codecs.open(os.path.join(model_path, 'model.tr'), 'wb', 'utf-8') + reader = tools.get_reader(in_path, self.input_format, self.tagset) + + sent = reader.get_next_sentence() + while sent: + asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + for chunk_name in asent.all_channels(): + chan = asent.get_channel(chunk_name) + + sent = reader.get_next_sentence() + + tr_file.close() + +def main(in_path, out_path, input_format, output_format, chunk_names, folds, tagset, file_prefix, file_ext, verbose, config): + sc = SuperClassifier(input_format, output_format, chunk_names, folds, tagset, 
file_prefix, file_ext, verbose, config) + sc.process_folds(in_path, out_path) + +if __name__ == '__main__': + go() + \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/__init__.py b/scripts/chunker_scripts/experiments/_tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/crf/ccl-test01.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/crf/ccl-test01.xml new file mode 100644 index 0000000000000000000000000000000000000000..92f0f25633e2b5f98a6f96011cc3c1d4d25628dc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/crf/ccl-test01.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk id="ch1" type="p"> + <sentence id="sent3"> + <tok> + <orth>Jako</orth> + <lex disamb="1"><base>jako</base><ctag>conj</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>pierwszy</orth> + <lex disamb="1"><base>pierwszy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>głos</orth> + <lex disamb="1"><base>głos</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>zabierze</orth> + <lex disamb="1"><base>zabrać</base><ctag>fin:sg:ter:perf</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>poseł</orth> + <lex disamb="1"><base>poseł</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np" head="1">1</ann> + </tok> + <tok> + <orth>Arkady</orth> + <lex disamb="1"><base>Arkady</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <tok> + <orth>Fiedler</orth> + <lex disamb="1"><base>Fiedler</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex disamb="1"><base>,</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>Platforma</orth> + <lex disamb="1"><base>platforma</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="chunk_np" head="1">2</ann> + </tok> + <tok> + <orth>Obywatelska</orth> + <lex disamb="1"><base>obywatelski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="chunk_np">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test01.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test01.xml new file mode 100644 index 0000000000000000000000000000000000000000..92f0f25633e2b5f98a6f96011cc3c1d4d25628dc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test01.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk id="ch1" type="p"> + <sentence id="sent3"> + <tok> + <orth>Jako</orth> + <lex disamb="1"><base>jako</base><ctag>conj</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>pierwszy</orth> + <lex disamb="1"><base>pierwszy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>głos</orth> + <lex disamb="1"><base>głos</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + 
<orth>zabierze</orth> + <lex disamb="1"><base>zabrać</base><ctag>fin:sg:ter:perf</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>poseł</orth> + <lex disamb="1"><base>poseł</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np" head="1">1</ann> + </tok> + <tok> + <orth>Arkady</orth> + <lex disamb="1"><base>Arkady</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <tok> + <orth>Fiedler</orth> + <lex disamb="1"><base>Fiedler</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex disamb="1"><base>,</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>Platforma</orth> + <lex disamb="1"><base>platforma</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="chunk_np" head="1">2</ann> + </tok> + <tok> + <orth>Obywatelska</orth> + <lex disamb="1"><base>obywatelski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="chunk_np">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test02.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test02.xml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/ref/ccl-test01.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/ref/ccl-test01.xml new file mode 100644 index 0000000000000000000000000000000000000000..92f0f25633e2b5f98a6f96011cc3c1d4d25628dc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/oracle_data/ref/ccl-test01.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk id="ch1" type="p"> + <sentence id="sent3"> + <tok> + <orth>Jako</orth> + <lex disamb="1"><base>jako</base><ctag>conj</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>pierwszy</orth> + <lex disamb="1"><base>pierwszy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>głos</orth> + <lex disamb="1"><base>głos</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>zabierze</orth> + <lex disamb="1"><base>zabrać</base><ctag>fin:sg:ter:perf</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>poseł</orth> + <lex disamb="1"><base>poseł</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np" head="1">1</ann> + </tok> + <tok> + <orth>Arkady</orth> + <lex disamb="1"><base>Arkady</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <tok> + <orth>Fiedler</orth> + <lex disamb="1"><base>Fiedler</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex disamb="1"><base>,</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>Platforma</orth> + <lex disamb="1"><base>platforma</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="chunk_np" head="1">2</ann> + </tok> + <tok> + <orth>Obywatelska</orth> + <lex disamb="1"><base>obywatelski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="chunk_np">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + 
<ann chan="chunk_np">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/test_oracle.py b/scripts/chunker_scripts/experiments/_tests/test_oracle.py new file mode 100644 index 0000000000000000000000000000000000000000..0db64b9313f9960e73c8c022a2199adf44503adc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/test_oracle.py @@ -0,0 +1,134 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 13-05-2013 + +@author: Adam Pawlaczek +''' +import shutil +import unittest +import os + +import corpus2 + +from chunker_scripts.experiments.oracle import Oracle +from chunker_scripts import tools + +class TestOracleWholeChunks(unittest.TestCase): + + def setUp(self): + self.oracle = Oracle("ccl", "ccl", "chunk_np", "crf,spejd", 1, "nkjp") + shutil.copytree("oracle_data", "oracle_data_backup") + + def tearDown(self): + shutil.rmtree("oracle_data") + shutil.copytree("oracle_data_backup", "oracle_data") + shutil.rmtree("oracle_data_backup") + if os.path.exists("out"): + shutil.rmtree("out") + + def test_default_params(self): + self.assertEqual(self.oracle.chunk_names, ["chunk_np"]) + self.assertEqual(self.oracle.chunkers, ["crf", "spejd"]) + + def test_process(self): + self.oracle.process("oracle_data/chunked", "oracle_data/ref", "out") + + def test_get_ref_paths(self): + ref_paths = self.oracle.get_ref_paths("oracle_data/ref") + self.assertEqual(ref_paths, ["oracle_data/ref/ccl-test01.xml"]) + + def test_get_input_paths(self): + in_paths = self.oracle.get_input_paths("oracle_data/chunked") + self.assertEqual(in_paths, [{'spejd': 'oracle_data/chunked/spejd/ccl-test01.xml', 'crf': 'oracle_data/chunked/crf/ccl-test01.xml'}]) + + def test_get_writer(self): + tools.mkdir_p("out") + self.assertNotEqual(self.oracle.get_writer("out", 1), None) + + def test_get_readers(self): + readers = self.oracle.get_readers(self.oracle.get_input_paths("oracle_data/chunked")[0]) + self.assertEqual(len(readers), len(self.oracle.chunkers)) + + def test_clone_sent(self): + readers = self.oracle.get_readers(self.oracle.get_input_paths("oracle_data/chunked")[0]) + sent = readers["crf"].get_next_sentence() + sent2 = self.oracle.clone_sent(sent) + sent1_tokens = " ".join([tok.orth_utf8() for tok in sent.tokens()]) + sent2_tokens = " ".join([tok.orth_utf8() for tok in sent2.tokens()]) + self.assertEqual(sent1_tokens, sent2_tokens) + + def test_get_annots_first_idx(self): + readers = self.oracle.get_readers(self.oracle.get_input_paths("oracle_data/chunked")[0]) + sent = readers["crf"].get_next_sentence() + asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + channel = asent.get_channel("chunk_np") + idxs = self.oracle.get_annots_first_idx(channel) + self.assertEqual(set(idxs.keys()),set([4, 8])) + + def test_get_right_annots(self): + sent1 = corpus2.Sentence.create_sent("1") + for i in range(10): + token = corpus2.Token.create_utf8("jakiś_orth") + sent1.append(token) + asent1 = corpus2.AnnotatedSentence.wrap_sentence(sent1) + asent1.create_channel("test_chunk") + chan = asent1.get_channel("test_chunk") + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(0, seg_no) + chan.set_segment_at(1, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(3, seg_no) + chan.set_segment_at(4, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(5, seg_no) + chan.set_segment_at(6, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(8, seg_no) + chan.set_segment_at(9, seg_no) + 
+ sent2 = corpus2.Sentence.create_sent("2") + for i in range(10): + token = corpus2.Token.create_utf8("jakiś_orth") + sent2.append(token) + asent2 = corpus2.AnnotatedSentence.wrap_sentence(sent2) + asent2.create_channel("test_chunk") + chan = asent2.get_channel("test_chunk") + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(0, seg_no) + chan.set_segment_at(1, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(2, seg_no) + chan.set_segment_at(3, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(4, seg_no) + chan.set_segment_at(5, seg_no) + chan.set_segment_at(6, seg_no) + chan.set_segment_at(7, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(8, seg_no) + + sent1 = corpus2.AnnotatedSentence.cast_as_sentence(asent1) + sent2 = corpus2.AnnotatedSentence.cast_as_sentence(asent2) + + right_annots = self.oracle.get_right_annots(sent1, {"crf":sent2}, "test_chunk") + + self.assertEqual(right_annots, [[0,1]]) + + def test_is_same_chunk(self): + self.assertTrue(self.oracle.is_same_chunk([4, 5, 6], [4, 5, 6])) + self.assertFalse(self.oracle.is_same_chunk([4, 5], [4, 5, 6])) + self.assertFalse(self.oracle.is_same_chunk([4, 5, 6, 7], [3, 4, 5, 6])) + self.assertFalse(self.oracle.is_same_chunk([4, 5, 6], [8, 9, 10])) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/iobber_v1.py b/scripts/chunker_scripts/experiments/iobber_v1.py deleted file mode 100755 index 6852ab2fd31411a0ebb7522f73a38b9525feb5f2..0000000000000000000000000000000000000000 --- a/scripts/chunker_scripts/experiments/iobber_v1.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/python -#-*- coding: utf-8 -*- -''' -Created on 05-07-2012 - -@author: jezozwierzak -''' - -import os, shutil, sys -from chunk_eval.chunk_eval import main as chunk_eval -from optparse import OptionParser -import tools -descr = """%prog [options] [in_dir] [out_dir] -""" -# in_dir musi zawierać pliki test*.xml oraz train*.xml -# -def go(): - parser = OptionParser(usage=descr) - parser.add_option('-i', '--input-format', type='string', action='store', - dest='input_format', default='ccl', - help='set the input format; default: ccl') - parser.add_option('-o', '--output-format', type='string', action='store', - dest='output_format', default='ccl', - help='set the output format; default: ccl') - parser.add_option('-t', '--tagset', type='string', action='store', - dest='tagset', default='nkjp', - help='set the tagset used in input; default: nkjp') - parser.add_option('-c', '--config', type='string', action='store', - dest='config', default='kpwr.ini', - help='Set path to config file; default: kpwr.ini') - parser.add_option('--chunk-names', type='string', action='store', - dest='chunk_names', default='chunk_np,chunk_vp,chunk_agp,chunk_adjp,chunk_qp', - help='set chunk_names to eval') - (options, args) = parser.parse_args() - - if len(args) != 2: - sys.stderr.write('You need to provide corpus_dir and out_dir.\n') - sys.stderr.write('See %s --help\n' % sys.argv[0]) - sys.exit(1) - - in_dir, out_dir = args - main(in_dir, out_dir, options.chunk_names, options.config, options.tagset, options.input_format, options.output_format) - -def main(in_dir, out_dir, chunk_names, config, tagset, input_format, output_format): - #Stworzenie katalogu wyjściowego - tools.mkdir_p(out_dir) - #Przekopiowanie plików train oraz test - for i in range(1, 11): - print "Przetwarzanie foldu nr: ", i, 
"------------------------------------------------------------------" - ccl_test = 'ccl-test' + str(i).zfill(2) + '.xml' - ccl_train = 'ccl-train' + str(i).zfill(2) + '.xml' - print 'Kopiowanie foldu ze źródła' - shutil.copy(os.path.join(in_dir, ccl_test), os.path.join(out_dir, ccl_test)) - shutil.copy(os.path.join(in_dir, ccl_train), os.path.join(out_dir, ccl_train)) - print 'Usuwanie tagów iob' - #Usunięcie channelów z plików test - tools.mkdir_p(os.path.join(out_dir,'nochann')) - shutil.copy(os.path.join(out_dir, ccl_test), os.path.join(out_dir, 'nochann', ccl_test)) - tools.remove_channels_except(os.path.join(out_dir, 'nochann', ccl_test), ["chunk_np_splited"]) - #train iobber - print 'Trenowanie iobbera' - tools.mkdir_p(os.path.join(out_dir, 'iobber_model', str(i).zfill(2))) - tools.train_iobber(config, os.path.join(out_dir, 'iobber_model', str(i).zfill(2)), os.path.join(out_dir, ccl_train)) - #chunk test files - print 'Chunkowanie' - tools.mkdir_p(os.path.join(out_dir, 'chunked')) - tools.run_iobber(config, os.path.join(out_dir, 'nochann', ccl_test), os.path.join(out_dir, 'chunked', ccl_test), os.path.join(out_dir, 'iobber_model', str(i).zfill(2))) - #diseval - print 'Porównanie wyników chunkowania z plikiami reference' - chunk_eval(os.path.join(out_dir, 'chunked'), out_dir, chunk_names, output_format, os.path.join(out_dir, 'results.csv'), tagset, True, 10) - -#in_path, out_path, in_format, out_format, tagset -if __name__ == '__main__': - go() - -# main(args[0], args[1]) -# main('/home/jezozwierzak/range/remove_not_cont/out_folds', '/home/jezozwierzak/range/remove_not_cont/iobber_v1') - diff --git a/scripts/chunker_scripts/experiments/oracle.py b/scripts/chunker_scripts/experiments/oracle.py index ed373ecd42f4e61f211498b085f8945ff0deeb25..04d6601237a8cb7ff340b362844b9e383e86362c 100755 --- a/scripts/chunker_scripts/experiments/oracle.py +++ b/scripts/chunker_scripts/experiments/oracle.py @@ -43,124 +43,140 @@ def go(): sys.exit(1) in_path, ref_path, out_path = args - main(in_path, ref_path, out_path, options.input_format, options.output_format, + oracle = Oracle(options.input_format, options.output_format, options.chunk_names, options.chunkers, options.folds, options.tagset) + oracle.process(in_path, ref_path, out_path) -def get_ref_paths(in_path, folds, input_format): - input_paths = [] - if folds > 1: - for fold in range(1, folds+1): - if input_format == "ccl": - input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) - elif input_format == "xces": - input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) - else: - if(os.path.isdir(in_path)): - for (path, dirs, files) in os.walk(in_path): - for file in files: - input_paths.append(os.path.join(path, file)) +class Oracle: + + def __init__(self, input_format, output_format, chunk_names, chunkers, folds, tagset): + self.input_format = input_format + self.output_format = output_format + self.chunk_names = chunk_names.split(",") + self.chunkers = chunkers.split(",") + self.folds = folds + self.tagset = corpus2.get_named_tagset(tagset) + + def get_ref_paths(self, ref_path): + input_paths = [] + if self.folds > 1: + for fold in range(1, self.folds+1): + if self.input_format == "ccl": + input_paths.append(os.path.join(ref_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + elif self.input_format == "xces": + input_paths.append(os.path.join(ref_path, 'test' + str(fold).zfill(2) + '.xml')) else: - input_paths.append(in_path) - return input_paths - -def get_input_paths(in_path, folds, 
input_format, chunkers): - input_paths = [] - for fold in range(1, folds+1): - fold_inputs = {} - for chunker in chunkers: - if os.path.isdir(os.path.join(in_path, chunker)): - if input_format == "ccl": - fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml') - elif input_format == "xces": - fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml') + if(os.path.isdir(ref_path)): + for (path, dirs, files) in os.walk(ref_path): + for file in files: + input_paths.append(os.path.join(path, file)) else: - print os.path.join(in_path, chunker), " dir doesn't exist" - input_paths.append(fold_inputs) - return input_paths + input_paths.append(ref_path) + return input_paths -def get_writer(out_path, output_format, tagset, fold): - out_path = get_output_path(out_path, fold, output_format) - return corpus2.TokenWriter.create_path_writer(output_format, out_path, - tagset) + def get_input_paths(self, in_path): + input_paths = [] + for fold in range(1, self.folds+1): + fold_inputs = {} + for chunker in self.chunkers: + if os.path.isdir(os.path.join(in_path, chunker)): + if self.input_format == "ccl": + fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif self.input_format == "xces": + fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml') + else: + print os.path.join(in_path, chunker), " dir doesn't exist" + input_paths.append(fold_inputs) + return input_paths -def get_output_path(out_path, fold, output_format): - if output_format == "ccl": - return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml') - elif input_format == "xces": - return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml') + def get_output_path(self, out_path, fold): + if self.output_format == "ccl": + return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif self.output_format == "xces": + return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml') -def get_readers(in_paths, input_format, tagset): - readers = {} - for chunker, in_path in in_paths.iteritems(): - readers[chunker] = tools.get_reader(in_path, input_format, tagset) - return readers + def get_writer(self, out_path, fold): + out_path = self.get_output_path(out_path, fold) + return corpus2.TokenWriter.create_path_writer(self.output_format, out_path, + self.tagset) + + def get_readers(self, in_paths): + readers = {} + for chunker, in_path in in_paths.iteritems(): + readers[chunker] = tools.get_reader(in_path, self.input_format, self.tagset) + return readers -def get_next_sents(readers): - result = {} - for chunker, reader in readers.iteritems(): - result[chunker] = reader.get_next_sentence() - return result + def get_next_sents(self, readers): + result = {} + for chunker, reader in readers.iteritems(): + result[chunker] = reader.get_next_sentence() + return result -def clone_sent(sent): - new_sent = corpus2.Sentence.create_sent(sent.id()) - for tok_idx, tok in enumerate(sent.tokens()): - tok = sent.tokens()[tok_idx] - if any(lex.is_disamb() for lex in tok.lexemes()): - new_sent.append(tok.clone()) - return new_sent + def clone_sent(self, sent): + new_sent = corpus2.Sentence.create_sent(sent.id()) + for tok_idx, tok in enumerate(sent.tokens()): + tok = sent.tokens()[tok_idx] + if any(lex.is_disamb() for lex in tok.lexemes()): + new_sent.append(tok.clone()) + return new_sent - -def main(in_path, ref_path, out_path, input_format, output_format, chunk_names, chunkers, folds, 
tagset): - tagset = corpus2.get_named_tagset(tagset) - chunk_names = chunk_names.split(",") - chunkers = chunkers.split(",") + def get_annots_first_idx(self, channel): + annots = channel.make_annotation_vector() + return dict([(min(ann.indices), ann) for ann in annots]) - ref_paths = get_ref_paths(ref_path, folds, input_format) - input_paths = get_input_paths(in_path, folds, input_format, chunkers) + def is_same_chunk(self, ch, ref): + return list(ch) == list(ref) - for fold in range(1, folds+1): - writer = get_writer(out_path, output_format, tagset, fold) - - readers = get_readers(input_paths[fold-1], input_format, tagset) - sents = get_next_sents(readers) - ref_reader = tools.get_reader(ref_paths[fold-1], input_format, tagset) + def get_right_annots(self, ref_sent, chunked_sents, chunk_name): + result = [] + ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) + ref_idxs = self.get_annots_first_idx(ref_asent.get_channel(chunk_name)) - while sents.itervalues().next(): - ref_sent = ref_reader.get_next_sentence() - ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) - - result_sent = clone_sent(ref_asent) - result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent) - - for chunk_name in ref_asent.all_channels(): - if chunk_name in chunk_names: - right_annots = [] - ref_annots = ref_asent.get_channel(chunk_name).make_annotation_vector() - ref = dict([(min(ann.indices), ann) for ann in ref_annots]) - - for chunker in chunkers: - ch_asent = corpus2.AnnotatedSentence.wrap_sentence(sents[chunker]) - if ch_asent.has_channel(chunk_name): - ch_annots = ch_asent.get_channel(chunk_name).make_annotation_vector() - ch = dict([(min(ann.indices), ann) for ann in ch_annots]) - - maybe_hits = set(ch).intersection(ref) - for idx in maybe_hits: - if list(ch[idx].indices) == list(ref[idx].indices) and [i for i in ch[idx].indices] not in right_annots: - right_annots.append([i for i in ch[idx].indices]) - - #add right chunks - result_asent.create_channel(chunk_name) - chan = result_asent.get_channel(chunk_name) - for ann in right_annots: - seg_no = chan.get_new_segment_index() - for idx in ann: - chan.set_segment_at(idx, seg_no) - - result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent) - writer.write_sentence(result_sent) - sents = get_next_sents(readers) + for chunker in chunked_sents.keys(): + ch_asent = corpus2.AnnotatedSentence.wrap_sentence(chunked_sents[chunker]) + ch_idxs = self.get_annots_first_idx(ch_asent.get_channel(chunk_name)) if ch_asent.has_channel(chunk_name) else {} # a chunker may lack this channel (guard kept from the old code) + maybe_hits = set(ch_idxs).intersection(ref_idxs) + for idx in maybe_hits: + chunk = [i for i in ch_idxs[idx].indices] + if self.is_same_chunk(ch_idxs[idx].indices, ref_idxs[idx].indices) and chunk not in result: + result.append(chunk) + return result + + def process(self, in_path, ref_path, out_path): + if not os.path.exists(out_path): + tools.mkdir_p(out_path) + ref_paths = self.get_ref_paths(ref_path) + input_paths = self.get_input_paths(in_path) + + for fold in range(1, self.folds+1): + writer = self.get_writer(out_path, fold) + readers = self.get_readers(input_paths[fold-1]) + ref_reader = tools.get_reader(ref_paths[fold-1], self.input_format, self.tagset) + sents = self.get_next_sents(readers) + while sents.itervalues().next(): + + ref_sent = ref_reader.get_next_sentence() + ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) + + result_sent = self.clone_sent(ref_asent) + result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent) + + for chunk_name in ref_asent.all_channels(): + if chunk_name in 
self.chunk_names: + right_annots = self.get_right_annots(ref_sent, sents, chunk_name) + + result_asent.create_channel(chunk_name) + chan = result_asent.get_channel(chunk_name) + for ann in right_annots: + seg_no = chan.get_new_segment_index() + for idx in ann: + chan.set_segment_at(idx, seg_no) + + result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent) + writer.write_sentence(result_sent) + sents = self.get_next_sents(readers) + if __name__ == '__main__': go() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/oracle_old.py b/scripts/chunker_scripts/experiments/oracle_old.py new file mode 100755 index 0000000000000000000000000000000000000000..ed373ecd42f4e61f211498b085f8945ff0deeb25 --- /dev/null +++ b/scripts/chunker_scripts/experiments/oracle_old.py @@ -0,0 +1,166 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 19-02-2013 + +@author: jezozwierzak +''' +from optparse import OptionParser +import sys, os +import corpus2 +from chunker_scripts import tools + +descr = """%prog [options] [in_dir] [ref_dir] [out_dir] +in_dir has to contain subdirs with folds chunked by individual chunkers. +Subdir should be named as chunker which chunked files in it. +""" + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='ccl', + help='set the output format; default: ccl') + parser.add_option('-c', '--chunk-names', type='string', action='store', + dest='chunk_names', default='', + help='set chunk_names to eval') + parser.add_option('--chunkers', type='string', action='store', + dest='chunkers', default='', + help='set chunkers to eval') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1, + help='Number of folds') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + (options, args) = parser.parse_args() + + if len(args) != 3 and options.chunk_names == '' and options.chunkers == '': + sys.stderr.write('You need to provide a in_dir, ref_dir and out_dir and chunk_names and chunkers.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_path, ref_path, out_path = args + main(in_path, ref_path, out_path, options.input_format, options.output_format, + options.chunk_names, options.chunkers, options.folds, options.tagset) + +def get_ref_paths(in_path, folds, input_format): + input_paths = [] + if folds > 1: + for fold in range(1, folds+1): + if input_format == "ccl": + input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + elif input_format == "xces": + input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) + else: + if(os.path.isdir(in_path)): + for (path, dirs, files) in os.walk(in_path): + for file in files: + input_paths.append(os.path.join(path, file)) + else: + input_paths.append(in_path) + return input_paths + +def get_input_paths(in_path, folds, input_format, chunkers): + input_paths = [] + for fold in range(1, folds+1): + fold_inputs = {} + for chunker in chunkers: + if os.path.isdir(os.path.join(in_path, chunker)): + if input_format == "ccl": + fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif input_format == "xces": + 
fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml') + else: + print os.path.join(in_path, chunker), " dir doesn't exist" + input_paths.append(fold_inputs) + return input_paths + +def get_writer(out_path, output_format, tagset, fold): + out_path = get_output_path(out_path, fold, output_format) + return corpus2.TokenWriter.create_path_writer(output_format, out_path, + tagset) + +def get_output_path(out_path, fold, output_format): + if output_format == "ccl": + return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif input_format == "xces": + return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml') + +def get_readers(in_paths, input_format, tagset): + readers = {} + for chunker, in_path in in_paths.iteritems(): + readers[chunker] = tools.get_reader(in_path, input_format, tagset) + return readers + +def get_next_sents(readers): + result = {} + for chunker, reader in readers.iteritems(): + result[chunker] = reader.get_next_sentence() + return result + +def clone_sent(sent): + new_sent = corpus2.Sentence.create_sent(sent.id()) + for tok_idx, tok in enumerate(sent.tokens()): + tok = sent.tokens()[tok_idx] + if any(lex.is_disamb() for lex in tok.lexemes()): + new_sent.append(tok.clone()) + return new_sent + + +def main(in_path, ref_path, out_path, input_format, output_format, chunk_names, chunkers, folds, tagset): + tagset = corpus2.get_named_tagset(tagset) + chunk_names = chunk_names.split(",") + chunkers = chunkers.split(",") + + ref_paths = get_ref_paths(ref_path, folds, input_format) + input_paths = get_input_paths(in_path, folds, input_format, chunkers) + + for fold in range(1, folds+1): + writer = get_writer(out_path, output_format, tagset, fold) + + readers = get_readers(input_paths[fold-1], input_format, tagset) + sents = get_next_sents(readers) + ref_reader = tools.get_reader(ref_paths[fold-1], input_format, tagset) + + while sents.itervalues().next(): + ref_sent = ref_reader.get_next_sentence() + ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) + + result_sent = clone_sent(ref_asent) + result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent) + + for chunk_name in ref_asent.all_channels(): + if chunk_name in chunk_names: + right_annots = [] + ref_annots = ref_asent.get_channel(chunk_name).make_annotation_vector() + ref = dict([(min(ann.indices), ann) for ann in ref_annots]) + + for chunker in chunkers: + ch_asent = corpus2.AnnotatedSentence.wrap_sentence(sents[chunker]) + if ch_asent.has_channel(chunk_name): + ch_annots = ch_asent.get_channel(chunk_name).make_annotation_vector() + ch = dict([(min(ann.indices), ann) for ann in ch_annots]) + + maybe_hits = set(ch).intersection(ref) + for idx in maybe_hits: + if list(ch[idx].indices) == list(ref[idx].indices) and [i for i in ch[idx].indices] not in right_annots: + right_annots.append([i for i in ch[idx].indices]) + + #add right chunks + result_asent.create_channel(chunk_name) + chan = result_asent.get_channel(chunk_name) + for ann in right_annots: + seg_no = chan.get_new_segment_index() + for idx in ann: + chan.set_segment_at(idx, seg_no) + + result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent) + writer.write_sentence(result_sent) + sents = get_next_sents(readers) + + +if __name__ == '__main__': + go() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/remove_not_cont_chunks.py b/scripts/chunker_scripts/experiments/remove_not_cont_chunks.py new file mode 100644 index 
0000000000000000000000000000000000000000..1a1f2214eaf0a0e2a5a0bd3cd65f0f1a1d5ebb90 --- /dev/null +++ b/scripts/chunker_scripts/experiments/remove_not_cont_chunks.py @@ -0,0 +1,141 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 27-12-2012 + +@author: Adam Pawlaczek +''' + +from optparse import OptionParser +import sys, os, codecs +import corpus2 +from operator import itemgetter + +descr = """%prog [options] in_path out_path + +""" + +def get_writer(out_path, output_format, tagset): + if out_path: + return corpus2.TokenWriter.create_path_writer(output_format, out_path, + tagset) + else: + return corpus2.TokenWriter.create_stdout_writer(output_format, tagset) + +def get_reader(in_path, input_format, tagset): + if in_path: + return corpus2.TokenReader.create_path_reader( + input_format, tagset, in_path) + else: + return corpus2.TokenReader.create_stdin_reader(input_format, tagset) + +def get_output_path(out_path, basename = None): + if basename == None: + return out_path + else: + return os.path.join(out_path, basename) + +def get_input_paths(in_path, folds, input_format): + input_paths = [] + if folds > 1: + for fold in range(1, folds+1): + if input_format == "ccl": + input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + input_paths.append(os.path.join(in_path, 'ccl-train' + str(fold).zfill(2) + '.xml')) + elif input_format == "xces": + input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) + input_paths.append(os.path.join(in_path, 'train' + str(fold).zfill(2) + '.xml')) + else: + if(os.path.isdir(in_path)): + for (path, dirs, files) in os.walk(in_path): + for file in files: + input_paths.append(os.path.join(path, file)) + else: + input_paths.append(in_path) + + return input_paths + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='ccl', + help='set the output format; default: ccl') + parser.add_option('-c', '--chunk-names', type='string', action='store', + dest='chunk_names', default='', + help='set chunk_names to eval') + parser.add_option('-C', '--chunkers', type='string', action='store', + dest='chunkers', default='', + help='set chunker names') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1, + help='Number of folds') + (options, args) = parser.parse_args() + + if len(args) != 2: + sys.stderr.write('You need to provide in_path and out_path.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_path, out_path = args + main(in_path, out_path, options.input_format, options.output_format, options.chunk_names, options.folds) + + +def is_not_continous(inds): + l2 = range(inds[0], inds[-1] + 1) # the gap-free index range from first to last index + return inds != l2 # True iff the annotation is discontinuous + +def choose_ids_with_head(inds, head_ind): # returns the contiguous run of indices that contains the head + begin = inds[0] + ended = False + for i in range(inds[0], inds[-1]+2): + if not i in inds or i == inds[-1]+1: + if not ended and head_ind in range(begin, i): + return range(begin, i) + ended = True + if i in inds and ended: + ended = False + begin = i + + +def main(in_path, out_path, input_format, output_format, chunk_names, folds): + tagset = corpus2.get_named_tagset("nkjp") + chunk_names = chunk_names.split(",") + + input_paths = get_input_paths(in_path, folds, input_format) + + for input_path in input_paths: + reader = 
get_reader(input_path, input_format, tagset) + if folds > 1: + output_path = get_output_path(out_path, os.path.basename(input_path)) + else: + output_path = get_output_path(out_path) + + writer = get_writer(output_path, output_format, tagset) + + while True: + sent = reader.get_next_sentence() + if not sent: + break + asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + tokens = asent.tokens() + + for chan_name in asent.all_channels(): + if chan_name in chunk_names: + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + new_idx = len(ann_vec) + for ann in ann_vec: + inds = sorted(ann.indices) + if is_not_continous(inds): + ids_with_head = choose_ids_with_head(inds, ann.head_index) + for i in inds: + if i not in ids_with_head: + chan.set_segment_at(i, 0) + sent = corpus2.AnnotatedSentence.cast_as_sentence(asent) + writer.write_sentence(sent) + +if __name__ == '__main__': + go() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/spejd_v1.py b/scripts/chunker_scripts/experiments/spejd_v1.py new file mode 100755 index 0000000000000000000000000000000000000000..266d4af6a4a8a7e5753f0a40680aa5081b609347 --- /dev/null +++ b/scripts/chunker_scripts/experiments/spejd_v1.py @@ -0,0 +1,109 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 10-05-2013 + +@author: Adam Pawlaczek +''' + +import os, shutil, sys +from optparse import OptionParser +from chunker_scripts.chunk_eval.chunk_eval import main as chunk_eval +from chunker_scripts import tools +import logging + +descr = """%prog [options] [corpus_dir] [out_dir] +""" +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('-c', '--config', type='string', action='store', + dest='config', default='/home/jezozwierzak/chunker/spejd/verified_nkjp_gramma/config.ini', + help='Set path to config file; default: /home/jezozwierzak/chunker/spejd/verified_nkjp_gramma/config.ini') + parser.add_option('--chunk-names', type='string', action='store', + dest='chunk_names', default='chunk_np,chunk_vp,chunk_agp,chunk_adjp,chunk_qp', + help='set chunk_names to eval') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1,help='Number of folds') + parser.add_option('-v', '--verbose', action='store_true', + dest='verbose', default=False, help='verbose mode') + (options, args) = parser.parse_args() + + if len(args) != 2: + sys.stderr.write('You need to provide corpus_dir and out_dir.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_dir, out_dir = args + main(in_dir, out_dir, options.chunk_names, options.config, options.tagset, options.input_format, options.folds, options.verbose) + + +def main(in_dir, out_dir, chunk_names, config, tagset, input_format, folds, verbose): + # Create the output directory first, so that the log file can be placed in it + tools.mkdir_p(out_dir) + logging.basicConfig(filename=os.path.join(out_dir, 'spejd_v1.log'), level=logging.INFO) + if verbose: + soh = logging.StreamHandler(sys.stdout) + soh.setLevel(logging.INFO) + logger = logging.getLogger() + logger.addHandler(soh) + + # Copy the train and test files for each fold + for i in range(1, folds+1): + test_name = 'test' + str(i).zfill(2) + '.xml' + train_name = 'train' + str(i).zfill(2) + '.xml' + 
ccl_test = 'ccl-test' + str(i).zfill(2) + '.xml' + ccl_train = 'ccl-train' + str(i).zfill(2) + '.xml' + + logging.info('Copying fold from the source') + if input_format == "ccl": + tools.ccl2xcesWithIob(os.path.join(in_dir, ccl_test), os.path.join(out_dir, test_name)) + tools.ccl2xcesWithIob(os.path.join(in_dir, ccl_train), os.path.join(out_dir, train_name)) + else: + shutil.copy(os.path.join(in_dir, test_name), os.path.join(out_dir, test_name)) + shutil.copy(os.path.join(in_dir, train_name), os.path.join(out_dir, train_name)) + + logging.info('Removing IOB tags') + tools.mkdir_p(os.path.join(out_dir,'noiob')) + shutil.copy(os.path.join(out_dir, test_name), os.path.join(out_dir, 'noiob', test_name)) + tools.remove_iob(os.path.join(out_dir, 'noiob', test_name)) + + logging.info('Removing disambs beyond the first one') + tools.mkdir_p(os.path.join(out_dir,'only1disamb')) + tools.force1disamb(os.path.join(out_dir, 'noiob', test_name), os.path.join(out_dir, 'only1disamb', test_name), 'xces', 'xces', tagset) + + logging.info('Removing lexemes without disamb') + tools.mkdir_p(os.path.join(out_dir,'without_lex')) + tools.remove_lex_without_disamb(os.path.join(out_dir, 'only1disamb', test_name), os.path.join(out_dir, 'without_lex', test_name), 'xces', 'xces', tagset) + + logging.info('Running Spejd') + tools.mkdir_p(os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2))) + shutil.copy(os.path.join(out_dir, 'without_lex', test_name), os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2), 'morph.xml')) + tools.run_spejd(config, os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2))) + #tei2xces + logging.info('Converting Spejd output to XCES') + tools.tei2xces(os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2))) + #xces2ccl + logging.info('Converting Spejd XCES output to CCL') + tools.mkdir_p(os.path.join(out_dir, 'ccl', 'spejd_test_result')) + tools.mkdir_p(os.path.join(out_dir, 'ccl', 'ref')) + tools.xces2ccl(os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2), 'out.xml'), os.path.join(out_dir, 'ccl', 'spejd_test_result', ccl_test)) + tools.xces2ccl(os.path.join(out_dir, test_name), os.path.join(out_dir, 'ccl', 'ref', ccl_test)) + #change channels + logging.info('Renaming chunk channels') + tools.mkdir_p(os.path.join(out_dir, 'ccl', 'changed_spejd')) + shutil.copy(os.path.join(out_dir, 'ccl', 'spejd_test_result', ccl_test), os.path.join(out_dir, 'ccl', 'changed_spejd')) + tools.change_chunk_channels(os.path.join(out_dir, 'ccl', 'changed_spejd', ccl_test)) + + #diseval + logging.info('Comparing chunking results with the reference files') + chunk_eval(os.path.join(out_dir, 'ccl', 'changed_spejd'), os.path.join(out_dir, 'ccl', 'ref'), chunk_names, "ccl", os.path.join(out_dir, 'results.csv'), tagset, verbose, folds) + + +if __name__ == '__main__': + go() diff --git a/scripts/chunker_scripts/feature_selection/crf_wrapper.py b/scripts/chunker_scripts/feature_selection/crf_wrapper.py deleted file mode 100755 index d60a3778301de3c5d6f80203a2f462c6e1d79b01..0000000000000000000000000000000000000000 --- a/scripts/chunker_scripts/feature_selection/crf_wrapper.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/python -#-*- coding: utf-8 -*- -''' -Created on 18-03-2013 - -@author: Adam Pawlaczek -''' -import tools -import sys, math, os, shutil, random -from chunk_eval import chunk_eval_avg -from optparse import OptionParser -from threading import Thread - -descr = """%prog [options] corpus_dir out_dir""" - -def go(): - parser = OptionParser(usage=descr) - parser.add_option('-w', '--max-window', 
type="int", action='store', - dest='window', default=2, - help='Set max window for feature') - parser.add_option('-c', '--config', type='string', action='store', - dest='config', default='/home/jezozwierzak/range2/feature_selection/config/kpwr-experimental.ini', - help='set the config path to ini file') - parser.add_option('-f', '--folds', type="int", action='store', - dest='folds', default=10, - help='Number of folds default: 10') - (options, args) = parser.parse_args() - - if len(args) != 2: - sys.stderr.write('You need to provide corpus_dir and out_dir.\n') - sys.stderr.write('See %s --help\n' % sys.argv[0]) - sys.exit(1) - - corpus_dir, out_dir = args - main(corpus_dir, out_dir, options.config, options.window, options.folds) - -#Odpalenie crf-a i sprawdzenie jaki będzie F-measure dla podanych cech -def f(out_dir, config_dir, config_name, corpus_dir, folds, vector = [], constructed = {}): - tools.mkdir_p(os.path.join(out_dir, "config_files"), True) - generate_features_txt(os.path.join(out_dir, "config_files", config_name + "-layer1.txt"), vector = vector, constructed = constructed) - generate_features_txt(os.path.join(out_dir, "config_files", config_name + "-layer2.txt"), vector = vector, constructed = constructed) - #Copy config files - shutil.copyfile(os.path.join(config_dir, config_name + ".ccl"), os.path.join(out_dir, "config_files", config_name + ".ccl")) - shutil.copyfile(os.path.join(config_dir, config_name + ".ini"), os.path.join(out_dir, "config_files", config_name + ".ini")) - - threads = [] - - for fold in range(1, folds+1): - t = Thread(target=process_fold, args=(fold, out_dir, corpus_dir, config_dir, config_name,)) - threads.append(t) - t.start() - - for fold in range(1, folds+1): - t.join() - - result = chunk_eval_avg.get_avg_results(os.path.join(out_dir, "chunked"), corpus_dir, ["chunk_np"]) - - f = open(os.path.join(out_dir, "result.csv"), 'w+') - f.write("vector: ", vector) - f.write("result: ", result) - f.close() - return result["f"] - -def process_fold(fold, out_dir, corpus_dir, config_dir, config_name): - tools.mkdir_p(os.path.join(out_dir, "models", str(fold).zfill(2)), True) - #Copy dict files - shutil.copyfile(os.path.join(config_dir, "dict-case.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-case.lex")) - shutil.copyfile(os.path.join(config_dir, "dict-prep.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-prep.lex")) - shutil.copyfile(os.path.join(config_dir, "dict-sie.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-sie.lex")) - - #TRAINING - tools.train_iobber(os.path.join(config_dir, config_name + ".ini"), - os.path.join(out_dir, "models", str(fold).zfill(2)), - os.path.join(corpus_dir, "ccl-train%02d.xml"%(fold))) - - #Remove channels - tools.mkdir_p(os.path.join(out_dir, "empty")) - shutil.copyfile(os.path.join(corpus_dir, "ccl-test%02d.xml"%(fold)), os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold))) - tools.remove_channels(os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold))) - - #RUNING - tools.mkdir_p(os.path.join(out_dir, "chunked")) - tools.run_iobber(os.path.join(config_dir, config_name + ".ini"), - os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold)), - os.path.join(out_dir, "chunked", "ccl-test%02d.xml"%(fold)), - os.path.join(out_dir, "models", str(fold).zfill(2))) - - #Remove dicts - os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-case.lex")) - os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-prep.lex")) - os.remove(os.path.join(out_dir, 
"models", str(fold).zfill(2), "dict-sie.lex")) - -#Wybranie sąsiada -def neightbour(v): - a = random.randint(0, len(v)) - b = random.randint(0, len(v[0])) - v[a][b] = 1 if v[a][b] == 0 else 0 - a = random.randint(0, len(v)) - b = random.randint(0, len(v[0])) - v[a][b] = 1 if v[a][b] == 0 else 0 - return v - -#Obliczenie temperatury początkowej -def tempestimation(out_dir, config_dir, config_name, corpus_dir, folds, vector): - iterations = 100 #ilosc iteracji symulacji - sum = 0 - results = {} #histogram wyników - for i in range(iterations): - - result = f(os.path.join(out_dir, str(i).zfill(2)), config_dir, config_name, corpus_dir, folds, vector) - sum += result - if result not in results.keys(): - results[result] = 1 - else: - results[result] += 1 - vector = neightbour(vector) - avg = sum / float(iterations)#obliczenie średniego wyniku - k = 0 - deviation = 0 - for result in results: - deviation += results[result] * ((k - avg) ** 2) - k += 1 - #obliczenie odchylenia standardowego - deviation = math.sqrt(deviation / iterations) - return deviation - -def get_features_number(cclfile): - f = open(cclfile) - count = 0 - for line in f: - count += line.count(";") - f.close() - return count + 1 - -def generate_features_txt(resultfile, vector = [], constructed = {}): - out = open(resultfile, 'w+') - feature_num = 0 - actual_feature_num = len(vector[0]) / 2 + 1 - - for i in range(len(vector)): - for j in range(len(vector[i])): - if vector[i][j] == 1: - out.write('U%02d:%%x[%d,%d]'%(feature_num, j if j >= actual_feature_num else -j, i)) - out.write("\n") - feature_num += 1 - out.write("\n") - for i in range(len(constructed)): - feats = constructed[i].split("%") - - out.write("\n") - out.write("B") - out.close() - -def create_null_vector(features_num, window): - vector = [] - for i in range(features_num): - vector.appendCell(tools.zerolistmaker(window*2 + 1)) - return vector - -def randomize_vector(features_num, window): - vector = [] - for i in range(features_num): - win_vector = [] - for j in range(window): - win_vector.appendCell(random.randint(0, 1)) - vector.appendCell(win_vector) - return vector - -def P(e, en, temp): - if e - en < 0: - return e ** ((e - en) / (temp)) - else: - return 1 - -def main(corpus_dir, out_dir, config, window, folds): - config_dir = os.path.dirname(config) - config_name = os.path.splitext(os.path.basename(config))[0] - config_ccl = os.path.join(config_dir, config_name + ".ccl") - - constructed = [] - constructed.appendCell("1%2") - generate_features_txt(os.path.join(out_dir, "config_files", config_name + "-layer1.txt"), constructed = constructed) - - if not os.path.exists(out_dir): - tools.mkdir_p(out_dir) - - a_vector = randomize_vector(get_features_number(config_ccl), window) - temperature = tempestimation(os.path.join(out_dir, "estimation"), config_dir, config_name, corpus_dir, folds, a_vector) - - a_value = f(os.path.join(out_dir, "selection", "first"), config_dir, config_name, corpus_dir, folds, a_vector) - b_value = 0 - i = 1 - - while temperature > 0: - b_vector = neightbour(a_vector) - b_value = f(os.path.join(out_dir, "selection", str(i).zfill(2)), config_dir, config_name, corpus_dir, folds, b_vector) - prob = P(a_value, b_value, temperature) - if b_value > a_value: - a_vector = b_vector - elif random.randint(0, 1) < prob: - a_vector = b_vector - temperatore = temperature * 0.95 - i += 1 - -if __name__ == '__main__': - go() diff --git a/scripts/chunker_scripts/feature_selection/crf_wrapper2.py b/scripts/chunker_scripts/feature_selection/crf_wrapper2.py index 
31642560e2a7c214a5873c905f53e00d4820c13b..2bebbcb0f8e3696e6b3a7696c608fdbfff45d440 100755 --- a/scripts/chunker_scripts/feature_selection/crf_wrapper2.py +++ b/scripts/chunker_scripts/feature_selection/crf_wrapper2.py @@ -8,8 +8,8 @@ Created on Mar 25, 2013 from optparse import OptionParser import sys, os, random, shutil import anneal -from chunker_scripts2 import tools -from chunker_scripts2.chunk_eval import chunk_eval_avg +from chunker_scripts import tools +from chunker_scripts.chunk_eval import chunk_eval_avg from threading import Thread import multiprocessing diff --git a/scripts/chunker_scripts/folds_maker/FoldsMaker.py b/scripts/chunker_scripts/folds_maker/FoldsMaker.py old mode 100644 new mode 100755 index 88fd2cd9e7a86af708c2823de28d9e99aead5b37..7696abbac88fbb59f988c2c63e08384e1c71e459 --- a/scripts/chunker_scripts/folds_maker/FoldsMaker.py +++ b/scripts/chunker_scripts/folds_maker/FoldsMaker.py @@ -7,7 +7,7 @@ Created on 16-08-2012 ''' import sys, os, subprocess, random, shutil import corpus2 -import tools +from chunker_scripts import tools from optparse import OptionParser descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir] @@ -16,7 +16,7 @@ descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir] class FoldsMaker: - def __init__(self, input, output, in_format, out_format, held_out_bool, folds): + def __init__(self, input, output, in_format, out_format, held_out_bool, main_part_percent, folds): self.folds = folds self.sentences = [] self.input = input @@ -25,8 +25,8 @@ class FoldsMaker: self.out_format = out_format self.held_out_bool = held_out_bool - self.held_out_param = 0.2 - self.main_param = 0.8 + self.held_out_param = 1 - float(main_part_percent) + self.main_param = float(main_part_percent) self.held_out = '' self.held_out_folds = [] @@ -76,7 +76,7 @@ class FoldsMaker: if not sent: break - self.sentences.appendCell(sent) + self.sentences.append(sent) def splitHeldOut(self): ind = int(len(self.sentences) * self.held_out_param) @@ -89,8 +89,8 @@ class FoldsMaker: fold_held_out_sents = len(self.held_out) / self.folds for i in range(0, self.folds): if self.held_out_bool: - self.held_out_folds.appendCell(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents]) - self.main_folds.appendCell(self.main[i*fold_main_sents:(i+1)*fold_main_sents]) + self.held_out_folds.append(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents]) + self.main_folds.append(self.main[i*fold_main_sents:(i+1)*fold_main_sents]) def saveFolds(self): tools.mkdir_p(os.path.join(self.output, 'main')) @@ -152,6 +152,9 @@ def go(): help='set the input format; default: ccl') parser.add_option('--with-heldout', action='store_true', dest='held_out', default=False, help='Split data to main and held_out') + parser.add_option('-p', '--main-part', type='string', action='store', + dest='main_part', default='0.9', + help='set the input main part percentage; default: 0.9') (options, args) = parser.parse_args() if len(args) != 3: @@ -160,10 +163,10 @@ def go(): sys.stderr.write('See %s --help\n' % sys.argv[0]) sys.exit(1) - main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.folds) + main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.main_part, options.folds) -def main(corpusKind, corpusDir, outDir, in_format, out_format, held_out, folds=1): - foldsMaker = FoldsMaker(corpusDir, outDir, in_format, out_format, held_out, folds) +def main(corpusKind, corpusDir, outDir, in_format, 
diff --git a/scripts/chunker_scripts/folds_maker/FoldsMaker.py b/scripts/chunker_scripts/folds_maker/FoldsMaker.py
old mode 100644
new mode 100755
index 88fd2cd9e7a86af708c2823de28d9e99aead5b37..7696abbac88fbb59f988c2c63e08384e1c71e459
--- a/scripts/chunker_scripts/folds_maker/FoldsMaker.py
+++ b/scripts/chunker_scripts/folds_maker/FoldsMaker.py
@@ -7,7 +7,7 @@ Created on 16-08-2012
 '''
 import sys, os, subprocess, random, shutil
 import corpus2
-import tools
+from chunker_scripts import tools
 from optparse import OptionParser
 
 descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir]
@@ -16,7 +16,7 @@ descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir]
 
 class FoldsMaker:
 
-    def __init__(self, input, output, in_format, out_format, held_out_bool, folds):
+    def __init__(self, input, output, in_format, out_format, held_out_bool, main_part_percent, folds):
         self.folds = folds
         self.sentences = []
         self.input = input
@@ -25,8 +25,8 @@ class FoldsMaker:
         self.out_format = out_format
         self.held_out_bool = held_out_bool
 
-        self.held_out_param = 0.2
-        self.main_param = 0.8
+        self.held_out_param = 1 - float(main_part_percent)
+        self.main_param = float(main_part_percent)
 
         self.held_out = ''
         self.held_out_folds = []
@@ -76,7 +76,7 @@ class FoldsMaker:
             if not sent:
                 break
-            self.sentences.appendCell(sent)
+            self.sentences.append(sent)
 
     def splitHeldOut(self):
         ind = int(len(self.sentences) * self.held_out_param)
@@ -89,8 +89,8 @@ class FoldsMaker:
         fold_held_out_sents = len(self.held_out) / self.folds
         for i in range(0, self.folds):
             if self.held_out_bool:
-                self.held_out_folds.appendCell(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents])
-            self.main_folds.appendCell(self.main[i*fold_main_sents:(i+1)*fold_main_sents])
+                self.held_out_folds.append(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents])
+            self.main_folds.append(self.main[i*fold_main_sents:(i+1)*fold_main_sents])
 
     def saveFolds(self):
         tools.mkdir_p(os.path.join(self.output, 'main'))
@@ -152,6 +152,9 @@ def go():
                       help='set the input format; default: ccl')
     parser.add_option('--with-heldout', action='store_true', dest='held_out', default=False,
                       help='Split data to main and held_out')
+    parser.add_option('-p', '--main-part', type='string', action='store',
+                      dest='main_part', default='0.9',
+                      help='set the proportion of the corpus assigned to the main part; default: 0.9')
     (options, args) = parser.parse_args()
 
     if len(args) != 3:
@@ -160,10 +163,10 @@ def go():
         sys.stderr.write('See %s --help\n' % sys.argv[0])
         sys.exit(1)
 
-    main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.folds)
+    main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.main_part, options.folds)
 
-def main(corpusKind, corpusDir, outDir, in_format, out_format, held_out, folds=1):
-    foldsMaker = FoldsMaker(corpusDir, outDir, in_format, out_format, held_out, folds)
+def main(corpusKind, corpusDir, outDir, in_format, out_format, held_out, main_part, folds):
+    foldsMaker = FoldsMaker(corpusDir, outDir, in_format, out_format, held_out, main_part, folds)
     if corpusKind == "nkjp":
         foldsMaker.processNkjpCorpus()
     elif corpusKind == "kpwr":
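The --main-part option added above is a plain proportional split; with illustrative numbers, the resulting sizes work out as follows (a sketch mirroring the held-out and fold arithmetic in FoldsMaker):

main_part = 0.9      # value of --main-part
sentences = 1000     # corpus size, illustrative
folds = 10

held_out = int(sentences * (1 - main_part))  # 100 sentences held out
main = sentences - held_out                  # 900 sentences in the main part
per_fold = main // folds                     # 90 main sentences per fold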
+ """ + crf_obj.clear() + +def eat_token(crf_obj, feat_vals): + """Feed the trained classifier with a new token (instance). The output + tag sequence for the sentence being processed will be reade after calling + close_sent.""" + instance = DATA_SEP.join(feat_vals).encode('utf-8') + crf_obj.add(instance) + +def close_sent(crf_obj): + """Notify the trained classifier that a whole sentence has been fed and + have the classifier classify each token.""" + crf_obj.parse() + +def classify_token(crf_obj, tok_idx): + """Retrieve the class label (tag) for the token at given tok_idx. Assumes + that a whole sentence has been fed to the trained crf_obj with open_sent, + eat_token and close_sent calls.""" + return crf_obj.y2(tok_idx) diff --git a/scripts/chunker_scripts/split_np/pp_merger/config/kpwr.ccl b/scripts/chunker_scripts/split_np/pp_merger/config/kpwr.ccl new file mode 100644 index 0000000000000000000000000000000000000000..11144bb98f8cae63ee464f6c7006a853c1a48041 --- /dev/null +++ b/scripts/chunker_scripts/split_np/pp_merger/config/kpwr.ccl @@ -0,0 +1,16 @@ +@ "default" ( + orth[0]; // 0 + class[0]; // 1 + cas[0]; // 2 + gnd[0]; // 3 + nmb[0]; // 4 + agrpp(0,1,{nmb,gnd,cas}); // 5 + and(inside(-1), inside(1), wagr(-1,1,{nmb,gnd,cas})); // 6 + regex(orth[0], "\\P{Ll}.*"); regex(orth[0], "\\P{Lu}.*") // 7, 8 +) + +/* +@ "layer2" ( + isannpart(0, "chunk_agp") // 9 +) +*/ diff --git a/scripts/chunker_scripts/split_np/pp_merger/corpio.py b/scripts/chunker_scripts/split_np/pp_merger/corpio.py new file mode 100644 index 0000000000000000000000000000000000000000..bf515c76c6cd408a58660f8cb65057bb0bff168a --- /dev/null +++ b/scripts/chunker_scripts/split_np/pp_merger/corpio.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER. +# This program is free software; you can redistribute and/or modify it +# under the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. +# +# See the LICENCE, COPYING.LESSER and COPYING files for more details + +# SWIG bug workaround: loading multiple SWIG modules brought unwrapped +# swig::stop_iteration exceptions +import ctypes, sys +import platform +if 'Linux' in platform.system(): + # this prevents from problems with multiple SWIG wrappers + # (probably bug in SWIG) and possible problems with locating Maca plugin + dlflags = sys.getdlopenflags() + sys.setdlopenflags(dlflags | ctypes.RTLD_GLOBAL) + +import corpus2, wccl +# TODO: get back to default dlopen policy? 
diff --git a/scripts/chunker_scripts/split_np/pp_merger/corpio.py b/scripts/chunker_scripts/split_np/pp_merger/corpio.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf515c76c6cd408a58660f8cb65057bb0bff168a
--- /dev/null
+++ b/scripts/chunker_scripts/split_np/pp_merger/corpio.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE, COPYING.LESSER and COPYING files for more details
+
+# SWIG bug workaround: loading multiple SWIG modules brought unwrapped
+# swig::stop_iteration exceptions
+import ctypes, sys
+import platform
+if 'Linux' in platform.system():
+    # this prevents problems with multiple SWIG wrappers
+    # (probably a bug in SWIG) and possible problems with locating the Maca plugin
+    dlflags = sys.getdlopenflags()
+    sys.setdlopenflags(dlflags | ctypes.RTLD_GLOBAL)
+
+import corpus2, wccl
+
+if 'Linux' in platform.system():
+    # get back to default dlopen policy
+    sys.setdlopenflags(dlflags)
+
+import config
+import codecs, os
+
+_ROOT = os.path.abspath(os.path.dirname(__file__))
+
+format_help = """
+Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
+""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
+Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
+""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())
+
+def get_data(path):
+    """Tries to resolve path to the given subdir, trying the path locally
+    and then in the install site."""
+    if os.path.exists(path):
+        return path
+    in_data = os.path.join(_ROOT, 'data', path)
+    if os.path.exists(in_data):
+        return in_data
+    raise IOError('can\'t locate %s, tried locally and %s' % (path, in_data))
+
+def f_name(model_name, subdir, ext, suff = ''):
+    """Gets the filename based on model_name having the given
+    extension. Optionally, you can specify a name suffix."""
+    base = (model_name + '-' + suff + '.' + ext) if suff else (model_name + '.' + ext)
+    return os.path.join(subdir, base)
+
+def get_tagset(conf):
+    return corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
+
+def get_reader(in_path, tagset, input_format, read_disamb_only):
+    """Creates a reader using the options. If in_path evaluates to False,
+    will create a stdin reader. Set read_disamb_only to force reading only
+    'disamb' lexemes/interpretations."""
+    fixd_format = input_format
+    if read_disamb_only:
+        fixd_format += ',disamb_only'
+    # force casting sentences as AnnotatedSentences
+    # required to get XCES input right
+    fixd_format += ',ann'
+
+    if in_path:
+        return corpus2.TokenReader.create_path_reader(
+            fixd_format, tagset, in_path)
+    else:
+        return corpus2.TokenReader.create_stdin_reader(fixd_format, tagset)
+
+def get_writer(out_path, tagset, output_format):
+    """Creates a writer using the options. If out_path evaluates to False,
+    will create a stdout writer."""
+    if out_path:
+        return corpus2.TokenWriter.create_path_writer(output_format, out_path,
+            tagset)
+    else:
+        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+
+def op_list(wccl_file, sec_name):
+    """Retrieves a list of operators corresponding to a named section from
+    the given WCCL file. If the section is not present, returns an empty list."""
+    ops = []
+    if wccl_file.has_untyped_section(sec_name):
+        sec = wccl_file.get_untyped_section(sec_name)
+        for op_idx in range(sec.size()):
+            ops.append(sec.get_ptr(op_idx))
+    return ops
+
+def get_wccl_ops(conf, model_name, wccl_dir, lex_dir, chan_names):
+    """Returns a list of WCCL operator lists corresponding to the given
+    channel names. Each list may consist of two parts: the default operators
+    and channel-specific operators (theoretically both may be empty)."""
+    wccl_file_path = f_name(model_name, wccl_dir, config.EXT_WCCL)
+    tagset = corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
+    wccl_file = wccl.Parser(tagset).parseWcclFileFromPath(wccl_file_path, lex_dir)
+    def_ops = op_list(wccl_file, config.DEFAULT_OPS)
+    chan_ops = [def_ops + op_list(wccl_file, chan_name) for chan_name in chan_names]
+    return chan_ops
+
+def create_context(sent):
+    """Wraps the sentence as SentenceContext to be used with WCCL."""
+    return wccl.SentenceContext(sent)
diff --git a/scripts/chunker_scripts/split_np/pp_merger/pp_merger.py b/scripts/chunker_scripts/split_np/pp_merger/pp_merger.py
new file mode 100755
index 0000000000000000000000000000000000000000..c2cc2424c10c5dc5c88b0ec87955f2a8c6a05153
--- /dev/null
+++ b/scripts/chunker_scripts/split_np/pp_merger/pp_merger.py
@@ -0,0 +1,272 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+'''
+Created on 15-04-2013
+
+@author: Adam Pawlaczek
+'''
+
+
+from optparse import OptionParser
+import sys, os, codecs
+import corpus2
+import wccl
+from chunker_scripts import tools
+import corpio, classify
+import logging
+import timbl
+
+descr = """%prog [options] in_path out_path
+
+"""
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+                      dest='output_format', default='ccl',
+                      help='set the output format; default: ccl')
+    parser.add_option('-c', '--config', type="string", action='store',
+                      dest='config_path', default='config/kpwr.ccl',
+                      help='Path to config file; default: config/kpwr.ccl')
+    parser.add_option('-d', '--data', type="string", action='store',
+                      dest='data_dir', default='model-kpwr11-H_2',
+                      help='Path to data_dir; default: model-kpwr11-H_2')
+    parser.add_option('-f', '--folds', type="int", action='store',
+                      dest='folds', default=1,
+                      help='Number of folds')
+    parser.add_option('--train', action='store_true',
+                      dest='is_training', help='train the pp_merger')
+    parser.add_option('--xval', action='store_true',
+                      dest='xval', help='make cross validation with directory of folds')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 2 and not options.is_training:
+        sys.stderr.write('You need to provide in_path and out_path.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    ppMerger = PPMerger(options.config_path, options.tagset, options.data_dir)
+
+    if options.is_training:
+        in_path = args[0]
+        ppMerger.train_and_save(in_path, options.input_format)
+    elif options.xval:
+        in_path, out_path = args
+        ppMerger.xval(in_path, out_path, options.input_format, options.output_format, options.folds)
+    else:
+        in_path, out_path = args
+        ppMerger.classify(in_path, out_path, options.input_format, options.output_format)
+
+
+def read_repr(inf):
+    return eval(inf.readline())
+
+class PPMerger:
+
+    def __init__(self, config_path, tagset, data_dir):
+        self.config_dir = os.path.dirname(config_path)
+        self.config_name = os.path.splitext(os.path.basename(config_path))[0]
+
+        self.data_dir = data_dir
+        self.tagset = corpus2.get_named_tagset(tagset)
+        self.ops = self.wccl_ops()
+
+    def indexes_of_prep(self, ann, tokens):
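A short usage sketch of the corpio helpers (paths are hypothetical): open a reader and writer for the same tagset and copy sentences through, wrapping each one as a WCCL context on the way:

import corpus2
import corpio

tagset = corpus2.get_named_tagset('nkjp')
reader = corpio.get_reader('in.xml', tagset, 'ccl', False)  # hypothetical input path
writer = corpio.get_writer('out.xml', tagset, 'ccl')

sent = reader.get_next_sentence()
while sent:
    con = corpio.create_context(sent)  # ready for op.base_apply(con) with WCCL operators
    writer.write_sentence(sent)
    sent = reader.get_next_sentence()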
+        '''Returns indices of prepositions within ann (positions in the
+        annotation, not in the sentence).'''
+        result = []
+        for idx in range(len(ann.indices)):
+            # look the token up through the annotation's sentence index
+            tok = tokens[ann.indices[idx]]
+            pos = self.tagset.get_pos_name(tok.get_preferred_lexeme(self.tagset).tag().get_pos_index())
+            if pos == 'prep':
+                result.append(idx)
+        return result
+
+    def is_split_prep(self, prev_ann, ann, tokens):
+        lexeme = tokens[ann.indices[0]].get_preferred_lexeme(self.tagset).tag().get_pos_index()
+        pos = self.tagset.get_pos_name(lexeme)
+        return pos == "prep" and prev_ann is not None and prev_ann.indices[-1] + 1 == ann.indices[0]
+
+    def op_list(self, wccl_file, sec_name):
+        ops = []
+        if wccl_file.has_untyped_section(sec_name):
+            sec = wccl_file.get_untyped_section(sec_name)
+            for op_idx in range(sec.size()):
+                ops.append(sec.get_ptr(op_idx))
+        return ops
+
+    def wccl_ops(self):
+        wccl_file_path = os.path.join(self.config_dir, self.config_name + ".ccl")
+        wccl_file = wccl.Parser(self.tagset).parseWcclFileFromPath(wccl_file_path, self.data_dir)
+        ops = self.op_list(wccl_file, "default")
+        return ops
+
+    def train_and_save(self, in_path, input_format):
+        self.input_format = input_format
+        if not os.path.exists(self.data_dir):
+            tools.mkdir_p(self.data_dir)
+
+        tr_path = os.path.join(self.data_dir, "pp_merger.tr")
+        tr_file = codecs.open(tr_path, 'wb', 'utf-8')
+
+        reader = tools.get_reader(in_path, self.input_format, self.tagset)
+        sent = reader.get_next_sentence()
+        while sent:
+            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+            if "chunk_np" in asent.all_channels():
+                chan = asent.get_channel("chunk_np")
+                ann_vec = chan.make_annotation_vector()
+                prev_ann = None
+
+                con = wccl.SentenceContext(sent)
+                for ann in ann_vec:
+                    indexes_of_prep = self.indexes_of_prep(ann, asent.tokens())
+                    for idx in indexes_of_prep:
+                        con.set_position(ann.indices[idx])
+                        feat_vals = [op.base_apply(con)
+                            .to_compact_string(self.tagset).decode('utf-8')
+                            for op in self.ops]
+                        if (idx == 0 and self.is_split_prep(prev_ann, ann, asent.tokens())):
+                            class_label = 'B'
+                            classify.write_example(tr_file, feat_vals, class_label)
+                        elif (idx != 0):
+                            class_label = 'I'
+                            classify.write_example(tr_file, feat_vals, class_label)
+                        # There may also be the case of a prep at the beginning
+                        # of an unsplit chunk.
+                        # That case is worth examining in a separate experiment.
+
+                    prev_ann = ann
+
+            classify.write_end_of_sent(tr_file)
+            sent = reader.get_next_sentence()
+
+        tr_file.close()
+
+        cr_path = os.path.join(self.data_dir, "pp_merger.cr")
+
+        timbl_opts = "-mM -k11 -dIL +vs"
+        timbl_obj = timbl.TimblAPI(timbl_opts, "")
+
+        timbl_obj.learn(tr_path)
+        with open(cr_path + '.x', 'w') as out:
+            out.write('%s\n' % repr(timbl_opts))
+            # for restoring weights
+            out.write('%s\n' % repr(timbl_obj.currentWeighting().name))
+        timbl_obj.writeInstanceBase(cr_path + '.b')
+        timbl_obj.saveWeights(cr_path + '.w')
+        timbl_obj.writeArrays(cr_path + '.a')
+        return timbl_obj
+
+    def load_model(self):
+        cr_path = os.path.join(self.data_dir, "pp_merger.cr")
+        if not os.path.exists(cr_path + '.x'):
+            return None
+        with open(cr_path + '.x', 'r') as inf:
+            timbl_opts = read_repr(inf)
+            weighting_name = read_repr(inf)
+        timbl_obj = timbl.TimblAPI(timbl_opts, "")
+        timbl_obj.getInstanceBase(cr_path + '.b')
+        timbl_obj.getWeights(cr_path + '.w', timbl.Weighting.names[weighting_name])
+        timbl_obj.getArrays(cr_path + '.a')
+        return timbl_obj
+
+    def to_timbl_line(self, feat_vals, class_label):
+        """Gets a TiMBL-friendly instance string representation."""
+        return u'\t'.join(feat_vals + [class_label]).encode('utf8')
+
+    def classify(self, in_path, out_path, input_format, output_format):
+        self.input_format = input_format
+        self.output_format = output_format
+
+        timbl_obj = self.load_model()
+        self.classify_input(in_path, out_path, input_format, output_format, timbl_obj)
+
+    def classify_input(self, in_path, out_path, input_format, output_format, timbl_obj):
+        reader = tools.get_reader(in_path, self.input_format, self.tagset)
+        writer = tools.get_writer(out_path, output_format, self.tagset)
+
+        chunk = reader.get_next_chunk()
+        while chunk:
+            for sent in chunk.sentences():
+                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+
+                if "chunk_np" in asent.all_channels():
+                    chan = asent.get_channel("chunk_np")
+                    ann_vec = chan.make_annotation_vector()
+                    prev_ann = None
+
+                    con = wccl.SentenceContext(sent)
+                    for ann in ann_vec:
+                        indexes_of_prep = self.indexes_of_prep(ann, asent.tokens())
+                        for idx in indexes_of_prep:
+                            con.set_position(ann.indices[idx])
+                            feat_vals = [op.base_apply(con)
+                                .to_compact_string(self.tagset).decode('utf-8')
+                                for op in self.ops]
+                            # consider only a prep that can be merged into the previous ann
+                            if (idx == 0 and self.is_split_prep(prev_ann, ann, asent.tokens())):
+                                class_label = self.decide_about_prep(timbl_obj, feat_vals)
+                                if class_label == "I":
+                                    for ann_idx in ann.indices:
+                                        chan.set_segment_at(ann_idx, prev_ann.seg_number)
+                        prev_ann = ann
+
+            writer.write_chunk(chunk)
+            chunk = reader.get_next_chunk()
+
+    def decide_about_prep(self, timbl_obj, feat_vals):
+        line = self.to_timbl_line(feat_vals, '?')
+        success, decsn = timbl_obj.classify(line)
+        if not success:
+            raise ValueError('TiMBL failed to classify %s' % line)
+        return decsn
+
+    def get_in_paths(self, in_path, input_format, folds, main_name="test"):
+        input_paths = []
+        if folds > 1:
+            for fold in range(1, folds+1):
+                if input_format == "ccl":
+                    input_paths.append(os.path.join(in_path, 'ccl-' + main_name + str(fold).zfill(2) + '.xml'))
+                elif input_format == "xces":
+                    input_paths.append(os.path.join(in_path, main_name + str(fold).zfill(2) + '.xml'))
+        else:
+            if(os.path.isdir(in_path)):
+                for (path, dirs, files) in os.walk(in_path):
+                    for file in files:
+                        input_paths.append(os.path.join(path, file))
+            else:
+                input_paths.append(in_path)
+
+        return input_paths
+
+    def get_out_path(self, in_path, out_dir, output_format):
+        file_name = os.path.basename(in_path)
+        if output_format == "ccl":
+            if file_name.startswith("ccl-"):
+                return os.path.join(out_dir, file_name)
+            else:
+                return os.path.join(out_dir, "ccl-" + file_name)
+        else:
+            # for non-ccl output, strip the "ccl-" prefix if present
+            if file_name.startswith("ccl-"):
+                return os.path.join(out_dir, file_name[4:])
+            else:
+                return os.path.join(out_dir, file_name)
+
+    def xval(self, in_path, out_path, input_format, output_format, folds):
+        in_trains = self.get_in_paths(in_path, input_format, folds, "train")
+        in_tests = self.get_in_paths(in_path, input_format, folds, "test")
+        out_dir = out_path
+
+        for in_train, in_test in zip(in_trains, in_tests):
+            timbl_obj = self.train_and_save(in_train, input_format)
+            tools.mkdir_p(os.path.join(out_dir, "empty"))
+            empty_path = os.path.join(out_dir, "empty", os.path.basename(in_test))
+            tools.remove_channels(in_test, empty_path)
+            out_path = self.get_out_path(in_test, out_dir, output_format)
+            self.classify_input(empty_path, out_path, input_format, output_format, timbl_obj)
+
+if __name__ == '__main__':
+    go()
\ No newline at end of file
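pp_merger drives TiMBL through its Python API; condensed, the train/classify round trip it performs looks like this (file name and instance are illustrative, the options and call shapes mirror the code above):

import timbl

opts = "-mM -k11 -dIL +vs"
timbl_obj = timbl.TimblAPI(opts, "")
timbl_obj.learn("pp_merger.tr")            # tab-separated training instances

line = "\t".join(["feat1", "feat2", "?"])  # toy instance, '?' marks the unknown class
success, label = timbl_obj.classify(line)  # label is e.g. 'B' or 'I'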
diff --git a/scripts/chunker_scripts/split_np/pp_merger/split_half_train.py b/scripts/chunker_scripts/split_np/pp_merger/split_half_train.py
new file mode 100755
index 0000000000000000000000000000000000000000..efead2b04e5cc34151ba0ab8a9674f100a74ca27
--- /dev/null
+++ b/scripts/chunker_scripts/split_np/pp_merger/split_half_train.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+'''
+Created on 22-04-2013
+
+@author: Adam Pawlaczek
+'''
+
+from optparse import OptionParser
+import sys, os
+import corpus2
+descr = """%prog [options] in_path out_path1 out_path2
+
+"""
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+                      dest='output_format', default='ccl',
+                      help='set the output format; default: ccl')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 3:
+        sys.stderr.write('You need to provide in_path, out_path1 and out_path2.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    in_path, out_path1, out_path2 = args
+    main(in_path, out_path1, out_path2, options.input_format, options.output_format, options.tagset)
+
+def get_writer(out_path, output_format, tagset):
+    if out_path:
+        return corpus2.TokenWriter.create_path_writer(output_format, out_path,
+            tagset)
+    else:
+        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+
+def get_reader(in_path, input_format, tagset):
+    if in_path:
+        return corpus2.TokenReader.create_path_reader(
+            input_format, tagset, in_path)
+    else:
+        return corpus2.TokenReader.create_stdin_reader(input_format, tagset)
+
+def count_sentences(in_path, input_format, tg):
+    reader = get_reader(in_path, input_format, tg)
+
+    count = 0
+    sent = reader.get_next_sentence()
+    while sent:
+        count += 1
+        sent = reader.get_next_sentence()
+
+    return count
+
+def main(in_path, out_path1, out_path2, input_format, output_format, tagset):
+    tg = corpus2.get_named_tagset(tagset)
+
+    sentences_nr = count_sentences(in_path, input_format, tg)
+    sentences_half_nr = sentences_nr / 2
+    reader = get_reader(in_path, input_format, tg)
+    writer = get_writer(out_path1, output_format, tg)
+
+    sent = reader.get_next_sentence()
+    i = 0
+    while sent and i < sentences_half_nr:
+        writer.write_sentence(sent)
+        sent = reader.get_next_sentence()
+        i += 1
+
+    writer = get_writer(out_path2, output_format, tg)
+    while sent:
+        writer.write_sentence(sent)
+        sent = reader.get_next_sentence()
+
+
+if __name__ == '__main__':
+    go()
\ No newline at end of file
diff --git a/scripts/chunker_scripts/stats/count_chunk_length_histogram.py b/scripts/chunker_scripts/stats/count_chunk_length_histogram.py
new file mode 100755
index 0000000000000000000000000000000000000000..3209ff70f5188f6e64c0daa4c25a0426a084456c
--- /dev/null
+++ b/scripts/chunker_scripts/stats/count_chunk_length_histogram.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Created on Dec 21, 2012
+@author: jezozwierzak
+'''
+
+descr = """%prog [options] in_path
+
+"""
+
+from optparse import OptionParser
+import sys, os
+import corpus2
+from chunker_scripts.csv_table.CSVTable import CSVTable
+
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    parser.add_option('-f', '--folds', type="int", action='store',
+                      dest='folds', default=1,
+                      help='Number of folds')
+    parser.add_option('-c', '--chunk-names', type='string', action='store',
+                      dest='chunk_names', default='',
+                      help='set chunk_names to eval')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 1:
+        sys.stderr.write('You need to provide in_path. See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    in_path = args[0]
+    main(in_path, options.input_format, options.folds, options.tagset, options.chunk_names)
+
+def main(in_path, input_format, folds, tagset, chunk_names):
+
+    chunk_names = chunk_names.split(",")
+    tagset = corpus2.get_named_tagset(tagset)
+
+    input_paths = []
+    if folds > 1:
+        for fold in range(1, folds+1):
+            if input_format == "ccl":
+                input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml'))
+            elif input_format == "xces":
+                input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml'))
+    else:
+        if(os.path.isdir(in_path)):
+            for (path, dirs, files) in os.walk(in_path):
+                for file in files:
+                    input_paths.append(os.path.join(path, file))
+        else:
+            input_paths.append(in_path)
+
+    table = CSVTable()
+    table.addColumn('Nr')
+    for input_path in input_paths:
+        reader = corpus2.TokenReader.create_path_reader(
+            input_format, tagset, input_path)
+
+        sent = reader.get_next_sentence()
+        while sent:
+            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+            tokens = asent.tokens()
+
+            for chan_name in asent.all_channels():
+                if chan_name in chunk_names:
+                    chan = asent.get_channel(chan_name)
+                    ann_vec = chan.make_annotation_vector()
+                    for ann in ann_vec:
+
+                        if not table.hasColumn(chan_name):
+                            table.addColumn(chan_name, type='int')
+
+                        while table.rows < len(ann.indices):
+                            table.addEmptyRow()
+
+                        table.increment(chan_name, len(ann.indices)-1)
+            sent = reader.get_next_sentence()
+    print table
+
+if __name__ == '__main__':
+    go()
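The CSVTable bookkeeping above amounts to a per-channel histogram of chunk lengths; the same counting with plain dicts (toy data, illustrative only):

histogram = {}  # channel -> {chunk length in tokens: count}
for chan_name, length in [('chunk_np', 2), ('chunk_np', 2), ('chunk_vp', 1)]:
    by_len = histogram.setdefault(chan_name, {})
    by_len[length] = by_len.get(length, 0) + 1
# histogram == {'chunk_np': {2: 2}, 'chunk_vp': {1: 1}}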
diff --git a/scripts/chunker_scripts/stats/crossing.py b/scripts/chunker_scripts/stats/crossing.py
new file mode 100755
index 0000000000000000000000000000000000000000..d7dcb911dcf0ba662f7a80d017d22c4ec5dbeaa8
--- /dev/null
+++ b/scripts/chunker_scripts/stats/crossing.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Created on May 13, 2013
+
+@author: Maciej Zawadzki
+'''
+
+import corpus2
+from optparse import OptionParser
+import os
+import sys
+
+"""
+The purpose of this program is to count the number of names that are not
+contained within a single chunk_np. It prints all errors and, finally,
+the total statistics.
+"""
+
+def go():
+    parser = OptionParser()
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+                      dest='output_format', default='ccl',
+                      help='set the output format; default: ccl')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    parser.add_option('-v', '--verbose', action='store_true',
+                      help='verbose output')
+    parser.add_option('-f', '--folds', type="int", action='store',
+                      dest='folds', default=1, help='Number of folds')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 4:
+        sys.stderr.write('You need to provide the reference, crf, open and spejd directories.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+    wyniki = []
+    wzrc_file = get_ref_paths(args[0], options.folds, options.input_format)
+    crf_file = get_ref_paths(args[1], options.folds, options.input_format)
+    open_file = get_ref_paths(args[2], options.folds, options.input_format)
+    spejd_file = get_ref_paths(args[3], options.folds, options.input_format)
+    i = 0
+    for files in wzrc_file:
+        wyniki.append(main(files, options.input_format, options.tagset, crf_file[i], open_file[i], spejd_file[i], options.verbose))
+        i += 1
+
+    crf_ok = 0
+    crf_no = 0
+
+    open_ok = 0
+    open_no = 0
+
+    spejd_ok = 0
+    spejd_no = 0
+
+    err1crf = 0
+    err2crf = 0
+    err3crf = 0
+    err4crf = 0
+
+    err1open = 0
+    err2open = 0
+    err3open = 0
+    err4open = 0
+
+    err1spejd = 0
+    err2spejd = 0
+    err3spejd = 0
+    err4spejd = 0
+
+    for w in wyniki:
+        crf_ok = crf_ok + w[0]
+        crf_no = crf_no + w[1]
+        open_ok = open_ok + w[2]
+        open_no = open_no + w[3]
+        spejd_ok = spejd_ok + w[4]
+        spejd_no = spejd_no + w[5]
+        err1crf = err1crf + w[6]
+        err2crf = err2crf + w[7]
+        err3crf = err3crf + w[8]
+        err4crf = err4crf + w[9]
+        err1open = err1open + w[10]
+        err2open = err2open + w[11]
+        err3open = err3open + w[12]
+        err4open = err4open + w[13]
+        err1spejd = err1spejd + w[14]
+        err2spejd = err2spejd + w[15]
+        err3spejd = err3spejd + w[16]
+        err4spejd = err4spejd + w[17]
+
+    print "crf ok :"+str(crf_ok)+" crf err count :"+str(crf_no)+"\t"+" s1: "+str(err1crf)+ " s2: "+str(err2crf)+ " s3: "+str(err3crf)+ " s4: "+str(err4crf)
+    print "open ok :"+str(open_ok)+" open err count :"+str(open_no)+"\t"+" s1: "+str(err1open)+ " s2: "+str(err2open)+ " s3: "+str(err3open)+ " s4: "+str(err4open)
+    print "spejd ok :"+str(spejd_ok)+" spejd err count :"+str(spejd_no)+"\t"+" s1: "+str(err1spejd)+ " s2: "+str(err2spejd)+ " s3: "+str(err3spejd)+ " s4: "+str(err4spejd)
+# error classes (s1-s4):
+# s1: chunks having no overlap with the reference corpus
+# s2: chunks that are a subset of a reference chunk
+# s3: chunks that are a superset of a reference chunk
+# s4: chunks that only partially overlap a reference chunk
+
+def main(wzrc_path, input_format, tagset, crf_path, open_path, spejd_path, verbose):
+
+    tagset = corpus2.get_named_tagset(tagset)
+    print 
wzrc_path + wzrc_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, wzrc_path) + crf_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, crf_path) + open_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, open_path) + spejd_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, spejd_path) + + wzrc_sent = wzrc_reader.get_next_sentence() + crf_sent = crf_reader.get_next_sentence() + open_sent = open_reader.get_next_sentence() + spejd_sent = spejd_reader.get_next_sentence() + + wyniki_o = [] + + while wzrc_sent: + tokens = wzrc_sent.tokens() + + tab = [] + + asent = corpus2.AnnotatedSentence.wrap_sentence(wzrc_sent) + sent_id = wzrc_sent.id() + sent_size = wzrc_sent.size() + + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["np",ann_vec]) + if len(tab) != 1: + tab.append(["np", None]) + + asent = corpus2.AnnotatedSentence.wrap_sentence(crf_sent) + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["crf",ann_vec]) + if len(tab) != 2: + tab.append(["crf", None]) + + asent = corpus2.AnnotatedSentence.wrap_sentence(open_sent) + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["open",ann_vec]) + if len(tab) != 3: + tab.append(["open", None]) + + asent = corpus2.AnnotatedSentence.wrap_sentence(spejd_sent) + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["spejd",ann_vec]) + if len(tab) != 4: + tab.append(["spejd", None]) + + if len(tab) != 0 : + wyniki_o.append(check_sentence(tab, sent_size, tokens, sent_id, verbose)) + + + crf_sent = crf_reader.get_next_sentence() + open_sent = open_reader.get_next_sentence() + spejd_sent = spejd_reader.get_next_sentence() + wzrc_sent = wzrc_reader.get_next_sentence() + crf_ok = 0 + crf_no = 0 + + open_ok = 0 + open_no = 0 + + spejd_ok = 0 + spejd_no = 0 + + err1crf = 0 + err2crf = 0 + err3crf = 0 + err4crf = 0 + + err1open = 0 + err2open = 0 + err3open = 0 + err4open = 0 + + err1spejd = 0 + err2spejd = 0 + err3spejd = 0 + err4spejd = 0 + + for wyniki in wyniki_o: + crf_ok = crf_ok + wyniki[0][0] + crf_no = crf_no + wyniki[0][1] + open_ok = open_ok + wyniki[1][0] + open_no = open_no + wyniki[1][1] + spejd_ok = spejd_ok + wyniki[2][0] + spejd_no = spejd_no + wyniki[2][1] + err1crf = err1crf + wyniki[0][2] + err2crf = err2crf + wyniki[0][3] + err3crf = err3crf + wyniki[0][4] + err4crf = err4crf + wyniki[0][5] + + err1open = err1open + wyniki[1][2] + err2open = err2open + wyniki[1][3] + err3open = err3open + wyniki[1][4] + err4open = err4open + wyniki[1][5] + + err1spejd = err1spejd + wyniki[2][2] + err2spejd = err2spejd + wyniki[2][3] + err3spejd = err3spejd + wyniki[2][4] + err4spejd = err4spejd + wyniki[2][5] + + + + return [crf_ok, crf_no, open_ok, open_no, spejd_ok, spejd_no, err1crf, err2crf, err3crf, err4crf, err1open, err2open, err3open, err4open, err1spejd, err2spejd, err3spejd, err4spejd] +# print "crf ok :"+str(crf_ok)+" crf no :"+str(crf_no) +# print "open ok :"+str(open_ok)+" open no :"+str(open_no) +# print "spejd ok :"+str(spejd_ok)+" spejd no :"+str(spejd_no) + +def check_sentence(tab, size, tokens, sent_id, verbose): + tokensent = 
[] + for tok in tokens: + tokensent.append(tok.orth()) + + sent = [] + temp = [] + for ele in tab: + if ele[1]: + for ann in ele[1]: + temp.append(sorted(ann.indices)) + sent.append([ele[0],temp]) + temp = [] + + + result_crf = isOK(sent[0][1], sent[1], tokensent, sent_id, sent, verbose) + result_open = isOK(sent[0][1], sent[2], tokensent, sent_id, sent, verbose) + result_spejd = isOK(sent[0][1], sent[3], tokensent, sent_id, sent, verbose) + + return [result_crf, result_open, result_spejd] + + + + + +def isOK(comparer, chunks, tokensent, sent_id, sent, verbose): + ok = 0 + err = 0 + err1 = 0 + err2 = 0 + err3 = 0 + err4 = 0 + for chunk in chunks[1]: + if (is_equal(comparer, chunk)): + ok += 1 + else: + err += 1 + if verbose: + print '***'+chunks[0] + for s in sent: + printer(s, tokensent) + + + print + if have_no_intersection(comparer, chunk): + err1 += 1 + else: + setchunk = set(chunk) + for ele in comparer: + setele = set(ele) + if setchunk.issubset(setele): + err2 += 1 + elif setele.issubset(setchunk): + err3 += 1 + elif setele.intersection(setchunk) != set(): + err4 += 1 + return [ok, err, err1, err2, err3, err4] +def printer(sent, tokensent): + start= [] + end = [] + for ele in sent[1]: + start.append(ele[0]) + end.append(ele[len(ele)-1]) + + print sent[0], + print "\t", + i = 0 + for tok in tokensent: + tmp = tok + if is_inside(i, start): + tmp = "[ "+str(tmp) + if is_inside(i, end): + tmp = str(tmp)+" ]" + print tmp, + print "\t", + i += 1 + print + +def is_equal(chunktab, name): + setname = set(name) + for chunk in chunktab: + setchunk = set(chunk) + if setname == setchunk: + return True + return False + +def is_inside(i, tab): + for t in tab: + if i == t: + return True + return False + +def get_ref_paths(in_path, folds, input_format): + input_paths = [] + if folds > 1: + for fold in range(1, folds+1): + if input_format == "ccl": + input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + elif input_format == "xces": + input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) + else: + if(os.path.isdir(in_path)): + for (path, dirs, files) in os.walk(in_path): + for file in files: + input_paths.append(os.path.join(path, file)) + else: + input_paths.append(in_path) + return input_paths + +def have_no_intersection(comparer, chunk): + setchunk = set(chunk) + for ele in comparer: + setele = set(ele) + if setele.intersection(setchunk) != set(): + return False + return True +if __name__ == '__main__': + go() diff --git a/scripts/chunker_scripts/tools.py b/scripts/chunker_scripts/tools.py index d53ff44e0c0cb9b7c8e33f9545ba0ffb4730b223..8c73085bfd04210fc39f0a719bd0e4892407946f 100644 --- a/scripts/chunker_scripts/tools.py +++ b/scripts/chunker_scripts/tools.py @@ -307,7 +307,16 @@ def remove_channels_except(filepath, list=[]): break if write_line: - sources.write(line) + sources.write(line) + +def remove_all_channels(in_path, out_path): + with open(in_path, "r") as sources: + lines = sources.readlines() + with open(out_path, "w") as sources: + for line in lines: + if not "<ann" in line and not "</ann>" in line: + sources.write(line) + ''' CORPUS2 METHODS ''' diff --git a/scripts/chunker_scripts/utils/remove_almost_all_chunks.py b/scripts/chunker_scripts/utils/remove_almost_all_chunks.py new file mode 100755 index 0000000000000000000000000000000000000000..dddc70c183407e69885a2a0a013bd36f0615ea12 --- /dev/null +++ b/scripts/chunker_scripts/utils/remove_almost_all_chunks.py @@ -0,0 +1,31 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +''' 
+Created on Mar 25, 2013
+
+@author: Adam Pawlaczek
+'''
+import sys
+import re
+
+def main():
+    args = sys.argv[1:]
+
+    if len(args) != 2:
+        sys.stderr.write('You need to provide in_path and out_path.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    input = args[0]
+    output = args[1]
+    # channels to keep; all other annotation channels are dropped
+    chunks = ["chunk_np", "chunk_vp", "chunk_adjp", "chunk_agp", "chunk_qp"]
+    out_f = open(output, 'w')
+
+    for line in open(input):
+        m = re.match(r'<ann\ chan=\"(.+)\">[0-9]*<\/ann>', line.strip())
+        if m and not m.group(1) in chunks:
+            continue
+        out_f.write(line)
+    out_f.close()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
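For illustration, the regex above matches annotation lines of this shape, so only whitelisted channels survive the rewrite (toy input line):

import re

line = '<ann chan="chunk_pp">3</ann>'
m = re.match(r'<ann\ chan=\"(.+)\">[0-9]*<\/ann>', line.strip())
# m.group(1) == 'chunk_pp', which is not whitelisted, so this line would be dropped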