diff --git a/.gitignore b/.gitignore index 346a2a7c7498d9b6da36a537e49d8d04f70d4db2..ad2f035596b5886a8f64df710b536a26736aa513 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ # Compiled source # ################### *.pyc +build +iobber.egg-info +.project +.settings +dist +scripts/chunker_scripts.egg-info diff --git a/scripts/chunker_scripts/chunk_eval/chunk_eval.py b/scripts/chunker_scripts/chunk_eval/chunk_eval.py index cda686412eb97e17136401e155876bcd03d503b2..b26072e8d143866d7ef88fe0542d2642089cb3c2 100755 --- a/scripts/chunker_scripts/chunk_eval/chunk_eval.py +++ b/scripts/chunker_scripts/chunk_eval/chunk_eval.py @@ -19,6 +19,7 @@ Created on 01-08-2012 from optparse import OptionParser import corpus2 +import wccl import sys, os from chunker_scripts.csv_table.CSVTable import CSVTable import codecs @@ -128,7 +129,8 @@ def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, global all_hits global all_chunked - chan_names = chan_names.split(",") + if not isinstance(chan_names, list): + chan_names = chan_names.split(",") chunkTable = CSVTable(";") chunkTable.addColumn('Nr') @@ -150,8 +152,9 @@ def main(ch_path, ref_path, chan_names, input_format, out_path, tagset, verbose, bothTable.addSubColumn(chan_name, "P", type="float") bothTable.addSubColumn(chan_name, "R", type="float") bothTable.addSubColumn(chan_name, "F", type="float") - - tagset = corpus2.get_named_tagset(tagset) + + if not isinstance(tagset, wccl.Tagset): + tagset = corpus2.get_named_tagset(tagset) for fold in range(1, folds+1): if folds > 1: diff --git a/scripts/chunker_scripts/chunker_checkers/super_classifier.py b/scripts/chunker_scripts/chunker_checkers/super_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..ad38aeb662753064dbc873f9cb90e70df715af28 --- /dev/null +++ b/scripts/chunker_scripts/chunker_checkers/super_classifier.py @@ -0,0 +1,108 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 02-05-2013 + +@author: Adam Pawlaczek +''' +import codecs + +from optparse import OptionParser +import sys, os +import corpus2 + +from chunker_scripts import tools +from chunker_checker import ChunkerChecker +from chunker_scripts.chunk_eval.chunk_eval import main as chunk_eval +import iobber.iobber as iobber + +descr = """%prog [options] [in_dir] [out_dir] +in_dir has to contain subdirs with folds chunked by individual chunkers. +Each subdir should be named after the chunker that produced the files in it.
+""" + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='ccl', + help='set the output format; default: ccl') + parser.add_option('--config', type='string', action='store', + dest='config', default='kpwr.ini', + help='set iobber config; default: kpwr.ini') + parser.add_option('-c', '--chunk-names', type='string', action='store', + dest='chunk_names', default='chunk_np', + help='set chunk_names to eval') + parser.add_option('--chunkers', type='string', action='store', + dest='chunkers', default='', + help='set chunkers to eval') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1, + help='Number of folds') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('--file-prefix', type='string', action='store', + dest='file_prefix', default='ccl-', + help='set the file prefix; default: ccl-') + parser.add_option('--file-ext', type='string', action='store', + dest='file_ext', default='.xml', + help='set the file extention; default: .xml') + parser.add_option('-v', '--verbose', action='store_true', + default=False, dest='verbose') + (options, args) = parser.parse_args() + + if len(args) != 2: + sys.stderr.write('You need to provide a in_dir, out_dir and chunk_names and chunkers.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_path, out_path = args + main(in_path, out_path, options.input_format, options.output_format, + options.chunk_names, options.chunkers, options.folds, + options.tagset, options.file_prefix, options.file_ext, + options.verbose, options.config) + +class SuperClassifier(ChunkerChecker): + + def __init__(self, input_format, output_format, chunk_names, chunkers, folds, tagset, file_prefix, file_ext, verbose, config): + super(Iobber_v1, self).__init__(input_format, output_format, chunk_names, chunkers, folds, tagset, file_prefix, file_ext, verbose) + self.config = config + self.chunkers = chunkers.split(",") + + def create_directories(self): + self.dirs['models_path'] = os.path.join(self.dirs['out_dir'], 'models') + self.dirs['chunked_path'] = os.path.join(self.dirs['out_dir'], 'chunked') + self.dirs['nochann_path'] = os.path.join(self.dirs['out_dir'], 'nochann') + tools.mkdir_p(self.dirs['models_path']) + tools.mkdir_p(self.dirs['chunked_path']) + tools.mkdir_p(self.dirs['nochann_path']) + + def process_fold(self, fold): + num = str(fold).zfill(2) + self.train_fold(os.path.join(self.dirs['in_dir'], self.file_prefix + 'train' + num + self.file_ext), + os.path.join(self.dirs['models_path'], num)) + + def train_fold(self, in_path, model_path): + tr_file = codecs.open(os.path.join(model_path, 'model.tr'), 'wb', 'utf-8') + reader = tools.get_reader(in_path, self.input_format, self.tagset) + + sent = reader.get_next_sentence() + while sent: + asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + for chunk_name in asent.all_channels(): + chan = asent.get_channel(chunk_name) + + sent = reader.get_next_sentence() + + tr_file.close() + +def main(in_path, out_path, input_format, output_format, chunk_names, folds, tagset, file_prefix, file_ext, verbose, config): + sc = SuperClassifier(input_format, output_format, chunk_names, folds, tagset, 
file_prefix, file_ext, verbose, config) + sc.process_folds(in_path, out_path) + +if __name__ == '__main__': + go() + \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/__init__.py b/scripts/chunker_scripts/experiments/_tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/crf/ccl-test01.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/crf/ccl-test01.xml new file mode 100644 index 0000000000000000000000000000000000000000..92f0f25633e2b5f98a6f96011cc3c1d4d25628dc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/crf/ccl-test01.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk id="ch1" type="p"> + <sentence id="sent3"> + <tok> + <orth>Jako</orth> + <lex disamb="1"><base>jako</base><ctag>conj</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>pierwszy</orth> + <lex disamb="1"><base>pierwszy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>głos</orth> + <lex disamb="1"><base>głos</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>zabierze</orth> + <lex disamb="1"><base>zabrać</base><ctag>fin:sg:ter:perf</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>poseł</orth> + <lex disamb="1"><base>poseł</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np" head="1">1</ann> + </tok> + <tok> + <orth>Arkady</orth> + <lex disamb="1"><base>Arkady</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <tok> + <orth>Fiedler</orth> + <lex disamb="1"><base>Fiedler</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex disamb="1"><base>,</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>Platforma</orth> + <lex disamb="1"><base>platforma</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="chunk_np" head="1">2</ann> + </tok> + <tok> + <orth>Obywatelska</orth> + <lex disamb="1"><base>obywatelski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="chunk_np">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test01.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test01.xml new file mode 100644 index 0000000000000000000000000000000000000000..92f0f25633e2b5f98a6f96011cc3c1d4d25628dc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test01.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk id="ch1" type="p"> + <sentence id="sent3"> + <tok> + <orth>Jako</orth> + <lex disamb="1"><base>jako</base><ctag>conj</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>pierwszy</orth> + <lex disamb="1"><base>pierwszy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>głos</orth> + <lex disamb="1"><base>głos</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + 
<orth>zabierze</orth> + <lex disamb="1"><base>zabrać</base><ctag>fin:sg:ter:perf</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>poseł</orth> + <lex disamb="1"><base>poseł</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np" head="1">1</ann> + </tok> + <tok> + <orth>Arkady</orth> + <lex disamb="1"><base>Arkady</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <tok> + <orth>Fiedler</orth> + <lex disamb="1"><base>Fiedler</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex disamb="1"><base>,</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>Platforma</orth> + <lex disamb="1"><base>platforma</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="chunk_np" head="1">2</ann> + </tok> + <tok> + <orth>Obywatelska</orth> + <lex disamb="1"><base>obywatelski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="chunk_np">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test02.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/chunked/spejd/ccl-test02.xml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/chunker_scripts/experiments/_tests/oracle_data/ref/ccl-test01.xml b/scripts/chunker_scripts/experiments/_tests/oracle_data/ref/ccl-test01.xml new file mode 100644 index 0000000000000000000000000000000000000000..92f0f25633e2b5f98a6f96011cc3c1d4d25628dc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/oracle_data/ref/ccl-test01.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk id="ch1" type="p"> + <sentence id="sent3"> + <tok> + <orth>Jako</orth> + <lex disamb="1"><base>jako</base><ctag>conj</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>pierwszy</orth> + <lex disamb="1"><base>pierwszy</base><ctag>adj:sg:acc:m3:pos</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>głos</orth> + <lex disamb="1"><base>głos</base><ctag>subst:sg:acc:m3</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>zabierze</orth> + <lex disamb="1"><base>zabrać</base><ctag>fin:sg:ter:perf</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>poseł</orth> + <lex disamb="1"><base>poseł</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np" head="1">1</ann> + </tok> + <tok> + <orth>Arkady</orth> + <lex disamb="1"><base>Arkady</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <tok> + <orth>Fiedler</orth> + <lex disamb="1"><base>Fiedler</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="chunk_np">1</ann> + </tok> + <ns/> + <tok> + <orth>,</orth> + <lex disamb="1"><base>,</base><ctag>interp</ctag></lex> + <ann chan="chunk_np">0</ann> + </tok> + <tok> + <orth>Platforma</orth> + <lex disamb="1"><base>platforma</base><ctag>subst:sg:nom:f</ctag></lex> + <ann chan="chunk_np" head="1">2</ann> + </tok> + <tok> + <orth>Obywatelska</orth> + <lex disamb="1"><base>obywatelski</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="chunk_np">2</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + 
<ann chan="chunk_np">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/_tests/test_oracle.py b/scripts/chunker_scripts/experiments/_tests/test_oracle.py new file mode 100644 index 0000000000000000000000000000000000000000..0db64b9313f9960e73c8c022a2199adf44503adc --- /dev/null +++ b/scripts/chunker_scripts/experiments/_tests/test_oracle.py @@ -0,0 +1,134 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 13-05-2013 + +@author: Adam Pawlaczek +''' +import shutil +import unittest +import os + +import corpus2 + +from chunker_scripts.experiments.oracle import Oracle +from chunker_scripts import tools + +class TestOracleWholeChunks(unittest.TestCase): + + def setUp(self): + self.oracle = Oracle("ccl", "ccl", "chunk_np", "crf,spejd", 1, "nkjp") + shutil.copytree("oracle_data", "oracle_data_backup") + + def tearDown(self): + shutil.rmtree("oracle_data") + shutil.copytree("oracle_data_backup", "oracle_data") + shutil.rmtree("oracle_data_backup") + if os.path.exists("out"): + shutil.rmtree("out") + + def test_default_params(self): + self.assertEqual(self.oracle.chunk_names, ["chunk_np"]) + self.assertEqual(self.oracle.chunkers, ["crf", "spejd"]) + + def test_process(self): + self.oracle.process("oracle_data/chunked", "oracle_data/ref", "out") + + def test_get_ref_paths(self): + ref_paths = self.oracle.get_ref_paths("oracle_data/ref") + self.assertEqual(ref_paths, ["oracle_data/ref/ccl-test01.xml"]) + + def test_get_input_paths(self): + in_paths = self.oracle.get_input_paths("oracle_data/chunked") + self.assertEqual(in_paths, [{'spejd': 'oracle_data/chunked/spejd/ccl-test01.xml', 'crf': 'oracle_data/chunked/crf/ccl-test01.xml'}]) + + def test_get_writer(self): + tools.mkdir_p("out") + self.assertNotEqual(self.oracle.get_writer("out", 1), None) + + def test_get_readers(self): + readers = self.oracle.get_readers(self.oracle.get_input_paths("oracle_data/chunked")[0]) + self.assertEqual(len(readers), len(self.oracle.chunkers)) + + def test_clone_sent(self): + readers = self.oracle.get_readers(self.oracle.get_input_paths("oracle_data/chunked")[0]) + sent = readers["crf"].get_next_sentence() + sent2 = self.oracle.clone_sent(sent) + sent1_tokens = " ".join([tok.orth_utf8() for tok in sent.tokens()]) + sent2_tokens = " ".join([tok.orth_utf8() for tok in sent2.tokens()]) + self.assertEqual(sent1_tokens, sent2_tokens) + + def test_get_annots_first_idx(self): + readers = self.oracle.get_readers(self.oracle.get_input_paths("oracle_data/chunked")[0]) + sent = readers["crf"].get_next_sentence() + asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + channel = asent.get_channel("chunk_np") + idxs = self.oracle.get_annots_first_idx(channel) + self.assertEqual(set(idxs.keys()),set([4, 8])) + + def test_get_right_annots(self): + sent1 = corpus2.Sentence.create_sent("1") + for i in range(10): + token = corpus2.Token.create_utf8("jakiś_orth") + sent1.append(token) + asent1 = corpus2.AnnotatedSentence.wrap_sentence(sent1) + asent1.create_channel("test_chunk") + chan = asent1.get_channel("test_chunk") + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(0, seg_no) + chan.set_segment_at(1, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(3, seg_no) + chan.set_segment_at(4, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(5, seg_no) + chan.set_segment_at(6, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(8, seg_no) + chan.set_segment_at(9, seg_no) + 
+ sent2 = corpus2.Sentence.create_sent("2") + for i in range(10): + token = corpus2.Token.create_utf8("jakiś_orth") + sent2.append(token) + asent2 = corpus2.AnnotatedSentence.wrap_sentence(sent2) + asent2.create_channel("test_chunk") + chan = asent2.get_channel("test_chunk") + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(0, seg_no) + chan.set_segment_at(1, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(2, seg_no) + chan.set_segment_at(3, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(4, seg_no) + chan.set_segment_at(5, seg_no) + chan.set_segment_at(6, seg_no) + chan.set_segment_at(7, seg_no) + + seg_no = chan.get_new_segment_index() + chan.set_segment_at(8, seg_no) + + sent1 = corpus2.AnnotatedSentence.cast_as_sentence(asent1) + sent2 = corpus2.AnnotatedSentence.cast_as_sentence(asent2) + + right_annots = self.oracle.get_right_annots(sent1, {"crf":sent2}, "test_chunk") + + self.assertEqual(right_annots, [[0,1]]) + + def test_is_same_chunk(self): + self.assertTrue(self.oracle.is_same_chunk([4, 5, 6], [4, 5, 6])) + self.assertFalse(self.oracle.is_same_chunk([4, 5], [4, 5, 6])) + self.assertFalse(self.oracle.is_same_chunk([4, 5, 6, 7], [3, 4, 5, 6])) + self.assertFalse(self.oracle.is_same_chunk([4, 5, 6], [8, 9, 10])) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/iobber_v1.py b/scripts/chunker_scripts/experiments/iobber_v1.py deleted file mode 100755 index 6852ab2fd31411a0ebb7522f73a38b9525feb5f2..0000000000000000000000000000000000000000 --- a/scripts/chunker_scripts/experiments/iobber_v1.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/python -#-*- coding: utf-8 -*- -''' -Created on 05-07-2012 - -@author: jezozwierzak -''' - -import os, shutil, sys -from chunk_eval.chunk_eval import main as chunk_eval -from optparse import OptionParser -import tools -descr = """%prog [options] [in_dir] [out_dir] -""" -# in_dir musi zawierać pliki test*.xml oraz train*.xml -# -def go(): - parser = OptionParser(usage=descr) - parser.add_option('-i', '--input-format', type='string', action='store', - dest='input_format', default='ccl', - help='set the input format; default: ccl') - parser.add_option('-o', '--output-format', type='string', action='store', - dest='output_format', default='ccl', - help='set the output format; default: ccl') - parser.add_option('-t', '--tagset', type='string', action='store', - dest='tagset', default='nkjp', - help='set the tagset used in input; default: nkjp') - parser.add_option('-c', '--config', type='string', action='store', - dest='config', default='kpwr.ini', - help='Set path to config file; default: kpwr.ini') - parser.add_option('--chunk-names', type='string', action='store', - dest='chunk_names', default='chunk_np,chunk_vp,chunk_agp,chunk_adjp,chunk_qp', - help='set chunk_names to eval') - (options, args) = parser.parse_args() - - if len(args) != 2: - sys.stderr.write('You need to provide corpus_dir and out_dir.\n') - sys.stderr.write('See %s --help\n' % sys.argv[0]) - sys.exit(1) - - in_dir, out_dir = args - main(in_dir, out_dir, options.chunk_names, options.config, options.tagset, options.input_format, options.output_format) - -def main(in_dir, out_dir, chunk_names, config, tagset, input_format, output_format): - #Stworzenie katalogu wyjściowego - tools.mkdir_p(out_dir) - #Przekopiowanie plików train oraz test - for i in range(1, 11): - print "Przetwarzanie foldu nr: ", i, 
"------------------------------------------------------------------" - ccl_test = 'ccl-test' + str(i).zfill(2) + '.xml' - ccl_train = 'ccl-train' + str(i).zfill(2) + '.xml' - print 'Kopiowanie foldu ze źródła' - shutil.copy(os.path.join(in_dir, ccl_test), os.path.join(out_dir, ccl_test)) - shutil.copy(os.path.join(in_dir, ccl_train), os.path.join(out_dir, ccl_train)) - print 'Usuwanie tagów iob' - #Usunięcie channelów z plików test - tools.mkdir_p(os.path.join(out_dir,'nochann')) - shutil.copy(os.path.join(out_dir, ccl_test), os.path.join(out_dir, 'nochann', ccl_test)) - tools.remove_channels_except(os.path.join(out_dir, 'nochann', ccl_test), ["chunk_np_splited"]) - #train iobber - print 'Trenowanie iobbera' - tools.mkdir_p(os.path.join(out_dir, 'iobber_model', str(i).zfill(2))) - tools.train_iobber(config, os.path.join(out_dir, 'iobber_model', str(i).zfill(2)), os.path.join(out_dir, ccl_train)) - #chunk test files - print 'Chunkowanie' - tools.mkdir_p(os.path.join(out_dir, 'chunked')) - tools.run_iobber(config, os.path.join(out_dir, 'nochann', ccl_test), os.path.join(out_dir, 'chunked', ccl_test), os.path.join(out_dir, 'iobber_model', str(i).zfill(2))) - #diseval - print 'Porównanie wyników chunkowania z plikiami reference' - chunk_eval(os.path.join(out_dir, 'chunked'), out_dir, chunk_names, output_format, os.path.join(out_dir, 'results.csv'), tagset, True, 10) - -#in_path, out_path, in_format, out_format, tagset -if __name__ == '__main__': - go() - -# main(args[0], args[1]) -# main('/home/jezozwierzak/range/remove_not_cont/out_folds', '/home/jezozwierzak/range/remove_not_cont/iobber_v1') - diff --git a/scripts/chunker_scripts/experiments/oracle.py b/scripts/chunker_scripts/experiments/oracle.py index ed373ecd42f4e61f211498b085f8945ff0deeb25..04d6601237a8cb7ff340b362844b9e383e86362c 100755 --- a/scripts/chunker_scripts/experiments/oracle.py +++ b/scripts/chunker_scripts/experiments/oracle.py @@ -43,124 +43,140 @@ def go(): sys.exit(1) in_path, ref_path, out_path = args - main(in_path, ref_path, out_path, options.input_format, options.output_format, + oracle = Oracle(options.input_format, options.output_format, options.chunk_names, options.chunkers, options.folds, options.tagset) + oracle.process(in_path, ref_path, out_path) -def get_ref_paths(in_path, folds, input_format): - input_paths = [] - if folds > 1: - for fold in range(1, folds+1): - if input_format == "ccl": - input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) - elif input_format == "xces": - input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) - else: - if(os.path.isdir(in_path)): - for (path, dirs, files) in os.walk(in_path): - for file in files: - input_paths.append(os.path.join(path, file)) +class Oracle: + + def __init__(self, input_format, output_format, chunk_names, chunkers, folds, tagset): + self.input_format = input_format + self.output_format = output_format + self.chunk_names = chunk_names.split(",") + self.chunkers = chunkers.split(",") + self.folds = folds + self.tagset = corpus2.get_named_tagset(tagset) + + def get_ref_paths(self, ref_path): + input_paths = [] + if self.folds > 1: + for fold in range(1, self.folds+1): + if self.input_format == "ccl": + input_paths.append(os.path.join(ref_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + elif self.input_format == "xces": + input_paths.append(os.path.join(ref_path, 'test' + str(fold).zfill(2) + '.xml')) else: - input_paths.append(in_path) - return input_paths - -def get_input_paths(in_path, folds, 
input_format, chunkers): - input_paths = [] - for fold in range(1, folds+1): - fold_inputs = {} - for chunker in chunkers: - if os.path.isdir(os.path.join(in_path, chunker)): - if input_format == "ccl": - fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml') - elif input_format == "xces": - fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml') + if(os.path.isdir(ref_path)): + for (path, dirs, files) in os.walk(ref_path): + for file in files: + input_paths.append(os.path.join(path, file)) else: - print os.path.join(in_path, chunker), " dir doesn't exist" - input_paths.append(fold_inputs) - return input_paths + input_paths.append(ref_path) + return input_paths -def get_writer(out_path, output_format, tagset, fold): - out_path = get_output_path(out_path, fold, output_format) - return corpus2.TokenWriter.create_path_writer(output_format, out_path, - tagset) + def get_input_paths(self, in_path): + input_paths = [] + for fold in range(1, self.folds+1): + fold_inputs = {} + for chunker in self.chunkers: + if os.path.isdir(os.path.join(in_path, chunker)): + if self.input_format == "ccl": + fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif self.input_format == "xces": + fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml') + else: + print os.path.join(in_path, chunker), " dir doesn't exist" + input_paths.append(fold_inputs) + return input_paths -def get_output_path(out_path, fold, output_format): - if output_format == "ccl": - return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml') - elif input_format == "xces": - return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml') + def get_output_path(self, out_path, fold): + if self.output_format == "ccl": + return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif self.output_format == "xces": + return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml') -def get_readers(in_paths, input_format, tagset): - readers = {} - for chunker, in_path in in_paths.iteritems(): - readers[chunker] = tools.get_reader(in_path, input_format, tagset) - return readers + def get_writer(self, out_path, fold): + out_path = self.get_output_path(out_path, fold) + return corpus2.TokenWriter.create_path_writer(self.output_format, out_path, + self.tagset) + + def get_readers(self, in_paths): + readers = {} + for chunker, in_path in in_paths.iteritems(): + readers[chunker] = tools.get_reader(in_path, self.input_format, self.tagset) + return readers -def get_next_sents(readers): - result = {} - for chunker, reader in readers.iteritems(): - result[chunker] = reader.get_next_sentence() - return result + def get_next_sents(self, readers): + result = {} + for chunker, reader in readers.iteritems(): + result[chunker] = reader.get_next_sentence() + return result -def clone_sent(sent): - new_sent = corpus2.Sentence.create_sent(sent.id()) - for tok_idx, tok in enumerate(sent.tokens()): - tok = sent.tokens()[tok_idx] - if any(lex.is_disamb() for lex in tok.lexemes()): - new_sent.append(tok.clone()) - return new_sent + def clone_sent(self, sent): + new_sent = corpus2.Sentence.create_sent(sent.id()) + for tok_idx, tok in enumerate(sent.tokens()): + tok = sent.tokens()[tok_idx] + if any(lex.is_disamb() for lex in tok.lexemes()): + new_sent.append(tok.clone()) + return new_sent - -def main(in_path, ref_path, out_path, input_format, output_format, chunk_names, chunkers, folds, 
tagset): - tagset = corpus2.get_named_tagset(tagset) - chunk_names = chunk_names.split(",") - chunkers = chunkers.split(",") + def get_annots_first_idx(self, channel): + annots = channel.make_annotation_vector() + return dict([(min(ann.indices), ann) for ann in annots]) - ref_paths = get_ref_paths(ref_path, folds, input_format) - input_paths = get_input_paths(in_path, folds, input_format, chunkers) + def is_same_chunk(self, ch, ref): + return list(ch) == list(ref) - for fold in range(1, folds+1): - writer = get_writer(out_path, output_format, tagset, fold) - - readers = get_readers(input_paths[fold-1], input_format, tagset) - sents = get_next_sents(readers) - ref_reader = tools.get_reader(ref_paths[fold-1], input_format, tagset) + def get_right_annots(self, ref_sent, chunked_sents, chunk_name): + result = [] + ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) + ref_idxs = self.get_annots_first_idx(ref_asent.get_channel(chunk_name)) - while sents.itervalues().next(): - ref_sent = ref_reader.get_next_sentence() - ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) - - result_sent = clone_sent(ref_asent) - result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent) - - for chunk_name in ref_asent.all_channels(): - if chunk_name in chunk_names: - right_annots = [] - ref_annots = ref_asent.get_channel(chunk_name).make_annotation_vector() - ref = dict([(min(ann.indices), ann) for ann in ref_annots]) - - for chunker in chunkers: - ch_asent = corpus2.AnnotatedSentence.wrap_sentence(sents[chunker]) - if ch_asent.has_channel(chunk_name): - ch_annots = ch_asent.get_channel(chunk_name).make_annotation_vector() - ch = dict([(min(ann.indices), ann) for ann in ch_annots]) - - maybe_hits = set(ch).intersection(ref) - for idx in maybe_hits: - if list(ch[idx].indices) == list(ref[idx].indices) and [i for i in ch[idx].indices] not in right_annots: - right_annots.append([i for i in ch[idx].indices]) - - #add right chunks - result_asent.create_channel(chunk_name) - chan = result_asent.get_channel(chunk_name) - for ann in right_annots: - seg_no = chan.get_new_segment_index() - for idx in ann: - chan.set_segment_at(idx, seg_no) - - result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent) - writer.write_sentence(result_sent) - sents = get_next_sents(readers) + for chunker in chunked_sents.keys(): + ch_asent = corpus2.AnnotatedSentence.wrap_sentence(chunked_sents[chunker]) + ch_idxs = self.get_annots_first_idx(ch_asent.get_channel(chunk_name)) if ch_asent.has_channel(chunk_name) else {} # a chunker may lack this channel (guard kept from the old code) + maybe_hits = set(ch_idxs).intersection(ref_idxs) + for idx in maybe_hits: + chunk = [i for i in ch_idxs[idx].indices] + if self.is_same_chunk(ch_idxs[idx].indices, ref_idxs[idx].indices) and chunk not in result: + result.append(chunk) + return result + + def process(self, in_path, ref_path, out_path): + if not os.path.exists(out_path): + tools.mkdir_p(out_path) + ref_paths = self.get_ref_paths(ref_path) + input_paths = self.get_input_paths(in_path) + + for fold in range(1, self.folds+1): + writer = self.get_writer(out_path, fold) + readers = self.get_readers(input_paths[fold-1]) + ref_reader = tools.get_reader(ref_paths[fold-1], self.input_format, self.tagset) + sents = self.get_next_sents(readers) + while sents.itervalues().next(): + + ref_sent = ref_reader.get_next_sentence() + ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) + + result_sent = self.clone_sent(ref_asent) + result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent) + + for chunk_name in ref_asent.all_channels(): + if chunk_name in 
self.chunk_names: + right_annots = self.get_right_annots(ref_sent, sents, chunk_name) + + result_asent.create_channel(chunk_name) + chan = result_asent.get_channel(chunk_name) + for ann in right_annots: + seg_no = chan.get_new_segment_index() + for idx in ann: + chan.set_segment_at(idx, seg_no) + + result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent) + writer.write_sentence(result_sent) + sents = self.get_next_sents(readers) + if __name__ == '__main__': go() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/oracle_old.py b/scripts/chunker_scripts/experiments/oracle_old.py new file mode 100755 index 0000000000000000000000000000000000000000..ed373ecd42f4e61f211498b085f8945ff0deeb25 --- /dev/null +++ b/scripts/chunker_scripts/experiments/oracle_old.py @@ -0,0 +1,166 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 19-02-2013 + +@author: jezozwierzak +''' +from optparse import OptionParser +import sys, os +import corpus2 +from chunker_scripts import tools + +descr = """%prog [options] [in_dir] [ref_dir] [out_dir] +in_dir has to contain subdirs with folds chunked by individual chunkers. +Subdir should be named as chunker which chunked files in it. +""" + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='ccl', + help='set the output format; default: ccl') + parser.add_option('-c', '--chunk-names', type='string', action='store', + dest='chunk_names', default='', + help='set chunk_names to eval') + parser.add_option('--chunkers', type='string', action='store', + dest='chunkers', default='', + help='set chunkers to eval') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1, + help='Number of folds') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + (options, args) = parser.parse_args() + + if len(args) != 3 and options.chunk_names == '' and options.chunkers == '': + sys.stderr.write('You need to provide a in_dir, ref_dir and out_dir and chunk_names and chunkers.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_path, ref_path, out_path = args + main(in_path, ref_path, out_path, options.input_format, options.output_format, + options.chunk_names, options.chunkers, options.folds, options.tagset) + +def get_ref_paths(in_path, folds, input_format): + input_paths = [] + if folds > 1: + for fold in range(1, folds+1): + if input_format == "ccl": + input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + elif input_format == "xces": + input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) + else: + if(os.path.isdir(in_path)): + for (path, dirs, files) in os.walk(in_path): + for file in files: + input_paths.append(os.path.join(path, file)) + else: + input_paths.append(in_path) + return input_paths + +def get_input_paths(in_path, folds, input_format, chunkers): + input_paths = [] + for fold in range(1, folds+1): + fold_inputs = {} + for chunker in chunkers: + if os.path.isdir(os.path.join(in_path, chunker)): + if input_format == "ccl": + fold_inputs[chunker] = os.path.join(in_path, chunker, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif input_format == "xces": + 
fold_inputs[chunker] = os.path.join(in_path, chunker, 'test' + str(fold).zfill(2) + '.xml') + else: + print os.path.join(in_path, chunker), " dir doesn't exist" + input_paths.append(fold_inputs) + return input_paths + +def get_writer(out_path, output_format, tagset, fold): + out_path = get_output_path(out_path, fold, output_format) + return corpus2.TokenWriter.create_path_writer(output_format, out_path, + tagset) + +def get_output_path(out_path, fold, output_format): + if output_format == "ccl": + return os.path.join(out_path, 'ccl-test' + str(fold).zfill(2) + '.xml') + elif input_format == "xces": + return os.path.join(out_path, 'test' + str(fold).zfill(2) + '.xml') + +def get_readers(in_paths, input_format, tagset): + readers = {} + for chunker, in_path in in_paths.iteritems(): + readers[chunker] = tools.get_reader(in_path, input_format, tagset) + return readers + +def get_next_sents(readers): + result = {} + for chunker, reader in readers.iteritems(): + result[chunker] = reader.get_next_sentence() + return result + +def clone_sent(sent): + new_sent = corpus2.Sentence.create_sent(sent.id()) + for tok_idx, tok in enumerate(sent.tokens()): + tok = sent.tokens()[tok_idx] + if any(lex.is_disamb() for lex in tok.lexemes()): + new_sent.append(tok.clone()) + return new_sent + + +def main(in_path, ref_path, out_path, input_format, output_format, chunk_names, chunkers, folds, tagset): + tagset = corpus2.get_named_tagset(tagset) + chunk_names = chunk_names.split(",") + chunkers = chunkers.split(",") + + ref_paths = get_ref_paths(ref_path, folds, input_format) + input_paths = get_input_paths(in_path, folds, input_format, chunkers) + + for fold in range(1, folds+1): + writer = get_writer(out_path, output_format, tagset, fold) + + readers = get_readers(input_paths[fold-1], input_format, tagset) + sents = get_next_sents(readers) + ref_reader = tools.get_reader(ref_paths[fold-1], input_format, tagset) + + while sents.itervalues().next(): + ref_sent = ref_reader.get_next_sentence() + ref_asent = corpus2.AnnotatedSentence.wrap_sentence(ref_sent) + + result_sent = clone_sent(ref_asent) + result_asent = corpus2.AnnotatedSentence.wrap_sentence(result_sent) + + for chunk_name in ref_asent.all_channels(): + if chunk_name in chunk_names: + right_annots = [] + ref_annots = ref_asent.get_channel(chunk_name).make_annotation_vector() + ref = dict([(min(ann.indices), ann) for ann in ref_annots]) + + for chunker in chunkers: + ch_asent = corpus2.AnnotatedSentence.wrap_sentence(sents[chunker]) + if ch_asent.has_channel(chunk_name): + ch_annots = ch_asent.get_channel(chunk_name).make_annotation_vector() + ch = dict([(min(ann.indices), ann) for ann in ch_annots]) + + maybe_hits = set(ch).intersection(ref) + for idx in maybe_hits: + if list(ch[idx].indices) == list(ref[idx].indices) and [i for i in ch[idx].indices] not in right_annots: + right_annots.append([i for i in ch[idx].indices]) + + #add right chunks + result_asent.create_channel(chunk_name) + chan = result_asent.get_channel(chunk_name) + for ann in right_annots: + seg_no = chan.get_new_segment_index() + for idx in ann: + chan.set_segment_at(idx, seg_no) + + result_sent = corpus2.AnnotatedSentence.cast_as_sentence(result_asent) + writer.write_sentence(result_sent) + sents = get_next_sents(readers) + + +if __name__ == '__main__': + go() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/remove_not_cont_chunks.py b/scripts/chunker_scripts/experiments/remove_not_cont_chunks.py new file mode 100644 index 
0000000000000000000000000000000000000000..1a1f2214eaf0a0e2a5a0bd3cd65f0f1a1d5ebb90 --- /dev/null +++ b/scripts/chunker_scripts/experiments/remove_not_cont_chunks.py @@ -0,0 +1,141 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 27-12-2012 + +@author: Adam Pawlaczek +''' + +from optparse import OptionParser +import sys, os, codecs +import corpus2 +from operator import itemgetter + +descr = """%prog [options] in_path out_path + +""" + +def get_writer(out_path, output_format, tagset): + if out_path: + return corpus2.TokenWriter.create_path_writer(output_format, out_path, + tagset) + else: + return corpus2.TokenWriter.create_stdout_writer(output_format, tagset) + +def get_reader(in_path, input_format, tagset): + if in_path: + return corpus2.TokenReader.create_path_reader( + input_format, tagset, in_path) + else: + return corpus2.TokenReader.create_stdin_reader(input_format, tagset) + +def get_output_path(out_path, basename = None): + if basename == None: + return out_path + else: + return os.path.join(out_path, basename) + +def get_input_paths(in_path, folds, input_format): + input_paths = [] + if folds > 1: + for fold in range(1, folds+1): + if input_format == "ccl": + input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + input_paths.append(os.path.join(in_path, 'ccl-train' + str(fold).zfill(2) + '.xml')) + elif input_format == "xces": + input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) + input_paths.append(os.path.join(in_path, 'train' + str(fold).zfill(2) + '.xml')) + else: + if(os.path.isdir(in_path)): + for (path, dirs, files) in os.walk(in_path): + for file in files: + input_paths.append(os.path.join(path, file)) + else: + input_paths.append(in_path) + + return input_paths + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='ccl', + help='set the output format; default: ccl') + parser.add_option('-c', '--chunk-names', type='string', action='store', + dest='chunk_names', default='', + help='set chunk_names to eval') + parser.add_option('-C', '--chunkers', type='string', action='store', + dest='chunkers', default='', + help='set chunker names') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1, + help='Number of folds') + (options, args) = parser.parse_args() + + if len(args) != 2: + sys.stderr.write('You need to provide in_path and out_path.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_path, out_path = args + main(in_path, out_path, options.input_format, options.output_format, options.chunk_names, options.folds) + + +def is_not_continous(inds): + l2 = range(inds[0], inds[-1] + 1) # the gap-free index range from first to last index + return inds != l2 # True iff the annotation is discontinuous + +def choose_ids_with_head(inds, head_ind): # returns the contiguous run of indices that contains the head + begin = inds[0] + ended = False + for i in range(inds[0], inds[-1]+2): + if not i in inds or i == inds[-1]+1: + if not ended and head_ind in range(begin, i): + return range(begin, i) + ended = True + if i in inds and ended: + ended = False + begin = i + + +def main(in_path, out_path, input_format, output_format, chunk_names, folds): + tagset = corpus2.get_named_tagset("nkjp") + chunk_names = chunk_names.split(",") + + input_paths = get_input_paths(in_path, folds, input_format) + + for input_path in input_paths: + reader = 
get_reader(input_path, input_format, tagset) + if folds > 1: + output_path = get_output_path(out_path, os.path.basename(input_path)) + else: + output_path = get_output_path(out_path) + + writer = get_writer(output_path, output_format, tagset) + + while True: + sent = reader.get_next_sentence() + if not sent: + break + asent = corpus2.AnnotatedSentence.wrap_sentence(sent) + tokens = asent.tokens() + + for chan_name in asent.all_channels(): + if chan_name in chunk_names: + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + new_idx = len(ann_vec) + for ann in ann_vec: + inds = sorted(ann.indices) + if is_not_continous(inds): + ids_with_head = choose_ids_with_head(inds, ann.head_index) + for i in inds: + if i not in ids_with_head: + chan.set_segment_at(i, 0) + sent = corpus2.AnnotatedSentence.cast_as_sentence(asent) + writer.write_sentence(sent) + +if __name__ == '__main__': + go() \ No newline at end of file diff --git a/scripts/chunker_scripts/experiments/spejd_v1.py b/scripts/chunker_scripts/experiments/spejd_v1.py new file mode 100755 index 0000000000000000000000000000000000000000..266d4af6a4a8a7e5753f0a40680aa5081b609347 --- /dev/null +++ b/scripts/chunker_scripts/experiments/spejd_v1.py @@ -0,0 +1,109 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +''' +Created on 10-05-2013 + +@author: Adam Pawlaczek +''' + +import os, shutil, sys +from optparse import OptionParser +from chunker_scripts.chunk_eval.chunk_eval import main as chunk_eval +from chunker_scripts import tools +import logging + +descr = """%prog [options] [corpus_dir] [out_dir] +""" +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='ccl', + help='set the input format; default: ccl') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('-c', '--config', type='string', action='store', + dest='config', default='/home/jezozwierzak/chunker/spejd/verified_nkjp_gramma/config.ini', + help='Set path to config file; default: /home/jezozwierzak/chunker/spejd/verified_nkjp_gramma/config.ini') + parser.add_option('--chunk-names', type='string', action='store', + dest='chunk_names', default='chunk_np,chunk_vp,chunk_agp,chunk_adjp,chunk_qp', + help='set chunk_names to eval') + parser.add_option('-f', '--folds', type="int", action='store', + dest='folds', default=1,help='Number of folds') + parser.add_option('-v', '--verbose', action='store_true', + dest='verbose', default=False, help='verbose mode') + (options, args) = parser.parse_args() + + if len(args) != 2: + sys.stderr.write('You need to provide corpus_dir and out_dir.\n') + sys.stderr.write('See %s --help\n' % sys.argv[0]) + sys.exit(1) + + in_dir, out_dir = args + main(in_dir, out_dir, options.chunk_names, options.config, options.tagset, options.input_format, options.folds, options.verbose) + + +def main(in_dir, out_dir, chunk_names, config, tagset, input_format, folds, verbose): + # Create the output directory first, so that the log file can be placed in it + tools.mkdir_p(out_dir) + logging.basicConfig(filename=os.path.join(out_dir, 'spejd_v1.log'), level=logging.INFO) + if verbose: + soh = logging.StreamHandler(sys.stdout) + soh.setLevel(logging.INFO) + logger = logging.getLogger() + logger.addHandler(soh) + + # Copy the train and test files for each fold + for i in range(1, folds+1): + test_name = 'test' + str(i).zfill(2) + '.xml' + train_name = 'train' + str(i).zfill(2) + '.xml' + 
ccl_test = 'ccl-test' + str(i).zfill(2) + '.xml' + ccl_train = 'ccl-train' + str(i).zfill(2) + '.xml' + + logging.info('Copying fold from the source') + if input_format == "ccl": + tools.ccl2xcesWithIob(os.path.join(in_dir, ccl_test), os.path.join(out_dir, test_name)) + tools.ccl2xcesWithIob(os.path.join(in_dir, ccl_train), os.path.join(out_dir, train_name)) + else: + shutil.copy(os.path.join(in_dir, test_name), os.path.join(out_dir, test_name)) + shutil.copy(os.path.join(in_dir, train_name), os.path.join(out_dir, train_name)) + + logging.info('Removing IOB tags') + tools.mkdir_p(os.path.join(out_dir,'noiob')) + shutil.copy(os.path.join(out_dir, test_name), os.path.join(out_dir, 'noiob', test_name)) + tools.remove_iob(os.path.join(out_dir, 'noiob', test_name)) + + logging.info('Removing disambs beyond the first one') + tools.mkdir_p(os.path.join(out_dir,'only1disamb')) + tools.force1disamb(os.path.join(out_dir, 'noiob', test_name), os.path.join(out_dir, 'only1disamb', test_name), 'xces', 'xces', tagset) + + logging.info('Removing lexemes without disamb') + tools.mkdir_p(os.path.join(out_dir,'without_lex')) + tools.remove_lex_without_disamb(os.path.join(out_dir, 'only1disamb', test_name), os.path.join(out_dir, 'without_lex', test_name), 'xces', 'xces', tagset) + + logging.info('Running Spejd') + tools.mkdir_p(os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2))) + shutil.copy(os.path.join(out_dir, 'without_lex', test_name), os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2), 'morph.xml')) + tools.run_spejd(config, os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2))) + #tei2xces + logging.info('Converting Spejd output to XCES') + tools.tei2xces(os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2))) + #xces2ccl + logging.info('Converting Spejd XCES output to CCL') + tools.mkdir_p(os.path.join(out_dir, 'ccl', 'spejd_test_result')) + tools.mkdir_p(os.path.join(out_dir, 'ccl', 'ref')) + tools.xces2ccl(os.path.join(out_dir, 'spejd_test_result', str(i).zfill(2), 'out.xml'), os.path.join(out_dir, 'ccl', 'spejd_test_result', ccl_test)) + tools.xces2ccl(os.path.join(out_dir, test_name), os.path.join(out_dir, 'ccl', 'ref', ccl_test)) + #change channels + logging.info('Renaming chunk channels') + tools.mkdir_p(os.path.join(out_dir, 'ccl', 'changed_spejd')) + shutil.copy(os.path.join(out_dir, 'ccl', 'spejd_test_result', ccl_test), os.path.join(out_dir, 'ccl', 'changed_spejd')) + tools.change_chunk_channels(os.path.join(out_dir, 'ccl', 'changed_spejd', ccl_test)) + + #diseval + logging.info('Comparing chunking results with the reference files') + chunk_eval(os.path.join(out_dir, 'ccl', 'changed_spejd'), os.path.join(out_dir, 'ccl', 'ref'), chunk_names, "ccl", os.path.join(out_dir, 'results.csv'), tagset, verbose, folds) + + +if __name__ == '__main__': + go() diff --git a/scripts/chunker_scripts/feature_selection/crf_wrapper.py b/scripts/chunker_scripts/feature_selection/crf_wrapper.py deleted file mode 100755 index d60a3778301de3c5d6f80203a2f462c6e1d79b01..0000000000000000000000000000000000000000 --- a/scripts/chunker_scripts/feature_selection/crf_wrapper.py +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/python -#-*- coding: utf-8 -*- -''' -Created on 18-03-2013 - -@author: Adam Pawlaczek -''' -import tools -import sys, math, os, shutil, random -from chunk_eval import chunk_eval_avg -from optparse import OptionParser -from threading import Thread - -descr = """%prog [options] corpus_dir out_dir""" - -def go(): - parser = OptionParser(usage=descr) - parser.add_option('-w', '--max-window', 
type="int", action='store', - dest='window', default=2, - help='Set max window for feature') - parser.add_option('-c', '--config', type='string', action='store', - dest='config', default='/home/jezozwierzak/range2/feature_selection/config/kpwr-experimental.ini', - help='set the config path to ini file') - parser.add_option('-f', '--folds', type="int", action='store', - dest='folds', default=10, - help='Number of folds default: 10') - (options, args) = parser.parse_args() - - if len(args) != 2: - sys.stderr.write('You need to provide corpus_dir and out_dir.\n') - sys.stderr.write('See %s --help\n' % sys.argv[0]) - sys.exit(1) - - corpus_dir, out_dir = args - main(corpus_dir, out_dir, options.config, options.window, options.folds) - -#Odpalenie crf-a i sprawdzenie jaki będzie F-measure dla podanych cech -def f(out_dir, config_dir, config_name, corpus_dir, folds, vector = [], constructed = {}): - tools.mkdir_p(os.path.join(out_dir, "config_files"), True) - generate_features_txt(os.path.join(out_dir, "config_files", config_name + "-layer1.txt"), vector = vector, constructed = constructed) - generate_features_txt(os.path.join(out_dir, "config_files", config_name + "-layer2.txt"), vector = vector, constructed = constructed) - #Copy config files - shutil.copyfile(os.path.join(config_dir, config_name + ".ccl"), os.path.join(out_dir, "config_files", config_name + ".ccl")) - shutil.copyfile(os.path.join(config_dir, config_name + ".ini"), os.path.join(out_dir, "config_files", config_name + ".ini")) - - threads = [] - - for fold in range(1, folds+1): - t = Thread(target=process_fold, args=(fold, out_dir, corpus_dir, config_dir, config_name,)) - threads.append(t) - t.start() - - for fold in range(1, folds+1): - t.join() - - result = chunk_eval_avg.get_avg_results(os.path.join(out_dir, "chunked"), corpus_dir, ["chunk_np"]) - - f = open(os.path.join(out_dir, "result.csv"), 'w+') - f.write("vector: ", vector) - f.write("result: ", result) - f.close() - return result["f"] - -def process_fold(fold, out_dir, corpus_dir, config_dir, config_name): - tools.mkdir_p(os.path.join(out_dir, "models", str(fold).zfill(2)), True) - #Copy dict files - shutil.copyfile(os.path.join(config_dir, "dict-case.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-case.lex")) - shutil.copyfile(os.path.join(config_dir, "dict-prep.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-prep.lex")) - shutil.copyfile(os.path.join(config_dir, "dict-sie.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-sie.lex")) - - #TRAINING - tools.train_iobber(os.path.join(config_dir, config_name + ".ini"), - os.path.join(out_dir, "models", str(fold).zfill(2)), - os.path.join(corpus_dir, "ccl-train%02d.xml"%(fold))) - - #Remove channels - tools.mkdir_p(os.path.join(out_dir, "empty")) - shutil.copyfile(os.path.join(corpus_dir, "ccl-test%02d.xml"%(fold)), os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold))) - tools.remove_channels(os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold))) - - #RUNING - tools.mkdir_p(os.path.join(out_dir, "chunked")) - tools.run_iobber(os.path.join(config_dir, config_name + ".ini"), - os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold)), - os.path.join(out_dir, "chunked", "ccl-test%02d.xml"%(fold)), - os.path.join(out_dir, "models", str(fold).zfill(2))) - - #Remove dicts - os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-case.lex")) - os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-prep.lex")) - os.remove(os.path.join(out_dir, 
"models", str(fold).zfill(2), "dict-sie.lex")) - -#Wybranie sąsiada -def neightbour(v): - a = random.randint(0, len(v)) - b = random.randint(0, len(v[0])) - v[a][b] = 1 if v[a][b] == 0 else 0 - a = random.randint(0, len(v)) - b = random.randint(0, len(v[0])) - v[a][b] = 1 if v[a][b] == 0 else 0 - return v - -#Obliczenie temperatury początkowej -def tempestimation(out_dir, config_dir, config_name, corpus_dir, folds, vector): - iterations = 100 #ilosc iteracji symulacji - sum = 0 - results = {} #histogram wyników - for i in range(iterations): - - result = f(os.path.join(out_dir, str(i).zfill(2)), config_dir, config_name, corpus_dir, folds, vector) - sum += result - if result not in results.keys(): - results[result] = 1 - else: - results[result] += 1 - vector = neightbour(vector) - avg = sum / float(iterations)#obliczenie średniego wyniku - k = 0 - deviation = 0 - for result in results: - deviation += results[result] * ((k - avg) ** 2) - k += 1 - #obliczenie odchylenia standardowego - deviation = math.sqrt(deviation / iterations) - return deviation - -def get_features_number(cclfile): - f = open(cclfile) - count = 0 - for line in f: - count += line.count(";") - f.close() - return count + 1 - -def generate_features_txt(resultfile, vector = [], constructed = {}): - out = open(resultfile, 'w+') - feature_num = 0 - actual_feature_num = len(vector[0]) / 2 + 1 - - for i in range(len(vector)): - for j in range(len(vector[i])): - if vector[i][j] == 1: - out.write('U%02d:%%x[%d,%d]'%(feature_num, j if j >= actual_feature_num else -j, i)) - out.write("\n") - feature_num += 1 - out.write("\n") - for i in range(len(constructed)): - feats = constructed[i].split("%") - - out.write("\n") - out.write("B") - out.close() - -def create_null_vector(features_num, window): - vector = [] - for i in range(features_num): - vector.appendCell(tools.zerolistmaker(window*2 + 1)) - return vector - -def randomize_vector(features_num, window): - vector = [] - for i in range(features_num): - win_vector = [] - for j in range(window): - win_vector.appendCell(random.randint(0, 1)) - vector.appendCell(win_vector) - return vector - -def P(e, en, temp): - if e - en < 0: - return e ** ((e - en) / (temp)) - else: - return 1 - -def main(corpus_dir, out_dir, config, window, folds): - config_dir = os.path.dirname(config) - config_name = os.path.splitext(os.path.basename(config))[0] - config_ccl = os.path.join(config_dir, config_name + ".ccl") - - constructed = [] - constructed.appendCell("1%2") - generate_features_txt(os.path.join(out_dir, "config_files", config_name + "-layer1.txt"), constructed = constructed) - - if not os.path.exists(out_dir): - tools.mkdir_p(out_dir) - - a_vector = randomize_vector(get_features_number(config_ccl), window) - temperature = tempestimation(os.path.join(out_dir, "estimation"), config_dir, config_name, corpus_dir, folds, a_vector) - - a_value = f(os.path.join(out_dir, "selection", "first"), config_dir, config_name, corpus_dir, folds, a_vector) - b_value = 0 - i = 1 - - while temperature > 0: - b_vector = neightbour(a_vector) - b_value = f(os.path.join(out_dir, "selection", str(i).zfill(2)), config_dir, config_name, corpus_dir, folds, b_vector) - prob = P(a_value, b_value, temperature) - if b_value > a_value: - a_vector = b_vector - elif random.randint(0, 1) < prob: - a_vector = b_vector - temperatore = temperature * 0.95 - i += 1 - -if __name__ == '__main__': - go() diff --git a/scripts/chunker_scripts/feature_selection/crf_wrapper2.py b/scripts/chunker_scripts/feature_selection/crf_wrapper2.py index 
31642560e2a7c214a5873c905f53e00d4820c13b..2bebbcb0f8e3696e6b3a7696c608fdbfff45d440 100755 --- a/scripts/chunker_scripts/feature_selection/crf_wrapper2.py +++ b/scripts/chunker_scripts/feature_selection/crf_wrapper2.py @@ -8,8 +8,8 @@ Created on Mar 25, 2013 from optparse import OptionParser import sys, os, random, shutil import anneal -from chunker_scripts2 import tools -from chunker_scripts2.chunk_eval import chunk_eval_avg +from chunker_scripts import tools +from chunker_scripts.chunk_eval import chunk_eval_avg from threading import Thread import multiprocessing diff --git a/scripts/chunker_scripts/folds_maker/FoldsMaker.py b/scripts/chunker_scripts/folds_maker/FoldsMaker.py old mode 100644 new mode 100755 index 88fd2cd9e7a86af708c2823de28d9e99aead5b37..7696abbac88fbb59f988c2c63e08384e1c71e459 --- a/scripts/chunker_scripts/folds_maker/FoldsMaker.py +++ b/scripts/chunker_scripts/folds_maker/FoldsMaker.py @@ -7,7 +7,7 @@ Created on 16-08-2012 ''' import sys, os, subprocess, random, shutil import corpus2 -import tools +from chunker_scripts import tools from optparse import OptionParser descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir] @@ -16,7 +16,7 @@ descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir] class FoldsMaker: - def __init__(self, input, output, in_format, out_format, held_out_bool, folds): + def __init__(self, input, output, in_format, out_format, held_out_bool, main_part_percent, folds): self.folds = folds self.sentences = [] self.input = input @@ -25,8 +25,8 @@ class FoldsMaker: self.out_format = out_format self.held_out_bool = held_out_bool - self.held_out_param = 0.2 - self.main_param = 0.8 + self.held_out_param = 1 - float(main_part_percent) + self.main_param = float(main_part_percent) self.held_out = '' self.held_out_folds = [] @@ -76,7 +76,7 @@ class FoldsMaker: if not sent: break - self.sentences.appendCell(sent) + self.sentences.append(sent) def splitHeldOut(self): ind = int(len(self.sentences) * self.held_out_param) @@ -89,8 +89,8 @@ class FoldsMaker: fold_held_out_sents = len(self.held_out) / self.folds for i in range(0, self.folds): if self.held_out_bool: - self.held_out_folds.appendCell(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents]) - self.main_folds.appendCell(self.main[i*fold_main_sents:(i+1)*fold_main_sents]) + self.held_out_folds.append(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents]) + self.main_folds.append(self.main[i*fold_main_sents:(i+1)*fold_main_sents]) def saveFolds(self): tools.mkdir_p(os.path.join(self.output, 'main')) @@ -152,6 +152,9 @@ def go(): help='set the input format; default: ccl') parser.add_option('--with-heldout', action='store_true', dest='held_out', default=False, help='Split data to main and held_out') + parser.add_option('-p', '--main-part', type='string', action='store', + dest='main_part', default='0.9', + help='set the input main part percentage; default: 0.9') (options, args) = parser.parse_args() if len(args) != 3: @@ -160,10 +163,10 @@ def go(): sys.stderr.write('See %s --help\n' % sys.argv[0]) sys.exit(1) - main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.folds) + main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.main_part, options.folds) -def main(corpusKind, corpusDir, outDir, in_format, out_format, held_out, folds=1): - foldsMaker = FoldsMaker(corpusDir, outDir, in_format, out_format, held_out, folds) +def main(corpusKind, corpusDir, outDir, in_format, 
diff --git a/scripts/chunker_scripts/folds_maker/FoldsMaker.py b/scripts/chunker_scripts/folds_maker/FoldsMaker.py
old mode 100644
new mode 100755
index 88fd2cd9e7a86af708c2823de28d9e99aead5b37..7696abbac88fbb59f988c2c63e08384e1c71e459
--- a/scripts/chunker_scripts/folds_maker/FoldsMaker.py
+++ b/scripts/chunker_scripts/folds_maker/FoldsMaker.py
@@ -7,7 +7,7 @@ Created on 16-08-2012
 '''
 import sys, os, subprocess, random, shutil
 import corpus2
-import tools
+from chunker_scripts import tools
 from optparse import OptionParser
 
 descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir]
@@ -16,7 +16,7 @@ descr = """%prog [options] [CorpusKind] [Corpus dir] [output dir]
 
 class FoldsMaker:
 
-    def __init__(self, input, output, in_format, out_format, held_out_bool, folds):
+    def __init__(self, input, output, in_format, out_format, held_out_bool, main_part_percent, folds):
         self.folds = folds
         self.sentences = []
         self.input = input
@@ -25,8 +25,8 @@ class FoldsMaker:
         self.out_format = out_format
         self.held_out_bool = held_out_bool
 
-        self.held_out_param = 0.2
-        self.main_param = 0.8
+        self.held_out_param = 1 - float(main_part_percent)
+        self.main_param = float(main_part_percent)
 
         self.held_out = ''
         self.held_out_folds = []
@@ -76,7 +76,7 @@ class FoldsMaker:
             if not sent:
                 break
-            self.sentences.appendCell(sent)
+            self.sentences.append(sent)
 
     def splitHeldOut(self):
         ind = int(len(self.sentences) * self.held_out_param)
@@ -89,8 +89,8 @@ class FoldsMaker:
         fold_held_out_sents = len(self.held_out) / self.folds
         for i in range(0, self.folds):
             if self.held_out_bool:
-                self.held_out_folds.appendCell(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents])
-            self.main_folds.appendCell(self.main[i*fold_main_sents:(i+1)*fold_main_sents])
+                self.held_out_folds.append(self.held_out[i*fold_held_out_sents:(i+1)*fold_held_out_sents])
+            self.main_folds.append(self.main[i*fold_main_sents:(i+1)*fold_main_sents])
 
     def saveFolds(self):
         tools.mkdir_p(os.path.join(self.output, 'main'))
@@ -152,6 +152,9 @@ def go():
                       help='set the input format; default: ccl')
     parser.add_option('--with-heldout', action='store_true', dest='held_out', default=False,
                       help='Split data to main and held_out')
+    parser.add_option('-p', '--main-part', type='string', action='store',
+                      dest='main_part', default='0.9',
+                      help='set the proportion of the corpus assigned to the main part; default: 0.9')
     (options, args) = parser.parse_args()
 
     if len(args) != 3:
@@ -160,10 +163,10 @@ def go():
         sys.stderr.write('See %s --help\n' % sys.argv[0])
         sys.exit(1)
 
-    main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.folds)
+    main(args[0], args[1], args[2], options.input_format, options.output_format, options.held_out, options.main_part, options.folds)
 
-def main(corpusKind, corpusDir, outDir, in_format, out_format, held_out, folds=1):
-    foldsMaker = FoldsMaker(corpusDir, outDir, in_format, out_format, held_out, folds)
+def main(corpusKind, corpusDir, outDir, in_format, out_format, held_out, main_part, folds):
+    foldsMaker = FoldsMaker(corpusDir, outDir, in_format, out_format, held_out, main_part, folds)
     if corpusKind == "nkjp":
         foldsMaker.processNkjpCorpus()
     elif corpusKind == "kpwr":
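The --main-part option added above is a plain proportional split; with illustrative numbers, the resulting sizes work out as follows (a sketch mirroring the held-out and fold arithmetic in FoldsMaker):

main_part = 0.9      # value of --main-part
sentences = 1000     # corpus size, illustrative
folds = 10

held_out = int(sentences * (1 - main_part))  # 100 sentences held out
main = sentences - held_out                  # 900 sentences in the main part
per_fold = main // folds                     # 90 main sentences per fold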
+ """ + crf_obj.clear() + +def eat_token(crf_obj, feat_vals): + """Feed the trained classifier with a new token (instance). The output + tag sequence for the sentence being processed will be reade after calling + close_sent.""" + instance = DATA_SEP.join(feat_vals).encode('utf-8') + crf_obj.add(instance) + +def close_sent(crf_obj): + """Notify the trained classifier that a whole sentence has been fed and + have the classifier classify each token.""" + crf_obj.parse() + +def classify_token(crf_obj, tok_idx): + """Retrieve the class label (tag) for the token at given tok_idx. Assumes + that a whole sentence has been fed to the trained crf_obj with open_sent, + eat_token and close_sent calls.""" + return crf_obj.y2(tok_idx) diff --git a/scripts/chunker_scripts/split_np/pp_merger/config/kpwr.ccl b/scripts/chunker_scripts/split_np/pp_merger/config/kpwr.ccl new file mode 100644 index 0000000000000000000000000000000000000000..11144bb98f8cae63ee464f6c7006a853c1a48041 --- /dev/null +++ b/scripts/chunker_scripts/split_np/pp_merger/config/kpwr.ccl @@ -0,0 +1,16 @@ +@ "default" ( + orth[0]; // 0 + class[0]; // 1 + cas[0]; // 2 + gnd[0]; // 3 + nmb[0]; // 4 + agrpp(0,1,{nmb,gnd,cas}); // 5 + and(inside(-1), inside(1), wagr(-1,1,{nmb,gnd,cas})); // 6 + regex(orth[0], "\\P{Ll}.*"); regex(orth[0], "\\P{Lu}.*") // 7, 8 +) + +/* +@ "layer2" ( + isannpart(0, "chunk_agp") // 9 +) +*/ diff --git a/scripts/chunker_scripts/split_np/pp_merger/corpio.py b/scripts/chunker_scripts/split_np/pp_merger/corpio.py new file mode 100644 index 0000000000000000000000000000000000000000..bf515c76c6cd408a58660f8cb65057bb0bff168a --- /dev/null +++ b/scripts/chunker_scripts/split_np/pp_merger/corpio.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER. +# This program is free software; you can redistribute and/or modify it +# under the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. +# +# See the LICENCE, COPYING.LESSER and COPYING files for more details + +# SWIG bug workaround: loading multiple SWIG modules brought unwrapped +# swig::stop_iteration exceptions +import ctypes, sys +import platform +if 'Linux' in platform.system(): + # this prevents from problems with multiple SWIG wrappers + # (probably bug in SWIG) and possible problems with locating Maca plugin + dlflags = sys.getdlopenflags() + sys.setdlopenflags(dlflags | ctypes.RTLD_GLOBAL) + +import corpus2, wccl +# TODO: get back to default dlopen policy? 
diff --git a/scripts/chunker_scripts/split_np/pp_merger/corpio.py b/scripts/chunker_scripts/split_np/pp_merger/corpio.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf515c76c6cd408a58660f8cb65057bb0bff168a
--- /dev/null
+++ b/scripts/chunker_scripts/split_np/pp_merger/corpio.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE, COPYING.LESSER and COPYING files for more details
+
+# SWIG bug workaround: loading multiple SWIG modules brought unwrapped
+# swig::stop_iteration exceptions
+import ctypes, sys
+import platform
+if 'Linux' in platform.system():
+    # this prevents problems with multiple SWIG wrappers
+    # (probably a bug in SWIG) and possible problems with locating the Maca plugin
+    dlflags = sys.getdlopenflags()
+    sys.setdlopenflags(dlflags | ctypes.RTLD_GLOBAL)
+
+import corpus2, wccl
+
+if 'Linux' in platform.system():
+    # get back to default dlopen policy
+    sys.setdlopenflags(dlflags)
+
+import config
+import codecs, os
+
+_ROOT = os.path.abspath(os.path.dirname(__file__))
+
+format_help = """
+Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
+""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
+Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
+""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())
+
+def get_data(path):
+    """Tries to resolve path to the given subdir, trying the path locally
+    and then in the install site."""
+    if os.path.exists(path):
+        return path
+    in_data = os.path.join(_ROOT, 'data', path)
+    if os.path.exists(in_data):
+        return in_data
+    raise IOError('can\'t locate %s, tried locally and %s' % (path, in_data))
+
+def f_name(model_name, subdir, ext, suff = ''):
+    """Gets the filename based on model_name having the given
+    extension. Optionally, you can specify a name suffix."""
+    base = (model_name + '-' + suff + '.' + ext) if suff else (model_name + '.' + ext)
+    return os.path.join(subdir, base)
+
+def get_tagset(conf):
+    return corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
+
+def get_reader(in_path, tagset, input_format, read_disamb_only):
+    """Creates a reader using the options. If in_path evaluates to False,
+    will create a stdin reader. Set read_disamb_only to force reading only
+    'disamb' lexemes/interpretations."""
+    fixd_format = input_format
+    if read_disamb_only:
+        fixd_format += ',disamb_only'
+    # force casting sentences as AnnotatedSentences
+    # required to get XCES input right
+    fixd_format += ',ann'
+
+    if in_path:
+        return corpus2.TokenReader.create_path_reader(
+            fixd_format, tagset, in_path)
+    else:
+        return corpus2.TokenReader.create_stdin_reader(fixd_format, tagset)
+
+def get_writer(out_path, tagset, output_format):
+    """Creates a writer using the options. If out_path evaluates to False,
+    will create a stdout writer."""
+    if out_path:
+        return corpus2.TokenWriter.create_path_writer(output_format, out_path,
+            tagset)
+    else:
+        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+
+def op_list(wccl_file, sec_name):
+    """Retrieves a list of operators corresponding to a named section from
+    the given WCCL file. If the section is not present, returns an empty list."""
+    ops = []
+    if wccl_file.has_untyped_section(sec_name):
+        sec = wccl_file.get_untyped_section(sec_name)
+        for op_idx in range(sec.size()):
+            ops.append(sec.get_ptr(op_idx))
+    return ops
+
+def get_wccl_ops(conf, model_name, wccl_dir, lex_dir, chan_names):
+    """Returns a list of WCCL operator lists corresponding to the given
+    channel names. Each list may consist of two parts: the default operators
+    and channel-specific operators (theoretically both may be empty)."""
+    wccl_file_path = f_name(model_name, wccl_dir, config.EXT_WCCL)
+    tagset = corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
+    wccl_file = wccl.Parser(tagset).parseWcclFileFromPath(wccl_file_path, lex_dir)
+    def_ops = op_list(wccl_file, config.DEFAULT_OPS)
+    chan_ops = [def_ops + op_list(wccl_file, chan_name) for chan_name in chan_names]
+    return chan_ops
+
+def create_context(sent):
+    """Wraps the sentence as SentenceContext to be used with WCCL."""
+    return wccl.SentenceContext(sent)
diff --git a/scripts/chunker_scripts/split_np/pp_merger/pp_merger.py b/scripts/chunker_scripts/split_np/pp_merger/pp_merger.py
new file mode 100755
index 0000000000000000000000000000000000000000..c2cc2424c10c5dc5c88b0ec87955f2a8c6a05153
--- /dev/null
+++ b/scripts/chunker_scripts/split_np/pp_merger/pp_merger.py
@@ -0,0 +1,272 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+'''
+Created on 15-04-2013
+
+@author: Adam Pawlaczek
+'''
+
+
+from optparse import OptionParser
+import sys, os, codecs
+import corpus2
+import wccl
+from chunker_scripts import tools
+import corpio, classify
+import logging
+import timbl
+
+descr = """%prog [options] in_path out_path
+
+"""
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+                      dest='output_format', default='ccl',
+                      help='set the output format; default: ccl')
+    parser.add_option('-c', '--config', type="string", action='store',
+                      dest='config_path', default='config/kpwr.ccl',
+                      help='Path to config file; default: config/kpwr.ccl')
+    parser.add_option('-d', '--data', type="string", action='store',
+                      dest='data_dir', default='model-kpwr11-H_2',
+                      help='Path to data_dir; default: model-kpwr11-H_2')
+    parser.add_option('-f', '--folds', type="int", action='store',
+                      dest='folds', default=1,
+                      help='Number of folds')
+    parser.add_option('--train', action='store_true',
+                      dest='is_training', help='train the pp_merger')
+    parser.add_option('--xval', action='store_true',
+                      dest='xval', help='make cross validation with directory of folds')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 2 and not options.is_training:
+        sys.stderr.write('You need to provide in_path and out_path.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    ppMerger = PPMerger(options.config_path, options.tagset, options.data_dir)
+
+    if options.is_training:
+        in_path = args[0]
+        ppMerger.train_and_save(in_path, options.input_format)
+    elif options.xval:
+        in_path, out_path = args
+        ppMerger.xval(in_path, out_path, options.input_format, options.output_format, options.folds)
+    else:
+        in_path, out_path = args
+        ppMerger.classify(in_path, out_path, options.input_format, options.output_format)
+
+
+def read_repr(inf):
+    return eval(inf.readline())
+
+class PPMerger:
+
+    def __init__(self, config_path, tagset, data_dir):
+        self.config_dir = os.path.dirname(config_path)
+        self.config_name = os.path.splitext(os.path.basename(config_path))[0]
+
+        self.data_dir = data_dir
+        self.tagset = corpus2.get_named_tagset(tagset)
+        self.ops = self.wccl_ops()
+
+    def indexes_of_prep(self, ann, tokens):
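A short usage sketch of the corpio helpers (paths are hypothetical): open a reader and writer for the same tagset and copy sentences through, wrapping each one as a WCCL context on the way:

import corpus2
import corpio

tagset = corpus2.get_named_tagset('nkjp')
reader = corpio.get_reader('in.xml', tagset, 'ccl', False)  # hypothetical input path
writer = corpio.get_writer('out.xml', tagset, 'ccl')

sent = reader.get_next_sentence()
while sent:
    con = corpio.create_context(sent)  # ready for op.base_apply(con) with WCCL operators
    writer.write_sentence(sent)
    sent = reader.get_next_sentence()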
+        '''Returns indices of prepositions within ann (positions in the
+        annotation, not in the sentence).'''
+        result = []
+        for idx in range(len(ann.indices)):
+            # look the token up through the annotation's sentence index
+            tok = tokens[ann.indices[idx]]
+            pos = self.tagset.get_pos_name(tok.get_preferred_lexeme(self.tagset).tag().get_pos_index())
+            if pos == 'prep':
+                result.append(idx)
+        return result
+
+    def is_split_prep(self, prev_ann, ann, tokens):
+        lexeme = tokens[ann.indices[0]].get_preferred_lexeme(self.tagset).tag().get_pos_index()
+        pos = self.tagset.get_pos_name(lexeme)
+        return pos == "prep" and prev_ann is not None and prev_ann.indices[-1] + 1 == ann.indices[0]
+
+    def op_list(self, wccl_file, sec_name):
+        ops = []
+        if wccl_file.has_untyped_section(sec_name):
+            sec = wccl_file.get_untyped_section(sec_name)
+            for op_idx in range(sec.size()):
+                ops.append(sec.get_ptr(op_idx))
+        return ops
+
+    def wccl_ops(self):
+        wccl_file_path = os.path.join(self.config_dir, self.config_name + ".ccl")
+        wccl_file = wccl.Parser(self.tagset).parseWcclFileFromPath(wccl_file_path, self.data_dir)
+        ops = self.op_list(wccl_file, "default")
+        return ops
+
+    def train_and_save(self, in_path, input_format):
+        self.input_format = input_format
+        if not os.path.exists(self.data_dir):
+            tools.mkdir_p(self.data_dir)
+
+        tr_path = os.path.join(self.data_dir, "pp_merger.tr")
+        tr_file = codecs.open(tr_path, 'wb', 'utf-8')
+
+        reader = tools.get_reader(in_path, self.input_format, self.tagset)
+        sent = reader.get_next_sentence()
+        while sent:
+            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+            if "chunk_np" in asent.all_channels():
+                chan = asent.get_channel("chunk_np")
+                ann_vec = chan.make_annotation_vector()
+                prev_ann = None
+
+                con = wccl.SentenceContext(sent)
+                for ann in ann_vec:
+                    indexes_of_prep = self.indexes_of_prep(ann, asent.tokens())
+                    for idx in indexes_of_prep:
+                        con.set_position(ann.indices[idx])
+                        feat_vals = [op.base_apply(con)
+                            .to_compact_string(self.tagset).decode('utf-8')
+                            for op in self.ops]
+                        if (idx == 0 and self.is_split_prep(prev_ann, ann, asent.tokens())):
+                            class_label = 'B'
+                            classify.write_example(tr_file, feat_vals, class_label)
+                        elif (idx != 0):
+                            class_label = 'I'
+                            classify.write_example(tr_file, feat_vals, class_label)
+                        # There may also be the case of a prep at the beginning
+                        # of an unsplit chunk.
+                        # That case is worth examining in a separate experiment.
+
+                    prev_ann = ann
+
+            classify.write_end_of_sent(tr_file)
+            sent = reader.get_next_sentence()
+
+        tr_file.close()
+
+        cr_path = os.path.join(self.data_dir, "pp_merger.cr")
+
+        timbl_opts = "-mM -k11 -dIL +vs"
+        timbl_obj = timbl.TimblAPI(timbl_opts, "")
+
+        timbl_obj.learn(tr_path)
+        with open(cr_path + '.x', 'w') as out:
+            out.write('%s\n' % repr(timbl_opts))
+            # for restoring weights
+            out.write('%s\n' % repr(timbl_obj.currentWeighting().name))
+        timbl_obj.writeInstanceBase(cr_path + '.b')
+        timbl_obj.saveWeights(cr_path + '.w')
+        timbl_obj.writeArrays(cr_path + '.a')
+        return timbl_obj
+
+    def load_model(self):
+        cr_path = os.path.join(self.data_dir, "pp_merger.cr")
+        if not os.path.exists(cr_path + '.x'):
+            return None
+        with open(cr_path + '.x', 'r') as inf:
+            timbl_opts = read_repr(inf)
+            weighting_name = read_repr(inf)
+        timbl_obj = timbl.TimblAPI(timbl_opts, "")
+        timbl_obj.getInstanceBase(cr_path + '.b')
+        timbl_obj.getWeights(cr_path + '.w', timbl.Weighting.names[weighting_name])
+        timbl_obj.getArrays(cr_path + '.a')
+        return timbl_obj
+
+    def to_timbl_line(self, feat_vals, class_label):
+        """Gets a TiMBL-friendly instance string representation."""
+        return u'\t'.join(feat_vals + [class_label]).encode('utf8')
+
+    def classify(self, in_path, out_path, input_format, output_format):
+        self.input_format = input_format
+        self.output_format = output_format
+
+        timbl_obj = self.load_model()
+        self.classify_input(in_path, out_path, input_format, output_format, timbl_obj)
+
+    def classify_input(self, in_path, out_path, input_format, output_format, timbl_obj):
+        reader = tools.get_reader(in_path, self.input_format, self.tagset)
+        writer = tools.get_writer(out_path, output_format, self.tagset)
+
+        chunk = reader.get_next_chunk()
+        while chunk:
+            for sent in chunk.sentences():
+                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+
+                if "chunk_np" in asent.all_channels():
+                    chan = asent.get_channel("chunk_np")
+                    ann_vec = chan.make_annotation_vector()
+                    prev_ann = None
+
+                    con = wccl.SentenceContext(sent)
+                    for ann in ann_vec:
+                        indexes_of_prep = self.indexes_of_prep(ann, asent.tokens())
+                        for idx in indexes_of_prep:
+                            con.set_position(ann.indices[idx])
+                            feat_vals = [op.base_apply(con)
+                                .to_compact_string(self.tagset).decode('utf-8')
+                                for op in self.ops]
+                            # consider only a prep that can be merged into the previous ann
+                            if (idx == 0 and self.is_split_prep(prev_ann, ann, asent.tokens())):
+                                class_label = self.decide_about_prep(timbl_obj, feat_vals)
+                                if class_label == "I":
+                                    for ann_idx in ann.indices:
+                                        chan.set_segment_at(ann_idx, prev_ann.seg_number)
+                        prev_ann = ann
+
+            writer.write_chunk(chunk)
+            chunk = reader.get_next_chunk()
+
+    def decide_about_prep(self, timbl_obj, feat_vals):
+        line = self.to_timbl_line(feat_vals, '?')
+        success, decsn = timbl_obj.classify(line)
+        if not success:
+            raise ValueError('TiMBL failed to classify %s' % line)
+        return decsn
+
+    def get_in_paths(self, in_path, input_format, folds, main_name="test"):
+        input_paths = []
+        if folds > 1:
+            for fold in range(1, folds+1):
+                if input_format == "ccl":
+                    input_paths.append(os.path.join(in_path, 'ccl-' + main_name + str(fold).zfill(2) + '.xml'))
+                elif input_format == "xces":
+                    input_paths.append(os.path.join(in_path, main_name + str(fold).zfill(2) + '.xml'))
+        else:
+            if(os.path.isdir(in_path)):
+                for (path, dirs, files) in os.walk(in_path):
+                    for file in files:
+                        input_paths.append(os.path.join(path, file))
+            else:
+                input_paths.append(in_path)
+
+        return input_paths
+
+    def get_out_path(self, in_path, out_dir, output_format):
+        file_name = os.path.basename(in_path)
+        if output_format == "ccl":
+            if file_name.startswith("ccl-"):
+                return os.path.join(out_dir, file_name)
+            else:
+                return os.path.join(out_dir, "ccl-" + file_name)
+        else:
+            # for non-ccl output, strip the "ccl-" prefix if present
+            if file_name.startswith("ccl-"):
+                return os.path.join(out_dir, file_name[4:])
+            else:
+                return os.path.join(out_dir, file_name)
+
+    def xval(self, in_path, out_path, input_format, output_format, folds):
+        in_trains = self.get_in_paths(in_path, input_format, folds, "train")
+        in_tests = self.get_in_paths(in_path, input_format, folds, "test")
+        out_dir = out_path
+
+        for in_train, in_test in zip(in_trains, in_tests):
+            timbl_obj = self.train_and_save(in_train, input_format)
+            tools.mkdir_p(os.path.join(out_dir, "empty"))
+            empty_path = os.path.join(out_dir, "empty", os.path.basename(in_test))
+            tools.remove_channels(in_test, empty_path)
+            out_path = self.get_out_path(in_test, out_dir, output_format)
+            self.classify_input(empty_path, out_path, input_format, output_format, timbl_obj)
+
+if __name__ == '__main__':
+    go()
\ No newline at end of file
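pp_merger drives TiMBL through its Python API; condensed, the train/classify round trip it performs looks like this (file name and instance are illustrative, the options and call shapes mirror the code above):

import timbl

opts = "-mM -k11 -dIL +vs"
timbl_obj = timbl.TimblAPI(opts, "")
timbl_obj.learn("pp_merger.tr")            # tab-separated training instances

line = "\t".join(["feat1", "feat2", "?"])  # toy instance, '?' marks the unknown class
success, label = timbl_obj.classify(line)  # label is e.g. 'B' or 'I'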
diff --git a/scripts/chunker_scripts/split_np/pp_merger/split_half_train.py b/scripts/chunker_scripts/split_np/pp_merger/split_half_train.py
new file mode 100755
index 0000000000000000000000000000000000000000..efead2b04e5cc34151ba0ab8a9674f100a74ca27
--- /dev/null
+++ b/scripts/chunker_scripts/split_np/pp_merger/split_half_train.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+'''
+Created on 22-04-2013
+
+@author: Adam Pawlaczek
+'''
+
+from optparse import OptionParser
+import sys, os
+import corpus2
+descr = """%prog [options] in_path out_path1 out_path2
+
+"""
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+                      dest='output_format', default='ccl',
+                      help='set the output format; default: ccl')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 3:
+        sys.stderr.write('You need to provide in_path, out_path1 and out_path2.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    in_path, out_path1, out_path2 = args
+    main(in_path, out_path1, out_path2, options.input_format, options.output_format, options.tagset)
+
+def get_writer(out_path, output_format, tagset):
+    if out_path:
+        return corpus2.TokenWriter.create_path_writer(output_format, out_path,
+            tagset)
+    else:
+        return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+
+def get_reader(in_path, input_format, tagset):
+    if in_path:
+        return corpus2.TokenReader.create_path_reader(
+            input_format, tagset, in_path)
+    else:
+        return corpus2.TokenReader.create_stdin_reader(input_format, tagset)
+
+def count_sentences(in_path, input_format, tg):
+    reader = get_reader(in_path, input_format, tg)
+
+    count = 0
+    sent = reader.get_next_sentence()
+    while sent:
+        count += 1
+        sent = reader.get_next_sentence()
+
+    return count
+
+def main(in_path, out_path1, out_path2, input_format, output_format, tagset):
+    tg = corpus2.get_named_tagset(tagset)
+
+    sentences_nr = count_sentences(in_path, input_format, tg)
+    sentences_half_nr = sentences_nr / 2
+    reader = get_reader(in_path, input_format, tg)
+    writer = get_writer(out_path1, output_format, tg)
+
+    sent = reader.get_next_sentence()
+    i = 0
+    while sent and i < sentences_half_nr:
+        writer.write_sentence(sent)
+        sent = reader.get_next_sentence()
+        i += 1
+
+    writer = get_writer(out_path2, output_format, tg)
+    while sent:
+        writer.write_sentence(sent)
+        sent = reader.get_next_sentence()
+
+
+if __name__ == '__main__':
+    go()
\ No newline at end of file
diff --git a/scripts/chunker_scripts/stats/count_chunk_length_histogram.py b/scripts/chunker_scripts/stats/count_chunk_length_histogram.py
new file mode 100755
index 0000000000000000000000000000000000000000..3209ff70f5188f6e64c0daa4c25a0426a084456c
--- /dev/null
+++ b/scripts/chunker_scripts/stats/count_chunk_length_histogram.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Created on Dec 21, 2012
+@author: jezozwierzak
+'''
+
+descr = """%prog [options] in_path
+
+"""
+
+from optparse import OptionParser
+import sys, os
+import corpus2
+from chunker_scripts.csv_table.CSVTable import CSVTable
+
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    parser.add_option('-f', '--folds', type="int", action='store',
+                      dest='folds', default=1,
+                      help='Number of folds')
+    parser.add_option('-c', '--chunk-names', type='string', action='store',
+                      dest='chunk_names', default='',
+                      help='set chunk_names to eval')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 1:
+        sys.stderr.write('You need to provide in_path. See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    in_path = args[0]
+    main(in_path, options.input_format, options.folds, options.tagset, options.chunk_names)
+
+def main(in_path, input_format, folds, tagset, chunk_names):
+
+    chunk_names = chunk_names.split(",")
+    tagset = corpus2.get_named_tagset(tagset)
+
+    input_paths = []
+    if folds > 1:
+        for fold in range(1, folds+1):
+            if input_format == "ccl":
+                input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml'))
+            elif input_format == "xces":
+                input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml'))
+    else:
+        if(os.path.isdir(in_path)):
+            for (path, dirs, files) in os.walk(in_path):
+                for file in files:
+                    input_paths.append(os.path.join(path, file))
+        else:
+            input_paths.append(in_path)
+
+    table = CSVTable()
+    table.addColumn('Nr')
+    for input_path in input_paths:
+        reader = corpus2.TokenReader.create_path_reader(
+            input_format, tagset, input_path)
+
+        sent = reader.get_next_sentence()
+        while sent:
+            asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
+            tokens = asent.tokens()
+
+            for chan_name in asent.all_channels():
+                if chan_name in chunk_names:
+                    chan = asent.get_channel(chan_name)
+                    ann_vec = chan.make_annotation_vector()
+                    for ann in ann_vec:
+
+                        if not table.hasColumn(chan_name):
+                            table.addColumn(chan_name, type='int')
+
+                        while table.rows < len(ann.indices):
+                            table.addEmptyRow()
+
+                        table.increment(chan_name, len(ann.indices)-1)
+            sent = reader.get_next_sentence()
+    print table
+
+if __name__ == '__main__':
+    go()
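The CSVTable bookkeeping above amounts to a per-channel histogram of chunk lengths; the same counting with plain dicts (toy data, illustrative only):

histogram = {}  # channel -> {chunk length in tokens: count}
for chan_name, length in [('chunk_np', 2), ('chunk_np', 2), ('chunk_vp', 1)]:
    by_len = histogram.setdefault(chan_name, {})
    by_len[length] = by_len.get(length, 0) + 1
# histogram == {'chunk_np': {2: 2}, 'chunk_vp': {1: 1}}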
diff --git a/scripts/chunker_scripts/stats/crossing.py b/scripts/chunker_scripts/stats/crossing.py
new file mode 100755
index 0000000000000000000000000000000000000000..d7dcb911dcf0ba662f7a80d017d22c4ec5dbeaa8
--- /dev/null
+++ b/scripts/chunker_scripts/stats/crossing.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Created on May 13, 2013
+
+@author: Maciej Zawadzki
+'''
+
+import corpus2
+from optparse import OptionParser
+import os
+import sys
+
+"""
+The purpose of this program is to count the number of names that are not
+contained within a single chunk_np. It prints all errors and, finally,
+the total statistics.
+"""
+
+def go():
+    parser = OptionParser()
+    parser.add_option('-i', '--input-format', type='string', action='store',
+                      dest='input_format', default='ccl',
+                      help='set the input format; default: ccl')
+    parser.add_option('-o', '--output-format', type='string', action='store',
+                      dest='output_format', default='ccl',
+                      help='set the output format; default: ccl')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+                      dest='tagset', default='nkjp',
+                      help='set the tagset used in input; default: nkjp')
+    parser.add_option('-v', '--verbose', action='store_true',
+                      help='verbose output')
+    parser.add_option('-f', '--folds', type="int", action='store',
+                      dest='folds', default=1, help='Number of folds')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 4:
+        sys.stderr.write('You need to provide the reference, crf, open and spejd directories.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+    wyniki = []
+    wzrc_file = get_ref_paths(args[0], options.folds, options.input_format)
+    crf_file = get_ref_paths(args[1], options.folds, options.input_format)
+    open_file = get_ref_paths(args[2], options.folds, options.input_format)
+    spejd_file = get_ref_paths(args[3], options.folds, options.input_format)
+    i = 0
+    for files in wzrc_file:
+        wyniki.append(main(files, options.input_format, options.tagset, crf_file[i], open_file[i], spejd_file[i], options.verbose))
+        i += 1
+
+    crf_ok = 0
+    crf_no = 0
+
+    open_ok = 0
+    open_no = 0
+
+    spejd_ok = 0
+    spejd_no = 0
+
+    err1crf = 0
+    err2crf = 0
+    err3crf = 0
+    err4crf = 0
+
+    err1open = 0
+    err2open = 0
+    err3open = 0
+    err4open = 0
+
+    err1spejd = 0
+    err2spejd = 0
+    err3spejd = 0
+    err4spejd = 0
+
+    for w in wyniki:
+        crf_ok = crf_ok + w[0]
+        crf_no = crf_no + w[1]
+        open_ok = open_ok + w[2]
+        open_no = open_no + w[3]
+        spejd_ok = spejd_ok + w[4]
+        spejd_no = spejd_no + w[5]
+        err1crf = err1crf + w[6]
+        err2crf = err2crf + w[7]
+        err3crf = err3crf + w[8]
+        err4crf = err4crf + w[9]
+        err1open = err1open + w[10]
+        err2open = err2open + w[11]
+        err3open = err3open + w[12]
+        err4open = err4open + w[13]
+        err1spejd = err1spejd + w[14]
+        err2spejd = err2spejd + w[15]
+        err3spejd = err3spejd + w[16]
+        err4spejd = err4spejd + w[17]
+
+    print "crf ok :"+str(crf_ok)+" crf err count :"+str(crf_no)+"\t"+" s1: "+str(err1crf)+ " s2: "+str(err2crf)+ " s3: "+str(err3crf)+ " s4: "+str(err4crf)
+    print "open ok :"+str(open_ok)+" open err count :"+str(open_no)+"\t"+" s1: "+str(err1open)+ " s2: "+str(err2open)+ " s3: "+str(err3open)+ " s4: "+str(err4open)
+    print "spejd ok :"+str(spejd_ok)+" spejd err count :"+str(spejd_no)+"\t"+" s1: "+str(err1spejd)+ " s2: "+str(err2spejd)+ " s3: "+str(err3spejd)+ " s4: "+str(err4spejd)
+# error classes (s1-s4):
+# s1: chunks having no overlap with the reference corpus
+# s2: chunks that are a subset of a reference chunk
+# s3: chunks that are a superset of a reference chunk
+# s4: chunks that only partially overlap a reference chunk
+
+def main(wzrc_path, input_format, tagset, crf_path, open_path, spejd_path, verbose):
+
+    tagset = corpus2.get_named_tagset(tagset)
+    print 
wzrc_path + wzrc_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, wzrc_path) + crf_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, crf_path) + open_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, open_path) + spejd_reader = corpus2.TokenReader.create_path_reader(input_format, tagset, spejd_path) + + wzrc_sent = wzrc_reader.get_next_sentence() + crf_sent = crf_reader.get_next_sentence() + open_sent = open_reader.get_next_sentence() + spejd_sent = spejd_reader.get_next_sentence() + + wyniki_o = [] + + while wzrc_sent: + tokens = wzrc_sent.tokens() + + tab = [] + + asent = corpus2.AnnotatedSentence.wrap_sentence(wzrc_sent) + sent_id = wzrc_sent.id() + sent_size = wzrc_sent.size() + + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["np",ann_vec]) + if len(tab) != 1: + tab.append(["np", None]) + + asent = corpus2.AnnotatedSentence.wrap_sentence(crf_sent) + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["crf",ann_vec]) + if len(tab) != 2: + tab.append(["crf", None]) + + asent = corpus2.AnnotatedSentence.wrap_sentence(open_sent) + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["open",ann_vec]) + if len(tab) != 3: + tab.append(["open", None]) + + asent = corpus2.AnnotatedSentence.wrap_sentence(spejd_sent) + for chan_name in asent.all_channels(): + if chan_name == 'chunk_np': + chan = asent.get_channel(chan_name) + ann_vec = chan.make_annotation_vector() + tab.append(["spejd",ann_vec]) + if len(tab) != 4: + tab.append(["spejd", None]) + + if len(tab) != 0 : + wyniki_o.append(check_sentence(tab, sent_size, tokens, sent_id, verbose)) + + + crf_sent = crf_reader.get_next_sentence() + open_sent = open_reader.get_next_sentence() + spejd_sent = spejd_reader.get_next_sentence() + wzrc_sent = wzrc_reader.get_next_sentence() + crf_ok = 0 + crf_no = 0 + + open_ok = 0 + open_no = 0 + + spejd_ok = 0 + spejd_no = 0 + + err1crf = 0 + err2crf = 0 + err3crf = 0 + err4crf = 0 + + err1open = 0 + err2open = 0 + err3open = 0 + err4open = 0 + + err1spejd = 0 + err2spejd = 0 + err3spejd = 0 + err4spejd = 0 + + for wyniki in wyniki_o: + crf_ok = crf_ok + wyniki[0][0] + crf_no = crf_no + wyniki[0][1] + open_ok = open_ok + wyniki[1][0] + open_no = open_no + wyniki[1][1] + spejd_ok = spejd_ok + wyniki[2][0] + spejd_no = spejd_no + wyniki[2][1] + err1crf = err1crf + wyniki[0][2] + err2crf = err2crf + wyniki[0][3] + err3crf = err3crf + wyniki[0][4] + err4crf = err4crf + wyniki[0][5] + + err1open = err1open + wyniki[1][2] + err2open = err2open + wyniki[1][3] + err3open = err3open + wyniki[1][4] + err4open = err4open + wyniki[1][5] + + err1spejd = err1spejd + wyniki[2][2] + err2spejd = err2spejd + wyniki[2][3] + err3spejd = err3spejd + wyniki[2][4] + err4spejd = err4spejd + wyniki[2][5] + + + + return [crf_ok, crf_no, open_ok, open_no, spejd_ok, spejd_no, err1crf, err2crf, err3crf, err4crf, err1open, err2open, err3open, err4open, err1spejd, err2spejd, err3spejd, err4spejd] +# print "crf ok :"+str(crf_ok)+" crf no :"+str(crf_no) +# print "open ok :"+str(open_ok)+" open no :"+str(open_no) +# print "spejd ok :"+str(spejd_ok)+" spejd no :"+str(spejd_no) + +def check_sentence(tab, size, tokens, sent_id, verbose): + tokensent = 
[] + for tok in tokens: + tokensent.append(tok.orth()) + + sent = [] + temp = [] + for ele in tab: + if ele[1]: + for ann in ele[1]: + temp.append(sorted(ann.indices)) + sent.append([ele[0],temp]) + temp = [] + + + result_crf = isOK(sent[0][1], sent[1], tokensent, sent_id, sent, verbose) + result_open = isOK(sent[0][1], sent[2], tokensent, sent_id, sent, verbose) + result_spejd = isOK(sent[0][1], sent[3], tokensent, sent_id, sent, verbose) + + return [result_crf, result_open, result_spejd] + + + + + +def isOK(comparer, chunks, tokensent, sent_id, sent, verbose): + ok = 0 + err = 0 + err1 = 0 + err2 = 0 + err3 = 0 + err4 = 0 + for chunk in chunks[1]: + if (is_equal(comparer, chunk)): + ok += 1 + else: + err += 1 + if verbose: + print '***'+chunks[0] + for s in sent: + printer(s, tokensent) + + + print + if have_no_intersection(comparer, chunk): + err1 += 1 + else: + setchunk = set(chunk) + for ele in comparer: + setele = set(ele) + if setchunk.issubset(setele): + err2 += 1 + elif setele.issubset(setchunk): + err3 += 1 + elif setele.intersection(setchunk) != set(): + err4 += 1 + return [ok, err, err1, err2, err3, err4] +def printer(sent, tokensent): + start= [] + end = [] + for ele in sent[1]: + start.append(ele[0]) + end.append(ele[len(ele)-1]) + + print sent[0], + print "\t", + i = 0 + for tok in tokensent: + tmp = tok + if is_inside(i, start): + tmp = "[ "+str(tmp) + if is_inside(i, end): + tmp = str(tmp)+" ]" + print tmp, + print "\t", + i += 1 + print + +def is_equal(chunktab, name): + setname = set(name) + for chunk in chunktab: + setchunk = set(chunk) + if setname == setchunk: + return True + return False + +def is_inside(i, tab): + for t in tab: + if i == t: + return True + return False + +def get_ref_paths(in_path, folds, input_format): + input_paths = [] + if folds > 1: + for fold in range(1, folds+1): + if input_format == "ccl": + input_paths.append(os.path.join(in_path, 'ccl-test' + str(fold).zfill(2) + '.xml')) + elif input_format == "xces": + input_paths.append(os.path.join(in_path, 'test' + str(fold).zfill(2) + '.xml')) + else: + if(os.path.isdir(in_path)): + for (path, dirs, files) in os.walk(in_path): + for file in files: + input_paths.append(os.path.join(path, file)) + else: + input_paths.append(in_path) + return input_paths + +def have_no_intersection(comparer, chunk): + setchunk = set(chunk) + for ele in comparer: + setele = set(ele) + if setele.intersection(setchunk) != set(): + return False + return True +if __name__ == '__main__': + go() diff --git a/scripts/chunker_scripts/tools.py b/scripts/chunker_scripts/tools.py index d53ff44e0c0cb9b7c8e33f9545ba0ffb4730b223..8c73085bfd04210fc39f0a719bd0e4892407946f 100644 --- a/scripts/chunker_scripts/tools.py +++ b/scripts/chunker_scripts/tools.py @@ -307,7 +307,16 @@ def remove_channels_except(filepath, list=[]): break if write_line: - sources.write(line) + sources.write(line) + +def remove_all_channels(in_path, out_path): + with open(in_path, "r") as sources: + lines = sources.readlines() + with open(out_path, "w") as sources: + for line in lines: + if not "<ann" in line and not "</ann>" in line: + sources.write(line) + ''' CORPUS2 METHODS ''' diff --git a/scripts/chunker_scripts/utils/remove_almost_all_chunks.py b/scripts/chunker_scripts/utils/remove_almost_all_chunks.py new file mode 100755 index 0000000000000000000000000000000000000000..dddc70c183407e69885a2a0a013bd36f0615ea12 --- /dev/null +++ b/scripts/chunker_scripts/utils/remove_almost_all_chunks.py @@ -0,0 +1,31 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +''' 
+Created on Mar 25, 2013
+
+@author: Adam Pawlaczek
+'''
+import sys
+import re
+
+def main():
+    args = sys.argv[1:]
+
+    if len(args) != 2:
+        sys.stderr.write('You need to provide in_path and out_path.\n')
+        sys.stderr.write('See %s --help\n' % sys.argv[0])
+        sys.exit(1)
+
+    input = args[0]
+    output = args[1]
+    # channels to keep; all other annotation channels are dropped
+    chunks = ["chunk_np", "chunk_vp", "chunk_adjp", "chunk_agp", "chunk_qp"]
+    out_f = open(output, 'w')
+
+    for line in open(input):
+        m = re.match(r'<ann\ chan=\"(.+)\">[0-9]*<\/ann>', line.strip())
+        if m and not m.group(1) in chunks:
+            continue
+        out_f.write(line)
+    out_f.close()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
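For illustration, the regex above matches annotation lines of this shape, so only whitelisted channels survive the rewrite (toy input line):

import re

line = '<ann chan="chunk_pp">3</ann>'
m = re.match(r'<ann\ chan=\"(.+)\">[0-9]*<\/ann>', line.strip())
# m.group(1) == 'chunk_pp', which is not whitelisted, so this line would be dropped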