Skip to content
Snippets Groups Projects
Commit 4a9e1351 authored by Adam Pawlaczek's avatar Adam Pawlaczek
Browse files

Added proper crfwrapper2

parent c5e4e0fd
Branches
No related merge requests found
#!/usr/bin/python
#-*- coding: utf-8 -*-
'''
Created on 02-05-2013
@author: Adam Pawlaczek
'''
import codecs
from optparse import OptionParser
import sys, os
import corpus2
from chunker_scripts import tools
from chunker_checker import ChunkerChecker
from chunker_scripts.chunk_eval.chunk_eval import main as chunk_eval
import iobber.iobber as iobber
descr = """%prog [options] [in_dir] [out_dir]
in_dir has to contain subdirs with folds chunked by individual chunkers.
Subdir should be named as chunker which chunked files in it.
"""
def go():
    """Parse the command line and forward the settings to main().

    Exactly two positional arguments (in_dir, out_dir) are required; with any
    other count a usage hint is written to stderr and the process exits with
    status 1.
    """
    # Each entry is (option flags, keyword settings for add_option); the order
    # here is the order in which the options are registered with the parser.
    option_table = [
        (('-i', '--input-format'),
         dict(type='string', action='store', dest='input_format',
              default='ccl', help='set the input format; default: ccl')),
        (('-o', '--output-format'),
         dict(type='string', action='store', dest='output_format',
              default='ccl', help='set the output format; default: ccl')),
        (('--config',),
         dict(type='string', action='store', dest='config',
              default='kpwr.ini',
              help='set iobber config; default: kpwr.ini')),
        (('-c', '--chunk-names'),
         dict(type='string', action='store', dest='chunk_names',
              default='chunk_np', help='set chunk_names to eval')),
        (('--chunkers',),
         dict(type='string', action='store', dest='chunkers',
              default='', help='set chunkers to eval')),
        (('-f', '--folds'),
         dict(type="int", action='store', dest='folds',
              default=1, help='Number of folds')),
        (('-t', '--tagset'),
         dict(type='string', action='store', dest='tagset',
              default='nkjp',
              help='set the tagset used in input; default: nkjp')),
        (('--file-prefix',),
         dict(type='string', action='store', dest='file_prefix',
              default='ccl-', help='set the file prefix; default: ccl-')),
        (('--file-ext',),
         dict(type='string', action='store', dest='file_ext',
              default='.xml', help='set the file extention; default: .xml')),
        (('-v', '--verbose'),
         dict(action='store_true', default=False, dest='verbose')),
    ]

    parser = OptionParser(usage=descr)
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    opts, positional = parser.parse_args()
    if len(positional) != 2:
        sys.stderr.write('You need to provide a in_dir, out_dir and chunk_names and chunkers.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    in_path, out_path = positional
    main(in_path, out_path,
         opts.input_format, opts.output_format,
         opts.chunk_names, opts.chunkers, opts.folds,
         opts.tagset, opts.file_prefix, opts.file_ext,
         opts.verbose, opts.config)
class SuperClassifier(ChunkerChecker):
    """ChunkerChecker subclass that prepares a per-fold ``model.tr`` file.

    For every fold it reads the fold's training corpus and opens
    ``<out_dir>/models/<NN>/model.tr`` for writing.

    NOTE(review): train_fold currently only iterates the annotation channels;
    nothing is written to ``model.tr`` yet -- looks like work in progress.
    """

    def __init__(self, input_format, output_format, chunk_names, chunkers,
                 folds, tagset, file_prefix, file_ext, verbose, config):
        """Store the settings and initialise the ChunkerChecker base.

        ``chunkers`` is a comma-separated string; it is passed unchanged to
        the base class and then kept on the instance as a list.
        ``config`` is the iobber config name given on the command line.
        """
        # Fix: the original passed the undefined name ``Iobber_v1`` to
        # super(), which raised NameError as soon as the class was built.
        super(SuperClassifier, self).__init__(
            input_format, output_format, chunk_names, chunkers, folds,
            tagset, file_prefix, file_ext, verbose)
        self.config = config
        self.chunkers = chunkers.split(",")

    def create_directories(self):
        """Register and create the models/chunked/nochann output dirs."""
        for key, subdir in (('models_path', 'models'),
                            ('chunked_path', 'chunked'),
                            ('nochann_path', 'nochann')):
            self.dirs[key] = os.path.join(self.dirs['out_dir'], subdir)
            tools.mkdir_p(self.dirs[key])

    def process_fold(self, fold):
        """Build the training file for fold number ``fold``.

        The fold number is zero-padded to two digits both in the input file
        name (``<prefix>train<NN><ext>``) and in the model sub-directory.
        """
        num = str(fold).zfill(2)
        train_path = os.path.join(
            self.dirs['in_dir'],
            self.file_prefix + 'train' + num + self.file_ext)
        self.train_fold(train_path, os.path.join(self.dirs['models_path'], num))

    def train_fold(self, in_path, model_path):
        """Read the corpus at ``in_path`` and open ``model_path/model.tr``.

        Raises whatever ``codecs.open`` or the corpus reader raise; the
        output file is closed in every case.
        """
        # Fix: only the parent 'models' directory is created by
        # create_directories, so make sure the per-fold directory exists
        # before opening a file inside it.
        if not os.path.isdir(model_path):
            tools.mkdir_p(model_path)

        tr_file = codecs.open(os.path.join(model_path, 'model.tr'), 'wb', 'utf-8')
        # Fix: close the file even when reading the corpus fails.
        try:
            reader = tools.get_reader(in_path, self.input_format, self.tagset)
            sent = reader.get_next_sentence()
            while sent:
                asent = corpus2.AnnotatedSentence.wrap_sentence(sent)
                for chunk_name in asent.all_channels():
                    # TODO(review): the channel is fetched but not used yet;
                    # feature extraction / writing to tr_file is missing.
                    chan = asent.get_channel(chunk_name)
                sent = reader.get_next_sentence()
        finally:
            tr_file.close()
def main(in_path, out_path, input_format, output_format, chunk_names,
         chunkers, folds, tagset, file_prefix, file_ext, verbose, config):
    """Build a SuperClassifier from the given settings and process all folds.

    The parameter order matches the positional call made by go().

    Fix: the original signature had no ``chunkers`` parameter although go()
    passes one and SuperClassifier.__init__ requires it, so every call ended
    in a TypeError.

    NOTE(review): process_folds is not defined in SuperClassifier in this
    file; presumably it is inherited from ChunkerChecker -- confirm.
    """
    sc = SuperClassifier(input_format, output_format, chunk_names, chunkers,
                         folds, tagset, file_prefix, file_ext, verbose, config)
    sc.process_folds(in_path, out_path)


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    go()
\ No newline at end of file
......@@ -12,6 +12,7 @@ from chunker_scripts import tools
from chunker_scripts.chunk_eval import chunk_eval_avg
from threading import Thread
import multiprocessing
import logging
descr="""%prog [options] in_dir out_dir"""
......@@ -26,6 +27,9 @@ def go():
parser.add_option('-f', '--folds', type="int", action='store',
dest='folds', default=10,
help='Number of folds default: 10')
parser.add_option('--threads', type="int", action='store',
dest='threads', default=10,
help='Number of threads default: 8')
(options, args) = parser.parse_args()
if len(args) != 2:
......@@ -88,44 +92,59 @@ class CrfWrapper:
f.close()
return count + 1
def generate_random_vector(self):
def generate_random_vector(self, stop_list):
vector = []
for i in range(self.simple_features*(self.window*2+1) + len(self.constructed)):
vector_len = (self.simple_features - len(stop_list)) * (self.window * 2 + 1) + len(self.constructed)
for i in range(vector_len):
vector.append(random.randint(0, 1) == 1)
return vector
def f(self, vector = [], args={}):
#Create config files
config_dir = os.path.join(args['out_dir'], "config_files")
tools.mkdir_p(config_dir, check_exists=True)
# print "Created config directory"
self.generate_features_txt(os.path.join(config_dir, self.config_name + "-layer1.txt"), vector = vector, args = args)
self.generate_features_txt(os.path.join(config_dir, self.config_name + "-layer2.txt"), vector = vector, args = args)
# print "Generated features_txt "
shutil.copyfile(os.path.join(self.config_dir, self.config_name + ".ccl"), os.path.join(config_dir, self.config_name + ".ccl"))
shutil.copyfile(os.path.join(self.config_dir, self.config_name + ".ini"), os.path.join(config_dir, self.config_name + ".ini"))
# print "Copied files "
# tasks = [(process_fold, [fold, args['out_dir'], self.corpus_dir, config_dir, self.config_name]) for fold in range(1, self.folds+1)]
#
# pool = multiprocessing.Pool(processes=4) # start 4 worker processes
# pool_results = [pool.map(process_fold, [fold, args['out_dir'], self.corpus_dir, config_dir, self.config_name]) for fold in range(1, self.folds+1)] # evaluate "f(10)" asynchronously
# pool.close()
# pool.join()
threads = []
fold = 1
for fold in range(1, self.folds+1):
t = Thread(target=self.process_fold, args=(fold, args['out_dir'], config_dir))
threads.append(t)
t.start()
for t in threads:
t.join()
while fold < self.folds + 1:
t1 = Thread(target=self.process_fold, args=(fold, args['out_dir'], config_dir))
t1.start()
fold += 1
t2 = Thread(target=self.process_fold, args=(fold, args['out_dir'], config_dir))
t2.start()
fold += 1
t1.join()
t2.join()
result = chunk_eval_avg.get_avg_results(os.path.join(args['out_dir'], "chunked"), self.corpus_dir, ["chunk_np"])
# self.process_fold(6, args['out_dir'], config_dir)
# fold += 1
# print "Processed", fold, "fold"
result = chunk_eval_avg.get_avg_results(os.path.join(args['out_dir'], "chunked"), self.corpus_dir, ["chunk_np"])
# print "Checked results"
f = open(os.path.join(args['out_dir'], "result.csv"), 'w+')
f.write("vector: ", vector)
f.write("result: ", result)
f.write("vector: "+ str(vector))
f.write("\n")
f.write("result: "+ str(result))
f.close()
# print "Saved results and vector"
shutil.rmtree(os.path.join(args['out_dir'], "chunked"))
#shutil.rmtree(os.path.join(args['out_dir'], "empty"))
#shutil.rmtree(os.path.join(args['out_dir'], "models"))
return result["f"]
def generate_features_txt(self, resultfile, vector = [], args = {}):
......@@ -134,11 +153,12 @@ class CrfWrapper:
#Generate simple features
for i in range(args['nof_simple']): #iterate by simple feature
for j in range(args['window']*2+1): #iterate by window
if vector[i*(args['window']*2+1)+j]:
out.write('U%02d:%%x[%d,%d]'%(feature_num, j if j >= args['window'] else -j, i))
out.write("\n")
feature_num += 1
if i not in args['stop_list']:
for j in range(args['window']*2+1): #iterate by window
if vector[i*(args['window']*2+1)+j]:
out.write('U%02d:%%x[%d,%d]'%(feature_num, j if j >= args['window'] else -j, i))
out.write("\n")
feature_num += 1
out.write("\n")
#Generate constructed features
......@@ -172,49 +192,108 @@ class CrfWrapper:
shutil.copyfile(os.path.join(self.config_dir, "dict-case.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-case.lex"))
shutil.copyfile(os.path.join(self.config_dir, "dict-prep.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-prep.lex"))
shutil.copyfile(os.path.join(self.config_dir, "dict-sie.lex"), os.path.join(out_dir, "models", str(fold).zfill(2), "dict-sie.lex"))
# print "Trenowanie"
#TRAINING
tools.train_iobber(os.path.join(config_dir, self.config_name + ".ini"),
os.path.join(out_dir, "models", str(fold).zfill(2)),
os.path.join(self.corpus_dir, "ccl-train%02d.xml"%(fold)))
# print "Wytrenowanno"
#Remove channels
tools.mkdir_p(os.path.join(out_dir, "empty"))
shutil.copyfile(os.path.join(self.corpus_dir, "ccl-test%02d.xml"%(fold)), os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold)))
tools.remove_channels(os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold)))
# print "Usunięto channel-e"
#RUNING
tools.mkdir_p(os.path.join(out_dir, "chunked"))
tools.run_iobber(os.path.join(config_dir, self.config_name + ".ini"),
os.path.join(out_dir, "empty", "ccl-test%02d.xml"%(fold)),
os.path.join(out_dir, "chunked", "ccl-test%02d.xml"%(fold)),
os.path.join(out_dir, "models", str(fold).zfill(2)))
# print "Oznaczono chunki"
#Remove dicts
os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-case.lex"))
os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-prep.lex"))
os.remove(os.path.join(out_dir, "models", str(fold).zfill(2), "dict-sie.lex"))
# print "Usunięto modele"
def main(corpus_dir, out_dir, config, window, folds):
logging.basicConfig(filename='crf_wrapper.log', level=logging.INFO)
constructed = []
constructed.append("1.0%2.-1")
cw = CrfWrapper(corpus_dir, out_dir, config, window, constructed, folds)
constructed.append("22.0")
constructed.append("23.0")
constructed.append("24.0")
constructed.append("25.0")
constructed.append("26.0")
constructed.append("27.0")
constructed.append("28.0")
constructed.append("29.0")
constructed.append("30.0")
constructed.append("31.0")
constructed.append("15.-1%15.0")
constructed.append("15.0%15.1")
constructed.append("1.-1%1.0")
constructed.append("1.0%1.1")
constructed.append("7.-2%8.-2")
constructed.append("7.-1%8.-1")
constructed.append("7.0%8.0")
constructed.append("7.1%8.1")
constructed.append("7.2%8.2")
constructed.append("9.-1%10.-1%11.-1%12.-1%13.-1")
constructed.append("9.0%10.0%11.0%12.0%13.0")
constructed.append("9.1%10.1%11.1%12.1%13.1")
constructed.append("1.-1%2.-1")
constructed.append("1.0%2.0")
constructed.append("1.1%2.1")
constructed.append("3.-1%2.-1")
constructed.append("3.0%2.0")
constructed.append("3.1%2.1")
constructed.append("4.-1%2.-1")
constructed.append("4.0%2.0")
constructed.append("4.1%2.1")
constructed.append("1.-1%16.-1")
constructed.append("1.0%16.0")
constructed.append("1.1%16.1")
constructed.append("1.-1%17.-1")
constructed.append("1.0%17.0")
constructed.append("1.1%17.1")
constructed.append("1.-1%18.-1")
constructed.append("1.0%18.0")
constructed.append("2.1%18.1")
constructed.append("1.-1%19.-1")
constructed.append("1.0%19.0")
constructed.append("1.1%19.1")
stop_list = [7,8,9,10,11,12,13,22,23,24,25,26,27,28,29,30,31]
cw = CrfWrapper(corpus_dir, out_dir, config, window, constructed, folds)
args = {}
args['out_dir'] = out_dir
args['nof_simple'] = cw.simple_features
args['window'] = cw.window
args['constructed'] = cw.constructed
args['stop_list'] = stop_list
if not os.path.exists(out_dir):
tools.mkdir_p(out_dir)
a_vector = cw.generate_random_vector()
a_vector = [False, True, False, True, True, True, True, False, False, False, True, True, False, False, False, True, True, False, True, False, False, False, True, False, True, False, True, True, True, False, False, False, True, False, True, True, True, False, False, False, False, True, False, True, True, True, True, True, False, False, True, True, True, True, False, True, True, True, True, True, True, True, False, True, True, False, True, True, True, False, False, False, False, True, True, True, True, False, True, False, True, False, True, True, True, True, True, True, True, False, False, False, True, True, True, True, False, False, True, True, True, False, False, False, False, True, True, True, False, False, True, False, False, True, True, True, True, False]
ao = anneal.AnnealOptimalizer(cw.f, a_vector = a_vector, args = args, T0=0.1, maxiter=400, opt="max")
ao.tempestimation()
ao = anneal.AnnealOptimalizer(cw.f, a_vector = a_vector, args = args, seed=5, Tf=0.1, T0=500, temp_reducing_rate=0.90, maxiter=400, opt="max")
ao.anneal()
if __name__ == '__main__':
go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment