Skip to content
Snippets Groups Projects
Commit 99035916 authored by Adam Pawlaczek
Browse files

Changed version and added main methods

parent c8826774
No related branches found
No related tags found
No related merge requests found
...@@ -77,18 +77,23 @@ def go(): ...@@ -77,18 +77,23 @@ def go():
config_fname = args[0] config_fname = args[0]
files = args[1:] files = args[1:]
tagr = chunker.Chunker(config_fname, options.data_dir, main(config_fname, files, options.out_path, options.data_dir,
verbose = options.verbose) options.verbose, options.input_format, options.output_format,
options.preserve_chunks, options.batch, options.is_training)
if options.is_training: def main(config_fname, files, out_path, data_dir, verbose, input_format, output_format, preserve_chunks, batch, is_training):
tagr = chunker.Chunker(config_fname, data_dir,
verbose = verbose)
if is_training:
# chunker training # chunker training
assert len(files) == 1, 'must provide path to training file' assert len(files) == 1, 'must provide path to training file'
tagr.train_and_save(files[0], options.input_format) tagr.train_and_save(files[0], input_format)
else: else:
# normal chunker performance # normal chunker performance
inputs = [] inputs = []
outputs = [] outputs = []
if options.batch: # read each arg as input path list if batch: # read each arg as input path list
for pathpath in files: for pathpath in files:
inputs.extend(lines(pathpath)) inputs.extend(lines(pathpath))
outputs = [path + '.chunked' for path in inputs] outputs = [path + '.chunked' for path in inputs]
...@@ -98,17 +103,17 @@ def go(): ...@@ -98,17 +103,17 @@ def go():
outputs.append(None) outputs.append(None)
else: else:
inputs.append(files[0]) inputs.append(files[0])
outputs.append(options.out_path) outputs.append(out_path)
else: # multiple paths as args else: # multiple paths as args
inputs = files inputs = files
outputs = [path + '.chunked' for path in inputs] outputs = [path + '.chunked' for path in inputs]
tagr.load_model() tagr.load_model()
for in_path, out_path in zip(inputs, outputs): for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose: if in_path and verbose:
sys.stderr.write('Processing %s...\n' % in_path) sys.stderr.write('Processing %s...\n' % in_path)
tagr.tag_input(in_path, out_path, tagr.tag_input(in_path, out_path,
options.input_format, options.output_format, input_format, output_format,
options.preserve_chunks) preserve_chunks)
if __name__ == '__main__': if __name__ == '__main__':
go() go()
\ No newline at end of file
...@@ -91,17 +91,27 @@ def go(): ...@@ -91,17 +91,27 @@ def go():
files = args files = args
tagr = tagger.Tagger(options.tagger_config, options.tagger_dir) main(files, options.tagger_config, options.tagger_dir, options.shall_chunk,
if options.shall_chunk: options.chunker_config, options.chunker_dir, options.maca_config,
chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir) options.batch, options.out_path, options.verbose,
options.input_format, options.output_format)
if options.maca_config != '': def main(files, tagger_config, tagger_dir, shall_chunk,
tagr.maca_config = options.maca_config chunker_config, chunker_dir, maca_config,
batch, out_path, verbose,
input_format, output_format):
tagr = tagger.Tagger(tagger_config, tagger_dir)
if shall_chunk:
chunkr = chunker.Chunker(chunker_config, chunker_dir)
if maca_config != '':
tagr.maca_config = maca_config
# tag and chunk # tag and chunk
inputs = [] inputs = []
outputs = [] outputs = []
if options.batch: # read each arg as input path list if batch: # read each arg as input path list
for pathpath in files: for pathpath in files:
inputs.extend(lines(pathpath)) inputs.extend(lines(pathpath))
outputs = [path + '.tag' for path in inputs] outputs = [path + '.tag' for path in inputs]
...@@ -111,23 +121,23 @@ def go(): ...@@ -111,23 +121,23 @@ def go():
outputs.append(None) outputs.append(None)
else: else:
inputs.append(files[0]) inputs.append(files[0])
outputs.append(options.out_path) outputs.append(out_path)
else: # multiple paths as args else: # multiple paths as args
inputs = files inputs = files
outputs = [path + '.tag' for path in inputs] outputs = [path + '.tag' for path in inputs]
if inputs: if inputs:
tagr.load_model() tagr.load_model()
if options.shall_chunk: if shall_chunk:
chunkr.load_model() chunkr.load_model()
assert (tagr.tagset.name() assert (tagr.tagset.name()
== chunkr.tagset.name()), ('Tagger and chunker config must' == chunkr.tagset.name()), ('Tagger and chunker config must'
+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(), + 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
chunkr.tagset.name())) chunkr.tagset.name()))
for in_path, out_path in zip(inputs, outputs): for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose: if in_path and verbose:
sys.stderr.write('Processing %s...\n' % in_path) sys.stderr.write('Processing %s...\n' % in_path)
reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config) reader = tagger_io.get_reader(in_path, tagr.tagset, input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format) writer = tagger_io.get_writer(out_path, tagr.tagset, output_format)
while True: while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph par = reader.get_next_chunk() # here `chunk' denotes paragraph
if not par: if not par:
...@@ -146,7 +156,7 @@ def go(): ...@@ -146,7 +156,7 @@ def go():
new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent) new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
# preserve_ambiguity = False # preserve_ambiguity = False
tagr.tag_sentence(new_sent, False) tagr.tag_sentence(new_sent, False)
if options.shall_chunk: if shall_chunk:
chunkr.tag_sentence(new_sent) chunkr.tag_sentence(new_sent)
# create a new paragraph with the new sentence # create a new paragraph with the new sentence
new_par.append(new_sent) new_par.append(new_sent)
......
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
from setuptools import setup from setuptools import setup
setup(name='iobber', setup(name='iobber',
version='1.0.1', version='1.1.0',
description='Chunker for Slavic languages based on CRF++ and WCCL', description='Chunker for Slavic languages based on CRF++ and WCCL',
author= "Adam Radziszewski", author= "Adam Radziszewski, Adam Pawlaczek",
author_email="adam.radziszewski@pwr.wroc.pl", author_email="adam.radziszewski@pwr.wroc.pl, adam@pawlaczek.eu",
packages=[ packages=[
'iobber' 'iobber'
], ],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment