Skip to content
Snippets Groups Projects
Commit 99035916 authored by Adam Pawlaczek
Browse files

Changed version and added main methods

parent c8826774
Branches
No related merge requests found
......@@ -76,19 +76,24 @@ def go():
sys.exit(1)
config_fname = args[0]
files = args[1:]
main(config_fname, files, options.out_path, options.data_dir,
options.verbose, options.input_format, options.output_format,
options.preserve_chunks, options.batch, options.is_training)
def main(config_fname, files, out_path, data_dir, verbose, input_format, output_format, preserve_chunks, batch, is_training):
tagr = chunker.Chunker(config_fname, data_dir,
verbose = verbose)
tagr = chunker.Chunker(config_fname, options.data_dir,
verbose = options.verbose)
if options.is_training:
if is_training:
# chunker training
assert len(files) == 1, 'must provide path to training file'
tagr.train_and_save(files[0], options.input_format)
tagr.train_and_save(files[0], input_format)
else:
# normal chunker performance
inputs = []
outputs = []
if options.batch: # read each arg as input path list
if batch: # read each arg as input path list
for pathpath in files:
inputs.extend(lines(pathpath))
outputs = [path + '.chunked' for path in inputs]
......@@ -98,17 +103,17 @@ def go():
outputs.append(None)
else:
inputs.append(files[0])
outputs.append(options.out_path)
outputs.append(out_path)
else: # multiple paths as args
inputs = files
outputs = [path + '.chunked' for path in inputs]
tagr.load_model()
for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose:
if in_path and verbose:
sys.stderr.write('Processing %s...\n' % in_path)
tagr.tag_input(in_path, out_path,
options.input_format, options.output_format,
options.preserve_chunks)
input_format, output_format,
preserve_chunks)
if __name__ == '__main__':
go()
go()
\ No newline at end of file
......@@ -46,7 +46,7 @@ When processing multiple files, either give the filenames directly as arguments,
or use --batch and provide a filename to a list of paths. Either way, each file
will be chunked and the output written to FILENAME.chunked.
"""
"""
def lines(pathfilename):
......@@ -91,17 +91,27 @@ def go():
files = args
tagr = tagger.Tagger(options.tagger_config, options.tagger_dir)
if options.shall_chunk:
chunkr = chunker.Chunker(options.chunker_config, options.chunker_dir)
if options.maca_config != '':
tagr.maca_config = options.maca_config
main(files, options.tagger_config, options.tagger_dir, options.shall_chunk,
options.chunker_config, options.chunker_dir, options.maca_config,
options.batch, options.out_path, options.verbose,
options.input_format, options.output_format)
def main(files, tagger_config, tagger_dir, shall_chunk,
chunker_config, chunker_dir, maca_config,
batch, out_path, verbose,
input_format, output_format):
tagr = tagger.Tagger(tagger_config, tagger_dir)
if shall_chunk:
chunkr = chunker.Chunker(chunker_config, chunker_dir)
if maca_config != '':
tagr.maca_config = maca_config
# tag and chunk
inputs = []
outputs = []
if options.batch: # read each arg as input path list
if batch: # read each arg as input path list
for pathpath in files:
inputs.extend(lines(pathpath))
outputs = [path + '.tag' for path in inputs]
......@@ -111,23 +121,23 @@ def go():
outputs.append(None)
else:
inputs.append(files[0])
outputs.append(options.out_path)
outputs.append(out_path)
else: # multiple paths as args
inputs = files
outputs = [path + '.tag' for path in inputs]
if inputs:
tagr.load_model()
if options.shall_chunk:
if shall_chunk:
chunkr.load_model()
assert (tagr.tagset.name()
== chunkr.tagset.name()), ('Tagger and chunker config must'
+ 'operate on the same tagset: %s v. %s' % (tagr.tagset.name(),
chunkr.tagset.name()))
for in_path, out_path in zip(inputs, outputs):
if in_path and options.verbose:
if in_path and verbose:
sys.stderr.write('Processing %s...\n' % in_path)
reader = tagger_io.get_reader(in_path, tagr.tagset, options.input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, tagr.tagset, options.output_format)
reader = tagger_io.get_reader(in_path, tagr.tagset, input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, tagr.tagset, output_format)
while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph
if not par:
......@@ -146,7 +156,7 @@ def go():
new_sent = corpus2.AnnotatedSentence.cast_as_sentence(new_asent)
# preserve_ambiguity = False
tagr.tag_sentence(new_sent, False)
if options.shall_chunk:
if shall_chunk:
chunkr.tag_sentence(new_sent)
# create a new paragraph with the new sentence
new_par.append(new_sent)
......
......@@ -4,10 +4,10 @@
from setuptools import setup
setup(name='iobber',
version='1.0.1',
version='1.1.0',
description='Chunker for Slavic languages based on CRF++ and WCCL',
author= "Adam Radziszewski",
author_email="adam.radziszewski@pwr.wroc.pl",
author= "Adam Radziszewski, Adam Pawlaczek",
author_email="adam.radziszewski@pwr.wroc.pl, adam@pawlaczek.eu",
packages=[
'iobber'
],
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment