Commit 70ea98fa authored by Adam Radziszewski

add docstrings, break long lines

parent 756cae0c
"""IOBBER is a chunker made originally for Polish. Its job is to recognise
syntactic phrases (chunks) in text.
The name comes from IOB tags that are assigned to tokens to represent chunks
(strictly speaking, we use IOB2 representation).
IOBBER has successfully been applied to Polish and Czech so far.
For more information, please visit the project homepage:
http://nlp.pwr.wroc.pl/redmine/projects/iobber/wiki
"""
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,7 +12,8 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
-__doc__ = """The actual chunker implementation."""
+"""The actual chunker implementation. To use IOBBER via the Python API,
+instantiate a Chunker object."""
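A minimal usage sketch of that API (a hedged illustration, not part of the commit: the import path, config name and model directory are assumptions, while the Chunker and tag_input signatures are taken from this diff):

    from iobber import chunker  # assumed install layout
    tagr = chunker.Chunker('kpwr.ini', 'model-kpwr11-H', verbose=False)
    # arguments: in_path, out_path, input_format, output_format, preserve_pars
    tagr.tag_input('in.xml', 'out.xml', 'ccl', 'ccl', True)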
# SWIG bug workaround: loading multiple SWIG modules brought unwrapped
# swig::stop_iteration exceptions
@@ -31,33 +32,43 @@ if 'Linux' in platform.system():
# get back to default dlopen policy
sys.setdlopenflags(dlflags)
-import os, codecs
+import os
import ConfigParser
from operator import itemgetter as ig
def get_layers(conf):
"""Create a simple data structure to represent layers as defined in the
given configuration. The returned structure is a list of tuples.
Each tuple has the form (name, channels) where name is a layer name (str)
and channels is a list of channel names (also strings)."""
layers = [(k, v.split(',')) for (k, v) in conf.items(config.S_LAYERS)]
for name, _ in layers:
assert '-' not in name, 'hyphens are not allowed in layer names'
return layers
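# For illustration (hypothetical config contents): a [layers] section with
#   chunks = chunk_np,chunk_vp
# makes get_layers return [('chunks', ['chunk_np', 'chunk_vp'])].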
def is_input_tagged(conf):
"""Return if the configuration assumes that input is fully disambiguated
(exactly one tag per token) or not (possible tag ambiguity)."""
return conf.getboolean(config.S_GLOBAL, config.O_TAGGED)
class Stats:
"""Statistics for reporting progress and diagnosis."""
def __init__(self):
"""Create statistics with zeroed counters."""
self.clear()
def clear(self):
"""Reset all counters."""
self.num_toks = 0
self.num_sents = 0
def dump(self):
"""Write counter values to stderr."""
sys.stderr.write('Toks processed: %d\n' % self.num_toks)
sys.stderr.write('Sents processed: %d\n' % self.num_sents)
def maybe_report(self):
"""Write progress to stderr if a round number of sentences have been
processed."""
if self.num_sents % 100 == 0:
sys.stderr.write('%d toks...\n' % (self.num_toks))
@@ -142,7 +153,7 @@ class Chunker:
for layer_idx, layer in enumerate(self.layers):
chans = self.layer_channels[layer_idx]
for chan_name in chans:
-# ensure the channel is there and switch to IOB2 representation
+# ensure the channel is there and switch to IOB2 repr
if not asent.has_channel(chan_name):
asent.create_channel(chan_name)
chan = asent.get_channel(chan_name)
@@ -152,13 +163,16 @@
# get file for storing training data
tr_file = tr_files[layer]
# iterate over each sentence token
-for tok_idx, tok in enumerate(sent.tokens()):
+for tok_idx, _ in enumerate(sent.tokens()):
con.set_position(tok_idx) # for WCCL ops
feat_vals = [op.base_apply(con)
.to_compact_string(self.tagset).decode('utf-8')
for op in self.layer_ops[layer_idx]]
-# get IOB2 tags as strings, find non-O IOB2 tag or mark it as O
-# TODO: rename the to_string in corpus2 and fix it here
+# get IOB2 tags as strings, find non-O IOB2 tag...
+# ...or mark it O
+# NOTE: the corpus2.to_string function should actually
+# be renamed but it's kinda late to change corpus2 API now
+# so we have to live with it
non_O_chan = None
non_O_tag = 'O'
for chan_name in chans:
@@ -166,7 +180,9 @@
there_iob = corpus2.to_string(chan.get_iob_at(tok_idx))
if there_iob != 'O':
if non_O_chan is not None and self.verbose:
-sys.stderr.write('WARNING: overlapping phrases in sentence %s\n' % unicode(asent.id()))
+sys.stderr.write(
+    'WARNING: overlapping phrases'
+    ' in sentence %s\n' % unicode(asent.id()))
else:
non_O_chan = chan_name
non_O_tag = there_iob
@@ -174,7 +190,9 @@
if chan.is_head_at(tok_idx):
non_O_chan += '-H'
# B-NP, I-VP etc. or O
-class_label = 'O' if non_O_chan is None else '%s-%s' % (non_O_tag, non_O_chan)
+class_label = (
+    'O' if non_O_chan is None
+    else '%s-%s' % (non_O_tag, non_O_chan))
# generate training example and store to file
classify.write_example(tr_file, feat_vals, class_label)
classify.write_end_of_sent(tr_file)
@@ -208,7 +226,7 @@ class Chunker:
if model is not None:
chans = self.layer_channels[layer_idx]
for chan_name in chans:
-# ensure the channel is there and switch to IOB2 representation
+# ensure the channel is there and switch to IOB2 repr
if not asent.has_channel(chan_name):
asent.create_channel(chan_name)
chan = asent.get_channel(chan_name)
@@ -217,7 +235,7 @@
con = corpio.create_context(sent)
classify.open_sent(model)
# iterate over tokens
-for tok_idx, tok in enumerate(sent.tokens()):
+for tok_idx, _ in enumerate(sent.tokens()):
con.set_position(tok_idx)
feat_vals = [op.base_apply(con)
.to_compact_string(self.tagset).decode('utf-8')
@@ -226,7 +244,7 @@
classify.close_sent(model)
last_iobs = {}
-for tok_idx, tok in enumerate(sent.tokens()):
+for tok_idx, _ in enumerate(sent.tokens()):
decsn = classify.classify_token(model, tok_idx)
non_O_chan = None
non_O_tag = 'O'
@@ -238,11 +256,17 @@
elif len(decsn_array) == 3:
non_O_tag, non_O_chan, is_head = decsn_array
else:
-raise IOError('Unexpected label returned from classifier: ' + decsn)
+raise IOError(
+    'Unexpected label returned from classifier: '
+    + decsn)
for chan_name in chans:
chan = asent.get_channel(chan_name)
-tag_to_set = 'O' if chan_name != non_O_chan else non_O_tag
-if tag_to_set == "I" and (not last_iobs.has_key(chan_name) or last_iobs[chan_name] == "O"):
+tag_to_set = (
+    'O' if chan_name != non_O_chan
+    else non_O_tag)
+if tag_to_set == "I" and (
+        not last_iobs.has_key(chan_name)
+        or last_iobs[chan_name] == "O"):
tag_to_set = 'B'
if tag_to_set == 'B':
head_idx = None
@@ -256,7 +280,8 @@
self.stats.num_sents += 1
self.stats.num_toks += sent.tokens().size()
-if self.verbose: self.stats.maybe_report()
+if self.verbose:
+    self.stats.maybe_report()
def tag_input(self, in_path, out_path, input_format, output_format,
preserve_pars):
......
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,6 +12,16 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Collection of functions that provide access to CRF++ classifier.
This includes two scenarios: using a trained classifier as well
as classifier training.
To use a trained classifier model, load it first, then iterate over the
sentences to classify. Process each sentence as follows: call open_sent,
feed each sentence token to eat_token, then call close_sent. Finally,
gather the token-level classifier decisions with classify_token (one call
per token)."""
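A sketch of that per-sentence protocol, using the function names from this module (conf, model_name, data_dir, chan_name and sent_features are placeholders):

    model = load(conf, model_name, data_dir, chan_name)
    open_sent(model)
    for feat_vals in sent_features:  # one list of feature strings per token
        eat_token(model, feat_vals)
    close_sent(model)
    # one decision string per token, e.g. 'B-chunk_np' or 'O'
    labels = [classify_token(model, i) for i in xrange(len(sent_features))]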
import CRFPP # CRF++ Python wrapper
import subprocess, os # running crf_learn
import codecs
@@ -21,33 +31,41 @@ import config, corpio
DATA_SEP = '\t'
def open_tr_files(model_name, data_dir, layers):
"""Open files for storing training examples.
Returns a map layer_name -> open file handle."""
tr_files = {}
for layer in layers:
-tr_files[layer] = codecs.open(corpio.f_name(model_name, data_dir,
-    config.EXT_DATA, layer), 'wb', 'utf-8')
+tr_files[layer] = codecs.open(
+    corpio.f_name(
+        model_name, data_dir,
+        config.EXT_DATA, layer),
+    'wb', 'utf-8')
return tr_files
def close_tr_files(tr_files):
"""Close all training files. Should be called after storing all training
examples, before training classifiers."""
for chan in tr_files:
tr_files[chan].close()
def write_example(tr_file, feat_vals, class_label):
"""Writes a training example in simple tab-separated format."""
"""Write a training example in simple tab-separated format."""
tr_file.write(DATA_SEP.join(feat_vals))
tr_file.write(DATA_SEP)
tr_file.write(class_label)
tr_file.write('\n')
def write_end_of_sent(tr_file):
"""Writes end-of-sentence marker to the training file."""
"""Write end-of-sentence marker to the training file."""
tr_file.write('\n')
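# For illustration (hypothetical values): write_example(f, [u'dom', u'subst'],
# 'B-chunk_np') stores the line
#     dom<TAB>subst<TAB>B-chunk_np
# and write_end_of_sent emits the blank line CRF++ uses as a sentence boundary.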
def train_and_save(conf, model_name, config_dir, data_dir, chan_name):
"""Trains a CRF classifier for the given chan_name. The trained model
"""Train a CRF classifier for chan_name. The trained model
is saved to filenames (generated using model_name and conf)."""
tr_fname = corpio.f_name(model_name, data_dir, config.EXT_DATA, chan_name)
cr_fname = corpio.f_name(model_name, data_dir, config.EXT_CR, chan_name)
-cr_template = corpio.f_name(model_name, config_dir, config.EXT_TEXT, chan_name)
+cr_template = corpio.f_name(
+    model_name, config_dir, config.EXT_TEXT, chan_name)
crf_opts = conf.get(config.S_CLASSIFIER, config.O_PARAMS)
# run crf_learn
args = ['crf_learn', cr_template, tr_fname, cr_fname]
@@ -56,20 +74,20 @@ def train_and_save(conf, model_name, config_dir, data_dir, chan_name):
retval = subprocess.call(args,
stdout = fnull, stderr = fnull)
if retval != 0:
-raise IOError('Training CRF++ FAILED. Check .tab file for data validity. Call: %s' % ' '.join(args))
+raise IOError(
+    # this is one string, just avoiding long lines
+    'Training CRF++ FAILED. Check .tab file for data validity.'
+    ' Call: %s' % ' '.join(args))
def load(conf, model_name, data_dir, chan_name):
"""Tries to load a stored classifier.
If doesn't exist, will return None."""
"""Try to load a stored classifier. If doesn't exist, will return None."""
cr_fname = corpio.f_name(model_name, data_dir, config.EXT_CR, chan_name)
if os.path.isfile(cr_fname):
return CRFPP.Tagger('-m %s' % cr_fname)
raise IOError('can\'t open classifier from %s' % cr_fname)
def open_sent(crf_obj):
"""
Notify the trained classifier than a new sentence will be classified.
"""
"""Notify the trained classifier than a new sentence will be classified."""
crf_obj.clear()
def eat_token(crf_obj, feat_vals):
......
@@ -2,9 +2,9 @@
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,11 +12,15 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Constants related to section names in config files, WCCL section names
as well as default file extensions."""
# global settings
S_GLOBAL = 'general'
O_TAGSET = 'tagset'
O_CHANNELS = 'channels' # comma separated list of channel/phrase names???
-O_TAGGED = 'tagged' # yes/no: shall we expect tagged (yes) or ambiguous (no) input
+# yes/no: shall we expect tagged (yes) or ambiguous (no) input
+O_TAGGED = 'tagged'
S_LAYERS = 'layers'
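# A hypothetical minimal config using these names (values are illustrative,
# not taken from a shipped config):
#   [general]
#   tagset = nkjp
#   tagged = yes
#   [layers]
#   chunks = chunk_np,chunk_vp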
......
@@ -2,9 +2,9 @@
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,6 +12,13 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Routines related to I/O and feature generation.
This includes reading of files referenced by configs, reading and
writing of annotated text (corpora), parsing of WCCL expressions
that define features for classification, and generating textual
representations of feature values.
"""
# SWIG bug workaround: loading multiple SWIG modules brought unwrapped
# swig::stop_iteration exceptions
import ctypes, sys
@@ -35,13 +42,15 @@ import codecs, os
_ROOT = os.path.abspath(os.path.dirname(__file__))
format_help = """
-Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
+Available input formats: """ + ' '.join(
+    corpus2.TokenReader.available_reader_types()) + """
""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
-Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
+Available output formats: """ + ' '.join(
+    corpus2.TokenWriter.available_writer_types()) + """
""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())
def get_data(path):
"""Tries to resolve path to the given subdir, trying the path locally
"""Try to resolve path to the given subdir, trying the path locally
and then in the install site."""
if os.path.exists(path):
return path
@@ -51,17 +60,21 @@ def get_data(path):
raise IOError('can\'t locate %s, tried locally and %s' % (path, in_data))
def f_name(model_name, subdir, ext, suff = ''):
"""Gets the filename based on model_name having the given
"""Get the filename based on model_name having the given
extension. Optionally, you can specify name suffix."""
-base = (model_name + '-' + suff + '.' + ext) if suff else (model_name + '.' + ext)
+base = (
+    (model_name + '-' + suff + '.' + ext) if suff
+    else (model_name + '.' + ext))
return os.path.join(subdir, base)
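# For illustration: f_name('kpwr', 'model_dir', 'tab', 'chunk_np') returns
# 'model_dir/kpwr-chunk_np.tab' on POSIX (argument values are hypothetical).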
def get_tagset(conf):
"""Returns a corpus2.Tagset object corresponding to the tagset
specified in the given configuration."""
return corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
def get_reader(in_path, tagset, input_format, read_disamb_only):
"""Creates a reader using the options. If in_path evaluates to False,
"""Create a reader using the options. If in_path evaluates to False,
will create a stdin reader. Set read_disamb_only to force reading only
'disamb' lexemes/interpretations."""
fixd_format = input_format
@@ -78,16 +91,17 @@ def get_reader(in_path, tagset, input_format, read_disamb_only):
return corpus2.TokenReader.create_stdin_reader(fixd_format, tagset)
def get_writer(out_path, tagset, output_format):
"""Creates a writer using the options. If out_path evaluates to False,
"""Create a writer using the options. If out_path evaluates to False,
will create a stdout writer."""
if out_path:
-return corpus2.TokenWriter.create_path_writer(output_format, out_path,
-    tagset)
+return corpus2.TokenWriter.create_path_writer(
+    output_format, out_path, tagset)
else:
-return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+return corpus2.TokenWriter.create_stdout_writer(
+    output_format, tagset)
def op_list(wccl_file, sec_name):
"""Retrieves a list of operators corresponding to a named section from
"""Retrieve a list of operators corresponding to a named section from
the given WCCL file. If section not present, will return an empty list."""
ops = []
if wccl_file.has_untyped_section(sec_name):
@@ -97,17 +111,21 @@ def op_list(wccl_file, sec_name):
return ops
def get_wccl_ops(conf, model_name, wccl_dir, lex_dir, chan_names):
"""Returns a pair: WCCL op list, that is a list of WCCL operator lists
"""Return a pair: WCCL op list, that is a list of WCCL operator lists
corresponding to the given channel names. Each list may consists of two
parts: the default operators and channel-specific operators
(theoretically both may be empty)."""
wccl_file_path = f_name(model_name, wccl_dir, config.EXT_WCCL)
-tagset = corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
-wccl_file = wccl.Parser(tagset).parseWcclFileFromPath(wccl_file_path, lex_dir)
+tagset = corpus2.get_named_tagset(
+    conf.get(config.S_GLOBAL, config.O_TAGSET))
+wccl_file = wccl.Parser(tagset).parseWcclFileFromPath(
+    wccl_file_path, lex_dir)
def_ops = op_list(wccl_file, config.DEFAULT_OPS)
-chan_ops = [def_ops + op_list(wccl_file, chan_name) for chan_name in chan_names]
+chan_ops = [
+    def_ops + op_list(wccl_file, chan_name)
+    for chan_name in chan_names]
return chan_ops
def create_context(sent):
"""Wraps the sentence as SentenceContext to be used with WCCL."""
"""Wrap the sentence as SentenceContext to be used with WCCL."""
return wccl.SentenceContext(sent)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -13,6 +13,8 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Simple command-line interface for IOBBER to operate on tagged text."""
import sys
from optparse import OptionParser
@@ -41,10 +43,13 @@ material. Trained model will be stored in DATA_DIR.
def lines(pathfilename):
"""Read all lines from the given filename without leading and trailing
whitespaces and newlines."""
with open(pathfilename) as f:
return [line.strip() for line in f if line.strip()]
def go():
"""Run command-line parsing and call appropriate Chunker functions."""
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='ccl',
@@ -57,7 +62,8 @@ def go():
help='set output filename (do not write to stdout)')
parser.add_option('-d', '--data-dir', type='string', action='store',
dest='data_dir', default='',
-help='use the given directory to look for the trained model (or save the model when training)')
+help='use the given directory to look for the trained model'
+     ' (or save the model when training)')
parser.add_option('--sent-only', action='store_false',
dest='preserve_chunks', default=True,
help='process sentences, ignoring division into paragraphs')
@@ -81,7 +87,11 @@ def go():
options.verbose, options.input_format, options.output_format,
options.preserve_chunks, options.batch, options.is_training)
-def main(config_fname, files, out_path, data_dir, verbose, input_format, output_format, preserve_chunks, batch, is_training):
+def main(
+        config_fname, files, out_path, data_dir, verbose,
+        input_format, output_format, preserve_chunks, batch, is_training):
"""Create a chunker using function arguments and use it to get all the
input parts processed."""
tagr = chunker.Chunker(config_fname, data_dir,
verbose = verbose)
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
-# This file is part of IOBBER
-# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
-# IOBBER is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
-# IOBBER is distributed in the hope that it will be useful, but
+# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
-# See the LICENCE and COPYING files for more details
+# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Command-line interface that allows to process plain text with IOBBER
(plain text will be automatically tagged using WCRFT and fed into IOBBER)."""
import sys
from optparse import OptionParser
@@ -22,38 +26,40 @@ from wcrft import corpio as tagger_io
import chunker
#ioformats = corpio.format_help
#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
descr = """%prog [options] [input_files]
IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology
-Processes input files through the tagger (WCRFT, must be installed) and the chunker.
-By default input is assumed to be plain text (UTF-8) and writes output in the CCL format.
+Processes input files through the tagger (WCRFT, must be installed) and the
+chunker. By default, input is assumed to be plain text (UTF-8) and output is
+written in the CCL format.
Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
-The default values for -c and -w are recommended, but you may need to set trained model
-directories (-C and -W).
+The default values for -c and -w are recommended, but you may need to set
+trained model directories (-C and -W).
Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.
-When processing multiple files, either give the filenames directly as arguments,
-or use --batch and provide a filename to a list of paths. Either way, each file
-will be chunked and the output writted to FILENAME.chunked.
+When processing multiple files, either give the filenames directly as
+arguments, or use --batch and provide the name of a file listing input
+paths. Either way, each file will be chunked and the output written to
+FILENAME.chunked.
"""
def lines(pathfilename):
"""Read all lines from the given filename without leading and trailing
whitespaces and newlines."""
with open(pathfilename) as f:
return [line.strip() for line in f if line.strip()]
def go():
"""Run command-line parsing and call appropriate WCRFT
and Chunker functions."""
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='txt',
@@ -72,13 +78,15 @@ def go():
help='use given chunker config (default: kpwr.ini)')
parser.add_option('-C', '--chunker-model', type='string', action='store',
dest='chunker_dir', default='model-kpwr11-H',
-help='read chunker trained model from the given dir (default: model-kpwr11-H)')
+help='read chunker trained model from the given dir'
+     ' (default: model-kpwr11-H)')
parser.add_option('-w', '--tagger-config', type='string', action='store',
dest='tagger_config', default='nkjp_s2.ini',
help='use given tagger (wcrft) config (default: nkjp_s2.ini)')
parser.add_option('-W', '--tagger-model', type='string', action='store',
dest='tagger_dir', default='model_nkjp10_wcrft_s2',
-help='read tagger (wcrft) trained model from the given dir (default: model_nkjp10_wcrft_s2)')
+help='read tagger (wcrft) trained model from the given dir'
+     ' (default: model_nkjp10_wcrft_s2)')
parser.add_option('-m', '--maca-config', type='string', action='store',
dest='maca_config', default='',
help='override maca config file specified in tagger config')
@@ -100,6 +108,8 @@ def main(files, tagger_config, tagger_dir, shall_chunk,
chunker_config, chunker_dir, maca_config,
batch, out_path, verbose,
input_format, output_format):
"""Create a Tagger (WCRFT) and a Chunker (IOBBER) object
and get all the input parts processed according to function args."""
tagr = tagger.Tagger(tagger_config, tagger_dir)
if shall_chunk:
@@ -136,7 +146,8 @@ def main(files, tagger_config, tagger_dir, shall_chunk,
for in_path, out_path in zip(inputs, outputs):
if in_path and verbose:
sys.stderr.write('Processing %s...\n' % in_path)
-reader = tagger_io.get_reader(in_path, tagr.tagset, input_format, tagr.maca_config)
+reader = tagger_io.get_reader(
+    in_path, tagr.tagset, input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, tagr.tagset, output_format)
while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph
......