Commit 70ea98fa authored by Adam Radziszewski

add docstrings, break long lines

parent 756cae0c
"""IOBBER is a chunker made originally for Polish. Its job is to recognise
syntactic phrases (chunks) in text.
The name comes from IOB tags that are assigned to tokens to represent chunks
(strictly speaking, we use IOB2 representation).
IOBBER has successfully been applied to Polish and Czech so far.
For more information, please visit the project homepage:
http://nlp.pwr.wroc.pl/redmine/projects/iobber/wiki
"""
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,7 +12,8 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
-__doc__ = """The actual chunker implementation."""
+"""The actual chunker implementation. To use IOBBER via the Python API,
+instantiate a Chunker object."""
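A minimal usage sketch of that API (a hedged illustration, not part of the commit: the import path, config name and model directory are assumptions, while the Chunker and tag_input signatures are taken from this diff):

    from iobber import chunker  # assumed install layout
    tagr = chunker.Chunker('kpwr.ini', 'model-kpwr11-H', verbose=False)
    # arguments: in_path, out_path, input_format, output_format, preserve_pars
    tagr.tag_input('in.xml', 'out.xml', 'ccl', 'ccl', True)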
# SWIG bug workaround: loading multiple SWIG modules brought unwrapped
# swig::stop_iteration exceptions
@@ -31,33 +32,43 @@ if 'Linux' in platform.system():
# get back to default dlopen policy
sys.setdlopenflags(dlflags)
-import os, codecs
+import os
import ConfigParser
from operator import itemgetter as ig
def get_layers(conf):
"""Create a simple data structure to represent layers as defined in the
given configuration. The returned structure is a list of tuples.
Each tuple has the form (name, channels) where name is a layer name (str)
and channels is a list of channel names (also strings)."""
layers = [(k, v.split(',')) for (k, v) in conf.items(config.S_LAYERS)]
for name, _ in layers:
assert '-' not in name, 'hyphens are not allowed in layer names'
return layers
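# For illustration (hypothetical config contents): a [layers] section with
#   chunks = chunk_np,chunk_vp
# makes get_layers return [('chunks', ['chunk_np', 'chunk_vp'])].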
def is_input_tagged(conf):
"""Return if the configuration assumes that input is fully disambiguated
(exactly one tag per token) or not (possible tag ambiguity)."""
return conf.getboolean(config.S_GLOBAL, config.O_TAGGED)
class Stats:
"""Statistics for reporting progress and diagnosis."""
def __init__(self):
"""Create statistics with zeroed counters."""
self.clear()
def clear(self):
"""Reset all counters."""
self.num_toks = 0
self.num_sents = 0
def dump(self):
"""Write counter values to stderr."""
sys.stderr.write('Toks processed: %d\n' % self.num_toks)
sys.stderr.write('Sents processed: %d\n' % self.num_sents)
def maybe_report(self):
"""Write progress to stderr if a round number of sentences have been
processed."""
if self.num_sents % 100 == 0:
sys.stderr.write('%d toks...\n' % (self.num_toks))
@@ -142,7 +153,7 @@ class Chunker:
for layer_idx, layer in enumerate(self.layers):
chans = self.layer_channels[layer_idx]
for chan_name in chans:
-# ensure the channel is there and switch to IOB2 representation
+# ensure the channel is there and switch to IOB2 repr
if not asent.has_channel(chan_name):
asent.create_channel(chan_name)
chan = asent.get_channel(chan_name)
@@ -152,13 +163,16 @@
# get file for storing training data
tr_file = tr_files[layer]
# iterate over each sentence token
-for tok_idx, tok in enumerate(sent.tokens()):
+for tok_idx, _ in enumerate(sent.tokens()):
con.set_position(tok_idx) # for WCCL ops
feat_vals = [op.base_apply(con)
.to_compact_string(self.tagset).decode('utf-8')
for op in self.layer_ops[layer_idx]]
-# get IOB2 tags as strings, find non-O IOB2 tag or mark it as O
-# TODO: rename the to_string in corpus2 and fix it here
+# get IOB2 tags as strings, find non-O IOB2 tag...
+# ...or mark it O
+# NOTE: the corpus2.to_string function should actually
+# be renamed but it's kinda late to change corpus2 API now
+# so we have to live with it
non_O_chan = None
non_O_tag = 'O'
for chan_name in chans:
@@ -166,7 +180,9 @@
there_iob = corpus2.to_string(chan.get_iob_at(tok_idx))
if there_iob != 'O':
if non_O_chan is not None and self.verbose:
-sys.stderr.write('WARNING: overlapping phrases in sentence %s\n' % unicode(asent.id()))
+sys.stderr.write(
+    'WARNING: overlapping phrases'
+    ' in sentence %s\n' % unicode(asent.id()))
else:
non_O_chan = chan_name
non_O_tag = there_iob
@@ -174,7 +190,9 @@
if chan.is_head_at(tok_idx):
non_O_chan += '-H'
# B-NP, I-VP etc. or O
-class_label = 'O' if non_O_chan is None else '%s-%s' % (non_O_tag, non_O_chan)
+class_label = (
+    'O' if non_O_chan is None
+    else '%s-%s' % (non_O_tag, non_O_chan))
# generate training example and store to file
classify.write_example(tr_file, feat_vals, class_label)
classify.write_end_of_sent(tr_file)
@@ -208,7 +226,7 @@ class Chunker:
if model is not None:
chans = self.layer_channels[layer_idx]
for chan_name in chans:
-# ensure the channel is there and switch to IOB2 representation
+# ensure the channel is there and switch to IOB2 repr
if not asent.has_channel(chan_name):
asent.create_channel(chan_name)
chan = asent.get_channel(chan_name)
@@ -217,7 +235,7 @@
con = corpio.create_context(sent)
classify.open_sent(model)
# iterate over tokens
-for tok_idx, tok in enumerate(sent.tokens()):
+for tok_idx, _ in enumerate(sent.tokens()):
con.set_position(tok_idx)
feat_vals = [op.base_apply(con)
.to_compact_string(self.tagset).decode('utf-8')
@@ -226,7 +244,7 @@
classify.close_sent(model)
last_iobs = {}
-for tok_idx, tok in enumerate(sent.tokens()):
+for tok_idx, _ in enumerate(sent.tokens()):
decsn = classify.classify_token(model, tok_idx)
non_O_chan = None
non_O_tag = 'O'
@@ -238,11 +256,17 @@
elif len(decsn_array) == 3:
non_O_tag, non_O_chan, is_head = decsn_array
else:
-raise IOError('Unexpected label returned from classifier: ' + decsn)
+raise IOError(
+    'Unexpected label returned from classifier: '
+    + decsn)
for chan_name in chans:
chan = asent.get_channel(chan_name)
-tag_to_set = 'O' if chan_name != non_O_chan else non_O_tag
-if tag_to_set == "I" and (not last_iobs.has_key(chan_name) or last_iobs[chan_name] == "O"):
+tag_to_set = (
+    'O' if chan_name != non_O_chan
+    else non_O_tag)
+if tag_to_set == "I" and (
+        not last_iobs.has_key(chan_name)
+        or last_iobs[chan_name] == "O"):
tag_to_set = 'B'
if tag_to_set == 'B':
head_idx = None
@@ -256,7 +280,8 @@
self.stats.num_sents += 1
self.stats.num_toks += sent.tokens().size()
-if self.verbose: self.stats.maybe_report()
+if self.verbose:
+    self.stats.maybe_report()
def tag_input(self, in_path, out_path, input_format, output_format,
preserve_pars):
......
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,6 +12,16 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Collection of functions that provide access to CRF++ classifier.
This includes two scenarios: using a trained classifier as well
as classifier training.
To use a trained classifier model, load it first, then iterate over the
sentences to classify. Process each sentence as follows: call open_sent,
feed each sentence token to eat_token, then call close_sent. Finally,
gather the token-level classifier decisions with classify_token (one call
per token)."""
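A sketch of that per-sentence protocol, using the function names from this module (conf, model_name, data_dir, chan_name and sent_features are placeholders):

    model = load(conf, model_name, data_dir, chan_name)
    open_sent(model)
    for feat_vals in sent_features:  # one list of feature strings per token
        eat_token(model, feat_vals)
    close_sent(model)
    # one decision string per token, e.g. 'B-chunk_np' or 'O'
    labels = [classify_token(model, i) for i in xrange(len(sent_features))]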
import CRFPP # CRF++ Python wrapper
import subprocess, os # running crf_learn
import codecs
@@ -21,33 +31,41 @@ import config, corpio
DATA_SEP = '\t'
def open_tr_files(model_name, data_dir, layers):
"""Open files for storing training examples.
Returns a map layer_name -> open file handle."""
tr_files = {}
for layer in layers:
-tr_files[layer] = codecs.open(corpio.f_name(model_name, data_dir,
-    config.EXT_DATA, layer), 'wb', 'utf-8')
+tr_files[layer] = codecs.open(
+    corpio.f_name(
+        model_name, data_dir,
+        config.EXT_DATA, layer),
+    'wb', 'utf-8')
return tr_files
def close_tr_files(tr_files):
"""Close all training files. Should be called after storing all training
examples, before training classifiers."""
for chan in tr_files:
tr_files[chan].close()
def write_example(tr_file, feat_vals, class_label):
"""Writes a training example in simple tab-separated format."""
"""Write a training example in simple tab-separated format."""
tr_file.write(DATA_SEP.join(feat_vals))
tr_file.write(DATA_SEP)
tr_file.write(class_label)
tr_file.write('\n')
def write_end_of_sent(tr_file):
"""Writes end-of-sentence marker to the training file."""
"""Write end-of-sentence marker to the training file."""
tr_file.write('\n')
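# For illustration (hypothetical values): write_example(f, [u'dom', u'subst'],
# 'B-chunk_np') stores the line
#     dom<TAB>subst<TAB>B-chunk_np
# and write_end_of_sent emits the blank line CRF++ uses as a sentence boundary.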
def train_and_save(conf, model_name, config_dir, data_dir, chan_name):
"""Trains a CRF classifier for the given chan_name. The trained model
"""Train a CRF classifier for chan_name. The trained model
is saved to filenames (generated using model_name and conf)."""
tr_fname = corpio.f_name(model_name, data_dir, config.EXT_DATA, chan_name)
cr_fname = corpio.f_name(model_name, data_dir, config.EXT_CR, chan_name)
-cr_template = corpio.f_name(model_name, config_dir, config.EXT_TEXT, chan_name)
+cr_template = corpio.f_name(
+    model_name, config_dir, config.EXT_TEXT, chan_name)
crf_opts = conf.get(config.S_CLASSIFIER, config.O_PARAMS)
# run crf_learn
args = ['crf_learn', cr_template, tr_fname, cr_fname]
@@ -56,20 +74,20 @@ def train_and_save(conf, model_name, config_dir, data_dir, chan_name):
retval = subprocess.call(args,
stdout = fnull, stderr = fnull)
if retval != 0:
-raise IOError('Training CRF++ FAILED. Check .tab file for data validity. Call: %s' % ' '.join(args))
+raise IOError(
+    # this is one string, just avoiding long lines
+    'Training CRF++ FAILED. Check .tab file for data validity.'
+    ' Call: %s' % ' '.join(args))
def load(conf, model_name, data_dir, chan_name):
"""Tries to load a stored classifier.
If doesn't exist, will return None."""
"""Try to load a stored classifier. If doesn't exist, will return None."""
cr_fname = corpio.f_name(model_name, data_dir, config.EXT_CR, chan_name)
if os.path.isfile(cr_fname):
return CRFPP.Tagger('-m %s' % cr_fname)
raise IOError('can\'t open classifier from %s' % cr_fname)
def open_sent(crf_obj):
"""
Notify the trained classifier than a new sentence will be classified.
"""
"""Notify the trained classifier than a new sentence will be classified."""
crf_obj.clear()
def eat_token(crf_obj, feat_vals):
......
@@ -2,9 +2,9 @@
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,11 +12,15 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Constants related to section names in config files, WCCL section names
as well as default file extensions."""
# global settings
S_GLOBAL = 'general'
O_TAGSET = 'tagset'
O_CHANNELS = 'channels' # comma separated list of channel/phrase names???
-O_TAGGED = 'tagged' # yes/no: shall we expect tagged (yes) or ambiguous (no) input
+# yes/no: shall we expect tagged (yes) or ambiguous (no) input
+O_TAGGED = 'tagged'
S_LAYERS = 'layers'
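# A hypothetical minimal config using these names (values are illustrative,
# not taken from a shipped config):
#   [general]
#   tagset = nkjp
#   tagged = yes
#   [layers]
#   chunks = chunk_np,chunk_vp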
......
@@ -2,9 +2,9 @@
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -12,6 +12,13 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Routines related to I/O and feature generation.
This includes reading of files referenced by configs, reading and
writing of annotated text (corpora), parsing of WCCL expressions
that define features for classification, and generating textual
representations of feature values.
"""
# SWIG bug workaround: loading multiple SWIG modules brought unwrapped
# swig::stop_iteration exceptions
import ctypes, sys
@@ -35,13 +42,15 @@ import codecs, os
_ROOT = os.path.abspath(os.path.dirname(__file__))
format_help = """
-Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
+Available input formats: """ + ' '.join(
+    corpus2.TokenReader.available_reader_types()) + """
""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
-Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
+Available output formats: """ + ' '.join(
+    corpus2.TokenWriter.available_writer_types()) + """
""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())
def get_data(path):
"""Tries to resolve path to the given subdir, trying the path locally
"""Try to resolve path to the given subdir, trying the path locally
and then in the install site."""
if os.path.exists(path):
return path
@@ -51,17 +60,21 @@ def get_data(path):
raise IOError('can\'t locate %s, tried locally and %s' % (path, in_data))
def f_name(model_name, subdir, ext, suff = ''):
"""Gets the filename based on model_name having the given
"""Get the filename based on model_name having the given
extension. Optionally, you can specify name suffix."""
-base = (model_name + '-' + suff + '.' + ext) if suff else (model_name + '.' + ext)
+base = (
+    (model_name + '-' + suff + '.' + ext) if suff
+    else (model_name + '.' + ext))
return os.path.join(subdir, base)
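# For illustration: f_name('kpwr', 'model_dir', 'tab', 'chunk_np') returns
# 'model_dir/kpwr-chunk_np.tab' on POSIX (argument values are hypothetical).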
def get_tagset(conf):
"""Returns a corpus2.Tagset object corresponding to the tagset
specified in the given configuration."""
return corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
def get_reader(in_path, tagset, input_format, read_disamb_only):
"""Creates a reader using the options. If in_path evaluates to False,
"""Create a reader using the options. If in_path evaluates to False,
will create a stdin reader. Set read_disamb_only to force reading only
'disamb' lexemes/interpretations."""
fixd_format = input_format
@@ -78,16 +91,17 @@ def get_reader(in_path, tagset, input_format, read_disamb_only):
return corpus2.TokenReader.create_stdin_reader(fixd_format, tagset)
def get_writer(out_path, tagset, output_format):
"""Creates a writer using the options. If out_path evaluates to False,
"""Create a writer using the options. If out_path evaluates to False,
will create a stdout writer."""
if out_path:
-return corpus2.TokenWriter.create_path_writer(output_format, out_path,
-    tagset)
+return corpus2.TokenWriter.create_path_writer(
+    output_format, out_path, tagset)
else:
-return corpus2.TokenWriter.create_stdout_writer(output_format, tagset)
+return corpus2.TokenWriter.create_stdout_writer(
+    output_format, tagset)
def op_list(wccl_file, sec_name):
"""Retrieves a list of operators corresponding to a named section from
"""Retrieve a list of operators corresponding to a named section from
the given WCCL file. If section not present, will return an empty list."""
ops = []
if wccl_file.has_untyped_section(sec_name):
@@ -97,17 +111,21 @@ def op_list(wccl_file, sec_name):
return ops
def get_wccl_ops(conf, model_name, wccl_dir, lex_dir, chan_names):
"""Returns a pair: WCCL op list, that is a list of WCCL operator lists
"""Return a pair: WCCL op list, that is a list of WCCL operator lists
corresponding to the given channel names. Each list may consists of two
parts: the default operators and channel-specific operators
(theoretically both may be empty)."""
wccl_file_path = f_name(model_name, wccl_dir, config.EXT_WCCL)
-tagset = corpus2.get_named_tagset(conf.get(config.S_GLOBAL, config.O_TAGSET))
-wccl_file = wccl.Parser(tagset).parseWcclFileFromPath(wccl_file_path, lex_dir)
+tagset = corpus2.get_named_tagset(
+    conf.get(config.S_GLOBAL, config.O_TAGSET))
+wccl_file = wccl.Parser(tagset).parseWcclFileFromPath(
+    wccl_file_path, lex_dir)
def_ops = op_list(wccl_file, config.DEFAULT_OPS)
-chan_ops = [def_ops + op_list(wccl_file, chan_name) for chan_name in chan_names]
+chan_ops = [
+    def_ops + op_list(wccl_file, chan_name)
+    for chan_name in chan_names]
return chan_ops
def create_context(sent):
"""Wraps the sentence as SentenceContext to be used with WCCL."""
"""Wrap the sentence as SentenceContext to be used with WCCL."""
return wccl.SentenceContext(sent)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Adam Radziszewski. Part of IOBBER.
# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
# This program is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
@@ -13,6 +13,8 @@
#
# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Simple command-line interface for IOBBER to operate on tagged text."""
import sys
from optparse import OptionParser
@@ -41,10 +43,13 @@ material. Trained model will be stored in DATA_DIR.
def lines(pathfilename):
"""Read all lines from the given filename without leading and trailing
whitespaces and newlines."""
with open(pathfilename) as f:
return [line.strip() for line in f if line.strip()]
def go():
"""Run command-line parsing and call appropriate Chunker functions."""
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='ccl',
@@ -57,7 +62,8 @@ def go():
help='set output filename (do not write to stdout)')
parser.add_option('-d', '--data-dir', type='string', action='store',
dest='data_dir', default='',
-help='use the given directory to look for the trained model (or save the model when training)')
+help='use the given directory to look for the trained model'
+     ' (or save the model when training)')
parser.add_option('--sent-only', action='store_false',
dest='preserve_chunks', default=True,
help='process sentences, ignoring division into paragraphs')
@@ -81,7 +87,11 @@ def go():
options.verbose, options.input_format, options.output_format,
options.preserve_chunks, options.batch, options.is_training)
-def main(config_fname, files, out_path, data_dir, verbose, input_format, output_format, preserve_chunks, batch, is_training):
+def main(
+        config_fname, files, out_path, data_dir, verbose,
+        input_format, output_format, preserve_chunks, batch, is_training):
"""Create a chunker using function arguments and use it to get all the
input parts processed."""
tagr = chunker.Chunker(config_fname, data_dir,
verbose = verbose)
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
-# This file is part of IOBBER
-# Copyright (C) 2011 Adam Radziszewski, Paweł Orłowicz.
-# IOBBER is free software; you can redistribute and/or modify it
-# under the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation; either version 3 of the License, or (at your option)
-# any later version.
+# Copyright (C) 2011 Adam Radziszewski. Part of IOBBER.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 3 of the License,
+# or (at your option) any later version.
#
-# IOBBER is distributed in the hope that it will be useful, but
+# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
#
-# See the LICENCE and COPYING files for more details
+# See the LICENCE, COPYING.LESSER and COPYING files for more details
"""Command-line interface that allows to process plain text with IOBBER
(plain text will be automatically tagged using WCRFT and fed into IOBBER)."""
import sys
from optparse import OptionParser
@@ -22,38 +26,40 @@ from wcrft import corpio as tagger_io
import chunker
#ioformats = corpio.format_help
#ioformats = '\nINFO: formats: txt premorph; require installed Maca and Morfeusz' + ioformats.replace('input formats: ccl', 'input formats: txt premorph ccl')
descr = """%prog [options] [input_files]
IOBBER, configurable chunker.
(C) 2012, Wroclaw University of Technology
-Processes input files through the tagger (WCRFT, must be installed) and the chunker.
-By default input is assumed to be plain text (UTF-8) and writes output in the CCL format.
+Processes input files through the tagger (WCRFT, must be installed) and the
+chunker. By default, input is assumed to be plain text (UTF-8) and output is
+written in the CCL format.
Use -c to set IOBBER (chunker) config and -C to set IOBBER trained model.
Use -w to set WCRFT (tagger) config and -W to set WCRFT trained model.
-The default values for -c and -w are recommended, but you may need to set trained model
-directories (-C and -W).
+The default values for -c and -w are recommended, but you may need to set
+trained model directories (-C and -W).
Use -O to specify output path (by default will write to stdout).
Use - to process stdin to stdout.
-When processing multiple files, either give the filenames directly as arguments,
-or use --batch and provide a filename to a list of paths. Either way, each file
-will be chunked and the output writted to FILENAME.chunked.
+When processing multiple files, either give the filenames directly as
+arguments, or use --batch and provide the name of a file listing input
+paths. Either way, each file will be chunked and the output written to
+FILENAME.chunked.
"""
def lines(pathfilename):
"""Read all lines from the given filename without leading and trailing
whitespaces and newlines."""
with open(pathfilename) as f:
return [line.strip() for line in f if line.strip()]
def go():
"""Run command-line parsing and call appropriate WCRFT
and Chunker functions."""
parser = OptionParser(usage=descr)
parser.add_option('-i', '--input-format', type='string', action='store',
dest='input_format', default='txt',
@@ -72,13 +78,15 @@ def go():
help='use given chunker config (default: kpwr.ini)')
parser.add_option('-C', '--chunker-model', type='string', action='store',
dest='chunker_dir', default='model-kpwr11-H',
-help='read chunker trained model from the given dir (default: model-kpwr11-H)')
+help='read chunker trained model from the given dir'
+     ' (default: model-kpwr11-H)')
parser.add_option('-w', '--tagger-config', type='string', action='store',
dest='tagger_config', default='nkjp_s2.ini',
help='use given tagger (wcrft) config (default: nkjp_s2.ini)')
parser.add_option('-W', '--tagger-model', type='string', action='store',
dest='tagger_dir', default='model_nkjp10_wcrft_s2',
-help='read tagger (wcrft) trained model from the given dir (default: model_nkjp10_wcrft_s2)')
+help='read tagger (wcrft) trained model from the given dir'
+     ' (default: model_nkjp10_wcrft_s2)')
parser.add_option('-m', '--maca-config', type='string', action='store',
dest='maca_config', default='',
help='override maca config file specified in tagger config')
@@ -100,6 +108,8 @@ def main(files, tagger_config, tagger_dir, shall_chunk,
chunker_config, chunker_dir, maca_config,
batch, out_path, verbose,
input_format, output_format):
"""Create a Tagger (WCRFT) and a Chunker (IOBBER) object
and get all the input parts processed according to function args."""
tagr = tagger.Tagger(tagger_config, tagger_dir)
if shall_chunk:
@@ -136,7 +146,8 @@ def main(files, tagger_config, tagger_dir, shall_chunk,
for in_path, out_path in zip(inputs, outputs):
if in_path and verbose:
sys.stderr.write('Processing %s...\n' % in_path)
-reader = tagger_io.get_reader(in_path, tagr.tagset, input_format, tagr.maca_config)
+reader = tagger_io.get_reader(
+    in_path, tagr.tagset, input_format, tagr.maca_config)
writer = tagger_io.get_writer(out_path, tagr.tagset, output_format)
while True:
par = reader.get_next_chunk() # here `chunk' denotes paragraph
......