From 9136bc802eb4e58fcaabf3c440a85188e9bddfa3 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Fri, 20 Apr 2012 15:05:29 +0200
Subject: [PATCH] iobber: comments

---
 config/kpwr.ini   | 23 +++++++++++++++++++----
 iobber/chunker.py |  3 ++-
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/config/kpwr.ini b/config/kpwr.ini
index 0b4f6ff..0ce8498 100644
--- a/config/kpwr.ini
+++ b/config/kpwr.ini
@@ -1,9 +1,23 @@
-; Configuration for chunking phrases defined in KPWr:
+; Configuration for chunking phrases defined in KPWr, assuming NKJP tagset.
+;
+; Syntactic chunks are divided into two "layers".
+; 1. Predicate-argument chunks:
 ; * chunk_np (noun phrases),
 ; * chunk_adjp (top-level adjective phrases),
-; * chunk_vp (verb phrases without nominal arguments),
-; * chunk_agp (simple agreement-based noun or adj phrases, level on its own).
-; The config assumes NKJP tagset.
+; * chunk_vp (verb phrases without nominal arguments).
+; 2. Low-level phrases based on agreement:
+; * chunk_agp (simple agreement-based noun or adj phrases).
+;
+; Chunks in one layer are disjoint. If chunks of one layer overlap in the
+; training data, a warning is issued during training; either way, the
+; resulting chunker never produces overlapping chunks within a layer.
+
+; The chunker is unable to annotate discontinuous chunks. If such cases
+; appear in the training data (which is the case in KPWr), each continuous
+; part is treated as a separate chunk. Note that this behaviour may change
+; in the future.
+; The chunker is also unable to recognise heads. They may be annotated after
+; chunking with a dedicated script.
 
 [general]
 tagset   = nkjp
@@ -11,6 +25,7 @@ tagged   = yes
 
 [layers]
 ; the layer ordering is inferred from alphabetical order of their names!
+; channel names should contain no hyphens
 layer1   = chunk_agp
 layer2   = chunk_vp,chunk_np,chunk_adjp
 
diff --git a/iobber/chunker.py b/iobber/chunker.py
index 1ba5019..50aeb26 100644
--- a/iobber/chunker.py
+++ b/iobber/chunker.py
@@ -58,7 +58,8 @@ class Chunker:
 	"""The CRF-based chunker. The chunker may add annotations to multiple
 	channels during one run, as specified in layer definitions.
 	Layers are applied sequentially. A layer defines a set of channels
-	that are dealt with at a time.
+	that are dealt with at a time. The chunks defined in one layer are
+	disjoint.
 	A chunker is parametrised with an INI file, defining layers and settings
 	and a WCCL file defing features to be used by the underlying classifier.
 	A new chunker object should be called either load_model to become a
-- 
GitLab