From 9136bc802eb4e58fcaabf3c440a85188e9bddfa3 Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Fri, 20 Apr 2012 15:05:29 +0200 Subject: [PATCH] iobber: comments --- config/kpwr.ini | 23 +++++++++++++++++++---- iobber/chunker.py | 3 ++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/config/kpwr.ini b/config/kpwr.ini index 0b4f6ff..0ce8498 100644 --- a/config/kpwr.ini +++ b/config/kpwr.ini @@ -1,9 +1,23 @@ -; Configuration for chunking phrases defined in KPWr: +; Configuration for chunking phrases defined in KPWr, assuming NKJP tagset. +; +; Syntactic chunks are divided into two "layers". +; 1. Pred-arg chunks: ; * chunk_np (noun phrases), ; * chunk_adjp (top-level adjective phrases), -; * chunk_vp (verb phrases without nominal arguments), -; * chunk_agp (simple agreement-based noun or adj phrases, level on its own). -; The config assumes NKJP tagset. +; * chunk_vp (verb phrases without nominal arguments). +; 2. Low-level phrases based on agreement: +; * chunk_agp (simple agreement-based noun or adj phrases). +; +; Chunks in one layer are disjoint (if they would overlap in the training data, +; a warning would be issued during training, while the resulting chunker +; will not produce any overlaps between one-layer chunks anyway). + +; The chunker is unable to annotate discontinuous chunks. If such cases +; appear in the training data (which is the case in KPWr), each continuous +; part is treated as a separate chunk. Note that it may be altered in the +; future. +; The chunker is also unable to recognise heads. They may be annotated after +; chunking with a dedicated script. [general] tagset = nkjp @@ -11,6 +25,7 @@ tagged = yes [layers] ; the layer ordering is inferred from alphabetical order of their names! +; channel names should contain no hyphens layer1 = chunk_agp layer2 = chunk_vp,chunk_np,chunk_adjp diff --git a/iobber/chunker.py b/iobber/chunker.py index 1ba5019..50aeb26 100644 --- a/iobber/chunker.py +++ b/iobber/chunker.py @@ -58,7 +58,8 @@ class Chunker: """The CRF-based chunker. The chunker may add annotations to multiple channels during one run, as specified in layer definitions. Layers are applied sequentially. A layer defines a set of channels - that are dealt with at a time. + that are dealt with at a time. The chunks defined in one layer are + disjoint. A chunker is parametrised with an INI file, defining layers and settings and a WCCL file defing features to be used by the underlying classifier. A new chunker object should be called either load_model to become a -- GitLab