diff --git a/config/kpwr.ini b/config/kpwr.ini index 0b4f6ff6d0eb1b791fc0080ec10c73a3cd8d9431..0ce8498002f00552a2a61374c18e94dceb6235a7 100644 --- a/config/kpwr.ini +++ b/config/kpwr.ini @@ -1,9 +1,23 @@ -; Configuration for chunking phrases defined in KPWr: +; Configuration for chunking phrases defined in KPWr, assuming NKJP tagset. +; +; Syntactic chunks are divided into two "layers". +; 1. Pred-arg chunks: ; * chunk_np (noun phrases), ; * chunk_adjp (top-level adjective phrases), -; * chunk_vp (verb phrases without nominal arguments), -; * chunk_agp (simple agreement-based noun or adj phrases, level on its own). -; The config assumes NKJP tagset. +; * chunk_vp (verb phrases without nominal arguments). +; 2. Low-level phrases based on agreement: +; * chunk_agp (simple agreement-based noun or adj phrases). +; +; Chunks in one layer are disjoint (if they would overlap in the training data, +; a warning would be issued during training, while the resulting chunker +; will not produce any overlaps between one-layer chunks anyway). + +; The chunker is unable to annotate discontinuous chunks. If such cases +; appear in the training data (which is the case in KPWr), each continuous +; part is treated as a separate chunk. Note that it may be altered in the +; future. +; The chunker is also unable to recognise heads. They may be annotated after +; chunking with a dedicated script. [general] tagset = nkjp @@ -11,6 +25,7 @@ tagged = yes [layers] ; the layer ordering is inferred from alphabetical order of their names! +; channel names should contain no hyphens layer1 = chunk_agp layer2 = chunk_vp,chunk_np,chunk_adjp diff --git a/iobber/chunker.py b/iobber/chunker.py index 1ba5019e074ae6f2624e731ec8c22aabe85390cb..50aeb2608d295bdc9411d23ad2f214a4ae5ba7ba 100644 --- a/iobber/chunker.py +++ b/iobber/chunker.py @@ -58,7 +58,8 @@ class Chunker: """The CRF-based chunker. The chunker may add annotations to multiple channels during one run, as specified in layer definitions. 
Layers are applied sequentially. A layer defines a set of channels - that are dealt with at a time. + that are dealt with at a time. The chunks defined in one layer are + disjoint. A chunker is parametrised with an INI file, defining layers and settings and a WCCL file defining features to be used by the underlying classifier. A new chunker object should be called either load_model to become a