From 3fb07c544ccd67f2967a3e1ec2f8c77a7469eeaf Mon Sep 17 00:00:00 2001 From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl> Date: Mon, 22 Oct 2012 13:04:36 +0200 Subject: [PATCH] iobber_txt help msg: default values --- README | 29 ++++++++++++++++++++--------- iobber/iobber_txt.py | 2 +- setup.py | 3 ++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/README b/README index 6e6aa4e..b2b68a9 100644 --- a/README +++ b/README @@ -3,20 +3,22 @@ IOBBER -- a chunker for Slavic languages based on CRF++ and WCCL. This is free software. See LICENCE for details. -The chunker reads input file(s) and adds chunk annotation. By default, the -input and output format is assumed to be CCL. This may be altered by using -i -and -o options. The following formats are supported: +The chunker reads input file(s) and adds chunk annotation. It is also able +to recognise chunks' syntactic heads. + +By default, the input and output format is assumed to be CCL. This may be +altered by using -i and -o options. The following formats are supported: * xces -- morphosyntactically annotated document divided into sentences, tokens and usually paragraphs (by default this division is assumed; if the input is not divided into paragraph or the existing division should be ignored, use --sent-only); this is the XCES variant as used in the IPI PAN Corpus of Polish (korpus.pl); * ccl -- a simple modification to the above format that allows to include -chunk-style annotations; the specs may be found at: +chunk-style annotations and their heads; the specs may be found at: http://nlp.pwr.wroc.pl/redmine/projects/corpus2/wiki/CCL_format * iob-chan -- a very simple format that allows to store morphosyntactic annotation (limited to one lemma,tag pair per token) and chunk-style -annotation per "channel". +annotation per "channel". The format doesn't support chunk heads. 
NOTE: the rest of the formats defined in corpus2 should theoretically work, but in case of any troubles it is safe to use maca-convert to pre-convert @@ -33,18 +35,19 @@ type as a separate channel, Iobber may treat several chunk types as one "layer", effectively treating them as one channel. This means that no chunks from a given layer may overlap. -The kpwr.ini config (and its trained model -- model-kpwr04) defines two layers: +The kpwr.ini config defines two layers: * layer1 with simple agreement-based noun/adj phrases: chunk_agp, * layer2 with phrases based on pred-arg structure: chunk_np, chunk_adjp and chunk_vp. -It is also possible to recognise chunks' syntactic heads using model-kpwr04-H +There are two trained models distributed with IOBBER: +* model-kpwr04-H: recognises chunks and their syntactic heads, +* model-kpwr04: recognises chunks but not their heads. NOTE: the current version of iobber is unable to recognise discontinuous chunks. -Both types of information may be included in ccl format and this is likely to be +Discontinuities, however, may be expressed in CCL format and this is likely to be supported in the future. - If the input file contains annotations in channels other than those defined in the config, they will be preserved (this makes it possible to e.g. chunk files already annotated with named entities). @@ -62,3 +65,11 @@ the NKJP tagset is employed. Iobber also supports chunking stdin to stdout, as well as chunking multiple files at a time, see -h for details. + +There is also a convenient tool provided, named iobber_txt, that lets you +process plain text directly. The tool has an additional requirement: the +WCRFT tagger must be installed +(http://nlp.pwr.wroc.pl/redmine/projects/wcrft/wiki) + +echo 'Polacy wciąż jadają zbyt mało ryb.' 
| iobber_txt - + diff --git a/iobber/iobber_txt.py b/iobber/iobber_txt.py index f00c0da..aee9f3d 100755 --- a/iobber/iobber_txt.py +++ b/iobber/iobber_txt.py @@ -72,7 +72,7 @@ def go(): help='use given chunker config (default: kpwr.ini)') parser.add_option('-C', '--chunker-model', type='string', action='store', dest='chunker_dir', default='model-kpwr04-H', - help='read chunker trained model from the given dir') + help='read chunker trained model from the given dir (default: model-kpwr04-H)') parser.add_option('-w', '--tagger-config', type='string', action='store', dest='tagger_config', default='nkjp.ini', help='use given tagger (wcrft) config (default: nkjp.ini)') diff --git a/setup.py b/setup.py index 8bc72ee..c4fc72d 100755 --- a/setup.py +++ b/setup.py @@ -20,7 +20,8 @@ setup(name='iobber', license='LGPL', entry_points={ 'console_scripts': [ - 'iobber = iobber.iobber:go' + 'iobber = iobber.iobber:go', + 'iobber_txt = iobber.iobber_txt:go' ] } ) -- GitLab