Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
toki
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
toki
Commits
ebda6633
Commit
ebda6633
authored
Oct 21, 2011
by
Adam Radziszewski
Browse files
Options
Downloads
Patches
Plain Diff
slightly changed config for nkjp
parent
a7763da6
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
config/nkjp.ini
+143
-0
143 additions, 0 deletions
config/nkjp.ini
with
143 additions
and
0 deletions
config/nkjp.ini
0 → 100644
+
143
−
0
View file @
ebda6633
; Configuration for tokenising Polish customised for compatibility with
; the NKJP corpus. NOTE: the compatibility is not guaranteed, it is just our
; intention.
; Includes keeping together abbreviations with inflectional endings,
; abbreviation lexicon (so far almost empty), URLs and numbers.
; Employed token types:
; t -- word token or letters mixed with digits
; p -- punctiation or symbols
; a -- abbreviation ended with dot, possibly multipart (m.in.)
; th -- inflected acronym (acronym + hyphen + inflectional suffix)
; n -- integer
; n_f -- decimal fraction
; n_d -- date (dot- or hyphen-separated)
; n_t -- time (dot- or colon-separated)
; n_ip -- IP number
; tm -- e-mail address
; tu -- URL address
; ts -- symbol (not the above and something beside letters and apostrophe)
[input]
token_type
=
t
srx
=
segment.srx
srx_language
=
pl_two
initial_whitespace
=
newline
[layer:exc_0]
; always split on symbols (S) and punctuation (P)
; EXCEPT for two hyphens and ':;.,@$+~=/?&%#~_
; these exceptions are for URLs, e-mails, numbers and hyphenated inflected abbrevs
; NOTE: hyphens not included
class
=
split
separators
=
[[
\p
{S}
\p
{P}]-[
\u
002d
\u
2010 ' : ;
\.
, @
\$
\+
~
\=
/
\?
\&
% # ~ _]]
separator_token_type
=
p
[layer:suff_safe]
; opening and closing punctuation, apostrophe, comma, colon, semicolon, quotation and exclamation mark
; detach these as suffixes
class
=
affix_split
process_types
=
t
prefix_chars
=
[
\p
{Ps} ']
suffix_chars
=
[
\p
{Pe} ' , : ;
\?
!]
suffix_token_type
=
p
prefix_token_type
=
p
[layer:a_classify]
; abbrev regex classification
class
=
regexp
process_types
=
t
; consonant-only abbrevs (at least two letter long)
type:
a
=
[BCĆDFGHJKLŁMNŃPRSŚTVWXZŹŻbcćdfghjklłmnńprsśtvwxzźż][bcćdfghjklłmnńprsśtvwxzźż]+
\.
; multipart abbrevs (m.in., op.cit.)
type:
a
=
\p
{Ll}{1,6}
\.
(
\p
{Lowercase}{1,6}
\.
)+
[layer:a_lexicon]
; recognise abbrevs listed in the lexicon
class
=
lexicon_caseless
process_types
=
t
token_type
=
a
lexicon_file
=
abbrevs.txt
[layer:suff_dot]
; detach token-final dots, each separately (contrary to the standard config, where dots are joined)
process_types
=
t
class
=
affix_split
suffix_chars
=
.
suffix_token_type
=
p
prefix_token_type
=
p
[layer:tu_classify]
; recognise likely e-mails and URLs
class
=
regexp
process_types
=
t
type:
tm
=
[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+
\.
[a-zA-Z]{2,4}
type:
tu
=
(([a-zA-Z]+)://)?((([a-zA-Z0-9-]+)(
\.
([a-zA-Z0-9-]+))*
\.
[a-zA-Z]{2,6})|((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)
\.
){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)))(:[0-9]{1,5})?(/[a-zA-Z0-9.,;
\?
|
\'
+&%
\$
#=~_-]+)*[/]?
[layer:exc_1]
; always split on symbols (S) and punctuation (P)
; EXCEPT for two hyphens and ':.,
; these exceptions are left for numbers and inflected abbrevs
class
=
split
process_types
=
t
separators
=
[[
\p
{S}
\p
{P}]-[
\u
002d
\u
2010 ' :
\.
,]]
separator_token_type
=
p
[layer:n_classify]
; recognise numbers, date, time and IP numbers
class
=
regexp
process_types
=
t
type:
n
=
[0-9]+
type:
n_f
=
[0-9]+,[0-9]+
type:
n_d
=
(((0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.]((1|2)
\d
)?
\d\d
)|((((1|2)
\d
)?
\d\d\d\d
)[-/.](0[1-9]|1[012])[-/.](0[1-9]|[12][0-9]|3[01])))
type:
n_t
=
(24(:|[.])00|([01]?[0-9]|2[0-3])(:|[.])[0-5][0-9])((:|[.])[0-5][0-9])?
type:
n_ip
=
([0-9]{1,3})(
\.
[0-9]{1,3}){3}
[layer:exc_2]
; now always split on :.,
; two hyphens
; these exceptions are left for inflected abbrevs
class
=
split
process_types
=
t
separators
=
:.,
separator_token_type
=
p
[layer:th_classify]
; PRL-u etc. -> th
; other regex classification
class
=
regexp
process_types
=
t
type:
th
=
\p
{L}+['
\u
002d
\u
2010](?i:ą|ią|ją|etą|otą|tą|ami|etami|otami|tami|iami|jami|em|iem|jem|etem|otem|tem|owie|iowie|jowie|etowie|otowie|towie|owi|iowi|jowi|etowi|otowi|towi|om|iom|jom|etom|otom|tom|ę|ię|ję|etę|otę|tę|ach|iach|jach|etach|otach|tach|a|ia|ja|eta|ota|ta|u|etu|otu|tu|iu|ju|e|ie|ze|zie|ecie|cie|rze|je|ocie|i|eci|oci|ci|ii|ji|y|ety|oty|ty|ów|etów|otów|tów|iów|jów|owiec|etowiec|otowiec|towiec|iowiec|jowiec|wiec|owca|etowca|otowca|towca|iowca|jowca|wca|owcowi|etowcowi|otowcowi|towcowi|iowcowi|jowcowi|wcowi|owcem|etowcem|otowcem|towcem|iowcem|jowcem|wcem|owcu|etowcu|otowcu|towcu|iowcu|jowcu|wcu|owcy|etowcy|otowcy|towcy|iowcy|jowcy|wcy|owców|etowców|otowców|towców|iowców|jowców|wców|owcom|etowcom|otowcom|towcom|iowcom|jowcom|wcom|owcami|etowcami|otowcami|towcami|iowcami|jowcami|wcami|owcach|etowcach|otowcach|towcach|iowcach|jowcach|wcach|ówka|etówka|otówka|tówka|iówka|jówka|wka|ówce|etówce|otówce|tówce|iówce|jówce|wce|ówkę|etówkę|otówkę|tówkę|iówkę|jówkę|wkę|ówką|etówką|otówką|tówką|iówką|jówką|wką|ówko|etówko|otówko|tówko|iówko|jówko|wko|ówki|etówki|otówki|tówki|iówki|jówki|wki|ówa|etówa|otówa|tówa|iówa|jówa|wa|ówie|etówie|otówie|tówie|iówie|jówie|wie|ówę|etówę|otówę|tówę|iówę|jówę|wę|ówą|etówą|otówą|tówą|iówą|jówą|wą|ówo|etówo|otówo|tówo|iówo|jówo|wo|ówy|etówy|otówy|tówy|iówy|jówy|wy|ówek|etówek|otówek|tówek|iówek|jówek|wek|ówkom|etówkom|otówkom|tówkom|iówkom|jówkom|wkom|ówkach|etówkach|otówkach|tówkach|iówkach|jówkach|wkach|ówkami|etówkami|otówkami|tówkami|iówkami|jówkami|ówom|etówom|otówom|tówom|iówom|jówom|wom|ówach|etówach|otówach|tówach|iówach|jówach|wach|ówami|etówami|otówami|tówami|iówami|jówami|emu|mu|ego|go|ym|im|m|owski|etowski|otowski|towski|wski|iowski|jowski|wski|owskiego|etowskiego|otowskiego|towskiego|wskiego|iowskiego|jowskiego|wskiego|owskiemu|etowskiemu|otowskiemu|towskiemu|wskiemu|iowskiemu|jowskiemu|wskiemu|owskim|etowskim|otowskim|towskim|wskim|iowskim|jowskim|wskim|owscy|etowscy|otowscy|towscy|wscy|iowscy|jowscy|wscy|owskich|etowskich|otowskich|towskich|wskich|iowskich|jowskich|wskich|owskimi|etowskimi|otowskimi|towskimi|wskimi|iowskimi|jowskimi|owska|etowska|otowska|towska|wska|iowska|jowska|owskiej|etowskiej|otowskiej|towskiej|wskiej|iowskiej|jowskiej|owską|etowską|otowską|towską|wską|iowską|jowską|owskie|etowskie|otowskie|towskie|wskie|iowskie|jowskie|owy|etowy|otowy|towy|wy|iowy|jowy|owego|etowego|otowego|towego|wego|iowego|jowego|owemu|etowemu|otowemu|towemu|wemu|iowemu|jowemu|owym|etowym|otowym|towym|wym|iowym|jowym|owi|etowi|otowi|towi|wi|iowi|jowi|owych|etowych|otowych|towych|wych|iowych|jowych|owymi|etowymi|otowymi|towymi|wymi|iowymi|jowymi|owa|etowa|otowa|towa|wa|iowa|jowa|owej|etowej|otowej|towej|wej|iowej|jowej|ową|etową|otową|tową|wą|iową|jową|owe|etowe|otowe|towe|we|iowe|jowe)
[layer:hyphen]
; now get the remaining hyphens (hyphen-minus, hyphen proper) separately (contrary to the main config)
class
=
split
process_types
=
t
separators
=
\u
002d
\u
2010
separator_token_type
=
p
[layer:ts_classify]
; PRL-u etc. -> th
; other regex classification
class
=
regexp
process_types
=
t
type:
ts
=
\p
{L}*
\P
{L}.*
[layers]
layer
=
exc_0
layer
=
suff_safe
layer
=
a_lexicon
layer
=
a_classify
layer
=
suff_dot
layer
=
tu_classify
layer
=
exc_1
layer
=
n_classify
layer
=
exc_2
layer
=
th_classify
layer
=
hyphen
layer
=
ts_classify
[debug]
format
=
$orth/$type:$ws
\n
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment