diff --git a/iobber/data/nkjp-coarse-layer1.txt b/iobber/data/nkjp-coarse-layer1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e5dddb4e6bf41401d0f82e6d1a32ab3bab975d --- /dev/null +++ b/iobber/data/nkjp-coarse-layer1.txt @@ -0,0 +1,61 @@ +# Unigram +# orth +U00:%x[-2,0] +U01:%x[-1,0] +U02:%x[0,0] +U03:%x[1,0] +U04:%x[2,0] +U05:%x[-1,0]/%x[0,0] +U06:%x[0,0]/%x[1,0] + +# class +U10:%x[-2,1] +U11:%x[-1,1] +U12:%x[0,1] +U13:%x[1,1] +U14:%x[2,1] +U15:%x[-2,1]/%x[-1,1] +U16:%x[-1,1]/%x[0,1] +U17:%x[0,1]/%x[1,1] +U18:%x[1,1]/%x[2,1] + +# cas +U20:%x[-2,2] +U21:%x[-1,2] +U22:%x[0,2] +U23:%x[1,2] +U24:%x[2,2] + +# gnd +U30:%x[-2,3] +U31:%x[-1,3] +U32:%x[0,3] +U33:%x[1,3] +U34:%x[2,3] + +# nmb +U40:%x[-2,4] +U41:%x[-1,4] +U42:%x[0,4] +U43:%x[1,4] +U44:%x[2,4] + +# agr +U50:%x[-1,5] # agr(0,1) -> agr(-1,0) +U51:%x[0,5] # agr(0,1) +U52:%x[-1,6] # agr..(-1,1) -> agr(-2,0) +U53:%x[0,6] # (-1,1) +U54:%x[1,6] # ... -> (0,2) + +# regex feats +#U60:%x[-1,7]/%x[-1,8] +U61:%x[0,7]/%x[0,8] +#U62:%x[1,7]/%x[1,8] + +# wordclass trigrams +U80:%x[-2,1]/%x[-1,1]/%x[0,1] +U81:%x[-1,1]/%x[0,1]/%x[1,1] +U82:%x[0,1]/%x[1,1]/%x[2,1] + +# Bigram +B diff --git a/iobber/data/nkjp-coarse.ccl b/iobber/data/nkjp-coarse.ccl new file mode 100644 index 0000000000000000000000000000000000000000..b6c2d15177db01ce58232f52192d253bad069d9b --- /dev/null +++ b/iobber/data/nkjp-coarse.ccl @@ -0,0 +1,11 @@ +@ "default" ( + orth[0]; // 0 + class[0]; // 1 + cas[0]; // 2 + gnd[0]; // 3 + nmb[0]; // 4 + agrpp(0,1,{nmb,gnd,cas}); // 5 + and(inside(-1), inside(1), wagr(-1,1,{nmb,gnd,cas})); // 6 + regex(orth[0], "\\P{Ll}.*"); regex(orth[0], "\\P{Lu}.*") // 7, 8 +) + diff --git a/iobber/data/nkjp-coarse.ini b/iobber/data/nkjp-coarse.ini new file mode 100644 index 0000000000000000000000000000000000000000..0062c8d4e975a9f21b78b48e732c7738fce97a1a --- /dev/null +++ b/iobber/data/nkjp-coarse.ini @@ -0,0 +1,26 @@ +; Configuration for chunking of phrases taken from NKJP but subjected to merging. 
+; NKJP tagset. +; +; NP is merged from the following groups: +; * actual nominal groups (NG, NGadres, NGdata, NGgodz), +; * numeral groups (NumG*), +; * prepositional-nominal and prepositional-numeral groups (PrepNG, PrepNumG, PrepNGadres, PrepNGb, PrepNGdata, PrepNGgodz, PrepNGp). +; +; AdjP is taken from top-level adjective and prep-adj groups (TODO: enumerate). +; +; VP is taken from syntactic words having verbal classes. +; +; There is only one layer for all the groups. + +[general] +tagset = nkjp +tagged = yes + +[layers] +; the layer ordering is inferred from alphabetical order of their names! +; channel names should contain no hyphens +layer1 = chunk_vp,chunk_np,chunk_adjp + +[crf] +params = -a CRF-L2 +