diff --git a/iobber/data/kpwr-notag-layer1.txt b/iobber/data/kpwr-notag-layer1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e5dddb4e6bf41401d0f82e6d1a32ab3bab975d --- /dev/null +++ b/iobber/data/kpwr-notag-layer1.txt @@ -0,0 +1,61 @@ +# Unigram +# orth +U00:%x[-2,0] +U01:%x[-1,0] +U02:%x[0,0] +U03:%x[1,0] +U04:%x[2,0] +U05:%x[-1,0]/%x[0,0] +U06:%x[0,0]/%x[1,0] + +# class +U10:%x[-2,1] +U11:%x[-1,1] +U12:%x[0,1] +U13:%x[1,1] +U14:%x[2,1] +U15:%x[-2,1]/%x[-1,1] +U16:%x[-1,1]/%x[0,1] +U17:%x[0,1]/%x[1,1] +U18:%x[1,1]/%x[2,1] + +# cas +U20:%x[-2,2] +U21:%x[-1,2] +U22:%x[0,2] +U23:%x[1,2] +U24:%x[2,2] + +# gnd +U30:%x[-2,3] +U31:%x[-1,3] +U32:%x[0,3] +U33:%x[1,3] +U34:%x[2,3] + +# nmb +U40:%x[-2,4] +U41:%x[-1,4] +U42:%x[0,4] +U43:%x[1,4] +U44:%x[2,4] + +# agr +U50:%x[-1,5] # agr(0,1) -> agr(-1,0) +U51:%x[0,5] # agr(0,1) +U52:%x[-1,6] # agr..(-1,1) -> agr(-2,0) +U53:%x[0,6] # (-1,1) +U54:%x[1,6] # ... -> (0,2) + +# regex feats +#U60:%x[-1,7]/%x[-1,8] +U61:%x[0,7]/%x[0,8] +#U62:%x[1,7]/%x[1,8] + +# wordclass trigrams +U80:%x[-2,1]/%x[-1,1]/%x[0,1] +U81:%x[-1,1]/%x[0,1]/%x[1,1] +U82:%x[0,1]/%x[1,1]/%x[2,1] + +# Bigram +B diff --git a/iobber/data/kpwr-notag-layer2.txt b/iobber/data/kpwr-notag-layer2.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e5dddb4e6bf41401d0f82e6d1a32ab3bab975d --- /dev/null +++ b/iobber/data/kpwr-notag-layer2.txt @@ -0,0 +1,61 @@ +# Unigram +# orth +U00:%x[-2,0] +U01:%x[-1,0] +U02:%x[0,0] +U03:%x[1,0] +U04:%x[2,0] +U05:%x[-1,0]/%x[0,0] +U06:%x[0,0]/%x[1,0] + +# class +U10:%x[-2,1] +U11:%x[-1,1] +U12:%x[0,1] +U13:%x[1,1] +U14:%x[2,1] +U15:%x[-2,1]/%x[-1,1] +U16:%x[-1,1]/%x[0,1] +U17:%x[0,1]/%x[1,1] +U18:%x[1,1]/%x[2,1] + +# cas +U20:%x[-2,2] +U21:%x[-1,2] +U22:%x[0,2] +U23:%x[1,2] +U24:%x[2,2] + +# gnd +U30:%x[-2,3] +U31:%x[-1,3] +U32:%x[0,3] +U33:%x[1,3] +U34:%x[2,3] + +# nmb +U40:%x[-2,4] +U41:%x[-1,4] +U42:%x[0,4] +U43:%x[1,4] +U44:%x[2,4] + +# agr +U50:%x[-1,5] # agr(0,1) -> agr(-1,0) 
+U51:%x[0,5] # agr(0,1) +U52:%x[-1,6] # agr..(-1,1) -> agr(-2,0) +U53:%x[0,6] # (-1,1) +U54:%x[1,6] # ... -> (0,2) + +# regex feats +#U60:%x[-1,7]/%x[-1,8] +U61:%x[0,7]/%x[0,8] +#U62:%x[1,7]/%x[1,8] + +# wordclass trigrams +U80:%x[-2,1]/%x[-1,1]/%x[0,1] +U81:%x[-1,1]/%x[0,1]/%x[1,1] +U82:%x[0,1]/%x[1,1]/%x[2,1] + +# Bigram +B diff --git a/iobber/data/kpwr-notag.ccl b/iobber/data/kpwr-notag.ccl new file mode 100644 index 0000000000000000000000000000000000000000..c97321ab0a7ad6dcd7f835d7bca8852d4d647190 --- /dev/null +++ b/iobber/data/kpwr-notag.ccl @@ -0,0 +1,23 @@ +@ "default" ( + orth[0]; // 0 + class[0]; // 1 + cas[0]; // 2 + gnd[0]; // 3 + nmb[0]; // 4 + agrpp(0,1,{nmb,gnd,cas}); // 5 + and(inside(-1), inside(1), wagr(-1,1,{nmb,gnd,cas})); // 6 + regex(orth[0], "\\P{Ll}.*"); regex(orth[0], "\\P{Lu}.*") // 7, 8 +) + +/* +@ "chunk_np" ( + iob(0, "chunk_agp"); // 9 + iob(0, "chunk_vp") // 10 +) + +@ "chunk_adjp" ( + iob(0, chunk_agp), // 9 + iob(0, chunk_vp), // 10 + iob(0, chunk_np) // 11 +) +*/ diff --git a/iobber/data/kpwr-notag.ini b/iobber/data/kpwr-notag.ini new file mode 100644 index 0000000000000000000000000000000000000000..b0311cc8d3419a415b1435863578055b9d0d0eca --- /dev/null +++ b/iobber/data/kpwr-notag.ini @@ -0,0 +1,37 @@ +; Configuration for chunking phrases defined in KPWr, assuming NKJP tagset. +; This configuration operates on morphologically analysed input, without +; disambiguation. The performance of this config is significantly worse than +; kpwr.ini, which assumes the input has been fully tagged. +; +; Syntactic chunks are divided into two "layers". +; 1. Pred-arg chunks: +; * chunk_np (noun phrases), +; * chunk_adjp (top-level adjective phrases), +; * chunk_vp (verb phrases without nominal arguments). +; 2. Low-level phrases based on agreement: +; * chunk_agp (simple agreement-based noun or adj phrases). 
+; +; Chunks in one layer are disjoint (if they overlap in the training data, +; a warning is issued during training, and the resulting chunker +; will not produce any overlaps between chunks of one layer anyway). + +; The chunker is unable to annotate discontinuous chunks. If such cases +; appear in the training data (which is the case in KPWr), each continuous +; part is treated as a separate chunk. Note that this behaviour may change in the +; future. +; The chunker is also unable to recognise heads. They may be annotated after +; chunking with a dedicated script. + +[general] +tagset = nkjp +tagged = no + +[layers] +; the layer ordering is inferred from alphabetical order of their names! +; channel names should contain no hyphens +layer1 = chunk_agp +layer2 = chunk_vp,chunk_np,chunk_adjp + +[crf] +params = -a CRF-L2 +