Commit a21c41e2 authored by mateuszg

remove unnecessary files

parent aa36223f
# An example of Spejd configuration file.
#
# In general, this file should be encoded in ASCII. The exception is
# the encoding of option values. Any file names should be encoded in
# the filesystem's encoding (they are not converted). The text-related
# options (acronymsAfter, acronymsBefore) should have the same encoding
# as set in inputEncoding.
#
# FILES LOCATION
# All paths in this file are relative to the location of this config file,
# except for absolute paths
# (on Unix-like systems starting with '/', on Windows starting with '<letter>:\' or '\').
#
# a file containing Spejd's grammar
# the Spejd rule syntax is explained in that example file
rules = rules.sr
# tagset used in grammar and input/output
# see that file for details on the format used
tagset = sample.cfg
#
# PROCESSING CHAIN
#
# list of tools to be executed between the reader and writer modules
# for Spejd with dictionary-based preprocessing
# (there may be multiple dictionary entries - with different names after the colon, see below)
processingChain = dictionary:example_dict spejd
# spejd preceded by the pantera tagger (Spejd must have pantera support built in)
# processingChain = pantera spejd
# spejd alone (the default)
# processingChain = spejd
# no tools (only the reader and writer) - can be used as a format converter
# processingChain =
# number of threads to use, 0 means autodetect (= number of detected CPUs)
maxThreads = 0
#
# INPUT
#
# inputType: auto|xcesAna|tei|txt
# auto chooses the reader based on the file name / extension:
# - *.txt/*.txt.gz = txt
# - morph.xml/morph.xml.gz = xcesAna
# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax
# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax
# with txt, and with tei without morphosyntax, the Morfeusz morphological analyzer is used (unless disabled)
inputType = auto
# encoding of input files (overrides any XML encoding declarations!)
#
# note: the acronymsAfter and acronymsBefore options, and the contents of various files
# such as ogonkifyFile, the Morfeusz disambiguation rules, or dictionaries, must be
# in this encoding too.
inputEncoding = UTF-8
# regexp describing the names of input files
# to look for when traversing directories given on the command line
# does not affect file names given explicitly on the command line
inputFiles = morph\.xml(\.gz)?|.*\.txt(\.gz)?|ann_morphosyntax\.xml(\.gz)?
# should any disambiguation found in the input be ignored?
ignoreDisamb = no
# Spejd can use XML id attributes available in the input. Sometimes this may cause
# problems (e.g. duplicate id values within the scope of a file), so it can be turned off
ignoreIDs = no
#
# OUTPUT
#
# format of the output file(s): tei|xcesAna|null
# null = for testing only, does not write anything
outputType = xcesAna
# can interpretations deleted by Spejd be discarded at will (yes)
# or should they be preserved for the final output (no)?
discardDeleted = no
# the suffix to be added to the target file name
outputSuffix = Sh.xml
# The core name of the output file. Depending on the output type,
# some infixes may be added between it and the output suffix.
# Caution: this option replaces the name of the input file.
# If the output suffix consists only of an extension equal to that of
# the input file (e.g. .xml for an XML input file), Spejd will
# overwrite the input files with the output.
#
# Leave empty or comment out to use the input file name instead.
# outputFilenameCore = ann
# If set to 'yes', Spejd will back up existing output files to <name>.bak
backupExistingFiles = no
# apply gzip compression to output?
compressOutput = no
# put <f>'s on a single line and omit empty sentences/paragraphs when writing tei?
compactTeiOutput = yes
# NKJP (National Corpus of Polish) compatibility mode:
# don't write <f name="interps"> and <fs type="lex"> in *_words.xml,
# assuming that there can be only one interpretation marked as "correct" for each token.
#
# It is the user's task to make sure that there are no tokens with multiple
# correct interpretations.
teiSingleSyntokInterp = no
# again, NKJP compatibility:
# place the group's head information inside the <fs type="group">
# as features, instead of marking it with the 'type' attribute of the group's elements
teiFsGroupHeads = no
# for backward compatibility with Spejd 1.2; it is also probably easier to parse structures
# written in bottom-up order, from the leaves to the root
# (all entities are defined before being referenced)
teiBottomUpSyntacticStructures = no
# DIAGNOSTICS
# report progress every reportInterval seconds
# use 0 to completely disable progress reports
reportInterval = 5
# more verbose reports?
debug = no
# mark which rule has deleted an interpretation?
ruleMarking = no
# are tag/tagset errors fatal?
# If turned on, Spejd will try its best to output only tags conforming to the tagset,
# but they may be useless.
# This option exists only to preserve compatibility with older versions of Spejd, which
# accepted incorrect rules. Please do not use it when developing new grammars.
#
# !!! use at your own risk and don't report crashes when using this option !!!
nonfatalTagErrors = no
# should the (nonfatal) tag errors be silenced?
muffleTagWarnings = no
# Disable correctness checks of tags between rule executions?
# If set, tags can be temporarily incomplete or incorrect, but the usual validation
# before writing is still performed to make sure Spejd will output only
# correct tags.
# Not recommended when developing new grammars.
tagErrorsOnlyOnTheEnd = no
######################################################################
# MODULE-SPECIFIC OPTIONS
######################################################################
# DICTIONARIES
# list of files containing morphological dictionaries
# to be applied to the input as the "dictionary:example_dict" tool
# the format of a dictionary file line is:
# orthographic form,base (lexical) form:tag
#
# or:
# ,base (lexical) form:some_parts_of_tag;condition
#
# In the first variant the orthographic form is used for matching words.
# The tag definition is expanded (it may contain wildcards).
#
# In the second variant the orthographic form is omitted. In that case the base form is used for matching.
# The tags of existing interpretations which match the base form are corrected/modified
# according to the specified tag.
# This variant allows the tag to be incomplete, specifying only some
# of the attributes (some parts).
# This variant also allows specifying conditions on the tag that must
# be met to perform the modification. The condition has the form of a partial tag, just like
# the "tag" part of the modifying variant. A condition restricts the modified interpretations to
# those which have all values of the specified attributes among the specified values.
# If an attribute is omitted in the specification, it means that there are no restrictions
# on this attribute's value and it can be anything (including the absence of a value).
# When a condition is empty (that is, there are no restrictions on any attribute),
# the semicolon preceding it can be omitted and the format is:
# ,base (lexical) form:some_parts_of_tag
#
# The above two variants of entries can be mixed.
# Within the scope of a single 'dictionary:<name>' tool, all the entries with an orthographic
# form are applied before any of the entries without one,
# no matter in which file in this list they appear.
#
# The encoding of dictionary files must be the same as inputEncoding.
dictionary:example_dict = sample_dict lexdictnum
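#
# A hypothetical illustration of the two entry variants described above (the word forms
# and tags below are invented for this comment and are not taken from the sample_dict file;
# check that file for authoritative examples, including the exact partial-tag syntax):
#   domu,dom:subst:sg:gen:m3   <- variant 1: matched by the orthographic form, full tag
#   ,dom:gen;sg                <- variant 2: matched by the base form, partial tag with a condition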
# PANTERA CONFIGURATION
# Pantera can use its own built-in tweaked version of Morfeusz.
# If this option is set, all interpretations set by the reader
# or by any tools preceding pantera in the toolchain are dropped
panteraDoOwnMorphAnalysis = yes
# tagset for pantera, leave empty for a default (check pantera documentation for details)
panteraTagsetName =
# pantera's engine, leave empty for a default (check pantera documentation for details)
panteraEnginePath =
# SPEJD SEMANTICS
# default strategy for matching syntactic entities
# use * for greedy, + for possessive, ? for reluctant
matchStrategy = *
# should agree(case,1,2) return true if both 1 and 2 have no case?
nullAgreement = no
# SPEJD FSM INTERNALS
# Number of single-rule automata to be composed together; usually there is no need to change it.
# Generally, higher values give a performance gain on large grammars with rarely accepting rules.
# On the other hand, high values also cause higher memory usage (see 'rule of thumb' below).
composeLimit = 300
# Number of vertical splits to be done on complex automata; usually there is also no need to change it.
# The vertical split technique makes the complex automata consume much less memory
# in exchange for speed. That means higher values may save some memory and will certainly slow down
# processing, but:
#
# Rule of thumb: if Spejd consumes far too much memory, it's better to (in order):
# * increase verticalDepth (low impact on speed but limited effect on small grammars)
# * decrease composeLimit (moderate slowdown and limited effect on large input data)
# * decrease memoryLimit (high impact on speed but highest effectiveness)
#
# However, if a memory shortage appears just after the start of processing, none of the above
# methods are reliable. In that case you should try splitting the grammar into parts
# applied sequentially (with xcesAna as an intermediate data format).
verticalDepth = 2
# memory limit in megabytes
# when memory usage exceeds this limit, the rarely-used states removal procedure
# (or GC, a garbage collector) is launched
# use it as an emergency brake; for the standard limits see above.
# The memoryLimit is approximate; actual memory usage may be slightly higher
# (it depends on the memory allocator library's buffer sizes)
memoryLimit = 1900
# approx. percent of DFA states to leave after the states removal
leavePercent = 80
# The definitive limit of normal GC usage. GC removes only complex states, so if there are lots
# of plain states it cannot prevent exceeding memoryLimit. If the percentage of complex states
# is less than minComplexPercent, all the DFAs are dropped and built from the beginning,
# just as if Spejd had been restarted. However, this does not recompile the rules, so it is faster.
minComplexPercent = 10
# The maximal number of Unicode characters which can appear in rules compiled to the internal regex.
# It must be higher than the highest number of values of a single attribute (including
# numeric attributes) and must be higher than the number of unique characters appearing in all rules.
# Setting it too high can increase memory usage.
maxNumberOfValues = 4000
# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ'
# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera
disableMorfeusz = no
# Morfeusz produces ambiguous segmentation, which can be resolved by a simple rule-based
# disambiguator. This option specifies a file to load the rules from.
# The rule format is described in the example file
# (leave empty for the built-in default, which is actually the example file).
# The encoding of this file must be the same as inputEncoding.
morfeuszSegmentationDisambiguationRules = segm_disamb.conf
# PLAIN TEXT READER - GENERAL
# mock xml:id for the whole text input, referred to from the output in string-range notation
# (in TEI output it appears in *segmentation.xml)
stringRangeMockID = p-1
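# (For illustration only: with this mock id, a segment in the output might point back into the
# input text with an XPointer such as string-range(p-1,0,5), i.e. 5 characters starting at
# offset 0 of the element with xml:id "p-1"; the exact attribute carrying such pointers
# depends on the output writer.)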
# PLAIN TEXT READER - SENTENCER
# list of acronyms -
# if a dot is found after one of them, it is not a sentence break
acronymsAfter = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz
# list of acronyms (actually top level domain names) -
# if a dot is found before one of them, it is not a sentence break
acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw
# PLAIN TEXT READER - OGONKIFIER
# name of the file with ogonkify (diacritic completion) substitutions
# the format is:
# <letter without diacritics>=<list of possible letters with diacritics separated by '|'>
# see the example ogonkifier.ini
# The encoding of this file must be the same as inputEncoding.
ogonkifyFile = ogonkifier.ini
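# A hypothetical illustration of the substitution format described above (these two lines
# are made up for this comment and would live in ogonkifier.ini, not in this file;
# see ogonkifier.ini for the actual substitutions):
#   a=ą
#   z=ż|ź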
# when to use ogonkifier:
# A - Always,
# N - Never,
# M - only when the Morphological analyzer fails to analyse a word
ogonkifyStrategy = M
# min and max length of words to ogonkify
ogonkifyMinLength = 3
ogonkifyMaxLength = 13
# An example tagset that allows using the Morfeusz morphological analyzer
# The encoding of this file should be ASCII
# (comments may be anything, but must not contain ASCII newline bytes).
# This example is not the exact tagset of Morfeusz, but rather its superset!
# Comments can start at any position in a line and begin with #
# Empty lines are ignored
[ATTR]
# First section - attributes and their values
# Names and values of enumerable attributes and parts of speech must be
# single words containing letters and underscores.
# They are case-sensitive. They must not be a single capital letter
# or any of the rules syntax keywords.
# The number of values of a single attribute should not exceed
# 'maxNumberOfValues' defined in the configuration.
number = sg pl
case = nom gen dat acc inst loc voc
gender = m1 m2 m3 f n n1 n2 n3 p1 p2 p3
person = pri sec ter
degree = pos com sup
aspect = imperf perf
negation = aff neg
accommodability = congr rec
accentability = akc nakc
post-prepositionality = npraep praep
agglutination = agl nagl
vocalicity = nwok wok
fullstoppedness = pun npun
# Numeric attributes are defined in the following form:
#
# attrname = <lower_bound, upper_bound> number_of_partitions
#
# where lower_bound and upper_bound are floating point numbers
# and the optional number_of_partitions is an integer.
# The default value of number_of_partitions is 1.
# All the numeric values of the attribute will be stored rounded to
# a multiple of precision = (upper_bound - lower_bound) / (number_of_partitions - 1).
# That means they will have the form
# lower_bound + n*precision
# with n from [0..number_of_partitions-1].
#
# The number_of_partitions must not be lower than 2 and should not exceed
# 'maxNumberOfValues' defined in the configuration.
# Setting it to high values can reduce performance and increase memory usage.
sen = <-20,20> 401
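# (Worked example of the formula above: for 'sen' the precision is
# (20 - (-20)) / (401 - 1) = 0.1, so sen values are stored rounded to -20.0, -19.9, ..., 19.9, 20.0.)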
reversibility = rev norev
[POS]
# Part of speech definitions with lists of possible attributes
# Optional attributes are marked with square brackets
# The number of POSes should not exceed 'maxNumberOfValues' defined in the configuration.
adja =
adjp =
conj =
comp =
interp =
pred =
xxx =
adv = [degree] [sen] [reversibility]
imps = aspect [negation] [sen]
inf = aspect [negation] [sen]
pant = aspect [negation] [sen]
pcon = aspect [negation] [sen]
qub = [vocalicity] [sen]
prep = case [vocalicity]
siebie = case
subst = number case gender [sen] [reversibility]
depr = number case gender
xxs = number case gender
ger = number case gender aspect negation
ppron12 = number case gender person [accentability]
ppron3 = number case gender person [accentability] [post-prepositionality]
num = number case gender [accommodability] [sen]
adj = number case gender degree [sen] [reversibility]
pact = number case gender aspect negation [sen]
ppas = number case gender aspect negation [sen]
winien = number gender aspect [negation]
praet = number gender aspect [agglutination] [negation] [sen]
bedzie = number person aspect [negation]
fin = number person aspect [negation]
impt = number person aspect [negation]
aglt = number person aspect vocalicity
brev = fullstoppedness
burk =
interj =
ign = [sen] [reversibility]
liczba =
waluta =