Commit ea0ab73b authored by mateuszg's avatar mateuszg

Initial commit

parents
; PLIK KONFIGURACYJNY WORKERA
; Plik zawiera konfigurację zarówno Api usługi sieciowej jak i narzędzia.
;
; Autor: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
; --------- CZĘŚĆ DLA Serwisu ---------
[service]
#root = /mnt2/requests/
root = /samba/requests/
tool = spejd
rabbit_host =10.17.0.85
rabbit_user =clarin
rabbit_password =clarin123
; --------- CZĘŚĆ DLA Narzędzia ---------
[tool]
workers_number = 4
model = /home/tomekw/nlpworkers/spejd/model/config.ini
AMQP-CPP
https://github.com/CopernicaMarketingSoftware/AMQP-CPP
git clone https://github.com/CopernicaMarketingSoftware/AMQP-CPP.git
gcc >=4.8
\ No newline at end of file
#
# FILES LOCATION
# all paths in this file are relative to location of this config file,
# except for absolute paths
# (in UNIXes starting from '/', in Windows starting from '<letter>:\' or '\')
#
# a file containing Spejd's grammar
# in this example file you can find the Spejd rules syntax explained
rules = rules.sr
# tagset used in grammar and input/output
# see the Spejd documentation for details on the format used
tagset = sample.cfg
#
# PROCESSING CHAIN
#
# list of tools to be executed between reader and writer modules
# for spejd with preprocessing with dictionary
# (dictionary entries may be multiple - with different names after colon, see below)
# processingChain = dictionary:example_dict spejd
# spejd preceded with the pantera tagger (Spejd must have pantera support built in)
# processingChain = pantera spejd
# spejd alone (the default)
processingChain = spejd
# no tools (only reader and writer) - can be used as format converter
# processingChain =
# number of threads to use, 0 means autodetect ( = number of detected cpus)
maxThreads = 0
#
# INPUT
#
# inputType: auto|xcesAna|tei|txt
# auto chooses reader basing on the file name / extension:
# - *.txt/*.txt.gz = txt
# - morph.xml/*.txt.gz = xcesAna
# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax (UNIX only)
# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax
# with txt and tei without morphosyntax the Morfeusz morphological analyzer is used (unless disabled)
inputType = auto
# encoding of input files (overrides any XML coding tags!)
inputEncoding = UTF-8
# regexp describing names of input files
# to look for when traversing directories given in command line
# does not affect file names given explicitly in command line
inputFiles = ann_morphosyntax\.xml(\.gz)?
# to ignore any disambiguation found in input?
ignoreDisamb = no
ignoreIDs = no
#
# OUTPUT
#
# format of the output file(s): tei|xcesAna|null
# null = for testing only, does not write anything
outputType = tei
# can interpretations deleted by Spejd be discarded at will (yes)
# or should be preserved for the final output (no)?
discardDeleted = no
# the suffix to be added to the target file name
outputSuffix = .xml
# The core name of the output file. Depending on the output type
# some infixes can be added between it and output suffix.
# Caution: this option replaces the name of the input file.
# With empty output suffix spejd will overwrite input files with output.
#
# Leave empty or comment out to use the input file name instead.
outputFilenameCore = ann
# If set to 'no' spejd will backup existing output files to <name>.bak
overwriteExistingFiles = no
# apply gzip compression to output?
compressOutput = yes
# put <f>'s in single line and omit empty sentences/paragraphs when writing tei?
compactTeiOutput = no
# DIAGNOSTICS
# report progress every reportInterval seconds
# use 0 to completely disable progress reports
reportInterval = 5
# more verbose reports?
debug = no
# mark which rule has deleted an interpretation?
ruleMarking = no
# are tag/tagset errors fatal?
# If turned on, Spejd will try to its best to output only tags conforming the tagset,
# but they may be useless.
# This option exists only to preserve compatibility with older versions of Spejd, which
# accepted incorrect rules. Please do not use when developing new grammars.
#
# !!! use at your own risk and don't report crashes when using this option !!!
nonfatalTagErrors = yes
# if to silence the (nonfatal) tag errors?
muffleTagWarnings = no
# Disable correctness checks of tags in between rules execution?
# If set, tags can be temporary incomplete or incorrect, but the usual validation
# before writing is still performed to make sure Spejd will output only
# correct tags.
# Not recommended for developing new grammars.
tagErrorsOnlyOnTheEnd = yes
######################################################################
# MODULE-SPECIFIC OPTIONS
######################################################################
# DICTIONARIES
# list of files containing morphological dictionaries
# to be applied as "dictionary:example_dict" tool to the input
# the format of files is:
# orthographic form,base (lexical) form:tag
#
# The orthographic form can be omitted; in that case the base form is used for matching.
# The tags of existing interpretations which match the base form are corrected/modified
# according to the specified tag.
# This option also allows the tag to be not full/complete, but only specifying some
# of the attributes.
#
# The above two forms of entries can be mixed.
# All the entries with orthographic form are applied before applying any
# of the entries without orth in the scope of a single 'dictionary:<name>' tool,
# no matter in which file in this list they appear.
dictionary:example_dict = sample_dict lexdictnum
# PANTERA CONFIGURATION
# Pantera can use its own built-in tweaked version of Morfeusz.
# If this option is set, all interpretations set by reader
# or any tools preceding pantera in the toolchain are dropped
PanteraDoOwnMorphAnalysis = yes
# tagset for pantera, leave empty for a default (check pantera documentation for details)
panteraTagsetPath =
# pantera's engine, leave empty for a default (check pantera documentation for details)
panteraEnginePath =
# SPEJD SEMANTICS
# default strategy for matching syntactic entities
# use * for greedy, + for possessive, ? for reluctant
matchStrategy = *
# should agree(case,1,2) return true, if both 1 and 2 have no case?
nullAgreement = no
# SPEJD FSM INTERNALS
# number of single-rule automata to be composed together, usually not needed to change
# Rule of thumb: if Spejd consumes much too much memory, it's better to decrease this
# number than to set very low memoryLimit - it gives smaller impact on performance
composeLimit = 150
# memory limit in megabytes
# when memory usage exceeds this limit the rarely-used states removal procedure is launched
# use as an emergency break, for standard limit see above.
memoryLimit = 2000
# approx. percent of DFA states to leave after the states removal
leavePercent = 80
# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ'
# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera
disableMorfeusz = no
# Morfeusz produces ambiguous segmentation, which can be resolved by a simple rule-based
# disambiguator. This option specifies a file to load rules from.
# The rule format is described in the example file
# (leave empty for the builtin default, which is actually the example file)
morfeuszSegmentationDisambiguationRules = segm_disamb.conf
# PLAIN TEXT READER - GENERAL
# mock xml:id for the whole text input referred from the output in string-range notation
# (in TEI output it appears in *segmentation.xml)
stringRangeMockID = p-1
# PLAIN TEXT READER - SENTENCER
# list of acronyms -
# if a dot is found after one of them, it is not a sentence break
acronymsAfter = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz
# list of acronyms (actually top level domain names) -
# if a dot is found before one of them, it is not a sentence break
acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw
# PLAIN TEXT READER - OGONKIFIER
# name of file with ogonkify (diacrit completion) substitutions
# the format is:
# <letter without diacritics>=<list of possible letters with diacritics separated by '|'>
# see the example ogonkifier.ini
ogonkifyFile = ogonkifier.ini
# when to use ogonkifier:
# A - Always,
# N - Never,
# M - only when the Morphological analyzer fails to analyse a word
ogonkifyStrategy = N
# min and max length of words to ogonkify
ogonkifyMinLength = 3
ogonkifyMaxLength = 13
This diff is collapsed.
# Config file format for shallow parser
[ATTR]
# Attributes and their values
number = sg pl
case = nom gen dat acc inst loc voc
gender = m1 m2 m3 f n
person = pri sec ter
degree = pos com sup
tense = pres past fut
mood = ind imp cond
reflexivity = refl nrefl
aspect = imperf perf
negation = aff neg
accommodability = congr rec
accentability = akc nakc
post-prepositionality = npraep praep
agglutination = agl nagl
vocalicity = nwok wok
fullstoppedness = pun npun
brev_pos = NOUN ADJ ADV QUB PREP CONJ VERB PPAS PACT XXX NG PrepNG AdjG DisG
cont = discr ndiscr
[POS]
# Part of speech definitions.
adja =
adjp =
adjc =
conj =
comp =
interp =
pred =
xxx =
adv = [degree]
imps = aspect
inf = aspect
pant = aspect
pcon = aspect
qub = [vocalicity]
prep = case [vocalicity]
siebie = case
subst = number case gender
depr = number case gender
ger = number case gender aspect negation
ppron12 = number case gender person [accentability]
ppron3 = number case gender person accentability post-prepositionality
num = number case gender accommodability
numcol = number case gender accommodability
adj = number case gender degree
pact = number case gender aspect negation
ppas = number case gender aspect negation
winien = number gender aspect
praet = number gender aspect [agglutination]
bedzie = number person aspect
fin = number person aspect
impt = number person aspect
aglt = number person aspect vocalicity
brev = fullstoppedness
burk =
interj =
ign =
# Syntactic words
Adjc =
Conj =
Conj1 = [cont]
Conj2 = [cont]
Conj3 = [cont]
Conj4 = [cont]
Comp =
Comp1 = [cont]
Comp2 = [cont]
Interj =
Interp =
Qub = [vocalicity]
Xxx =
Adv = [degree]
Imps = mood aspect reflexivity negation
Inf = aspect reflexivity negation
Pant = aspect reflexivity negation
Pcon = aspect reflexivity negation
Prep = case [vocalicity]
Siebie = case
Noun = number case gender [aspect] [reflexivity] [negation]
Ppron12 = number case gender person [accentability]
Ppron3 = number case gender person accentability post-prepositionality
Num = number case gender accommodability
Numcol = number case gender accommodability
Adj = number case gender degree
Adj-se = number case gender degree
Pact = number case gender aspect reflexivity negation
Ppas = number case gender aspect reflexivity negation
Verbfin = number person tense mood aspect reflexivity negation [gender]
Winien = number person gender tense mood aspect reflexivity negation
Pred = tense mood aspect negation
Brev = fullstoppedness brev_pos
Brev-se = fullstoppedness brev_pos
# Named Entities (of sorts):
liczba =
#waluta =
#include <iostream>
#include "asiohandler.h"
using boost::asio::ip::tcp;
/// Fixed-capacity byte buffer that stages raw TCP data until the AMQP
/// protocol parser has consumed it. Capacity is fixed at construction;
/// writes that would overflow are truncated.
class AmqpBuffer
{
public:
    /// Create a buffer holding at most @p size bytes, zero-initialised.
    AmqpBuffer(size_t size) :
        _data(size, 0),
        _use(0)
    {
    }

    /// Append up to @p size bytes from @p data.
    /// @return the number of bytes actually stored; 0 when already full,
    ///         less than @p size when only part of the input fits.
    size_t write(const char* data, size_t size)
    {
        const size_t capacity = _data.size();
        if (_use == capacity)
        {
            return 0;
        }
        // Clamp the copy to the remaining free space.
        const size_t room = capacity - _use;
        const size_t copied = (size + _use < capacity) ? size : room;
        std::memcpy(_data.data() + _use, data, copied);
        _use += copied;
        return copied;
    }

    /// Discard all buffered bytes (capacity is kept).
    void drain()
    {
        _use = 0;
    }

    /// @return the number of buffered (not yet consumed) bytes.
    size_t available() const
    {
        return _use;
    }

    /// @return pointer to the first buffered byte.
    const char* data() const
    {
        return _data.data();
    }

    /// Shift the buffered bytes left by @p count, dropping the consumed
    /// prefix and keeping the unparsed tail at the front.
    /// @pre count < available()
    void shl(size_t count)
    {
        assert(count < _use);
        const size_t remaining = _use - count;
        std::memmove(_data.data(), _data.data() + count, remaining);
        _use -= count;
    }

private:
    std::vector<char> _data; // fixed-size backing storage
    size_t _use;             // number of valid bytes at the front of _data
};
// Construct the handler bound to a single io_service: the socket, the
// connection-timeout timer and the read loop are all driven by that one
// event loop.
// NOTE(review): _amqpBuffer is sized at twice the raw read buffer —
// presumably so an unparsed AMQP frame tail can stay buffered while the
// next TCP chunk is appended; confirm against ASIO_INPUT_BUFFER_SIZE.
AsioHandler::AsioHandler(boost::asio::io_service& ioService) :
        _ioService(ioService),
        _socket(ioService),
        _timer(ioService),
        _asioInputBuffer(ASIO_INPUT_BUFFER_SIZE, 0),
        _amqpBuffer(new AmqpBuffer(ASIO_INPUT_BUFFER_SIZE * 2)),
        _connection(nullptr),      // supplied later via onData()
        _writeInProgress(false),
        _connected(false),
        _quit(false)
{
}
// Nothing to release explicitly: the socket, timer and buffers are value
// members (or a smart pointer) and clean themselves up in their own
// destructors.
AsioHandler::~AsioHandler()
{
}
// Public entry point: start an asynchronous TCP connection to the broker
// at host:port. Returns immediately; completion (or the 15 s timeout) is
// handled on the io_service event loop by doConnect().
void AsioHandler::connect(const std::string& host, uint16_t port)
{
    doConnect(host, port);
}
void AsioHandler::doConnect(const std::string& host, uint16_t port)
{
tcp::resolver::query query(host, std::to_string(port));
tcp::resolver::iterator iter = tcp::resolver(_ioService).resolve(query);
_timer.expires_from_now(boost::posix_time::seconds(15));
_timer.async_wait([this](const boost::system::error_code& ec){
if(!ec && !_connected)
{
std::cerr<<"Connection timed out";
_socket.cancel();
exit(1);
}
});
boost::asio::async_connect(_socket, iter,
[this](boost::system::error_code ec, tcp::resolver::iterator)
{
_connected = true;
if (!ec)
{
doRead();
if(!_outputBuffer.empty())
{
doWrite();
}
}
else
{
std::cerr<<"Connection error:"<<ec<<std::endl;
exit(1);
}
});
}
// AMQP-CPP callback: the library wants @p size bytes at @p data sent to
// the broker. The chunk is queued and, if the socket is connected and no
// write is already in flight, an async write chain is started.
// Also remembers @p connection for parseData().
void AsioHandler::onData(
        AMQP::Connection *connection, const char *data, size_t size)
{
    _connection = connection;
    // Construct the chunk in place in the queue instead of building a
    // temporary vector and moving it in.
    _outputBuffer.emplace_back(data, data + size);
    if (!_writeInProgress && _connected)
    {
        doWrite();
    }
}
// Keep exactly one asynchronous receive outstanding: each received chunk
// is staged into _amqpBuffer, fed to the AMQP parser, and another read
// is queued.
void AsioHandler::doRead()
{
    _socket.async_receive(boost::asio::buffer(_asioInputBuffer),
            [this](boost::system::error_code ec, std::size_t length)
            {
                if (!ec)
                {
                    // Stage the raw bytes for the protocol parser.
                    _amqpBuffer->write(_asioInputBuffer.data(), length);
                    parseData();
                    doRead(); // re-arm the read loop
                }
                else
                {
                    // NOTE(review): this branch also fires on a clean
                    // remote close (EOF) and when the socket is cancelled
                    // or closed locally (e.g. by onClosed() or the
                    // connect timeout), yet it terminates the whole
                    // process — consider handling eof/operation_aborted
                    // separately from real errors.
                    std::cerr<<"Error reading:"<<ec<<std::endl;
                    exit(1);
                }
            });
}
// Write the front chunk of _outputBuffer; on completion pop it and chain
// the next write until the queue is empty. _writeInProgress guards
// against onData() starting a second, overlapping chain.
//
// BUG FIX: the previous version checked _quit after popping and closed
// the socket even when it had just started a follow-up async_write for
// the remaining queued data, killing that write mid-flight. The socket
// is now closed on _quit only once the queue has fully drained.
void AsioHandler::doWrite()
{
    _writeInProgress = true;
    boost::asio::async_write(_socket,
            boost::asio::buffer(_outputBuffer.front()),
            [this](boost::system::error_code ec, std::size_t length)
            {
                if (!ec)
                {
                    _outputBuffer.pop_front();
                    if (!_outputBuffer.empty())
                    {
                        doWrite(); // more queued data: keep the chain going
                    }
                    else
                    {
                        _writeInProgress = false;
                        // Graceful shutdown requested by onClosed() and
                        // everything has been flushed: close now.
                        if (_quit)
                        {
                            _socket.close();
                        }
                    }
                }
                else
                {
                    std::cerr << "Error writing:" << ec << std::endl;
                    _socket.close();
                    exit(1);
                }
            });
}
// Feed the staged bytes to the AMQP protocol parser and drop whatever it
// consumed, keeping any unparsed tail (a partial frame) at the front of
// the buffer for the next read.
void AsioHandler::parseData()
{
    // The connection object is handed to us via onData(); until then
    // there is nothing to parse into.
    if (_connection == nullptr)
    {
        return;
    }

    const size_t consumed = _connection->parse(_amqpBuffer->data(),
            _amqpBuffer->available());

    if (consumed == 0)
    {
        return; // parser needs more bytes; keep everything buffered
    }

    if (consumed == _amqpBuffer->available())
    {
        _amqpBuffer->drain();       // everything was parsed
    }
    else
    {
        _amqpBuffer->shl(consumed); // keep the unparsed tail
    }
}
// AMQP-CPP callback: invoked when the AMQP login handshake completes.
// Intentionally left empty — this handler takes no action at that point.
void AsioHandler::onConnected(AMQP::Connection *connection)
{
}
// @return true once the TCP connect attempt has completed
// (the flag is set in doConnect()'s async_connect handler).
bool AsioHandler::connected() const
{
    return _connected;
}
// AMQP-CPP callback: a protocol-level error was reported by the library.
// Only logs the message; the connection is not torn down here.
void AsioHandler::onError(AMQP::Connection *connection, const char *message)
{
    std::cerr << "AMQP error " << message << std::endl;
}
// AMQP-CPP callback: the AMQP connection was closed gracefully.
// Flags shutdown and closes the socket immediately when no write is in
// flight; otherwise doWrite()'s completion handler sees _quit and closes
// the socket itself.
void AsioHandler::onClosed(AMQP::Connection *connection)
{
    std::cout << "AMQP closed connection" << std::endl;
    _quit = true;
    if (!_writeInProgress)
    {
        _socket.close();
    }
}
#pragma once
#include <deque>
#include <vector>
#include <memory>
#include <amqpcpp.h>
#include <boost/asio.hpp>
class AmqpBuffer;