Commit 2284ae5d authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

Initial commit

parents
model/
*.log
model/*
\ No newline at end of file
# --- Legacy single-stage build (superseded by the multi-stage build below) ---
# Builds the morphodita worker on top of the Clarin-PL C++ base image.
FROM clarinpl/cpp
WORKDIR /home/install
# Worker sources (CMake project; g419-morphodita is cloned inside it below)
COPY src/ src/
# Clone the g419-morphodita library, build it, then build the worker itself.
# NOTE(review): prefer WORKDIR over `cd` chains (hadolint DL3003) -- confirm before changing.
RUN cd src && \
git clone http://nlp.pwr.wroc.pl/g419-morphodita.git && \
cd g419-morphodita/src && \
make -j lib && \
cd ../.. && \
mkdir build && \
cd build && \
cmake .. && \
make -j4
WORKDIR /home/worker
# Install the built binary under the name the compose entrypoint expects
RUN cp /home/install/src/bin/morphodita morphodita_service
################################################
# NLP Tools for Polish from G4.19 Group
# Wroclaw University of Science and Technology
#
# Contact: Tomasz.Walkowiak@pwr.edu.pl
#
# Morphodita service
###############################################
FROM ubuntu:16.04 AS builder
# Build dependencies in a single layer. --no-install-recommends and removing
# the apt lists keep the layer small and avoid stale-cache issues
# (hadolint DL3009 / DL3015). Packages sorted alphabetically for diffability.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        apt-utils \
        atool \
        cmake \
        curl \
        g++ \
        git \
        htop \
        iputils-ping \
        libboost-all-dev \
        locales \
        locales-all \
        mc \
        nano \
        netcat \
        ranger \
        subversion \
        unzip \
        vim \
        wget \
        zip \
    && rm -rf /var/lib/apt/lists/*
##################################
## UTF-8
##################################
# Generate and export the UTF-8 locale so the tools handle Polish text.
RUN locale-gen en_US.UTF-8
# key=value form; the legacy space-separated ENV syntax is deprecated.
ENV LANG=en_US.UTF-8 \
    LC_ALL=en_US.UTF-8
##################################
# Install morphodita service and dependencies
##################################
# NOTE(review): `mkdir` is redundant (WORKDIR creates the directory) and
# WORKDIR is repeated below -- harmless, but could be collapsed.
RUN mkdir /home/install
WORKDIR /home/install
## AMQP-CPP
WORKDIR /home/install
# Build AMQP-CPP 2.8.0 and install it into a staging prefix
# (/install/ampq-cpp -- note the historical "ampq" spelling; the final stage's
# COPY --from relies on this exact path), then also copy it into / for the
# in-stage build and clean the sources in the same layer.
RUN wget https://github.com/CopernicaMarketingSoftware/AMQP-CPP/archive/v2.8.0.tar.gz && \
tar -xvf v2.8.0.tar.gz && \
cd AMQP-CPP-2.8.0 && \
make -j4 && \
make install PREFIX=/install/ampq-cpp/usr/local && \
cp -r /install/ampq-cpp/* / && \
ldconfig && \
rm -r ../AMQP-CPP-2.8.0 && \
rm ../v2.8.0.tar.gz
## CCP_NLP
WORKDIR /home/install
# Check out the Clarin-PL C++ NLP worker library, build it, stage the install
# tree under /install/ccp_nlp (copied into the final stage), install it into /
# for the in-stage build, and remove the sources in the same layer.
RUN svn co http://svn.clarin-pl.eu/svn/nlpservices/src/cpp/nlp && \
cd nlp && \
mkdir build && \
cd build && \
cmake .. && \
make -j4 && \
make install DESTDIR=/install/ccp_nlp && \
cp -r /install/ccp_nlp/* / && \
ldconfig && \
rm -r ../../nlp
# Check out the worker sources, clone g419-morphodita inside the tree
# (CMakeLists.txt expects it at ./g419-morphodita), build the static library
# and then the worker binary (picked up by the final stage from
# /home/install/morphodita/bin).
RUN svn co http://svn.clarin-pl.eu/svn/nlpservices/src/cpp/morphodita && \
cd morphodita && \
git clone http://nlp.pwr.wroc.pl/g419-morphodita.git && \
cd g419-morphodita/src && \
make lib && \
cd ../.. && \
mkdir build && \
cd build && \
cmake .. && \
make -j4
# Runtime stage: only the staged install trees, boost runtime and the binary.
FROM ubuntu:16.04
COPY --from=builder /install/ampq-cpp /
COPY --from=builder /install/ccp_nlp /
# Single layer: update + install + list cleanup (hadolint DL3009 / DL3015).
RUN apt-get update && \
    apt-get install -y --no-install-recommends libboost-all-dev && \
    rm -rf /var/lib/apt/lists/*
# WORKDIR creates the directory if missing; no separate mkdir needed.
WORKDIR /home/worker
COPY --from=builder /home/install/morphodita/bin/morphodita morphodita_service
1. Get the models (create the `model/` directory first):
mkdir -p model
wget -O model/xix http://minio.clarin-pl.eu/public/models/xix
wget -O model/xxi http://minio.clarin-pl.eu/public/models/xxi
2. Build
docker-compose build
; PLIK KONFIGURACYJNY WORKERA
; Plik zawiera konfigurację zarówno API usługi sieciowej jak i narzędzia.
;
; Autor: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
; --------- CZĘŚĆ DLA Serwisu ---------
[service]
root = /samba/requests/
tool = morphoDita
rabbit_host =rabbit.clarin.ws
rabbit_user =clarin
rabbit_password =clarin123
; --------- CZĘŚĆ DLA Narzędzia ---------
[tool]
workers_number = 14
tagger_model = model/xxi
tagger_xix=model/xix
version: '3'
services:
  # MorphoDita tagger worker; config.ini is mounted over the baked-in default.
  morphodita:
    container_name: clarin_morphodita
    build: ./
    working_dir: /home/worker
    entrypoint:
      - ./morphodita_service
    volumes:
      - /samba:/samba
      - ./config.ini:/home/worker/config.ini
# morphodita worker build: links the in-tree g419-morphodita static library
# plus the wsnlp worker framework, Boost and AMQP-CPP.
# cmake_minimum_required() must be called before project() so policies are
# set before any language/compiler detection runs.
cmake_minimum_required(VERSION 3.5.0)
project(morphodita)
set(CMAKE_CXX_STANDARD 11)
# Append -pthread instead of clobbering any flags provided by the user/toolchain.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# Custom Find*.cmake modules (FindAMQP, FindWSNLP, ...) live in CMakeScripts/.
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/CMakeScripts)
# g419-morphodita is expected to be cloned and pre-built inside the source tree.
include_directories(${PROJECT_SOURCE_DIR}/g419-morphodita/src)
set(LIBS ${LIBS} ${PROJECT_SOURCE_DIR}/g419-morphodita/src/libmorphodita.a)
find_package(wsnlp 1.0.0 REQUIRED)
set(LIBS ${LIBS} ${WSNLP_LIBRARY})
find_package(Boost COMPONENTS program_options system thread filesystem)
link_directories(${Boost_LIBRARY_DIRS})
set(LIBS ${LIBS} ${Boost_LIBRARIES})
find_package(amqpcpp REQUIRED)
set(LIBS ${LIBS} amqpcpp)
add_executable(morphodita morphoDita_service.cpp unmerge.cpp)
target_link_libraries(morphodita ${LIBS})
# Place the binary in <build>/../bin, where the Dockerfile copies it from.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/../bin)
# - Try to find SigC++-2.0
# Once done, this will define
#
# SigC++_FOUND - system has SigC++
# SigC++_INCLUDE_DIRS - the SigC++ include directories
# SigC++_LIBRARIES - link these to use SigC++
include(LibFindMacros)
# Use pkg-config to get hints about paths
libfind_pkg_check_modules(SigC++_PKGCONF sigc++-2.0)
# Main include dir
find_path(SigC++_INCLUDE_DIR
NAMES sigc++/sigc++.h
PATHS ${SigC++_PKGCONF_INCLUDE_DIRS}
PATH_SUFFIXES sigc++-2.0
)
# Glib-related libraries also use a separate config header, which is in lib dir
find_path(SigC++Config_INCLUDE_DIR
NAMES sigc++config.h
PATHS ${SigC++_PKGCONF_INCLUDE_DIRS} /usr
PATH_SUFFIXES lib/sigc++-2.0/include
)
# Locate the library itself (libfind_library also tries version-suffixed names)
libfind_library(SigC++ sigc 2.0)
# Set the include dir variables and the libraries and let libfind_process do the rest.
# NOTE: Singular variables for this library, plural for libraries this lib depends on.
set(SigC++_PROCESS_INCLUDES SigC++_INCLUDE_DIR SigC++Config_INCLUDE_DIR)
set(SigC++_PROCESS_LIBS SigC++_LIBRARY)
libfind_process(SigC++)
# - Try to find the AMQP-CPP library (header amqpcpp.h, library amqpcpp).
# (The original comment said "Rabbitmq C library" -- copy/paste; this module
# locates AMQP-CPP, the C++ AMQP client this worker links against.)
# Defines AMQP_INCLUDE_DIR, AMQP_LIBRARY and AMQP_FOUND.
SET(_AMQP_REQUIRED_VARS AMQP_INCLUDE_DIR AMQP_LIBRARY )
# Find the include directories (AMQP_DIR may be set by the user as a hint)
FIND_PATH(AMQP_INCLUDE_DIR
NAMES amqpcpp.h
HINTS ${AMQP_DIR}/include /usr/include/amqpcpp
)
FIND_LIBRARY(AMQP_LIBRARY
NAMES amqpcpp
HINTS ${AMQP_DIR}/lib /usr/lib/
)
# Variable NAMES, as expected by libfind_process() (unused here, kept for parity)
SET(AMQP_PROCESS_INCLUDES AMQP_INCLUDE_DIR)
SET(AMQP_PROCESS_LIBS AMQP_LIBRARY)
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(AMQP DEFAULT_MSG ${_AMQP_REQUIRED_VARS})
\ No newline at end of file
# - Try to find the wsnlp (Clarin-PL C++ NLP worker) library.
# (The original comment said "Rabbitmq C library" -- copy/paste error.)
# Defines WSNLP_INCLUDE_DIR, WSNLP_LIBRARY and WSNLP_FOUND.
SET(_WSNLP_REQUIRED_VARS WSNLP_INCLUDE_DIR WSNLP_LIBRARY)
# Find the include directories (WSNLP_DIR may be set by the user as a hint)
FIND_PATH(WSNLP_INCLUDE_DIR
NAMES nlpworker.h
HINTS ${WSNLP_DIR}/include /usr/include/wsnlp
)
FIND_LIBRARY(WSNLP_LIBRARY
NAMES wsnlp
HINTS ${WSNLP_DIR}/lib /usr/lib/
)
# Consistency fix: PROCESS_* lists must hold variable NAMES (as in FindAMQP),
# not their values -- libfind_process() dereferences each entry itself.
SET(WSNLP_PROCESS_INCLUDES WSNLP_INCLUDE_DIR)
SET(WSNLP_PROCESS_LIBS WSNLP_LIBRARY)
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(WSNLP DEFAULT_MSG ${_WSNLP_REQUIRED_VARS})
\ No newline at end of file
# Works the same as find_package, but forwards the "REQUIRED" and "QUIET"
# arguments already given for the current package. The first parameter is the
# prefix of the current package; the remaining arguments (the new package's
# prefix etc.) are passed straight through to find_package().
macro (libfind_package PREFIX)
  set(LIBFIND_PACKAGE_ARGS ${ARGN})
  if (${PREFIX}_FIND_QUIETLY)
    list(APPEND LIBFIND_PACKAGE_ARGS QUIET)
  endif ()
  if (${PREFIX}_FIND_REQUIRED)
    list(APPEND LIBFIND_PACKAGE_ARGS REQUIRED)
  endif ()
  find_package(${LIBFIND_PACKAGE_ARGS})
endmacro ()
# CMake developers made the UsePkgConfig system deprecated in the same release (2.6)
# where they added pkg_check_modules. Consequently I need to support both in my scripts
# to avoid those deprecated warnings. Here's a helper that does just that.
# Works identically to pkg_check_modules, except that no checks are needed prior to use.
macro (libfind_pkg_check_modules PREFIX PKGNAME)
# CMake 2.4 only has the legacy UsePkgConfig module
if (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 4)
include(UsePkgConfig)
pkgconfig(${PKGNAME} ${PREFIX}_INCLUDE_DIRS ${PREFIX}_LIBRARY_DIRS ${PREFIX}_LDFLAGS ${PREFIX}_CFLAGS)
else (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 4)
# Modern path: pkg_check_modules, guarded in case pkg-config itself is absent
find_package(PkgConfig)
if (PKG_CONFIG_FOUND)
pkg_check_modules(${PREFIX} ${PKGNAME})
endif (PKG_CONFIG_FOUND)
endif (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 4)
endmacro (libfind_pkg_check_modules)
# Do the final processing once the paths have been detected.
# If include dirs are needed, ${PREFIX}_PROCESS_INCLUDES should be set to contain
# all the variables, each of which contain one include directory.
# Ditto for ${PREFIX}_PROCESS_LIBS and library files.
# Will set ${PREFIX}_FOUND, ${PREFIX}_INCLUDE_DIRS and ${PREFIX}_LIBRARIES.
# Also handles errors in case library detection was required, etc.
# NOTE: PROCESS_* lists hold variable NAMES; each entry is dereferenced below.
macro (libfind_process PREFIX)
# Skip processing if already processed during this run
if (NOT ${PREFIX}_FOUND)
# Start with the assumption that the library was found
set (${PREFIX}_FOUND TRUE)
# Process all includes and set _FOUND to false if any are missing
foreach (i ${${PREFIX}_PROCESS_INCLUDES})
if (${i})
set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIRS} ${${i}})
mark_as_advanced(${i})
else (${i})
set (${PREFIX}_FOUND FALSE)
endif (${i})
endforeach (i)
# Process all libraries and set _FOUND to false if any are missing
foreach (i ${${PREFIX}_PROCESS_LIBS})
if (${i})
set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARIES} ${${i}})
mark_as_advanced(${i})
else (${i})
set (${PREFIX}_FOUND FALSE)
endif (${i})
endforeach (i)
# Print message and/or exit on fatal error
if (${PREFIX}_FOUND)
if (NOT ${PREFIX}_FIND_QUIETLY)
message (STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}")
endif (NOT ${PREFIX}_FIND_QUIETLY)
else (${PREFIX}_FOUND)
if (${PREFIX}_FIND_REQUIRED)
# Dump every candidate variable to help diagnose which one was missing
foreach (i ${${PREFIX}_PROCESS_INCLUDES} ${${PREFIX}_PROCESS_LIBS})
message("${i}=${${i}}")
endforeach (i)
message (FATAL_ERROR "Required library ${PREFIX} NOT FOUND.\nInstall the library (dev version) and try again. If the library is already installed, use ccmake to set the missing variables manually.")
endif (${PREFIX}_FIND_REQUIRED)
endif (${PREFIX}_FOUND)
endif (NOT ${PREFIX}_FOUND)
endmacro (libfind_process)
# Finds a library for PREFIX, trying MSVC-suffixed names (-vc80/-vc90) and,
# when a version is given as a third argument, version-suffixed names in both
# dotted and underscored forms (e.g. sigc-2.0 and sigc-2_0).
macro(libfind_library PREFIX basename)
set(TMP "")
if(MSVC80)
set(TMP -vc80)
endif(MSVC80)
if(MSVC90)
set(TMP -vc90)
endif(MSVC90)
set(${PREFIX}_LIBNAMES ${basename}${TMP})
if(${ARGC} GREATER 2)
set(${PREFIX}_LIBNAMES ${basename}${TMP}-${ARGV2})
# Also try the version with dots replaced by underscores (some distros use it)
string(REGEX REPLACE "\\." "_" TMP ${${PREFIX}_LIBNAMES})
set(${PREFIX}_LIBNAMES ${${PREFIX}_LIBNAMES} ${TMP})
endif(${ARGC} GREATER 2)
find_library(${PREFIX}_LIBRARY
NAMES ${${PREFIX}_LIBNAMES}
PATHS ${${PREFIX}_PKGCONF_LIBRARY_DIRS}
)
endmacro(libfind_library)
## Requirements
amqpcpp 3.1
wsnlp >=1.0
Boost
## Build
Morphodita worker has dependencies on g419-morphodita project, so first, in the project root directory, run:
$ git clone git@nlp.pwr.edu.pl:g419-morphodita
$ cd g419-morphodita/src
$ make lib
Then you can compile worker:
$ mkdir build
$ cd build
$ cmake ..
$ make
1. Dodać `process` dla JSON-ów
#include "nlpworker.h"
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ini_parser.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#define BOOST_NO_SCOPED_ENUMS
#include <algorithm>
#include <string>
#include <fstream>
#include <streambuf>
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include "g419-morphodita/src/derivator/derivation_formatter.h"
#include "g419-morphodita/src/tagger/tagger.h"
#include "g419-morphodita/src/morpho/morpho.h"
#include "g419-morphodita/src/tagset_converter/tagset_converter.h"
#include "g419-morphodita/src/utils/getpara.h"
#include "g419-morphodita/src/utils/iostreams.h"
#include "g419-morphodita/src/utils/options.h"
#include "g419-morphodita/src/utils/parse_int.h"
#include "g419-morphodita/src/utils/process_args.h"
#include "g419-morphodita/src/utils/xml_encoded.h"
#include "g419-morphodita/src/version/version.h"
#include "unmerge.h"
using namespace ufal::morphodita;
using namespace std;
// Conversion table from MorphoDita positional tags to the NKJP/CCL tagset.
// Key 0 maps the two-character grammatical-class prefix of the tag; keys 3..15
// map the single character found at the corresponding positional slot of the
// tag tail (see convert_tagset() for the slot-to-attribute ordering).
map<int, map<string, string>> dictionary {
{0, { {"AA","adj"}, {"AM","adja"}, {"AU","adjp"}, {"AC","adjc"}, {"Vc","cond"}, {"J^","conj"}, {"J,","comp"}, {"Vr","pred"}, {"Db","adv"}, {"V~","imps"}, {"Vf","inf"}, {"RR","prep"}, {"P6","siebie"}, {"NN","subst"}, {"N;","depr"}, {"VG","ger"}, {"PP","ppron12"}, {"PH","ppron3"}, {"Cl","num"}, {"Cj","numcol"}, {"Vm","pant"}, {"Ve","pcon"}, {"Vp","pact"}, {"Vs","ppas"}, {"Vw","winien"}, {"Vq","praet"}, {"Vt","bedzie"}, {"VB","fin"}, {"Vi","impt"}, {"Va","aglt"}, {"II","interj"}, {"X%","burk"}, {"TT","qub"}, {"Xx","brev"}, {"X?","xxx"}, {"Z:","interp"}, {"X@","ign"} }}, //gramatic class
{3, { {"M","m1"}, {"Y","m2"}, {"I","m3"}, {"F","f"}, {"N","n"} }}, //gender
{4, { {"S","sg"}, {"P","pl"} }}, //number
{5, { {"1","nom"}, {"2","gen"}, {"3","dat"}, {"4","acc"}, {"5","voc"}, {"6","loc"}, {"7","inst"} }}, //case
{6, { {"C","congr"}, {"R","rec"} }}, //accommodability
{7, { {"A","akc"}, {"N","nakc"} }}, //accentability
{8, { {"1","pri"}, {"2","sec"}, {"3","ter"} }}, //person
{9, { {"P","praep"}, {"N","npraep"} }}, //postprepositionality
{10, { {"P","pos"}, {"C","com"}, {"S","sup"} }}, //degree
{11, { {"A","aff"}, {"N","neg"} }}, //negation
{12, { {"I","imperf"}, {"P","perf"} }}, //aspect
{13, { {"A","agl"}, {"N","nagl"} }}, //agglutination
{14, { {"W","wok"}, {"N","nwok"} }}, //vocalicity
{15, { {"P","pun"}, {"N","npun"} }} //fullstoppedness
};
// Strict weak ordering for analyses: by lemma first, then by tag.
// Used with std::sort so duplicates become adjacent for std::unique.
bool compareLemma(const tagged_lemma &a, const tagged_lemma &b) {
    if (a.lemma == b.lemma)
        return a.tag < b.tag;
    return a.lemma < b.lemma;
}
// Two analyses are duplicates when both the lemma and the tag match.
bool equalLemma(const tagged_lemma &a, const tagged_lemma &b) {
    return (a.lemma == b.lemma) && (a.tag == b.tag);
}
// Converts a MorphoDita positional tag (2-char class prefix + up to 13
// positional attribute characters) into a colon-separated NKJP/CCL tag using
// `dictionary` (key 0 = class prefix, keys 3..15 = positional slots).
// Returns e.g. "subst:sg:nom:m1"; unknown characters map to empty entries
// (note: operator[] inserts them into `dictionary`, as in the original).
// Fix: the original indexed tag_tail[0..12] unconditionally, reading past the
// end of tags shorter than 15 characters (undefined behavior), and threw from
// substr() for tags shorter than 2; both are now handled gracefully.
std::string convert_tagset(std::string tag, std::map<int, std::map<std::string, std::string>>& dictionary)
{
    // A tag without the 2-character class prefix cannot be converted.
    if (tag.size() < 2)
        return "";

    // Output slot for positional character i (i-th tail char uses
    // dictionary[i + 3] and lands in attributes[slot[i]]); the slot order
    // produces the conventional attribute order: number, case, gender, person,
    // degree, aspect, negation, accommodability, accentability,
    // postprepositionality, agglutination, vocalicity, fullstoppedness.
    static const int slot[13] = {2, 0, 1, 7, 8, 3, 9, 4, 6, 5, 10, 11, 12};

    std::vector<std::string> attributes(13, "");
    const std::string tag_head = tag.substr(0, 2);
    const std::string tag_tail = tag.substr(2, 13); // substr clamps the count

    // Only visit characters actually present in the tail.
    const int n = static_cast<int>(std::min<std::string::size_type>(13, tag_tail.size()));
    for (int i = 0; i < n; i++)
    {
        const std::string id(1, tag_tail[i]);
        if (id != "-")
            attributes[slot[i]] = dictionary[i + 3][id];
    }

    std::string new_tag = dictionary[0][tag_head];
    for (int i = 0; i < 13; i++)
    {
        if (!attributes[i].empty())
            new_tag.append(":" + attributes[i]);
    }
    return new_tag;
}
// Tags plain text from `is` and writes a CCL XML document to `os`.
// Each paragraph read by getpara() becomes a <chunk type="p">, each tokenizer
// sentence a <sentence> with one <tok> per form. For every token the
// disambiguated reading (after tagset conversion and derivation formatting)
// is emitted as <lex disamb="1">; when `ifallforms` is set, the remaining
// morphological analyses are appended as extra <lex> entries.
// `guessT` / `guessD` select the guesser mode for the tagger and for the
// analyzer respectively.
void tag_raw(istream& is, ostream& os, const tagger& tagger, tokenizer& tokenizer,
const tagset_converter& tagset_converter, const derivation_formatter& derivation,
int guessT, bool ifallforms, const morpho& morpho, int guessD)
{
string para;
vector<string_piece> forms;
vector<tagged_lemma> tags;
vector<tagged_lemma> lemmas;
vector<token_range> tokens;
// CCL document prologue
os<<"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
os<<"<!DOCTYPE chunkList SYSTEM \"ccl.dtd\">\n";
os<<"<chunkList>\n";
//cerr << "Tagging: ";
clock_t now = clock(); // only used by the commented-out timing output below
long para_id = 1;
while (getpara(is, para))
{
tokenizer.set_text(para);
long s_id = 1;
os << "<chunk id=\"" << para_id++ << "\" type=\"p\">\n";
while (tokenizer.next_sentence(&forms, &tokens))
{
// Tag the sentence, then split merged tokens back apart (see unmerge.h)
tagger.tag(forms, tags, morpho::guesser_mode(guessT));
unmerge(forms, tags,tokens);
for (unsigned i = 0; i < forms.size(); i++)
{
tagset_converter.convert(tags[i]);
derivation.format_derivation(tags[i].lemma);
if (ifallforms)
{
// Collect all analyses for this form, deduplicated by (lemma, tag)
morpho.analyze(forms[i], morpho::guesser_mode(guessD), lemmas);
std::sort(lemmas.begin(), lemmas.end(), compareLemma);
lemmas.erase(std::unique(lemmas.begin(), lemmas.end(), equalLemma), lemmas.end());
}
//if (!i)os << "<chunk id=\"" << s_id++ << "\" type=\"s\">\n";
if(!i)
os << "<sentence id=\"" << s_id++ << "\">\n";
// <ns/> marks "no space" between this token and the previous one
if (i)
if (tokens[i].start == tokens[i - 1].start + tokens[i - 1].length)
os << "<ns/>\n";
os << "<tok>\n";
if(forms[i].len)
os << "<orth>" << xml_encoded(forms[i]) << "</orth>\n";
else
os << "<orth> </orth>\n";
// Disambiguated reading chosen by the tagger
os << "<lex disamb=\"1\"><base>" << xml_encoded(tags[i].lemma, true) << "</base><ctag>" << xml_encoded(convert_tagset(tags[i].tag, dictionary), true) << "</ctag></lex>\n";
for (auto& lemma: lemmas)
{
// Skip the disambiguated reading -- already emitted above
if(lemma.lemma == tags[i].lemma and lemma.tag == tags[i].tag)
continue;
else
os << "<lex><base>" << xml_encoded(lemma.lemma, true) << "</base><ctag>" << xml_encoded(convert_tagset(lemma.tag, dictionary), true) << "</ctag></lex>\n";
}
os << "</tok>\n";
if (i + 1 == forms.size())
os << "</sentence>\n";
} //for(forms.size())
} //while(tokenizer.next_sentence))
os << "</chunk>\n" << flush;
} //while(getpara)
os << "</chunkList>\n" << flush;
// cerr << "done, in " << fixed << setprecision(3) << (clock() - now) / double(CLOCKS_PER_SEC) << " seconds." << endl;
}
// Clarin-PL worker that wraps the MorphoDita tagger; init() loads the
// model(s), process() handles individual requests (file-based or JSON).
class MorphoDitaWorker: public NLPWorker
{
public:
    // Loads tool.tagger_model (and optionally tool.tagger_xix) and creates tokenizers.
    void init(boost::property_tree::ptree config);
    // File-based request: tag task_path, write CCL XML to output_path.
    void process(std::string task_path, boost::property_tree::ptree &options, std::string output_path);
    // JSON request variant.
    void process(boost::property_tree::ptree &data, boost::property_tree::ptree &options);
private:
    std::string tagger_model;          // path of the primary tagger model
    // Fix: initialize ALL pointer members; originally only _taggerxix had
    // "=NULL", leaving the others (notably _tokenizerxix) indeterminate when
    // the optional XIX model is not configured.
    tagger* _tagger = NULL;            // primary tagger (loaded in init())
    tokenizer* _tokenizer = NULL;      // tokenizer created from _tagger
    tagger* _taggerxix = NULL;         // optional XIX-century tagger
    tokenizer* _tokenizerxix = NULL;   // tokenizer for the optional tagger
};
// Loads the primary tagger model (tool.tagger_model) and, when configured,
// the optional XIX-century model (tool.tagger_xix), creating a tokenizer
// for each.
// NOTE(review): tagger::load() results are not checked for NULL -- a missing
// or corrupt model file would crash in new_tokenizer(); confirm and add a check.
void MorphoDitaWorker::init(boost::property_tree::ptree config)
{
tagger_model = config.get<std::string>("tool.tagger_model");
_tagger = tagger::load(tagger_model.c_str());
_tokenizer = _tagger->new_tokenizer();
// Optional second model for XIX-century Polish
if(config.get_optional<std::string>("tool.tagger_xix"))
{
string tagger_modelxix = config.get<std::string>("tool.tagger_xix");
_taggerxix = tagger::load(tagger_modelxix.c_str());
_tokenizerxix = _taggerxix->new_tokenizer();
}
//LOG(INFO) << "Model loaded..";
}
void MorphoDitaWorker::process(std::string task_path, boost::property_tree::ptree &options, std::string output_path)
{
unique_ptr<derivation_formatter> derivation;
derivation.reset(derivation_formatter::new_none_derivation_formatter());
unique_ptr<tagset_converter> tagset_converter;
tagset_converter.reset(tagset_converter::new_identity_converter());
boost::optional<bool> gu = options.get_optional<bool>("guesser");
bool ifguesser=gu ? gu.get():false;