Commit 5758750a authored by rancher's avatar rancher

Initial commit

parents
# Build configuration for the tag_converter worker executable.
#
# cmake_minimum_required() has to come before project() so that policies are
# set up before the toolchain is probed, and CMAKE_CXX_STANDARD 17 is only
# understood by CMake 3.8 and newer.
cmake_minimum_required(VERSION 3.8)
project(tag_converter)

# The sources use std::string_view, so C++17 is mandatory rather than a hint.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Project-local find modules take precedence; any module path supplied by the
# user is kept behind them.
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeScripts" ${CMAKE_MODULE_PATH})

find_package(wsnlp 1.0.0 REQUIRED)
find_package(ICU 52.0 REQUIRED)
find_package(Boost REQUIRED
  COMPONENTS program_options system thread filesystem chrono atomic date_time regex)
find_package(amqpcpp REQUIRED)
find_package(Threads REQUIRED)

# Link order is preserved from the original script: wsnlp, ICU, Boost,
# amqpcpp, threads.
list(APPEND LIBS ${WSNLP_LIBRARY})
list(APPEND LIBS ${ICU_LIBRARIES})
list(APPEND LIBS ${Boost_LIBRARIES})
if(NOT TARGET amqpcpp AND AMQP_LIBRARY)
  # The find module located the library file: link it by full path.
  list(APPEND LIBS ${AMQP_LIBRARY})
else()
  # Either an imported target named amqpcpp exists, or we fall back to the
  # bare library name exactly as before.
  list(APPEND LIBS amqpcpp)
endif()
list(APPEND LIBS Threads::Threads)

# Retained so that the bare-name amqpcpp fallback above keeps resolving the
# same way it did before.
link_directories(${Boost_LIBRARY_DIRS})

add_executable(tag_converter tag_converter_service.cpp)

# Header locations reported by the find modules. Variables that are empty,
# undefined or *-NOTFOUND are skipped so that headers living on the default
# search path keep working.
foreach(_tag_converter_dirs_var
    WSNLP_INCLUDE_DIR ICU_INCLUDE_DIRS Boost_INCLUDE_DIRS AMQP_INCLUDE_DIR)
  if(${_tag_converter_dirs_var})
    target_include_directories(tag_converter PRIVATE ${${_tag_converter_dirs_var}})
  endif()
endforeach()

target_link_libraries(tag_converter PRIVATE ${LIBS})

# Place the binary in <build>/../bin, as the legacy EXECUTABLE_OUTPUT_PATH
# setting did.
set_target_properties(tag_converter PROPERTIES
  RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/../bin")
# Finds the International Components for Unicode (ICU) Library
#
#  ICU_FOUND          - True if ICU found.
#  ICU_I18N_FOUND     - True if ICU's internationalization library found.
#  ICU_INCLUDE_DIRS   - Directory to include to get ICU headers
#                       Note: always include ICU headers as, e.g.,
#                       unicode/utypes.h
#  ICU_LIBRARIES      - Libraries to link against for the common ICU
#  ICU_I18N_LIBRARIES - Libraries to link against for ICU internationalization
#                       (note: in addition to ICU_LIBRARIES)
#  ICU_VERSION        - "<major>.<minor>" parsed from the ICU headers
#                       ("0.0" when the version macros could not be located)
#  ICU_MAJOR_VERSION, ICU_MINOR_VERSION - the individual components
#
# A minimum version requested through find_package(ICU <version>) is enforced
# whenever the installed version could be determined.
mark_as_advanced(ICU_DIR)

# Look for the header file.
find_path(
  ICU_INCLUDE_DIR
  NAMES unicode/utypes.h
  DOC "Include directory for the ICU library")
mark_as_advanced(ICU_INCLUDE_DIR)

# Look for the library.
find_library(
  ICU_LIBRARY
  NAMES icuuc cygicuuc cygicuuc32
  DOC "Libraries to link against for the common parts of ICU")
mark_as_advanced(ICU_LIBRARY)

set(_ICU_USABLE FALSE)
if(ICU_INCLUDE_DIR AND ICU_LIBRARY)
  set(_ICU_USABLE TRUE)

  # Newer ICU releases keep the U_ICU_VERSION_*_NUM macros in uvernum.h, older
  # ones in uversion.h, so try both. Matching (instead of REGEX REPLACE) keeps
  # the whole header from ending up in the version variables when a macro is
  # absent.
  set(ICU_MAJOR_VERSION 0)
  set(ICU_MINOR_VERSION 0)
  set(_ICU_VERSION_PARSED FALSE)
  foreach(_icu_header uvernum.h uversion.h)
    set(_icu_header_path "${ICU_INCLUDE_DIR}/unicode/${_icu_header}")
    if(NOT _ICU_VERSION_PARSED AND EXISTS "${_icu_header_path}")
      file(READ "${_icu_header_path}" _ICU_VERSION_CONTENTS)
      if(_ICU_VERSION_CONTENTS MATCHES "#[ \t]*define[ \t]+U_ICU_VERSION_MAJOR_NUM[ \t]+([0-9]+)")
        set(ICU_MAJOR_VERSION "${CMAKE_MATCH_1}")
        set(_ICU_VERSION_PARSED TRUE)
        if(_ICU_VERSION_CONTENTS MATCHES "#[ \t]*define[ \t]+U_ICU_VERSION_MINOR_NUM[ \t]+([0-9]+)")
          set(ICU_MINOR_VERSION "${CMAKE_MATCH_1}")
        endif()
      endif()
    endif()
  endforeach()
  set(ICU_VERSION "${ICU_MAJOR_VERSION}.${ICU_MINOR_VERSION}")

  # Honour the minimum version given to find_package(), but only when the
  # installed version is actually known.
  if(_ICU_VERSION_PARSED AND ICU_FIND_VERSION AND ICU_VERSION VERSION_LESS ICU_FIND_VERSION)
    set(_ICU_USABLE FALSE)
    if(NOT ICU_FIND_QUIETLY)
      message(STATUS "ICU ${ICU_VERSION} was found in ${ICU_INCLUDE_DIR}, but at least version ${ICU_FIND_VERSION} is required")
    endif()
  endif()

  # Look for the ICU internationalization libraries
  find_library(
    ICU_I18N_LIBRARY
    NAMES icuin icui18n cygicuin cygicuin32
    DOC "Libraries to link against for ICU internationalization")
  mark_as_advanced(ICU_I18N_LIBRARY)
endif()

# Copy the results to the output variables.
if(_ICU_USABLE)
  set(ICU_FOUND 1)
  set(ICU_LIBRARIES ${ICU_LIBRARY})
  set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
  if(ICU_I18N_LIBRARY)
    set(ICU_I18N_FOUND 1)
    set(ICU_I18N_LIBRARIES ${ICU_I18N_LIBRARY})
  else()
    set(ICU_I18N_FOUND 0)
    set(ICU_I18N_LIBRARIES)
  endif()
else()
  set(ICU_FOUND 0)
  set(ICU_I18N_FOUND 0)
  set(ICU_LIBRARIES)
  set(ICU_I18N_LIBRARIES)
  set(ICU_INCLUDE_DIRS)
  set(ICU_VERSION)
  set(ICU_MAJOR_VERSION)
  set(ICU_MINOR_VERSION)
endif()

# Report the outcome, failing the configure step when ICU was REQUIRED.
if(ICU_FOUND)
  if(NOT ICU_FIND_QUIETLY)
    message(STATUS "Found ICU header files in ${ICU_INCLUDE_DIRS}")
    message(STATUS "Found ICU libraries: ${ICU_LIBRARIES}")
  endif()
else()
  if(ICU_FIND_REQUIRED)
    message(FATAL_ERROR "Could not find ICU")
  else()
    message(STATUS "Optional package ICU was not found")
  endif()
endif()
# Locate SigC++ 2.0.
#
# Results:
#   SigC++_FOUND        - whether SigC++ was located
#   SigC++_INCLUDE_DIRS - include directories needed to compile against SigC++
#   SigC++_LIBRARIES    - libraries to link for SigC++
include(LibFindMacros)

# Ask pkg-config (when available) where the package lives; the results are
# only used as extra search paths below.
libfind_pkg_check_modules(SigC++_PKGCONF sigc++-2.0)

# Public headers: <sigc++/sigc++.h>, usually below a sigc++-2.0 directory.
find_path(SigC++_INCLUDE_DIR
  NAMES sigc++/sigc++.h
  PATHS ${SigC++_PKGCONF_INCLUDE_DIRS}
  PATH_SUFFIXES sigc++-2.0)

# The generated configuration header is installed next to the library, not
# next to the public headers, so it needs its own search.
find_path(SigC++Config_INCLUDE_DIR
  NAMES sigc++config.h
  PATHS ${SigC++_PKGCONF_INCLUDE_DIRS} /usr
  PATH_SUFFIXES lib/sigc++-2.0/include)

# The library itself (sigc-2.0).
libfind_library(SigC++ sigc 2.0)

# Hand the names of the result variables to libfind_process, which fills in
# the _FOUND, _INCLUDE_DIRS and _LIBRARIES outputs.
# NOTE: singular variables belong to this library, plural ones to the
# libraries it depends on.
set(SigC++_PROCESS_LIBS SigC++_LIBRARY)
set(SigC++_PROCESS_INCLUDES SigC++_INCLUDE_DIR SigC++Config_INCLUDE_DIR)
libfind_process(SigC++)
# Find the amqpcpp client library (header amqpcpp.h, library amqpcpp).
#
# Input:
#   AMQP_DIR         - optional installation prefix used as a search hint
# Output:
#   AMQP_INCLUDE_DIR - directory containing amqpcpp.h
#   AMQP_LIBRARY     - full path of the amqpcpp library
#   <package>_FOUND  - set by find_package_handle_standard_args for the name
#                      the caller passed to find_package()
#   AMQP_FOUND       - same result under the historical name
set(_AMQP_REQUIRED_VARS AMQP_INCLUDE_DIR AMQP_LIBRARY)

# Find the include directories
find_path(AMQP_INCLUDE_DIR
  NAMES amqpcpp.h
  HINTS ${AMQP_DIR}/include /usr/include/amqpcpp
)

find_library(AMQP_LIBRARY
  NAMES amqpcpp
  HINTS ${AMQP_DIR}/lib /usr/lib/
)

set(AMQP_PROCESS_INCLUDES ${AMQP_INCLUDE_DIR})
set(AMQP_PROCESS_LIBS ${AMQP_LIBRARY})

# find_package_handle_standard_args() looks up <name>_FIND_REQUIRED and
# <name>_FIND_QUIETLY and sets <name>_FOUND, so <name> has to be the package
# name given to find_package(); otherwise REQUIRED is silently ignored.
if(CMAKE_FIND_PACKAGE_NAME)
  set(_AMQP_PACKAGE_NAME "${CMAKE_FIND_PACKAGE_NAME}")
else()
  set(_AMQP_PACKAGE_NAME amqpcpp)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(${_AMQP_PACKAGE_NAME} DEFAULT_MSG ${_AMQP_REQUIRED_VARS})

# Keep the variable the previous version of this module produced.
set(AMQP_FOUND ${${_AMQP_PACKAGE_NAME}_FOUND})
\ No newline at end of file
# Find the wsnlp worker library (header nlpworker.h, library wsnlp).
#
# Input:
#   WSNLP_DIR         - optional installation prefix used as a search hint
# Output:
#   WSNLP_INCLUDE_DIR - directory containing nlpworker.h
#   WSNLP_LIBRARY     - full path of the wsnlp library
#   <package>_FOUND   - set by find_package_handle_standard_args for the name
#                       the caller passed to find_package()
#   WSNLP_FOUND       - same result under the historical name
set(_WSNLP_REQUIRED_VARS WSNLP_INCLUDE_DIR WSNLP_LIBRARY)

# Find the include directories
find_path(WSNLP_INCLUDE_DIR
  NAMES nlpworker.h
  HINTS ${WSNLP_DIR}/include /usr/include/wsnlp
)

find_library(WSNLP_LIBRARY
  NAMES wsnlp
  HINTS ${WSNLP_DIR}/lib /usr/lib/
)

set(WSNLP_PROCESS_INCLUDES ${WSNLP_INCLUDE_DIR})
set(WSNLP_PROCESS_LIBS ${WSNLP_LIBRARY})

# find_package_handle_standard_args() looks up <name>_FIND_REQUIRED and
# <name>_FIND_QUIETLY and sets <name>_FOUND, so <name> has to be the package
# name given to find_package(); otherwise REQUIRED is silently ignored.
if(CMAKE_FIND_PACKAGE_NAME)
  set(_WSNLP_PACKAGE_NAME "${CMAKE_FIND_PACKAGE_NAME}")
else()
  set(_WSNLP_PACKAGE_NAME wsnlp)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(${_WSNLP_PACKAGE_NAME} DEFAULT_MSG ${_WSNLP_REQUIRED_VARS})

# Keep the variable the previous version of this module produced.
set(WSNLP_FOUND ${${_WSNLP_PACKAGE_NAME}_FOUND})
\ No newline at end of file
# libfind_package(<PREFIX> <package> [args...])
#
# Thin wrapper around find_package() for use inside a find module: everything
# after PREFIX is forwarded to find_package(), and QUIET / REQUIRED are added
# when the package named PREFIX (the one currently being searched for) was
# itself requested quietly / as required.
#
# The forwarded argument list is left in LIBFIND_PACKAGE_ARGS.
macro(libfind_package PREFIX)
  set(LIBFIND_PACKAGE_ARGS ${ARGN})
  if(${PREFIX}_FIND_QUIETLY)
    list(APPEND LIBFIND_PACKAGE_ARGS QUIET)
  endif()
  if(${PREFIX}_FIND_REQUIRED)
    list(APPEND LIBFIND_PACKAGE_ARGS REQUIRED)
  endif()
  find_package(${LIBFIND_PACKAGE_ARGS})
endmacro()
# libfind_pkg_check_modules(<PREFIX> <PKGNAME>)
#
# Queries pkg-config for PKGNAME and stores the answers in PREFIX_* variables.
# CMake 2.4 only ships the (since deprecated) UsePkgConfig module, while every
# later release provides pkg_check_modules() through FindPkgConfig, so both
# are supported. Unlike pkg_check_modules(), this helper can be called without
# first checking that pkg-config is available.
macro(libfind_pkg_check_modules PREFIX PKGNAME)
  if(NOT (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 4))
    # Any CMake other than 2.4: use FindPkgConfig, and quietly do nothing
    # when pkg-config itself is not installed.
    find_package(PkgConfig)
    if(PKG_CONFIG_FOUND)
      pkg_check_modules(${PREFIX} ${PKGNAME})
    endif()
  else()
    # CMake 2.4: fall back to the old module.
    include(UsePkgConfig)
    pkgconfig(${PKGNAME} ${PREFIX}_INCLUDE_DIRS ${PREFIX}_LIBRARY_DIRS ${PREFIX}_LDFLAGS ${PREFIX}_CFLAGS)
  endif()
endmacro()
# libfind_process(<PREFIX>)
#
# Final step of a find module, run after all paths have been searched for.
#
# Inputs (set by the caller):
#   PREFIX_PROCESS_INCLUDES - names of variables that each hold one include
#                             directory
#   PREFIX_PROCESS_LIBS     - names of variables that each hold one library
# Outputs:
#   PREFIX_FOUND, PREFIX_INCLUDE_DIRS, PREFIX_LIBRARIES
#
# When the package was requested as REQUIRED and something is missing, the
# content of every input variable is printed and configuration stops.
macro(libfind_process PREFIX)
  # Nothing to do when an earlier call already marked the package as found.
  if(NOT ${PREFIX}_FOUND)
    # Optimistic default; any missing entry below flips it back.
    set(${PREFIX}_FOUND TRUE)

    # Collect include directories.
    foreach(i ${${PREFIX}_PROCESS_INCLUDES})
      if(NOT ${i})
        set(${PREFIX}_FOUND FALSE)
      else()
        set(${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIRS} ${${i}})
        mark_as_advanced(${i})
      endif()
    endforeach()

    # Collect libraries.
    foreach(i ${${PREFIX}_PROCESS_LIBS})
      if(NOT ${i})
        set(${PREFIX}_FOUND FALSE)
      else()
        set(${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARIES} ${${i}})
        mark_as_advanced(${i})
      endif()
    endforeach()

    # Report success, or abort when a required package is incomplete.
    if(${PREFIX}_FOUND)
      if(NOT ${PREFIX}_FIND_QUIETLY)
        message(STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}")
      endif()
    elseif(${PREFIX}_FIND_REQUIRED)
      foreach(i ${${PREFIX}_PROCESS_INCLUDES} ${${PREFIX}_PROCESS_LIBS})
        message("${i}=${${i}}")
      endforeach()
      message(FATAL_ERROR "Required library ${PREFIX} NOT FOUND.\nInstall the library (dev version) and try again. If the library is already installed, use ccmake to set the missing variables manually.")
    endif()
  endif()
endmacro()
# libfind_library(<PREFIX> <basename> [<version>])
#
# Searches for a library and stores the result in PREFIX_LIBRARY.
# The candidate file names (kept in PREFIX_LIBNAMES) are built from basename,
# a "-vc80" / "-vc90" suffix when MSVC80 / MSVC90 is set, and, when a version
# is given, "-<version>" both as written and with every dot replaced by an
# underscore. Directories in PREFIX_PKGCONF_LIBRARY_DIRS are searched as well.
macro(libfind_library PREFIX basename)
  # Compiler-specific suffix; MSVC90 wins when both variables are set.
  if(MSVC90)
    set(TMP -vc90)
  elseif(MSVC80)
    set(TMP -vc80)
  else()
    set(TMP "")
  endif()

  if(${ARGC} GREATER 2)
    # Versioned name plus its dots-to-underscores variant.
    set(${PREFIX}_LIBNAMES ${basename}${TMP}-${ARGV2})
    string(REPLACE "." "_" TMP ${${PREFIX}_LIBNAMES})
    list(APPEND ${PREFIX}_LIBNAMES ${TMP})
  else()
    set(${PREFIX}_LIBNAMES ${basename}${TMP})
  endif()

  find_library(${PREFIX}_LIBRARY
    NAMES ${${PREFIX}_LIBNAMES}
    PATHS ${${PREFIX}_PKGCONF_LIBRARY_DIRS}
  )
endmacro()
[tag_converter]
mapping=/home/igor/services/tag_converter/mappings/nkjp2upos.tconv
[service]
tool = new_tag_converter
root = /samba/requests/
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 4
[logging]
port = 11125
local_log_level = INFO
[logging_levels]
__main__ = INFO
\ No newline at end of file
[REG]
pos=([a-z0-9]+):?.*
attr=:([a-z0-9]+)
[POS]
adja=ADJ
adjp=ADJ
adjc=ADJ
conj=CONJ
comp=CONJ
interp=PUNCT
pred=VERB
xxx=X
adv=ADV
imps=VERB
inf=VERB
pant=VERB
pcon=VERB
qub=PART
prep=ADP
siebie=PRON
subst=NOUN
depr=NOUN
ger=VERB
ppron12=PRON
ppron3=PRON
num=NUM
numcol=NUM
adj=ADJ
pact=VERB
ppas=VERB
winien=VERB
praet=VERB
bedzie=VERB
fin=VERB
impt=VERB
aglt=VERB
ign=ign
brev=X
burk=ADJ
interj=INTJ
[ATTR]
sg=Number=Sing
pl=Number=Plur
nom=Case=Nom
gen=Case=Gen
dat=Case=Dat
acc=Case=Acc
inst=Case=Ins
loc=Case=Loc
voc=Case=Voc
m1=Gender=Masc|Animacy=Hum
m2=Gender=Masc|Animacy=Anim
m3=Gender=Masc|Animacy=Inan
f=Gender=Fem
n=Gender=Neut
pri=Person=1
sec=Person=2
ter=Person=3
pos=Degree=Pos
com=Degree=Cmp
sup=Degree=Sup
imperf=Aspect=Imp
perf=Aspect=Perf
aff=Polarity=Pos
neg=Polarity=Neg
congr=
rec=
akc=
nakc=
npraep=
praep=
agl=
nagl=
nwok=
wok=
pun=
npun=
[REG]
pos=([a-z0-9]+):?.*
attr=:([a-z0-9]+)
[POS]
adja=ADJ
adjp=ADJ
adjc=ADJ
conj=CCONJ
comp=SCONJ
interp=PUNCT
pred=VERB
xxx=X
adv=ADV
imps=VERB
inf=VERB
pant=VERB
pcon=VERB
qub=PART
prep=ADP
siebie=PRON
subst=NOUN
depr=NOUN
ger=VERB
ppron12=PRON
ppron3=PRON
num=NUM
numcol=NUM
adj=ADJ
pact=VERB
ppas=VERB
winien=VERB
praet=VERB
bedzie=VERB
fin=VERB
impt=VERB
aglt=VERB
ign=ign
brev=X
burk=ADJ
interj=INTJ
[ATTR]
sg=Number=Sing
pl=Number=Plur
nom=Case=Nom
gen=Case=Gen
dat=Case=Dat
acc=Case=Acc
inst=Case=Ins
loc=Case=Loc
voc=Case=Voc
m1=Gender=Masc|Animacy=Hum
m2=Gender=Masc|Animacy=Anim
m3=Gender=Masc|Animacy=Inan
f=Gender=Fem
n=Gender=Neut
pri=Person=1
sec=Person=2
ter=Person=3
pos=Degree=Pos
com=Degree=Cmp
sup=Degree=Sup
imperf=Aspect=Imp
perf=Aspect=Perf
aff=Polarity=Pos
neg=Polarity=Neg
congr=
rec=
akc=
nakc=
npraep=
praep=
agl=
nagl=
nwok=
wok=
pun=
npun=
[REG]
pos=
attr=
[POS]
ADJ=ADJ
ADP=ADP
ADV=ADV
AUX=AUX
CONJ=CCONJ
DET=DET
INTJ=INTJ
NOUN=NOUN
NUM=NUM
PART=PART
PRON=PRON
PROPN=PROPN
PUNCT=PUNCT
SYM=SYM
VERB=VERB
X=X
SPACE=X
[ATTR]
#include <fstream>
#include <regex>
#include <boost/filesystem.hpp>
#include "nlpworker.h"
// region using
using namespace std;
using ptree = boost::property_tree::ptree;
using dir_iter = boost::filesystem::directory_iterator;
// endregion using
/**
 * NLPWorker implementation that rewrites the content of <ctag> elements
 * using part-of-speech mapping tables loaded from INI-formatted files.
 */
class tag_converter_Worker: public NLPWorker
{
public:
    /// Collects the paths of the mapping files named by the configuration.
    void static_init(ptree config);

    /// Loads the "POS" section of every collected mapping file.
    void init(ptree config);

    /// Converts the file at task_path and writes the result to output_path.
    void process(std::string task_path, ptree &config, std::string output_path);

private:
    /// Mapping file paths gathered by static_init(); shared by all instances.
    static std::vector<std::string> filenames;

    /// One child per mapping file, keyed by the file's base name.
    ptree mappings;
};

// Storage for the static member declared above.
std::vector<std::string> tag_converter_Worker::filenames;
/**
 * Records which mapping files init() should load.
 *
 * The location is read from the "tag_converter.mapping" configuration key
 * (default "mappings/"). It may name either
 *   - a directory, in which case every regular file directly inside it is
 *     registered, or
 *   - a single mapping file, in which case only that file is registered.
 *
 * The paths are appended to the static `filenames` list. As before, a
 * location that cannot be iterated (e.g. it does not exist) makes
 * boost::filesystem throw.
 */
void tag_converter_Worker::static_init(ptree config)
{
    const boost::filesystem::path mapping_path(
        config.get<std::string>("tag_converter.mapping", "mappings/"));

    // A configuration pointing straight at one mapping file is accepted too;
    // opening a directory iterator on a regular file would throw.
    if(boost::filesystem::is_regular_file(mapping_path))
    {
        filenames.emplace_back(mapping_path.string());
        return;
    }

    const dir_iter end_itr; // default construction yields past-the-end
    for(dir_iter itr(mapping_path); itr != end_itr; ++itr)
    {
        if(boost::filesystem::is_regular_file(itr->status()))
        {
            filenames.emplace_back(itr->path().string());
        }
    }
}
/**
 * Loads every mapping file registered by static_init().
 *
 * Each file is parsed as INI and its "POS" section is stored in `mappings`
 * under the file's base name (the part after the last '/' and before the
 * last '.'), e.g. "dir/nkjp2upos.tconv" becomes "nkjp2upos".
 *
 * Parsing by file name (rather than through a separately opened stream)
 * makes an unreadable file fail inside read_ini with an error that names the
 * file. A file without a "POS" section still makes get_child() throw.
 *
 * @param config unused
 */
void tag_converter_Worker::init(ptree config)
{
    for(const auto &filename : filenames)
    {
        ptree mapping;
        boost::property_tree::read_ini(filename, mapping);

        const std::string::size_type beg = filename.find_last_of('/') + 1;
        const std::string::size_type dot = filename.find_last_of('.');
        // Only a dot inside the file name itself marks an extension; with no
        // such dot the whole file name is used.
        const std::string::size_type len =
            (dot != std::string::npos && dot >= beg) ? dot - beg : std::string::npos;

        mappings.add_child(filename.substr(beg, len), mapping.get_child("POS"));
    }
}
/**
 * Copies the file at task_path to output_path line by line, replacing the
 * content of every <ctag>...</ctag> element with its mapped part of speech.
 *
 * The mapping table is selected by the "mapping" key of `config` (default
 * "nkjp2upos") and must have been loaded by init(). The lookup key is the
 * tag text up to the first ':'; tags without an entry become "ign". Lines
 * without a complete <ctag>...</ctag> pair are copied unchanged.
 *
 * @param task_path   path of the input file
 * @param config      per-task options
 * @param output_path path of the file to write
 */
void tag_converter_Worker::process(string task_path, ptree &config, string output_path)
{
    constexpr std::string_view open_tag("<ctag>");
    constexpr std::string_view close_tag("</ctag>");

    std::ifstream input(task_path);
    std::ofstream output(output_path);

    // Reference, not a copy: the table can be large and is only read here.
    const auto &pos_map = mappings.get_child(config.get<std::string>("mapping","nkjp2upos"));

    std::string line;
    line.reserve(500);

    // Looping on getline() itself stops at end of input *and* on a stream
    // that failed to open, and never processes a line that was not read.
    while(std::getline(input, line))
    {
        const std::string_view text(line);
        std::string_view::size_type copied = 0; // how much of `text` is already written

        while(true)
        {
            const auto open = text.find(open_tag, copied);
            if(open == std::string_view::npos)
                break;
            const auto tag_beg = open + open_tag.size();
            const auto tag_end = text.find(close_tag, tag_beg);
            if(tag_end == std::string_view::npos)
                break;

            const std::string_view tag = text.substr(tag_beg, tag_end - tag_beg);
            const std::string_view pos = tag.substr(0, tag.find(':'));

            // Everything up to and including "<ctag>", then the mapped value;
            // the closing tag is emitted with the remainder of the line.
            output << text.substr(copied, tag_beg - copied)
                   << pos_map.get<std::string>(std::string(pos), "ign");
            copied = tag_end;
        }

        output << text.substr(copied) << '\n';
    }
}
/// Program entry point: passes the command line on to run_workers,
/// instantiated for tag_converter_Worker.
int main(int argc, char** argv)
{
    run_workers<tag_converter_Worker>(argc, argv);
    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment