Skip to content
Snippets Groups Projects
Commit 5758750a authored by mateuszg's avatar mateuszg
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
# Build script for the tag_converter service executable.
#
# cmake_minimum_required() has to run before project() so that policy defaults
# are in place when the project is set up. 3.8 is the first release that
# accepts 17 as a value for CMAKE_CXX_STANDARD.
cmake_minimum_required(VERSION 3.8)
project(tag_converter)

# The service source uses std::string_view, so C++17 is mandatory.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Project-local find modules take precedence; any path supplied by the user or
# a parent project is preserved instead of being overwritten.
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeScripts" ${CMAKE_MODULE_PATH})

find_package(wsnlp 1.0.0 REQUIRED)
list(APPEND LIBS ${WSNLP_LIBRARY})

find_package(ICU 52.0 REQUIRED)
list(APPEND LIBS ${ICU_LIBRARIES})

# The source includes Boost headers directly, so Boost is not optional.
find_package(Boost REQUIRED COMPONENTS
  program_options system thread filesystem chrono atomic date_time regex)
# Kept for toolchains that rely on Boost auto-linking and therefore need the
# library directory on the linker search path.
link_directories(${Boost_LIBRARY_DIRS})
list(APPEND LIBS ${Boost_LIBRARIES})

find_package(amqpcpp REQUIRED)
# Prefer the library file located by the find module; fall back to the bare
# name (resolved by the linker) when the module did not provide one.
if(AMQP_LIBRARY)
  list(APPEND LIBS ${AMQP_LIBRARY})
else()
  list(APPEND LIBS amqpcpp)
endif()

find_package(Threads REQUIRED)
list(APPEND LIBS Threads::Threads)

add_executable(tag_converter tag_converter_service.cpp)

# Attach the header locations reported by the find modules. Variables that are
# unset or *-NOTFOUND are skipped so that they cannot break generation.
foreach(_tag_converter_inc_var
    WSNLP_INCLUDE_DIR ICU_INCLUDE_DIRS Boost_INCLUDE_DIRS AMQP_INCLUDE_DIR)
  if(${_tag_converter_inc_var})
    target_include_directories(tag_converter PRIVATE ${${_tag_converter_inc_var}})
  endif()
endforeach()

target_link_libraries(tag_converter PRIVATE ${LIBS})

# Place the executable in <build dir>/../bin, as before.
set_target_properties(tag_converter PROPERTIES
  RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/../bin")
# Finds the International Components for Unicode (ICU) Library
#
#  ICU_FOUND          - True if ICU found (and new enough, when a version was
#                       requested through find_package).
#  ICU_I18N_FOUND     - True if ICU's internationalization library found.
#  ICU_INCLUDE_DIRS   - Directory to include to get ICU headers
#                       Note: always include ICU headers as, e.g.,
#                       unicode/utypes.h
#  ICU_LIBRARIES      - Libraries to link against for the common ICU
#  ICU_I18N_LIBRARIES - Libraries to link against for ICU internationalization
#                       (note: in addition to ICU_LIBRARIES)
#  ICU_VERSION, ICU_MAJOR_VERSION, ICU_MINOR_VERSION
#                     - Version parsed from the ICU headers ("0.0" / 0 / 0 when
#                       it could not be determined).
mark_as_advanced(ICU_DIR)

# Look for the header file.
find_path(
  ICU_INCLUDE_DIR
  NAMES unicode/utypes.h
  DOC "Include directory for the ICU library")
mark_as_advanced(ICU_INCLUDE_DIR)

# Look for the library.
find_library(
  ICU_LIBRARY
  NAMES icuuc cygicuuc cygicuuc32
  DOC "Libraries to link against for the common parts of ICU")
mark_as_advanced(ICU_LIBRARY)

set(_icu_failure_detail "")

# Copy the results to the output variables.
if(ICU_INCLUDE_DIR AND ICU_LIBRARY)
  set(ICU_FOUND 1)
  set(ICU_LIBRARIES ${ICU_LIBRARY})
  set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
  set(ICU_MAJOR_VERSION 0)
  set(ICU_MINOR_VERSION 0)

  # The version macros live in unicode/uvernum.h in current ICU releases and
  # in unicode/uversion.h in old ones, so try both. Only the matching #define
  # lines are read; when nothing matches the version stays 0.0 instead of
  # ending up as the full text of the header.
  foreach(_icu_header uvernum.h uversion.h)
    set(_icu_header_path "${ICU_INCLUDE_DIR}/unicode/${_icu_header}")
    if(EXISTS "${_icu_header_path}")
      file(STRINGS "${_icu_header_path}" _icu_version_lines
        REGEX "^#[ \t]*define[ \t]+U_ICU_VERSION_(MAJOR|MINOR)_NUM[ \t]+[0-9]+")
      foreach(_icu_line IN LISTS _icu_version_lines)
        if(_icu_line MATCHES "U_ICU_VERSION_MAJOR_NUM[ \t]+([0-9]+)")
          set(ICU_MAJOR_VERSION "${CMAKE_MATCH_1}")
        elseif(_icu_line MATCHES "U_ICU_VERSION_MINOR_NUM[ \t]+([0-9]+)")
          set(ICU_MINOR_VERSION "${CMAKE_MATCH_1}")
        endif()
      endforeach()
      if(NOT "${_icu_version_lines}" STREQUAL "")
        break()
      endif()
    endif()
  endforeach()
  set(ICU_VERSION "${ICU_MAJOR_VERSION}.${ICU_MINOR_VERSION}")

  # Look for the ICU internationalization libraries
  find_library(
    ICU_I18N_LIBRARY
    NAMES icuin icui18n cygicuin cygicuin32
    DOC "Libraries to link against for ICU internationalization")
  mark_as_advanced(ICU_I18N_LIBRARY)
  if(ICU_I18N_LIBRARY)
    set(ICU_I18N_FOUND 1)
    set(ICU_I18N_LIBRARIES ${ICU_I18N_LIBRARY})
  else()
    set(ICU_I18N_FOUND 0)
    set(ICU_I18N_LIBRARIES)
  endif()

  # Honour the minimum version passed to find_package(ICU <version>). The
  # check is skipped when the installed version could not be parsed.
  if(ICU_FIND_VERSION
      AND ICU_MAJOR_VERSION GREATER 0
      AND ICU_VERSION VERSION_LESS "${ICU_FIND_VERSION}")
    set(ICU_FOUND 0)
    set(ICU_I18N_FOUND 0)
    set(_icu_failure_detail
      ": found version ${ICU_VERSION}, but at least ${ICU_FIND_VERSION} is required")
  endif()
else()
  set(ICU_FOUND 0)
  set(ICU_I18N_FOUND 0)
  set(ICU_LIBRARIES)
  set(ICU_I18N_LIBRARIES)
  set(ICU_INCLUDE_DIRS)
  set(ICU_VERSION)
  set(ICU_MAJOR_VERSION)
  set(ICU_MINOR_VERSION)
endif()

# Report the outcome, respecting the QUIET / REQUIRED flags of find_package.
if(ICU_FOUND)
  if(NOT ICU_FIND_QUIETLY)
    message(STATUS "Found ICU header files in ${ICU_INCLUDE_DIRS}")
    message(STATUS "Found ICU libraries: ${ICU_LIBRARIES}")
  endif()
else()
  if(ICU_FIND_REQUIRED)
    message(FATAL_ERROR "Could not find ICU${_icu_failure_detail}")
  else()
    message(STATUS "Optional package ICU was not found${_icu_failure_detail}")
  endif()
endif()
# Find module for SigC++-2.0.
#
# Result variables (filled in by libfind_process):
#   SigC++_FOUND        - system has SigC++
#   SigC++_INCLUDE_DIRS - the SigC++ include directories
#   SigC++_LIBRARIES    - link these to use SigC++
include(LibFindMacros)

# Ask pkg-config for path hints; the results use the SigC++_PKGCONF prefix.
libfind_pkg_check_modules(SigC++_PKGCONF sigc++-2.0)

# Separate configuration header, looked up below the library directory.
find_path(SigC++Config_INCLUDE_DIR
  NAMES         sigc++config.h
  PATHS         ${SigC++_PKGCONF_INCLUDE_DIRS} /usr
  PATH_SUFFIXES lib/sigc++-2.0/include
)

# Main include directory.
find_path(SigC++_INCLUDE_DIR
  NAMES         sigc++/sigc++.h
  PATHS         ${SigC++_PKGCONF_INCLUDE_DIRS}
  PATH_SUFFIXES sigc++-2.0
)

# The library itself: base name "sigc", version suffix "2.0".
libfind_library(SigC++ sigc 2.0)

# Name the variables holding the include directories and the library, then let
# libfind_process compute the result variables and report the outcome.
# NOTE: singular variables for this library, plural for libraries that this
# library depends on.
set(SigC++_PROCESS_LIBS SigC++_LIBRARY)
set(SigC++_PROCESS_INCLUDES SigC++_INCLUDE_DIR SigC++Config_INCLUDE_DIR)
libfind_process(SigC++)
# Find module for the amqpcpp client library (header amqpcpp.h, library
# amqpcpp).
#
# Optional hint:
#   AMQP_DIR         - installation prefix searched before the default locations
# Result variables:
#   AMQP_FOUND       - true when both the header and the library were found
#   AMQP_INCLUDE_DIR - directory containing amqpcpp.h
#   AMQP_LIBRARY     - full path of the amqpcpp library
set(_AMQP_REQUIRED_VARS AMQP_INCLUDE_DIR AMQP_LIBRARY)

# Find the include directories
find_path(AMQP_INCLUDE_DIR
  NAMES amqpcpp.h
  HINTS ${AMQP_DIR}/include /usr/include/amqpcpp
)
find_library(AMQP_LIBRARY
  NAMES amqpcpp
  HINTS ${AMQP_DIR}/lib /usr/lib/
)
set(AMQP_PROCESS_INCLUDES ${AMQP_INCLUDE_DIR})
set(AMQP_PROCESS_LIBS ${AMQP_LIBRARY})

include(FindPackageHandleStandardArgs)
# find_package_handle_standard_args() reads <name>_FIND_REQUIRED and
# <name>_FIND_QUIETLY and sets <name>_FOUND, so <name> has to be the name that
# was given to find_package(). Passing a different prefix silently disables
# REQUIRED handling. The AMQP prefix is used only when this file is included
# outside of find_package().
if(CMAKE_FIND_PACKAGE_NAME)
  set(_AMQP_PACKAGE_NAME "${CMAKE_FIND_PACKAGE_NAME}")
else()
  set(_AMQP_PACKAGE_NAME AMQP)
endif()
find_package_handle_standard_args(${_AMQP_PACKAGE_NAME} DEFAULT_MSG ${_AMQP_REQUIRED_VARS})
# Keep the historical result variable available to existing callers.
set(AMQP_FOUND ${${_AMQP_PACKAGE_NAME}_FOUND})
\ No newline at end of file
# Find module for the wsnlp library (header nlpworker.h, library wsnlp).
#
# Optional hint:
#   WSNLP_DIR         - installation prefix searched before the default locations
# Result variables:
#   WSNLP_FOUND       - true when both the header and the library were found
#   WSNLP_INCLUDE_DIR - directory containing nlpworker.h
#   WSNLP_LIBRARY     - full path of the wsnlp library
set(_WSNLP_REQUIRED_VARS WSNLP_INCLUDE_DIR WSNLP_LIBRARY)

# Find the include directories
find_path(WSNLP_INCLUDE_DIR
  NAMES nlpworker.h
  HINTS ${WSNLP_DIR}/include /usr/include/wsnlp
)
find_library(WSNLP_LIBRARY
  NAMES wsnlp
  HINTS ${WSNLP_DIR}/lib /usr/lib/
)
set(WSNLP_PROCESS_INCLUDES ${WSNLP_INCLUDE_DIR})
set(WSNLP_PROCESS_LIBS ${WSNLP_LIBRARY})

include(FindPackageHandleStandardArgs)
# find_package_handle_standard_args() reads <name>_FIND_REQUIRED and
# <name>_FIND_QUIETLY and sets <name>_FOUND, so <name> has to be the name that
# was given to find_package(); names are case-sensitive. Passing a different
# prefix silently disables REQUIRED handling. The WSNLP prefix is used only
# when this file is included outside of find_package().
if(CMAKE_FIND_PACKAGE_NAME)
  set(_WSNLP_PACKAGE_NAME "${CMAKE_FIND_PACKAGE_NAME}")
else()
  set(_WSNLP_PACKAGE_NAME WSNLP)
endif()
find_package_handle_standard_args(${_WSNLP_PACKAGE_NAME} DEFAULT_MSG ${_WSNLP_REQUIRED_VARS})
# Keep the historical upper-case result variable available to existing callers.
set(WSNLP_FOUND ${${_WSNLP_PACKAGE_NAME}_FOUND})
\ No newline at end of file
# libfind_package(<PREFIX> <find_package arguments...>)
#
# Calls find_package() with the given arguments and additionally forwards the
# QUIET and REQUIRED flags that are in effect for the package currently being
# searched (<PREFIX>_FIND_QUIETLY / <PREFIX>_FIND_REQUIRED). The first
# parameter is the prefix of the current package; the remaining parameters,
# starting with the name of the new package, are passed to find_package().
#
# This is a macro, so LIBFIND_PACKAGE_ARGS is set in the caller's scope.
macro(libfind_package PREFIX)
  set(LIBFIND_PACKAGE_ARGS ${ARGN})
  if(${PREFIX}_FIND_QUIETLY)
    list(APPEND LIBFIND_PACKAGE_ARGS QUIET)
  endif()
  if(${PREFIX}_FIND_REQUIRED)
    list(APPEND LIBFIND_PACKAGE_ARGS REQUIRED)
  endif()
  find_package(${LIBFIND_PACKAGE_ARGS})
endmacro()
# libfind_pkg_check_modules(<PREFIX> <PKGNAME>)
#
# Version-independent pkg-config lookup. UsePkgConfig was deprecated in the
# same CMake release (2.6) that introduced pkg_check_modules, so both code
# paths are supported here to avoid deprecation warnings. Works like
# pkg_check_modules, except that no checks are needed before calling it: when
# pkg-config is not available the call does nothing.
macro(libfind_pkg_check_modules PREFIX PKGNAME)
  if(NOT (${CMAKE_MAJOR_VERSION} EQUAL 2 AND ${CMAKE_MINOR_VERSION} EQUAL 4))
    # Everything except CMake 2.4: use the PkgConfig module.
    find_package(PkgConfig)
    if(PKG_CONFIG_FOUND)
      pkg_check_modules(${PREFIX} ${PKGNAME})
    endif()
  else()
    # CMake 2.4: only the legacy UsePkgConfig interface exists.
    include(UsePkgConfig)
    pkgconfig(${PKGNAME} ${PREFIX}_INCLUDE_DIRS ${PREFIX}_LIBRARY_DIRS ${PREFIX}_LDFLAGS ${PREFIX}_CFLAGS)
  endif()
endmacro()
# libfind_process(<PREFIX>)
#
# Final processing step once the individual paths have been detected.
#
# Inputs:
#   <PREFIX>_PROCESS_INCLUDES - names of variables, each holding an include
#                               directory
#   <PREFIX>_PROCESS_LIBS     - names of variables, each holding a library file
# Outputs:
#   <PREFIX>_FOUND, <PREFIX>_INCLUDE_DIRS, <PREFIX>_LIBRARIES
#
# Prints a status line on success (unless QUIET was requested) and stops with
# a fatal error when a REQUIRED package is incomplete.
macro(libfind_process PREFIX)
  # Nothing to do when the package was already processed during this run.
  if(NOT ${PREFIX}_FOUND)
    # Optimistic start; any missing entry below flips the flag to FALSE.
    set(${PREFIX}_FOUND TRUE)

    # Collect the include directories.
    foreach(i ${${PREFIX}_PROCESS_INCLUDES})
      if(${i})
        set(${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIRS} ${${i}})
        mark_as_advanced(${i})
      else()
        set(${PREFIX}_FOUND FALSE)
      endif()
    endforeach()

    # Collect the libraries.
    foreach(i ${${PREFIX}_PROCESS_LIBS})
      if(${i})
        set(${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARIES} ${${i}})
        mark_as_advanced(${i})
      else()
        set(${PREFIX}_FOUND FALSE)
      endif()
    endforeach()

    # Report: fatal error for a missing required package, status line for a
    # package that was found and not requested quietly.
    if(NOT ${PREFIX}_FOUND)
      if(${PREFIX}_FIND_REQUIRED)
        # Dump every inspected variable to show which one is missing.
        foreach(i ${${PREFIX}_PROCESS_INCLUDES} ${${PREFIX}_PROCESS_LIBS})
          message("${i}=${${i}}")
        endforeach()
        message(FATAL_ERROR "Required library ${PREFIX} NOT FOUND.\nInstall the library (dev version) and try again. If the library is already installed, use ccmake to set the missing variables manually.")
      endif()
    elseif(NOT ${PREFIX}_FIND_QUIETLY)
      message(STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}")
    endif()
  endif()
endmacro()
# libfind_library(<PREFIX> <basename> [<version>])
#
# Searches for a library and stores the result in <PREFIX>_LIBRARY.
# The candidate name is <basename>, extended with "-vc80" / "-vc90" when the
# MSVC80 / MSVC90 variable is set. When a third argument is given, the
# candidates are "<name>-<version>" and the same string with every "." turned
# into "_". <PREFIX>_PKGCONF_LIBRARY_DIRS is used as additional search path.
#
# This is a macro, so TMP and <PREFIX>_LIBNAMES are set in the caller's scope.
macro(libfind_library PREFIX basename)
  # Compiler-specific suffix; MSVC90 wins when both variables are set.
  if(MSVC90)
    set(TMP -vc90)
  elseif(MSVC80)
    set(TMP -vc80)
  else()
    set(TMP "")
  endif()

  if(${ARGC} GREATER 2)
    set(${PREFIX}_LIBNAMES ${basename}${TMP}-${ARGV2})
    string(REPLACE "." "_" TMP ${${PREFIX}_LIBNAMES})
    set(${PREFIX}_LIBNAMES ${${PREFIX}_LIBNAMES} ${TMP})
  else()
    set(${PREFIX}_LIBNAMES ${basename}${TMP})
  endif()

  find_library(${PREFIX}_LIBRARY
    NAMES ${${PREFIX}_LIBNAMES}
    PATHS ${${PREFIX}_PKGCONF_LIBRARY_DIRS}
  )
endmacro()
[tag_converter]
mapping=/home/igor/services/tag_converter/mappings/nkjp2upos.tconv
[service]
tool = new_tag_converter
root = /samba/requests/
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 4
[logging]
port = 11125
local_log_level = INFO
[logging_levels]
__main__ = INFO
\ No newline at end of file
[REG]
pos=([a-z0-9]+):?.*
attr=:([a-z0-9]+)
[POS]
adja=ADJ
adjp=ADJ
adjc=ADJ
conj=CONJ
comp=CONJ
interp=PUNCT
pred=VERB
xxx=X
adv=ADV
imps=VERB
inf=VERB
pant=VERB
pcon=VERB
qub=PART
prep=ADP
siebie=PRON
subst=NOUN
depr=NOUN
ger=VERB
ppron12=PRON
ppron3=PRON
num=NUM
numcol=NUM
adj=ADJ
pact=VERB
ppas=VERB
winien=VERB
praet=VERB
bedzie=VERB
fin=VERB
impt=VERB
aglt=VERB
ign=ign
brev=X
burk=ADJ
interj=INTJ
[ATTR]
sg=Number=Sing
pl=Number=Plur
nom=Case=Nom
gen=Case=Gen
dat=Case=Dat
acc=Case=Acc
inst=Case=Ins
loc=Case=Loc
voc=Case=Voc
m1=Gender=Masc|Animacy=Hum
m2=Gender=Masc|Animacy=Anim
m3=Gender=Masc|Animacy=Inan
f=Gender=Fem
n=Gender=Neut
pri=Person=1
sec=Person=2
ter=Person=3
pos=Degree=Pos
com=Degree=Cmp
sup=Degree=Sup
imperf=Aspect=Imp
perf=Aspect=Perf
aff=Polarity=Pos
neg=Polarity=Neg
congr=
rec=
akc=
nakc=
npraep=
praep=
agl=
nagl=
nwok=
wok=
pun=
npun=
[REG]
pos=([a-z0-9]+):?.*
attr=:([a-z0-9]+)
[POS]
adja=ADJ
adjp=ADJ
adjc=ADJ
conj=CCONJ
comp=SCONJ
interp=PUNCT
pred=VERB
xxx=X
adv=ADV
imps=VERB
inf=VERB
pant=VERB
pcon=VERB
qub=PART
prep=ADP
siebie=PRON
subst=NOUN
depr=NOUN
ger=VERB
ppron12=PRON
ppron3=PRON
num=NUM
numcol=NUM
adj=ADJ
pact=VERB
ppas=VERB
winien=VERB
praet=VERB
bedzie=VERB
fin=VERB
impt=VERB
aglt=VERB
ign=ign
brev=X
burk=ADJ
interj=INTJ
[ATTR]
sg=Number=Sing
pl=Number=Plur
nom=Case=Nom
gen=Case=Gen
dat=Case=Dat
acc=Case=Acc
inst=Case=Ins
loc=Case=Loc
voc=Case=Voc
m1=Gender=Masc|Animacy=Hum
m2=Gender=Masc|Animacy=Anim
m3=Gender=Masc|Animacy=Inan
f=Gender=Fem
n=Gender=Neut
pri=Person=1
sec=Person=2
ter=Person=3
pos=Degree=Pos
com=Degree=Cmp
sup=Degree=Sup
imperf=Aspect=Imp
perf=Aspect=Perf
aff=Polarity=Pos
neg=Polarity=Neg
congr=
rec=
akc=
nakc=
npraep=
praep=
agl=
nagl=
nwok=
wok=
pun=
npun=
[REG]
pos=
attr=
[POS]
ADJ=ADJ
ADP=ADP
ADV=ADV
AUX=AUX
CONJ=CCONJ
DET=DET
INTJ=INTJ
NOUN=NOUN
NUM=NUM
PART=PART
PRON=PRON
PROPN=PROPN
PUNCT=PUNCT
SYM=SYM
VERB=VERB
X=X
SPACE=X
[ATTR]
#include <cstddef>
#include <fstream>
#include <regex>
#include <string>
#include <string_view>
#include <vector>

#include <boost/filesystem.hpp>
#include <boost/property_tree/ini_parser.hpp>
#include <boost/property_tree/ptree.hpp>

#include "nlpworker.h"
// region using
using namespace std;
using ptree = boost::property_tree::ptree;
using dir_iter = boost::filesystem::directory_iterator;
// endregion using
// Worker that rewrites the content of <ctag> elements in a task file using
// part-of-speech mappings loaded from INI mapping files.
class tag_converter_Worker : public NLPWorker
{
    // Loaded mappings: one child per mapping file, keyed by the file's base
    // name, holding that file's [POS] section.
    ptree mappings;

    // Paths of the mapping files collected by static_init(); shared by all
    // instances of the worker.
    static std::vector<std::string> filenames;

public:
    // Collects the mapping file paths named by the "tag_converter.mapping"
    // configuration entry into `filenames`.
    void static_init(ptree config);

    // Parses every collected mapping file into `mappings`.
    void init(ptree config);

    // Converts the file at task_path and writes the result to output_path;
    // the mapping is selected by the "mapping" entry of config.
    void process(std::string task_path, ptree &config, std::string output_path);
};

// Storage for the static list of mapping file paths.
std::vector<std::string> tag_converter_Worker::filenames;
// Collects the paths of the mapping files into the static `filenames` list.
//
// The location is read from the "tag_converter.mapping" configuration entry
// (default "mappings/"). It may name either
//   * a single regular file, which is registered as the only mapping, or
//   * a directory, from which every regular file is registered
//     (subdirectories are not descended into).
// Accepting a single file matters because a directory iterator cannot be
// opened on a regular file; the previous code threw in that situation.
// A path that is neither is handed to the directory iterator, which throws
// boost::filesystem::filesystem_error, exactly as before.
void tag_converter_Worker::static_init(ptree config)
{
    const boost::filesystem::path mapping_path(
        config.get<std::string>("tag_converter.mapping", "mappings/"));

    if (boost::filesystem::is_regular_file(mapping_path))
    {
        filenames.emplace_back(mapping_path.string());
        return;
    }

    // A default-constructed iterator is the past-the-end marker.
    for (dir_iter itr(mapping_path), end_itr; itr != end_itr; ++itr)
    {
        if (boost::filesystem::is_regular_file(itr->status()))
        {
            filenames.emplace_back(itr->path().string());
        }
    }
}
// Loads every mapping file collected by static_init() into `mappings`.
//
// Each file is parsed as INI and its [POS] section is stored under the file's
// base name: the part after the last '/' and before the last '.', e.g.
// "dir/nkjp2upos.tconv" is stored as "nkjp2upos".
//
// Throws boost::property_tree::ini_parser_error when a file cannot be opened
// or parsed (the error carries the file name), and
// boost::property_tree::ptree_bad_path when a file has no [POS] section.
// The `config` parameter is not used.
void tag_converter_Worker::init(ptree config)
{
    for (const std::string &filename : filenames)
    {
        // Reading by file name reports an unreadable file directly. Reading
        // from an unopened stream produced an empty tree and a misleading
        // "no such node (POS)" error further down.
        ptree mapping;
        boost::property_tree::read_ini(filename, mapping);

        // Base-name extraction with the missing-separator cases spelled out
        // rather than relying on unsigned wrap-around.
        const std::size_t slash = filename.find_last_of('/');
        const std::size_t beg = (slash == std::string::npos) ? 0 : slash + 1;
        const std::size_t dot = filename.find_last_of('.');
        const std::size_t len =
            (dot == std::string::npos || dot < beg) ? std::string::npos : dot - beg;

        const std::string name = filename.substr(beg, len);
        mappings.add_child(name, mapping.get_child("POS"));
    }
}
// Converts the tags of one task file.
//
// Reads task_path line by line and writes the result to output_path. In every
// "<ctag>...</ctag>" element the content is replaced by the mapped value of
// its part-of-speech prefix (the text before the first ':'); "ign" is written
// when the prefix has no entry in the mapping. Lines without a complete
// <ctag> element are copied unchanged.
//
// The mapping is selected by the "mapping" entry of `config` (default
// "nkjp2upos"); boost::property_tree::ptree_bad_path is thrown when no
// mapping with that name was loaded.
//
// Differences from the previous implementation:
//   * the loop is driven by std::getline(), so an input that cannot be
//     opened no longer loops forever and no spurious empty line is appended
//     after the last line of the input;
//   * positions are kept in std::size_t and a missing or misplaced "<ctag>"
//     is handled instead of producing an out-of-range offset;
//   * several <ctag> elements on one line are converted one by one instead
//     of being collapsed into a single element;
//   * the selected mapping is referenced, not copied, for every task.
void tag_converter_Worker::process(string task_path, ptree &config, string output_path)
{
    static const std::string open_tag = "<ctag>";
    static const std::string close_tag = "</ctag>";

    std::ifstream input(task_path);
    std::ofstream output(output_path);

    const ptree &pos_map = mappings.get_child(config.get<std::string>("mapping", "nkjp2upos"));

    std::string line;
    line.reserve(500);

    while (std::getline(input, line))
    {
        std::size_t search_from = 0;
        while (true)
        {
            const std::size_t open = line.find(open_tag, search_from);
            if (open == std::string::npos)
                break;

            const std::size_t beg = open + open_tag.size();
            const std::size_t end = line.find(close_tag, beg);
            if (end == std::string::npos)
                break;

            // Copy the prefix out of the line before the line is modified;
            // the view would dangle after replace().
            const std::string_view tag = std::string_view(line).substr(beg, end - beg);
            const std::string pos(tag.substr(0, tag.find_first_of(':')));
            const std::string new_pos = pos_map.get<std::string>(pos, "ign");

            line.replace(beg, end - beg, new_pos);

            // Continue behind the closing tag of the element just rewritten.
            search_from = beg + new_pos.size() + close_tag.size();
        }
        output << line << '\n';
    }
}
// Program entry point: passes the command-line arguments to run_workers,
// instantiated for tag_converter_Worker.
int main(int argc, char *argv[])
{
    run_workers<tag_converter_Worker>(argc, argv);
    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment