Commit 5e8c74a0 authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

Initial commit

parents
model/*
\ No newline at end of file
model/*
ai-research-keyphrase-extraction/*
\ No newline at end of file
# Top-level build script for polem: a command-line executable, a shared
# library (polem-dev) and SWIG-generated Java and Python bindings.
cmake_minimum_required(VERSION 2.8)
project(polem)

# Compiler configuration. The explicit -std flag is kept next to
# CMAKE_CXX_STANDARD, exactly as the project has always passed it on UNIX.
if(UNIX)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -std=gnu++0x")
endif()
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Libraries declared without an explicit type are built as shared objects.
set(BUILD_SHARED_LIBS 1)

# Make the find modules and helper macros shipped in cmake/Modules visible.
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules")

# Install sub-directories, interpreted relative to CMAKE_INSTALL_PREFIX.
set(CMAKE_INSTALL_LIBDIR "lib")
set(CMAKE_INSTALL_INCLUDEDIR "include")
# Helper macros used by the find modules
# (presumably provided from cmake/Modules -- confirm).
include(LibFindMacros)

# Java toolchain: JNI headers for the wrapper, javac/jar for packaging.
find_package(JNI REQUIRED)
find_package(Java REQUIRED)
include(UseJava)

# Every dependency below contributes its include directories globally and
# appends its libraries to LIBS, the common link list for all targets.

# Boost (only program_options is requested).
find_package(Boost REQUIRED COMPONENTS program_options)
include_directories(${Boost_INCLUDE_DIRS})
list(APPEND LIBS ${Boost_LIBRARIES})

# Corpus2.
find_package(Corpus2 REQUIRED)
include_directories(${Corpus2_INCLUDE_DIRS})
list(APPEND LIBS ${Corpus2_LIBRARY})

# ICU: the base libraries followed by the i18n component.
find_package(ICU REQUIRED)
include_directories(${ICU_INCLUDE_DIRS})
list(APPEND LIBS ${ICU_LIBRARIES} ${ICU_I18N_LIBRARIES})
# WCCL is located by hand with find_library/find_path instead of a find
# module. The library is searched in the usual lib directories; the header
# directory (a folder named "libwccl") is searched in the include
# directories as well as the lib directories that were hinted historically.
find_library(WCCL_LIBRARY wccl /usr/lib/ /usr/local/lib/)
find_path(WCCL_INCLUDE_DIR libwccl
  /usr/include /usr/local/include
  /usr/lib/ /usr/local/lib)

if(WCCL_LIBRARY AND WCCL_INCLUDE_DIR)
  message(STATUS "Found WCCL libraries: ${WCCL_LIBRARY}")
  include_directories(${WCCL_INCLUDE_DIR})
  set(LIBS ${LIBS} ${WCCL_LIBRARY})
else()
  # WCCL stays optional (configuration continues), but the omission is now
  # reported instead of surfacing later as an unexplained compile/link error.
  message(WARNING
    "WCCL was not found (library: '${WCCL_LIBRARY}', "
    "include dir: '${WCCL_INCLUDE_DIR}'); building without it.")
endif()
# Morfeusz2 (its find module exports upper-case MORFEUSZ2_* variables).
find_package(Morfeusz2 REQUIRED)
include_directories(${MORFEUSZ2_INCLUDE_DIR})
list(APPEND LIBS ${MORFEUSZ2_LIBRARY})

# PwrUtils: both the plural and the singular result variable are appended,
# so whichever one the find module defines ends up in LIBS.
find_package(PwrUtils REQUIRED)
include_directories(${PwrUtils_INCLUDE_DIRS})
list(APPEND LIBS ${PwrUtils_LIBRARIES} ${PwrUtils_LIBRARY})
# Sources of the polem executable: main.cpp, the bundled pugixml and the
# lemmatizer implementation (headers are listed alongside the sources).
set(SOURCE_FILES
  polem/main.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/pugixml-1.8/src/pugixml.cpp
  polem/CascadeLemmatizer.cpp
  polem/RuleLemmatizer.cpp
  polem/DictionaryLemmatizer.cpp
  polem/Inflection.cpp
  polem/NamLivPersonLemmatizer.cpp
  polem/NamLocLemmatizer.cpp
  polem/OrthLemmatizer.cpp
  polem/CascadeLemmatizer.h
  polem/RuleLemmatizer.h
  polem/DictionaryLemmatizer.h
  polem/Inflection.h
  polem/NamLivPersonLemmatizer.h
  polem/NamLocLemmatizer.h
  polem/OrthLemmatizer.h
  polem/InflectionRule.h
  polem/InflectionRule.cpp
)

# The same implementation without main.cpp, using absolute paths; shared by
# the polem-dev library and the SWIG Java module.
set(LIBRARY_FILES
  ${CMAKE_CURRENT_SOURCE_DIR}/pugixml-1.8/src/pugixml.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/CascadeLemmatizer.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/RuleLemmatizer.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/DictionaryLemmatizer.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/Inflection.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/NamLivPersonLemmatizer.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/NamLocLemmatizer.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/OrthLemmatizer.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/CascadeLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/RuleLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/DictionaryLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/Inflection.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/NamLivPersonLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/NamLocLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/OrthLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/InflectionRule.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/InflectionRule.cpp
)

# Headers that are installed together with the polem-dev library.
set(PUBLIC_HEADERS
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/CascadeLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/DictionaryLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/NamLivPersonLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/NamLocLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/RuleLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/OrthLemmatizer.h
  ${CMAKE_CURRENT_SOURCE_DIR}/polem/Inflection.h
)

# Pre-generated SWIG artefacts kept in the source tree.
set(WRAPPER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/javawrap/WrapLem_wrap.cxx)
set(PY_FILES ${CMAKE_CURRENT_SOURCE_DIR}/pythonwrap/WrapLem_wrap.cxx)
set(PY_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/pythonwrap/WrapLem.py)
# Install every entry found directly under dict/ into <prefix>/share/polem.
# The glob is evaluated at configure time, so entries added to dict/ later
# are only picked up after CMake is re-run.
file(GLOB DEPLOY_FILES_AND_DIRS "${PROJECT_SOURCE_DIR}/dict/*")
list(APPEND FILES_TO_DEPLOY ${DEPLOY_FILES_AND_DIRS})
install(FILES ${FILES_TO_DEPLOY} DESTINATION share/polem)
# Command-line lemmatizer.
# Only real source files are handed to add_executable(); the dependency
# libraries are attached through target_link_libraries(). Passing ${LIBS} as
# sources breaks configuration as soon as an entry is not an existing file
# (an imported target, a -l flag, or a debug/optimized keyword).
add_executable(polem ${SOURCE_FILES})
target_link_libraries(polem ${LIBS})

# Relative destination: resolves to /usr/local/bin with the default UNIX
# prefix, but follows CMAKE_INSTALL_PREFIX like the other install rules.
install(TARGETS polem RUNTIME DESTINATION bin)
# Shared library exposing the lemmatizer implementation to other programs
# and to the SWIG Python module.
add_library(polem-dev SHARED ${LIBRARY_FILES})

# PUBLIC_HEADER takes ONE value, so the header list must be quoted to stay a
# single ';'-separated argument. Unquoted, set_target_properties() reads the
# expanded list as alternating property/value pairs: only the first header
# ends up in PUBLIC_HEADER and the rest become meaningless properties.
set_target_properties(polem-dev PROPERTIES
  PUBLIC_HEADER "${PUBLIC_HEADERS}")

# The library's own sources include headers relative to polem/; consumers
# include them relative to the project root.
target_include_directories(polem-dev PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/polem)
target_include_directories(polem-dev PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# Same libraries, in the same order, as the previous four separate calls.
target_link_libraries(polem-dev
  ${ICU_I18N_LIBRARIES}
  ${ICU_LIBRARIES}
  ${Corpus2_LIBRARY}
  ${LIBS})

# Installs the shared object and, through the PUBLIC_HEADER property, every
# public header into <prefix>/include/polem-dev. A separate install(FILES)
# rule for the headers is no longer needed.
install(TARGETS polem-dev
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/polem-dev)
##### Java bindings (SWIG)
find_package(SWIG REQUIRED)
include(${SWIG_USE_FILE})
include(UseSWIG)
include_directories(${JNI_INCLUDE_DIRS})

# Generated .java files go to <build>/java and are placed in the Java
# package g419.polem.
set(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}/java)
set(CMAKE_SWIG_FLAGS -package g419.polem)

# Process the interface file in C++ mode.
set_source_files_properties(WrapLem.i PROPERTIES CPLUSPLUS ON)

# The Java module compiles the lemmatizer sources directly into the wrapper
# library (LIBRARY_FILES is part of its source list).
swig_add_module(PolemJava java WrapLem.i ${LIBRARY_FILES})
target_include_directories(PolemJava PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${JAVA_INCLUDE_PATH}
  ${JAVA_INCLUDE_PATH2})
target_link_libraries(PolemJava ${LIBS})

# After the native library is built, compile the generated Java sources into
# <build>/classes and pack them into PolemJava.jar. The java/*.java wildcard
# is expanded by the shell that runs the command, so VERBATIM must not be
# added here.
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/classes)
add_custom_command(TARGET PolemJava POST_BUILD
  COMMAND ${Java_JAVAC_EXECUTABLE} -d classes java/*.java
  COMMAND ${Java_JAR_EXECUTABLE} -cfM PolemJava.jar -C classes .
)

install(TARGETS PolemJava DESTINATION ${CMAKE_INSTALL_LIBDIR})
#### Python bindings (SWIG)
find_package(PythonInterp REQUIRED)
find_package(PythonLibs REQUIRED)
include_directories(${PYTHON_INCLUDE_PATH})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Clear the Java-only "-package" flag before generating the Python wrapper.
set(CMAKE_SWIG_FLAGS "")
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/WrapLem.i PROPERTIES CPLUSPLUS ON)
swig_add_module(WrapLem python ${CMAKE_CURRENT_SOURCE_DIR}/WrapLem.i)
swig_link_libraries(WrapLem polem-dev ${LIBS} ${PYTHON_LIBRARIES})

# Ask the interpreter located by find_package(PythonInterp) for its
# site-packages directory. print() is written as a function call so the
# snippet is valid on both Python 2 and Python 3; the former
# "print get_python_lib()" is a SyntaxError on Python 3 and left the install
# destination empty.
execute_process(
  COMMAND "${PYTHON_EXECUTABLE}" -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"
  RESULT_VARIABLE polem_site_packages_status
  OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
  OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT polem_site_packages_status EQUAL 0 OR NOT PYTHON_SITE_PACKAGES)
  message(FATAL_ERROR
    "Could not determine the Python site-packages directory using "
    "'${PYTHON_EXECUTABLE}' (exit status: ${polem_site_packages_status}).")
endif()

# UseSWIG publishes the real target name in SWIG_MODULE_<name>_REAL_NAME
# ("_WrapLem" with the legacy naming), so it is not hard-coded here.
install(TARGETS ${SWIG_MODULE_WrapLem_REAL_NAME}
  LIBRARY DESTINATION ${PYTHON_SITE_PACKAGES})
install(FILES ${PY_HEADERS}
  DESTINATION ${PYTHON_SITE_PACKAGES})
# Helper stage: used only as a source of prebuilt artefacts for the
# COPY --from instructions further down.
FROM clarinpl/builder AS builder
# Final image: everything below is installed on top of this Python 3.6 base.
FROM clarinpl/python:3.6
# Development packages, SWIG and a JDK installed from the distribution.
# The CMake build run later in this file looks for Boost, ICU, SWIG and
# JNI/Java; the purpose of the remaining packages is not visible here.
RUN apt-get update && apt-get install -y \
libxml++2.6-dev \
libloki-dev \
libboost-all-dev \
libicu-dev \
libffi-dev \
libssl-dev \
libxml2-utils \
swig \
openjdk-8-jdk
# Replace the packaged CMake with the upstream 3.16.0-rc2 binary release:
# remove the package, download and unpack the tarball in /tmp, and symlink
# the binary to /usr/bin/cmake.
# NOTE(review): the symlink points into /tmp, which a later build step
# empties with "rm -r /tmp/*"; cmake is therefore only usable until then.
WORKDIR /tmp/
RUN apt remove -y cmake && \
wget https://github.com/Kitware/CMake/releases/download/v3.16.0-rc2/cmake-3.16.0-rc2-Linux-x86_64.tar.gz && \
tar -xzf cmake*tar.gz && \
ln -s $(pwd)/cmake*/bin/cmake /usr/bin/cmake
# Take the prebuilt corpus2 and wccl install trees from the builder stage and
# overlay them onto the filesystem root; copy the morfeusz shared libraries
# into /usr/lib.
COPY --from=builder /install/corpus2 /
COPY --from=builder /install/wccl /
COPY --from=builder /usr/lib/libmorfeusz* /usr/lib/
# Install POLEM.
# Steps, all in one layer: download a .deb (saved as "morf") and install it
# with dpkg, clone the Polem sources, configure/build/install them with
# CMake + make, refresh the dynamic linker cache, then empty /tmp.
# The working directory is still /tmp here, so the clone and the build tree
# are removed by the final "rm -r /tmp/*".
RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/download' && \
dpkg -i ./morf && \
git clone https://github.com/gkubon/Polem && \
mkdir -p Polem/build && \
cd Polem/build && \
cmake .. && \
make -j && \
make install && \
ldconfig && \
cd / && rm -r /tmp/*
# Upgrade pip and install Cython before the project requirements.
RUN python3.6 -m pip install pip --upgrade && \
python3.6 -m pip install --no-cache-dir Cython
# From here on the working directory is /home/worker.
WORKDIR /home/worker
# Copy only requirements.txt before installing, so this layer is rebuilt
# only when the requirements change.
COPY requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
# Install sent2vec.
# The package index is refreshed in the same layer as the install: relying
# on the index downloaded by an earlier (possibly cached) layer can make
# apt-get request package versions that no longer exist on the mirrors.
# The sources are pinned to a fixed commit for reproducible builds, compiled
# with make, and the Python extension from src/ is then built and installed
# with pip.
RUN apt-get update && \
    apt-get -y install git g++ make && \
    git clone https://github.com/epfml/sent2vec && \
    cd sent2vec && \
    git checkout f827d014a473aa22b2fef28d9e29211d50808d48 && \
    make -j && \
    cd src && \
    python3.6 setup.py build_ext && \
    python3.6 -m pip install .
# Download the NLTK 'punkt' resource at image build time so that it does not
# have to be fetched when the container starts.
RUN python3.6 -c "import nltk; nltk.download('punkt')"
1. Get model
curl https://minio.clarin-pl.eu/public/models/kgr10.bin --create-dirs -o model/kgr10.bin
2. Get code
git clone https://gitlab.clarin-pl.eu/embedrankgroup/ai-research-keyphrase-extraction.git -b polish-azon-improvement --single-branch
\ No newline at end of file
ai-research-keyphrase-extraction @ a938b716
Subproject commit a938b71687915898c3bab855f25f2a357ab6223b
# Service-level settings: working root directory, tool name and the
# rabbit_* connection parameters (presumably RabbitMQ -- confirm).
# NOTE(review): the broker credentials are committed in plain text; consider
# supplying them through the deployment environment instead.
[service]
root = /samba/requests/
tool = embedRank
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
# Number of workers (presumably parallel worker processes -- confirm).
[tool]
workers_number = 2
# Logging endpoint port and the local log level.
[logging]
port = 9998
local_log_level = INFO
# Per-logger levels, keyed by logger name.
[logging_levels]
__main__ = INFO
embedrank_worker = INFO
# model_path is read by EmbedRankWorker.static_init and passed to
# EmbeddingDistributorLocal.
# NOTE(review): nothing in the visible build files places a model at this
# path; presumably it is mounted at runtime -- confirm.
[SENT2VEC]
model_path =/sent2vec/pretrained_model.bin
# Not read by the worker code shown in this file.
[STANFORDTAGGER]
jar_path = /stanford-tagger/stanford-postagger.jar
model_directory_path =/stanford-tagger/models/
#!/usr/bin/python3.6
import logging
import os
import shutil

import WrapLem
from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import EmbeddingDistributorLocal
from swisscom_ai.research_keyphrase.model.input_representation import InputTextObj
from swisscom_ai.research_keyphrase.model.methods_embeddings import extract_candidates_embedding_for_doc
from swisscom_ai.research_keyphrase.model.method import MMRPhrase
from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingPL
from swisscom_ai.research_keyphrase.util.fileIO import read_file
from improvement import get_keywords
import nlp_ws
_log = logging.getLogger(__name__)
class EmbedRankWorker(nlp_ws.NLPWorker):
    """nlp_ws worker that extracts keyphrases from a document.

    The tagger, the sent2vec embedding distributor and the lemmatizer are
    loaded once in ``static_init`` and stored as class attributes; every
    ``process`` call reuses them.
    """

    @classmethod
    def static_init(cls, config):
        """Load the shared models and store them on the class.

        Args:
            config: mapping-like configuration object; the sent2vec model
                location is read from ``config['SENT2VEC']['model_path']``.
        """
        print("Worker started loading models")
        cls.ptagger = PosTaggingPL()
        print(" starting sent2vec")
        sent2vec_model = config['SENT2VEC']['model_path']
        cls.sent2vec = EmbeddingDistributorLocal(sent2vec_model)
        print(" startring lemmatizer ")
        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
        print("Worker finished loading models ")

    def saveResult(self, keywords_dict, outputFile):
        """Write the extracted keywords to ``outputFile``.

        Args:
            keywords_dict: indexable result whose items 0, 1 and 2 are the
                parallel sequences of keywords, scores and aliases; item 0
                may be ``None``, in which case an empty list is written.
            outputFile: path of the file to create or overwrite.

        The output is a bracketed sequence of ``str(dict)`` entries, each
        followed by ``', '``.
        NOTE(review): this is a Python literal, not strict JSON (single
        quotes, trailing comma), although ``process`` names the file
        ``embedrank.json``. The format is left unchanged because consumers
        may depend on it -- confirm before switching to ``json.dump``.
        """
        # The context manager closes the file even when building an entry
        # raises (for example when the three sequences differ in length).
        with open(outputFile, 'w') as out:
            out.write('[')
            if keywords_dict[0] is not None:
                for idx, keyword in enumerate(keywords_dict[0]):
                    element_dict = {'keyword': keyword, 'score': keywords_dict[1][idx], 'alias': keywords_dict[2][idx]}
                    out.write(str(element_dict) + ', ')
            out.write(']')

    def process(self, inputFile, taskOptions, outputFile):
        """Tag the input, extract keyphrases and store them under ``outputFile``.

        Args:
            inputFile: either a directory containing ``text.ccl`` or the
                path of a single input file.
            taskOptions: dict of task options; ``"N"`` (number of keyphrases)
                is set to ``"10"`` in place when it is missing.
            outputFile: output directory. It receives a copy of the input
                (as ``text.ccl`` for single-file input) and the result file
                ``embedrank.json``.
        """
        if "N" not in taskOptions:
            taskOptions["N"] = "10"

        if os.path.isdir(inputFile):
            # Directory input: mirror the whole directory, then tag its text.ccl.
            shutil.copytree(inputFile, outputFile)
            tagged, lemmas = self.ptagger.pos_tag_raw_text(inputFile + "/text.ccl")
            _log.info(lemmas)
        else:
            _log.info("CCL")
            # Only an already existing directory is tolerated; other failures
            # (e.g. missing permissions) are reported instead of being
            # swallowed and resurfacing in the copy below.
            os.makedirs(outputFile, exist_ok=True)
            tagged, lemmas = self.ptagger.pos_tag_raw_text(inputFile)
            shutil.copy2(inputFile, outputFile + "/text.ccl")

        text_obj = InputTextObj(tagged, 'en')
        keywords_dict = MMRPhrase(self.sent2vec, text_obj, lemmas, N=int(taskOptions['N']), beta=0.55, alias_threshold=0.7, lemmatizer=self.lemmatizer)
        self.saveResult(keywords_dict, outputFile + "/embedrank.json")
if __name__ == '__main__':
    # Script entry point: log the start-up and hand the worker class to the
    # nlp_ws service runner.
    _log.info("starting")
    nlp_ws.NLPService.main(EmbedRankWorker)
nlp-ws
langdetect==1.0.7
nltk==3.2.4
numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
six==1.10.0
requests>=2.0
lxml
configparser
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment