Commit 4452a928 authored by Mateusz Gniewkowski

Merge branch 'dev' into 'master'

Embedrank worker

See merge request !1
parents 9b5f5a79 463d4799
Pipeline #4047 passed with stage
in 10 minutes and 16 seconds
model/*
\ No newline at end of file
# Builds the worker image and pushes it to the registry.
# Runs only for the master branch and needs the docker-in-docker service.
build_image:
  image: docker:18.09.7
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  script:
    - docker build -t clarinpl/embedrank .
    # Pipe the secret straight into `docker login` instead of staging it in a
    # file inside the job workspace; quoting keeps special characters intact.
    - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USERNAME" --password-stdin
    - docker push clarinpl/embedrank
FROM clarinpl/builder AS builder
FROM clarinpl/python:3.6
RUN apt-get update && apt-get install -y \
libxml++2.6-dev \
libloki-dev \
libboost-all-dev \
libicu-dev \
libffi-dev \
libssl-dev \
libxml2-utils \
swig \
openjdk-8-jdk
curl \
corpus2-python3.6 \
wccl-python3.6
WORKDIR /tmp/
RUN apt remove -y cmake && \
......@@ -19,12 +12,6 @@ RUN apt remove -y cmake && \
tar -xzf cmake*tar.gz && \
ln -s $(pwd)/cmake*/bin/cmake /usr/bin/cmake
COPY --from=builder /install/corpus2 /
COPY --from=builder /install/wccl /
COPY --from=builder /usr/lib/libmorfeusz* /usr/lib/
#install POLEM
RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/download' && \
dpkg -i ./morf && \
......@@ -40,9 +27,10 @@ RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/dow
RUN python3.6 -m pip install pip --upgrade && \
python3.6 -m pip install --no-cache-dir Cython
WORKDIR /home/worker
COPY requirements.txt .
COPY entrypoint.sh /entrypoint.sh
COPY embedrank_worker.py .
RUN python3.6 -m pip install -r requirements.txt
# Install sent2vec
......@@ -55,6 +43,8 @@ RUN apt-get -y install git g++ make && \
python3.6 setup.py build_ext && \
python3.6 -m pip install .
# Download NLTK data
RUN python3.6 -c "import nltk; nltk.download('punkt')"
RUN ["chmod", "+x", "/entrypoint.sh"]
CMD ["/entrypoint.sh"]
#!/usr/bin/python3.6
import WrapLem
from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import EmbeddingDistributorLocal
from swisscom_ai.research_keyphrase.model.input_representation import InputTextObj
from swisscom_ai.research_keyphrase.model.methods_embeddings import extract_candidates_embedding_for_doc
from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import (
EmbeddingDistributorLocal,
)
from swisscom_ai.research_keyphrase.model.input_representation import (
InputTextObj,
)
from swisscom_ai.research_keyphrase.model.methods_embeddings import (
extract_candidates_embedding_for_doc,
)
from swisscom_ai.research_keyphrase.model.method import MMRPhrase
from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingPL
from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingPL
from swisscom_ai.research_keyphrase.util.fileIO import read_file
from improvement import get_keywords
import nlp_ws
import logging
import os,shutil
import os
import shutil
_log = logging.getLogger(__name__)
class EmbedRankWorker(nlp_ws.NLPWorker):
    """nlp_ws worker that extracts scored keywords from a CCL document.

    The models are loaded once in ``static_init`` and stored as class
    attributes (``ptagger``, ``sent2vec``, ``lemmatizer``); ``process`` reads
    them for every task.
    """

    @classmethod
    def static_init(cls, config):
        """Load the POS tagger, the sent2vec embeddings and the lemmatizer.

        Args:
            config: Service configuration; ``config["SENT2VEC"]["model_path"]``
                must hold the path of the sent2vec model to load.
        """
        print("Worker started loading models")
        cls.ptagger = PosTaggingPL()
        print("starting sent2vec")
        sent2vec_model = config["SENT2VEC"]["model_path"]
        cls.sent2vec = EmbeddingDistributorLocal(sent2vec_model)
        print("starting lemmatizer")
        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
        print("Worker finished loading models ")

    def saveResult(self, keywords, scores, outputFile):
        """Write the keyword/score pairs to ``outputFile``.

        Args:
            keywords: Sequence of keywords.
            scores: Sequence of scores, indexed in parallel with ``keywords``.
            outputFile: Path of the file to create or overwrite.

        Raises:
            IndexError: If ``scores`` is shorter than ``keywords``.
        """
        # NOTE: the payload is the Python repr of each dict followed by ", "
        # (single-quoted keys, trailing separator), not strict JSON. The
        # format is kept byte-for-byte because existing consumers of the file
        # may depend on it.
        # The context manager closes the file even if a write fails, and the
        # explicit encoding keeps non-ASCII keywords writable regardless of
        # the container locale.
        with open(outputFile, "w", encoding="utf-8") as file:
            file.write("[")
            for idx, keyword in enumerate(keywords):
                element_dict = {"keyword": keyword, "score": scores[idx]}
                file.write(str(element_dict) + ", ")
            file.write("]")

    def process(self, inputFile, taskOptions, outputFile):
        """Tag the input, extract keywords and store them next to the CCL.

        ``outputFile`` is always a directory. It receives ``text.ccl`` (the
        input document) and ``embedrank.json`` (the extracted keywords).

        Args:
            inputFile: Either a directory containing ``text.ccl`` or the path
                of a single CCL file.
            taskOptions: Task options; ``"N"`` is converted with ``int`` and
                passed to ``get_keywords`` (defaults to 10 when absent).
            outputFile: Path of the output directory.
        """
        # Read the option without writing the default back into the caller's
        # dictionary.
        keyword_count = int(taskOptions.get("N", "10"))

        if os.path.isdir(inputFile):
            # Directory input: mirror it into the output directory, then tag
            # the CCL file it contains.
            shutil.copytree(inputFile, outputFile)
            tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text(
                os.path.join(inputFile, "text.ccl")
            )
        else:
            # Single-file input: the output directory may already exist, which
            # is not an error. Other failures are no longer silently ignored.
            os.makedirs(outputFile, exist_ok=True)
            tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text(
                inputFile
            )
            shutil.copy2(inputFile, os.path.join(outputFile, "text.ccl"))

        keywords, scores = get_keywords(
            tagged,
            lemmas,
            raw_text,
            self.sent2vec,
            self.lemmatizer,
            "avg",
            keyword_count,
        )
        self.saveResult(
            keywords, scores, os.path.join(outputFile, "embedrank.json")
        )
if __name__ == "__main__":
    # Script entry point: hand the worker class to the nlp_ws service runner.
    _log.info("starting")
    nlp_ws.NLPService.main(EmbedRankWorker)
#!/bin/bash
# Container entrypoint: fetch the sent2vec model if it is missing, pull the
# keyphrase-extraction sources into /home/worker and start the worker.

MODEL_PATH=/sent2vec/pretrained_model.bin

if [ ! -f "$MODEL_PATH" ]; then
    # Download under a temporary name and rename only on success. Without
    # --fail an HTTP error page (or a partial transfer) would be stored under
    # the final name and every later start would skip the download.
    if curl --fail https://minio.clarin-pl.eu/public/models/kgr10.bin --create-dirs -o "${MODEL_PATH}.part"; then
        mv "${MODEL_PATH}.part" "$MODEL_PATH"
    else
        echo "entrypoint: downloading the sent2vec model failed" >&2
        rm -f "${MODEL_PATH}.part"
    fi
fi

cd /home/worker || exit 1

# Credentials come from the environment; the URL is quoted so that special
# characters in them are not split or glob-expanded by the shell.
git clone "https://${GITLAB_USERNAME}:${GITLAB_PASSWORD}@gitlab.clarin-pl.eu/embedrankgroup/ai-research-keyphrase-extraction.git" -b polish-azon-improvement --single-branch repo
mv repo/* .
rm -r repo

# exec replaces the shell, so the worker receives the container's stop
# signals directly instead of being hidden behind bash.
exec python3.6 embedrank_worker.py
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment