Commit 4452a928 authored by Mateusz Gniewkowski

Merge branch 'dev' into 'master'

Embedrank worker

See merge request !1
parents 9b5f5a79 463d4799
Pipeline #4047 passed with stage in 10 minutes and 16 seconds
model/*
\ No newline at end of file
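# CI job: build the clarinpl/embedrank image on master and push it to Docker Hub
# (DOCKER_USERNAME and DOCKER_PASSWORD are expected as CI variables)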
build_image:
  image: docker:18.09.7
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  script:
    - docker build -t clarinpl/embedrank .
    - echo $DOCKER_PASSWORD > pass.txt
    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
    - rm pass.txt
    - docker push clarinpl/embedrank
FROM clarinpl/builder AS builder
FROM clarinpl/python:3.6
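# system libraries, Java, and the corpus2/WCCL Python 3.6 bindings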
RUN apt-get update && apt-get install -y \
    libxml++2.6-dev \
    libloki-dev \
    libboost-all-dev \
    libicu-dev \
    libffi-dev \
    libssl-dev \
    libxml2-utils \
    swig \
    openjdk-8-jdk \
    curl \
    corpus2-python3.6 \
    wccl-python3.6
WORKDIR /tmp/
RUN apt remove -y cmake && \
@@ -19,12 +12,6 @@
tar -xzf cmake*tar.gz && \
ln -s $(pwd)/cmake*/bin/cmake /usr/bin/cmake
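# corpus2, WCCL and Morfeusz libraries taken from the builder stage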
COPY --from=builder /install/corpus2 /
COPY --from=builder /install/wccl /
COPY --from=builder /usr/lib/libmorfeusz* /usr/lib/
# install POLEM
RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/download' && \
dpkg -i ./morf && \
@@ -40,9 +27,10 @@
RUN python3.6 -m pip install pip --upgrade && \
python3.6 -m pip install --no-cache-dir Cython
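# copy the worker sources and install its Python dependencies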
WORKDIR /home/worker
COPY requirements.txt .
COPY entrypoint.sh /entrypoint.sh
COPY embedrank_worker.py .
RUN python3.6 -m pip install -r requirements.txt
# Install sent2vec
@@ -55,6 +43,8 @@ RUN apt-get -y install git g++ make && \
python3.6 setup.py build_ext && \
python3.6 -m pip install .
# Download NLTK data
RUN python3.6 -c "import nltk; nltk.download('punkt')"
RUN ["chmod", "+x", "/entrypoint.sh"]
CMD ["/entrypoint.sh"]
File moved
#!/usr/bin/python3.6
import WrapLem
from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import (
    EmbeddingDistributorLocal,
)
from swisscom_ai.research_keyphrase.model.input_representation import (
    InputTextObj,
)
from swisscom_ai.research_keyphrase.model.methods_embeddings import (
    extract_candidates_embedding_for_doc,
)
from swisscom_ai.research_keyphrase.model.method import MMRPhrase
from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingPL
from swisscom_ai.research_keyphrase.util.fileIO import read_file
from improvement import get_keywords
import nlp_ws
import logging
import os
import shutil
_log = logging.getLogger(__name__)
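
# The sent2vec model path is read from the nlp_ws worker config; a hypothetical
# [SENT2VEC] section matching the lookup in static_init below could look like:
#
#   [SENT2VEC]
#   model_path = /sent2vec/pretrained_model.bin
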
class EmbedRankWorker(nlp_ws.NLPWorker):
    @classmethod
    def static_init(cls, config):
        # load the POS tagger, the sent2vec embeddings and the lemmatizer once per worker
        print("Worker started loading models")
        cls.ptagger = PosTaggingPL()
        print("starting sent2vec")
        sent2vec_model = config["SENT2VEC"]["model_path"]
        cls.sent2vec = EmbeddingDistributorLocal(sent2vec_model)
        print("starting lemmatizer")
        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
        print("Worker finished loading models")

    def saveResult(self, keywords, scores, outputFile):
        # write the keyword/score pairs; str(dict) yields Python-style quoting,
        # so the file is a readable listing rather than strict JSON
        file = open(outputFile, "w")
        file.write("[")
        for idx in range(len(keywords)):
            element_dict = {"keyword": keywords[idx], "score": scores[idx]}
            file.write(str(element_dict) + ", ")
        file.write("]")
        file.close()

    def process(self, inputFile, taskOptions, outputFile):
        # N controls how many keyphrases are returned; default to 10
        if "N" not in taskOptions:
            taskOptions["N"] = "10"
        try:
            if os.path.isdir(inputFile):
                # input is a directory containing a tagged text.ccl file
                shutil.copytree(inputFile, outputFile)
                # _log.info(inputFile+"/text.ccl")
                tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text(
                    inputFile + "/text.ccl"
                )
                # _log.info(lemmas)
            else:
                # input is a single CCL file
                # _log.info("CCL")
                try:
                    os.makedirs(outputFile)
                except OSError:
                    # the output directory may already exist
                    pass
                tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text(
                    inputFile
                )
                shutil.copy2(inputFile, outputFile + "/text.ccl")
            # _log.info("tagging finished")
            keywords, scores = get_keywords(
                tagged,
                lemmas,
                raw_text,
                self.sent2vec,
                self.lemmatizer,
                "avg",
                int(taskOptions["N"]),
            )
            self.saveResult(keywords, scores, outputFile + "/embedrank.json")
        finally:
            pass


if __name__ == "__main__":
    _log.info("starting")
    nlp_ws.NLPService.main(EmbedRankWorker)
#!/bin/bash
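# fetch the sent2vec model and the EmbedRank extraction code, then start the worker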
if [ ! -f /sent2vec/pretrained_model.bin ]; then
    curl https://minio.clarin-pl.eu/public/models/kgr10.bin --create-dirs -o /sent2vec/pretrained_model.bin
fi
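# clone the patched keyphrase-extraction code; GITLAB_USERNAME and GITLAB_PASSWORD
# must be provided in the environment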
cd /home/worker
git clone https://${GITLAB_USERNAME}:${GITLAB_PASSWORD}@gitlab.clarin-pl.eu/embedrankgroup/ai-research-keyphrase-extraction.git -b polish-azon-improvement --single-branch repo
mv repo/* .
rm -r repo
python3.6 embedrank_worker.py
\ No newline at end of file