diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index b42935e91820cff7243a019e4c45ffdd0c08394f..0000000000000000000000000000000000000000 --- a/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -model/* \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..cbc777b532aa674d173c582ac1735c4c1df9dd7b --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,12 @@ +build_image: + image: docker:18.09.7 + only: + - master + services: + - 'docker:18.09.7-dind' + script: + - docker build -t clarinpl/embedrank . + - echo $DOCKER_PASSWORD > pass.txt + - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin + - rm pass.txt + - docker push clarinpl/embedrank diff --git a/Dockerfile b/Dockerfile index 8fa46ca22058b64c42a26c1583b239663d48e17f..c261df21a07f21d8085f0e3e12774d4648d5fd8d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,10 @@ -FROM clarinpl/builder AS builder FROM clarinpl/python:3.6 RUN apt-get update && apt-get install -y \ - libxml++2.6-dev \ - libloki-dev \ - libboost-all-dev \ - libicu-dev \ - libffi-dev \ - libssl-dev \ - libxml2-utils \ - swig \ - openjdk-8-jdk + curl \ + corpus2-python3.6 \ + wccl-python3.6 WORKDIR /tmp/ RUN apt remove -y cmake && \ @@ -19,12 +12,6 @@ RUN apt remove -y cmake && \ tar -xzf cmake*tar.gz && \ ln -s $(pwd)/cmake*/bin/cmake /usr/bin/cmake - - -COPY --from=builder /install/corpus2 / -COPY --from=builder /install/wccl / -COPY --from=builder /usr/lib/libmorfeusz* /usr/lib/ - #install POLEM RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/download' && \ dpkg -i ./morf && \ @@ -40,9 +27,10 @@ RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/dow RUN python3.6 -m pip install pip --upgrade && \ python3.6 -m pip install --no-cache-dir Cython - WORKDIR /home/worker COPY requirements.txt . +COPY entrypoint.sh /entrypoint.sh +COPY embedrank_worker.py . RUN python3.6 -m pip install -r requirements.txt # Install sent2vec @@ -55,6 +43,8 @@ RUN apt-get -y install git g++ make && \ python3.6 setup.py build_ext && \ python3.6 -m pip install . - # Download NLTK data RUN python3.6 -c "import nltk; nltk.download('punkt')" + +RUN ["chmod", "+x", "/entrypoint.sh"] +CMD ["/entrypoint.sh"] diff --git a/README b/README.md similarity index 100% rename from README rename to README.md diff --git a/embedrank_worker.py b/embedrank_worker.py index 9fd007fad2dd5f56b9018ee211c95e27ae714618..1a57110fc315453c6fa1e1fe46ea774e00cc5ed6 100644 --- a/embedrank_worker.py +++ b/embedrank_worker.py @@ -1,86 +1,98 @@ #!/usr/bin/python3.6 - - import WrapLem -from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import EmbeddingDistributorLocal -from swisscom_ai.research_keyphrase.model.input_representation import InputTextObj -from swisscom_ai.research_keyphrase.model.methods_embeddings import extract_candidates_embedding_for_doc +from swisscom_ai.research_keyphrase.embeddings.emb_distrib_local import ( + EmbeddingDistributorLocal, +) +from swisscom_ai.research_keyphrase.model.input_representation import ( + InputTextObj, +) +from swisscom_ai.research_keyphrase.model.methods_embeddings import ( + extract_candidates_embedding_for_doc, +) from swisscom_ai.research_keyphrase.model.method import MMRPhrase -from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingPL +from swisscom_ai.research_keyphrase.preprocessing.postagging import PosTaggingPL from swisscom_ai.research_keyphrase.util.fileIO import read_file from improvement import get_keywords import nlp_ws import logging -import os,shutil +import os +import shutil _log = logging.getLogger(__name__) class EmbedRankWorker(nlp_ws.NLPWorker): - - @classmethod def static_init(cls, config): print("Worker started loading models") cls.ptagger = PosTaggingPL() - - print(" starting sent2vec") - sent2vec_model = config['SENT2VEC']['model_path'] - cls.sent2vec=EmbeddingDistributorLocal(sent2vec_model) - - print(" startring lemmatizer ") + + print("starting sent2vec") + sent2vec_model = config["SENT2VEC"]["model_path"] + cls.sent2vec = EmbeddingDistributorLocal(sent2vec_model) + + print("starting lemmatizer") cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer() - + print("Worker finished loading models ") - - - - def saveResult(self,keywords,scores,outputFile): - file = open(outputFile, 'w') - file.write('[') + + def saveResult(self, keywords, scores, outputFile): + file = open(outputFile, "w") + file.write("[") for idx in range(len(keywords)): - element_dict = {'keyword': keywords[idx], 'score': scores[idx]} - file.write(str(element_dict)+', ') - - file.write(']') + element_dict = {"keyword": keywords[idx], "score": scores[idx]} + file.write(str(element_dict) + ", ") + + file.write("]") file.close() - - def process(self, inputFile, taskOptions,outputFile): - - if "N" not in taskOptions: - taskOptions["N"]="10"; - + + def process(self, inputFile, taskOptions, outputFile): + + if "N" not in taskOptions: + taskOptions["N"] = "10" + try: - - if os.path.isdir(inputFile): - shutil.copytree(inputFile,outputFile) - #_log.info(inputFile+"/text.ccl") - tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text(inputFile+"/text.ccl") - #_log.info(lemmas) - - else: - #_log.info("CCL") + + if os.path.isdir(inputFile): + shutil.copytree(inputFile, outputFile) + # _log.info(inputFile+"/text.ccl") + tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text( + inputFile + "/text.ccl" + ) + # _log.info(lemmas) + + else: + # _log.info("CCL") try: os.makedirs(outputFile) except: pass - tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text(inputFile) - - shutil.copy2(inputFile,outputFile+"/text.ccl") - #log.info("tagging finished ") - - - keywords, scores = get_keywords(tagged, lemmas, raw_text, self.sent2vec, self.lemmatizer, 'avg', int(taskOptions['N'])) - self.saveResult(keywords,scores,outputFile+"/embedrank.json") - + tagged, lemmas, raw_text = self.ptagger.pos_tag_raw_text( + inputFile + ) + + shutil.copy2(inputFile, outputFile + "/text.ccl") + # log.info("tagging finished ") + + keywords, scores = get_keywords( + tagged, + lemmas, + raw_text, + self.sent2vec, + self.lemmatizer, + "avg", + int(taskOptions["N"]), + ) + self.saveResult(keywords, scores, outputFile + "/embedrank.json") + finally: pass -if __name__ == '__main__': + +if __name__ == "__main__": _log.info("starting") nlp_ws.NLPService.main(EmbedRankWorker) - diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..183b92bbf481849ebcd0b8319b6d2b87f3175fe5 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ ! -f /sent2vec/pretrained_model.bin ]; then + curl https://minio.clarin-pl.eu/public/models/kgr10.bin --create-dirs -o /sent2vec/pretrained_model.bin +fi +cd /home/worker +git clone https://${GITLAB_USERNAME}:${GITLAB_PASSWORD}@gitlab.clarin-pl.eu/embedrankgroup/ai-research-keyphrase-extraction.git -b polish-azon-improvement --single-branch repo +mv repo/* . +rm -r repo +python3.6 embedrank_worker.py \ No newline at end of file