Commit 3598bf57 authored by Łukasz Kopociński's avatar Łukasz Kopociński

Merge branch 'develop'

parents 4cd9f435 47cf501a
# Local, machine-specific DVC configuration
/config.local

# DVC state databases
/state
/state-journal
/state-wal

# Lock files
/lock
/updater.lock

# Updater, temporary and cache data
/updater
/tmp
/cache
# Remote used when no remote is named explicitly.
[core]
    remote = origin

# S3-compatible storage reached through a custom endpoint.
['remote "origin"']
    url = s3://semrel/dvc
    endpointurl = https://minio.clarin-pl.eu
# IDE and tooling state
.idea
.pytest_cache
.tox
.DS_Store

# Python bytecode and packaging output
__pycache__/
*.py[cod]
*$py.class
build
dist
*.egg-info

# Generated map/vector artifacts
*.map
*.pt
*.keys

# Secrets
credentials

# Data directories
semrel/data/data
spert/data
File added
# Semrel Extraction
Repository contains a codebase used in research on the extraction of semantic relations (brand-product).
Research description and results are included in the paper:
["Brand-Product Relation Extraction Using Heterogeneous Vector Space Representations"](https://gitlab.clarin-pl.eu/team-semantics/semrel-extraction/-/blob/develop/LREC_BP.pdf)
published in [LREC2020](https://lrec2020.lrec-conf.org/en/) conference.
A project focused on mining semantic relations.
## Frameworks
Two frameworks were used in the project. [DVC](https://dvc.org/doc) for versioning the datasets and [mlflow](https://mlflow.org/docs/latest/index.html) for tracking experiments.
To manage the project with ease, consider familiarizing yourself with them.
## Setup project
To set up the project on your machine, run the following commands.
Download repository: \
`$ git clone https://gitlab.clarin-pl.eu/team-semantics/semrel-extraction.git`
Enter main folder: \
`$ cd semrel-extraction`
Download the datasets related to the current commit: \
`$ dvc pull`
Then enter the docker folder: \
`$ cd docker`
Copy __credentials.template__ to __credentials__ and fill it in with the correct access keys. \
`$ cp deps/credentials.template deps/credentials`
Start docker: \
`$ docker-compose up`
## Repository packages
Repository also contains code for additional functionalities:
__docker__ - docker configuration and execution environment for semrel package. \
__mlflow__ - configuration and execution environment for mlflow server used for tracking experiments. \
__spert__ - scripts used to prepare dataset in format required to train [SpERT](https://github.com/markus-eberts/spert) model. \
__worker__ - scripts and execution environment to use trained model as a worker.
## FAQ
#### Where is data stored?
Data is versioned by [DVC](https://dvc.org/doc), which works like git, but for data.
All data is stored on the remote storage (https://minio.clarin-pl.eu/minio/semrel/) in dvc folder.
To retrieve data execute:
`$ git checkout [branch_name]`
`$ dvc pull`
DVC will download all data related to the current commit.
#### How to train and test a model?
There is a script __semrel/model/train.sh__ which starts training.
Adjust training params in __semrel/model/config.yaml__ and then execute:\
`$ ./train.sh`
Training result will be automatically uploaded to mlflow server.
#### Do I need to setup anything on my machine?
Yes. For mlflow to log artifacts properly, set the following environment variable;
otherwise mlflow tries to reach the original Amazon S3 storage.
`$ export MLFLOW_S3_ENDPOINT_URL=https://minio.clarin-pl.eu`
Also add a config file filled with the correct credentials:
`$ echo "[default]" > ~/.aws/credentials`
`$ echo "aws_access_key_id = access_key" >> ~/.aws/credentials`
`$ echo "aws_secret_access_key = secret_key" >> ~/.aws/credentials`
# Tracks semrel/data/data/corpora as a cached output; no command or deps.
md5: f5b640a205f82fa6af38883643fb401e
outs:
- md5: 8c7366784929b1b98e955c44d1e104df.dir
  path: semrel/data/data/corpora
  cache: true
  metric: false
  persist: false
# Tracks semrel/data/data/elmo as a cached output; no command or deps.
md5: 1208e7e581bdffa6e8bb1feb8c76004b
outs:
- md5: 554db712988ad5ce544d40ca32cde01a.dir
  path: semrel/data/data/elmo
  cache: true
  metric: false
  persist: false
# Stage: runs make_elmo_map.py over relations.files.list and writes
# elmo.map.keys / elmo.map.pt.
md5: 3fd7ed4807e41c8215e0bade52ab2fda
cmd: >-
  CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_elmo_map.py --input-path
  ./semrel/data/data/relations.files.list --model ./semrel/data/data/elmo/options.json
  ./semrel/data/data/elmo/weights.hdf5 --output-paths ./semrel/data/data/maps/elmo.map.keys
  ./semrel/data/data/maps/elmo.map.pt
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
  path: semrel/data/data/relations.files.list
- md5: 1111682d038ff9be1f622b0ec771e7d0
  path: semrel/data/scripts/cli/make_elmo_map.py
- md5: 14fa344f624b3105e02e46e2567cb3ae
  path: semrel/data/scripts/maps.py
outs:
- md5: cd32cc2bc03aaf30e93665bae80e7d72
  path: semrel/data/data/maps/elmo.map.keys
  cache: true
  metric: false
  persist: false
- md5: 9987878c3752d38c5ecf287b95a75700
  path: semrel/data/data/maps/elmo.map.pt
  cache: true
  metric: false
  persist: false
# Tracks semrel/data/data/fasttext as a cached output; no command or deps.
md5: 98ed9f4d303600a6be92178081f39a22
outs:
- md5: d1e20ac9018c3de356cb1ed1c38674a4.dir
  path: semrel/data/data/fasttext
  cache: true
  metric: false
  persist: false
# Stage: runs make_fasttext_map.py over relations.files.list and writes
# fasttext.map.keys / fasttext.map.pt.
md5: 218b1553a127004a3693c93f1cde6c13
cmd: >-
  CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_fasttext_map.py --input-path
  ./semrel/data/data/relations.files.list --model ./semrel/data/data/fasttext/kgr10.plain.skipgram.dim300.neg10.bin
  --output-paths ./semrel/data/data/maps/fasttext.map.keys ./semrel/data/data/maps/fasttext.map.pt
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
  path: semrel/data/data/relations.files.list
- md5: 9d988a887cba28b9774b3171f9c511f0
  path: semrel/data/scripts/cli/make_fasttext_map.py
- md5: 14fa344f624b3105e02e46e2567cb3ae
  path: semrel/data/scripts/maps.py
outs:
- md5: cd32cc2bc03aaf30e93665bae80e7d72
  path: semrel/data/data/maps/fasttext.map.keys
  cache: true
  metric: false
  persist: false
- md5: 8335f6b5fdc048f306ff4a3b659960cc
  path: semrel/data/data/maps/fasttext.map.pt
  cache: true
  metric: false
  persist: false
# Stage: lists every *.rel.xml file under the corpora directory into
# relations.files.list.
# NOTE(review): the `*.rel.xml` pattern is not quoted, so the shell may expand
# it before `find` sees it if matching files exist in the working directory.
# Quoting it would change the command and therefore the stage checksum.
md5: 4dc591a5158e89ab53a1c29a06561cbc
cmd: find ./semrel/data/data/corpora -type f -name *.rel.xml > ./semrel/data/data/relations.files.list
deps:
# NOTE(review): this checksum differs from the one recorded for the same path
# in the corpora .dvc file (8c7366784929b1b98e955c44d1e104df.dir); the stage
# may be out of date - confirm with `dvc status`.
- md5: d171bbc05d28098ebd033f5241cf6929.dir
  path: semrel/data/data/corpora
outs:
- md5: e34115c7c699baaf15c97ca5364ad554
  path: semrel/data/data/relations.files.list
  cache: true
  metric: false
  persist: false
# Stage: runs generate_relations.py over relations.files.list and writes
# relations/relations.tsv.
md5: b4f221037aedcb8dece9c37bc93fab80
cmd: >-
  CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/generate_relations.py --input-path
  ./semrel/data/data/relations.files.list --output-path ./semrel/data/data/relations/relations.tsv
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
  path: semrel/data/data/relations.files.list
- md5: d9666f218a44a6001428420a830f250b
  path: semrel/data/scripts/cli/generate_relations.py
- md5: 248d917ea0798532020a401c8b2060f3
  path: semrel/data/scripts/relations.py
outs:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
  path: semrel/data/data/relations/relations.tsv
  cache: true
  metric: false
  persist: false
# Stage: runs make_retrofit_map.py over relations.files.list and writes
# retrofit.map.keys / retrofit.map.pt.
md5: 85c576f8927d68435e5157edb6984ab8
cmd: >-
  CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_retrofit_map.py --input-path
  ./semrel/data/data/relations.files.list --model-retrofit ./semrel/data/data/fasttext/kgr10.plain.skipgram.dim300.neg10.retrofit.vec
  --model-fasttext ./semrel/data/data/fasttext/kgr10.plain.skipgram.dim300.neg10.bin
  --output-paths ./semrel/data/data/maps/retrofit.map.keys ./semrel/data/data/maps/retrofit.map.pt
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
  path: semrel/data/data/relations.files.list
- md5: c42c5efba4f6aec66a615f974013d3cb
  path: semrel/data/scripts/cli/make_retrofit_map.py
- md5: 14fa344f624b3105e02e46e2567cb3ae
  path: semrel/data/scripts/maps.py
outs:
- md5: cd32cc2bc03aaf30e93665bae80e7d72
  path: semrel/data/data/maps/retrofit.map.keys
  cache: true
  metric: false
  persist: false
- md5: c5c1c7acb41606f7d2abb3fc263697a3
  path: semrel/data/data/maps/retrofit.map.pt
  cache: true
  metric: false
  persist: false
# Tracks semrel/data/data/sent2vec as a cached output; no command or deps.
md5: 39be35ee4d87c8fd497d255af6cd6968
outs:
- md5: b44cd38cdc97474a008db8f6995df30a.dir
  path: semrel/data/data/sent2vec
  cache: true
  metric: false
  persist: false
# Stage: runs generate_indices.py on elmo.rel.keys and writes
# spert/data/indices.json.
md5: db20038aeb4bb38d6bef50819b180d29
cmd: >-
  ./spert/scripts/generate_indices.py --dataset-keys ./semrel/data/data/vectors/elmo.rel.keys
  --output-path ./spert/data/indices.json
deps:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
  path: semrel/data/data/vectors/elmo.rel.keys
- md5: da3a39975b7862292c54ef4f07cc4d19
  path: spert/scripts/generate_indices.py
outs:
- md5: c0d40842db8605ed453c34022b69b784
  path: spert/data/indices.json
  cache: true
  metric: false
  persist: false
# Stage: runs generate_spert_json.py on relations.tsv and indices.json and
# writes the spert/data/dataset directory.
md5: a8d0d30e4f9ed939878b8cbfe2b8360d
cmd: >-
  ./spert/scripts/generate_spert_json.py --input-path ./semrel/data/data/relations/relations.tsv
  --indices-file ./spert/data/indices.json --output-dir ./spert/data/dataset
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
  path: semrel/data/data/relations/relations.tsv
- md5: c0d40842db8605ed453c34022b69b784
  path: spert/data/indices.json
- md5: 68e727aef4f3a971d59ffad689c4df88
  path: spert/scripts/generate_spert_json.py
outs:
- md5: 91c650042104c8f21f6849e4a247f5ff.dir
  path: spert/data/dataset
  cache: true
  metric: false
  persist: false
# Stage: runs embedd_relations.py on relations.tsv with the elmo, fasttext and
# retrofit maps, writing one .rel.keys / .rel.pt pair per map into vectors/.
md5: be08d688e6409f335151e7e21bf7cb0c
cmd: >-
  CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/embedd_relations.py --input-path
  ./semrel/data/data/relations/relations.tsv --elmo-map ./semrel/data/data/maps/elmo.map.keys
  ./semrel/data/data/maps/elmo.map.pt --fasttext-map ./semrel/data/data/maps/fasttext.map.keys
  ./semrel/data/data/maps/fasttext.map.pt --retrofit-map ./semrel/data/data/maps/retrofit.map.keys
  ./semrel/data/data/maps/retrofit.map.pt --output-dir ./semrel/data/data/vectors
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
  path: semrel/data/data/relations/relations.tsv
- md5: 3b6cedb5638139cddf43e24a01b31419
  path: semrel/data/scripts/cli/embedd_relations.py
- md5: b6de9c8a19585412edfc1c4a93feed78.dir
  path: semrel/data/data/maps
outs:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
  path: semrel/data/data/vectors/elmo.rel.keys
  cache: true
  metric: false
  persist: false
- md5: 4b09441d5e41686fb049ade57b5a4103
  path: semrel/data/data/vectors/elmo.rel.pt
  cache: true
  metric: false
  persist: false
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
  path: semrel/data/data/vectors/fasttext.rel.keys
  cache: true
  metric: false
  persist: false
- md5: 43a869894a3a208e89c435a048daf562
  path: semrel/data/data/vectors/fasttext.rel.pt
  cache: true
  metric: false
  persist: false
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
  path: semrel/data/data/vectors/retrofit.rel.keys
  cache: true
  metric: false
  persist: false
- md5: ccda8b1c07e0f06e00f61e3e6e869e2e
  path: semrel/data/data/vectors/retrofit.rel.pt
  cache: true
  metric: false
  persist: false
# Stage: runs make_ner_map.py on relations.tsv and writes
# ner.rel.keys / ner.rel.pt.
md5: 435f9e42b38fb833eaf3b9b6af4f72ae
cmd: >-
  ./semrel/data/scripts/cli/make_ner_map.py --relations-file ./semrel/data/data/relations/relations.tsv
  --output-paths ./semrel/data/data/vectors/ner.rel.keys ./semrel/data/data/vectors/ner.rel.pt
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
  path: semrel/data/data/relations/relations.tsv
- md5: 1f0f59ff5cb80777a2912d3a0b26b494
  path: semrel/data/scripts/cli/make_ner_map.py
outs:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
  path: semrel/data/data/vectors/ner.rel.keys
  cache: true
  metric: false
  persist: false
- md5: 7ba176bb876bcb51673145a85c85f394
  path: semrel/data/data/vectors/ner.rel.pt
  cache: true
  metric: false
  persist: false
# Stage: runs make_sent2vec_map.py on relations.tsv and relations.files.list
# and writes sent2vec.rel.keys / sent2vec.rel.pt.
md5: bf87f0672762ca73c4cf5a64886eab0f
cmd: >-
  CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_sent2vec_map.py --relations-file
  ./semrel/data/data/relations/relations.tsv --documents-files ./semrel/data/data/relations.files.list
  --model ./semrel/data/data/sent2vec/kgr10.bin --output-paths ./semrel/data/data/vectors/sent2vec.rel.keys
  ./semrel/data/data/vectors/sent2vec.rel.pt
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
  path: semrel/data/data/relations/relations.tsv
- md5: e34115c7c699baaf15c97ca5364ad554
  path: semrel/data/data/relations.files.list
- md5: 6143026d334ec949aaaf4192160e4aed
  path: semrel/data/scripts/cli/make_sent2vec_map.py
outs:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
  path: semrel/data/data/vectors/sent2vec.rel.keys
  cache: true
  metric: false
  persist: false
- md5: ee24cbccd8e51c1afbb8955619cc76be
  path: semrel/data/data/vectors/sent2vec.rel.pt
  cache: true
  metric: false
  persist: false
# Execution environment for the semrel package.
FROM nvidia/cuda:10.0-runtime-ubuntu16.04
LABEL maintainer="Łukasz Kopocinski <lkopocinski@gmail.com>"

# Register the deadsnakes PPA (presumably the source of the python3.6
# packages installed below - confirm).
RUN apt update \
    && apt install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa

# default ubuntu packages
RUN apt update && apt install -y \
        bison \
        build-essential \
        cmake \
        curl \
        flex \
        git \
        g++ \
        htop \
        libantlr-dev \
        libboost-all-dev \
        libedit-dev \
        libicu-dev \
        libloki-dev \
        libreadline-dev \
        libsfst1-1.4-dev \
        libxml++2.6-dev \
        locales \
        locales-all \
        pkg-config \
        python3.6 \
        python3.6-dev \
        python3.6-venv \
        ranger \
        swig \
        wget \
        vim

# locale
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8
# python3.6
# Registers python3.6 as the `python` alternative and bootstraps pip for it.
# The unversioned https://bootstrap.pypa.io/get-pip.py only supports currently
# maintained Python releases and refuses to run on 3.6, so the script
# published for 3.6 is fetched instead.
RUN update-alternatives --install \
        /usr/bin/python python /usr/bin/python3.6 10 \
    && wget https://bootstrap.pypa.io/pip/3.6/get-pip.py \
    && python3.6 get-pip.py \
    && rm get-pip.py \
    && pip install --upgrade pip
# requirements
# Python dependencies are installed with an additional package index
# configured through PIP_EXTRA_INDEX_URL.
WORKDIR /home/install
ENV PIP_EXTRA_INDEX_URL="https://pypi.clarin-pl.eu/"
COPY deps/requirements.txt requirements.txt
RUN pip install -r requirements.txt
# morfeusz
WORKDIR /home/install
# No USER instruction precedes this step and no earlier step installs sudo, so
# apt-key is invoked directly instead of through sudo. The package pattern is
# quoted so the shell cannot expand it against files in the working directory
# before apt sees it.
RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key | apt-key add - \
    && apt-add-repository http://download.sgjp.pl/apt/ubuntu \
    && apt update \
    && apt install -y '*morfeusz2*'

# Legacy Morfeusz SGJP binaries: library, executable and header are moved
# into /usr/local by hand.
WORKDIR /home/install
RUN mkdir morfeusz/ && cd morfeusz/ \
    && wget http://tools.clarin-pl.eu/share/morfeusz-SGJP-linux64-20130413.tar.bz2 \
    && tar -jxvf morfeusz-SGJP-linux64-20130413.tar.bz2 \
    && mv libmorfeusz* /usr/local/lib/ \
    && mv morfeusz /usr/local/bin/ \
    && mv morfeusz.h /usr/local/include/ \
    && ldconfig \
    && cd /home/install \
    # Remove the directory created above (the original removed a non-existent
    # "morfeusz-sgjp", leaving the download in the image).
    && rm -rf morfeusz
# Each tool below is cloned, built out-of-tree in a bin/ directory, installed,
# and its source tree is removed afterwards.

# corpus2
WORKDIR /home/install
RUN git clone http://nlp.pwr.edu.pl/corpus2.git \
    && cd corpus2/ \
    && git checkout --track origin/python3.6 \
    && mkdir bin \
    && cd bin/ \
    && cmake -D CORPUS2_BUILD_POLIQARP:BOOL=True .. \
    && make -j 4 \
    && make install \
    && ldconfig \
    && cd /home/install \
    && rm -rf corpus2

# toki
WORKDIR /home/install
RUN git clone http://nlp.pwr.edu.pl/toki.git \
    && cd toki/ \
    && git checkout --track origin/lkopocinski-cmake-cpp11-support \
    && mkdir bin \
    && cd bin/ \
    && cmake .. \
    && make -j 4 \
    && make install \
    && ldconfig \
    && cd /home/install \
    && rm -rf toki

# maca
WORKDIR /home/install
RUN git clone https://gitlab.clarin-pl.eu/analysers/maca.git \
    && cd maca \
    && mkdir bin \
    && cd bin \
    && cmake .. \
    && make -j 4 \
    && make install \
    && ldconfig \
    && cd /home/install \
    && rm -rf maca

# wccl
WORKDIR /home/install
RUN git clone http://nlp.pwr.edu.pl/wccl.git \
    && cd wccl/ \
    && git checkout --track origin/lkopocinski-add-cmake-cpp11-support \
    && mkdir bin \
    && cd bin \
    && cmake .. \
    && make -j 4 \
    && make install \
    && ldconfig \
    && cd /home/install \
    && rm -rf wccl
# sent2vec
# Installed with pip from a fresh clone; the clone is deleted afterwards.
WORKDIR /home/install
RUN git clone https://github.com/epfml/sent2vec.git \
    && cd sent2vec/ \
    && pip install . \
    && cd /home/install \
    && rm -rf sent2vec

# Final working directory of the image.
WORKDIR /semrel-extraction
# Credentials template: replace both placeholder values with real access keys.
[default]
aws_access_key_id = access_key
aws_secret_access_key = secret_access_key
\ No newline at end of file
# Values are left unquoted: Compose versions that pass env_file values through
# verbatim would otherwise include the quote characters in the variable.
MLFLOW_TRACKING_URI=http://10.17.50.132:8080
MLFLOW_S3_ENDPOINT_URL=https://minio.clarin-pl.eu
\ No newline at end of file
# Python dependencies, each pinned to an exact version.
allennlp==0.9.0
argcomplete==1.11.0
basicutils==0.9
boto==2.49.0
boto3==1.12.0
botocore==1.15.0
click==7.0
corpus_ccl==0.9
cython==0.29.15
dvc==0.85.0
gensim==3.8.1
matplotlib==3.1.3
mlflow==1.6.0
numpy==1.18.0
pandas==1.0.0
pytest==5.3.5
PyYAML==5.3
scikit-learn==0.22.0
scipy==1.4.0
torch==1.4.0
tqdm==4.42.0
virtualenv==20.0.0
virtualenv-clone==0.5.3
wordfreq==2.2.1
\ No newline at end of file
# Compose definition for the semrel execution environment.
version: '3'
services:
  semrel:
    build: .
    shm_size: '300gb'
    # The entrypoint is an interactive shell: without an open stdin and a TTY,
    # bash exits immediately when started with `docker-compose up`.
    stdin_open: true
    tty: true
    volumes:
      # Mount the repository root (the parent of this directory) so the path
      # does not depend on the name of the checkout directory.
      - ..:/semrel-extraction
      - ./deps/credentials:/root/.aws/credentials
    env_file:
      - deps/mlflow.env
    entrypoint:
      - /bin/bash
# Ignore a file named "credentials" inside any direct subdirectory.
*/credentials
\ No newline at end of file
# Image for the mlflow tracking server.
FROM python:3.7
LABEL maintainer="Łukasz Kopociński <lkopocinski@gmail.com>"

RUN pip install --upgrade pip \
    && pip install mlflow==1.4.0 \
    && pip install boto3==1.10.26

# Default settings baked into the image.
ENV PORT=5000 \
    FILE_DIR=mlruns \
    AWS_BUCKET=semrel \
    MLFLOW_S3_ENDPOINT_URL=https://minio.clarin-pl.eu

# NOTE(review): copying deps/credentials stores the access keys in an image
# layer; consider mounting the file at run time instead.
COPY deps/credentials /root/.aws/credentials
COPY deps/init.sh /
ENTRYPOINT ["/init.sh"]
\ No newline at end of file
# Credentials template: replace both placeholder values with real access keys.
[default]
aws_access_key_id = access_key
aws_secret_access_key = secret_access_key
\ No newline at end of file
#!/bin/bash
# Validates the environment and starts the mlflow tracking server.
#
# Required environment variables:
#   FILE_DIR    path used in the file: backend store URI
#   AWS_BUCKET  bucket used as the s3:// default artifact root
#   PORT        port the server listens on
# A credentials file must exist at /root/.aws/credentials.
set -e

# Fail fast with a clear message for every variable the server command uses
# (PORT was previously passed to mlflow without being checked).
for required_var in FILE_DIR AWS_BUCKET PORT; do
    if [[ -z ${!required_var} ]]; then
        echo >&2 "${required_var} must be set."
        exit 1
    fi
done

CREDENTIALS_FILE=/root/.aws/credentials
if [[ ! -f ${CREDENTIALS_FILE} ]]; then
    echo >&2 "Credentials file must be provided."
    exit 1
fi

# exec replaces this shell with the server process so that it receives
# signals (e.g. on container stop) directly. Expansions are quoted so the
# values are passed as single arguments.
exec mlflow server \
    --backend-store-uri "file:/${FILE_DIR}" \
    --default-artifact-root "s3://${AWS_BUCKET}" \
    --host 0.0.0.0 \
    --port "${PORT}"
version: '3'
services:
tracking-server:
build: .