Commit 0e0b7d7c authored by Łukasz Kopociński

Merge branch 'lkopocinski/code-clean-up' into develop

parents 5ce3ddeb 6e6a6ae2
......@@ -12,5 +12,11 @@ dist
*.map
*.pt
*.keys
credentials
\ No newline at end of file
credentials
.tox
semrel/data/data
spert/data
md5: 95fe168079b050fcce6805511c009149
md5: f5b640a205f82fa6af38883643fb401e
outs:
- md5: fec63c0bd1ca725b1ed49e3df15abb13.dir
path: corpora
- md5: 8c7366784929b1b98e955c44d1e104df.dir
path: semrel/data/data/corpora
cache: true
metric: false
persist: false
md5: 99145725bd26cd9e7a2ffbf71f1213dc
md5: 1208e7e581bdffa6e8bb1feb8c76004b
outs:
- md5: 554db712988ad5ce544d40ca32cde01a.dir
path: elmo
path: semrel/data/data/elmo
cache: true
metric: false
persist: false
md5: 3fd7ed4807e41c8215e0bade52ab2fda
cmd: CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_elmo_map.py --input-path
./semrel/data/data/relations.files.list --model ./semrel/data/data/elmo/options.json
./semrel/data/data/elmo/weights.hdf5 --output-paths ./semrel/data/data/maps/elmo.map.keys
./semrel/data/data/maps/elmo.map.pt
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
path: semrel/data/data/relations.files.list
- md5: 1111682d038ff9be1f622b0ec771e7d0
path: semrel/data/scripts/cli/make_elmo_map.py
- md5: 14fa344f624b3105e02e46e2567cb3ae
path: semrel/data/scripts/maps.py
outs:
- md5: cd32cc2bc03aaf30e93665bae80e7d72
path: semrel/data/data/maps/elmo.map.keys
cache: true
metric: false
persist: false
- md5: 9987878c3752d38c5ecf287b95a75700
path: semrel/data/data/maps/elmo.map.pt
cache: true
metric: false
persist: false
md5: b4269061f70f06caf22b273745c52b3c
md5: 98ed9f4d303600a6be92178081f39a22
outs:
- md5: d1e20ac9018c3de356cb1ed1c38674a4.dir
path: fasttext
path: semrel/data/data/fasttext
cache: true
metric: false
persist: false
md5: 218b1553a127004a3693c93f1cde6c13
cmd: CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_fasttext_map.py --input-path
./semrel/data/data/relations.files.list --model ./semrel/data/data/fasttext/kgr10.plain.skipgram.dim300.neg10.bin
--output-paths ./semrel/data/data/maps/fasttext.map.keys ./semrel/data/data/maps/fasttext.map.pt
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
path: semrel/data/data/relations.files.list
- md5: 9d988a887cba28b9774b3171f9c511f0
path: semrel/data/scripts/cli/make_fasttext_map.py
- md5: 14fa344f624b3105e02e46e2567cb3ae
path: semrel/data/scripts/maps.py
outs:
- md5: cd32cc2bc03aaf30e93665bae80e7d72
path: semrel/data/data/maps/fasttext.map.keys
cache: true
metric: false
persist: false
- md5: 8335f6b5fdc048f306ff4a3b659960cc
path: semrel/data/data/maps/fasttext.map.pt
cache: true
metric: false
persist: false
md5: 4dc591a5158e89ab53a1c29a06561cbc
cmd: find ./semrel/data/data/corpora -type f -name *.rel.xml > ./semrel/data/data/relations.files.list
deps:
- md5: d171bbc05d28098ebd033f5241cf6929.dir
path: semrel/data/data/corpora
outs:
- md5: e34115c7c699baaf15c97ca5364ad554
path: semrel/data/data/relations.files.list
cache: true
metric: false
persist: false
md5: b4f221037aedcb8dece9c37bc93fab80
cmd: CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/generate_relations.py --input-path
./semrel/data/data/relations.files.list --output-path ./semrel/data/data/relations/relations.tsv
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
path: semrel/data/data/relations.files.list
- md5: d9666f218a44a6001428420a830f250b
path: semrel/data/scripts/cli/generate_relations.py
- md5: 248d917ea0798532020a401c8b2060f3
path: semrel/data/scripts/relations.py
outs:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
path: semrel/data/data/relations/relations.tsv
cache: true
metric: false
persist: false
md5: 85c576f8927d68435e5157edb6984ab8
cmd: CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_retrofit_map.py --input-path
./semrel/data/data/relations.files.list --model-retrofit ./semrel/data/data/fasttext/kgr10.plain.skipgram.dim300.neg10.retrofit.vec
--model-fasttext ./semrel/data/data/fasttext/kgr10.plain.skipgram.dim300.neg10.bin
--output-paths ./semrel/data/data/maps/retrofit.map.keys ./semrel/data/data/maps/retrofit.map.pt
deps:
- md5: e34115c7c699baaf15c97ca5364ad554
path: semrel/data/data/relations.files.list
- md5: c42c5efba4f6aec66a615f974013d3cb
path: semrel/data/scripts/cli/make_retrofit_map.py
- md5: 14fa344f624b3105e02e46e2567cb3ae
path: semrel/data/scripts/maps.py
outs:
- md5: cd32cc2bc03aaf30e93665bae80e7d72
path: semrel/data/data/maps/retrofit.map.keys
cache: true
metric: false
persist: false
- md5: c5c1c7acb41606f7d2abb3fc263697a3
path: semrel/data/data/maps/retrofit.map.pt
cache: true
metric: false
persist: false
md5: caee931111d6e2ccf80322a014e69c37
md5: 39be35ee4d87c8fd497d255af6cd6968
outs:
- md5: b44cd38cdc97474a008db8f6995df30a.dir
path: sent2vec
path: semrel/data/data/sent2vec
cache: true
metric: false
persist: false
md5: db20038aeb4bb38d6bef50819b180d29
cmd: ./spert/scripts/generate_indices.py --dataset-keys ./semrel/data/data/vectors/elmo.rel.keys
--output-path ./spert/data/indices.json
deps:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
path: semrel/data/data/vectors/elmo.rel.keys
- md5: da3a39975b7862292c54ef4f07cc4d19
path: spert/scripts/generate_indices.py
outs:
- md5: c0d40842db8605ed453c34022b69b784
path: spert/data/indices.json
cache: true
metric: false
persist: false
md5: a8d0d30e4f9ed939878b8cbfe2b8360d
cmd: ./spert/scripts/generate_spert_json.py --input-path ./semrel/data/data/relations/relations.tsv
--indices-file ./spert/data/indices.json --output-dir ./spert/data/dataset
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
path: semrel/data/data/relations/relations.tsv
- md5: c0d40842db8605ed453c34022b69b784
path: spert/data/indices.json
- md5: 68e727aef4f3a971d59ffad689c4df88
path: spert/scripts/generate_spert_json.py
outs:
- md5: 91c650042104c8f21f6849e4a247f5ff.dir
path: spert/data/dataset
cache: true
metric: false
persist: false
md5: be08d688e6409f335151e7e21bf7cb0c
cmd: CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/embedd_relations.py --input-path
./semrel/data/data/relations/relations.tsv --elmo-map ./semrel/data/data/maps/elmo.map.keys
./semrel/data/data/maps/elmo.map.pt --fasttext-map ./semrel/data/data/maps/fasttext.map.keys
./semrel/data/data/maps/fasttext.map.pt --retrofit-map ./semrel/data/data/maps/retrofit.map.keys
./semrel/data/data/maps/retrofit.map.pt --output-dir ./semrel/data/data/vectors
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
path: semrel/data/data/relations/relations.tsv
- md5: 3b6cedb5638139cddf43e24a01b31419
path: semrel/data/scripts/cli/embedd_relations.py
- md5: b6de9c8a19585412edfc1c4a93feed78.dir
path: semrel/data/data/maps
outs:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
path: semrel/data/data/vectors/elmo.rel.keys
cache: true
metric: false
persist: false
- md5: 4b09441d5e41686fb049ade57b5a4103
path: semrel/data/data/vectors/elmo.rel.pt
cache: true
metric: false
persist: false
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
path: semrel/data/data/vectors/fasttext.rel.keys
cache: true
metric: false
persist: false
- md5: 43a869894a3a208e89c435a048daf562
path: semrel/data/data/vectors/fasttext.rel.pt
cache: true
metric: false
persist: false
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
path: semrel/data/data/vectors/retrofit.rel.keys
cache: true
metric: false
persist: false
- md5: ccda8b1c07e0f06e00f61e3e6e869e2e
path: semrel/data/data/vectors/retrofit.rel.pt
cache: true
metric: false
persist: false
md5: 435f9e42b38fb833eaf3b9b6af4f72ae
cmd: ./semrel/data/scripts/cli/make_ner_map.py --relations-file ./semrel/data/data/relations/relations.tsv
--output-paths ./semrel/data/data/vectors/ner.rel.keys ./semrel/data/data/vectors/ner.rel.pt
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
path: semrel/data/data/relations/relations.tsv
- md5: 1f0f59ff5cb80777a2912d3a0b26b494
path: semrel/data/scripts/cli/make_ner_map.py
outs:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
path: semrel/data/data/vectors/ner.rel.keys
cache: true
metric: false
persist: false
- md5: 7ba176bb876bcb51673145a85c85f394
path: semrel/data/data/vectors/ner.rel.pt
cache: true
metric: false
persist: false
md5: bf87f0672762ca73c4cf5a64886eab0f
cmd: CUDA_VISIBLE_DEVICES=0 ./semrel/data/scripts/cli/make_sent2vec_map.py --relations-file
./semrel/data/data/relations/relations.tsv --documents-files ./semrel/data/data/relations.files.list
--model ./semrel/data/data/sent2vec/kgr10.bin --output-paths ./semrel/data/data/vectors/sent2vec.rel.keys
./semrel/data/data/vectors/sent2vec.rel.pt
deps:
- md5: 1f97be140dcfa90a912c0aea2fe1d502
path: semrel/data/data/relations/relations.tsv
- md5: e34115c7c699baaf15c97ca5364ad554
path: semrel/data/data/relations.files.list
- md5: 6143026d334ec949aaaf4192160e4aed
path: semrel/data/scripts/cli/make_sent2vec_map.py
outs:
- md5: 3b0e670c1466e7b44e40b6a30160d0e5
path: semrel/data/data/vectors/sent2vec.rel.keys
cache: true
metric: false
persist: false
- md5: ee24cbccd8e51c1afbb8955619cc76be
path: semrel/data/data/vectors/sent2vec.rel.pt
cache: true
metric: false
persist: false
/vectors
/sent2vec
/corpora
/elmo
/fasttext
/maps
/relations
/relations_files.list
#!/bin/bash -eux
# Registers the DVC stage file fasttext.map.dvc, whose command runs
# make_fasttext_map.py on the relations file list.
# Shell flags: -e exit on first failing command, -u error on unset variables,
# -x trace every command.
# All ./data paths below are relative to the git top-level directory.
pushd "$(git rev-parse --show-toplevel)"
# Directory containing the fastText model passed to the script via --model.
MODEL_DIR="./data/fasttext"
SCRIPTS_DIR="./data/scripts"
# Passed as --input-path and declared as a stage dependency.
INPUT_PATH="./data/relations_files.list"
# The two stage outputs; both are handed to --output-paths, keys first.
KEYS_FILE="./data/maps/fasttext.map.keys"
VECTORS_FILE="./data/maps/fasttext.map.pt"
# Create the output directory up front.
mkdir -p "./data/maps/"
# dvc run options: -d dependency, -o output, -f stage file name; the words
# after the options form the command stored in the stage.
# NOTE(review): the model file under MODEL_DIR is used by the command but is
# not declared with -d - confirm this is intentional.
dvc run \
-d ${INPUT_PATH} \
-d ${SCRIPTS_DIR}/make_fasttext_map.py \
-d ${SCRIPTS_DIR}/maps.py \
-o ${KEYS_FILE} \
-o ${VECTORS_FILE} \
-f fasttext.map.dvc \
CUDA_VISIBLE_DEVICES=0 ${SCRIPTS_DIR}/make_fasttext_map.py --input-path ${INPUT_PATH} \
--model "${MODEL_DIR}/kgr10.plain.skipgram.dim300.neg10.bin" \
--output-paths ${KEYS_FILE} ${VECTORS_FILE}
# Return to the directory the script was started from.
popd
#!/bin/bash -eux
# Registers the DVC stage file retrofit.map.dvc, whose command runs
# make_retrofit_map.py on the relations file list.
# Shell flags: -e exit on first failing command, -u error on unset variables,
# -x trace every command.
# All ./data paths below are relative to the git top-level directory.
pushd "$(git rev-parse --show-toplevel)"
# Directory containing both model files used below: the retrofitted .vec
# (--model-retrofit) and the fastText .bin (--model-fasttext).
MODEL_DIR="./data/fasttext"
SCRIPTS_DIR="./data/scripts"
# Passed as --input-path and declared as a stage dependency.
INPUT_PATH="./data/relations_files.list"
# The two stage outputs; both are handed to --output-paths, keys first.
KEYS_FILE="./data/maps/retrofit.map.keys"
VECTORS_FILE="./data/maps/retrofit.map.pt"
# Create the output directory up front.
mkdir -p "./data/maps/"
# dvc run options: -d dependency, -o output, -f stage file name; the words
# after the options form the command stored in the stage.
# NOTE(review): neither model file is declared with -d - confirm this is
# intentional.
dvc run \
-d ${INPUT_PATH} \
-d ${SCRIPTS_DIR}/make_retrofit_map.py \
-d ${SCRIPTS_DIR}/maps.py \
-o ${KEYS_FILE} \
-o ${VECTORS_FILE} \
-f retrofit.map.dvc \
CUDA_VISIBLE_DEVICES=0 ${SCRIPTS_DIR}/make_retrofit_map.py --input-path ${INPUT_PATH} \
--model-retrofit "${MODEL_DIR}/kgr10.plain.skipgram.dim300.neg10.retrofit.vec" \
--model-fasttext "${MODEL_DIR}/kgr10.plain.skipgram.dim300.neg10.bin" \
--output-paths ${KEYS_FILE} ${VECTORS_FILE}
# Return to the directory the script was started from.
popd
#!/bin/bash -eux
# Registers the DVC stage file vectors.dvc, whose command runs
# embedd_relations.py on relations.tsv using the elmo, fasttext and retrofit
# maps, writing six outputs (a .keys and a .pt file per map) to OUTPUT_DIR.
# Shell flags: -e exit on first failing command, -u error on unset variables,
# -x trace every command.

# All ./data paths below are relative to the git top-level directory.
pushd "$(git rev-parse --show-toplevel)"

SCRIPTS_DIR="./data/scripts"
# Directory with the *.map.keys / *.map.pt files; declared as one directory
# dependency rather than file by file.
MAPS_DIR="./data/maps"
# Passed as --input-path and declared as a stage dependency.
INPUT_PATH="./data/relations/relations.tsv"
OUTPUT_DIR="./data/vectors"

# Create the output directory up front.
mkdir -p ${OUTPUT_DIR}

# dvc run options: -d dependency, -o output, -f stage file name; the words
# after the options form the command stored in the stage.
# The line continuation after "-f vectors.dvc" is required: without it the
# dvc run invocation ends there with no command, and the embedding script is
# executed as a separate shell command outside the DVC stage.
dvc run \
  -d ${INPUT_PATH} \
  -d ${SCRIPTS_DIR}/embedd_relations.py \
  -d ${MAPS_DIR} \
  -o ${OUTPUT_DIR}/elmo.rel.keys \
  -o ${OUTPUT_DIR}/elmo.rel.pt \
  -o ${OUTPUT_DIR}/fasttext.rel.keys \
  -o ${OUTPUT_DIR}/fasttext.rel.pt \
  -o ${OUTPUT_DIR}/retrofit.rel.keys \
  -o ${OUTPUT_DIR}/retrofit.rel.pt \
  -f vectors.dvc \
  CUDA_VISIBLE_DEVICES=0 ${SCRIPTS_DIR}/embedd_relations.py --input-path ${INPUT_PATH} \
  --elmo-map "${MAPS_DIR}/elmo.map.keys" "${MAPS_DIR}/elmo.map.pt" \
  --fasttext-map "${MAPS_DIR}/fasttext.map.keys" "${MAPS_DIR}/fasttext.map.pt" \
  --retrofit-map "${MAPS_DIR}/retrofit.map.keys" "${MAPS_DIR}/retrofit.map.pt" \
  --output-dir ${OUTPUT_DIR}

# Return to the directory the script was started from.
popd
#!/bin/bash -eux
# Registers the DVC stage file vectors.sent2vec.dvc, whose command runs
# make_sent2vec_map.py on the relations file and the documents file list.
# Shell flags: -e exit on first failing command, -u error on unset variables,
# -x trace every command.
# All ./data paths below are relative to the git top-level directory.
pushd "$(git rev-parse --show-toplevel)"
# Passed as --relations-file and declared as a stage dependency.
RELATIONS_FILE="./data/relations/relations.tsv"
# Passed as --documents-files and declared as a stage dependency.
DOCUMENTS_FILE="./data/relations_files.list"
# Directory containing the sent2vec model (kgr10.bin) passed via --model.
MODEL_DIR="./data/sent2vec"
SCRIPTS_DIR="./data/scripts"
OUTPUT_DIR='./data/vectors'
# The two stage outputs; both are handed to --output-paths, keys first.
KEYS_FILE="${OUTPUT_DIR}/sent2vec.rel.keys"
VECTORS_FILE="${OUTPUT_DIR}/sent2vec.rel.pt"
# Create the output directory up front.
mkdir -p ${OUTPUT_DIR}
# dvc run options: -d dependency, -o output, -f stage file name; the words
# after the options form the command stored in the stage.
# NOTE(review): the model file under MODEL_DIR is used by the command but is
# not declared with -d - confirm this is intentional.
dvc run \
-d ${RELATIONS_FILE} \
-d ${DOCUMENTS_FILE} \
-d ${SCRIPTS_DIR}/make_sent2vec_map.py \
-o ${KEYS_FILE} \
-o ${VECTORS_FILE} \
-f vectors.sent2vec.dvc \
CUDA_VISIBLE_DEVICES=0 ${SCRIPTS_DIR}/make_sent2vec_map.py --relations-file ${RELATIONS_FILE} \
--documents-files ${DOCUMENTS_FILE} \
--model "${MODEL_DIR}/kgr10.bin" \
--output-paths ${KEYS_FILE} ${VECTORS_FILE}
# Return to the directory the script was started from.
popd
This diff is collapsed.
......@@ -4,7 +4,7 @@ LABEL maintainer="Łukasz Kopocinski <lkopocinski@gmail.com>"
RUN apt update && apt install -y software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa
# default ubuntu packages
RUN apt update && apt install -y \
bison \
build-essential \
......@@ -29,11 +29,19 @@ RUN apt update && apt install -y \
python3.6-dev \
python3.6-venv \
ranger \
subversion \
swig \
wget \
vim
# locale
RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
# python3.6
RUN update-alternatives --install \
/usr/bin/python python /usr/bin/python3.6 10 && \
wget https://bootstrap.pypa.io/get-pip.py && \
......@@ -41,7 +49,16 @@ RUN update-alternatives --install \
rm get-pip.py && \
pip install --upgrade pip
# Morfeusz
# requirements
WORKDIR /home/install
ENV PIP_EXTRA_INDEX_URL "https://pypi.clarin-pl.eu/"
COPY deps/requirements.txt requirements.txt
RUN pip install -r requirements.txt
# morfeusz
WORKDIR /home/install
RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|sudo apt-key add - && \
apt-add-repository http://download.sgjp.pl/apt/ubuntu && apt update && \
apt install -y *morfeusz2*
......@@ -57,7 +74,7 @@ RUN mkdir morfeusz/ && cd morfeusz/ && \
cd /home/install && \
rm -rf morfeusz-sgjp
# Corpus2
# corpus2
WORKDIR /home/install
RUN git clone http://nlp.pwr.edu.pl/corpus2.git && \
cd corpus2/ && \
......@@ -71,7 +88,7 @@ RUN git clone http://nlp.pwr.edu.pl/corpus2.git && \
cd /home/install && \
rm -rf corpus2
# Toki
# toki
WORKDIR /home/install
RUN git clone http://nlp.pwr.edu.pl/toki.git && \
cd toki/ && \
......@@ -85,7 +102,7 @@ RUN git clone http://nlp.pwr.edu.pl/toki.git && \
cd /home/install && \
rm -rf toki
# Maca
# maca
WORKDIR /home/install
RUN git clone https://gitlab.clarin-pl.eu/analysers/maca.git && \
cd maca && \
......@@ -98,7 +115,7 @@ RUN git clone https://gitlab.clarin-pl.eu/analysers/maca.git && \
cd /home/install && \
rm -rf maca
# Wccl
# wccl
WORKDIR /home/install
RUN git clone http://nlp.pwr.edu.pl/wccl.git && \
cd wccl/ && \
......@@ -112,18 +129,6 @@ RUN git clone http://nlp.pwr.edu.pl/wccl.git && \
cd /home/install && \
rm -rf wccl
RUN locale-gen en_US.UTF-8
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENV MLFLOW_TRACKING_URI "http://10.17.50.132:8080"
ENV MLFLOW_S3_ENDPOINT_URL=https://minio.clarin-pl.eu
ENV PIP_EXTRA_INDEX_URL "https://pypi.clarin-pl.eu/"
COPY requirements.txt /
RUN pip install -r /requirements.txt
# sent2vec
WORKDIR /home/install
RUN git clone https://github.com/epfml/sent2vec.git && \
......@@ -132,7 +137,4 @@ RUN git clone https://github.com/epfml/sent2vec.git && \
cd /home/install && \
rm -rf sent2vec
COPY credentials /root/.aws/credentials
WORKDIR /
ENTRYPOINT ["/bin/bash"]
WORKDIR /semrel-extraction
MLFLOW_TRACKING_URI="http://10.17.50.132:8080"
MLFLOW_S3_ENDPOINT_URL=https://minio.clarin-pl.eu
\ No newline at end of file
......@@ -21,6 +21,4 @@ torch==1.4.0
tqdm==4.42.0
virtualenv==20.0.0
virtualenv-clone==0.5.3
wordfreq==2.2.1
pudb
\ No newline at end of file
wordfreq==2.2.1
\ No newline at end of file
# Docker Compose file (format version 3) declaring one service, semrel.
# NOTE(review): nesting indentation is absent in this copy; the keys after
# `services:` are presumably nested under it - confirm against the repo file.
version: '3'
services:
semrel:
# Build context is the directory containing this file.
build: .
# Shared-memory size requested for the container.
shm_size: '300gb'
volumes:
# Host checkout two directories up, mounted at /semrel-extraction.
- ../../semrel-extraction:/semrel-extraction
# Host credentials file mounted at /root/.aws/credentials.
- ./deps/credentials:/root/.aws/credentials
# Environment variables for the container are read from this file.
env_file:
- deps/mlflow.env
# Start the container in a bash shell.
entrypoint:
- /bin/bash
md5: 3cd7f2230fbd8269f8d4d9e2ed46a98f
cmd: CUDA_VISIBLE_DEVICES=0 ./data/scripts/make_elmo_map.py --input-path ./data/relations_files.list
--model ./data/elmo/options.json ./data/elmo/weights.hdf5 --output-paths ./data/maps/elmo.map.keys
./data/maps/elmo.map.pt
deps:
- md5: 687d0ef2d76502c115ff26a39a3a13df
path: data/relations_files.list
- md5: 6e2ca6def6f8b73663848de552ef7002
path: data/scripts/make_elmo_map.py
- md5: cb4c058711de0b75989e339ada28f4dd
path: data/scripts/maps.py
outs:
- md5: 4879fee7db0b389f8afdd90d0b792369
path: data/maps/elmo.map.keys
cache: true
metric: false
persist: false
- md5: cd44d29bdc512f8fb3fe9b9e32b1f31a
path: data/maps/elmo.map.pt
cache: true
metric: false
persist: false
md5: 408a8a2465838000a8ccc2aa597b3e82
cmd: CUDA_VISIBLE_DEVICES=0 ./data/scripts/make_fasttext_map.py --input-path ./data/relations_files.list
--model ./data/fasttext/kgr10.plain.skipgram.dim300.neg10.bin --output-paths ./data/maps/fasttext.map.keys
./data/maps/fasttext.map.pt
deps:
- md5: 687d0ef2d76502c115ff26a39a3a13df
path: data/relations_files.list
- md5: a71a74a81a03cb74063e008821cf0605
path: data/scripts/make_fasttext_map.py
- md5: cb4c058711de0b75989e339ada28f4dd
path: data/scripts/maps.py
outs:
- md5: 4879fee7db0b389f8afdd90d0b792369
path: data/maps/fasttext.map.keys
cache: true
metric: false
persist: false
- md5: cf3a0329d7fb340df5061cb2ff6a6d03
path: data/maps/fasttext.map.pt
cache: true
metric: false
persist: false
This diff is collapsed.
md5: f2354e9a373b93c8c05b58fd76055ce4
cmd: find ./data/corpora/ -type f -name *.rel.xml > ./data/relations_files.list
deps:
- md5: fec63c0bd1ca725b1ed49e3df15abb13.dir
path: data/corpora
outs:
- md5: 687d0ef2d76502c115ff26a39a3a13df
path: data/relations_files.list
cache: true
metric: false
persist: false