Commit bed699bd authored by Michal Pogoda's avatar Michal Pogoda

Added readme, cleaned some stuff

parent c200a1dd
[submodule "modules/bert_document_classifier"]
path = modules/bert_document_classifier
url = https://gitlab.clarin-pl.eu/mipo57/bert_document_classifier.git
# syntax=docker/dockerfile:experimental
FROM clarinpl/cuda-python:3.7
# Install the C toolchain and Python headers needed to build wheels, plus the
# SSH client pip uses to fetch git+ssh dependencies.
# DEBIAN_FRONTEND is placed on the install command itself: a VAR=value prefix
# only applies to the single command it precedes, so putting it in front of
# `apt-get update` never reached `apt-get install`.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        gcc \
        openssh-client \
        python3-dev

# WORKDIR creates the directory when it does not exist yet.
WORKDIR /workdir

# Allow pip to install from gitlab: pre-register the host key so the SSH
# connection made during `pip install` does not stop at a host-verification
# prompt. The directory needs the search bit, hence 0700 rather than 0600.
RUN mkdir -p -m 0700 /root/.ssh && ssh-keyscan gitlab.clarin-pl.eu >> /root/.ssh/known_hosts
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt && rm requirements.txt
RUN --mount=type=secret,id=sshkey,dst=/root/.ssh/id_rsa pip3 install -r requirements.txt && rm requirements.txt
ARG USERNAME=clarin
ARG USER_UID=1000
......
# Bert Insights
## Usage
1. Compute representation on dataset (Can take some time...).
```bash
process.sh --data_dir <path_to_dataset> --output_dir <output_path> --model <path to model.pt> --gpu_device <gpu to use> [additional options]
compute.sh --data_dir <path_to_dataset> --output_dir <output_path> --model <path to model.pt> --gpu_device <gpu to use> [additional options]
```
2. Use the precomputed data to generate information (a relatively fast process).
```bash
compute.sh --data_dir <path_to_dataset> --output_dir <output_path> --model <path to model.pt> --gpu_device <gpu to use> [additional options]
```
## Important Notes
The package depends on the non-public `bert_document_classifier` repository, which Docker must be able to access in order to embed the package into the image. To allow this, the public SSH key matching your private key must be registered on `https://gitlab.clarin-pl.eu/`. By default the private key is taken from `~/.ssh/id_rsa`; pass `--ssh_key path/to/file` if you want to use a different key. You also need a Docker version that supports the `DOCKER_BUILDKIT` environment variable, as that is how the SSH key is securely passed to the Docker build environment.
## Dataset format
The format is identical to the one used to train `bert_document_classifier` (please refer to https://gitlab.clarin-pl.eu/mipo57/bert_document_classifier). However, only the single-label format is accepted. BERT Insights works with multilabel models without any problem, but it needs the single-label format to compute the representation. The easiest way to achieve that is to duplicate examples that have more than one label and remove examples without a label.
Eg.:
```json
{
"train": {
"1.txt": ["A", "B"],
"2.txt": []
},
"test": {
"3.txt": ["A"]
}
}
```
should be transformed to something like:
```json
{
"train": {
"1_1.txt": "A",
"1_2.txt": "B"
},
"test": {
"3.txt": "A"
}
}
```
\ No newline at end of file
......@@ -13,13 +13,30 @@ while test $# -gt 0; do
output_dir=$1
shift
;;
--ssh_key)
shift
ssh_key=$1
shift
;;
*)
script_params+=($1)
shift
;;
esac
done
if [ -z "$ssh_key" ]; then
ssh_key=~/.ssh/id_rsa
fi
docker_build_params=(
-q
--build-arg USER_UID=$(id -u)
--build-arg USER_GID=$(id -g)
--secret id=sshkey,src=$ssh_key
)
docker_params=(
-v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro
-v $(pwd)/bert_insight:/home/clarin/workspace/bert_insight:ro
......@@ -37,8 +54,9 @@ script_params+=(
command=(docker run)
mkdir -p $output_dir
export DOCKER_BUILDKIT=1
${command[@]} -it \
${docker_params[@]} \
$(docker build -q --build-arg USER_UID=$(id -u) --build-arg USER_GID=$(id -g) .) \
$(docker build ${docker_build_params[@]} .) \
python -m bert_insight.analize \
${script_params[@]}
\ No newline at end of file
./modules/bert_document_classifier/bert_document_classifier
\ No newline at end of file
......@@ -25,6 +25,8 @@ def main(
output_dir: str,
top_keywords: int
):
print("Analizing...")
con = sqlite3.connect(f"{data_dir}/links_raw.db")
df = pd.read_sql("SELECT * FROM links", con)
......
......@@ -18,6 +18,11 @@ while test $# -gt 0; do
model=$1
shift
;;
--ssh_key)
shift
ssh_key=$1
shift
;;
--gpu_device)
shift
gpu=$1
......@@ -30,10 +35,19 @@ while test $# -gt 0; do
esac
done
if [ -z "$ssh_key" ]; then
ssh_key=~/.ssh/id_rsa
fi
docker_build_params=(
-q
--build-arg USER_UID=$(id -u)
--build-arg USER_GID=$(id -g)
--secret id=sshkey,src=$ssh_key
)
docker_params=(
-v $(pwd)/bert_document_classifier:/home/clarin/workspace/bert_document_classifier:ro
-v $(pwd)/bert_insight:/home/clarin/workspace/bert_insight:ro
-v $(pwd)/modules:/home/clarin/workspace/modules:ro
-v $(realpath $data_dir):/home/clarin/workspace/dataset:ro
-v $(realpath $output_dir):/home/clarin/workspace/output
-v $(realpath $model):/home/clarin/workspace/model.pt:ro
......@@ -46,7 +60,7 @@ script_params+=(
--model /home/clarin/workspace/model.pt
)
command=(docker run)
# BuildKit is required for the --secret option passed to `docker build`.
# The variable has to be exported: stored inside the array it would be expanded
# as an ordinary word and executed as the command name, and it would never
# reach the `docker build` call made in the command substitution.
export DOCKER_BUILDKIT=1
command=(docker run)
if [ ! -z "$gpu" ]; then
docker_params+=(--gpus device=$gpu)
......@@ -57,6 +71,6 @@ mkdir -p $output_dir
${command[@]} -it \
${docker_params[@]} \
$(docker build -q --build-arg USER_UID=$(id -u) --build-arg USER_GID=$(id -g) .) \
$(docker build "${docker_build_params[@]}" .) \
python -m bert_insight.compute \
${script_params[@]}
\ No newline at end of file
Subproject commit c2d7c840bd468b6ab6315c0b4e58ae052ae2a6a8
......@@ -7,4 +7,5 @@ captum==0.2.0
pandas==1.1.3
scikit-learn==0.23.2
networkx==2.4
seaborn==0.11.0
\ No newline at end of file
seaborn==0.11.0
git+ssh://git@gitlab.clarin-pl.eu/mipo57/bert_document_classifier.git
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment