Unverified commit e800454d authored by Marcin Wątroba

Change model and update pipeline

parent e7a1f7ac
Part of merge request !13 "Change data model"
Showing changed files with 343 additions and 2 deletions
@@ -4,7 +4,7 @@
   <content url="file://$MODULE_DIR$">
     <excludeFolder url="file://$MODULE_DIR$/venv" />
   </content>
-  <orderEntry type="inheritedJdk" />
+  <orderEntry type="jdk" jdkName="Poetry (asr-benchmarks) (2)" jdkType="Python SDK" />
   <orderEntry type="sourceFolder" forTests="false" />
 </component>
 <component name="PyDocumentationSettings">
...
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (asr-benchmarks)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Poetry (asr-benchmarks) (2)" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
FROM danijel3/clarin-pl-speechtools:pkf
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"
# RUN python3 --version
RUN mkdir /data
RUN mkdir /data/processing_flask
ADD requirements.txt .
RUN apt-get update && apt-get install -y curl wget
RUN rm -rf /var/lib/apt/lists/*
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh
RUN conda --version
# RUN bash Miniconda3-py39_4.9.2-Linux-x86_64.sh
RUN pip install -i https://pypi.clarin-pl.eu/simple -r requirements.txt
RUN rm requirements.txt
RUN ls -l
ADD main.py .
# With an exec-form ENTRYPOINT, a duplicate CMD would be appended as extra
# arguments to main.py, so only the ENTRYPOINT is kept.
ENTRYPOINT ["python3", "-u", "main.py"]
import os
import uuid

from sziszapangma.integration.service_core.asr.asr_base_processor import AsrBaseProcessor
from sziszapangma.integration.service_core.asr.asr_result import AsrResult


class SpeechbrainAsrProcessor(AsrBaseProcessor):

    def process_asr(self, audio_file_path: str) -> AsrResult:
        # prepare paths
        file_tag = str(uuid.uuid4())
        file_extension = audio_file_path.split('.')[-1]
        file_name = f'{file_tag}.{file_extension}'
        result_file_path = f'processing_flask/{file_tag}.txt'
        file_path = f'processing_flask/{file_name}'
        # create file in /data/uuid.ext
        os.system(f"cp {audio_file_path} /data/{file_path}")
        command = f'/tools/Recognize/run.sh {file_path} {result_file_path}'
        print(f'run {command}')
        os.system(command)
        with open(f'/data/{result_file_path}', 'r') as f:
            transcription = f.read()
        transcription = transcription.replace('\n', ' ')
        # remove temp files
        os.remove(f'/data/{file_path}')
        os.remove(f'/data/{result_file_path}')
        return AsrResult(
            words=transcription.split(' '),
            full_text=transcription,
            words_time_alignment=None
        )


if __name__ == '__main__':
    SpeechbrainAsrProcessor().start_processor()
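For context, a minimal local smoke test of the wrapper above could look like the sketch below; the audio path is hypothetical, and it assumes the container layout from the Dockerfile (/data/processing_flask plus /tools/Recognize/run.sh) and that AsrResult exposes its constructor fields as attributes.

# Hypothetical smoke test, not part of the service; run inside the container
# where /tools/Recognize/run.sh and /data/processing_flask exist.
processor = SpeechbrainAsrProcessor()
result = processor.process_asr('/data/example.wav')  # example path
print(result.full_text)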
#!/bin/bash
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t asr-clarin-pl-service "$SCRIPT_DIR"
docker tag asr-clarin-pl-service docker-registry.theliver.pl/asr-clarin-pl-service:1.4
docker push docker-registry.theliver.pl/asr-clarin-pl-service:1.4
asr-benchmarks==0.0.1-alpha.48
#!/bin/bash
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
"$SCRIPT_DIR"/ajn_asr/prepare_docker.sh
"$SCRIPT_DIR"/fasttext_embedding/prepare_docker.sh
"$SCRIPT_DIR"/polish_asr_hf/prepare_docker.sh
"$SCRIPT_DIR"/speechbrain_asr/prepare_docker.sh
"$SCRIPT_DIR"/techmo_asr/prepare_docker.sh
version: "3.8"

services:
  techmo_asr:
    image: docker-registry.theliver.pl/techmo-asr:1.1
    container_name: techmo_asr
    restart: always
    ports:
      - 5001:5000
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - /home/marcinwatroba/.ssh/keys/techmo_asr_server:/keys/techmo_rsa_key:ro
    environment:
      - TECHMO_SSH_SERVER_USERNAME=mwatroba
      - TECHMO_SSH_SERVER_URL=jankocon.clarin-pl.eu
      - TECHMO_SERVER_SSH_PORT=9222
      - TECHMO_REMOTE_SERVICE_PORT=12321
      - TECHMO_SERVER_URL=156.17.135.34
      - AUTH_TOKEN=t8sv-9bwd-6rps-rs9u

  transformers-wav2vec2for_ctc:
    image: docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
    container_name: transformers-wav2vec2for_ctc
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./wav2vec2for_ctc_models:/models
    ports:
      - 5002:5000
    environment:
      - AUTH_TOKEN=x42s-qz8u-baa4-d354
      - MODEL_NAME=jonatasgrosman/wav2vec2-large-xlsr-53-polish
      - SAMPLING_RATE=16000

  embedding_service:
    image: docker-registry.theliver.pl/embedding_docker:1.0
    container_name: embeddings_service
    restart: always
    ports:
      - 5003:5000
    environment:
      - AUTH_TOKEN=fjsd-mkwe-oius-m9h2
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./embedding_models:/models

  ajn_asr:
    image: docker-registry.theliver.pl/asr-clarin-pl-service:1.4
    container_name: ajn_asr
    restart: always
    ports:
      - 5004:5000
    environment:
      - AUTH_TOKEN=am43-649g-gwa3-b9wj
    volumes:
      - /etc/localtime:/etc/localtime:ro

  speechbrain_asr:
    image: docker-registry.theliver.pl/speechbrain-asr:1.5
    container_name: speechbrain_asr
    restart: always
    ports:
      - 5005:5000
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./speechbrain_asr_models:/models
    environment:
      - AUTH_TOKEN=gwa3-b9wj-am43-649g
FROM python:3.9
WORKDIR /app
COPY requirements.txt requirements.txt
RUN pip install -i https://pypi.clarin-pl.eu/simple -r requirements.txt && rm requirements.txt
COPY main.py main.py
CMD ["python3", "-u", "main.py"]
import os
import shutil
from typing import Dict

import fasttext
import fasttext.util
import numpy as np
from fasttext.FastText import _FastText
from numpy import typing as npt
from sziszapangma.integration.service_core.embedding.embedding_base_processor \
    import EmbeddingBaseProcessor


class FasttextWebEmbeddingTransformer(EmbeddingBaseProcessor):
    _models: Dict[str, _FastText]

    def __init__(self):
        super().__init__()
        self._models = dict()

    def get_embedding(self, phrase: str, language: str) -> npt.NDArray[np.float64]:
        return self.get_model(language).get_word_vector(phrase)

    def get_model(self, language: str) -> _FastText:
        if language not in self._models:
            print(f'load model {language}')
            model_filename = f'cc.{language}.300.bin'
            model_path = f'/models/{model_filename}'
            print(f'{model_filename} {model_path}')
            if not os.path.exists(model_path):
                # download_model returns the local filename, e.g. cc.pl.300.bin
                full_model_name = fasttext.util.download_model(language,
                                                               if_exists='ignore')
                shutil.move(full_model_name, f'/models/{full_model_name}')
                print(f'downloaded {full_model_name}')
            self._models[language] = fasttext.load_model(model_path)
        return self._models[language]


if __name__ == '__main__':
    transformer = FasttextWebEmbeddingTransformer()
    transformer.get_model('pl')
    transformer.get_model('en')
    transformer.start_processor()
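As an illustration, the embedding lookup above could be exercised directly as sketched below; the word is an arbitrary example and a writable /models directory (as mounted in docker-compose.yml) is assumed.

# Hypothetical direct call; 'kot' is just an example Polish word.
transformer = FasttextWebEmbeddingTransformer()
vector = transformer.get_embedding('kot', 'pl')
print(vector.shape)  # cc.*.300.bin models yield 300-dimensional vectors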
#!/bin/bash
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t embedding_docker "$SCRIPT_DIR"
docker tag embedding_docker docker-registry.theliver.pl/embedding_docker:1.0
docker push docker-registry.theliver.pl/embedding_docker:1.0
fasttext>=0.9.2
asr-benchmarks==0.0.1-alpha.48
FROM python:3.9
WORKDIR /app
RUN apt-get update && apt-get install -y libsndfile1 && apt-get clean
COPY requirements.txt requirements.txt
RUN pip install -i https://pypi.clarin-pl.eu/simple -r requirements.txt && rm requirements.txt
COPY main.py main.py
CMD ["python3", "-u", "main.py"]
version: "3.8"

services:
  transformers-wav2vec2for_ctc:
    image: docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
    container_name: transformers-wav2vec2for_ctc
    volumes:
      - ./models:/models
    ports:
      - 5003:5000
    environment:
      - AUTH_TOKEN=test1234
      # main.py reads MODEL_NAME (not MODEL_ID), matching the production compose file
      - MODEL_NAME=jonatasgrosman/wav2vec2-large-xlsr-53-polish
      - SAMPLING_RATE=16000
\ No newline at end of file
import os
import warnings

import librosa
import torch
from sziszapangma.integration.service_core.asr.asr_base_processor import AsrBaseProcessor
from sziszapangma.integration.service_core.asr.asr_result import AsrResult
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

warnings.filterwarnings("ignore")

# MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-polish"
# SAMPLING_RATE = 16_000


class SpeechbrainAsrProcessor(AsrBaseProcessor):
    _model_id: str
    _processor: Wav2Vec2Processor
    _model: Wav2Vec2ForCTC
    _sampling_rate: int

    def __init__(self, model_id: str, sampling_rate: int):
        super().__init__()
        self._model_id = model_id
        self._sampling_rate = sampling_rate
        self._processor = Wav2Vec2Processor.from_pretrained(model_id, cache_dir='/models')
        self._model = Wav2Vec2ForCTC.from_pretrained(model_id, cache_dir='/models')

    def process_asr(self, audio_file_path: str) -> AsrResult:
        speech_array, sampling_rate = librosa.load(audio_file_path, sr=self._sampling_rate)
        inputs = self._processor([speech_array], sampling_rate=sampling_rate, return_tensors="pt",
                                 padding=True)
        with torch.no_grad():
            logits = self._model(inputs.input_values, attention_mask=inputs.attention_mask).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_sentences = self._processor.batch_decode(predicted_ids)
        transcription = predicted_sentences[0]
        return AsrResult(words=transcription.split(' '), full_text=transcription,
                         words_time_alignment=None)


if __name__ == '__main__':
    SpeechbrainAsrProcessor(
        os.environ['MODEL_NAME'],
        int(os.environ['SAMPLING_RATE'])
    ).start_processor()
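For a quick check outside the service loop, the wav2vec2 wrapper above can be constructed directly as sketched below; the file path is hypothetical and the model is cached under /models on first use.

# Hypothetical direct invocation with the model name used in docker-compose.yml.
asr = SpeechbrainAsrProcessor('jonatasgrosman/wav2vec2-large-xlsr-53-polish', 16000)
result = asr.process_asr('/tmp/sample.wav')  # example path
print(result.full_text)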
#!/bin/bash
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build -t transformers-wav2vec2for_ctc "$SCRIPT_DIR"
docker tag transformers-wav2vec2for_ctc docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
docker push docker-registry.theliver.pl/transformers-wav2vec2for_ctc:1.0
torchaudio
datasets
transformers
asrp
asr-benchmarks==0.0.1-alpha.48
librosa
\ No newline at end of file
FROM python:3.9
WORKDIR /app
ADD requirements.txt .
RUN pip install -i https://pypi.clarin-pl.eu/simple -r requirements.txt
RUN rm requirements.txt
ADD main.py .
RUN mkdir asr_processing
CMD ["python3", "-u", "main.py"]
import os

from speechbrain.pretrained import EncoderDecoderASR
from sziszapangma.integration.service_core.asr.asr_base_processor import AsrBaseProcessor
from sziszapangma.integration.service_core.asr.asr_result import AsrResult


class SpeechbrainAsrProcessor(AsrBaseProcessor):
    asr_model: EncoderDecoderASR

    def __init__(self):
        super().__init__()
        self.asr_model = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-transformer-transformerlm-librispeech"
        )

    def process_asr(self, audio_file_path: str) -> AsrResult:
        transcription = self.asr_model.transcribe_file(audio_file_path)
        os.remove(audio_file_path)
        words = [it.lower() for it in transcription.split(' ')]
        final_transcription = transcription.lower()
        return AsrResult(words=words, full_text=final_transcription, words_time_alignment=None)


if __name__ == '__main__':
    SpeechbrainAsrProcessor().start_processor()
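A standalone check of the SpeechBrain wrapper above might look like the sketch below; the path is hypothetical, and note that process_asr removes the input file, so pass a throwaway copy.

# Hypothetical standalone check; the input file is deleted by process_asr.
asr = SpeechbrainAsrProcessor()
result = asr.process_asr('/tmp/sample_copy.wav')  # example path
print(result.full_text)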
#!/bin/bash
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
docker build --no-cache -t speechbrain-asr "$SCRIPT_DIR"
docker tag speechbrain-asr docker-registry.theliver.pl/speechbrain-asr:1.5
docker push docker-registry.theliver.pl/speechbrain-asr:1.5