From f726d29ab4be698af76eb57f94ca8cdd6112eab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20W=C4=85troba?= <markowanga@gmail.com> Date: Wed, 30 Jun 2021 07:14:38 +0200 Subject: [PATCH] Add poetry --- .github/workflows/python-master.yml | 23 ++++ .github/workflows/python-package.yml | 44 +++---- .github/workflows/python-release.yml | 24 ++++ docs/conf.py | 54 ++++---- requirements.txt | 5 - requirements_dev.txt | 9 -- setup.cfg | 26 ---- setup.py | 57 --------- sziszapangma/__init__.py | 4 +- .../__pycache__/__init__.cpython-39.pyc | Bin 307 -> 307 bytes .../core/alignment/alignment_calculator.py | 116 +++++++----------- .../alignment/alignment_classic_calculator.py | 7 +- .../alignment_embedding_calculator.py | 49 ++++---- .../alignment/alignment_processing_step.py | 32 ++--- .../alignment/alignment_soft_calculator.py | 10 +- sziszapangma/core/alignment/alignment_step.py | 4 +- sziszapangma/core/alignment/alignment_util.py | 36 ++---- .../alignment/distance_matrix_calculator.py | 65 +++++----- .../cached_embedding_transformer.py | 15 +-- .../core/transformer/embedding_transformer.py | 3 +- .../fasttext_embedding_transformer.py | 13 +- sziszapangma/core/wer/span.py | 5 +- sziszapangma/core/wer/wer_calculator.py | 29 ++--- sziszapangma/integration/asr_processor.py | 5 +- .../integration/experiment_manager.py | 11 +- .../integration/gold_transcript_processor.py | 2 +- .../mapper/alignment_step_mapper.py | 17 ++- .../integration/mapper/step_words_mapper.py | 17 ++- .../integration/mapper/word_mapper.py | 10 +- sziszapangma/integration/path_filter.py | 20 +-- .../integration/record_id_iterator.py | 3 +- .../repository/experiment_repository.py | 21 +--- .../repository/file_experiment_repository.py | 31 ++--- .../repository/mongo_experiment_repository.py | 27 ++-- sziszapangma/integration/task/asr_task.py | 23 ++-- .../task/classic_wer_metric_task.py | 78 +++++------- .../task/embedding_wer_metrics_task.py | 100 +++++++-------- 
.../integration/task/gold_transcript_task.py | 22 ++-- .../integration/task/processing_task.py | 31 ++--- tests/file_stored_embedding_transformer.py | 17 +-- tests/test_classic_wer.py | 28 +++-- tests/test_embedding_wer.py | 19 ++- tests/test_soft_wer.py | 19 ++- tox.ini | 22 ---- 44 files changed, 436 insertions(+), 717 deletions(-) create mode 100644 .github/workflows/python-master.yml create mode 100644 .github/workflows/python-release.yml delete mode 100644 requirements.txt delete mode 100644 requirements_dev.txt delete mode 100644 setup.cfg delete mode 100644 setup.py delete mode 100644 tox.ini diff --git a/.github/workflows/python-master.yml b/.github/workflows/python-master.yml new file mode 100644 index 0000000..cf39de6 --- /dev/null +++ b/.github/workflows/python-master.yml @@ -0,0 +1,23 @@ +name: Release pre-release version +on: release + +jobs: + publish: + + runs-on: ubuntu-18.04 + strategy: + matrix: + python-version: [ '3.8' ] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install run dependencies + run: | + python -m pip install --upgrade pip + pip install --upgrade -r requirements.txt + - name: Publish + run: | diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 71463e7..9af6adf 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,29 +1,29 @@ name: Python package on: - pull_request: - branches: [ main, develop ] + pull_request: + branches: [ main, develop ] jobs: - build: + build: - runs-on: ubuntu-18.04 - strategy: - matrix: - python-version: [ '3.8', '3.9' ] + runs-on: ubuntu-18.04 + strategy: + matrix: + python-version: [ '3.8', '3.9' ] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - 
name: Install dependencies - run: | - ls -la - python -m pip install --upgrade pip - pip install tox - pip install --upgrade -r requirements.txt - pip install --upgrade -r requirements_dev.txt - - name: Run tox - run: tox -v + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + ls -la + python -m pip install --upgrade pip + pip install tox + pip install --upgrade -r requirements.txt + pip install --upgrade -r requirements_dev.txt + - name: Run tox + run: tox -v diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml new file mode 100644 index 0000000..42a7097 --- /dev/null +++ b/.github/workflows/python-release.yml @@ -0,0 +1,24 @@ +name: Release version +on: release + +jobs: + publish: + + runs-on: ubuntu-18.04 + strategy: + matrix: + python-version: [ '3.8' ] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install run dependencies + run: | + python -m pip install --upgrade pip + pip install --upgrade -r requirements.txt + - name: Publish + run: | + diff --git a/docs/conf.py b/docs/conf.py index 3abaefc..b16dce0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,7 +19,8 @@ # import os import sys -sys.path.insert(0, os.path.abspath('..')) + +sys.path.insert(0, os.path.abspath("..")) import sziszapangma @@ -31,22 +32,22 @@ import sziszapangma # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'sziszapangma' +project = "sziszapangma" copyright = "2021, Piotr Szymański" author = "Piotr Szymański" @@ -69,10 +70,10 @@ language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -83,7 +84,7 @@ todo_include_todos = False # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -94,13 +95,13 @@ html_theme = 'alabaster' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. 
-htmlhelp_basename = 'sziszapangmadoc' +htmlhelp_basename = "sziszapangmadoc" # -- Options for LaTeX output ------------------------------------------ @@ -109,15 +110,12 @@ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -127,9 +125,7 @@ latex_elements = { # (source start file, target name, title, author, documentclass # [howto, manual, or own class]). latex_documents = [ - (master_doc, 'sziszapangma.tex', - 'sziszapangma Documentation', - 'Piotr Szymański', 'manual'), + (master_doc, "sziszapangma.tex", "sziszapangma Documentation", "Piotr Szymański", "manual"), ] @@ -137,11 +133,7 @@ latex_documents = [ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'sziszapangma', - 'sziszapangma Documentation', - [author], 1) -] +man_pages = [(master_doc, "sziszapangma", "sziszapangma Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------- @@ -150,13 +142,13 @@ man_pages = [ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'sziszapangma', - 'sziszapangma Documentation', - author, - 'sziszapangma', - 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "sziszapangma", + "sziszapangma Documentation", + author, + "sziszapangma", + "One line description of project.", + "Miscellaneous", + ), ] - - - diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 6568fe0..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy>=1.20.1 -requests>=2.25.1 -pandas>=1.2.4 -fasttext>=0.9.2 -pymongo>=3.11.4 diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file 
mode 100644 index 91f97d1..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,9 +0,0 @@ -wheel==0.36.2 -watchdog==2.1.3 -flake8==3.9.2 -tox==3.23.1 -coverage==5.5 -Sphinx==4.0.2 -twine==3.4.1 -pytest==6.2.4 -pytest-runner==5.3.1 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 2642f6a..0000000 --- a/setup.cfg +++ /dev/null @@ -1,26 +0,0 @@ -[bumpversion] -current_version = 0.1.0 -commit = True -tag = True - -[bumpversion:file:setup.py] -search = version='{current_version}' -replace = version='{new_version}' - -[bumpversion:file:sziszapangma/__init__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' - -[bdist_wheel] -universal = 1 - -[flake8] -exclude = docs - -[aliases] -# Define setup.py command aliases here -test = pytest - -;[tool:pytest] -;collect_ignore = ['setup.py'] - diff --git a/setup.py b/setup.py deleted file mode 100644 index 4830c9e..0000000 --- a/setup.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python - -"""The setup script.""" - -from setuptools import setup, find_packages - -with open('README.rst') as readme_file: - readme = readme_file.read() - -with open('HISTORY.rst') as history_file: - history = history_file.read() - -with open("requirements.txt", "r") as fh: - requirements = fh.readlines() - -with open("requirements_dev.txt", "r") as fh: - requirements_dev = fh.readlines() + requirements - -setup_requirements = ['pytest-runner', ] - -test_requirements = ['pytest>=3', ] - -setup( - author="Piotr Szymański", - author_email='niedakh@gmail.com', - python_requires='>=3.5', - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: MIT License', - 'Natural Language :: English', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - ], - description="A n", - 
entry_points={ - 'console_scripts': [ - 'sziszapangma=sziszapangma.cli:main', - ], - }, - install_requires=requirements, - license="MIT license", - long_description=readme + '\n\n' + history, - include_package_data=True, - keywords='sziszapangma', - name='sziszapangma', - packages=find_packages(include=['sziszapangma', 'sziszapangma.*']), - setup_requires=requirements_dev, - test_suite='tests', - tests_require=requirements_dev, - url='https://github.com/niedakh/sziszapangma', - version='0.1.0', - zip_safe=False, -) diff --git a/sziszapangma/__init__.py b/sziszapangma/__init__.py index 00e971c..1ae4cd1 100644 --- a/sziszapangma/__init__.py +++ b/sziszapangma/__init__.py @@ -1,5 +1,5 @@ """Top-level package for sziszapangma.""" __author__ = """Piotr Szymański""" -__email__ = 'niedakh@gmail.com' -__version__ = '0.1.0' +__email__ = "niedakh@gmail.com" +__version__ = "0.1.0" diff --git a/sziszapangma/__pycache__/__init__.cpython-39.pyc b/sziszapangma/__pycache__/__init__.cpython-39.pyc index 125189747dbbc47416dd2dc5f0eaaeb1d97f848b..0afbe57fd7e7215829ae3fee5d0f567fffa32ca7 100644 GIT binary patch delta 20 acmdnYw3&%Jk(ZZ?0SKl~y1kKGgAo8SE(GKN delta 20 acmdnYw3&%Jk(ZZ?0SM}v8#i)mFaiKCV+3LV diff --git a/sziszapangma/core/alignment/alignment_calculator.py b/sziszapangma/core/alignment/alignment_calculator.py index f69ec95..d22c0a8 100644 --- a/sziszapangma/core/alignment/alignment_calculator.py +++ b/sziszapangma/core/alignment/alignment_calculator.py @@ -1,15 +1,13 @@ from abc import ABC -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import numpy as np -from sziszapangma.core.alignment.step_type import StepType +from sziszapangma.core.alignment.alignment_processing_step import AlignmentProcessingStep from sziszapangma.core.alignment.alignment_step import AlignmentStep -from sziszapangma.core.alignment.distance_matrix_calculator import \ - DistanceCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import 
DistanceCalculator +from sziszapangma.core.alignment.step_type import StepType from sziszapangma.core.alignment.step_words import StepWords -from sziszapangma.core.alignment.alignment_processing_step import \ - AlignmentProcessingStep from sziszapangma.core.alignment.word import Word @@ -28,10 +26,7 @@ class AlignmentCalculator(ABC): for step in processing_steps ] - def _get_reference_indexes_per_steps( - self, - steps: List[AlignmentProcessingStep] - ) -> List[int]: + def _get_reference_indexes_per_steps(self, steps: List[AlignmentProcessingStep]) -> List[int]: counter = 0 indexes = [] for step in steps: @@ -41,99 +36,87 @@ class AlignmentCalculator(ABC): return indexes def get_distance_matrix_between_words( - self, - reference: List[Word], - hypothesis: List[Word] + self, reference: List[Word], hypothesis: List[Word] ) -> np.ndarray: - return self._distance_matrix_calculator.calculate_distance_matrix( - reference, hypothesis) + return self._distance_matrix_calculator.calculate_distance_matrix(reference, hypothesis) @staticmethod def _get_initialized_levenshtein_matrix( - reference: List[Word], - hypothesis: List[Word] + reference: List[Word], hypothesis: List[Word] ) -> Tuple[np.ndarray, List[List[Optional[AlignmentProcessingStep]]]]: # TODO: consider about remove distance_arr replaced by steps_arr reference_len = len(reference) hypothesis_len = len(hypothesis) - distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)) \ - .reshape((reference_len + 1, hypothesis_len + 1)) - steps_arr = [ - [None for _ in range(hypothesis_len + 1)] - for _ in range(reference_len + 1) - ] + distance_arr = np.zeros((reference_len + 1) * (hypothesis_len + 1)).reshape( + (reference_len + 1, hypothesis_len + 1) + ) + steps_arr = [[None for _ in range(hypothesis_len + 1)] for _ in range(reference_len + 1)] # levenshtein initial for ref_index in range(reference_len + 1): distance_arr[ref_index][0] = ref_index - step_words = StepWords( - reference[ref_index - 1] if ref_index > 
0 else None, - None + step_words = StepWords(reference[ref_index - 1] if ref_index > 0 else None, None) + steps_arr[ref_index][0] = AlignmentProcessingStep.levenshtein_deletion( + ref_index - 1, step_words ) - steps_arr[ref_index][0] = AlignmentProcessingStep\ - .levenshtein_deletion(ref_index - 1, step_words) for hyp_index in range(hypothesis_len + 1): distance_arr[0][hyp_index] = hyp_index - step_words = StepWords( - None, - hypothesis[hyp_index - 1] if hyp_index > 0 else None + step_words = StepWords(None, hypothesis[hyp_index - 1] if hyp_index > 0 else None) + steps_arr[0][hyp_index] = AlignmentProcessingStep.levenshtein_insertion( + hyp_index - 1, step_words ) - steps_arr[0][hyp_index] = AlignmentProcessingStep\ - .levenshtein_insertion(hyp_index - 1, step_words) return distance_arr, steps_arr @staticmethod def _get_levenshtein_processing_step_cross( - prev_cross_distance: float, - step_words: StepWords, - current_distance: float + prev_cross_distance: float, step_words: StepWords, current_distance: float ) -> AlignmentProcessingStep: - return AlignmentProcessingStep.levenshtein_correct( - prev_cross_distance, step_words, 0) \ - if current_distance == 0 \ + return ( + AlignmentProcessingStep.levenshtein_correct(prev_cross_distance, step_words, 0) + if current_distance == 0 else AlignmentProcessingStep.levenshtein_substitution( - prev_cross_distance, step_words, current_distance) + prev_cross_distance, step_words, current_distance + ) + ) def get_levenshtein_embedding_based( - self, - reference: List[Word], - hypothesis: List[Word], - distance_matrix: np.ndarray + self, reference: List[Word], hypothesis: List[Word], distance_matrix: np.ndarray ) -> Tuple[np.ndarray, List[List[AlignmentProcessingStep]]]: reference_len = len(reference) hypothesis_len = len(hypothesis) - distance_arr, steps_arr = self._get_initialized_levenshtein_matrix( - reference, hypothesis) + distance_arr, steps_arr = self._get_initialized_levenshtein_matrix(reference, hypothesis) for 
ref_index in range(reference_len): for hyp_index in range(hypothesis_len): - step_words = StepWords(reference[ref_index], - hypothesis[hyp_index]) + step_words = StepWords(reference[ref_index], hypothesis[hyp_index]) current_distance = distance_matrix[ref_index][hyp_index] prev_cross_distance = distance_arr[ref_index][hyp_index] cross_go_step = self._get_levenshtein_processing_step_cross( - prev_cross_distance, step_words, current_distance) + prev_cross_distance, step_words, current_distance + ) insertion_step = AlignmentProcessingStep.levenshtein_insertion( - distance_arr[ref_index + 1][hyp_index], step_words) + distance_arr[ref_index + 1][hyp_index], step_words + ) deletion_step = AlignmentProcessingStep.levenshtein_deletion( - distance_arr[ref_index][hyp_index + 1], step_words) + distance_arr[ref_index][hyp_index + 1], step_words + ) - best_step = min([cross_go_step, insertion_step, deletion_step], - key=lambda it: it.total_distance()) + best_step = min( + [cross_go_step, insertion_step, deletion_step], + key=lambda it: it.total_distance(), + ) - distance_arr[ref_index + 1][hyp_index + 1] = \ - best_step.total_distance() + distance_arr[ref_index + 1][hyp_index + 1] = best_step.total_distance() steps_arr[ref_index + 1][hyp_index + 1] = best_step return distance_arr, steps_arr def extract_steps_path( - self, - steps_matrix: List[List[AlignmentProcessingStep]] + self, steps_matrix: List[List[AlignmentProcessingStep]] ) -> List[AlignmentProcessingStep]: x = len(steps_matrix) - 1 y = len(steps_matrix[0]) - 1 @@ -151,29 +134,22 @@ class AlignmentCalculator(ABC): return to_return[::-1] def _calculate_steps_path( - self, - reference: List[Word], - hypothesis: List[Word] + self, reference: List[Word], hypothesis: List[Word] ) -> List[AlignmentProcessingStep]: - distance_between_words = self.get_distance_matrix_between_words( - reference, hypothesis) + distance_between_words = self.get_distance_matrix_between_words(reference, hypothesis) _, steps_matrix = 
self.get_levenshtein_embedding_based( - reference, hypothesis, distance_between_words) + reference, hypothesis, distance_between_words + ) return self.extract_steps_path(steps_matrix) def calculate_alignment( - self, - reference: List[Word], - hypothesis: List[Word] + self, reference: List[Word], hypothesis: List[Word] ) -> List[AlignmentStep]: steps_path = self._calculate_steps_path(reference, hypothesis) return self.convert_processing_steps_to_result(steps_path) def calculate_alignment_weighted( - self, - reference: List[Word], - hypothesis: List[Word], - weights: List[float] + self, reference: List[Word], hypothesis: List[Word], weights: List[float] ) -> List[AlignmentStep]: steps_path = self._calculate_steps_path(reference, hypothesis) return self.convert_processing_steps_to_result(steps_path) diff --git a/sziszapangma/core/alignment/alignment_classic_calculator.py b/sziszapangma/core/alignment/alignment_classic_calculator.py index fbf60eb..cab2093 100644 --- a/sziszapangma/core/alignment/alignment_classic_calculator.py +++ b/sziszapangma/core/alignment/alignment_classic_calculator.py @@ -1,10 +1,7 @@ -from sziszapangma.core.alignment.alignment_calculator import \ - AlignmentCalculator -from sziszapangma.core.alignment.distance_matrix_calculator import \ - BinaryDistanceCalculator +from sziszapangma.core.alignment.alignment_calculator import AlignmentCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import BinaryDistanceCalculator class AlignmentClassicCalculator(AlignmentCalculator): - def __init__(self): super().__init__(BinaryDistanceCalculator()) diff --git a/sziszapangma/core/alignment/alignment_embedding_calculator.py b/sziszapangma/core/alignment/alignment_embedding_calculator.py index a20802d..8b01730 100644 --- a/sziszapangma/core/alignment/alignment_embedding_calculator.py +++ b/sziszapangma/core/alignment/alignment_embedding_calculator.py @@ -1,15 +1,15 @@ -from typing import List +from typing import List, Optional -from 
sziszapangma.core.alignment.alignment_calculator import \ - AlignmentCalculator -from sziszapangma.core.alignment.alignment_processing_step import \ - AlignmentProcessingStep +from sziszapangma.core.alignment.alignment_calculator import AlignmentCalculator +from sziszapangma.core.alignment.alignment_processing_step import AlignmentProcessingStep from sziszapangma.core.alignment.alignment_step import AlignmentStep -from sziszapangma.core.alignment.distance_matrix_calculator import \ - BinaryDistanceCalculator, DistanceCalculator, CosineDistanceCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import ( + BinaryDistanceCalculator, + CosineDistanceCalculator, + DistanceCalculator, +) from sziszapangma.core.alignment.step_words import StepWords -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer class AlignmentEmbeddingCalculator(AlignmentCalculator): @@ -18,33 +18,28 @@ class AlignmentEmbeddingCalculator(AlignmentCalculator): def __init__(self, embedding_transformer: EmbeddingTransformer): super().__init__(BinaryDistanceCalculator()) self._embedding_transformer = embedding_transformer - self._distance_calculator = CosineDistanceCalculator( - embedding_transformer) + self._distance_calculator = CosineDistanceCalculator(embedding_transformer) - def _calculate_distance_for_word_step( - self, - step_words: StepWords - ) -> float: + def _calculate_distance_for_word_step(self, step_words: StepWords) -> float: + Optional return self._distance_calculator.calculate_distance_for_words( - step_words.reference_word, - step_words.hypothesis_word + step_words.reference_word, step_words.hypothesis_word ) - def _calculate_result_cost_for_step( - self, - processing_step: AlignmentProcessingStep - ) -> float: + def _calculate_result_cost_for_step(self, processing_step: AlignmentProcessingStep) -> float: step_words = processing_step.step_words 
- return self._calculate_distance_for_word_step(step_words) \ - if processing_step.step_type.is_cross_step() \ + return ( + self._calculate_distance_for_word_step(step_words) + if processing_step.step_type.is_cross_step() else processing_step.step_cost + ) def convert_processing_steps_to_result( - self, - processing_steps: List[AlignmentProcessingStep] + self, processing_steps: List[AlignmentProcessingStep] ) -> List[AlignmentStep]: return [ - AlignmentStep(step.step_type, step.step_words, - self._calculate_result_cost_for_step(step)) + AlignmentStep( + step.step_type, step.step_words, self._calculate_result_cost_for_step(step) + ) for step in processing_steps ] diff --git a/sziszapangma/core/alignment/alignment_processing_step.py b/sziszapangma/core/alignment/alignment_processing_step.py index e4ab96d..ed70f94 100644 --- a/sziszapangma/core/alignment/alignment_processing_step.py +++ b/sziszapangma/core/alignment/alignment_processing_step.py @@ -12,30 +12,30 @@ class AlignmentProcessingStep: step_cost: float @classmethod - def levenshtein_insertion(cls, previous_distance: float, - step_words: StepWords, step_cost: float = 1): + def levenshtein_insertion( + cls, previous_distance: float, step_words: StepWords, step_cost: float = 1 + ): words = StepWords(None, step_words.hypothesis_word) - return AlignmentProcessingStep(StepType.INSERTION, words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.INSERTION, words, previous_distance, step_cost) @classmethod - def levenshtein_deletion(cls, previous_distance: float, - step_words: StepWords, step_cost: float = 1): + def levenshtein_deletion( + cls, previous_distance: float, step_words: StepWords, step_cost: float = 1 + ): words = StepWords(step_words.reference_word, None) - return AlignmentProcessingStep(StepType.DELETION, words, - previous_distance, step_cost) + return AlignmentProcessingStep(StepType.DELETION, words, previous_distance, step_cost) @classmethod - def levenshtein_substitution(cls, 
previous_distance: float, - step_words: StepWords, step_cost: float): - return AlignmentProcessingStep(StepType.SUBSTITUTION, step_words, - previous_distance, step_cost) + def levenshtein_substitution( + cls, previous_distance: float, step_words: StepWords, step_cost: float + ): + return AlignmentProcessingStep( + StepType.SUBSTITUTION, step_words, previous_distance, step_cost + ) @classmethod - def levenshtein_correct(cls, previous_distance: float, - step_words: StepWords, step_cost: float): - return AlignmentProcessingStep(StepType.CORRECT, step_words, - previous_distance, step_cost) + def levenshtein_correct(cls, previous_distance: float, step_words: StepWords, step_cost: float): + return AlignmentProcessingStep(StepType.CORRECT, step_words, previous_distance, step_cost) def total_distance(self) -> float: return self.step_cost + self.previous_distance diff --git a/sziszapangma/core/alignment/alignment_soft_calculator.py b/sziszapangma/core/alignment/alignment_soft_calculator.py index c7de34c..6266390 100644 --- a/sziszapangma/core/alignment/alignment_soft_calculator.py +++ b/sziszapangma/core/alignment/alignment_soft_calculator.py @@ -1,12 +1,8 @@ -from sziszapangma.core.alignment.alignment_calculator import \ - AlignmentCalculator -from sziszapangma.core.alignment.distance_matrix_calculator import \ - CosineDistanceCalculator -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer +from sziszapangma.core.alignment.alignment_calculator import AlignmentCalculator +from sziszapangma.core.alignment.distance_matrix_calculator import CosineDistanceCalculator +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer class AlignmentSoftCalculator(AlignmentCalculator): - def __init__(self, embedding_transformer: EmbeddingTransformer): super().__init__(CosineDistanceCalculator(embedding_transformer)) diff --git a/sziszapangma/core/alignment/alignment_step.py b/sziszapangma/core/alignment/alignment_step.py 
index cefd0d1..2663b06 100644 --- a/sziszapangma/core/alignment/alignment_step.py +++ b/sziszapangma/core/alignment/alignment_step.py @@ -12,7 +12,5 @@ class AlignmentStep: def with_weight_multiplication(self, weight: float): return AlignmentStep( - step_type=self.step_type, - step_words=self.step_words, - step_cost=self.step_cost * weight + step_type=self.step_type, step_words=self.step_words, step_cost=self.step_cost * weight ) diff --git a/sziszapangma/core/alignment/alignment_util.py b/sziszapangma/core/alignment/alignment_util.py index c188731..31d3ab3 100644 --- a/sziszapangma/core/alignment/alignment_util.py +++ b/sziszapangma/core/alignment/alignment_util.py @@ -7,36 +7,26 @@ from sziszapangma.core.alignment.alignment_step import AlignmentStep class AlignmentUtil: - @staticmethod def _optional_str_to_str(value: Optional[str]) -> str: - return value if value is not None else '' + return value if value is not None else "" @staticmethod def _wer_step_to_pandas_row_lit(step: AlignmentStep) -> List[any]: return [ step.step_type.get_short_name(), AlignmentUtil._optional_str_to_str(step.step_words.reference_word), - AlignmentUtil._optional_str_to_str( - step.step_words.hypothesis_word), - round(step.step_cost, 3) + AlignmentUtil._optional_str_to_str(step.step_words.hypothesis_word), + round(step.step_cost, 3), ] @staticmethod def steps_to_dataframe(steps: List[AlignmentStep]) -> pd.DataFrame: - arr = np.array([ - AlignmentUtil._wer_step_to_pandas_row_lit(step) - for step in steps - ]) - return pd.DataFrame( - arr, - columns=['step_type', 'reference', 'hypothesis', 'cost'] - ) + arr = np.array([AlignmentUtil._wer_step_to_pandas_row_lit(step) for step in steps]) + return pd.DataFrame(arr, columns=["step_type", "reference", "hypothesis", "cost"]) @staticmethod - def get_reference_indexes_per_steps( - steps: List[AlignmentStep] - ) -> List[int]: + def get_reference_indexes_per_steps(steps: List[AlignmentStep]) -> List[int]: counter = 0 indexes = [] for step in steps: 
@@ -47,25 +37,19 @@ class AlignmentUtil: @staticmethod def get_reference_length(steps: List[AlignmentStep]) -> int: - return sum([ - 1 if step.step_type.contain_reference_word() else 0 - for step in steps - ]) + return sum([1 if step.step_type.contain_reference_word() else 0 for step in steps]) @staticmethod def apply_weights_to_alignment( - steps: List[AlignmentStep], - weights: List[float] + steps: List[AlignmentStep], weights: List[float] ) -> List[AlignmentStep]: if AlignmentUtil.get_reference_length(steps) != len(weights): raise Exception( f"Incorrect length of weights, current={len(weights)}, " f"required={AlignmentUtil.get_reference_length(steps)}" ) - reference_indexes_per_steps = \ - AlignmentUtil.get_reference_indexes_per_steps(steps) + reference_indexes_per_steps = AlignmentUtil.get_reference_indexes_per_steps(steps) return [ - steps[index].with_weight_multiplication( - weights[reference_indexes_per_steps[index]]) + steps[index].with_weight_multiplication(weights[reference_indexes_per_steps[index]]) for index in range(len(steps)) ] diff --git a/sziszapangma/core/alignment/distance_matrix_calculator.py b/sziszapangma/core/alignment/distance_matrix_calculator.py index 5f17ea7..ee09960 100644 --- a/sziszapangma/core/alignment/distance_matrix_calculator.py +++ b/sziszapangma/core/alignment/distance_matrix_calculator.py @@ -3,18 +3,13 @@ from typing import List import numpy as np -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer from sziszapangma.core.alignment.word import Word +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer class DistanceCalculator(ABC): @abstractmethod - def calculate_distance_matrix( - self, - reference: List[Word], - hypothesis: List[Word] - ) -> np.array: + def calculate_distance_matrix(self, reference: List[Word], hypothesis: List[Word]) -> np.array: pass @abstractmethod @@ -26,16 +21,16 @@ class BinaryDistanceCalculator(DistanceCalculator): def 
calculate_distance_for_words(self, word1: Word, word2: Word) -> float: return 0 if word1.value == word2.value else 1 - def calculate_distance_matrix( - self, - reference: List[Word], - hypothesis: List[Word] - ) -> np.array: - return np.array([ - [self.calculate_distance_for_words(reference_word, hypothesis_word) - for hypothesis_word in hypothesis] - for reference_word in reference - ]) + def calculate_distance_matrix(self, reference: List[Word], hypothesis: List[Word]) -> np.array: + return np.array( + [ + [ + self.calculate_distance_for_words(reference_word, hypothesis_word) + for hypothesis_word in hypothesis + ] + for reference_word in reference + ] + ) class CosineDistanceCalculator(DistanceCalculator): @@ -47,19 +42,17 @@ class CosineDistanceCalculator(DistanceCalculator): def calculate_distance_for_words(self, word1: Word, word2: Word) -> float: return self.cosine_distance_between_words_embeddings( self._embedding_transformer.get_embedding(word1.value), - self._embedding_transformer.get_embedding(word2.value) + self._embedding_transformer.get_embedding(word2.value), ) @staticmethod def cosine_distance_between_words_embeddings( - word1_embedding: np.array, - word2_embedding: np.array + word1_embedding: np.array, word2_embedding: np.array ) -> float: a = word1_embedding b = word2_embedding if a.shape != b.shape: - raise RuntimeError( - "array {} shape not match {}".format(a.shape, b.shape)) + raise RuntimeError("array {} shape not match {}".format(a.shape, b.shape)) if a.ndim == 1: a_norm = np.linalg.norm(a) b_norm = np.linalg.norm(b) @@ -69,22 +62,22 @@ class CosineDistanceCalculator(DistanceCalculator): else: raise RuntimeError("array dimensions {} not right".format(a.ndim)) similarity = np.dot(a, b.T) / (a_norm * b_norm) - dist = 1. 
- similarity + dist = 1.0 - similarity return dist - def calculate_distance_matrix( - self, - reference: List[Word], - hypothesis: List[Word] - ) -> np.array: + def calculate_distance_matrix(self, reference: List[Word], hypothesis: List[Word]) -> np.array: embeddings_dict = self._embedding_transformer.get_embeddings( list(set(it.value for it in (reference + hypothesis))) ) - return np.array([[ - self.cosine_distance_between_words_embeddings( - embeddings_dict[reference_word.value], - embeddings_dict[hypothesis_word.value], - ) - for hypothesis_word in hypothesis] - for reference_word in reference - ]) + return np.array( + [ + [ + self.cosine_distance_between_words_embeddings( + embeddings_dict[reference_word.value], + embeddings_dict[hypothesis_word.value], + ) + for hypothesis_word in hypothesis + ] + for reference_word in reference + ] + ) diff --git a/sziszapangma/core/transformer/cached_embedding_transformer.py b/sziszapangma/core/transformer/cached_embedding_transformer.py index f58fe33..1cb6c86 100644 --- a/sziszapangma/core/transformer/cached_embedding_transformer.py +++ b/sziszapangma/core/transformer/cached_embedding_transformer.py @@ -1,9 +1,8 @@ -from typing import List, Dict +from typing import Dict, List import numpy as np -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer class CachedEmbeddingTransformer(EmbeddingTransformer): @@ -19,14 +18,12 @@ class CachedEmbeddingTransformer(EmbeddingTransformer): def get_embeddings(self, words: List[str]) -> Dict[str, np.ndarray]: new_words = [word for word in words if word not in self._cache] - new_embeddings = self._embeddings_transformer\ - .get_embeddings(new_words) if len(new_words) > 0 else dict() + new_embeddings = ( + self._embeddings_transformer.get_embeddings(new_words) if len(new_words) > 0 else dict() + ) for new_word in new_words: self._cache[new_word] = 
new_embeddings[new_word] - return { - word: self._cache[word] - for word in words - } + return {word: self._cache[word] for word in words} def clear(self): self._cache.clear() diff --git a/sziszapangma/core/transformer/embedding_transformer.py b/sziszapangma/core/transformer/embedding_transformer.py index a953f44..52a698c 100644 --- a/sziszapangma/core/transformer/embedding_transformer.py +++ b/sziszapangma/core/transformer/embedding_transformer.py @@ -1,11 +1,10 @@ from abc import ABC, abstractmethod -from typing import List, Dict +from typing import Dict, List import numpy as np class EmbeddingTransformer(ABC): - @abstractmethod def get_embeddings(self, words: List[str]) -> Dict[str, np.ndarray]: pass diff --git a/sziszapangma/core/transformer/fasttext_embedding_transformer.py b/sziszapangma/core/transformer/fasttext_embedding_transformer.py index a74ac7e..bb38a71 100644 --- a/sziszapangma/core/transformer/fasttext_embedding_transformer.py +++ b/sziszapangma/core/transformer/fasttext_embedding_transformer.py @@ -1,11 +1,10 @@ -from typing import List, Dict +from typing import Dict, List import fasttext import fasttext.util import numpy as np -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer class FasttextEmbeddingTransformer(EmbeddingTransformer): @@ -13,15 +12,11 @@ class FasttextEmbeddingTransformer(EmbeddingTransformer): _model_name: str def __init__(self, lang_id: str): - full_model_name = fasttext.util.download_model( - lang_id, if_exists='ignore') + full_model_name = fasttext.util.download_model(lang_id, if_exists="ignore") self._fasttext_model = fasttext.load_model(full_model_name) def get_embedding(self, word: str) -> np.ndarray: return self._fasttext_model.get_word_vector(word) def get_embeddings(self, words: List[str]) -> Dict[str, np.ndarray]: - return { - word: self.get_embedding(word) - for word in set(words) - } + return 
{word: self.get_embedding(word) for word in set(words)} diff --git a/sziszapangma/core/wer/span.py b/sziszapangma/core/wer/span.py index 44cfe84..a07be75 100644 --- a/sziszapangma/core/wer/span.py +++ b/sziszapangma/core/wer/span.py @@ -10,7 +10,4 @@ class Span: return self.index_start <= index < self.index_end def get_reference_mask_table(self, total_size: int): - return [ - self.is_index_belong(it) - for it in range(total_size) - ] + return [self.is_index_belong(it) for it in range(total_size)] diff --git a/sziszapangma/core/wer/wer_calculator.py b/sziszapangma/core/wer/wer_calculator.py index 3fa65db..b5cdab3 100644 --- a/sziszapangma/core/wer/wer_calculator.py +++ b/sziszapangma/core/wer/wer_calculator.py @@ -7,20 +7,17 @@ from sziszapangma.core.wer.span import Span class WerCalculator(ABC): - @staticmethod def _convert_processing_steps_to_result( - input_steps: List[AlignmentStep], - span: Span + input_steps: List[AlignmentStep], span: Span ) -> List[AlignmentStep]: - indexes_per_steps = AlignmentUtil.get_reference_indexes_per_steps( - input_steps) + indexes_per_steps = AlignmentUtil.get_reference_indexes_per_steps(input_steps) return [ AlignmentStep( input_steps[step_index].step_type, input_steps[step_index].step_words, - input_steps[step_index].step_cost * span.is_index_belong( - indexes_per_steps[step_index]) + input_steps[step_index].step_cost + * span.is_index_belong(indexes_per_steps[step_index]), ) for step_index in range(len(input_steps)) ] @@ -32,21 +29,13 @@ class WerCalculator(ABC): reference_len = AlignmentUtil.get_reference_length(steps) return sum([step.step_cost for step in steps]) / reference_len - def calculate_wer( - self, - steps: List[AlignmentStep] - ) -> float: + def calculate_wer(self, steps: List[AlignmentStep]) -> float: return self._calculate_wer(steps) - def calculate_wer_for_spans( - self, - steps: List[AlignmentStep], - spans: List[Span] - ) -> List[float]: + def calculate_wer_for_spans(self, steps: List[AlignmentStep], spans: 
List[Span]) -> List[float]: return [ - self._calculate_wer(self._convert_processing_steps_to_result( - input_steps=steps, - span=span - )) + self._calculate_wer( + self._convert_processing_steps_to_result(input_steps=steps, span=span) + ) for span in spans ] diff --git a/sziszapangma/integration/asr_processor.py b/sziszapangma/integration/asr_processor.py index 580695a..5859f06 100644 --- a/sziszapangma/integration/asr_processor.py +++ b/sziszapangma/integration/asr_processor.py @@ -1,11 +1,10 @@ from abc import ABC, abstractmethod -from typing import List, Dict +from typing import Dict, List import requests class AsrProcessor(ABC): - @abstractmethod def call_recognise(self, file_path: str) -> List[Dict[str, any]]: """ @@ -23,7 +22,7 @@ class AsrWebClient(AsrProcessor): self._url = url def call_recognise(self, file_path: str) -> List[Dict[str, any]]: - files = {'file': open(file_path, 'rb')} + files = {"file": open(file_path, "rb")} res = requests.post(self._url, files=files) json_response = res.json() print(json_response) diff --git a/sziszapangma/integration/experiment_manager.py b/sziszapangma/integration/experiment_manager.py index b28ad39..1dd2528 100644 --- a/sziszapangma/integration/experiment_manager.py +++ b/sziszapangma/integration/experiment_manager.py @@ -1,7 +1,7 @@ from typing import List -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository + from .record_id_iterator import RecordIdIterator from .task.processing_task import ProcessingTask @@ -15,7 +15,7 @@ class ExperimentManager: self, experiment_repository: ExperimentRepository, record_id_iterator: RecordIdIterator, - processing_tasks: List[ProcessingTask] + processing_tasks: List[ProcessingTask], ): self._experiment_repository = experiment_repository self._record_id_iterator = record_id_iterator @@ -24,7 +24,4 @@ class ExperimentManager: def process(self): 
self._experiment_repository.initialise() for processing_task in self._processing_tasks: - processing_task.process( - self._record_id_iterator, - self._experiment_repository - ) + processing_task.process(self._record_id_iterator, self._experiment_repository) diff --git a/sziszapangma/integration/gold_transcript_processor.py b/sziszapangma/integration/gold_transcript_processor.py index a632254..754efd3 100644 --- a/sziszapangma/integration/gold_transcript_processor.py +++ b/sziszapangma/integration/gold_transcript_processor.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Dict +from typing import Dict, List class GoldTranscriptProcessor(ABC): diff --git a/sziszapangma/integration/mapper/alignment_step_mapper.py b/sziszapangma/integration/mapper/alignment_step_mapper.py index 8b3bf9b..de43514 100644 --- a/sziszapangma/integration/mapper/alignment_step_mapper.py +++ b/sziszapangma/integration/mapper/alignment_step_mapper.py @@ -1,16 +1,23 @@ from typing import Dict from sziszapangma.core.alignment.alignment_step import AlignmentStep +from sziszapangma.core.alignment.step_type import StepType from sziszapangma.integration.mapper.step_words_mapper import StepWordsMapper class AlignmentStepMapper: - @staticmethod def to_json_dict(alignment_step: AlignmentStep) -> Dict[str, any]: return { - 'step_type': alignment_step.step_type.name, - 'step_words': StepWordsMapper.to_json_dict( - alignment_step.step_words), - 'step_cost': alignment_step.step_cost + "step_type": alignment_step.step_type.name, + "step_words": StepWordsMapper.to_json_dict(alignment_step.step_words), + "step_cost": alignment_step.step_cost, } + + @staticmethod + def from_json_dict(input_json_dict: Dict[str, any]) -> AlignmentStep: + return AlignmentStep( + StepType[input_json_dict["step_type"]], + StepWordsMapper.from_json_dict(input_json_dict["step_words"]), + input_json_dict["step_cost"], + ) diff --git a/sziszapangma/integration/mapper/step_words_mapper.py 
b/sziszapangma/integration/mapper/step_words_mapper.py index a28b532..2cd64b2 100644 --- a/sziszapangma/integration/mapper/step_words_mapper.py +++ b/sziszapangma/integration/mapper/step_words_mapper.py @@ -5,23 +5,22 @@ from sziszapangma.integration.mapper.word_mapper import WordMapper class StepWordsMapper: - @staticmethod def to_json_dict(step_words: StepWords) -> Dict[str, any]: to_return = dict() if step_words.hypothesis_word is not None: - to_return['hypothesis_word'] = WordMapper.to_json_dict( - step_words.hypothesis_word) + to_return["hypothesis_word"] = WordMapper.to_json_dict(step_words.hypothesis_word) if step_words.reference_word is not None: - to_return['reference_word'] = WordMapper.to_json_dict( - step_words.reference_word) + to_return["reference_word"] = WordMapper.to_json_dict(step_words.reference_word) return to_return @staticmethod def from_json_dict(input_json_dict: Dict[str, any]) -> StepWords: return StepWords( - None if 'reference_word' not in input_json_dict - else WordMapper.from_json_dict(input_json_dict['reference_word']), - None if 'hypothesis_word' not in input_json_dict - else WordMapper.from_json_dict(input_json_dict['hypothesis_word']), + None + if "reference_word" not in input_json_dict + else WordMapper.from_json_dict(input_json_dict["reference_word"]), + None + if "hypothesis_word" not in input_json_dict + else WordMapper.from_json_dict(input_json_dict["hypothesis_word"]), ) diff --git a/sziszapangma/integration/mapper/word_mapper.py b/sziszapangma/integration/mapper/word_mapper.py index f7b0cd4..30d3d6a 100644 --- a/sziszapangma/integration/mapper/word_mapper.py +++ b/sziszapangma/integration/mapper/word_mapper.py @@ -2,18 +2,14 @@ from typing import Dict from sziszapangma.core.alignment.word import Word -_ID = 'id' -_VALUE = 'value' +_ID = "id" +_VALUE = "value" class WordMapper: - @staticmethod def to_json_dict(word: Word) -> Dict[str, str]: - return { - _ID: word.id, - _VALUE: word.value - } + return {_ID: word.id, _VALUE: 
word.value} @staticmethod def from_json_dict(input_json_dict: Dict[str, str]) -> Word: diff --git a/sziszapangma/integration/path_filter.py b/sziszapangma/integration/path_filter.py index 1ac6eb4..a5e55f9 100644 --- a/sziszapangma/integration/path_filter.py +++ b/sziszapangma/integration/path_filter.py @@ -19,16 +19,12 @@ class ExtensionPathFilter(PathFilter): """ Implementation of PathFilter which find all files with specified extension. """ + _extension: str _root_directory: str _files_limit: Optional[int] - def __init__( - self, - root_directory: str, - extension: str, - files_limit: Optional[int] = None - ): + def __init__(self, root_directory: str, extension: str, files_limit: Optional[int] = None): """Constructor of class.""" self._extension = extension self._files_limit = files_limit @@ -38,12 +34,6 @@ class ExtensionPathFilter(PathFilter): """ Implementation of searching files with extension. """ - path_generator = Path(self._root_directory).glob( - f'**/*.{self._extension}') - all_files = [ - str(it) - for it in path_generator - ] - return all_files \ - if self._files_limit is None \ - else all_files[:self._files_limit] + path_generator = Path(self._root_directory).glob(f"**/*.{self._extension}") + all_files = [str(it) for it in path_generator] + return all_files if self._files_limit is None else all_files[: self._files_limit] diff --git a/sziszapangma/integration/record_id_iterator.py b/sziszapangma/integration/record_id_iterator.py index f62ad1e..dd3ab0d 100644 --- a/sziszapangma/integration/record_id_iterator.py +++ b/sziszapangma/integration/record_id_iterator.py @@ -1,8 +1,7 @@ from abc import ABC, abstractmethod from typing import Set -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository class RecordIdIterator(ABC): diff --git a/sziszapangma/integration/repository/experiment_repository.py 
b/sziszapangma/integration/repository/experiment_repository.py index 6f6d148..04b99f2 100644 --- a/sziszapangma/integration/repository/experiment_repository.py +++ b/sziszapangma/integration/repository/experiment_repository.py @@ -7,7 +7,7 @@ class ExperimentRepository(ABC): """Repository to manage results of asr experiment processing.""" @abstractmethod - def initialise(self): + def initialise(self) -> None: """Method to initialize repository.""" @abstractmethod @@ -15,28 +15,15 @@ class ExperimentRepository(ABC): """Method checks that property in record exists.""" @abstractmethod - def update_property_for_key( - self, - record_id: str, - property_name: str, - property_value: any - ): + def update_property_for_key(self, record_id: str, property_name: str, property_value: any): """Method updates property in record.""" @abstractmethod - def delete_property_for_key( - self, - record_id: str, - property_name: str - ): + def delete_property_for_key(self, record_id: str, property_name: str): """Method removes property in record.""" @abstractmethod - def get_property_for_key( - self, - record_id: str, - property_name: str - ) -> Optional[any]: + def get_property_for_key(self, record_id: str, property_name: str) -> Optional[any]: """Method returns property for record.""" @abstractmethod diff --git a/sziszapangma/integration/repository/file_experiment_repository.py b/sziszapangma/integration/repository/file_experiment_repository.py index 520390e..d724a9c 100644 --- a/sziszapangma/integration/repository/file_experiment_repository.py +++ b/sziszapangma/integration/repository/file_experiment_repository.py @@ -1,11 +1,10 @@ import json import os -from typing import Optional, Dict, Set +from typing import Dict, Optional, Set import pandas as pd -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository class 
FileExperimentRepository(ExperimentRepository): @@ -21,7 +20,7 @@ class FileExperimentRepository(ExperimentRepository): def initialise(self): if not os.path.isfile(self._metadata_file_path): - with open(self._metadata_file_path, 'w') as f: + with open(self._metadata_file_path, "w") as f: empty_dict = dict() f.write(json.dumps(empty_dict)) @@ -34,23 +33,14 @@ class FileExperimentRepository(ExperimentRepository): else: return True - def update_property_for_key( - self, - record_id: str, - property_name: str, - property_value: any - ): + def update_property_for_key(self, record_id: str, property_name: str, property_value: any): json_content = self._get_file_parsed_json() if record_id not in json_content: json_content[record_id] = dict({}) json_content[record_id][property_name] = property_value self._update_metadata_file(json_content) - def get_property_for_key( - self, - record_id: str, - property_name: str - ) -> Optional[any]: + def get_property_for_key(self, record_id: str, property_name: str) -> Optional[any]: json_content = self._get_file_parsed_json() if self.property_exists(record_id, property_name): return json_content[record_id][property_name] @@ -59,14 +49,14 @@ class FileExperimentRepository(ExperimentRepository): def _get_file_parsed_json(self) -> Dict[str, any]: if self._cache_value is None: - with open(self._metadata_file_path, 'r') as f: + with open(self._metadata_file_path, "r") as f: self._cache_value = json.loads(f.read()) return self._cache_value def _update_metadata_file(self, json_content: Dict[str, any]): self._cache_value = json_content indent = 4 if self._pretty_format else None - with open(self._metadata_file_path, 'w') as f: + with open(self._metadata_file_path, "w") as f: f.write(json.dumps(json_content, indent=indent)) def get_metrics_result_to_df(self, metrics_property: str) -> pd.DataFrame: @@ -74,10 +64,11 @@ class FileExperimentRepository(ExperimentRepository): all_metadata = self._get_file_parsed_json() for item_id in 
all_metadata.keys(): item_dict = dict() - item_dict['id'] = item_id + item_dict["id"] = item_id for metric_keys in all_metadata[item_id][metrics_property].keys(): - item_dict[f'{metrics_property}.{metric_keys}'] = \ - all_metadata[item_id][metrics_property][metric_keys] + item_dict[f"{metrics_property}.{metric_keys}"] = all_metadata[item_id][ + metrics_property + ][metric_keys] list_of_dicts.append(item_dict) return pd.DataFrame(list_of_dicts) diff --git a/sziszapangma/integration/repository/mongo_experiment_repository.py b/sziszapangma/integration/repository/mongo_experiment_repository.py index 19c11aa..766ba69 100644 --- a/sziszapangma/integration/repository/mongo_experiment_repository.py +++ b/sziszapangma/integration/repository/mongo_experiment_repository.py @@ -3,11 +3,10 @@ from typing import Optional, Set from pymongo import MongoClient from pymongo.database import Database -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository -ID = '_id' -VALUE = 'value' +ID = "_id" +VALUE = "value" class MongoExperimentRepository(ExperimentRepository): @@ -28,29 +27,19 @@ class MongoExperimentRepository(ExperimentRepository): if property_name not in all_collections: return False else: - return database[property_name].find_one( - {ID: record_id}) is not None + return database[property_name].find_one({ID: record_id}) is not None - def update_property_for_key(self, record_id: str, property_name: str, - property_value: any): + def update_property_for_key(self, record_id: str, property_name: str, property_value: any): self.delete_property_for_key(record_id, property_name) - self._get_database()[property_name].insert_one({ - ID: record_id, - VALUE: property_value - }) + self._get_database()[property_name].insert_one({ID: record_id, VALUE: property_value}) def delete_property_for_key(self, record_id: str, property_name: str): if 
self.property_exists(record_id, property_name): self._get_database()[property_name].delete_one({ID: record_id}) - def get_property_for_key( - self, - record_id: str, - property_name: str - ) -> Optional[any]: + def get_property_for_key(self, record_id: str, property_name: str) -> Optional[any]: if self.property_exists(record_id, property_name): - return self._get_database()[property_name].find_one( - {ID: record_id})[VALUE] + return self._get_database()[property_name].find_one({ID: record_id})[VALUE] else: return None diff --git a/sziszapangma/integration/task/asr_task.py b/sziszapangma/integration/task/asr_task.py index 3393121..9422f70 100644 --- a/sziszapangma/integration/task/asr_task.py +++ b/sziszapangma/integration/task/asr_task.py @@ -1,7 +1,6 @@ from sziszapangma.integration.asr_processor import AsrProcessor from sziszapangma.integration.record_path_provider import RecordPathProvider -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask @@ -16,25 +15,21 @@ class AsrTask(ProcessingTask): record_path_provider: RecordPathProvider, asr_processor: AsrProcessor, asr_property_name: str, - require_update: bool + require_update: bool, ): super().__init__(task_name, require_update) self._record_path_provider = record_path_provider self._asr_processor = asr_processor self._asr_property_name = asr_property_name - def skip_for_record(self, record_id: str, - experiment_repository: ExperimentRepository) -> bool: - asr_value = experiment_repository \ - .get_property_for_key(record_id, self._asr_property_name) - return asr_value is not None and 'transcription' in asr_value + def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: + asr_value = experiment_repository.get_property_for_key(record_id, self._asr_property_name) + return 
asr_value is not None and "transcription" in asr_value - def run_single_process(self, record_id: str, - experiment_repository: ExperimentRepository): + def run_single_process( + self, record_id: str, experiment_repository: ExperimentRepository + ) -> None: file_record_path = self._record_path_provider.get_path(record_id) experiment_repository.update_property_for_key( - record_id, - self._asr_property_name, - self._asr_processor - .call_recognise(file_record_path) + record_id, self._asr_property_name, self._asr_processor.call_recognise(file_record_path) ) diff --git a/sziszapangma/integration/task/classic_wer_metric_task.py b/sziszapangma/integration/task/classic_wer_metric_task.py index 4657a4a..2ea41e3 100644 --- a/sziszapangma/integration/task/classic_wer_metric_task.py +++ b/sziszapangma/integration/task/classic_wer_metric_task.py @@ -1,17 +1,14 @@ -from typing import List, Dict +from typing import Dict, List -from sziszapangma.core.alignment.alignment_classic_calculator import \ - AlignmentClassicCalculator +from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator from sziszapangma.core.alignment.alignment_step import AlignmentStep from sziszapangma.core.wer.wer_calculator import WerCalculator -from sziszapangma.integration.mapper.alignment_step_mapper import \ - AlignmentStepMapper +from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper from sziszapangma.integration.mapper.word_mapper import WordMapper -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask -_CLASSIC_WER = 'classic_wer' +_CLASSIC_WER = "classic_wer" class ClassicWerMetricTask(ProcessingTask): @@ -28,7 +25,7 @@ class ClassicWerMetricTask(ProcessingTask): asr_property_name: str, metrics_property_name: str, 
alignment_property_name: str, - require_update: bool + require_update: bool, ): super().__init__(task_name, require_update) self._gold_transcript_property_name = gold_transcript_property_name @@ -38,60 +35,41 @@ class ClassicWerMetricTask(ProcessingTask): self._alignment_classic_calculator = AlignmentClassicCalculator() self._wer_calculator = WerCalculator() - def skip_for_record( - self, - record_id: str, - experiment_repository: ExperimentRepository - ) -> bool: - return experiment_repository \ - .get_property_for_key(record_id, self._metrics_property_name) + def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: + return ( + experiment_repository.get_property_for_key(record_id, self._metrics_property_name) + is not None + ) - def run_single_process(self, record_id: str, - experiment_repository: ExperimentRepository): - gold_transcript = experiment_repository \ - .get_property_for_key(record_id, - self._gold_transcript_property_name) - asr_result = experiment_repository \ - .get_property_for_key(record_id, self._asr_property_name) - if 'transcription' in asr_result: - alignment_steps = self._get_alignment( - gold_transcript, asr_result['transcription'] - ) + def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository): + gold_transcript = experiment_repository.get_property_for_key( + record_id, self._gold_transcript_property_name + ) + asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) + if "transcription" in asr_result: + alignment_steps = self._get_alignment(gold_transcript, asr_result["transcription"]) experiment_repository.update_property_for_key( record_id, self._alignment_property_name, - [AlignmentStepMapper.to_json_dict(it) - for it in alignment_steps] + [AlignmentStepMapper.to_json_dict(it) for it in alignment_steps], ) experiment_repository.update_property_for_key( - record_id, - self._metrics_property_name, - 
self.calculate_metrics(alignment_steps) + record_id, self._metrics_property_name, self.calculate_metrics(alignment_steps) ) def _get_alignment( - self, - gold_transcript: List[Dict[str, any]], - asr_result: List[Dict[str, any]] + self, gold_transcript: List[Dict[str, any]], asr_result: List[Dict[str, any]] ) -> List[AlignmentStep]: gold_transcript_words = [ - WordMapper.from_json_dict(word_dict) - for word_dict in gold_transcript - ] - asr_words = [ - WordMapper.from_json_dict(word_dict) - for word_dict in asr_result + WordMapper.from_json_dict(word_dict) for word_dict in gold_transcript ] - return self._alignment_classic_calculator \ - .calculate_alignment(reference=gold_transcript_words, - hypothesis=asr_words) + asr_words = [WordMapper.from_json_dict(word_dict) for word_dict in asr_result] + return self._alignment_classic_calculator.calculate_alignment( + reference=gold_transcript_words, hypothesis=asr_words + ) - def calculate_metrics( - self, - alignment_steps: List[AlignmentStep] - ) -> Dict[str, any]: + def calculate_metrics(self, alignment_steps: List[AlignmentStep]) -> Dict[str, any]: """Calculate all metrics for data sample.""" metrics = dict() - metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer( - alignment_steps) + metrics[_CLASSIC_WER] = self._wer_calculator.calculate_wer(alignment_steps) return metrics diff --git a/sziszapangma/integration/task/embedding_wer_metrics_task.py b/sziszapangma/integration/task/embedding_wer_metrics_task.py index 3eb3476..1f7b25a 100644 --- a/sziszapangma/integration/task/embedding_wer_metrics_task.py +++ b/sziszapangma/integration/task/embedding_wer_metrics_task.py @@ -1,25 +1,19 @@ -from typing import List, Dict +from typing import Dict, List -from sziszapangma.core.alignment.alignment_embedding_calculator import \ - AlignmentEmbeddingCalculator -from sziszapangma.core.alignment.alignment_soft_calculator import \ - AlignmentSoftCalculator +from sziszapangma.core.alignment.alignment_embedding_calculator import 
AlignmentEmbeddingCalculator +from sziszapangma.core.alignment.alignment_soft_calculator import AlignmentSoftCalculator from sziszapangma.core.alignment.word import Word -from sziszapangma.core.transformer.cached_embedding_transformer import \ - CachedEmbeddingTransformer -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer +from sziszapangma.core.transformer.cached_embedding_transformer import CachedEmbeddingTransformer +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer from sziszapangma.core.wer.wer_calculator import WerCalculator -from sziszapangma.integration.mapper.alignment_step_mapper import \ - AlignmentStepMapper +from sziszapangma.integration.mapper.alignment_step_mapper import AlignmentStepMapper from sziszapangma.integration.mapper.word_mapper import WordMapper -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask -_SOFT_WER = 'soft_wer' -_EMBEDDING_WER = 'embedding_wer' -_WORD = 'word' +_SOFT_WER = "soft_wer" +_EMBEDDING_WER = "embedding_wer" +_WORD = "word" class EmbeddingWerMetricsTask(ProcessingTask): @@ -39,67 +33,59 @@ class EmbeddingWerMetricsTask(ProcessingTask): metrics_property_name: str, alignment_property_name: str, require_update: bool, - embedding_transformer: EmbeddingTransformer + embedding_transformer: EmbeddingTransformer, ): super().__init__(task_name, require_update) self._gold_transcript_property_name = gold_transcript_property_name self._asr_property_name = asr_property_name self._metrics_property_name = metrics_property_name - self._embedding_transformer = \ - CachedEmbeddingTransformer(embedding_transformer) - self._alignment_embedding_calculator = \ - AlignmentEmbeddingCalculator(self._embedding_transformer) - self._alignment_soft_calculator = \ - 
AlignmentSoftCalculator(self._embedding_transformer) + self._embedding_transformer = CachedEmbeddingTransformer(embedding_transformer) + self._alignment_embedding_calculator = AlignmentEmbeddingCalculator( + self._embedding_transformer + ) + self._alignment_soft_calculator = AlignmentSoftCalculator(self._embedding_transformer) self._wer_calculator = WerCalculator() self._alignment_property_name = alignment_property_name - def skip_for_record(self, record_id: str, - experiment_repository: ExperimentRepository) -> bool: - return experiment_repository \ - .get_property_for_key(record_id, self._metrics_property_name) + def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: + return experiment_repository.get_property_for_key(record_id, self._metrics_property_name) - def run_single_process(self, record_id: str, - experiment_repository: ExperimentRepository): - gold_transcript = experiment_repository \ - .get_property_for_key(record_id, - self._gold_transcript_property_name) - asr_result = experiment_repository \ - .get_property_for_key(record_id, self._asr_property_name) - if 'transcription' in asr_result: + def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository): + gold_transcript = experiment_repository.get_property_for_key( + record_id, self._gold_transcript_property_name + ) + asr_result = experiment_repository.get_property_for_key(record_id, self._asr_property_name) + if "transcription" in asr_result: gold_transcript_words = self._map_words_to_domain(gold_transcript) - asr_words = self._map_words_to_domain(asr_result['transcription']) + asr_words = self._map_words_to_domain(asr_result["transcription"]) - soft_alignment = self._alignment_soft_calculator \ - .calculate_alignment(gold_transcript_words, asr_words) - embedding_alignment = self._alignment_embedding_calculator \ - .calculate_alignment(gold_transcript_words, asr_words) + soft_alignment = 
self._alignment_soft_calculator.calculate_alignment( + gold_transcript_words, asr_words + ) + embedding_alignment = self._alignment_embedding_calculator.calculate_alignment( + gold_transcript_words, asr_words + ) soft_wer = self._wer_calculator.calculate_wer(soft_alignment) - embedding_wer = self._wer_calculator \ - .calculate_wer(embedding_alignment) + embedding_wer = self._wer_calculator.calculate_wer(embedding_alignment) alignment_results = { - 'soft_alignment': [AlignmentStepMapper.to_json_dict(it) - for it in soft_alignment], - 'embedding_alignment': [AlignmentStepMapper.to_json_dict(it) - for it in embedding_alignment], + "soft_alignment": [AlignmentStepMapper.to_json_dict(it) for it in soft_alignment], + "embedding_alignment": [ + AlignmentStepMapper.to_json_dict(it) for it in embedding_alignment + ], } - wer_results = {'soft_wer': soft_wer, - 'embedding_wer': embedding_wer} + wer_results = {"soft_wer": soft_wer, "embedding_wer": embedding_wer} experiment_repository.update_property_for_key( - record_id, self._alignment_property_name, alignment_results) + record_id, self._alignment_property_name, alignment_results + ) experiment_repository.update_property_for_key( - record_id, self._metrics_property_name, wer_results) + record_id, self._metrics_property_name, wer_results + ) self._embedding_transformer.clear() @staticmethod - def _map_words_to_domain( - input_json_dicts: List[Dict[str, str]] - ) -> List[Word]: - return [ - WordMapper.from_json_dict(word_dict) - for word_dict in input_json_dicts - ] + def _map_words_to_domain(input_json_dicts: List[Dict[str, str]]) -> List[Word]: + return [WordMapper.from_json_dict(word_dict) for word_dict in input_json_dicts] diff --git a/sziszapangma/integration/task/gold_transcript_task.py b/sziszapangma/integration/task/gold_transcript_task.py index 0b407b4..e4327d1 100644 --- a/sziszapangma/integration/task/gold_transcript_task.py +++ b/sziszapangma/integration/task/gold_transcript_task.py @@ -1,7 +1,5 @@ -from 
sziszapangma.integration.gold_transcript_processor import \ - GoldTranscriptProcessor -from sziszapangma.integration.repository.experiment_repository \ - import ExperimentRepository +from sziszapangma.integration.gold_transcript_processor import GoldTranscriptProcessor +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository from sziszapangma.integration.task.processing_task import ProcessingTask @@ -14,22 +12,20 @@ class GoldTranscriptTask(ProcessingTask): task_name: str, gold_transcript_processor: GoldTranscriptProcessor, gold_transcript_property_name: str, - require_update: bool + require_update: bool, ): super().__init__(task_name, require_update) self._gold_transcript_processor = gold_transcript_processor self._gold_transcript_property_name = gold_transcript_property_name - def skip_for_record(self, record_id: str, - experiment_repository: ExperimentRepository) -> bool: - return experiment_repository \ - .get_property_for_key(record_id, - self._gold_transcript_property_name) + def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: + return experiment_repository.get_property_for_key( + record_id, self._gold_transcript_property_name + ) - def run_single_process(self, record_id: str, - experiment_repository: ExperimentRepository): + def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository): experiment_repository.update_property_for_key( record_id, self._gold_transcript_property_name, - self._gold_transcript_processor.get_gold_transcript(record_id) + self._gold_transcript_processor.get_gold_transcript(record_id), ) diff --git a/sziszapangma/integration/task/processing_task.py b/sziszapangma/integration/task/processing_task.py index c494851..8752f90 100644 --- a/sziszapangma/integration/task/processing_task.py +++ b/sziszapangma/integration/task/processing_task.py @@ -1,8 +1,7 @@ from abc import ABC, abstractmethod from 
sziszapangma.integration.record_id_iterator import RecordIdIterator -from sziszapangma.integration.repository.experiment_repository import \ - ExperimentRepository +from sziszapangma.integration.repository.experiment_repository import ExperimentRepository class ProcessingTask(ABC): @@ -14,38 +13,26 @@ class ProcessingTask(ABC): self._task_name = task_name @abstractmethod - def run_single_process( - self, - record_id: str, - experiment_repository: ExperimentRepository - ): + def run_single_process(self, record_id: str, experiment_repository: ExperimentRepository): pass @abstractmethod - def skip_for_record( - self, - record_id: str, - experiment_repository: ExperimentRepository - ) -> bool: + def skip_for_record(self, record_id: str, experiment_repository: ExperimentRepository) -> bool: pass def process( - self, - record_id_iterator: RecordIdIterator, - experiment_repository: ExperimentRepository + self, record_id_iterator: RecordIdIterator, experiment_repository: ExperimentRepository ): records_ids = list(record_id_iterator.get_all_records()) for record_index in range(len(records_ids)): - part = f'{record_index + 1} / {len(records_ids)}' + part = f"{record_index + 1} / {len(records_ids)}" record_id = records_ids[record_index] skip = self.skip_for_record( - record_id=record_id, - experiment_repository=experiment_repository + record_id=record_id, experiment_repository=experiment_repository ) - base_log = f'{self._task_name} processing {part} {record_id}' + base_log = f"{self._task_name} processing {part} {record_id}" if not skip or self._require_update: print(base_log) - self.run_single_process(record_id, - experiment_repository) + self.run_single_process(record_id, experiment_repository) else: - print(f'{base_log} -- skipped') + print(f"{base_log} -- skipped") diff --git a/tests/file_stored_embedding_transformer.py b/tests/file_stored_embedding_transformer.py index 2e329fa..dfad1b0 100644 --- a/tests/file_stored_embedding_transformer.py +++ 
b/tests/file_stored_embedding_transformer.py @@ -1,28 +1,21 @@ import json -from typing import List, Dict +from typing import Dict, List import numpy as np -from sziszapangma.core.transformer.embedding_transformer import \ - EmbeddingTransformer +from sziszapangma.core.transformer.embedding_transformer import EmbeddingTransformer class FileStoredEmbeddingTransformer(EmbeddingTransformer): _cache: Dict[str, np.array] def __init__(self, file_path: str): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: json_content = json.loads(f.read()) - self._cache = dict({ - key: np.array(json_content[key]) - for key in json_content.keys() - }) + self._cache = dict({key: np.array(json_content[key]) for key in json_content.keys()}) def get_embeddings(self, words: List[str]) -> Dict[str, np.ndarray]: - return dict({ - word: self._cache[word] - for word in words - }) + return dict({word: self._cache[word] for word in words}) def get_embedding(self, word: str) -> np.ndarray: return self._cache[word] diff --git a/tests/test_classic_wer.py b/tests/test_classic_wer.py index ff67ebd..0766e94 100644 --- a/tests/test_classic_wer.py +++ b/tests/test_classic_wer.py @@ -2,8 +2,7 @@ from typing import List, Tuple import pytest -from sziszapangma.core.alignment.alignment_classic_calculator import \ - AlignmentClassicCalculator +from sziszapangma.core.alignment.alignment_classic_calculator import AlignmentClassicCalculator from sziszapangma.core.alignment.step_type import StepType from sziszapangma.core.alignment.step_words import StepWords from sziszapangma.core.alignment.word import Word @@ -15,16 +14,15 @@ def string_list_to_words(strings: List[str]) -> List[Word]: def get_sample_data() -> Tuple[List[Word], List[Word]]: - reference = ['This', 'great', 'machine', 'can', 'recognize', 'speech'] - hypothesis = ['This', 'machine', 'can', 'wreck', 'a', 'nice', 'beach'] + reference = ["This", "great", "machine", "can", "recognize", "speech"] + hypothesis = ["This", "machine", 
"can", "wreck", "a", "nice", "beach"] return string_list_to_words(reference), string_list_to_words(hypothesis) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - alignment = AlignmentClassicCalculator()\ - .calculate_alignment(reference, hypothesis) + alignment = AlignmentClassicCalculator().calculate_alignment(reference, hypothesis) wer_result = WerCalculator().calculate_wer(alignment) assert pytest.approx(wer_result) == 0.8333333 @@ -32,8 +30,7 @@ def test_classic_calculate_wer_value(): def test_classic_calculate_wer_steps(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - alignment = AlignmentClassicCalculator().calculate_alignment( - reference, hypothesis) + alignment = AlignmentClassicCalculator().calculate_alignment(reference, hypothesis) reference_words = [ StepWords(reference[0], hypothesis[0]), @@ -43,11 +40,18 @@ def test_classic_calculate_wer_steps(): StepWords(None, hypothesis[3]), StepWords(None, hypothesis[4]), StepWords(reference[4], hypothesis[5]), - StepWords(reference[5], hypothesis[6])] + StepWords(reference[5], hypothesis[6]), + ] step_types = [ - StepType.CORRECT, StepType.DELETION, StepType.CORRECT, - StepType.CORRECT, StepType.INSERTION, StepType.INSERTION, - StepType.SUBSTITUTION, StepType.SUBSTITUTION] + StepType.CORRECT, + StepType.DELETION, + StepType.CORRECT, + StepType.CORRECT, + StepType.INSERTION, + StepType.INSERTION, + StepType.SUBSTITUTION, + StepType.SUBSTITUTION, + ] assert len(alignment) == 8 assert [it.step_type for it in alignment] == step_types diff --git a/tests/test_embedding_wer.py b/tests/test_embedding_wer.py index 4f7cd55..69fe11e 100644 --- a/tests/test_embedding_wer.py +++ b/tests/test_embedding_wer.py @@ -2,14 +2,11 @@ from typing import List, Tuple import pytest -from sziszapangma.core.alignment.alignment_calculator import \ - AlignmentCalculator -from sziszapangma.core.alignment.alignment_embedding_calculator 
import \ - AlignmentEmbeddingCalculator +from sziszapangma.core.alignment.alignment_calculator import AlignmentCalculator +from sziszapangma.core.alignment.alignment_embedding_calculator import AlignmentEmbeddingCalculator from sziszapangma.core.alignment.word import Word from sziszapangma.core.wer.wer_calculator import WerCalculator -from tests.file_stored_embedding_transformer import \ - FileStoredEmbeddingTransformer +from tests.file_stored_embedding_transformer import FileStoredEmbeddingTransformer def string_list_to_words(strings: List[str]) -> List[Word]: @@ -17,20 +14,18 @@ def string_list_to_words(strings: List[str]) -> List[Word]: def get_sample_data() -> Tuple[List[Word], List[Word]]: - reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego'] - hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego'] + reference = ["ala", "ma", "dobrego", "wielkiego", "psa", "rasowego"] + hypothesis = ["alana", "rego", "kruchego", "psa", "rasowego"] return string_list_to_words(reference), string_list_to_words(hypothesis) def get_alignment_calculator() -> AlignmentCalculator: - return AlignmentEmbeddingCalculator( - FileStoredEmbeddingTransformer('tests/embeddings_pl.json')) + return AlignmentEmbeddingCalculator(FileStoredEmbeddingTransformer("tests/embeddings_pl.json")) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - alignment = get_alignment_calculator().calculate_alignment(reference, - hypothesis) + alignment = get_alignment_calculator().calculate_alignment(reference, hypothesis) wer_result = WerCalculator().calculate_wer(alignment) assert pytest.approx(wer_result) == 0.55879563 diff --git a/tests/test_soft_wer.py b/tests/test_soft_wer.py index 85a3433..64703e3 100644 --- a/tests/test_soft_wer.py +++ b/tests/test_soft_wer.py @@ -2,14 +2,11 @@ from typing import List, Tuple import pytest -from sziszapangma.core.alignment.alignment_calculator import \ - AlignmentCalculator -from 
sziszapangma.core.alignment.alignment_soft_calculator import \ - AlignmentSoftCalculator +from sziszapangma.core.alignment.alignment_calculator import AlignmentCalculator +from sziszapangma.core.alignment.alignment_soft_calculator import AlignmentSoftCalculator from sziszapangma.core.alignment.word import Word from sziszapangma.core.wer.wer_calculator import WerCalculator -from tests.file_stored_embedding_transformer import \ - FileStoredEmbeddingTransformer +from tests.file_stored_embedding_transformer import FileStoredEmbeddingTransformer def string_list_to_words(strings: List[str]) -> List[Word]: @@ -17,21 +14,19 @@ def string_list_to_words(strings: List[str]) -> List[Word]: def get_sample_data() -> Tuple[List[Word], List[Word]]: - reference = ['ala', 'ma', 'dobrego', 'wielkiego', 'psa', 'rasowego'] - hypothesis = ['alana', 'rego', 'kruchego', 'psa', 'rasowego'] + reference = ["ala", "ma", "dobrego", "wielkiego", "psa", "rasowego"] + hypothesis = ["alana", "rego", "kruchego", "psa", "rasowego"] return string_list_to_words(reference), string_list_to_words(hypothesis) def get_alignment_calculator() -> AlignmentCalculator: - return AlignmentSoftCalculator( - FileStoredEmbeddingTransformer('tests/embeddings_pl.json')) + return AlignmentSoftCalculator(FileStoredEmbeddingTransformer("tests/embeddings_pl.json")) def test_classic_calculate_wer_value(): """Sample test for core calculate.""" reference, hypothesis = get_sample_data() - alignment = get_alignment_calculator().calculate_alignment( - reference, hypothesis) + alignment = get_alignment_calculator().calculate_alignment(reference, hypothesis) wer_result = WerCalculator().calculate_wer(alignment) print(wer_result) assert pytest.approx(wer_result) == 0.50186761 diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 7062dd2..0000000 --- a/tox.ini +++ /dev/null @@ -1,22 +0,0 @@ -[tox] -envlist = flake8,testenv -skipsdist = True - -[testenv:flake8] -basepython = python -deps = flake8 -commands = flake8 
sziszapangma tests - -[testenv] -setenv = - PYTHONPATH = {toxinidir} -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/requirements_dev.txt -; If you want to make tox run the tests with the same versions, create a -; requirements.txt with the pinned versions and uncomment the following line: -; -r{toxinidir}/requirements.txt -commands = - pytest -; pytest --basetemp={envtmpdir} - -- GitLab