Commit 684bee0d authored by Mateusz Gniewkowski

Merge branch 'code_refactor' into 'develop'

Wordifier - first project outline

See merge request !1
parents 2eaf26fc 36baae94
Pipeline #2545 passed with stages in 1 minute and 32 seconds
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.vscode
image: "clarinpl/python:3.6"
cache:
paths:
- .tox
stages:
- check_style
- test
- build
before_script:
- pip install tox==2.9.1
pep8:
stage: check_style
before_script:
- pip install tox==2.9.1
script:
- tox -v -e pep8
docstyle:
stage: check_style
before_script:
- pip install tox==2.9.1
script:
- tox -v -e docstyle
test:
stage: test
image: "docker:18.09.7"
services:
- "docker:18.09.7-dind"
script:
- docker build -t clarinpl/wordifier .
- docker run --rm
-v "$(pwd)/requirements-dev.txt:/home/worker/requirements-dev.txt"
-v "$(pwd)/tests:/home/worker/tests"
clarinpl/wordifier
sh -c 'pip3 install -r requirements-dev.txt ; nose2 -v tests'
build:
stage: build
  image: "docker:18.09.7"
only:
- master
services:
    - "docker:18.09.7-dind"
script:
- docker build -t clarinpl/wordifier .
- echo $DOCKER_PASSWORD > pass.txt
......
FROM clarinpl/python:3.6
WORKDIR /home/worker
RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \
apt-add-repository http://download.sgjp.pl/apt/ubuntu && \
apt update && \
apt install morfeusz2 -y
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1
RUN update-alternatives --set python /usr/bin/python3.6
RUN apt-get update && apt-get install -y morfeusz2
RUN wget -O morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl http://download.sgjp.pl/morfeusz/20200913/Linux/18.04/64/morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl
RUN python3.6 -m pip install morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
COPY ./data ./data
RUN python3.6 -m pip install -r requirements.txt
CMD ["python3.6", "main.py", "service"]
# Wordifier
A service that expands abbreviations and numeric expressions into full words. The following modules are implemented at this time (a short usage sketch follows the list):
- verbal notation of digits, numbers, decimal and common fractions (with the separators '.' and '/')
- verbal notation of simple equations with addition, subtraction, multiplication and division
- verbal notation of dates, recognizing different ways of writing them:
  - 25.12.2010 or 25,12,12 (day/month, day/month, year)
  - 2009-08-30 or 20 08 30 (year, day/month, day/month)
  - 12 Jan 2010 or 31 Jan 1998 (day, month, year)
  - Mar 12 (month, year)
  - Dec 15 (day, month)
  - April 30 2000 (month, day, year)
- replacing currency symbols with words
- writing special characters (%, &, #, ^, =, +, -, /) in words
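
The number-to-words converter can also be exercised directly from Python. A minimal sketch (it assumes the `morfeusz2` package is available, as installed in the Dockerfile; the expected outputs are taken from the unit tests in `tests`):

```python
from src.num2words import num2words

# Cardinal numbers
print(num2words('23'))                  # dwadzieścia trzy
print(num2words('1000'))                # tysiąc

# Ordinal forms, as used for dates
print(num2words('31', ordinal=True))    # trzydziesty pierwszy
print(num2words('2020', ordinal=True))  # dwa tysiące dwudziesty
```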
This diff is collapsed.
{
"number_words": {
"0": "zero",
"1": "jeden",
"2": "dwa",
"3": "trzy",
"4": "cztery",
"5": "pięć",
"6": "sześć",
"7": "siedem",
"8": "osiem",
"9": "dziewięć",
"10": "dziesięć",
"11": "jedenaście",
"12": "dwanaście",
"13": "trzynaście",
"14": "czternaście",
"15": "piętnaście",
"16": "szesnaście",
"17": "siedemnaście",
"18": "osiemnaście",
"19": "dziewiętnaście",
"20": "dwadzieścia",
"30": "trzydzieści",
"40": "czterdzieści",
"50": "pięćdziesiąt",
"60": "sześćdziesiąt",
"70": "siedemdziesiąt",
"80": "osiemdziesiąt",
"90": "dziewięćdziesiąt",
"100": "sto",
"200": "dwieście",
"300": "trzysta",
"400": "czterysta",
"500": "pięćset",
"600": "sześćset",
"700": "siedemset",
"800": "osiemset",
"900": "dziewięćset"
},
"ordinal_number_words": {
"0": "zerowy",
"1": "pierwszy",
"2": "drugi",
"3": "trzeci",
"4": "czwarty",
"5": "piąty",
"6": "szósty",
"7": "siódmy",
"8": "ósmy",
"9": "dziewiąty",
"10": "dziesiąty",
"11": "jedenasty",
"12": "dwunasty",
"13": "trzynasty",
"14": "czternasty",
"15": "piętnasty",
"16": "szesnasty",
"17": "siedemnasty",
"18": "osiemnasty",
"19": "dziewiętnasty",
"20": "dwudziesty",
"30": "trzydziesty",
"40": "czterdziesty",
"50": "pięćdziesiąty",
"60": "sześćdziesiąty",
"70": "siedemdziesiąty",
"80": "osiemdziesiąty",
"90": "dziewięćdziesiąty",
"100": "setny",
"200": "dwusetny",
"300": "trzechsetny",
"400": "czterechsetny",
"500": "pięćsetny",
"600": "sześćsetny",
"700": "siedemsetny",
"800": "osiemsetny",
"900": "dziewięćsetny"
},
"large_numbers": {
"3": "tysiąc",
"6": "milion",
"9": "miliard",
"12": "bilion",
"15": "biliard",
"18": "trylion",
"21": "tryliard",
"24": "kwadrylion",
"27": "kwadryliard",
"30": "kwintylion",
"33": "kwintyliard",
"36": "sekstylion",
"39": "sekstyliard",
"42": "septylion",
"45": "septyliard",
"48": "oktylion",
"51": "oktyliard",
"54": "nonilion",
"57": "noniliard",
"60": "decylion",
"63": "decyliard",
"66": "undecylion",
"69": "undecyliard",
"72": "duodecylion",
"75": "duodecyliard",
"100": "googol",
"600": "centylion",
"603": "centyliard"
},
"ordinal_large_numbers": {
"3": "tysięczny",
"6": "milionowy",
"9": "miliardowy",
"12": "bilionowy"
}
}
version: '3'
services:
  wordifier:
container_name: clarin_wordifier
build: ./
working_dir: /home/worker
    command:
      - python3.6 main.py service
environment:
- PYTHONUNBUFFERED=0
volumes:
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './tests:/home/worker/tests'
- './main.py:/home/worker/main.py'
parameterized==0.8.1
nose2==0.10.0
nlp-ws
python-morfeusz
Babel==2.8.0
@@ -2,19 +2,19 @@
from xml.etree.ElementTree import iterparse
class CCLHandler:
"""Implements reading ccl for anonymizer service."""
def __init__(self, ccl_file_name):
"""Initialize ccl_handler with a filename."""
"""Initialize CCLHandler with a filename."""
self._file_name = ccl_file_name
def process(self, output_file, unmarshallers):
"""Process xml tags using unmarshallers and save in output_file."""
        with open(self._file_name, 'r', encoding='utf-8') as input_file, \
                open(output_file, 'w', encoding='utf-8') as output_file:
            for event, elem in iterparse(input_file):
                unmarshal = unmarshallers.get(elem.tag, None)
                if unmarshal:
                    output_file.write(unmarshal(elem))
                elem.clear()
"""Module for converting dates to words."""
from babel import Locale
from src.num2words import num2words
date_tags = ['sg:gen:m3']
def check_none(token):
"""If token is none then convert to empty list otherwise return token."""
if not token:
return []
return token
def month_name_expansion(month):
"""Expand month abbreviation or change form.
Args:
        month (str): Month abbreviation or full name.
Returns:
str: Full month name in genitive case.
"""
abbr = len(month) == 3
locale = Locale('pl')
month = month.lower()
if abbr:
months = locale.months['format']['abbreviated']
index = list(months.values()).index(month) + 1
month = locale.months['format']['wide'][index]
else:
for format in ['format', 'stand-alone']:
if month in list(locale.months[format]['wide'].values()):
months = locale.months[format]['wide']
index = list(months.values()).index(month) + 1
month = locale.months['format']['wide'][index]
return month
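# Illustrative sketch of the expansion above (the values assume Babel's Polish
# CLDR data for Locale('pl'); they are not taken from this repository's tests):
#   month_name_expansion('sty')     -> 'stycznia'
#   month_name_expansion('styczeń') -> 'stycznia'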
def date2words(date_match, tags=None):
"""Convert a date to list of words.
Args:
date_match (re.Match): Date match.
        tags (list of str, optional): Morphological tags. Defaults to None.
Returns:
list of str: List of words representing date.
"""
if tags and ":".join(tags[0].split(":")[1:4]) in date_tags:
corrected_tag = tags[0]
else:
corrected_tag = None
if date_match['day_or_month_year']:
day_month1 = num2words(date_match['day_month1'], corrected_tag,
ordinal=True)
day_month2 = num2words(date_match['day_month2'], corrected_tag,
ordinal=True)
year = num2words(date_match['year1'], corrected_tag, ordinal=True)
# split punctuation into single characters and remove if None
date_order = [day_month1, *check_none(date_match['punct1']),
day_month2, *check_none(date_match['punct2']), year]
elif date_match['year_month_or_day']:
day_month3 = num2words(date_match['day_month3'], ordinal=True)
day_month4 = num2words(date_match['day_month4'], ordinal=True)
year = num2words(date_match['year2'], ordinal=True)
# split punctuation into single characters and remove if None
date_order = [year, *check_none(date_match['punct3']), day_month3,
*check_none(date_match['punct4']), day_month4]
elif date_match['month_in_words']:
day = date_match['day1']
if date_match['day2']:
day = date_match['day2']
if day:
day = num2words(day, corrected_tag, ordinal=True)
year = ''
if date_match['year3']:
year = num2words(date_match['year3'], corrected_tag, ordinal=True)
if date_match['year4']:
year = num2words(date_match['year4'], corrected_tag, ordinal=True)
if not day and not year:
return [date_match['month']]
else:
month = month_name_expansion(date_match['month'])
# split punctuation into single characters and remove if None
if date_match['day2']:
date_order = [month, *check_none(date_match['punct7']),
day, *check_none(date_match['punct8'])]
elif date_match['day1']:
date_order = [day, *check_none(date_match['punct5']),
month, *check_none(date_match['punct6'])]
else:
date_order = [month]
if year:
date_order = date_order + [year]
date_order = list(map(lambda x: x if x else '', date_order))
else:
date_order = ['']
return date_order
"""Module for converting numbers to words."""
import math
import json
from src.utils import get_word_form, trailing_zeros
with open('data/numbers.json', 'r') as numbers_file:
numbers_dict = json.load(numbers_file)
number_words = {int(k): v for k, v in numbers_dict['number_words'].items()}
ordinal_number_words = {int(k): v for k, v
in numbers_dict['ordinal_number_words'].items()}
large_numbers = {int(k): v for k, v
in numbers_dict['large_numbers'].items()}
ordinal_large_numbers = {int(k): v for k, v
in numbers_dict['ordinal_large_numbers'].items()}
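# For illustration, a few of the loaded entries (values copied from
# data/numbers.json above):
#   number_words[5] -> 'pięć', number_words[40] -> 'czterdzieści'
#   ordinal_number_words[7] -> 'siódmy', large_numbers[6] -> 'milion'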
def three_digit_to_words(text, tag='', ordinal=False):
"""Convert three digits numbers to words with given tag. Util function."""
map_to_words = ordinal_number_words if ordinal else number_words
number = int(text)
if number == 0:
return get_word_form(map_to_words[number], tag)
words = []
units = number % 10
tens = number % 100 - units
hundredths = number // 100
if 0 < tens + units <= 20:
word = get_word_form(map_to_words[tens + units], tag)
words.append(word)
else:
if units != 0:
words.append(get_word_form(map_to_words[units], tag))
if tens != 0:
words.append(get_word_form(map_to_words[tens], tag))
if hundredths != 0:
if tens == 0 and units == 0:
words.append(get_word_form(map_to_words[hundredths * 100], tag))
else:
words.append(get_word_form(number_words[hundredths * 100], ''))
return ' '.join(reversed(words))
def num2words(text, tag='', ordinal=False):
"""Converts a number to words.
Args:
        text (str): Number written as a string of digits.
        tag (str, optional): Morphological tag. Defaults to ''.
        ordinal (bool, optional): Whether to produce the ordinal form.
            Defaults to False.
Returns:
str: Returns number as words with given tag.
"""
i = 0
words = []
number = int(text)
if ordinal:
zeros = trailing_zeros(number)
zeros = 3 * math.floor(zeros / 3)
if zeros > 2 and 0 < len(text) - zeros <= 3:
number = number // 10 ** zeros
if number == 1:
words = ''
else:
words = three_digit_to_words(str(number), 'numcomp')
words += get_word_form(ordinal_large_numbers[zeros], tag)
return words
if len(text) <= 3 or number == 0:
return three_digit_to_words(text, tag, ordinal)
while number > 0:
remainder = number % 1000
if i == 0:
triple = three_digit_to_words(remainder, tag, ordinal)
else:
triple = three_digit_to_words(remainder)
number = number // 1000
if remainder == 0 and number != 0:
i += 3
continue
if i == 0:
words.append(triple)
else:
if remainder == 1:
tag = 'subst:sg:nom:m3'
elif remainder % 10 in [2, 3, 4]:
tag = 'subst:pl:nom:m3'
else:
tag = 'subst:pl:gen:m3'
form = get_word_form(large_numbers[i], tag)
if remainder == 1:
words.append(form)
else:
words.append(triple + ' ' + form)
i += 3
return ' '.join(list(reversed(words)))
"""Module for useful functions."""
from enum import Enum
import morfeusz2
class TokenType(Enum):
"""Type of token."""
NUMBER = 1
SPECIAL_CHARACTER = 2
PUNCTUATION = 3
CURRENCY = 4
class NumberPlural(Enum):
"""Type of number indicating what the word suffix will be.
E.g:
SINGULAR 1$ - jeden dolar
SEVERAL (2-4) 2$ - dwa dolary
MANY (5+) 7$ - siedem dolarów
"""
SINGULAR = 0
SEVERAL = 1
MANY = 2
def to_number_plural(number):
"""Convert a number to enumerate type, that indicates word suffix.
Args:
number (int or string): Number to be converted.
Returns:
        NumberPlural: Enum value indicating what the ending of the word
            will be.
"""
number = int(number)
if number == 1:
return NumberPlural.SINGULAR
elif 2 <= number <= 4:
return NumberPlural.SEVERAL
else:
return NumberPlural.MANY
def is_simple_number(tokens, special_types):
"""Checks if list of tokens creates a simple number.
Simple number contains only digits and spaces between groups of three.
Args:
tokens (list): List of tokens.
special_types (list): Types of tokens.
Returns:
bool: Return True if joined tokens are simple number otherwise False.
"""
numbers = [n for i, n in enumerate(tokens)
if special_types[i] == TokenType.NUMBER]
return (all([len(t) == 3 for t in numbers[1:]]) and
all([(s.isdigit() or s == ' ') for s in tokens]))
def is_fraction(tokens, decimal=False):
"""Check is list of tokens are 2 numbers splitted by slash or dot.
Args:
tokens (list): List of tokens.
        decimal (bool, optional): If True, the delimiter is '.'; otherwise '/'.
Defaults to False.
Returns:
bool: Return True if tokens are fraction otherwise False.
"""
if len(tokens) < 3:
return False
delimiter = '.' if decimal else '/'
splitted = ''.join(tokens).split(delimiter)
return ((len(splitted) == 2) and
tokens.count(delimiter) == 1 and
all([(s.isdigit() or s in ' /.') for s in tokens]))
def trailing_zeros(number):
"""Count trailing zeros in number.
Returns:
int: Return number of trailing zeros.
"""
manipulandum = str(number)
return len(manipulandum) - len(manipulandum.rstrip('0'))
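# A quick sketch of the helper above: trailing_zeros(42000) -> 3.
# num2words relies on this in its ordinal branch to spot round numbers
# such as 2000 ('dwutysięczny') or 1000000 ('milionowy').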
def search_form(forms, tag):
"""Search for the correct form of word from all those returned by Morfeusz.
Args:
forms (list of tuples): Tags and variations of words returned
by Morfeusz.
tag (str): The tag of the word whose form is being searched for.
Returns:
str: Word properly conjugated with the given tag or None if not found.
"""
for form in forms:
form_categories = [x.split('.') for x in form[2].split(':')]
gramm_categ_enum = enumerate(tag)
if all((c in form_categories[i] for i, c in gramm_categ_enum)):
return form[0]
return None
def get_word_form(text, tag):
"""Change the word in the appropriate form with given morphological tag.
Args:
text (str): Word to be changed.
tag (str): Morphological tag.
Returns:
str: Word changed with given morphological tag.
"""
if not tag:
return text
morf = morfeusz2.Morfeusz()
all_forms = morf.generate(text)
tag = tag.split(':')
forms = [x for x in all_forms if x[2].split(':')[0] == tag[0]]
form = search_form(forms, tag)
if form:
return form
    if len(tag) > 4:
        tag = tag[:4]
        form = search_form(forms, tag)
        if form:
            return form
    return text
def subtract_from_first(list_of_tuples, offset):
"""Subtract from every first element in tuples that make up list."""
list_of_tuples = (list_of_tuples[0] - offset, *list_of_tuples[1:])
return list_of_tuples
def check_and_replace(string_builder, find, replace, filtered_tokens):
"""Check for matches in list and replace them with given tokens.
    Remove replaced tokens from `filtered_tokens` to avoid double processing.
Args:
string_builder (list of str): List of all words.
find (list of str): Tokens to be replaced.
replace (list of str): Words that will replace `find` tokens in
`string_builder`.
filtered_tokens (list of tuples): List of tokens and their features.
Returns:
        (list of str, list of tuples): Pair: the list of words with matched
            tokens replaced, and the filtered list of tokens and their
            features with the replaced items removed.
"""
if not find or not replace:
return string_builder, filtered_tokens
new_builder = string_builder.copy()
    max_length = max(map(len, find))
for i, token in enumerate(string_builder):
if not find:
break
to_remove = [i]
check = token
j = i + 1
if check in find:
new_builder[i] = ''.join(replace[find.index(check)])
filtered_tokens = list(filter(lambda x: x[0] != i, filtered_tokens))
del find[0], replace[0]
continue
if check[0] != find[0][:len(check[0])]:
continue
        while len(check) < max_length and j < len(string_builder):
check += string_builder[j]
to_remove.append(j)
if check in find:
index = find.index(check)
new_builder = new_builder[:i] + replace[index]
if j + 1 < len(string_builder):
new_builder += string_builder[j + 1:]
filtered_tokens = list(filter(lambda x: x[0] not in to_remove,
filtered_tokens))
find.pop(index)
replace.pop(index)
if not find:
return new_builder, filtered_tokens
j += 1
return new_builder, filtered_tokens
This diff is collapsed.
@@ -4,7 +4,7 @@ import logging
import nlp_ws
from src.wordifier import Wordifier
from src.ccl_handler import CCLHandler
_log = logging.getLogger(__name__)
@@ -18,7 +18,7 @@ class Worker(nlp_ws.NLPWorker):
"""One time static initialisation."""
def process(self, input_file, task_options, output_file):
"""A."""
"""Processing an input file and generating tokens converted to words."""
wordifier = Wordifier()
        ccl_handler = CCLHandler(input_file)
ccl_handler.process(output_file, wordifier.unmarshallers)
import unittest
from parameterized import parameterized, param
from src.num2words import num2words
class TestNum2Words(unittest.TestCase):
single_tag = 'adj:sg:nom:f'
several_tag = 'adj:pl:acc:f'
many_tag = 'adj:pl:acc:m1'
@parameterized.expand([
param('0', 'zero'),
param('08', 'osiem'),
param('12', 'dwanaście'),
param('23', 'dwadzieścia trzy'),
param('48', 'czterdzieści osiem'),
param('187', 'sto osiemdziesiąt siedem'),
param('249', 'dwieście czterdzieści dziewięć'),
param('600', 'sześćset'),
param('720', 'siedemset dwadzieścia'),
param('304', 'trzysta cztery'),
param('1000', 'tysiąc'),
param('425000', 'czterysta dwadzieścia pięć tysięcy'),
param('102000', 'sto dwa tysiące'),
param('390000', 'trzysta dziewięćdziesiąt tysięcy'),
param('701000', 'siedemset jeden tysięcy'),
param('993999', 'dziewięćset dziewięćdziesiąt trzy tysiące '
'dziewięćset dziewięćdziesiąt dziewięć'),
param('1000642', 'milion sześćset czterdzieści dwa'),
param('2001003', 'dwa miliony tysiąc trzy'),
param('18456000', 'osiemnaście milionów '
'czterysta pięćdziesiąt sześć tysięcy'),
param('1000000000', 'miliard')
])
def test_numbers(self, number, words):
self.assertEqual(num2words(number), words)
@parameterized.expand([
param('0', 'zerowy', ordinal=True),
param('1', 'pierwszy', ordinal=True),
param('10', 'dziesiąty', ordinal=True),
param('15', 'piętnasty', ordinal=True),
param('31', 'trzydziesty pierwszy', ordinal=True),
param('70', 'siedemdziesiąty', ordinal=True),
param('099', 'dziewięćdziesiąty dziewiąty', ordinal=True),
param('100', 'setny', ordinal=True),
param('102', 'sto drugi', ordinal=True),
param('183', 'sto osiemdziesiąty trzeci', ordinal=True),
param('201', 'dwieście pierwszy', ordinal=True),
param('1000', 'tysięczny', ordinal=True),
param('1005', 'tysiąc piąty', ordinal=True),
param('2000', 'dwutysięczny', ordinal=True),
param('2020', 'dwa tysiące dwudziesty', ordinal=True),
param('10000', 'dziesięciotysięczny', ordinal=True),
param('100856', 'sto tysięcy osiemset pięćdziesiąty szósty',
ordinal=True),
param('1000000', 'milionowy', ordinal=True),
param('1002003', 'milion dwa tysiące trzeci', ordinal=True),
param('1948052296', 'miliard dziewięćset czterdzieści osiem milionów '
'pięćdziesiąt dwa tysiące '
'dwieście dziewięćdziesiąty szósty', ordinal=True),
])
def test_ordinal_numbers(self, number, words, ordinal):
self.assertEqual(num2words(number, ordinal=ordinal), words)
@parameterized.expand([
('1', 'adj:sg:nom:f', 'jedna'),
('2', 'num:pl:nom:f', 'dwie')
])
def test_numbers_numerator(self, number, tag, words):
self.assertEqual(num2words(number, tag), words)
@parameterized.expand([
param('1', 'pierwsza'),
param('2', 'druga'),
param('5', 'piąta'),
param('10', 'dziesiąta'),
param('31', 'trzydziesta pierwsza'),
param('100', 'setna'),
param('102', 'sto druga'),
param('512', 'pięćset dwunasta'),
param('600', 'sześćsetna'),
param('1000', 'tysięczna'),
param('2002', 'dwa tysiące druga'),
param('3000', 'trzytysięczna'),
param('1000000000', 'miliardowa'),
param('1473022977', 'miliard czterysta siedemdziesiąt trzy miliony '
'dwadzieścia dwa tysiące dziewięćset siedemdziesiąta siódma'),
])
def test_single_numbers_denominator(self, number, words, ordinal=True):
self.assertEqual(num2words(number, self.single_tag, ordinal), words)
@parameterized.expand([
param('3', 'trzecie'),
param('6', 'szóste'),
param('10', 'dziesiąte'),
param('47', 'czterdzieste siódme'),
param('100', 'setne'),
param('101', 'sto pierwsze'),
param('300', 'trzechsetne'),
param('981', 'dziewięćset osiemdziesiąte pierwsze'),
param('1000', 'tysięczne'),
param('8000', 'ośmiotysięczne'),
param('10000', 'dziesięciotysięczne'),
param('100000', 'stutysięczne'),
param('1000115376708', 'bilion sto piętnaście milionów '
'trzysta siedemdziesiąt sześć tysięcy siedemset ósme'),
])
def test_several_numbers_denominator(self, number, words, ordinal=True):
self.assertEqual(num2words(number, self.several_tag, ordinal), words)
@parameterized.expand([
param('4', 'czwartych'),
param('8', 'ósmych'),
param('10', 'dziesiątych'),
param('69', 'sześćdziesiątych dziewiątych'),
param('100', 'setnych'),
param('212', 'dwieście dwunastych'),
param('700', 'siedemsetnych'),
param('901', 'dziewięćset pierwszych'),
param('1000', 'tysięcznych'),
param('6000', 'sześciotysięcznych'),
param('10000', 'dziesięciotysięcznych'),
param('1000000', 'milionowych'),
param('238055017238', 'dwieście trzydzieści osiem miliardów '
'pięćdziesiąt pięć milionów siedemnaście tysięcy '
'dwieście trzydziestych ósmych'),
])
def test_many_numbers_denominator(self, number, words, ordinal=True):
self.assertEqual(num2words(number, self.many_tag, ordinal), words)
@@ -40,5 +40,5 @@ max-line-length = 80
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv|tests).*
match = ^(?!setup).*\.py