From 14eea9618e4b0fe7f0a12d809110a01b734c8cce Mon Sep 17 00:00:00 2001 From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com> Date: Wed, 16 Sep 2020 16:37:04 +0200 Subject: [PATCH 1/4] First somewhat working version of wordifier for numbers. --- .gitlab-ci.yml | 32 ++++ Dockerfile | 15 ++ config.ini | 20 +++ docker-compose.yml | 17 +++ main.py | 34 +++++ requirements.txt | 2 + src/ccl_handler.py | 20 +++ src/morfeusz.py | 32 ++++ src/wordifier.py | 366 +++++++++++++++++++++++++++++++++++++++++++++ src/worker.py | 24 +++ tox.ini | 44 ++++++ 11 files changed, 606 insertions(+) create mode 100755 .gitlab-ci.yml create mode 100755 Dockerfile create mode 100755 config.ini create mode 100755 docker-compose.yml create mode 100755 main.py create mode 100755 requirements.txt create mode 100755 src/ccl_handler.py create mode 100644 src/morfeusz.py create mode 100644 src/wordifier.py create mode 100755 src/worker.py create mode 100755 tox.ini diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100755 index 0000000..a78b15f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,32 @@ +image: 'clarinpl/python:3.6' +cache: + paths: + - .tox +stages: + - check_style + - build +before_script: + - pip install tox==2.9.1 +pep8: + stage: check_style + script: + - tox -v -e pep8 +docstyle: + stage: check_style + script: + - tox -v -e docstyle +build_image: + stage: build + image: 'docker:18.09.7' + only: + - master + services: + - 'docker:18.09.7-dind' + before_script: + - '' + script: + - docker build -t clarinpl/wordifier . + - echo $DOCKER_PASSWORD > pass.txt + - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin + - rm pass.txt + - docker push clarinpl/wordifier diff --git a/Dockerfile b/Dockerfile new file mode 100755 index 0000000..86914cb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM clarinpl/python:3.6 + +WORKDIR /home/worker +COPY ./src ./src +COPY ./main.py . +COPY ./requirements.txt . 
+ +RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ + apt-add-repository http://download.sgjp.pl/apt/ubuntu && \ + apt update && \ + apt install morfeusz2 -y + +RUN python3.6 -m pip install -r requirements.txt + +CMD ["python3.6", "main.py", "service"] \ No newline at end of file diff --git a/config.ini b/config.ini new file mode 100755 index 0000000..efa4055 --- /dev/null +++ b/config.ini @@ -0,0 +1,20 @@ +[service] +tool = wordifier + +root = /samba/requests/ +rabbit_host = rabbitmq +rabbit_user = test +rabbit_password = test +queue_prefix = nlp_ + +[tool] +workers_number = 1 +processed_lines = 1000 + +[logging] +port = 9998 +local_log_level = INFO + +[logging_levels] +__main__ = INFO + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100755 index 0000000..ad8f8a6 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +version: '3' +services: + tokenizer: + container_name: clarin_wordifier + build: ./ + working_dir: /home/worker + entrypoint: + - python3.6 + - main.py + - service + environment: + - PYTHONUNBUFFERED=0 + volumes: + - '/samba:/samba' + - './config.ini:/home/worker/config.ini' + - './src:/home/worker/src' + - './main.py:/home/worker/main.py' diff --git a/main.py b/main.py new file mode 100755 index 0000000..ccb9f30 --- /dev/null +++ b/main.py @@ -0,0 +1,34 @@ +"""Implementation of wordifier service.""" +import argparse +import nlp_ws +from src.worker import Worker + + +def get_args(): + """Gets command line arguments.""" + parser = argparse.ArgumentParser(description="wordifier") + + subparsers = parser.add_subparsers(dest="mode") + subparsers.required = True + + subparsers.add_parser( + "service", + help="Run as a service") + + return parser.parse_args() + + +def main(): + """Runs the program.""" + args = get_args() + + generators = { + "service": lambda: nlp_ws.NLPService.main(Worker), + } + + gen_fn = generators.get(args.mode, lambda: None) + gen_fn() + + +if __name__ == "__main__": + main() diff --git 
a/requirements.txt b/requirements.txt new file mode 100755 index 0000000..aae2e50 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +nlp-ws +python-morfeusz \ No newline at end of file diff --git a/src/ccl_handler.py b/src/ccl_handler.py new file mode 100755 index 0000000..a61dd89 --- /dev/null +++ b/src/ccl_handler.py @@ -0,0 +1,20 @@ +"""Implementation of ccl reading functionality.""" +from xml.etree.ElementTree import iterparse + + +class Ccl_handler: + """Implements reading ccl for anonymizer service.""" + + def __init__(self, ccl_file_name): + """Initialize ccl_handler with a filename.""" + self._file_name = ccl_file_name + + def process(self, output_file, unmarshallers): + """Process xml tags using unmarshallers and save in output_file.""" + with open(output_file, 'w', encoding='utf-8') as out: + with open(self._file_name, 'r', encoding='utf-8') as f: + for event, elem in iterparse(f): + unmarshal = unmarshallers.get(elem.tag, None) + if unmarshal: + out.write(unmarshal(elem)) + elem.clear() diff --git a/src/morfeusz.py b/src/morfeusz.py new file mode 100644 index 0000000..350710a --- /dev/null +++ b/src/morfeusz.py @@ -0,0 +1,32 @@ +"""Implementation of command-line morfeusz.""" +import subprocess + + +class Morfeusz(): + """Class used to handle using morfeusz.""" + + def _handle_processing(self, name, word): + with subprocess.Popen( + name, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE + ) as process: + process.stdin.write(str.encode(word)) + out, err = process.communicate(timeout=15) + if out: + return [elem.split(',') for elem in + out.decode().rstrip()[1:-2].split()] + elif err: + return [elem.split(',') for elem in + err.decode().rstrip()[1:-2].split()] + else: + return list() + + def generate(self, word): + """Morphological generation.""" + return self._handle_processing('morfeusz_generator', word) + + def analyze(self, word): + """Morphological analysis.""" + return self._handle_processing('morfeusz_analyzer', word) 
diff --git a/src/wordifier.py b/src/wordifier.py new file mode 100644 index 0000000..9a9ba70 --- /dev/null +++ b/src/wordifier.py @@ -0,0 +1,366 @@ +"""Implementation of wordifier functionality.""" +from src.morfeusz import Morfeusz + + +class Wordifier: + """Class used to edit sentences based on options.""" + + _num_list = [ + { + '0': 'zero', + '1': 'jeden', + '2': 'dwa', + '3': 'trzy', + '4': 'cztery', + '5': 'pięć', + '6': 'sześć', + '7': 'siedem', + '8': 'osiem', + '9': 'dziewięć' + }, + { + '10': 'dziesięć', + '11': 'jedenaście', + '12': 'dwanaście', + '13': 'trzynaście', + '14': 'czternaście', + '15': 'piętnaście', + '16': 'szesnaście', + '17': 'siedemnaście', + '18': 'osiemnaście', + '19': 'dziewiętnaście', + '20': 'dwadzieścia', + '30': 'trzydzieści', + '40': 'czterdzieści', + '50': 'pięćdziesiąt', + '60': 'sześćdziesiąt', + '70': 'siedemdziesiąt', + '80': 'osiemdziesiąt', + '90': 'dziewięćdziesiąt' + }, + { + '1': 'sto', + '2': 'dwieście', + '3': 'trzysta', + '4': 'czterysta', + '5': 'pięćset', + '6': 'sześćset', + '7': 'siedemset', + '8': 'osiemset', + '9': 'dziewięćset' + }, + { + 3: 'tysiąc', + 6: 'milion', + 9: 'miliard', + 12: 'bilion', + 15: 'biliard', + 18: 'trylion', + 21: 'tryliard', + 24: 'kwadrylion', + 27: 'kwadryliard', + 30: 'kwintylion', + 33: 'kwintyliard', + 36: 'sekstylion', + 39: 'sekstyliard', + 42: 'septylion', + 45: 'septyliard', + 48: 'oktylion', + 51: 'oktyliard', + 54: 'nonilion', + 57: 'noniliard', + 60: 'decylion', + 63: 'decyliard', + 66: 'undecylion', + 69: 'undecyliard', + 72: 'duodecylion', + 75: 'duodecyliard', + 100: 'googol', + 600: 'centylion', + 603: 'centyliard' + } + ] + + _adj_list = [ + { + '0': 'zerowy', + '1': 'pierwszy', + '2': 'drugi', + '3': 'trzeci', + '4': 'czwarty', + '5': 'piąty', + '6': 'szósty', + '7': 'siódmy', + '8': 'ósmy', + '9': 'dziewiąty' + }, + { + '10': 'dziesiąty', + '11': 'jedenasty', + '12': 'dwunasty', + '13': 'trzynasty', + '14': 'czternasty', + '15': 'piętnasty', + '16': 'szesnasty', + 
'17': 'siedemnasty', + '18': 'osiemnasty', + '19': 'dziewiętnasty', + '20': 'dwudziesty', + '30': 'trzydziesty', + '40': 'czterdziesty', + '50': 'pięćdziesiąty', + '60': 'sześćdziesiąty', + '70': 'siedemdziesiąty', + '80': 'osiemdziesiąty', + '90': 'dziewięćdziesiąty' + }, + { + '1': 'setny', + '2': 'dwusetny', + '3': 'trzechsetny', + '4': 'czterechsetny', + '5': 'pięćsetny', + '6': 'sześćsetny', + '7': 'siedemsetny', + '8': 'osiemsetny', + '9': 'dziewięćsetny' + }, + { + 3: 'tysięczny', + 6: 'milionowy', + 9: 'miliardowy', + 12: 'bilionowy' + } + ] + + def __init__(self): + """Class initialization.""" + self._morf = Morfeusz() + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._one_dict = dict() + self._create_one_dict() + self._special_list = [] + + def _create_one_dict(self): + for word in self._morf.generate('jeden'): + self._one_dict[word[0]] = True + + def _process_sent_tree(self, sentence_subtree): + string_builder = [] + tok_id = 0 + for elem in sentence_subtree: + if elem.tag == 'tok': + tok = self._process_single_tok(tok_id, elem) + string_builder.append(tok) + string_builder.append(' ') + tok_id += 2 + elif elem.tag == 'ns': + tok_id -= 1 + string_builder.pop() + else: + raise Exception('Unrecognized tag inside sentence: ' + elem.tag) + return self._process_sentence(string_builder) + + def _is_special(self, text): + return True + + def _process_single_tok(self, tok_id, tok_subtree): + text = '' + tag = '' + for elem in tok_subtree: + if elem.tag == 'orth': + text = elem.text + elif elem.tag == 'lex': + tag = self._process_lex(elem) + if self._is_special(text): + self._handle_special(tok_id, text, tag) + word = self._process_word(tok_id, text, tag) + return word + + def _return_large_part(self, num, digit, tag, word_text=None): + if word_text: + last_word = word_text.split(' ')[-1] + if last_word == 'dwa' \ + or last_word == 'trzy' \ + or last_word == 'cztery': + tag = 
'subst:pl:nom:m3' + else: + tag = 'subst:pl:gen:m3' + return self._return_number(num, 3, digit, tag, digit) + return self._return_number(num, 3, digit, tag) + + def _return_number(self, num, pos, digit, tag=None, key=None): + if tag: + return self._get_correct_form(self._num_list[pos][digit], tag, key)\ + if num else\ + self._get_correct_form(self._adj_list[pos][digit], tag, key) + return self._num_list[pos][digit] if num else self._adj_list[pos][digit] + + def _handle_two_digits(self, from_, to_, num, text, tag=None): + text = text[from_:to_] + if len(text) >= 2: + if text[-2] == '0': + return self._return_number(num, 0, text[-1], tag) + elif text[-2] == '1': + return self._return_number(num, 1, text[-2:], tag) + else: + if text[-1] == '0': + return self._return_number(num, 1, text[-2:], tag) + return self._return_number(num, 1, text[-2] + '0', tag) \ + + ' ' + self._return_number(num, 0, text[-1], tag) + elif len(text) >= 1: + return self._return_number(num, 0, text[-1], tag) + else: + return '' + + def _handle_three_digits(self, from_, to_, num, text, tag=None): + text = text[from_:to_] + if len(text) >= 3: + string = '' + if len(text) > 3: + string = ' ' + if text[-3] != '0': + if text[-2:] == '00': + return string + self._return_number(num, 2, text[-3], tag) + return string + self._return_number(num, 2, text[-3], tag) \ + + ' ' + self._handle_two_digits(from_, to_, num, text, tag) + if len(text) >= 2 and text[-2:] == '00': + return self._return_number(num, 0, text[-1], tag) + return self._handle_two_digits(from_, to_, num, text, tag) + + def _replace_correct_from(self, key, tag, base): + flex = self._get_correct_form(base, tag).lstrip(base) + return self._num_list[3][key] + flex + + def _handle_numbers(self, text, tag, word_text, _from): + tag_list = tag.split(':') + num = not tag_list[0] == 'adj' + length = len(text) + if length <= 3: + word_text = word_text + self._handle_three_digits( + 0, + None, + num, + text, + tag + ) + elif length == 0: + raise 
Exception('Fragment recognized as number is empty!') + else: + new_text = text + digits = len(new_text) - int(len(new_text) / 3) * 3 + if digits != 0: + new_word = self._handle_two_digits( + 0, + digits, + num, + new_text, + tag + ) + if new_word in self._one_dict: + word_text = self._return_large_part( + num, + int(len(new_text) / 3) * 3, + tag, + None + ) + else: + word_text = new_word + ' ' + self._return_large_part( + num, + int(len(new_text) / 3) * 3, + tag, + new_word + ) + new_text = new_text[digits:] + if len(new_text.rstrip('0')) != 0: + word_text += ' ' + else: + return word_text + for k in reversed(range(0, int(len(new_text) / 3))): + key = k * 3 + new_word = self._handle_three_digits(0, 3, num, new_text, tag) + if new_word in self._one_dict and key != 0: + word_text += self._return_large_part(num, key, tag, None) + else: + word_text += new_word + if key != 0: + word_text += ' ' + self._return_large_part( + num, + key, + tag, + word_text + ) + new_text = new_text[3:] + if len(new_text.rstrip('0')) == 0: + return word_text + word_text += ' ' + return word_text + + def _replace_using(self, key, word_text, tag, base): + text_split = word_text.split(' ') + text_split[-1] = self._replace_correct_from(key, tag, base) + return ' '.join(text_split) + + def _correct_large_number(self, num, word_text, tag, key): + trailing_zeros = key + if not num and trailing_zeros >= 12: + if int(trailing_zeros / 3) * 2 != int(trailing_zeros / 6): + word_text = self._replace_using( + key, + word_text, + tag, + 'miliardowy') + else: + word_text = self._replace_using( + key, + word_text, + tag, + 'bilionowy' + ) + elif num and trailing_zeros >= 27: + if int(trailing_zeros / 3) * 2 != int(trailing_zeros / 6): + word_text = self._replace_using(key, word_text, tag, 'biliard') + elif trailing_zeros >= 54: + word_text = self._replace_using(key, word_text, tag, 'bilion') + else: + word_text = self._get_correct_form(word_text, tag) + else: + word_text = self._get_correct_form(word_text, 
tag) + return word_text + + def _get_correct_form(self, text, tag, key=None): + if key: + return self._correct_large_number(True, text, tag, key) + text_split = text.split(' ') + generated = self._morf.generate(text_split[-1]) + for form in generated: + if tag in form[2]: + text_split[-1] = form[0] + break + return ' '.join(text_split) + + def _process_word(self, tok_id, text, tag): + if text.isdigit(): + text = self._handle_numbers(text, tag, '', 0) + return text + + def _handle_special(self, tok_id, text, tag): + self._special_list.append((tok_id, text, tag)) + return text + + def _process_lex(self, lex_subtree): + tag = '' + for elem in lex_subtree: + if elem.tag == 'ctag': + tag = elem.text + elif elem.tag != 'base': + raise Exception('Unrecognized tag inside lex: ' + elem.tag) + if tag == '': + raise Exception('Lex tag had no ctag inside!') + return tag + + def _process_sentence(self, string_builder): + return ''.join(string_builder) diff --git a/src/worker.py b/src/worker.py new file mode 100755 index 0000000..45fccfe --- /dev/null +++ b/src/worker.py @@ -0,0 +1,24 @@ +"""Implementation of nlp_worker.""" +import logging + +import nlp_ws + +from src.wordifier import Wordifier +from src.ccl_handler import Ccl_handler + + +_log = logging.getLogger(__name__) + + +class Worker(nlp_ws.NLPWorker): + """Implements nlp_worker for tokenizer service.""" + + @classmethod + def static_init(cls, config): + """One time static initialisation.""" + + def process(self, input_file, task_options, output_file): + """A.""" + wordifier = Wordifier() + ccl_handler = Ccl_handler(input_file) + ccl_handler.process(output_file, wordifier.unmarshallers) diff --git a/tox.ini b/tox.ini new file mode 100755 index 0000000..1516042 --- /dev/null +++ b/tox.ini @@ -0,0 +1,44 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 +basepython = python3 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python3 +commands = + 
pydocstyle --verbose {posargs} + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 +match-dir = ^(?!\.tox|venv).* +match = ^(?!setup).*\.py \ No newline at end of file -- GitLab From 58633a643747afae446b692b7347a3d90062046d Mon Sep 17 00:00:00 2001 From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com> Date: Wed, 23 Sep 2020 14:30:14 +0200 Subject: [PATCH 2/4] Handling more ways of typing numbers. 
--- Dockerfile | 4 + config.ini | 2 +- src/morfeusz.py | 32 ----- src/wordifier.py | 366 ++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 355 insertions(+), 49 deletions(-) delete mode 100644 src/morfeusz.py diff --git a/Dockerfile b/Dockerfile index 86914cb..a5fdf32 100755 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,10 @@ RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ apt update && \ apt install morfeusz2 -y +RUN wget -O morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl http://download.sgjp.pl/morfeusz/20200913/Linux/18.04/64/morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl + +RUN python3.6 -m pip install morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl + RUN python3.6 -m pip install -r requirements.txt CMD ["python3.6", "main.py", "service"] \ No newline at end of file diff --git a/config.ini b/config.ini index efa4055..2845245 100755 --- a/config.ini +++ b/config.ini @@ -8,7 +8,7 @@ rabbit_password = test queue_prefix = nlp_ [tool] -workers_number = 1 +workers_number = 5 processed_lines = 1000 [logging] diff --git a/src/morfeusz.py b/src/morfeusz.py deleted file mode 100644 index 350710a..0000000 --- a/src/morfeusz.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Implementation of command-line morfeusz.""" -import subprocess - - -class Morfeusz(): - """Class used to handle using morfeusz.""" - - def _handle_processing(self, name, word): - with subprocess.Popen( - name, - stdout=subprocess.PIPE, - stdin=subprocess.PIPE, - stderr=subprocess.PIPE - ) as process: - process.stdin.write(str.encode(word)) - out, err = process.communicate(timeout=15) - if out: - return [elem.split(',') for elem in - out.decode().rstrip()[1:-2].split()] - elif err: - return [elem.split(',') for elem in - err.decode().rstrip()[1:-2].split()] - else: - return list() - - def generate(self, word): - """Morphological generation.""" - return self._handle_processing('morfeusz_generator', word) - - def analyze(self, word): - """Morphological analysis.""" - return 
self._handle_processing('morfeusz_analyzer', word) diff --git a/src/wordifier.py b/src/wordifier.py index 9a9ba70..333a3b2 100644 --- a/src/wordifier.py +++ b/src/wordifier.py @@ -1,5 +1,6 @@ """Implementation of wordifier functionality.""" -from src.morfeusz import Morfeusz +import morfeusz2 +import re class Wordifier: @@ -133,9 +134,41 @@ class Wordifier: } ] + _script_translator = [ + { + '\u2070': '0', # SUPERSCRIPT ZERO + '\u00B9': '1', # SUPERSCRIPT ONE + '\u00B2': '2', # SUPERSCRIPT TWO + '\u00B3': '3', # SUPERSCRIPT THREE + '\u2074': '4', # SUPERSCRIPT FOUR + '\u2075': '5', # SUPERSCRIPT FIVE + '\u2076': '6', # SUPERSCRIPT SIX + '\u2077': '7', # SUPERSCRIPT SEVEN + '\u2078': '8', # SUPERSCRIPT EIGHT + '\u2079': '9', # SUPERSCRIPT NINE + }, + { + '\u2080': '0', # SUBSCRIPT ZERO + '\u2081': '1', # SUBSCRIPT ONE + '\u2082': '2', # SUBSCRIPT TWO + '\u2083': '3', # SUBSCRIPT THREE + '\u2084': '4', # SUBSCRIPT FOUR + '\u2085': '5', # SUBSCRIPT FIVE + '\u2086': '6', # SUBSCRIPT SIX + '\u2087': '7', # SUBSCRIPT SEVEN + '\u2088': '8', # SUBSCRIPT EIGHT + '\u2089': '9' # SUBSCRIPT NINE + }, + { + '\u00BC': '1/4', # VULGAR FRACTION ONE QUARTER + '\u00BD': '1/2', # VULGAR FRACTION ONE HALF + '\u00BE': '3/4', # VULGAR FRACTION THREE QUARTERS + } + ] + def __init__(self): """Class initialization.""" - self._morf = Morfeusz() + self._morf = morfeusz2.Morfeusz() self.unmarshallers = { 'chunk': lambda *args: '\n', 'sentence': lambda *args: self._process_sent_tree(*args), @@ -143,6 +176,25 @@ class Wordifier: self._one_dict = dict() self._create_one_dict() self._special_list = [] + self._special_dict = { + 'number': lambda *args: self._get_number(*args), + 'superscript': lambda *args: self._get_superscript(*args), + 'subscript': lambda *args: self._get_subscript(*args), + '/': lambda *args: '/', + 'number/': lambda *args: self._get_number_slash(*args), + 'number/subscript': lambda *args: self._get_number_sub_fraction(*args), + 'number/number': lambda *args: 
self._get_number_number_fraction(*args), + 'superscript/': lambda *args: self._get_superscript_slash(*args), + 'superscript/subscript': lambda *args: self._get_script_fraction(*args), + 'superscript/number': lambda *args: self._get_super_number_fraction(*args), + 'fraction': lambda *args: self._get_fraction(*args), + 'scientific': lambda *args: self._get_scientific(*args), + 'dot': lambda *args: self._get_dot(*args), + '^': lambda *args: '^', + 'number^': lambda *args: self._get_number_hat(*args), + 'number^number': lambda *args: self._get_number_to_number(*args), + 'number^superscript': lambda *args: self._get_number_to_super(*args), + } def _create_one_dict(self): for word in self._morf.generate('jeden'): @@ -164,8 +216,24 @@ class Wordifier: raise Exception('Unrecognized tag inside sentence: ' + elem.tag) return self._process_sentence(string_builder) - def _is_special(self, text): - return True + def _special_type(self, text): + if text == '/' or text == '\u002F': + return '/' + elif re.match(r'\d+\.\d+', text): + return 'dot' + elif re.match(r'\d+\^\d+', text): + return 'scientific' + elif text == r'^': + return '^' + elif all(char in self._script_translator[0] for char in text): + return 'superscript' + elif all(char in self._script_translator[1] for char in text): + return 'subscript' + elif all(char in self._script_translator[2] for char in text): + return 'fraction' + elif text.isdigit(): + return 'number' + return None def _process_single_tok(self, tok_id, tok_subtree): text = '' @@ -175,12 +243,10 @@ class Wordifier: text = elem.text elif elem.tag == 'lex': tag = self._process_lex(elem) - if self._is_special(text): - self._handle_special(tok_id, text, tag) word = self._process_word(tok_id, text, tag) return word - def _return_large_part(self, num, digit, tag, word_text=None): + def _return_large_part(self, num, digit, tag=None, word_text=None): if word_text: last_word = word_text.split(' ')[-1] if last_word == 'dwa' \ @@ -235,9 +301,12 @@ class Wordifier: 
flex = self._get_correct_form(base, tag).lstrip(base) return self._num_list[3][key] + flex - def _handle_numbers(self, text, tag, word_text, _from): - tag_list = tag.split(':') - num = not tag_list[0] == 'adj' + def _handle_numbers(self, text, tag=None, word_text='', num=None): + if num is None and tag: + tag_list = tag.split(':') + num = not (tag_list[0] == 'adj' or tag_list[0] == 'subst') + elif num is None: + num = True length = len(text) if length <= 3: word_text = word_text + self._handle_three_digits( @@ -332,23 +401,31 @@ class Wordifier: return word_text def _get_correct_form(self, text, tag, key=None): + if tag is None: + return text if key: return self._correct_large_number(True, text, tag, key) text_split = text.split(' ') generated = self._morf.generate(text_split[-1]) for form in generated: - if tag in form[2]: + is_correct = True + form_tag = form[2].split(':') + for i, t in enumerate(tag.split(':')): + if t not in form_tag[i].split('.'): + is_correct = False + break + if is_correct: text_split[-1] = form[0] - break return ' '.join(text_split) def _process_word(self, tok_id, text, tag): - if text.isdigit(): - text = self._handle_numbers(text, tag, '', 0) + self._add_special(tok_id, text, tag) return text - def _handle_special(self, tok_id, text, tag): - self._special_list.append((tok_id, text, tag)) + def _add_special(self, tok_id, text, tag): + s_type = self._special_type(text) + if s_type: + self._special_list.append((tok_id, text, tag, s_type)) return text def _process_lex(self, lex_subtree): @@ -362,5 +439,262 @@ class Wordifier: raise Exception('Lex tag had no ctag inside!') return tag + def _get_number(self, string_builder, id_, tag, length): + if length > 1: + words = '' + j = length + i = 0 + while j > 0: + if string_builder[id_ + i] != ' ': + j -= 1 + i -= 1 + i += 1 + for j in range(0, length): + if string_builder[id_ + i] == ' ': + i += 1 + if len(string_builder[id_ + i]) <= 3: + if all(len(elem) == 3 or elem == ' ' for elem in 
string_builder[id_ + i:id_+1]): + return words + self._handle_numbers(text=''.join(string_builder[id_ + i:id_+1]).replace(' ', ''), tag=tag) + words += self._handle_numbers(text=string_builder[id_ + i], tag=tag) + ' ' + i += 1 + return words.rstrip() + else: + return self._handle_numbers(text=string_builder[id_], tag=tag) + + def _get_superscript(self, string_builder, id_, tag, length): + words = '' + i = 1 - length + new_text = [] + for j in range(0, length): + if string_builder[id_ + i] == ' ': + i += 1 + words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' + for char in string_builder[id_ + i]: + new_text.append(self._script_translator[0][char]) + i += 1 + words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' + return words.rstrip() + + def _get_subscript(self, string_builder, id_, tag, length): + words = '' + i = 1 - length + for j in range(0, length): + if string_builder[id_ + i] == ' ': + i += 1 + new_text = [] + for char in string_builder[id_ + i]: + new_text.append(self._script_translator[1][char]) + words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' + i += 1 + return words.rstrip() + + def _get_number_slash(self, string_builder, id_, tag, length): + return self._get_number(string_builder, id_, tag, length-1) + ' /' + + def _get_superscript_slash(self, string_builder, id_, tag, length): + return self._get_superscript(string_builder, id_, tag, length-1) + ' /' + + def _handle_fraction(self, numerator, denominator, tag): + num = None + tag_list = tag.split(':') + if numerator == '1': + de_tag = 'adj:sg:' + tag_list[2] + ':f' + num_tag = 'adj:sg:' + tag_list[2] + ':f' + num = True + elif numerator == '2' or numerator == '3' or numerator == '4': + de_tag = 'adj:sg:' + tag_list[2] + ':n' + num_tag = 'num:pl:' + tag_list[2] + ':f' + elif numerator[-1] == '0': + de_tag = 'adj:sg:' + tag_list[2] + ':n' + num_tag = None + else: + de_tag = 'subst:pl:gen:' + tag_list[3] + num_tag = None + return 
self._get_correct_form(self._handle_numbers(numerator, num_tag, num=num), num_tag)\ + + ' ' \ + + self._get_correct_form(self._handle_numbers(denominator, de_tag), de_tag) + + def _get_fraction(self, string_builder, id_, tag, length): + string = '' + i = 1 - length + for j in range(length): + words = self._script_translator[2][string_builder[id_ + i + j]].split('/') + string += self._handle_fraction(words[0], words[1], tag) + ' ' + return string.rstrip(' ') + + def _translate_script(self, string_builder, id_, length, super, until=None, from_=None): + result = '' + idx = 0 if super else 1 + i = 1 - length + p = 0 + if from_ is not None: + for p in range(length): + word = string_builder[id_ + p + i] + if word == from_: + i += p + 1 + break + for j in range(length - p): + word = string_builder[id_ + j + i] + if until and word == until: + break + if word in self._script_translator[idx]: + result += self._script_translator[idx][word] + else: + break + return result + + def _get_script_fraction(self, string_builder, id_, tag, length): + return self._handle_fraction( + self._translate_script(string_builder, id_, length, True, until='/'), + self._translate_script(string_builder, id_, length, False, from_='/'), + tag + ) + + def _get_super_number_fraction(self, string_builder, id_, tag, length): + return self._handle_fraction( + self._translate_script(string_builder, id_, length, True, until='/'), + string_builder[id_], + tag + ) + + def _get_number_sub_fraction(self, string_builder, id_, tag, length): + return self._handle_fraction( + string_builder[id_ - 2], + self._translate_script(string_builder, id_, length, False, from_='/'), + tag + ) + + def _get_number_number_fraction(self, string_builder, id_, tag, length): + return self._handle_fraction( + string_builder[id_ - 2], + string_builder[id_], + tag + ) + + def _get_dot(self, string_builder, id_, tag, length): + i = 1 - length + word = ''.join(string_builder[id_ + 1 - length:id_ + 1]) + numbers = word.split('.') + 
return self._handle_numbers(numbers[0]) + ' i ' + self._handle_fraction(numbers[1], str(10 ** int(len(numbers[1]))), tag) + + def _handle_additional_numbers(self, string_builder, id_, until, tag, length): + number = [] + i = 1 - length + for j in range(length): + word = string_builder[id_ + i + j] + if word == until: + break + number.append(word) + length = len(number) + if length > 1: + return self._handle_numbers(' '.join(number[0:length]).rstrip(), tag) + return '' + + def _handle_powers(self, first_number, second_number, tag=None): + if first_number == '10': + return self._handle_numbers(text='1' + '0' * int(second_number), tag=tag, num=True) + return self._handle_numbers(first_number) + ' do potęgi ' + self._handle_numbers(second_number, 'adj:sg:gen:f') + + def _get_number_to_number(self, string_builder, id_, tag, length): + text = self._handle_additional_numbers(string_builder, id_, '^', tag, length) + j = 0 + i = 1 - length + for k in range(length): + if string_builder[id_ + k + i] == '^': + j = k + 1 + i + break + if j < 0 or j >= length: + return text + if text: + text += ' ' + return text + self._handle_powers(string_builder[id_ + j - 2], string_builder[id_ + j], tag) + + def _get_number_to_super(self, string_builder, id_, tag, length): + text = self._handle_additional_numbers(string_builder, id_, '^', tag, length) + j = 0 + i = 1 - length + for k in range(length): + if string_builder[id_ + k + i] == '^': + j = k + 1 + i + break + if j == 0 or j >= length: + return text + if text: + text += ' ' + second_number = self._translate_script(string_builder, id_ + j, length - j, True) + return text + self._handle_powers(string_builder[id_ + j - 2], second_number, tag) + + def _get_scientific(self, string_builder, id_, tag, length): + words = string_builder[id_].split('^') + return self._handle_powers(words[0], words[1], tag) + + def _get_number_hat(self, string_builder, id_, tag, length): + return self._get_number(string_builder, id_, tag, length-1) + ' ^' + + 
def _get_as_words(self, id_, string_builder, tag, length, s_type): + if s_type in self._special_dict: + return self._special_dict[s_type](string_builder, id_, tag, length) + return '' + + def _handle_special(self, string_builder): + if self._special_list: + it = iter(self._special_list) + id_, text, tag, s_type = next(it) + current_tag = tag + current_stype = s_type + current_id = id_ + length = 1 + for id_, text, tag, s_type in it: + if ((current_stype == 'number' or current_stype == 'superscript') and s_type == '/') \ + or ((current_stype == 'superscript/' or current_stype == 'number/') and (s_type == 'number' or s_type == 'subscript'))\ + or (current_stype == s_type and (s_type == 'number' or s_type == 'subscript' or s_type == 'superscript'))\ + or ((current_stype == 'superscript/subscript' or current_stype == 'number/subscript') and s_type == 'subscript')\ + or (current_stype == 'number' and s_type == '^') \ + or (current_stype == 'number^' and (s_type == 'number' or s_type == 'superscript')): + if id_ == current_id + 1 or (id_ == current_id + 2 and s_type == 'number' and string_builder[current_id + 1] == ' '): + length += 1 + if not ((current_stype == s_type and (s_type == 'number' or s_type == 'subscript' or s_type == 'superscript')) + or ((current_stype == 'superscript/subscript' or current_stype == 'number/subscript') and s_type == 'subscript')): + current_stype += s_type + current_tag = tag + current_id = id_ + continue + new_text = self._get_as_words( + id_=current_id, + string_builder=string_builder, + tag=current_tag, + length=length, + s_type=current_stype + ) + string_builder = self._replace_string_in_builder(string_builder, current_id, length, new_text) + length = 1 + current_tag = tag + current_stype = s_type + current_id = id_ + new_text = self._get_as_words( + id_=current_id, + string_builder=string_builder, + tag=current_tag, + length=length, + s_type=current_stype + ) + string_builder = self._replace_string_in_builder(string_builder, current_id, 
length, new_text) + self._special_list.clear() + return string_builder + + @staticmethod + def _replace_string_in_builder(string_builder, current_id, length, new_text): + j = current_id + i = length + while i > 0: + if not (string_builder[j] == ' ' or string_builder[j] == ''): + i -= 1 + string_builder[j] = '' + j -= 1 + string_builder[current_id] = new_text + return string_builder + def _process_sentence(self, string_builder): + string_builder = self._handle_special(string_builder) + string_builder[0] = string_builder[0].capitalize() return ''.join(string_builder) -- GitLab From 2eaf26fc117f0d3b15ef120cbaaedb1483fff50b Mon Sep 17 00:00:00 2001 From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com> Date: Wed, 23 Sep 2020 15:17:38 +0200 Subject: [PATCH 3/4] Fixed tox --- src/wordifier.py | 286 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 224 insertions(+), 62 deletions(-) diff --git a/src/wordifier.py b/src/wordifier.py index 333a3b2..8f1f2ad 100644 --- a/src/wordifier.py +++ b/src/wordifier.py @@ -177,23 +177,40 @@ class Wordifier: self._create_one_dict() self._special_list = [] self._special_dict = { - 'number': lambda *args: self._get_number(*args), - 'superscript': lambda *args: self._get_superscript(*args), - 'subscript': lambda *args: self._get_subscript(*args), - '/': lambda *args: '/', - 'number/': lambda *args: self._get_number_slash(*args), - 'number/subscript': lambda *args: self._get_number_sub_fraction(*args), - 'number/number': lambda *args: self._get_number_number_fraction(*args), - 'superscript/': lambda *args: self._get_superscript_slash(*args), - 'superscript/subscript': lambda *args: self._get_script_fraction(*args), - 'superscript/number': lambda *args: self._get_super_number_fraction(*args), - 'fraction': lambda *args: self._get_fraction(*args), - 'scientific': lambda *args: self._get_scientific(*args), - 'dot': lambda *args: self._get_dot(*args), - '^': lambda *args: '^', - 'number^': lambda *args: 
self._get_number_hat(*args), - 'number^number': lambda *args: self._get_number_to_number(*args), - 'number^superscript': lambda *args: self._get_number_to_super(*args), + 'number': + lambda *args: self._get_number(*args), + 'superscript': + lambda *args: self._get_superscript(*args), + 'subscript': + lambda *args: self._get_subscript(*args), + '/': + lambda *args: '/', + 'number/': + lambda *args: self._get_number_slash(*args), + 'number/subscript': + lambda *args: self._get_number_sub_fraction(*args), + 'number/number': + lambda *args: self._get_number_number_fraction(*args), + 'superscript/': + lambda *args: self._get_superscript_slash(*args), + 'superscript/subscript': + lambda *args: self._get_script_fraction(*args), + 'superscript/number': + lambda *args: self._get_super_number_fraction(*args), + 'fraction': + lambda *args: self._get_fraction(*args), + 'scientific': + lambda *args: self._get_scientific(*args), + 'dot': + lambda *args: self._get_dot(*args), + '^': + lambda *args: '^', + 'number^': + lambda *args: self._get_number_hat(*args), + 'number^number': + lambda *args: self._get_number_to_number(*args), + 'number^superscript': + lambda *args: self._get_number_to_super(*args) } def _create_one_dict(self): @@ -260,9 +277,15 @@ class Wordifier: def _return_number(self, num, pos, digit, tag=None, key=None): if tag: - return self._get_correct_form(self._num_list[pos][digit], tag, key)\ - if num else\ - self._get_correct_form(self._adj_list[pos][digit], tag, key) + return self._get_correct_form( + text=self._num_list[pos][digit], + tag=tag, + key=key + ) if num else self._get_correct_form( + text=self._adj_list[pos][digit], + tag=tag, + key=key + ) return self._num_list[pos][digit] if num else self._adj_list[pos][digit] def _handle_two_digits(self, from_, to_, num, text, tag=None): @@ -275,8 +298,8 @@ class Wordifier: else: if text[-1] == '0': return self._return_number(num, 1, text[-2:], tag) - return self._return_number(num, 1, text[-2] + '0', tag) \ - + ' ' 
+ self._return_number(num, 0, text[-1], tag) + return self._return_number(num, 1, text[-2] + '0', tag) +\ + ' ' + self._return_number(num, 0, text[-1], tag) elif len(text) >= 1: return self._return_number(num, 0, text[-1], tag) else: @@ -453,9 +476,18 @@ class Wordifier: if string_builder[id_ + i] == ' ': i += 1 if len(string_builder[id_ + i]) <= 3: - if all(len(elem) == 3 or elem == ' ' for elem in string_builder[id_ + i:id_+1]): - return words + self._handle_numbers(text=''.join(string_builder[id_ + i:id_+1]).replace(' ', ''), tag=tag) - words += self._handle_numbers(text=string_builder[id_ + i], tag=tag) + ' ' + if all(len(elem) == 3 or elem == ' ' for elem + in string_builder[id_ + i:id_ + 1]): + return words + self._handle_numbers( + text=''.join( + string_builder[id_ + i:id_ + 1]) + .replace(' ', ''), + tag=tag + ) + words += self._handle_numbers( + text=string_builder[id_ + i], + tag=tag + ) + ' ' i += 1 return words.rstrip() else: @@ -468,7 +500,10 @@ class Wordifier: for j in range(0, length): if string_builder[id_ + i] == ' ': i += 1 - words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' + words += self._handle_numbers( + text=''.join(new_text), + tag=tag + ) + ' ' for char in string_builder[id_ + i]: new_text.append(self._script_translator[0][char]) i += 1 @@ -489,10 +524,15 @@ class Wordifier: return words.rstrip() def _get_number_slash(self, string_builder, id_, tag, length): - return self._get_number(string_builder, id_, tag, length-1) + ' /' + return self._get_number(string_builder, id_, tag, length - 1) + ' /' def _get_superscript_slash(self, string_builder, id_, tag, length): - return self._get_superscript(string_builder, id_, tag, length-1) + ' /' + return self._get_superscript( + string_builder=string_builder, + id_=id_, + tag=tag, + length=length - 1 + ) + ' /' def _handle_fraction(self, numerator, denominator, tag): num = None @@ -510,21 +550,34 @@ class Wordifier: else: de_tag = 'subst:pl:gen:' + tag_list[3] num_tag = None - 
return self._get_correct_form(self._handle_numbers(numerator, num_tag, num=num), num_tag)\ - + ' ' \ - + self._get_correct_form(self._handle_numbers(denominator, de_tag), de_tag) + return self._get_correct_form( + text=self._handle_numbers(text=numerator, tag=num_tag, num=num), + tag=num_tag + ) + ' ' + self._get_correct_form( + text=self._handle_numbers(text=denominator, tag=de_tag), + tag=de_tag + ) def _get_fraction(self, string_builder, id_, tag, length): string = '' i = 1 - length for j in range(length): - words = self._script_translator[2][string_builder[id_ + i + j]].split('/') + words = self._script_translator[2][string_builder[id_ + i + j]]\ + .split('/') string += self._handle_fraction(words[0], words[1], tag) + ' ' return string.rstrip(' ') - def _translate_script(self, string_builder, id_, length, super, until=None, from_=None): + def _translate_script( + self, + string_builder, + id_, + length, + superscript, + until=None, + from_=None + ): result = '' - idx = 0 if super else 1 + idx = 0 if superscript else 1 i = 1 - length p = 0 if from_ is not None: @@ -545,14 +598,32 @@ class Wordifier: def _get_script_fraction(self, string_builder, id_, tag, length): return self._handle_fraction( - self._translate_script(string_builder, id_, length, True, until='/'), - self._translate_script(string_builder, id_, length, False, from_='/'), + self._translate_script( + string_builder=string_builder, + id_=id_, + length=length, + superscript=True, + until='/' + ), + self._translate_script( + string_builder=string_builder, + id_=id_, + length=length, + superscript=False, + from_='/' + ), tag ) def _get_super_number_fraction(self, string_builder, id_, tag, length): return self._handle_fraction( - self._translate_script(string_builder, id_, length, True, until='/'), + self._translate_script( + string_builder=string_builder, + id_=id_, + length=length, + superscript=True, + until='/' + ), string_builder[id_], tag ) @@ -560,7 +631,13 @@ class Wordifier: def 
_get_number_sub_fraction(self, string_builder, id_, tag, length): return self._handle_fraction( string_builder[id_ - 2], - self._translate_script(string_builder, id_, length, False, from_='/'), + self._translate_script( + string_builder=string_builder, + id_=id_, + length=length, + superscript=False, + from_='/' + ), tag ) @@ -572,12 +649,23 @@ class Wordifier: ) def _get_dot(self, string_builder, id_, tag, length): - i = 1 - length word = ''.join(string_builder[id_ + 1 - length:id_ + 1]) numbers = word.split('.') - return self._handle_numbers(numbers[0]) + ' i ' + self._handle_fraction(numbers[1], str(10 ** int(len(numbers[1]))), tag) + return self._handle_numbers(numbers[0]) + ' i ' \ + + self._handle_fraction( + numerator=numbers[1], + denominator=str(10 ** int(len(numbers[1]))), + tag=tag + ) - def _handle_additional_numbers(self, string_builder, id_, until, tag, length): + def _handle_additional_numbers( + self, + string_builder, + id_, + until, + tag, + length + ): number = [] i = 1 - length for j in range(length): @@ -587,16 +675,30 @@ class Wordifier: number.append(word) length = len(number) if length > 1: - return self._handle_numbers(' '.join(number[0:length]).rstrip(), tag) + return self._handle_numbers( + text=' '.join(number[0:length]).rstrip(), + tag=tag + ) return '' def _handle_powers(self, first_number, second_number, tag=None): if first_number == '10': - return self._handle_numbers(text='1' + '0' * int(second_number), tag=tag, num=True) - return self._handle_numbers(first_number) + ' do potęgi ' + self._handle_numbers(second_number, 'adj:sg:gen:f') + return self._handle_numbers( + text='1' + '0' * int(second_number), + tag=tag, + num=True + ) + return self._handle_numbers(first_number) + ' do potęgi ' \ + + self._handle_numbers(second_number, 'adj:sg:gen:f') def _get_number_to_number(self, string_builder, id_, tag, length): - text = self._handle_additional_numbers(string_builder, id_, '^', tag, length) + text = self._handle_additional_numbers( + 
string_builder, + id_, + '^', + tag, + length + ) j = 0 i = 1 - length for k in range(length): @@ -607,10 +709,20 @@ class Wordifier: return text if text: text += ' ' - return text + self._handle_powers(string_builder[id_ + j - 2], string_builder[id_ + j], tag) + return text + self._handle_powers( + string_builder[id_ + j - 2], + string_builder[id_ + j], + tag + ) def _get_number_to_super(self, string_builder, id_, tag, length): - text = self._handle_additional_numbers(string_builder, id_, '^', tag, length) + text = self._handle_additional_numbers( + string_builder, + id_, + '^', + tag, + length + ) j = 0 i = 1 - length for k in range(length): @@ -621,21 +733,57 @@ class Wordifier: return text if text: text += ' ' - second_number = self._translate_script(string_builder, id_ + j, length - j, True) - return text + self._handle_powers(string_builder[id_ + j - 2], second_number, tag) + second_number = self._translate_script( + string_builder=string_builder, + id_=id_ + j, + length=length - j, + superscript=True + ) + return text + self._handle_powers( + string_builder[id_ + j - 2], + second_number, + tag + ) def _get_scientific(self, string_builder, id_, tag, length): words = string_builder[id_].split('^') return self._handle_powers(words[0], words[1], tag) def _get_number_hat(self, string_builder, id_, tag, length): - return self._get_number(string_builder, id_, tag, length-1) + ' ^' + return self._get_number(string_builder, id_, tag, length - 1) + ' ^' def _get_as_words(self, id_, string_builder, tag, length, s_type): if s_type in self._special_dict: return self._special_dict[s_type](string_builder, id_, tag, length) return '' + @staticmethod + def _check_if_multipart(current_stype, s_type): + return ((current_stype == 'number' or + current_stype == 'superscript') and + s_type == '/') or\ + ((current_stype == 'superscript/' or + current_stype == 'number/') and + (s_type == 'number' or s_type == 'subscript')) or\ + (current_stype == s_type and + (s_type == 'number' or 
s_type == 'subscript' or + s_type == 'superscript')) or\ + ((current_stype == 'superscript/subscript' or + current_stype == 'number/subscript') and + s_type == 'subscript') or\ + (current_stype == 'number' and s_type == '^') or\ + (current_stype == 'number^' and + (s_type == 'number' or s_type == 'superscript')) + + @staticmethod + def _check_if_number_continuation(current_stype, s_type): + return not ((current_stype == s_type and + (s_type == 'number' or s_type == 'subscript' or + s_type == 'superscript')) or + ((current_stype == 'superscript/subscript' or + current_stype == 'number/subscript') and + s_type == 'subscript')) + def _handle_special(self, string_builder): if self._special_list: it = iter(self._special_list) @@ -645,16 +793,15 @@ class Wordifier: current_id = id_ length = 1 for id_, text, tag, s_type in it: - if ((current_stype == 'number' or current_stype == 'superscript') and s_type == '/') \ - or ((current_stype == 'superscript/' or current_stype == 'number/') and (s_type == 'number' or s_type == 'subscript'))\ - or (current_stype == s_type and (s_type == 'number' or s_type == 'subscript' or s_type == 'superscript'))\ - or ((current_stype == 'superscript/subscript' or current_stype == 'number/subscript') and s_type == 'subscript')\ - or (current_stype == 'number' and s_type == '^') \ - or (current_stype == 'number^' and (s_type == 'number' or s_type == 'superscript')): - if id_ == current_id + 1 or (id_ == current_id + 2 and s_type == 'number' and string_builder[current_id + 1] == ' '): + if self._check_if_multipart(current_stype, s_type): + if id_ == current_id + 1 or ( + id_ == current_id + 2 and s_type == 'number' and + string_builder[current_id + 1] == ' '): length += 1 - if not ((current_stype == s_type and (s_type == 'number' or s_type == 'subscript' or s_type == 'superscript')) - or ((current_stype == 'superscript/subscript' or current_stype == 'number/subscript') and s_type == 'subscript')): + if self._check_if_number_continuation( + 
current_stype, + s_type + ): current_stype += s_type current_tag = tag current_id = id_ @@ -666,7 +813,12 @@ class Wordifier: length=length, s_type=current_stype ) - string_builder = self._replace_string_in_builder(string_builder, current_id, length, new_text) + string_builder = self._replace_string_in_builder( + string_builder=string_builder, + current_id=current_id, + length=length, + new_text=new_text + ) length = 1 current_tag = tag current_stype = s_type @@ -678,12 +830,22 @@ class Wordifier: length=length, s_type=current_stype ) - string_builder = self._replace_string_in_builder(string_builder, current_id, length, new_text) + string_builder = self._replace_string_in_builder( + string_builder=string_builder, + current_id=current_id, + length=length, + new_text=new_text + ) self._special_list.clear() return string_builder @staticmethod - def _replace_string_in_builder(string_builder, current_id, length, new_text): + def _replace_string_in_builder( + string_builder, + current_id, + length, + new_text + ): j = current_id i = length while i > 0: -- GitLab From 36baae94db2c407b83cefb84de47db93de5ba631 Mon Sep 17 00:00:00 2001 From: Norbert Ropiak <norbert.ropiak@pwr.edu.pl> Date: Thu, 18 Feb 2021 15:08:06 +0000 Subject: [PATCH 4/4] Wordifier - first project outline --- .gitignore | 139 ++++ .gitlab-ci.yml | 33 +- Dockerfile | 16 +- README.md | 15 + data/currencies.json | 1514 +++++++++++++++++++++++++++++++++++++++ data/numbers.json | 116 +++ docker-compose.yml | 9 +- requirements-dev.txt | 2 + requirements.txt | 2 +- src/__init__.py | 0 src/ccl_handler.py | 18 +- src/date2words.py | 108 +++ src/num2words.py | 105 +++ src/utils.py | 211 ++++++ src/wordifier.py | 1185 ++++++++++-------------------- src/worker.py | 6 +- tests/__init__.py | 0 tests/test_num2words.py | 136 ++++ tox.ini | 2 +- 19 files changed, 2793 insertions(+), 824 deletions(-) create mode 100644 .gitignore create mode 100644 data/currencies.json create mode 100644 data/numbers.json create mode 
100644 requirements-dev.txt create mode 100644 src/__init__.py create mode 100644 src/date2words.py create mode 100644 src/num2words.py create mode 100644 src/utils.py create mode 100644 tests/__init__.py create mode 100644 tests/test_num2words.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f622468 --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ +.vscode \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a78b15f..811491d 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,29 +1,46 @@ -image: 'clarinpl/python:3.6' +image: "clarinpl/python:3.6" cache: paths: - .tox stages: - check_style + - test - build -before_script: - - pip install tox==2.9.1 + pep8: stage: check_style + before_script: + - pip install tox==2.9.1 script: - tox -v -e pep8 + docstyle: stage: check_style + before_script: + - pip install tox==2.9.1 script: - tox -v -e docstyle -build_image: + +test: + stage: test + image: "docker:18.09.7" + services: + - "docker:18.09.7-dind" + script: + - docker build -t clarinpl/wordifier . + - docker run --rm + -v "$(pwd)/requirements-dev.txt:/home/worker/requirements-dev.txt" + -v "$(pwd)/tests:/home/worker/tests" + clarinpl/wordifier + sh -c 'pip3 install -r requirements-dev.txt ; nose2 -v tests' + +build: stage: build - image: 'docker:18.09.7' + image: "docker:18.09.7" only: - master services: - - 'docker:18.09.7-dind' - before_script: - - '' + - "docker:18.09.7-dind" script: - docker build -t clarinpl/wordifier . - echo $DOCKER_PASSWORD > pass.txt diff --git a/Dockerfile b/Dockerfile index a5fdf32..2dfcce5 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,21 @@ FROM clarinpl/python:3.6 WORKDIR /home/worker -COPY ./src ./src -COPY ./main.py . -COPY ./requirements.txt . 
-RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ - apt-add-repository http://download.sgjp.pl/apt/ubuntu && \ - apt update && \ - apt install morfeusz2 -y +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN update-alternatives --set python /usr/bin/python3.6 + +RUN apt-get update && apt-get install -y morfeusz2 RUN wget -O morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl http://download.sgjp.pl/morfeusz/20200913/Linux/18.04/64/morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl RUN python3.6 -m pip install morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl +COPY ./src ./src +COPY ./main.py . +COPY ./requirements.txt . +COPY ./data ./data + RUN python3.6 -m pip install -r requirements.txt CMD ["python3.6", "main.py", "service"] \ No newline at end of file diff --git a/README.md b/README.md index e69de29..3741fc1 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,15 @@ +# Wordifier + +A service that expands abbreviations into full texts. The following modules are implemented at this time: +- verbal notation of digits, numbers, decimal and ordinary fractions (with separators '.' and '/') +- verbal notation of simple equations with addition, subtraction, multiplication and division +- verbal notation of dates + - recognizing different ways to write dates. 
+ - 25.12.2010 or 25,12,12 (day/month, day/month, year) + - 2009-08-30 or 20 08 30 (year, day/month, day/month) + - 12 Jan 2010 or 31 Jan 1998 (day, month, year) + - Mar 12 (month, year) + - Dec 15 (day, month) + - April 30 2000 (month, day, year) +- replace currency symbols with words +- write special characters (%, &, #, ^, =, +, -, /) in words \ No newline at end of file diff --git a/data/currencies.json b/data/currencies.json new file mode 100644 index 0000000..2a29d81 --- /dev/null +++ b/data/currencies.json @@ -0,0 +1,1514 @@ +{ + "$": [ + "dolar", + "dolary", + "dolarów", + "dolara" + ], + "USD": [ + "dolar amerykański", + "dolary amerykańskie", + "dolarów amerykańskich", + "dolara amerykańskiego" + ], + "CA$": [ + "dolar kanadyjski", + "dolary kanadyjskie", + "dolarów kanadyjskich", + "dolara kanadyjskiego" + ], + "CAD": [ + "dolar kanadyjski", + "dolary kanadyjskie", + "dolarów kanadyjskich", + "dolara kanadyjskiego" + ], + "\u20ac": [ + "Euro", + "Euro", + "Euro", + "Euro" + ], + "EUR": [ + "Euro", + "Euro", + "Euro", + "Euro" + ], + "\u062f.\u0625.\u200f": [ + "Dirham Zjednoczonych Emiratów Arabskich", + "Dirhamy Zjednoczonych Emiratów Arabskich", + "Dirhamów Zjednoczonych Emiratów Arabskich", + "Dirhama Zjednoczonych Emiratów Arabskich" + ], + "AED": [ + "Dirham Zjednoczonych Emiratów Arabskich", + "Dirhamy Zjednoczonych Emiratów Arabskich", + "Dirhamów Zjednoczonych Emiratów Arabskich", + "Dirhama Zjednoczonych Emiratów Arabskich" + ], + "\u060b": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "Af": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "AFN": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "Lek": [ + "lek", + "leki", + "leków", + "leka" + ], + "ALL": [ + "lek", + "leki", + "leków", + "leka" + ], + "\u0564\u0580.": [ + "armański dram", + "armeńskie dramy", + "armeńskich dramów", + "armeńskiego drama" + ], + "AMD": [ + "armański dram", + "armeńskie dramy", + "armeńskich dramów", + "armeńskiego drama" + ], + "AR$": [ + 
"argetyńskie peso", + "argetyńskie pesos", + "argetyńsich pesos", + "argetyńskiego peso" + ], + "ARS": [ + "argetyńskie peso", + "argetyńskie pesos", + "argetyńsich pesos", + "argetyńskiego peso" + ], + "AU$": [ + "dolar australijski", + "dolary australijskie", + "dolarów australijskich", + "dolara australijskiego" + ], + "AUD": [ + "dolar australijski", + "dolary australijskie", + "dolarów australijskich", + "dolara australijskiego" + ], + "\u043c\u0430\u043d.": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "man.": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "AZN": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "KM": [ + "marka zamienna", + "marki zamienne", + "marek zamiennych", + "marki zamiennej" + ], + "BAM": [ + "marka zamienna", + "marki zamienne", + "marek zamiennych", + "marki zamiennej" + ], + "\u09f3": [ + "taka", + "taka", + "taka", + "taka" + ], + "Tk": [ + "taka", + "taka", + "taka", + "taka" + ], + "BDT": [ + "taka", + "taka", + "taka", + "taka" + ], + "\u043b\u0432.": [ + "lew", + "lewy", + "lewów", + "lewa" + ], + "BGN": [ + "lew", + "lewy", + "lewów", + "lewa" + ], + "\u062f.\u0628.\u200f": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "BD": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "BHD": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "FBu": [ + "frank burundyjski", + "franki burundyjskie", + "franków burundyjskich", + "franka burundyjskiego" + ], + "BIF": [ + "frank burundyjski", + "franki burundyjskie", + "franków burundyjskich", + "franka burundyjskiego" + ], + "BN$": [ + "dolar brunejski", + "dolary brunejskie", + "dolarów brunejskich", + "dolara 
brunejskiego" + ], + "BND": [ + "dolar brunejski", + "dolary brunejskie", + "dolarów brunejskich", + "dolara brunejskiego" + ], + "Bs": [ + "boliviano", + "bolivianos", + "bolivianos", + "boliviano" + ], + "BOB": [ + "boliviano", + "bolivianos", + "bolivianos", + "boliviano" + ], + "R$": [ + "real brazylijski", + "reale brazylijskie", + "realów brazylijskich", + "reala brazylijskiego" + ], + "BRL": [ + "real brazylijski", + "reale brazylijskie", + "realów brazylijskich", + "reala brazylijskiego" + ], + "P": [ + "pula", + "pula", + "pula", + "pula" + ], + "BWP": [ + "pula", + "pula", + "pula", + "pula" + ], + "\u0440\u0443\u0431.": [ + "rubel białoruski", + "ruble białoruskie", + "rubli białoruskich", + "rubla białoruskiego" + ], + "Br": [ + "birr", + "birry", + "birrów", + "birra" + ], + "BYN": [ + "rubel białoruski", + "ruble białoruskie", + "rubli białoruskich", + "rubla białoruskiego" + ], + "BZ$": [ + "dolar belizeński", + "dolary belizeńskie", + "dolarów belizeńskich", + "dolara belizeńskiego" + ], + "BZD": [ + "dolar belizeński", + "dolary belizeńskie", + "dolarów belizeńskich", + "dolara belizeńskiego" + ], + "FrCD": [ + "frank kongijski", + "franki kongijskie", + "franków kongijskich", + "franka kongijskiego" + ], + "CDF": [ + "frank kongijski", + "franki kongijskie", + "franków kongijskich", + "franka kongijskiego" + ], + "CHF": [ + "frank szwajcarski", + "franki szwajcarskie", + "franków szwajcarskich", + "franka szwajcarskiego" + ], + "CL$": [ + "peso chilijskie", + "peso chilijskie", + "pesos chilijskich", + "peso chilijskiego" + ], + "CLP": [ + "peso chilijskie", + "peso chilijskie", + "pesos chilijskich", + "peso chilijskiego" + ], + "CN\u00a5": [ + "yuan", + "yuan", + "yuan", + "yuan" + ], + "CNY": [ + "yuan", + "yuan", + "yuan", + "yuan" + ], + "CO$": [ + "peso kolumbijskie", + "peso kolumbijskie", + "pesos kolumbijskich", + "peso kolumbijskiego" + ], + "COP": [ + "peso kolumbijskie", + "peso kolumbijskie", + "pesos kolumbijskich", + "peso 
kolumbijskiego" + ], + "\u20a1": [ + "colón kostarykański", + "colóny kostarytańskie", + "colónów kostarytańskich", + "colóna kostaryńskiego" + ], + "CRC": [ + "colón kostarykański", + "colóny kostarytańskie", + "colónów kostarytańskich", + "colóna kostaryńskiego" + ], + "CV$": [ + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka" + ], + "CVE": [ + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka" + ], + "K\u010d": [ + "czeska korona", + "czeskie korony", + "czeskich koron", + "czeskiej korony" + ], + "CZK": [ + "czeska korona", + "czeskie korony", + "czeskich koron", + "czeskiej korony" + ], + "Fdj": [ + "frank dżibutyjski", + "franki dżibutyjskie", + "franków dżibutyjskich", + "franka dżibutyjskiego" + ], + "DJF": [ + "frank dżibutyjski", + "franki dżibutyjskie", + "franków dżibutyjskich", + "franka dżibutyjskiego" + ], + "kr": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "Dkr": [ + "korona duńska", + "korony duńskie", + "koron duńskich", + "korony duńskiej" + ], + "DKK": [ + "korona duńska", + "korony duńskie", + "koron duńskich", + "korony duńskiej" + ], + "RD$": [ + "peso dominikańskie", + "peso dominikańskie", + "pesos dominikańskich", + "peso dominikańskiego" + ], + "DOP": [ + "peso dominikańskie", + "peso dominikańskie", + "pesos dominikańskich", + "peso dominikańskiego" + ], + "\u062f.\u062c.\u200f": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "DA": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "DZD": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "Ekr": [ + "korona estońska", + "korony estońskie", + "koron estońskich", + "korony estońskiej" + ], + "EEK": [ + "korona estońska", + "korony 
estońskie", + "koron estońskich", + "korony estońskiej" + ], + "\u062c.\u0645.\u200f": [ + "funt egipski", + "funty egipskie", + "funtów egipskich", + "funta egipskiego" + ], + "EGP": [ + "funt egipski", + "funty egipskie", + "funtów egipskich", + "funta egipskiego" + ], + "Nfk": [ + "nakfa", + "nakfy", + "nakf", + "nakfy" + ], + "ERN": [ + "nakfa", + "nakfy", + "nakf", + "nakfy" + ], + "ETB": [ + "birr", + "birry", + "birrów", + "birra" + ], + "\u00a3": [ + "funt szterling", + "funty szterling", + "funtów szterling", + "funta szterlinga" + ], + "GBP": [ + "funt szterling", + "funty szterling", + "funtów szterling", + "funta szterlinga" + ], + "GEL": [ + "lari", + "lari", + "lari", + "lari" + ], + "GH\u20b5": [ + "cedi", + "cedi", + "cedi", + "cedi" + ], + "GHS": [ + "cedi", + "cedi", + "cedi", + "cedi" + ], + "FG": [ + "frank gwinejski", + "franki gwinejskie", + "franków gwinejskich", + "franka gwinejskiego" + ], + "GNF": [ + "frank gwinejski", + "franki gwinejskie", + "franków gwinejskich", + "franka gwinejskiego" + ], + "Q": [ + "quetzal", + "quetzale", + "quetzali", + "quetzala" + ], + "GTQ": [ + "quetzal", + "quetzale", + "quetzali", + "quetzala" + ], + "HK$": [ + "dolar hongkoński", + "dolary hongkońskie", + "dolarów hongkońskich", + "dolara hongkońskiego" + ], + "HKD": [ + "dolar hongkoński", + "dolary hongkońskie", + "dolarów hongkońskich", + "dolara hongkońskiego" + ], + "L": [ + "lempira", + "lempiry", + "lempir", + "lempira" + ], + "HNL": [ + "lempira", + "lempiry", + "lempir", + "lempira" + ], + "kn": [ + "kuna", + "kuny", + "kun", + "kuny" + ], + "HRK": [ + "kuna", + "kuny", + "kun", + "kuny" + ], + "Ft": [ + "forint", + "forinty", + "forintów", + "forinta" + ], + "HUF": [ + "forint", + "forinty", + "forintów", + "forinta" + ], + "Rp": [ + "rupia indonezyjska", + "rupie indonezyjske", + "rupii indonezyjskych", + "rupii indonezyjskiej" + ], + "IDR": [ + "rupia indonezyjska", + "rupie indonezyjske", + "rupii indonezyjskych", + "rupii indonezyjskiej" + ], 
+ "\u20aa": [ + "nowy izraelski szekel", + "nowe izraelskie szekle", + "nowych izraelskich szekli", + "nowego izraelskiego szekla" + ], + "ILS": [ + "nowy izraelski szekel", + "nowe izraelskie szekle", + "nowych izraelskich szekli", + "nowego izraelskiego szekla" + ], + "\u099f\u0995\u09be": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "Rs": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "INR": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "\u062f.\u0639.\u200f": [ + "dinar iracki", + "dinary irackie", + "dinarów irackich", + "dinara irackiego" + ], + "IQD": [ + "dinar iracki", + "dinary irackie", + "dinarów irackich", + "dinara irackiego" + ], + "\ufdfc": [ + "rial irański", + "riale irańskie", + "riali irańskich", + "riala irańskiego" + ], + "IRR": [ + "rial irański", + "riale irańskie", + "riali irańskich", + "riala irańskiego" + ], + "Ikr": [ + "korona islandzka", + "korony islandzkie", + "koron islandzkich", + "korony islandzkiej" + ], + "ISK": [ + "korona islandzka", + "korony islandzkie", + "koron islandzkich", + "korony islandzkiej" + ], + "J$": [ + "dolar jamajski", + "dolary jamajskie", + "dolarów jamajskich", + "dolara jamajskiego" + ], + "JMD": [ + "dolar jamajski", + "dolary jamajskie", + "dolarów jamajskich", + "dolara jamajskiego" + ], + "\u062f.\u0623.\u200f": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "JD": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "JOD": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "\uffe5": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "\u00a5": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "JPY": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "Ksh": [ + "szyling kenijski", + "szylingi kenijskie", + "szylingów 
kenijskich", + "szylinga kenijskiego" + ], + "KES": [ + "szyling kenijski", + "szylingi kenijskie", + "szylingów kenijskich", + "szylinga kenijskiego" + ], + "\u17db": [ + "riel kambodżański", + "riele kambodżańskie", + "rieli kambodżańskich", + "riela kambodzańskiego" + ], + "KHR": [ + "riel kambodżański", + "riele kambodżańskie", + "rieli kambodżańskich", + "riela kambodzańskiego" + ], + "FC": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "CF": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "KMF": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "\u20a9": [ + "won południowokoreański", + "wony południowokoreańskie", + "wonów południowokoreańskich", + "wona południowokoreańskiego" + ], + "KRW": [ + "won południowokoreański", + "wony południowokoreańskie", + "wonów południowokoreańskich", + "wona południowokoreańskiego" + ], + "\u062f.\u0643.\u200f": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "KD": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "KWD": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "\u0442\u04a3\u0433.": [ + "tenge", + "tenge", + "tenge", + "tenge" + ], + "KZT": [ + "tenge", + "tenge", + "tenge", + "tenge" + ], + "\u0644.\u0644.\u200f": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "LB\u00a3": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "LBP": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "SL Re": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "SLRs": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "LKR": [ + "rupia lankijska", + 
"rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "Lt": [ + "lit", + "lity", + "litów", + "lita" + ], + "LTL": [ + "lit", + "lity", + "litów", + "lita" + ], + "Ls": [ + "łat", + "łaty", + "łatów", + "łata" + ], + "LVL": [ + "łat", + "łaty", + "łatów", + "łata" + ], + "\u062f.\u0644.\u200f": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "LD": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "LYD": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "\u062f.\u0645.\u200f": [ + "dirham marokański", + "dirhamy marokańskie", + "dirhamów marokańskich", + "dirhama marokańskiego" + ], + "MAD": [ + "dirham marokański", + "dirhamy marokańskie", + "dirhamów marokańskich", + "dirhama marokańskiego" + ], + "MDL": [ + "Lej Mołdawii", + "Leje Mołdawii", + "Lei Mołdawii", + "Leja Mołdawii" + ], + "MGA": [ + "ariary", + "ariary", + "ariary", + "ariary" + ], + "MKD": [ + "denar macedoński", + "denary macedońskie", + "denarów macedońskich", + "denara macedońskiego" + ], + "K": [ + "kiat", + "kiaty", + "kiatów", + "kiata" + ], + "MMK": [ + "kiat", + "kiaty", + "kiatów", + "kiata" + ], + "MOP$": [ + "pataca", + "pataca", + "pataca", + "pataca" + ], + "MOP": [ + "pataca", + "pataca", + "pataca", + "pataca" + ], + "MURs": [ + "rupia Mauritiusu", + "rupie Mauritiusu", + "rupii Mauritiusu", + "rupii Mauritiusu" + ], + "MUR": [ + "rupia Mauritiusu", + "rupie Mauritiusu", + "rupii Mauritiusu", + "rupii Mauritiusu" + ], + "MX$": [ + "peso meksykańskie", + "peso meksykańskie", + "pesos meksykańskich", + "peso meksykańskiego" + ], + "MXN": [ + "peso meksykańskie", + "peso meksykańskie", + "pesos meksykańskich", + "peso meksykańskiego" + ], + "RM": [ + "ringgit", + "ringgit", + "ringgitów", + "ringgita" + ], + "MYR": [ + "ringgit", + "ringgit", + "ringgitów", + "ringgita" + ], + "MTn": [ + "metical", + "meticale", + "meticali", + 
"meticala" + ], + "MZN": [ + "metical", + "meticale", + "meticali", + "meticala" + ], + "N$": [ + "dolar namibijski", + "dolare namibijskie", + "dolarów namibijskich", + "dolara namibijskiego" + ], + "NAD": [ + "dolar namibijski", + "dolare namibijskie", + "dolarów namibijskich", + "dolara namibijskiego" + ], + "\u20a6": [ + "naira", + "naire", + "nair", + "naira" + ], + "NGN": [ + "naira", + "naire", + "nair", + "naira" + ], + "C$": [ + "cordoba oro", + "cordoby", + "córdob", + "cordoby" + ], + "NIO": [ + "cordoba oro", + "cordoby", + "córdob", + "cordoby" + ], + "Nkr": [ + "korona norweska", + "korony norweskie", + "koron norweskich", + "korony norweskiej" + ], + "NOK": [ + "korona norweska", + "korony norweskie", + "koron norweskich", + "korony norweskiej" + ], + "\u0928\u0947\u0930\u0942": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NPRs": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NPR": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NZ$": [ + "dolar nowozelandzki", + "dolary nowozelandzkie", + "dolarów nowozelandzkich", + "dolara nowozelandzkiego" + ], + "NZD": [ + "dolar nowozelandzki", + "dolary nowozelandzkie", + "dolarów nowozelandzkich", + "dolara nowozelandzkiego" + ], + "\u0631.\u0639.\u200f": [ + "rial omański", + "riale omańskie", + "riali omańskich", + "riala omańskiego" + ], + "OMR": [ + "rial omański", + "riale omańskie", + "riali omańskich", + "riala omańskiego" + ], + "B/.": [ + "balboa", + "balboa", + "balboa", + "balboa" + ], + "PAB": [ + "balboa", + "balboa", + "balboa", + "balboa" + ], + "S/.": [ + "sol", + "sole", + "soli", + "sola" + ], + "PEN": [ + "sol", + "sole", + "soli", + "sola" + ], + "\u20b1": [ + "peso filipińskie", + "peso filipińskie", + "pesos filipińskich", + "peso filipińskiego" + ], + "PHP": [ + "peso filipińskie", + "peso filipińskie", + "pesos filipińskich", + "peso 
filipińskiego" + ], + "\u20a8": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "PKRs": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "PKR": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "z\u0142": [ + "złoty", + "złote", + "złotych", + "złotego" + ], + "PLN": [ + "złoty", + "złote", + "złotych", + "złotego" + ], + "\u20b2": [ + "guarani", + "guarani", + "guarani", + "guarani" + ], + "PYG": [ + "guarani", + "guarani", + "guarani", + "guarani" + ], + "\u0631.\u0642.\u200f": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "QR": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "QAR": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "RON": [ + "lej rumuński", + "leje rumuńskie", + "lei rumuńsich", + "leja rumuńskiego" + ], + "\u0434\u0438\u043d.": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "din.": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "RSD": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "\u20bd.": [ + "rubel rosyjski", + "ruble rosyjskie", + "ruble rosyjskie", + "rubla rosyjskiego" + ], + "RUB": [ + "rubel rosyjski", + "ruble rosyjskie", + "ruble rosyjskie", + "rubla rosyjskiego" + ], + "FR": [ + "frank rwandyjski", + "franki rwandyjskie", + "franków rwandyjskich", + "franka rwandyjskiego" + ], + "RWF": [ + "frank rwandyjski", + "franki rwandyjskie", + "franków rwandyjskich", + "franka rwandyjskiego" + ], + "\u0631.\u0633.\u200f": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SR": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala 
saudyjskiego" + ], + "SAR": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SDG": [ + "funt sudański", + "funty sudańskie", + "funtów sudańskich", + "funta sudańskiego" + ], + "Skr": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "SEK": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "S$": [ + "dolar singapurski", + "dolary singapurskie", + "dolarów singapurskich", + "dolara singapurskiego" + ], + "SGD": [ + "dolar singapurski", + "dolary singapurskie", + "dolarów singapurskich", + "dolara singapurskiego" + ], + "Ssh": [ + "szyling somalijski", + "szylingi somalijskie", + "szylingów somalijskich", + "szylinga somalijskiego" + ], + "SOS": [ + "szyling somalijski", + "szylingi somalijskie", + "szylingów somalijskich", + "szylinga somalijskiego" + ], + "\u0644.\u0633.\u200f": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "SY\u00a3": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "SYP": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "\u0e3f": [ + "bat tajlandzki", + "baty tajlandzkie", + "batów tajlandzkich", + "bata tajlandzkiego" + ], + "THB": [ + "bat tajlandzki", + "baty tajlandzkie", + "batów tajlandzkich", + "bata tajlandzkiego" + ], + "\u062f.\u062a.\u200f": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "DT": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "TND": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "T$": [ + "pa'anga", + "pa'anga", + "pa'anga", + "pa'anga" + ], + "TOP": [ + "pa'anga", + "pa'anga", + "pa'anga", + "pa'anga" + ], + "TL": [ + "lira turecka", + "liry tureckie", + "lir tureckich", + 
"liry tureckiej" + ], + "TRY": [ + "lira turecka", + "liry tureckie", + "lir tureckich", + "liry tureckiej" + ], + "TT$": [ + "dolar Trynidadu i Tobago", + "dolary Trynidadu i Tobago", + "dolarów Trynidadu i Tobago", + "dolara Trynidadu i Tobago" + ], + "TTD": [ + "dolar Trynidadu i Tobago", + "dolary Trynidadu i Tobago", + "dolarów Trynidadu i Tobago", + "dolara Trynidadu i Tobago" + ], + "NT$": [ + "dolar tajwański", + "dolary tajwańskie", + "dolarów tajwańskich", + "dolara tajwańskiego" + ], + "TWD": [ + "dolar tajwański", + "dolary tajwańskie", + "dolarów tajwańskich", + "dolara tajwańskiego" + ], + "TSh": [ + "szyling tanzański", + "szylingi tanzańskie", + "szylingów tanzańskich", + "szylinga tanzańskiego" + ], + "TZS": [ + "szyling tanzański", + "szylingi tanzańskie", + "szylingów tanzańskich", + "szylinga tanzańskiego" + ], + "\u20b4": [ + "hrywna", + "hrywny", + "hrywien", + "hrywny" + ], + "UAH": [ + "hrywna", + "hrywny", + "hrywien", + "hrywny" + ], + "USh": [ + "szyling ugandyjski", + "szylingi ugandyjskie", + "szylingów ugandyjskich", + "szylinga ugandyjskiego" + ], + "UGX": [ + "szyling ugandyjski", + "szylingi ugandyjskie", + "szylingów ugandyjskich", + "szylinga ugandyjskiego" + ], + "$U": [ + "peso urugwajskie", + "peso urugwajskie", + "pesos urugwajskie", + "peso urugwajskiego" + ], + "UYU": [ + "peso urugwajskie", + "peso urugwajskie", + "pesos urugwajskie", + "peso urugwajskiego" + ], + "UZS": [ + "sum", + "sumy", + "sumów", + "suma" + ], + "Bs.F.": [ + "boliwar", + "boliwary", + "boliwarów", + "boliwara" + ], + "VEF": [ + "boliwar", + "boliwary", + "boliwarów", + "boliwara" + ], + "\u20ab": [ + "dong", + "dongi", + "dongów", + "donga" + ], + "VND": [ + "dong", + "dongi", + "dongów", + "donga" + ], + "FCFA": [ + "środkowoafrykański frank CFA", + "środkowoafrykańskie franki CFA", + "środkowoafrykańskich franków CFA", + "środkowoafrykańskiego franka CFA" + ], + "XAF": [ + "środkowoafrykański frank CFA", + "środkowoafrykańskie franki CFA", + 
"środkowoafrykańskich franków CFA", + "środkowoafrykańskiego franka CFA" + ], + "CFA": [ + "frank CFA Afryki Zachodniej", + "franki CFA Afryki Zachodniej", + "franków CFA Afryki Zachodniej", + "franka CFA Afryki Zachodniej" + ], + "XOF": [ + "frank CFA Afryki Zachodniej", + "franki CFA Afryki Zachodniej", + "franków CFA Afryki Zachodniej", + "franka CFA Afryki Zachodniej" + ], + "\u0631.\u064a.\u200f": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "YR": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "YER": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "R": [ + "rand", + "randy", + "randów", + "randa" + ], + "ZAR": [ + "rand", + "randy", + "randów", + "randa" + ], + "ZK": [ + "kwacha zambijska", + "kwacha zambijskie", + "kwacha zambijskich", + "kwacha zambijskiego" + ], + "ZMK": [ + "kwacha zambijska", + "kwacha zambijskie", + "kwacha zambijskich", + "kwacha zambijskiego" + ], + "ZWL$": [ + "dolar Zimbabwe", + "dolary Zimbabwe", + "dolarów Zimbabwe", + "dolara Zimbabwe" + ], + "ZWL": [ + "dolar Zimbabwe", + "dolary Zimbabwe", + "dolarów Zimbabwe", + "dolara Zimbabwe" + ] +} \ No newline at end of file diff --git a/data/numbers.json b/data/numbers.json new file mode 100644 index 0000000..65cae97 --- /dev/null +++ b/data/numbers.json @@ -0,0 +1,116 @@ +{ + "number_words": { + "0": "zero", + "1": "jeden", + "2": "dwa", + "3": "trzy", + "4": "cztery", + "5": "pięć", + "6": "sześć", + "7": "siedem", + "8": "osiem", + "9": "dziewięć", + "10": "dziesięć", + "11": "jedenaście", + "12": "dwanaście", + "13": "trzynaście", + "14": "czternaście", + "15": "piętnaście", + "16": "szesnaście", + "17": "siedemnaście", + "18": "osiemnaście", + "19": "dziewiętnaście", + "20": "dwadzieścia", + "30": "trzydzieści", + "40": "czterdzieści", + "50": "pięćdziesiąt", + "60": "sześćdziesiąt", + "70": "siedemdziesiąt", + "80": "osiemdziesiąt", + "90": 
"dziewięćdziesiąt", + "100": "sto", + "200": "dwieście", + "300": "trzysta", + "400": "czterysta", + "500": "pięćset", + "600": "sześćset", + "700": "siedemset", + "800": "osiemset", + "900": "dziewięćset" + }, + "ordinal_number_words": { + "0": "zerowy", + "1": "pierwszy", + "2": "drugi", + "3": "trzeci", + "4": "czwarty", + "5": "piąty", + "6": "szósty", + "7": "siódmy", + "8": "ósmy", + "9": "dziewiąty", + "10": "dziesiąty", + "11": "jedenasty", + "12": "dwunasty", + "13": "trzynasty", + "14": "czternasty", + "15": "piętnasty", + "16": "szesnasty", + "17": "siedemnasty", + "18": "osiemnasty", + "19": "dziewiętnasty", + "20": "dwudziesty", + "30": "trzydziesty", + "40": "czterdziesty", + "50": "pięćdziesiąty", + "60": "sześćdziesiąty", + "70": "siedemdziesiąty", + "80": "osiemdziesiąty", + "90": "dziewięćdziesiąty", + "100": "setny", + "200": "dwusetny", + "300": "trzechsetny", + "400": "czterechsetny", + "500": "pięćsetny", + "600": "sześćsetny", + "700": "siedemsetny", + "800": "osiemsetny", + "900": "dziewięćsetny" + }, + "large_numbers": { + "3": "tysiąc", + "6": "milion", + "9": "miliard", + "12": "bilion", + "15": "biliard", + "18": "trylion", + "21": "tryliard", + "24": "kwadrylion", + "27": "kwadryliard", + "30": "kwintylion", + "33": "kwintyliard", + "36": "sekstylion", + "39": "sekstyliard", + "42": "septylion", + "45": "septyliard", + "48": "oktylion", + "51": "oktyliard", + "54": "nonilion", + "57": "noniliard", + "60": "decylion", + "63": "decyliard", + "66": "undecylion", + "69": "undecyliard", + "72": "duodecylion", + "75": "duodecyliard", + "100": "googol", + "600": "centylion", + "603": "centyliard" + }, + "ordinal_large_numbers": { + "3": "tysięczny", + "6": "milionowy", + "9": "miliardowy", + "12": "bilionowy" + } +} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ad8f8a6..98462cc 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,16 @@ version: '3' services: - tokenizer: + wordifier: 
class CCLHandler:
    """Implements reading ccl for anonymizer service."""

    def __init__(self, ccl_file_name):
        """Initialize CCLHandler with a filename.

        Args:
            ccl_file_name (str): Path to the input CCL (XML) file.

        """
        self._file_name = ccl_file_name

    def process(self, output_file, unmarshallers):
        """Process xml tags using unmarshallers and save in output_file.

        Args:
            output_file (str): Path of the file to write results to.
            unmarshallers (dict): Maps an XML tag name to a callable that
                receives the parsed element and returns the text to write.
                Tags without an entry are skipped.

        """
        # NOTE: the previous version bound the open output handle to a name
        # that shadowed the `output_file` parameter; use distinct names.
        with open(self._file_name, 'r', encoding='utf-8') as source, \
                open(output_file, 'w', encoding='utf-8') as sink:
            # iterparse streams 'end' events, so large documents are not
            # held fully in memory; elem.clear() frees processed subtrees.
            for _event, elem in iterparse(source):
                unmarshal = unmarshallers.get(elem.tag, None)
                if unmarshal:
                    sink.write(unmarshal(elem))
                elem.clear()
"""Module for converting dates to words."""
from babel import Locale

from src.num2words import num2words

# Only matches whose first tag carries this number:case:gender pattern
# trigger tag-aware ordinal inflection in date2words().
date_tags = ['sg:gen:m3']


def check_none(token):
    """If token is none then convert to empty list otherwise return token."""
    if not token:
        return []
    return token


def month_name_expansion(month):
    """Expand month abbreviation or change form.

    Args:
        month (str): Month abbreviation or full name.

    Returns:
        str: Full month name in genitive case.

    """
    # Polish abbreviated month names are exactly three letters long.
    abbr = len(month) == 3
    locale = Locale('pl')
    month = month.lower()

    if abbr:
        months = locale.months['format']['abbreviated']
        # Babel month tables are keyed 1..12, hence the +1.
        index = list(months.values()).index(month) + 1
        month = locale.months['format']['wide'][index]
    else:
        # Look the name up in both babel tables (genitive 'format' and
        # nominative 'stand-alone') and map it to the genitive form.
        # Loop variable renamed: the original shadowed the builtin `format`.
        for fmt in ['format', 'stand-alone']:
            if month in list(locale.months[fmt]['wide'].values()):
                months = locale.months[fmt]['wide']
                index = list(months.values()).index(month) + 1
                month = locale.months['format']['wide'][index]
    return month


def date2words(date_match, tags=None):
    """Convert a date to list of words.

    Args:
        date_match (re.Match): Date match produced by the wordifier's date
            regex (named groups such as 'day_month1', 'year1', 'punct1'...).
        tags (list of str, optional): Morphological tags; only the first
            one is inspected. Defaults to None.

    Returns:
        list of str: List of words representing date.

    """
    # Use the supplied tag only when its number:case:gender part is one of
    # the whitelisted date patterns; otherwise inflect with the default.
    if tags and ":".join(tags[0].split(":")[1:4]) in date_tags:
        corrected_tag = tags[0]
    else:
        corrected_tag = None
    if date_match['day_or_month_year']:
        # Forms like DD.MM.YYYY (day and month are ambiguous).
        day_month1 = num2words(date_match['day_month1'], corrected_tag,
                               ordinal=True)
        day_month2 = num2words(date_match['day_month2'], corrected_tag,
                               ordinal=True)
        year = num2words(date_match['year1'], corrected_tag, ordinal=True)

        # split punctuation into single characters and remove if None
        date_order = [day_month1, *check_none(date_match['punct1']),
                      day_month2, *check_none(date_match['punct2']), year]
    elif date_match['year_month_or_day']:
        # Forms like YYYY.MM.DD.
        day_month3 = num2words(date_match['day_month3'], ordinal=True)
        day_month4 = num2words(date_match['day_month4'], ordinal=True)
        year = num2words(date_match['year2'], ordinal=True)

        # split punctuation into single characters and remove if None
        date_order = [year, *check_none(date_match['punct3']), day_month3,
                      *check_none(date_match['punct4']), day_month4]
    elif date_match['month_in_words']:
        # Forms with a spelled-out month name, optionally with day/year.
        day = date_match['day1']
        if date_match['day2']:
            day = date_match['day2']
        if day:
            day = num2words(day, corrected_tag, ordinal=True)

        year = ''
        if date_match['year3']:
            year = num2words(date_match['year3'], corrected_tag, ordinal=True)
        if date_match['year4']:
            year = num2words(date_match['year4'], corrected_tag, ordinal=True)

        if not day and not year:
            # A bare month name needs no conversion.
            return [date_match['month']]
        else:
            month = month_name_expansion(date_match['month'])

            # split punctuation into single characters and remove if None
            if date_match['day2']:
                date_order = [month, *check_none(date_match['punct7']),
                              day, *check_none(date_match['punct8'])]
            elif date_match['day1']:
                date_order = [day, *check_none(date_match['punct5']),
                              month, *check_none(date_match['punct6'])]
            else:
                date_order = [month]
            if year:
                date_order = date_order + [year]
            # Normalize any falsy entries (unmatched groups) to ''.
            date_order = list(map(lambda x: x if x else '', date_order))
    else:
        date_order = ['']
    return date_order
"""Module for converting numbers to words."""
import math
import json

from src.utils import get_word_form, trailing_zeros

# Word tables are loaded once at import time; JSON keys are strings, so
# they are converted to int for direct numeric lookup.
with open('data/numbers.json', 'r') as numbers_file:
    numbers_dict = json.load(numbers_file)
    # Cardinal words: units, teens, tens and hundreds (e.g. 2 -> "dwa").
    number_words = {int(k): v for k, v in numbers_dict['number_words'].items()}
    # Ordinal words for the same keys (e.g. 2 -> "drugi").
    ordinal_number_words = {int(k): v for k, v
                            in numbers_dict['ordinal_number_words'].items()}
    # Scale words keyed by power of ten (3 -> "tysiąc", 6 -> "milion", ...).
    large_numbers = {int(k): v for k, v
                     in numbers_dict['large_numbers'].items()}
    # Ordinal scale words (3 -> "tysięczny", ...).
    ordinal_large_numbers = {int(k): v for k, v
                             in numbers_dict['ordinal_large_numbers'].items()}


def three_digit_to_words(text, tag='', ordinal=False):
    """Convert a number in range 0-999 to words with given tag.

    Util function used by :func:`num2words` for each three-digit group.

    Args:
        text (str or int): Number with at most three digits.
        tag (str, optional): Morphological tag forwarded to
            ``get_word_form``. Defaults to ''.
        ordinal (bool, optional): Use ordinal word forms.
            Defaults to False.

    Returns:
        str: The number spelled out, e.g. 123 -> "sto dwadzieścia trzy".

    """
    map_to_words = ordinal_number_words if ordinal else number_words

    number = int(text)
    if number == 0:
        return get_word_form(map_to_words[number], tag)
    words = []
    units = number % 10
    tens = number % 100 - units
    hundredths = number // 100
    # 1-20 have dedicated words; above that, units and tens are separate.
    if 0 < tens + units <= 20:
        word = get_word_form(map_to_words[tens + units], tag)
        words.append(word)
    else:
        if units != 0:
            words.append(get_word_form(map_to_words[units], tag))
        if tens != 0:
            words.append(get_word_form(map_to_words[tens], tag))

    if hundredths != 0:
        if tens == 0 and units == 0:
            # A round hundred carries the requested tag (e.g. "setny").
            words.append(get_word_form(map_to_words[hundredths * 100], tag))
        else:
            # Otherwise the hundreds word stays cardinal and uninflected.
            words.append(get_word_form(number_words[hundredths * 100], ''))

    # Words were collected lowest-order first; reverse for reading order.
    return ' '.join(reversed(words))


def num2words(text, tag='', ordinal=False):
    """Converts a number to words.

    Args:
        text (str): Number as a string of digits (any length covered by
            the loaded scale-word tables).
        tag (str, optional): Morphological tag. Defaults to ''.
        ordinal (bool, optional): If word should be derived from ordinal
            number. Defaults to False.

    Returns:
        str: Returns number as words with given tag.

    """
    i = 0
    words = []
    number = int(text)

    if ordinal:
        # Round numbers such as 2000 become one compound ordinal
        # ("dwutysięczny") instead of a word-by-word conversion.
        zeros = trailing_zeros(number)
        zeros = 3 * math.floor(zeros / 3)
        if zeros > 2 and 0 < len(text) - zeros <= 3:
            number = number // 10 ** zeros
            if number == 1:
                prefix = ''
            else:
                # 'numcomp' yields the compound prefix form (e.g. "dwu").
                prefix = three_digit_to_words(str(number), 'numcomp')
            return prefix + get_word_form(ordinal_large_numbers[zeros], tag)

    if len(text) <= 3 or number == 0:
        return three_digit_to_words(text, tag, ordinal)

    # Process the number in groups of three digits, lowest-order first;
    # ``i`` is the power of ten of the current group.
    while number > 0:
        remainder = number % 1000
        if i == 0:
            triple = three_digit_to_words(remainder, tag, ordinal)
        else:
            triple = three_digit_to_words(remainder)
        number = number // 1000
        if remainder == 0 and number != 0:
            # Empty middle group: emit nothing for it.
            i += 3
            continue

        if i == 0:
            words.append(triple)
        else:
            # Choose the scale word's grammatical number/case. BUGFIX:
            # teens (12-14) must NOT take the 2-4 plural form — Polish
            # uses the genitive there ("dwanaście tysięcy", not
            # "dwanaście tysiące").
            if remainder == 1:
                tag = 'subst:sg:nom:m3'
            elif (remainder % 10 in [2, 3, 4]
                    and remainder % 100 not in [12, 13, 14]):
                tag = 'subst:pl:nom:m3'
            else:
                tag = 'subst:pl:gen:m3'
            form = get_word_form(large_numbers[i], tag)
            if remainder == 1:
                # "jeden tysiąc" is shortened to just "tysiąc".
                words.append(form)
            else:
                words.append(triple + ' ' + form)
        i += 3
    return ' '.join(list(reversed(words)))
"""Module for useful functions."""
from enum import Enum


class TokenType(Enum):
    """Type of token."""

    NUMBER = 1
    SPECIAL_CHARACTER = 2
    PUNCTUATION = 3
    CURRENCY = 4


class NumberPlural(Enum):
    """Type of number indicating what the word suffix will be.

    E.g:
    SINGULAR 1$ - jeden dolar
    SEVERAL (2-4) 2$ - dwa dolary
    MANY (5+) 7$ - siedem dolarów
    """

    SINGULAR = 0
    SEVERAL = 1
    MANY = 2


def to_number_plural(number):
    """Convert a number to enumerate type, that indicates word suffix.

    Args:
        number (int or string): Number to be converted.

    Returns:
        NumberPlural: Enumerate, which indicates what the end of the word
            will be.

    """
    number = int(number)
    if number == 1:
        return NumberPlural.SINGULAR
    # BUGFIX: Polish uses the 2-4 plural form for every number ending in
    # 2-4 except the teens, e.g. 22 -> "dwa dolary" (SEVERAL) while
    # 12 -> "dwanaście dolarów" (MANY). The previous check only covered
    # the literal values 2-4.
    elif number % 10 in (2, 3, 4) and number % 100 not in (12, 13, 14):
        return NumberPlural.SEVERAL
    else:
        return NumberPlural.MANY


def is_simple_number(tokens, special_types):
    """Checks if list of tokens creates a simple number.

    Simple number contains only digits and spaces between groups of three.

    Args:
        tokens (list): List of tokens.
        special_types (list): Types of tokens (parallel to `tokens`).

    Returns:
        bool: Return True if joined tokens are simple number otherwise False.

    """
    numbers = [n for i, n in enumerate(tokens)
               if special_types[i] == TokenType.NUMBER]
    # Every group after the first must be exactly three digits, and the
    # whole sequence may contain only digits and single spaces.
    return (all([len(t) == 3 for t in numbers[1:]]) and
            all([(s.isdigit() or s == ' ') for s in tokens]))


def is_fraction(tokens, decimal=False):
    """Check is list of tokens are 2 numbers splitted by slash or dot.

    Args:
        tokens (list): List of tokens.
        decimal (bool, optional): If True delimiter is dot otherwise slash '/'.
            Defaults to False.

    Returns:
        bool: Return True if tokens are fraction otherwise False.

    """
    if len(tokens) < 3:
        return False
    delimiter = '.' if decimal else '/'
    joined = ''.join(tokens).split(delimiter)
    # Exactly one delimiter occurrence splitting two digit groups.
    return ((len(joined) == 2) and
            tokens.count(delimiter) == 1 and
            all([(s.isdigit() or s in ' /.') for s in tokens]))


def trailing_zeros(number):
    """Count trailing zeros in number.

    Returns:
        int: Return number of trailing zeros.

    """
    manipulandum = str(number)
    return len(manipulandum) - len(manipulandum.rstrip('0'))


def search_form(forms, tag):
    """Search for the correct form of word from all those returned by Morfeusz.

    Args:
        forms (list of tuples): Tags and variations of words returned
            by Morfeusz.
        tag (list of str): Split tag of the word whose form is searched for.

    Returns:
        str: Word properly conjugated with the given tag or None if not found.

    """
    for form in forms:
        # Each Morfeusz category may hold dot-separated alternatives.
        form_categories = [x.split('.') for x in form[2].split(':')]
        gramm_categ_enum = enumerate(tag)
        if all((c in form_categories[i] for i, c in gramm_categ_enum)):
            return form[0]
    return None


# Cached analyzer instance: creating a Morfeusz object is expensive, so it
# is built lazily on first use and reused afterwards.
_MORFEUSZ = None


def _get_morfeusz():
    """Lazily create and cache a single Morfeusz generator instance."""
    global _MORFEUSZ
    if _MORFEUSZ is None:
        # Imported on demand so the pure helpers in this module remain
        # usable without the native morfeusz2 dependency installed.
        import morfeusz2
        _MORFEUSZ = morfeusz2.Morfeusz()
    return _MORFEUSZ


def get_word_form(text, tag):
    """Change the word in the appropriate form with given morphological tag.

    Args:
        text (str): Word to be changed.
        tag (str): Morphological tag.

    Returns:
        str: Word changed with given morphological tag, or `text`
            unchanged when no matching form exists.

    """
    if not tag:
        return text

    morf = _get_morfeusz()
    all_forms = morf.generate(text)

    tag = tag.split(':')
    # Keep only forms whose part of speech matches the tag's first field.
    forms = [x for x in all_forms if x[2].split(':')[0] == tag[0]]
    form = search_form(forms, tag)

    if form:
        return form
    # Retry with a truncated tag (drop trailing categories) before
    # falling back to the unmodified word.
    if len(tag) > 4:
        tag = tag[:4]
        form = search_form(forms, tag)

    if form:
        return form
    else:
        return text


def subtract_from_first(list_of_tuples, offset):
    """Subtract `offset` from the first element of the given tuple.

    A new tuple with the adjusted first element is returned; the
    remaining elements are kept as-is.
    """
    list_of_tuples = (list_of_tuples[0] - offset, *list_of_tuples[1:])
    return list_of_tuples


def check_and_replace(string_builder, find, replace, filtered_tokens):
    """Check for matches in list and replace them with given tokens.

    Remove replaced tokens from `filtered_tokens` to avoid double
    processing. Note that `find` and `replace` are consumed (mutated)
    as matches are found.

    Args:
        string_builder (list of str): List of all words.
        find (list of str): Tokens to be replaced.
        replace (list of list of str): Words that will replace `find`
            tokens in `string_builder`.
        filtered_tokens (list of tuples): List of tokens and their features.

    Returns:
        (list of str, list of tuples): Pair: list of words with replaced
        matched tokens and filtered list of tokens and their features with
        deleted items that have been replaced.

    """
    if not find or not replace:
        return string_builder, filtered_tokens

    new_builder = string_builder.copy()
    max_length = max(map(len, find))
    for i, token in enumerate(string_builder):
        if not find:
            break
        to_remove = [i]
        check = token
        j = i + 1
        if check in find:
            # Single-token match: replace in place.
            index = find.index(check)
            new_builder[i] = ''.join(replace[index])
            filtered_tokens = list(filter(lambda x: x[0] != i,
                                          filtered_tokens))
            # BUGFIX: remove the matched pair, not unconditionally the
            # first one (the multi-token branch below already did this
            # correctly with pop(index)).
            del find[index], replace[index]
            continue
        # NOTE(review): this prefix test only compares against find[0];
        # with several patterns starting with different characters some
        # multi-token matches may be skipped — confirm intended.
        if check[0] != find[0][:len(check[0])]:
            continue
        # Grow the candidate by consuming following tokens until it is as
        # long as the longest pattern.
        while len(check) < max_length and j < len(string_builder):
            check += string_builder[j]
            to_remove.append(j)
            if check in find:
                index = find.index(check)
                new_builder = new_builder[:i] + replace[index]
                if j + 1 < len(string_builder):
                    new_builder += string_builder[j + 1:]
                filtered_tokens = list(filter(lambda x: x[0] not in to_remove,
                                              filtered_tokens))
                find.pop(index)
                replace.pop(index)
                if not find:
                    return new_builder, filtered_tokens
            j += 1
    return new_builder, filtered_tokens
'20': 'dwadzieścia', - '30': 'trzydzieści', - '40': 'czterdzieści', - '50': 'pięćdziesiąt', - '60': 'sześćdziesiąt', - '70': 'siedemdziesiąt', - '80': 'osiemdziesiąt', - '90': 'dziewięćdziesiąt' + """Class for generating words from special characters or numbers.""" + + date_regex = re.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' + r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' + r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)' + r'|Gru(?:|dzie[nń]|dnia))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I + ) + decimal_fraction_regex = re.compile(r'\d+[ ]?(\.)[ ]?\d+') + + number_punctuation = ' .,' + following_type = { + TokenType.NUMBER: [TokenType.NUMBER, TokenType.SPECIAL_CHARACTER, + TokenType.CURRENCY], + TokenType.SPECIAL_CHARACTER: [TokenType.SPECIAL_CHARACTER, + TokenType.NUMBER], + TokenType.CURRENCY: [] + } + + _denominator_tag = { + NumberPlural.SINGULAR: { + 'default': 'adj:sg:nom:f', + ('acc', 'dat', 'gen', 'loc'): { + ('f'): 'adj:sg:acc:f' + } }, - { - '1': 'sto', - '2': 'dwieście', - '3': 'trzysta', - '4': 'czterysta', - '5': 'pięćset', - '6': 'sześćset', - '7': 'siedemset', - '8': 'osiemset', - '9': 'dziewięćset' + NumberPlural.SEVERAL: { + 'default': 'adj:pl:acc:f', + ('dat'): { + ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f' + }, + ('gen', 'loc'): { + ('m1', 'm2', 
'm3', 'f', 'n'): 'adj:pl:acc:m1' + }, + ('nom', 'voc'): { + ('m1'): 'adj:pl:acc:m1' + } }, - { - 3: 'tysiąc', - 6: 'milion', - 9: 'miliard', - 12: 'bilion', - 15: 'biliard', - 18: 'trylion', - 21: 'tryliard', - 24: 'kwadrylion', - 27: 'kwadryliard', - 30: 'kwintylion', - 33: 'kwintyliard', - 36: 'sekstylion', - 39: 'sekstyliard', - 42: 'septylion', - 45: 'septyliard', - 48: 'oktylion', - 51: 'oktyliard', - 54: 'nonilion', - 57: 'noniliard', - 60: 'decylion', - 63: 'decyliard', - 66: 'undecylion', - 69: 'undecyliard', - 72: 'duodecylion', - 75: 'duodecyliard', - 100: 'googol', - 600: 'centylion', - 603: 'centyliard' + NumberPlural.MANY: { + 'default': 'adj:pl:acc:m1', + ('acc', 'nom', 'voc'): { + ('m1'): 'adj:sg:dat:f' + }, + ('gen', 'dat', 'inst', 'loc'): { + ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f' + } } - ] - - _adj_list = [ - { - '0': 'zerowy', - '1': 'pierwszy', - '2': 'drugi', - '3': 'trzeci', - '4': 'czwarty', - '5': 'piąty', - '6': 'szósty', - '7': 'siódmy', - '8': 'ósmy', - '9': 'dziewiąty' - }, - { - '10': 'dziesiąty', - '11': 'jedenasty', - '12': 'dwunasty', - '13': 'trzynasty', - '14': 'czternasty', - '15': 'piętnasty', - '16': 'szesnasty', - '17': 'siedemnasty', - '18': 'osiemnasty', - '19': 'dziewiętnasty', - '20': 'dwudziesty', - '30': 'trzydziesty', - '40': 'czterdziesty', - '50': 'pięćdziesiąty', - '60': 'sześćdziesiąty', - '70': 'siedemdziesiąty', - '80': 'osiemdziesiąty', - '90': 'dziewięćdziesiąty' - }, - { - '1': 'setny', - '2': 'dwusetny', - '3': 'trzechsetny', - '4': 'czterechsetny', - '5': 'pięćsetny', - '6': 'sześćsetny', - '7': 'siedemsetny', - '8': 'osiemsetny', - '9': 'dziewięćsetny' - }, - { - 3: 'tysięczny', - 6: 'milionowy', - 9: 'miliardowy', - 12: 'bilionowy' - } - ] - - _script_translator = [ - { - '\u2070': '0', # SUPERSCRIPT ZERO - '\u00B9': '1', # SUPERSCRIPT ONE - '\u00B2': '2', # SUPERSCRIPT TWO - '\u00B3': '3', # SUPERSCRIPT THREE - '\u2074': '4', # SUPERSCRIPT FOUR - '\u2075': '5', # SUPERSCRIPT FIVE - '\u2076': '6', # 
SUPERSCRIPT SIX - '\u2077': '7', # SUPERSCRIPT SEVEN - '\u2078': '8', # SUPERSCRIPT EIGHT - '\u2079': '9', # SUPERSCRIPT NINE - }, - { - '\u2080': '0', # SUBSCRIPT ZERO - '\u2081': '1', # SUBSCRIPT ONE - '\u2082': '2', # SUBSCRIPT TWO - '\u2083': '3', # SUBSCRIPT THREE - '\u2084': '4', # SUBSCRIPT FOUR - '\u2085': '5', # SUBSCRIPT FIVE - '\u2086': '6', # SUBSCRIPT SIX - '\u2087': '7', # SUBSCRIPT SEVEN - '\u2088': '8', # SUBSCRIPT EIGHT - '\u2089': '9' # SUBSCRIPT NINE - }, - { - '\u00BC': '1/4', # VULGAR FRACTION ONE QUARTER - '\u00BD': '1/2', # VULGAR FRACTION ONE HALF - '\u00BE': '3/4', # VULGAR FRACTION THREE QUARTERS - } - ] + } + + special_character_numbers_map = { + '+': 'plus', + '-': 'minus', + '/': 'przez', + '*': 'razy', + '%': 'procent', + '&': 'ampersand', + '=': 'równa się', + '^': 'do potęgi', + '#': 'numer' + } + special_character_map = { + '+': 'plus', + '-': '-', + '/': 'ukośnik', + '%': 'procent', + '&': 'i', + '=': 'równa się', + '^': 'kareta', + '#': 'kratka' + } def __init__(self): """Class initialization.""" - self._morf = morfeusz2.Morfeusz() self.unmarshallers = { 'chunk': lambda *args: '\n', 'sentence': lambda *args: self._process_sent_tree(*args), } - self._one_dict = dict() - self._create_one_dict() - self._special_list = [] - self._special_dict = { - 'number': - lambda *args: self._get_number(*args), - 'superscript': - lambda *args: self._get_superscript(*args), - 'subscript': - lambda *args: self._get_subscript(*args), - '/': - lambda *args: '/', - 'number/': - lambda *args: self._get_number_slash(*args), - 'number/subscript': - lambda *args: self._get_number_sub_fraction(*args), - 'number/number': - lambda *args: self._get_number_number_fraction(*args), - 'superscript/': - lambda *args: self._get_superscript_slash(*args), - 'superscript/subscript': - lambda *args: self._get_script_fraction(*args), - 'superscript/number': - lambda *args: self._get_super_number_fraction(*args), - 'fraction': - lambda *args: self._get_fraction(*args), - 
'scientific': - lambda *args: self._get_scientific(*args), - 'dot': - lambda *args: self._get_dot(*args), - '^': - lambda *args: '^', - 'number^': - lambda *args: self._get_number_hat(*args), - 'number^number': - lambda *args: self._get_number_to_number(*args), - 'number^superscript': - lambda *args: self._get_number_to_super(*args) - } - - def _create_one_dict(self): - for word in self._morf.generate('jeden'): - self._one_dict[word[0]] = True + with open('data/currencies.json', 'r') as currency_file: + self._currencies = json.load(currency_file) + self._wordify_tokens = [] def _process_sent_tree(self, sentence_subtree): string_builder = [] + tags = [] tok_id = 0 for elem in sentence_subtree: if elem.tag == 'tok': - tok = self._process_single_tok(tok_id, elem) - string_builder.append(tok) + token, tag = self._process_single_tok(tok_id, elem) + string_builder.append(token) string_builder.append(' ') + tags.append(tag) tok_id += 2 elif elem.tag == 'ns': tok_id -= 1 string_builder.pop() else: raise Exception('Unrecognized tag inside sentence: ' + elem.tag) - return self._process_sentence(string_builder) + return self._process_sentence(string_builder, tags) + + def _get_denominator_tag(self, nominator_plural, nom_case, nom_gender=None): + if nom_case == 'default' or nom_gender is None: + return self._denominator_tag[nominator_plural]['default'] + + for cases, value in self._denominator_tag[nominator_plural].items(): + if cases == 'default': + continue + if nom_case in cases: + for genders, tag in value.items(): + if nom_gender in genders: + return tag + return self._denominator_tag[nominator_plural]['default'] def _special_type(self, text): - if text == '/' or text == '\u002F': - return '/' - elif re.match(r'\d+\.\d+', text): - return 'dot' - elif re.match(r'\d+\^\d+', text): - return 'scientific' - elif text == r'^': - return '^' - elif all(char in self._script_translator[0] for char in text): - return 'superscript' - elif all(char in self._script_translator[1] for 
char in text): - return 'subscript' - elif all(char in self._script_translator[2] for char in text): - return 'fraction' + if text in self.special_character_map: + return TokenType.SPECIAL_CHARACTER + elif text in self._currencies: + return TokenType.CURRENCY elif text.isdigit(): - return 'number' + return TokenType.NUMBER return None def _process_single_tok(self, tok_id, tok_subtree): @@ -261,185 +155,7 @@ class Wordifier: elif elem.tag == 'lex': tag = self._process_lex(elem) word = self._process_word(tok_id, text, tag) - return word - - def _return_large_part(self, num, digit, tag=None, word_text=None): - if word_text: - last_word = word_text.split(' ')[-1] - if last_word == 'dwa' \ - or last_word == 'trzy' \ - or last_word == 'cztery': - tag = 'subst:pl:nom:m3' - else: - tag = 'subst:pl:gen:m3' - return self._return_number(num, 3, digit, tag, digit) - return self._return_number(num, 3, digit, tag) - - def _return_number(self, num, pos, digit, tag=None, key=None): - if tag: - return self._get_correct_form( - text=self._num_list[pos][digit], - tag=tag, - key=key - ) if num else self._get_correct_form( - text=self._adj_list[pos][digit], - tag=tag, - key=key - ) - return self._num_list[pos][digit] if num else self._adj_list[pos][digit] - - def _handle_two_digits(self, from_, to_, num, text, tag=None): - text = text[from_:to_] - if len(text) >= 2: - if text[-2] == '0': - return self._return_number(num, 0, text[-1], tag) - elif text[-2] == '1': - return self._return_number(num, 1, text[-2:], tag) - else: - if text[-1] == '0': - return self._return_number(num, 1, text[-2:], tag) - return self._return_number(num, 1, text[-2] + '0', tag) +\ - ' ' + self._return_number(num, 0, text[-1], tag) - elif len(text) >= 1: - return self._return_number(num, 0, text[-1], tag) - else: - return '' - - def _handle_three_digits(self, from_, to_, num, text, tag=None): - text = text[from_:to_] - if len(text) >= 3: - string = '' - if len(text) > 3: - string = ' ' - if text[-3] != '0': - if 
text[-2:] == '00': - return string + self._return_number(num, 2, text[-3], tag) - return string + self._return_number(num, 2, text[-3], tag) \ - + ' ' + self._handle_two_digits(from_, to_, num, text, tag) - if len(text) >= 2 and text[-2:] == '00': - return self._return_number(num, 0, text[-1], tag) - return self._handle_two_digits(from_, to_, num, text, tag) - - def _replace_correct_from(self, key, tag, base): - flex = self._get_correct_form(base, tag).lstrip(base) - return self._num_list[3][key] + flex - - def _handle_numbers(self, text, tag=None, word_text='', num=None): - if num is None and tag: - tag_list = tag.split(':') - num = not (tag_list[0] == 'adj' or tag_list[0] == 'subst') - elif num is None: - num = True - length = len(text) - if length <= 3: - word_text = word_text + self._handle_three_digits( - 0, - None, - num, - text, - tag - ) - elif length == 0: - raise Exception('Fragment recognized as number is empty!') - else: - new_text = text - digits = len(new_text) - int(len(new_text) / 3) * 3 - if digits != 0: - new_word = self._handle_two_digits( - 0, - digits, - num, - new_text, - tag - ) - if new_word in self._one_dict: - word_text = self._return_large_part( - num, - int(len(new_text) / 3) * 3, - tag, - None - ) - else: - word_text = new_word + ' ' + self._return_large_part( - num, - int(len(new_text) / 3) * 3, - tag, - new_word - ) - new_text = new_text[digits:] - if len(new_text.rstrip('0')) != 0: - word_text += ' ' - else: - return word_text - for k in reversed(range(0, int(len(new_text) / 3))): - key = k * 3 - new_word = self._handle_three_digits(0, 3, num, new_text, tag) - if new_word in self._one_dict and key != 0: - word_text += self._return_large_part(num, key, tag, None) - else: - word_text += new_word - if key != 0: - word_text += ' ' + self._return_large_part( - num, - key, - tag, - word_text - ) - new_text = new_text[3:] - if len(new_text.rstrip('0')) == 0: - return word_text - word_text += ' ' - return word_text - - def 
_replace_using(self, key, word_text, tag, base): - text_split = word_text.split(' ') - text_split[-1] = self._replace_correct_from(key, tag, base) - return ' '.join(text_split) - - def _correct_large_number(self, num, word_text, tag, key): - trailing_zeros = key - if not num and trailing_zeros >= 12: - if int(trailing_zeros / 3) * 2 != int(trailing_zeros / 6): - word_text = self._replace_using( - key, - word_text, - tag, - 'miliardowy') - else: - word_text = self._replace_using( - key, - word_text, - tag, - 'bilionowy' - ) - elif num and trailing_zeros >= 27: - if int(trailing_zeros / 3) * 2 != int(trailing_zeros / 6): - word_text = self._replace_using(key, word_text, tag, 'biliard') - elif trailing_zeros >= 54: - word_text = self._replace_using(key, word_text, tag, 'bilion') - else: - word_text = self._get_correct_form(word_text, tag) - else: - word_text = self._get_correct_form(word_text, tag) - return word_text - - def _get_correct_form(self, text, tag, key=None): - if tag is None: - return text - if key: - return self._correct_large_number(True, text, tag, key) - text_split = text.split(' ') - generated = self._morf.generate(text_split[-1]) - for form in generated: - is_correct = True - form_tag = form[2].split(':') - for i, t in enumerate(tag.split(':')): - if t not in form_tag[i].split('.'): - is_correct = False - break - if is_correct: - text_split[-1] = form[0] - return ' '.join(text_split) + return word, tag def _process_word(self, tok_id, text, tag): self._add_special(tok_id, text, tag) @@ -448,7 +164,7 @@ class Wordifier: def _add_special(self, tok_id, text, tag): s_type = self._special_type(text) if s_type: - self._special_list.append((tok_id, text, tag, s_type)) + self._wordify_tokens.append((tok_id, text, tag, s_type)) return text def _process_lex(self, lex_subtree): @@ -462,401 +178,290 @@ class Wordifier: raise Exception('Lex tag had no ctag inside!') return tag - def _get_number(self, string_builder, id_, tag, length): - if length > 1: - words = '' 
- j = length - i = 0 - while j > 0: - if string_builder[id_ + i] != ' ': - j -= 1 - i -= 1 - i += 1 - for j in range(0, length): - if string_builder[id_ + i] == ' ': - i += 1 - if len(string_builder[id_ + i]) <= 3: - if all(len(elem) == 3 or elem == ' ' for elem - in string_builder[id_ + i:id_ + 1]): - return words + self._handle_numbers( - text=''.join( - string_builder[id_ + i:id_ + 1]) - .replace(' ', ''), - tag=tag - ) - words += self._handle_numbers( - text=string_builder[id_ + i], - tag=tag - ) + ' ' - i += 1 - return words.rstrip() + def _handle_fraction(self, tokens, tags): + """Generate words from fraction splitted by slash '/'. + + Args: + tokens (list of str): List that contains numbers separated by + slash '/'. + + Returns: + str: Fraction as words. + + """ + text = ''.join(tokens) + numerator, denominator = text.split('/') + tag_num = tags[0] + remainder = to_number_plural(int(numerator) % 10) + + tag_case, tag_gender = tag_num.split(':')[2:4] + tag_den = self._get_denominator_tag(remainder, tag_case, tag_gender) + + zeros = trailing_zeros(denominator) + if len(denominator) < 4 or \ + (zeros > 2 and 0 < len(denominator) - zeros <= 3): + return num2words(numerator, tag_num) + ' ' + \ + num2words(denominator, tag_den, True) else: - return self._handle_numbers(text=string_builder[id_], tag=tag) - - def _get_superscript(self, string_builder, id_, tag, length): - words = '' - i = 1 - length - new_text = [] - for j in range(0, length): - if string_builder[id_ + i] == ' ': - i += 1 - words += self._handle_numbers( - text=''.join(new_text), - tag=tag - ) + ' ' - for char in string_builder[id_ + i]: - new_text.append(self._script_translator[0][char]) - i += 1 - words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' - return words.rstrip() - - def _get_subscript(self, string_builder, id_, tag, length): - words = '' - i = 1 - length - for j in range(0, length): - if string_builder[id_ + i] == ' ': + return num2words(numerator) + ' przez ' + \ + 
num2words(denominator) + + def _handle_decimal_fraction(self, tokens): + """Generate words from decimal fraction splitted by dot. + + Args: + tokens (list of str): List that contains numbers separated by dot. + + Returns: + str: Decimal fraction as words. + + """ + text = ''.join(tokens) + number, numerator = text.split('.') + number = number.replace(' ', '') + tag_num = 'adj:sg:nom:f' if int(numerator) == 1 else 'num:pl:nom:f' + denominator = str(10 ** len(numerator)) + remainder = to_number_plural(int(numerator) % 10) + tag_den = self._get_denominator_tag(remainder, 'default') + if int(number) == 0: + return num2words(numerator, tag_num) + ' ' + \ + num2words(denominator, tag_den, True) + else: + return num2words(number) + ' i ' + \ + num2words(numerator, tag_num) + ' ' + \ + num2words(denominator, tag_den, True) + + def _check_decimal_fraction(self, tokens): + """Checks whether given list of tokens starts with decimal fraction. + + If contains fraction generate words from whole fraction otherwise + generate words from first number. + + Args: + tokens (list of str): List of tokens with number at the beginning. + + Returns: + str: Tokens that form a fraction or number. + int: The number of tokens that make up the fraction. 
+ + """ + match = self.decimal_fraction_regex.search(''.join(tokens[:5])) + if match and match.start() == 0: + tokens_match = tokens[0] + i = 1 + while tokens_match != match.group(0): + tokens_match += tokens[i] i += 1 - new_text = [] - for char in string_builder[id_ + i]: - new_text.append(self._script_translator[1][char]) - words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' + return match.group(0), i - 1 + else: + return tokens[0], 0 + + def _handle_mixed_types(self, tokens, special_types, tags): + last_number_plural = NumberPlural.SINGULAR + if TokenType.NUMBER in special_types: + special_character_map = self.special_character_numbers_map + else: + special_character_map = self.special_character_map + i = 0 + iter_special_types = iter(special_types) + for token_type in iter_special_types: + if token_type == TokenType.SPECIAL_CHARACTER: + if tokens[i] in special_character_map: + tokens[i] = special_character_map[tokens[i]] + else: + tokens[i] = '' + elif token_type == TokenType.PUNCTUATION: + if tokens[i] == ' ': + tokens[i] = '' + elif token_type == TokenType.NUMBER: + number, skip = self._check_decimal_fraction(tokens[i:]) + if skip > 0: + words = self._handle_decimal_fraction(number) + if int(''.join(number).split('.')[0]) == 0: + last_number_plural = NumberPlural.FRACTION + else: + last_number_plural = NumberPlural.MANY + else: + words = num2words(number) + last_number_plural = to_number_plural(number) + tokens = tokens[:i] + [words] + tokens[i + skip + 1:] + if skip != 0: + next(islice(iter_special_types, skip - 1, skip), '') + elif token_type == TokenType.CURRENCY: + suffix = last_number_plural.value + tokens[i] = self._currencies[tokens[i]][suffix] i += 1 - return words.rstrip() - - def _get_number_slash(self, string_builder, id_, tag, length): - return self._get_number(string_builder, id_, tag, length - 1) + ' /' - - def _get_superscript_slash(self, string_builder, id_, tag, length): - return self._get_superscript( - 
string_builder=string_builder, - id_=id_, - tag=tag, - length=length - 1 - ) + ' /' - - def _handle_fraction(self, numerator, denominator, tag): - num = None - tag_list = tag.split(':') - if numerator == '1': - de_tag = 'adj:sg:' + tag_list[2] + ':f' - num_tag = 'adj:sg:' + tag_list[2] + ':f' - num = True - elif numerator == '2' or numerator == '3' or numerator == '4': - de_tag = 'adj:sg:' + tag_list[2] + ':n' - num_tag = 'num:pl:' + tag_list[2] + ':f' - elif numerator[-1] == '0': - de_tag = 'adj:sg:' + tag_list[2] + ':n' - num_tag = None + text = ' '.join([w for w in tokens if w != '']) + return text + + def _get_as_words(self, tokens, tags, special_types): + """Convert special tokens and numbers to words. + + Args: + tokens (list of str): List of tokens. + special_types (list of TokenType): Types of tokens. + + Returns: + str : Joined tokens converted to words. + + """ + if is_simple_number(tokens, special_types): + numbers = ''.join([n for i, n in enumerate(tokens) + if special_types[i] == TokenType.NUMBER]) + return num2words(''.join(numbers), tags[-1]) + elif is_fraction(tokens): + return self._handle_fraction(tokens, tags) + elif is_fraction(tokens, decimal=True): + return self._handle_decimal_fraction(tokens) else: - de_tag = 'subst:pl:gen:' + tag_list[3] - num_tag = None - return self._get_correct_form( - text=self._handle_numbers(text=numerator, tag=num_tag, num=num), - tag=num_tag - ) + ' ' + self._get_correct_form( - text=self._handle_numbers(text=denominator, tag=de_tag), - tag=de_tag - ) - - def _get_fraction(self, string_builder, id_, tag, length): - string = '' - i = 1 - length - for j in range(length): - words = self._script_translator[2][string_builder[id_ + i + j]]\ - .split('/') - string += self._handle_fraction(words[0], words[1], tag) + ' ' - return string.rstrip(' ') - - def _translate_script( - self, - string_builder, - id_, - length, - superscript, - until=None, - from_=None - ): - result = '' - idx = 0 if superscript else 1 - i = 1 - length 
- p = 0 - if from_ is not None: - for p in range(length): - word = string_builder[id_ + p + i] - if word == from_: - i += p + 1 + return self._handle_mixed_types(tokens, special_types, tags) + + def _check_number_multipart(self, index, next_id, string_builder): + """Check if the next token is continuation of number with actual token. + + Args: + index (int): Actual token id. + next_id (int): Next token id. + string_builder (list of str): List of all words. + + Returns: + bool: Is next token continuation of a number. + + """ + return next_id == index + 1 or \ + (index + 2 == next_id and + string_builder[index + 1] in self.number_punctuation) + + def _join_tokens(self, token, string_builder): + """Combine tokens that form multi-part formulas. + + Args: + tokens (list of tuple): List of tokens and their features. + Every element contains index, word, morphological tag and + token type. + string_builder (list of str): List of all words. + + Returns: + list of tuple: List of joined tokens and their features. 
+ + """ + joined_tokens = [] + iter_wordify_tokens = enumerate(iter(self._wordify_tokens)) + for i, (index, token, tag, token_type) in iter_wordify_tokens: + j = i + 1 + tokens = [token] + tags = [tag] + special_types = [token_type] + start_id = index + + while j < len(self._wordify_tokens): + next_id, next_token, next_tag, \ + next_special_type = self._wordify_tokens[j] + if not self._check_number_multipart(index, next_id, + string_builder): + break + if next_special_type in self.following_type[token_type]: + if index + 2 == next_id: + tokens.append(string_builder[index + 1]) + special_types.append(TokenType.PUNCTUATION) + tags.append('') + tokens.append(next_token) + tags.append(next_tag) + special_types.append(next_special_type) + else: break - for j in range(length - p): - word = string_builder[id_ + j + i] - if until and word == until: - break - if word in self._script_translator[idx]: - result += self._script_translator[idx][word] - else: - break - return result - - def _get_script_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=True, - until='/' - ), - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=False, - from_='/' - ), - tag - ) - - def _get_super_number_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=True, - until='/' - ), - string_builder[id_], - tag - ) - - def _get_number_sub_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - string_builder[id_ - 2], - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=False, - from_='/' - ), - tag - ) - - def _get_number_number_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - 
string_builder[id_ - 2], - string_builder[id_], - tag - ) - - def _get_dot(self, string_builder, id_, tag, length): - word = ''.join(string_builder[id_ + 1 - length:id_ + 1]) - numbers = word.split('.') - return self._handle_numbers(numbers[0]) + ' i ' \ - + self._handle_fraction( - numerator=numbers[1], - denominator=str(10 ** int(len(numbers[1]))), - tag=tag - ) - - def _handle_additional_numbers( - self, - string_builder, - id_, - until, - tag, - length - ): - number = [] - i = 1 - length - for j in range(length): - word = string_builder[id_ + i + j] - if word == until: - break - number.append(word) - length = len(number) - if length > 1: - return self._handle_numbers( - text=' '.join(number[0:length]).rstrip(), - tag=tag - ) - return '' - - def _handle_powers(self, first_number, second_number, tag=None): - if first_number == '10': - return self._handle_numbers( - text='1' + '0' * int(second_number), - tag=tag, - num=True - ) - return self._handle_numbers(first_number) + ' do potęgi ' \ - + self._handle_numbers(second_number, 'adj:sg:gen:f') - - def _get_number_to_number(self, string_builder, id_, tag, length): - text = self._handle_additional_numbers( - string_builder, - id_, - '^', - tag, - length - ) - j = 0 - i = 1 - length - for k in range(length): - if string_builder[id_ + k + i] == '^': - j = k + 1 + i - break - if j < 0 or j >= length: - return text - if text: - text += ' ' - return text + self._handle_powers( - string_builder[id_ + j - 2], - string_builder[id_ + j], - tag - ) - - def _get_number_to_super(self, string_builder, id_, tag, length): - text = self._handle_additional_numbers( - string_builder, - id_, - '^', - tag, - length - ) - j = 0 - i = 1 - length - for k in range(length): - if string_builder[id_ + k + i] == '^': - j = k + 1 + i - break - if j == 0 or j >= length: - return text - if text: - text += ' ' - second_number = self._translate_script( - string_builder=string_builder, - id_=id_ + j, - length=length - j, - superscript=True - ) - 
return text + self._handle_powers( - string_builder[id_ + j - 2], - second_number, - tag - ) - - def _get_scientific(self, string_builder, id_, tag, length): - words = string_builder[id_].split('^') - return self._handle_powers(words[0], words[1], tag) - - def _get_number_hat(self, string_builder, id_, tag, length): - return self._get_number(string_builder, id_, tag, length - 1) + ' ^' - - def _get_as_words(self, id_, string_builder, tag, length, s_type): - if s_type in self._special_dict: - return self._special_dict[s_type](string_builder, id_, tag, length) - return '' - - @staticmethod - def _check_if_multipart(current_stype, s_type): - return ((current_stype == 'number' or - current_stype == 'superscript') and - s_type == '/') or\ - ((current_stype == 'superscript/' or - current_stype == 'number/') and - (s_type == 'number' or s_type == 'subscript')) or\ - (current_stype == s_type and - (s_type == 'number' or s_type == 'subscript' or - s_type == 'superscript')) or\ - ((current_stype == 'superscript/subscript' or - current_stype == 'number/subscript') and - s_type == 'subscript') or\ - (current_stype == 'number' and s_type == '^') or\ - (current_stype == 'number^' and - (s_type == 'number' or s_type == 'superscript')) - - @staticmethod - def _check_if_number_continuation(current_stype, s_type): - return not ((current_stype == s_type and - (s_type == 'number' or s_type == 'subscript' or - s_type == 'superscript')) or - ((current_stype == 'superscript/subscript' or - current_stype == 'number/subscript') and - s_type == 'subscript')) - - def _handle_special(self, string_builder): - if self._special_list: - it = iter(self._special_list) - id_, text, tag, s_type = next(it) - current_tag = tag - current_stype = s_type - current_id = id_ - length = 1 - for id_, text, tag, s_type in it: - if self._check_if_multipart(current_stype, s_type): - if id_ == current_id + 1 or ( - id_ == current_id + 2 and s_type == 'number' and - string_builder[current_id + 1] == ' '): - length 
+= 1 - if self._check_if_number_continuation( - current_stype, - s_type - ): - current_stype += s_type - current_tag = tag - current_id = id_ - continue - new_text = self._get_as_words( - id_=current_id, - string_builder=string_builder, - tag=current_tag, - length=length, - s_type=current_stype - ) - string_builder = self._replace_string_in_builder( - string_builder=string_builder, - current_id=current_id, - length=length, - new_text=new_text - ) - length = 1 - current_tag = tag - current_stype = s_type - current_id = id_ - new_text = self._get_as_words( - id_=current_id, - string_builder=string_builder, - tag=current_tag, - length=length, - s_type=current_stype - ) - string_builder = self._replace_string_in_builder( - string_builder=string_builder, - current_id=current_id, - length=length, - new_text=new_text - ) - self._special_list.clear() - return string_builder - @staticmethod - def _replace_string_in_builder( - string_builder, - current_id, - length, - new_text - ): - j = current_id - i = length - while i > 0: - if not (string_builder[j] == ' ' or string_builder[j] == ''): - i -= 1 - string_builder[j] = '' - j -= 1 - string_builder[current_id] = new_text + next(iter_wordify_tokens) + index = next_id + token_type = next_special_type + j += 1 + joined_tokens.append((start_id, tokens, tags, special_types)) + return joined_tokens + + def _handle_special_types(self, string_builder): + """Convert special tokens to words and replace them in string builder. + + Args: + string_builder (list of str]): List of all words. + + Returns: + list of str: Return updated string builder with special tokens + replaced by words. 
+ + """ + wordify_tokens = self._join_tokens(self._wordify_tokens, string_builder) + enum_special = enumerate(wordify_tokens) + for i, special_token in enum_special: + index, tokens, tags, token_type = special_token + words = self._get_as_words(tokens, tags, token_type) + no_tokens = len(tokens) + string_builder = string_builder[:index] + [words] + \ + string_builder[index + no_tokens:] + offset = no_tokens - 1 + wordify_tokens[i + 1:] = [subtract_from_first(x, offset) + for x in wordify_tokens[i + 1:]] + self._wordify_tokens.clear() return string_builder - def _process_sentence(self, string_builder): - string_builder = self._handle_special(string_builder) - string_builder[0] = string_builder[0].capitalize() + def _get_match_tag(self, match, string_builder, tags): + match = match.group(0) + j = 0 + for i, word in enumerate(string_builder): + if match.startswith(word): + acc = word + match_tags = [tags[j]] + tmp = j + while i < len(string_builder) - 1 and len(acc) < len(match): + i += 1 + acc += string_builder[i] + if acc != match[:len(acc)]: + break + if string_builder[i] != ' ': + j += 1 + match_tags.append(tags[j]) + j = tmp + if acc == match: + return match_tags + if word != ' ': + j += 1 + return [] + + def _handle_regexes(self, string_builder, tags): + """Check for regexes in the given builder and replace them with words. + + Args: + string_builder (list of str): List of all words. + + Returns: + list of str: Updated string builder with matches replaced by words. 
+ + """ + sentence = ''.join(string_builder) + matches = list(self.date_regex.finditer(sentence)) + if not matches: + return string_builder + replace = [] + for match in matches: + date_tags = self._get_match_tag(match, string_builder, tags) + replace.append(date2words(match, date_tags)) + matches = list(map(lambda m: m.group(0), matches)) + builder, self._wordify_tokens = check_and_replace(string_builder, + matches, replace, + self._wordify_tokens) + return builder + + def _process_sentence(self, string_builder, tags): + """Process a sentence and replace special tokens (eg. numbers) words. + + Args: + string_builder (list of str): List of all words. + + Returns: + str: Sentece with replaced special tokens. + + """ + string_builder = self._handle_regexes(string_builder, tags) + string_builder = self._handle_special_types(string_builder) + if string_builder[0] and not string_builder[0][0].isupper(): + string_builder[0] = string_builder[0].capitalize() return ''.join(string_builder) diff --git a/src/worker.py b/src/worker.py index 45fccfe..8dfe2f8 100755 --- a/src/worker.py +++ b/src/worker.py @@ -4,7 +4,7 @@ import logging import nlp_ws from src.wordifier import Wordifier -from src.ccl_handler import Ccl_handler +from src.ccl_handler import CCLHandler _log = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class Worker(nlp_ws.NLPWorker): """One time static initialisation.""" def process(self, input_file, task_options, output_file): - """A.""" + """Processing an input file and generating tokens converted to words.""" wordifier = Wordifier() - ccl_handler = Ccl_handler(input_file) + ccl_handler = CCLHandler(input_file) ccl_handler.process(output_file, wordifier.unmarshallers) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_num2words.py b/tests/test_num2words.py new file mode 100644 index 0000000..23b06e6 --- /dev/null +++ b/tests/test_num2words.py @@ -0,0 +1,136 @@ +import unittest +from parameterized 
import parameterized, param + +from src.num2words import num2words + + +class TestNum2Words(unittest.TestCase): + single_tag = 'adj:sg:nom:f' + several_tag = 'adj:pl:acc:f' + many_tag = 'adj:pl:acc:m1' + + @parameterized.expand([ + param('0', 'zero'), + param('08', 'osiem'), + param('12', 'dwanaście'), + param('23', 'dwadzieścia trzy'), + param('48', 'czterdzieści osiem'), + param('187', 'sto osiemdziesiąt siedem'), + param('249', 'dwieście czterdzieści dziewięć'), + param('600', 'sześćset'), + param('720', 'siedemset dwadzieścia'), + param('304', 'trzysta cztery'), + + param('1000', 'tysiąc'), + param('425000', 'czterysta dwadzieścia pięć tysięcy'), + param('102000', 'sto dwa tysiące'), + param('390000', 'trzysta dziewięćdziesiąt tysięcy'), + param('701000', 'siedemset jeden tysięcy'), + param('993999', 'dziewięćset dziewięćdziesiąt trzy tysiące ' + 'dziewięćset dziewięćdziesiąt dziewięć'), + param('1000642', 'milion sześćset czterdzieści dwa'), + param('2001003', 'dwa miliony tysiąc trzy'), + param('18456000', 'osiemnaście milionów ' + 'czterysta pięćdziesiąt sześć tysięcy'), + param('1000000000', 'miliard') + ]) + def test_numbers(self, number, words): + self.assertEqual(num2words(number), words) + + @parameterized.expand([ + param('0', 'zerowy', ordinal=True), + param('1', 'pierwszy', ordinal=True), + param('10', 'dziesiąty', ordinal=True), + param('15', 'piętnasty', ordinal=True), + param('31', 'trzydziesty pierwszy', ordinal=True), + param('70', 'siedemdziesiąty', ordinal=True), + param('099', 'dziewięćdziesiąty dziewiąty', ordinal=True), + param('100', 'setny', ordinal=True), + param('102', 'sto drugi', ordinal=True), + param('183', 'sto osiemdziesiąty trzeci', ordinal=True), + param('201', 'dwieście pierwszy', ordinal=True), + + param('1000', 'tysięczny', ordinal=True), + param('1005', 'tysiąc piąty', ordinal=True), + param('2000', 'dwutysięczny', ordinal=True), + param('2020', 'dwa tysiące dwudziesty', ordinal=True), + param('10000', 'dziesięciotysięczny', 
ordinal=True), + param('100856', 'sto tysięcy osiemset pięćdziesiąty szósty', + ordinal=True), + param('1000000', 'milionowy', ordinal=True), + param('1002003', 'milion dwa tysiące trzeci', ordinal=True), + param('1948052296', 'miliard dziewięćset czterdzieści osiem milionów ' + 'pięćdziesiąt dwa tysiące ' + 'dwieście dziewięćdziesiąty szósty', ordinal=True), + ]) + def test_ordinal_numbers(self, number, words, ordinal): + self.assertEqual(num2words(number, ordinal=ordinal), words) + + @parameterized.expand([ + ('1', 'adj:sg:nom:f', 'jedna'), + ('2', 'num:pl:nom:f', 'dwie') + ]) + def test_numbers_numerator(self, number, tag, words): + self.assertEqual(num2words(number, tag), words) + + @parameterized.expand([ + param('1', 'pierwsza'), + param('2', 'druga'), + param('5', 'piąta'), + param('10', 'dziesiąta'), + param('31', 'trzydziesta pierwsza'), + param('100', 'setna'), + param('102', 'sto druga'), + param('512', 'pięćset dwunasta'), + param('600', 'sześćsetna'), + + param('1000', 'tysięczna'), + param('2002', 'dwa tysiące druga'), + param('3000', 'trzytysięczna'), + param('1000000000', 'miliardowa'), + param('1473022977', 'miliard czterysta siedemdziesiąt trzy miliony ' + 'dwadzieścia dwa tysiące dziewięćset siedemdziesiąta siódma'), + ]) + def test_single_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.single_tag, ordinal), words) + + @parameterized.expand([ + param('3', 'trzecie'), + param('6', 'szóste'), + param('10', 'dziesiąte'), + param('47', 'czterdzieste siódme'), + param('100', 'setne'), + param('101', 'sto pierwsze'), + param('300', 'trzechsetne'), + param('981', 'dziewięćset osiemdziesiąte pierwsze'), + + param('1000', 'tysięczne'), + param('8000', 'ośmiotysięczne'), + param('10000', 'dziesięciotysięczne'), + param('100000', 'stutysięczne'), + param('1000115376708', 'bilion sto piętnaście milionów ' + 'trzysta siedemdziesiąt sześć tysięcy siedemset ósme'), + ]) + def 
test_several_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.several_tag, ordinal), words) + + @parameterized.expand([ + param('4', 'czwartych'), + param('8', 'ósmych'), + param('10', 'dziesiątych'), + param('69', 'sześćdziesiątych dziewiątych'), + param('100', 'setnych'), + param('212', 'dwieście dwunastych'), + param('700', 'siedemsetnych'), + param('901', 'dziewięćset pierwszych'), + + param('1000', 'tysięcznych'), + param('6000', 'sześciotysięcznych'), + param('10000', 'dziesięciotysięcznych'), + param('1000000', 'milionowych'), + param('238055017238', 'dwieście trzydzieści osiem miliardów ' + 'pięćdziesiąt pięć milionów siedemnaście tysięcy ' + 'dwieście trzydziestych ósmych'), + ]) + def test_many_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.many_tag, ordinal), words) diff --git a/tox.ini b/tox.ini index 1516042..67d5403 100755 --- a/tox.ini +++ b/tox.ini @@ -40,5 +40,5 @@ max-line-length = 80 # D410 Missing blank line after section # D411 Missing blank line before section ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 -match-dir = ^(?!\.tox|venv).* +match-dir = ^(?!\.tox|venv|tests).* match = ^(?!setup).*\.py \ No newline at end of file -- GitLab