diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f622468222a6d36fc394b19a7a0de058fa0f40ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ +.vscode \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a78b15f1ae3b3bb1af05fcfb0008224d2979db2a..811491d2847c21836f8d5d156d70fceb848f1526 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,29 +1,46 @@ -image: 'clarinpl/python:3.6' +image: "clarinpl/python:3.6" cache: paths: - .tox stages: - check_style + - test - build -before_script: - - pip install tox==2.9.1 + pep8: stage: check_style + before_script: + - pip install tox==2.9.1 script: - tox -v -e pep8 + docstyle: stage: check_style + before_script: + - pip install tox==2.9.1 script: - tox -v -e docstyle -build_image: + +test: + stage: test + image: "docker:18.09.7" + services: + - "docker:18.09.7-dind" + script: + - docker build -t clarinpl/wordifier . + - docker run --rm + -v "$(pwd)/requirements-dev.txt:/home/worker/requirements-dev.txt" + -v "$(pwd)/tests:/home/worker/tests" + clarinpl/wordifier + sh -c 'pip3 install -r requirements-dev.txt ; nose2 -v tests' + +build: stage: build - image: 'docker:18.09.7' + image: "docker:18.09.7" only: - master services: - - 'docker:18.09.7-dind' - before_script: - - '' + - "docker:18.09.7-dind" script: - docker build -t clarinpl/wordifier . 
- echo $DOCKER_PASSWORD > pass.txt diff --git a/Dockerfile b/Dockerfile index a5fdf326277387d11d12088fb7478ccd1e9b4d9d..2dfcce57a1c900a55566eb90d6797c6bc40d4001 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,21 @@ FROM clarinpl/python:3.6 WORKDIR /home/worker -COPY ./src ./src -COPY ./main.py . -COPY ./requirements.txt . -RUN wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key|apt-key add - && \ - apt-add-repository http://download.sgjp.pl/apt/ubuntu && \ - apt update && \ - apt install morfeusz2 -y +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN update-alternatives --set python /usr/bin/python3.6 + +RUN apt-get update && apt-get install -y morfeusz2 RUN wget -O morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl http://download.sgjp.pl/morfeusz/20200913/Linux/18.04/64/morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl RUN python3.6 -m pip install morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl +COPY ./src ./src +COPY ./main.py . +COPY ./requirements.txt . +COPY ./data ./data + RUN python3.6 -m pip install -r requirements.txt CMD ["python3.6", "main.py", "service"] \ No newline at end of file diff --git a/README.md b/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3741fc1f3740628f08f560b5c2d6186bb09a2019 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,15 @@ +# Wordifier + +A service that expands abbreviations into full texts. The following modules are implemented at this time: +- verbal notation of digits, numbers, decimal and ordinary fractions (with separators '.' and '/') +- verbal notation of simple equations with addition, subtraction, multiplication and division +- verbal notation of dates + - recognizing different ways to write dates. + - 25.12.2010 or 25,12,12 (day/month, day/month, year) + - 2009-08-30 or 20 08 30 (year, day/month, day/month) + - 12 Jan 2010 or 31 Jan 1998 (day, month, year) + - Mar 12 (month, year) + - Dec 15 (day, month) + - April 30 2000 (month, day, year) +- replace currency symbols with words +- write special characters (%, &, #, ^, =, +, -, /) in words \ No newline at end of file diff --git a/data/currencies.json b/data/currencies.json new file mode 100644 index 0000000000000000000000000000000000000000..2a29d8155da515c8e4f1fcdaca4342e64ac148c2 --- /dev/null +++ b/data/currencies.json @@ -0,0 +1,1514 @@ +{ + "$": [ + "dolar", + "dolary", + "dolarów", + "dolara" + ], + "USD": [ + "dolar amerykański", + "dolary amerykańskie", + "dolarów amerykańskich", + "dolara amerykańskiego" + ], + "CA$": [ + "dolar kanadyjski", + "dolary kanadyjskie", + "dolarów kanadyjskich", + "dolara kanadyjskiego" + ], + "CAD": [ + "dolar kanadyjski", + "dolary kanadyjskie", + "dolarów kanadyjskich", + "dolara kanadyjskiego" + ], + "\u20ac": [ + "Euro", + "Euro", + "Euro", + "Euro" + ], + "EUR": [ + "Euro", + "Euro", + "Euro", + "Euro" + ], + "\u062f.\u0625.\u200f": [ + "Dirham Zjednoczonych Emiratów Arabskich", + "Dirhamy Zjednoczonych Emiratów Arabskich", + "Dirhamów Zjednoczonych Emiratów Arabskich", + "Dirhama Zjednoczonych Emiratów Arabskich" + ], + "AED": [ + "Dirham Zjednoczonych Emiratów Arabskich", + "Dirhamy Zjednoczonych Emiratów Arabskich", + "Dirhamów Zjednoczonych Emiratów Arabskich", + "Dirhama Zjednoczonych Emiratów Arabskich" + ], + "\u060b": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "Af": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "AFN": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "Lek": [ + "lek", + "leki", + "leków", + "leka" + ], + "ALL": [ + "lek", + "leki", + "leków", + "leka" + ], + 
"\u0564\u0580.": [ + "armański dram", + "armeńskie dramy", + "armeńskich dramów", + "armeńskiego drama" + ], + "AMD": [ + "armański dram", + "armeńskie dramy", + "armeńskich dramów", + "armeńskiego drama" + ], + "AR$": [ + "argetyńskie peso", + "argetyńskie pesos", + "argetyńsich pesos", + "argetyńskiego peso" + ], + "ARS": [ + "argetyńskie peso", + "argetyńskie pesos", + "argetyńsich pesos", + "argetyńskiego peso" + ], + "AU$": [ + "dolar australijski", + "dolary australijskie", + "dolarów australijskich", + "dolara australijskiego" + ], + "AUD": [ + "dolar australijski", + "dolary australijskie", + "dolarów australijskich", + "dolara australijskiego" + ], + "\u043c\u0430\u043d.": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "man.": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "AZN": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "KM": [ + "marka zamienna", + "marki zamienne", + "marek zamiennych", + "marki zamiennej" + ], + "BAM": [ + "marka zamienna", + "marki zamienne", + "marek zamiennych", + "marki zamiennej" + ], + "\u09f3": [ + "taka", + "taka", + "taka", + "taka" + ], + "Tk": [ + "taka", + "taka", + "taka", + "taka" + ], + "BDT": [ + "taka", + "taka", + "taka", + "taka" + ], + "\u043b\u0432.": [ + "lew", + "lewy", + "lewów", + "lewa" + ], + "BGN": [ + "lew", + "lewy", + "lewów", + "lewa" + ], + "\u062f.\u0628.\u200f": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "BD": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "BHD": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "FBu": [ + "frank burundyjski", + "franki burundyjskie", + "franków burundyjskich", + "franka burundyjskiego" + ], + "BIF": [ + "frank burundyjski", + "franki burundyjskie", + "franków burundyjskich", + "franka burundyjskiego" + ], + "BN$": [ + "dolar brunejski", + "dolary brunejskie", + "dolarów brunejskich", + "dolara brunejskiego" + ], + "BND": [ + "dolar brunejski", + "dolary brunejskie", + "dolarów brunejskich", + "dolara brunejskiego" + ], + "Bs": [ + "boliviano", + "bolivianos", + "bolivianos", + "boliviano" + ], + "BOB": [ + "boliviano", + "bolivianos", + "bolivianos", + "boliviano" + ], + "R$": [ + "real brazylijski", + "reale brazylijskie", + "realów brazylijskich", + "reala brazylijskiego" + ], + "BRL": [ + "real brazylijski", + "reale brazylijskie", + "realów brazylijskich", + "reala brazylijskiego" + ], + "P": [ + "pula", + "pula", + "pula", + "pula" + ], + "BWP": [ + "pula", + "pula", + "pula", + "pula" + ], + "\u0440\u0443\u0431.": [ + "rubel białoruski", + "ruble białoruskie", + "rubli białoruskich", + "rubla białoruskiego" + ], + "Br": [ + "birr", + "birry", + "birrów", + "birra" + ], + "BYN": [ + "rubel białoruski", + "ruble białoruskie", + "rubli białoruskich", + "rubla białoruskiego" + ], + "BZ$": [ + "dolar belizeński", + "dolary belizeńskie", + "dolarów belizeńskich", + "dolara belizeńskiego" + ], + "BZD": [ + "dolar belizeński", + "dolary belizeńskie", + "dolarów belizeńskich", + "dolara belizeńskiego" + ], + "FrCD": [ + "frank kongijski", + "franki kongijskie", + "franków kongijskich", + "franka kongijskiego" + ], + "CDF": [ + "frank kongijski", + "franki kongijskie", + "franków kongijskich", + "franka 
kongijskiego" + ], + "CHF": [ + "frank szwajcarski", + "franki szwajcarskie", + "franków szwajcarskich", + "franka szwajcarskiego" + ], + "CL$": [ + "peso chilijskie", + "peso chilijskie", + "pesos chilijskich", + "peso chilijskiego" + ], + "CLP": [ + "peso chilijskie", + "peso chilijskie", + "pesos chilijskich", + "peso chilijskiego" + ], + "CN\u00a5": [ + "yuan", + "yuan", + "yuan", + "yuan" + ], + "CNY": [ + "yuan", + "yuan", + "yuan", + "yuan" + ], + "CO$": [ + "peso kolumbijskie", + "peso kolumbijskie", + "pesos kolumbijskich", + "peso kolumbijskiego" + ], + "COP": [ + "peso kolumbijskie", + "peso kolumbijskie", + "pesos kolumbijskich", + "peso kolumbijskiego" + ], + "\u20a1": [ + "colón kostarykański", + "colóny kostarytańskie", + "colónów kostarytańskich", + "colóna kostaryńskiego" + ], + "CRC": [ + "colón kostarykański", + "colóny kostarytańskie", + "colónów kostarytańskich", + "colóna kostaryńskiego" + ], + "CV$": [ + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka" + ], + "CVE": [ + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka" + ], + "K\u010d": [ + "czeska korona", + "czeskie korony", + "czeskich koron", + "czeskiej korony" + ], + "CZK": [ + "czeska korona", + "czeskie korony", + "czeskich koron", + "czeskiej korony" + ], + "Fdj": [ + "frank dżibutyjski", + "franki dżibutyjskie", + "franków dżibutyjskich", + "franka dżibutyjskiego" + ], + "DJF": [ + "frank dżibutyjski", + "franki dżibutyjskie", + "franków dżibutyjskich", + "franka dżibutyjskiego" + ], + "kr": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "Dkr": [ + "korona duńska", + "korony duńskie", + "koron duńskich", + "korony duńskiej" + ], + "DKK": [ + "korona duńska", + "korony duńskie", + "koron duńskich", + "korony duńskiej" + ], + "RD$": [ + "peso dominikańskie", + "peso dominikańskie", + "pesos dominikańskich", + "peso dominikańskiego" + ], + "DOP": [ + "peso dominikańskie", + "peso dominikańskie", + "pesos dominikańskich", + "peso dominikańskiego" + ], + "\u062f.\u062c.\u200f": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "DA": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "DZD": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "Ekr": [ + "korona estońska", + "korony estońskie", + "koron estońskich", + "korony estońskiej" + ], + "EEK": [ + "korona estońska", + "korony estońskie", + "koron estońskich", + "korony estońskiej" + ], + "\u062c.\u0645.\u200f": [ + "funt egipski", + "funty egipskie", + "funtów egipskich", + "funta egipskiego" + ], + "EGP": [ + "funt egipski", + "funty egipskie", + "funtów egipskich", + "funta egipskiego" + ], + "Nfk": [ + "nakfa", + "nakfy", + "nakf", + "nakfy" + ], + "ERN": [ + "nakfa", + "nakfy", + "nakf", + "nakfy" + ], + "ETB": [ + "birr", + "birry", + "birrów", + "birra" + ], + "\u00a3": [ + "funt szterling", + "funty szterling", + "funtów szterling", + "funta szterlinga" + ], + "GBP": [ + "funt szterling", + "funty szterling", + "funtów szterling", + "funta szterlinga" + ], + "GEL": [ + "lari", + "lari", + "lari", + "lari" + ], + "GH\u20b5": [ + "cedi", + "cedi", + "cedi", + "cedi" + ], + "GHS": [ + "cedi", + "cedi", + "cedi", + "cedi" + ], + "FG": [ + "frank gwinejski", + "franki gwinejskie", + "franków gwinejskich", + 
"franka gwinejskiego" + ], + "GNF": [ + "frank gwinejski", + "franki gwinejskie", + "franków gwinejskich", + "franka gwinejskiego" + ], + "Q": [ + "quetzal", + "quetzale", + "quetzali", + "quetzala" + ], + "GTQ": [ + "quetzal", + "quetzale", + "quetzali", + "quetzala" + ], + "HK$": [ + "dolar hongkoński", + "dolary hongkońskie", + "dolarów hongkońskich", + "dolara hongkońskiego" + ], + "HKD": [ + "dolar hongkoński", + "dolary hongkońskie", + "dolarów hongkońskich", + "dolara hongkońskiego" + ], + "L": [ + "lempira", + "lempiry", + "lempir", + "lempira" + ], + "HNL": [ + "lempira", + "lempiry", + "lempir", + "lempira" + ], + "kn": [ + "kuna", + "kuny", + "kun", + "kuny" + ], + "HRK": [ + "kuna", + "kuny", + "kun", + "kuny" + ], + "Ft": [ + "forint", + "forinty", + "forintów", + "forinta" + ], + "HUF": [ + "forint", + "forinty", + "forintów", + "forinta" + ], + "Rp": [ + "rupia indonezyjska", + "rupie indonezyjske", + "rupii indonezyjskych", + "rupii indonezyjskiej" + ], + "IDR": [ + "rupia indonezyjska", + "rupie indonezyjske", + "rupii indonezyjskych", + "rupii indonezyjskiej" + ], + "\u20aa": [ + "nowy izraelski szekel", + "nowe izraelskie szekle", + "nowych izraelskich szekli", + "nowego izraelskiego szekla" + ], + "ILS": [ + "nowy izraelski szekel", + "nowe izraelskie szekle", + "nowych izraelskich szekli", + "nowego izraelskiego szekla" + ], + "\u099f\u0995\u09be": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "Rs": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "INR": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "\u062f.\u0639.\u200f": [ + "dinar iracki", + "dinary irackie", + "dinarów irackich", + "dinara irackiego" + ], + "IQD": [ + "dinar iracki", + "dinary irackie", + "dinarów irackich", + "dinara irackiego" + ], + "\ufdfc": [ + "rial irański", + "riale irańskie", + "riali irańskich", + "riala irańskiego" + ], + "IRR": [ + "rial irański", + "riale irańskie", + "riali irańskich", + "riala irańskiego" + ], + "Ikr": [ + "korona islandzka", + "korony islandzkie", + "koron islandzkich", + "korony islandzkiej" + ], + "ISK": [ + "korona islandzka", + "korony islandzkie", + "koron islandzkich", + "korony islandzkiej" + ], + "J$": [ + "dolar jamajski", + "dolary jamajskie", + "dolarów jamajskich", + "dolara jamajskiego" + ], + "JMD": [ + "dolar jamajski", + "dolary jamajskie", + "dolarów jamajskich", + "dolara jamajskiego" + ], + "\u062f.\u0623.\u200f": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "JD": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "JOD": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "\uffe5": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "\u00a5": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "JPY": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "Ksh": [ + "szyling kenijski", + "szylingi kenijskie", + "szylingów kenijskich", + "szylinga kenijskiego" + ], + "KES": [ + "szyling kenijski", + "szylingi kenijskie", + "szylingów kenijskich", + "szylinga kenijskiego" + ], + "\u17db": [ + "riel kambodżański", + "riele kambodżańskie", + "rieli kambodżańskich", + "riela kambodzańskiego" + ], + "KHR": [ + "riel kambodżański", + "riele kambodżańskie", + "rieli kambodżańskich", + "riela kambodzańskiego" + ], + "FC": [ + "frank Komorów", + "franki Komorów", + "franków 
Komorów", + "franka Komorów" + ], + "CF": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "KMF": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "\u20a9": [ + "won południowokoreański", + "wony południowokoreańskie", + "wonów południowokoreańskich", + "wona południowokoreańskiego" + ], + "KRW": [ + "won południowokoreański", + "wony południowokoreańskie", + "wonów południowokoreańskich", + "wona południowokoreańskiego" + ], + "\u062f.\u0643.\u200f": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "KD": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "KWD": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "\u0442\u04a3\u0433.": [ + "tenge", + "tenge", + "tenge", + "tenge" + ], + "KZT": [ + "tenge", + "tenge", + "tenge", + "tenge" + ], + "\u0644.\u0644.\u200f": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "LB\u00a3": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "LBP": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "SL Re": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "SLRs": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "LKR": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "Lt": [ + "lit", + "lity", + "litów", + "lita" + ], + "LTL": [ + "lit", + "lity", + "litów", + "lita" + ], + "Ls": [ + "łat", + "łaty", + "łatów", + "łata" + ], + "LVL": [ + "łat", + "łaty", + "łatów", + "łata" + ], + "\u062f.\u0644.\u200f": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "LD": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "LYD": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "\u062f.\u0645.\u200f": [ + "dirham marokański", + "dirhamy marokańskie", + "dirhamów marokańskich", + "dirhama marokańskiego" + ], + "MAD": [ + "dirham marokański", + "dirhamy marokańskie", + "dirhamów marokańskich", + "dirhama marokańskiego" + ], + "MDL": [ + "Lej Mołdawii", + "Leje Mołdawii", + "Lei Mołdawii", + "Leja Mołdawii" + ], + "MGA": [ + "ariary", + "ariary", + "ariary", + "ariary" + ], + "MKD": [ + "denar macedoński", + "denary macedońskie", + "denarów macedońskich", + "denara macedońskiego" + ], + "K": [ + "kiat", + "kiaty", + "kiatów", + "kiata" + ], + "MMK": [ + "kiat", + "kiaty", + "kiatów", + "kiata" + ], + "MOP$": [ + "pataca", + "pataca", + "pataca", + "pataca" + ], + "MOP": [ + "pataca", + "pataca", + "pataca", + "pataca" + ], + "MURs": [ + "rupia Mauritiusu", + "rupie Mauritiusu", + "rupii Mauritiusu", + "rupii Mauritiusu" + ], + "MUR": [ + "rupia Mauritiusu", + "rupie Mauritiusu", + "rupii Mauritiusu", + "rupii Mauritiusu" + ], + "MX$": [ + "peso meksykańskie", + "peso meksykańskie", + "pesos meksykańskich", + "peso meksykańskiego" + ], + "MXN": [ + "peso meksykańskie", + "peso meksykańskie", + "pesos meksykańskich", + "peso meksykańskiego" + ], + "RM": [ + "ringgit", + "ringgit", + "ringgitów", + "ringgita" + ], + "MYR": [ + "ringgit", + "ringgit", + "ringgitów", + "ringgita" + ], + "MTn": [ + "metical", + "meticale", + "meticali", + "meticala" 
+ ], + "MZN": [ + "metical", + "meticale", + "meticali", + "meticala" + ], + "N$": [ + "dolar namibijski", + "dolare namibijskie", + "dolarów namibijskich", + "dolara namibijskiego" + ], + "NAD": [ + "dolar namibijski", + "dolare namibijskie", + "dolarów namibijskich", + "dolara namibijskiego" + ], + "\u20a6": [ + "naira", + "naire", + "nair", + "naira" + ], + "NGN": [ + "naira", + "naire", + "nair", + "naira" + ], + "C$": [ + "cordoba oro", + "cordoby", + "córdob", + "cordoby" + ], + "NIO": [ + "cordoba oro", + "cordoby", + "córdob", + "cordoby" + ], + "Nkr": [ + "korona norweska", + "korony norweskie", + "koron norweskich", + "korony norweskiej" + ], + "NOK": [ + "korona norweska", + "korony norweskie", + "koron norweskich", + "korony norweskiej" + ], + "\u0928\u0947\u0930\u0942": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NPRs": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NPR": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NZ$": [ + "dolar nowozelandzki", + "dolary nowozelandzkie", + "dolarów nowozelandzkich", + "dolara nowozelandzkiego" + ], + "NZD": [ + "dolar nowozelandzki", + "dolary nowozelandzkie", + "dolarów nowozelandzkich", + "dolara nowozelandzkiego" + ], + "\u0631.\u0639.\u200f": [ + "rial omański", + "riale omańskie", + "riali omańskich", + "riala omańskiego" + ], + "OMR": [ + "rial omański", + "riale omańskie", + "riali omańskich", + "riala omańskiego" + ], + "B/.": [ + "balboa", + "balboa", + "balboa", + "balboa" + ], + "PAB": [ + "balboa", + "balboa", + "balboa", + "balboa" + ], + "S/.": [ + "sol", + "sole", + "soli", + "sola" + ], + "PEN": [ + "sol", + "sole", + "soli", + "sola" + ], + "\u20b1": [ + "peso filipińskie", + "peso filipińskie", + "pesos filipińskich", + "peso filipińskiego" + ], + "PHP": [ + "peso filipińskie", + "peso filipińskie", + "pesos filipińskich", + "peso filipińskiego" + ], + "\u20a8": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "PKRs": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "PKR": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "z\u0142": [ + "złoty", + "złote", + "złotych", + "złotego" + ], + "PLN": [ + "złoty", + "złote", + "złotych", + "złotego" + ], + "\u20b2": [ + "guarani", + "guarani", + "guarani", + "guarani" + ], + "PYG": [ + "guarani", + "guarani", + "guarani", + "guarani" + ], + "\u0631.\u0642.\u200f": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "QR": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "QAR": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "RON": [ + "lej rumuński", + "leje rumuńskie", + "lei rumuńsich", + "leja rumuńskiego" + ], + "\u0434\u0438\u043d.": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "din.": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "RSD": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "\u20bd.": [ + "rubel rosyjski", + "ruble rosyjskie", + "ruble rosyjskie", + "rubla rosyjskiego" + ], + "RUB": [ + "rubel rosyjski", + "ruble rosyjskie", + "ruble rosyjskie", + "rubla rosyjskiego" + ], + "FR": [ + 
"frank rwandyjski", + "franki rwandyjskie", + "franków rwandyjskich", + "franka rwandyjskiego" + ], + "RWF": [ + "frank rwandyjski", + "franki rwandyjskie", + "franków rwandyjskich", + "franka rwandyjskiego" + ], + "\u0631.\u0633.\u200f": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SR": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SAR": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SDG": [ + "funt sudański", + "funty sudańskie", + "funtów sudańskich", + "funta sudańskiego" + ], + "Skr": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "SEK": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "S$": [ + "dolar singapurski", + "dolary singapurskie", + "dolarów singapurskich", + "dolara singapurskiego" + ], + "SGD": [ + "dolar singapurski", + "dolary singapurskie", + "dolarów singapurskich", + "dolara singapurskiego" + ], + "Ssh": [ + "szyling somalijski", + "szylingi somalijskie", + "szylingów somalijskich", + "szylinga somalijskiego" + ], + "SOS": [ + "szyling somalijski", + "szylingi somalijskie", + "szylingów somalijskich", + "szylinga somalijskiego" + ], + "\u0644.\u0633.\u200f": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "SY\u00a3": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "SYP": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "\u0e3f": [ + "bat tajlandzki", + "baty tajlandzkie", + "batów tajlandzkich", + "bata tajlandzkiego" + ], + "THB": [ + "bat tajlandzki", + "baty tajlandzkie", + "batów tajlandzkich", + "bata tajlandzkiego" + ], + "\u062f.\u062a.\u200f": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "DT": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "TND": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "T$": [ + "pa'anga", + "pa'anga", + "pa'anga", + "pa'anga" + ], + "TOP": [ + "pa'anga", + "pa'anga", + "pa'anga", + "pa'anga" + ], + "TL": [ + "lira turecka", + "liry tureckie", + "lir tureckich", + "liry tureckiej" + ], + "TRY": [ + "lira turecka", + "liry tureckie", + "lir tureckich", + "liry tureckiej" + ], + "TT$": [ + "dolar Trynidadu i Tobago", + "dolary Trynidadu i Tobago", + "dolarów Trynidadu i Tobago", + "dolara Trynidadu i Tobago" + ], + "TTD": [ + "dolar Trynidadu i Tobago", + "dolary Trynidadu i Tobago", + "dolarów Trynidadu i Tobago", + "dolara Trynidadu i Tobago" + ], + "NT$": [ + "dolar tajwański", + "dolary tajwańskie", + "dolarów tajwańskich", + "dolara tajwańskiego" + ], + "TWD": [ + "dolar tajwański", + "dolary tajwańskie", + "dolarów tajwańskich", + "dolara tajwańskiego" + ], + "TSh": [ + "szyling tanzański", + "szylingi tanzańskie", + "szylingów tanzańskich", + "szylinga tanzańskiego" + ], + "TZS": [ + "szyling tanzański", + "szylingi tanzańskie", + "szylingów tanzańskich", + "szylinga tanzańskiego" + ], + "\u20b4": [ + "hrywna", + "hrywny", + "hrywien", + "hrywny" + ], + "UAH": [ + "hrywna", + "hrywny", + "hrywien", + "hrywny" + ], + "USh": [ + "szyling ugandyjski", + "szylingi ugandyjskie", + "szylingów ugandyjskich", + "szylinga ugandyjskiego" + ], + "UGX": [ + "szyling ugandyjski", + 
"szylingi ugandyjskie", + "szylingów ugandyjskich", + "szylinga ugandyjskiego" + ], + "$U": [ + "peso urugwajskie", + "peso urugwajskie", + "pesos urugwajskie", + "peso urugwajskiego" + ], + "UYU": [ + "peso urugwajskie", + "peso urugwajskie", + "pesos urugwajskie", + "peso urugwajskiego" + ], + "UZS": [ + "sum", + "sumy", + "sumów", + "suma" + ], + "Bs.F.": [ + "boliwar", + "boliwary", + "boliwarów", + "boliwara" + ], + "VEF": [ + "boliwar", + "boliwary", + "boliwarów", + "boliwara" + ], + "\u20ab": [ + "dong", + "dongi", + "dongów", + "donga" + ], + "VND": [ + "dong", + "dongi", + "dongów", + "donga" + ], + "FCFA": [ + "środkowoafrykański frank CFA", + "środkowoafrykańskie franki CFA", + "środkowoafrykańskich franków CFA", + "środkowoafrykańskiego franka CFA" + ], + "XAF": [ + "środkowoafrykański frank CFA", + "środkowoafrykańskie franki CFA", + "środkowoafrykańskich franków CFA", + "środkowoafrykańskiego franka CFA" + ], + "CFA": [ + "frank CFA Afryki Zachodniej", + "franki CFA Afryki Zachodniej", + "franków CFA Afryki Zachodniej", + "franka CFA Afryki Zachodniej" + ], + "XOF": [ + "frank CFA Afryki Zachodniej", + "franki CFA Afryki Zachodniej", + "franków CFA Afryki Zachodniej", + "franka CFA Afryki Zachodniej" + ], + "\u0631.\u064a.\u200f": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "YR": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "YER": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "R": [ + "rand", + "randy", + "randów", + "randa" + ], + "ZAR": [ + "rand", + "randy", + "randów", + "randa" + ], + "ZK": [ + "kwacha zambijska", + "kwacha zambijskie", + "kwacha zambijskich", + "kwacha zambijskiego" + ], + "ZMK": [ + "kwacha zambijska", + "kwacha zambijskie", + "kwacha zambijskich", + "kwacha zambijskiego" + ], + "ZWL$": [ + "dolar Zimbabwe", + "dolary Zimbabwe", + "dolarów Zimbabwe", + "dolara Zimbabwe" + ], + "ZWL": [ + "dolar Zimbabwe", + "dolary Zimbabwe", + "dolarów Zimbabwe", + "dolara Zimbabwe" + ] +} \ No newline at end of file diff --git a/data/numbers.json b/data/numbers.json new file mode 100644 index 0000000000000000000000000000000000000000..65cae973fa5aa242c6320cec6ecc6f4dbf4efc2e --- /dev/null +++ b/data/numbers.json @@ -0,0 +1,116 @@ +{ + "number_words": { + "0": "zero", + "1": "jeden", + "2": "dwa", + "3": "trzy", + "4": "cztery", + "5": "pięć", + "6": "sześć", + "7": "siedem", + "8": "osiem", + "9": "dziewięć", + "10": "dziesięć", + "11": "jedenaście", + "12": "dwanaście", + "13": "trzynaście", + "14": "czternaście", + "15": "piętnaście", + "16": "szesnaście", + "17": "siedemnaście", + "18": "osiemnaście", + "19": "dziewiętnaście", + "20": "dwadzieścia", + "30": "trzydzieści", + "40": "czterdzieści", + "50": "pięćdziesiąt", + "60": "sześćdziesiąt", + "70": "siedemdziesiąt", + "80": "osiemdziesiąt", + "90": "dziewięćdziesiąt", + "100": "sto", + "200": "dwieście", + "300": "trzysta", + "400": "czterysta", + "500": "pięćset", + "600": "sześćset", + "700": "siedemset", + "800": "osiemset", + "900": "dziewięćset" + }, + "ordinal_number_words": { + "0": "zerowy", + "1": "pierwszy", + "2": "drugi", + "3": "trzeci", + "4": "czwarty", + "5": "piąty", + "6": "szósty", + "7": "siódmy", + "8": "ósmy", + "9": "dziewiąty", + "10": "dziesiąty", + "11": "jedenasty", + "12": "dwunasty", + "13": "trzynasty", + "14": "czternasty", + "15": "piętnasty", + "16": "szesnasty", + "17": "siedemnasty", + "18": "osiemnasty", + "19": "dziewiętnasty", + 
"20": "dwudziesty", + "30": "trzydziesty", + "40": "czterdziesty", + "50": "pięćdziesiąty", + "60": "sześćdziesiąty", + "70": "siedemdziesiąty", + "80": "osiemdziesiąty", + "90": "dziewięćdziesiąty", + "100": "setny", + "200": "dwusetny", + "300": "trzechsetny", + "400": "czterechsetny", + "500": "pięćsetny", + "600": "sześćsetny", + "700": "siedemsetny", + "800": "osiemsetny", + "900": "dziewięćsetny" + }, + "large_numbers": { + "3": "tysiąc", + "6": "milion", + "9": "miliard", + "12": "bilion", + "15": "biliard", + "18": "trylion", + "21": "tryliard", + "24": "kwadrylion", + "27": "kwadryliard", + "30": "kwintylion", + "33": "kwintyliard", + "36": "sekstylion", + "39": "sekstyliard", + "42": "septylion", + "45": "septyliard", + "48": "oktylion", + "51": "oktyliard", + "54": "nonilion", + "57": "noniliard", + "60": "decylion", + "63": "decyliard", + "66": "undecylion", + "69": "undecyliard", + "72": "duodecylion", + "75": "duodecyliard", + "100": "googol", + "600": "centylion", + "603": "centyliard" + }, + "ordinal_large_numbers": { + "3": "tysięczny", + "6": "milionowy", + "9": "miliardowy", + "12": "bilionowy" + } +} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ad8f8a6e480c5d0eea6253c4c66be6cabc0301f6..98462cca104e8ad985120f4938b88a67acb1ccc8 100755 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,17 +1,16 @@ version: '3' services: - tokenizer: + wordifier: container_name: clarin_wordifier build: ./ working_dir: /home/worker - entrypoint: - - python3.6 - - main.py - - service + command: + - python3.6 main.py service environment: - PYTHONUNBUFFERED=0 volumes: - '/samba:/samba' - './config.ini:/home/worker/config.ini' - './src:/home/worker/src' + - './tests:/home/worker/tests' - './main.py:/home/worker/main.py' diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d55715eaf79b58fdc255c740ceb580757787ea3 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +parameterized==0.8.1 +nose2==0.10.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index aae2e5022e70fd4537aa5304bb79ff161ed85061..e8340049c13ee23f0def228066991df9b0abf234 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ nlp-ws -python-morfeusz \ No newline at end of file +Babel==2.8.0 \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/ccl_handler.py b/src/ccl_handler.py index a61dd8974ee4be3d1362be6856a377ef63758777..02e08ac29c6fc7c51715362b600cee42593064e7 100755 --- a/src/ccl_handler.py +++ b/src/ccl_handler.py @@ -2,19 +2,19 @@ from xml.etree.ElementTree import iterparse -class Ccl_handler: +class CCLHandler: """Implements reading ccl for anonymizer service.""" def __init__(self, ccl_file_name): - """Initialize ccl_handler with a filename.""" + """Initialize CCLHandler with a filename.""" self._file_name = ccl_file_name def process(self, output_file, unmarshallers): """Process xml tags using unmarshallers and save in output_file.""" - with open(output_file, 'w', encoding='utf-8') as out: - with open(self._file_name, 'r', encoding='utf-8') as f: - for event, elem in iterparse(f): - unmarshal = unmarshallers.get(elem.tag, None) - if unmarshal: - out.write(unmarshal(elem)) - elem.clear() + with open(self._file_name, 'r', encoding='utf-8') as input_file, \ + open(output_file, 'w', encoding='utf-8') as output_file: + 
for event, elem in iterparse(input_file): + unmarshal = unmarshallers.get(elem.tag, None) + if unmarshal: + output_file.write(unmarshal(elem)) + elem.clear() diff --git a/src/date2words.py b/src/date2words.py new file mode 100644 index 0000000000000000000000000000000000000000..055e5f6b4d2af51755c1aafcb4e20bce137ffcd2 --- /dev/null +++ b/src/date2words.py @@ -0,0 +1,108 @@ +"""Module for converting dates to words.""" +from babel import Locale + +from src.num2words import num2words + +date_tags = ['sg:gen:m3'] + + +def check_none(token): + """If token is none then convert to empty list otherwise return token.""" + if not token: + return [] + return token + + +def month_name_expansion(month): + """Expand month abbreviation or change form. + + Args: + month (str): Month abbrevation or full name. + + Returns: + str: Full month name in genitive case. + + """ + abbr = len(month) == 3 + locale = Locale('pl') + month = month.lower() + + if abbr: + months = locale.months['format']['abbreviated'] + index = list(months.values()).index(month) + 1 + month = locale.months['format']['wide'][index] + else: + for format in ['format', 'stand-alone']: + if month in list(locale.months[format]['wide'].values()): + months = locale.months[format]['wide'] + index = list(months.values()).index(month) + 1 + month = locale.months['format']['wide'][index] + return month + + +def date2words(date_match, tags=None): + """Convert a date to list of words. + + Args: + date_match (re.Match): Date match. + tag (str, optional): Morphological tag. Defaults to None. + + Returns: + list of str: List of words representing date. + + """ + if tags and ":".join(tags[0].split(":")[1:4]) in date_tags: + corrected_tag = tags[0] + else: + corrected_tag = None + if date_match['day_or_month_year']: + day_month1 = num2words(date_match['day_month1'], corrected_tag, + ordinal=True) + day_month2 = num2words(date_match['day_month2'], corrected_tag, + ordinal=True) + year = num2words(date_match['year1'], corrected_tag, ordinal=True) + + # split punctuation into single characters and remove if None + date_order = [day_month1, *check_none(date_match['punct1']), + day_month2, *check_none(date_match['punct2']), year] + elif date_match['year_month_or_day']: + day_month3 = num2words(date_match['day_month3'], ordinal=True) + day_month4 = num2words(date_match['day_month4'], ordinal=True) + year = num2words(date_match['year2'], ordinal=True) + + # split punctuation into single characters and remove if None + date_order = [year, *check_none(date_match['punct3']), day_month3, + *check_none(date_match['punct4']), day_month4] + elif date_match['month_in_words']: + day = date_match['day1'] + if date_match['day2']: + day = date_match['day2'] + if day: + day = num2words(day, corrected_tag, ordinal=True) + + year = '' + if date_match['year3']: + year = num2words(date_match['year3'], corrected_tag, ordinal=True) + if date_match['year4']: + year = num2words(date_match['year4'], corrected_tag, ordinal=True) + + if not day and not year: + return [date_match['month']] + else: + month = month_name_expansion(date_match['month']) + + # split punctuation into single characters and remove if None + if date_match['day2']: + date_order = [month, *check_none(date_match['punct7']), + day, *check_none(date_match['punct8'])] + elif date_match['day1']: + date_order = [day, *check_none(date_match['punct5']), + month, *check_none(date_match['punct6'])] + else: + date_order = [month] + if year: + date_order = date_order + [year] + date_order = list(map(lambda x: x if x else '', 
date_order)) + else: + date_order = [''] + return date_order diff --git a/src/num2words.py new file mode 100644 index 0000000000000000000000000000000000000000..fdae1196ab64932e5ded12cef1af9aef52feffdf --- /dev/null +++ b/src/num2words.py @@ -0,0 +1,105 @@ +"""Module for converting numbers to words.""" +import math +import json + +from src.utils import get_word_form, trailing_zeros + +with open('data/numbers.json', 'r') as numbers_file: + numbers_dict = json.load(numbers_file) + number_words = {int(k): v for k, v in numbers_dict['number_words'].items()} + ordinal_number_words = {int(k): v for k, v + in numbers_dict['ordinal_number_words'].items()} + large_numbers = {int(k): v for k, v + in numbers_dict['large_numbers'].items()} + ordinal_large_numbers = {int(k): v for k, v + in numbers_dict['ordinal_large_numbers'].items()} + + +def three_digit_to_words(text, tag='', ordinal=False): + """Convert a three-digit number to words with the given tag. Utility function.""" + map_to_words = ordinal_number_words if ordinal else number_words + + number = int(text) + if number == 0: + return get_word_form(map_to_words[number], tag) + words = [] + units = number % 10 + tens = number % 100 - units + hundredths = number // 100 + if 0 < tens + units <= 20: + word = get_word_form(map_to_words[tens + units], tag) + words.append(word) + else: + if units != 0: + words.append(get_word_form(map_to_words[units], tag)) + if tens != 0: + words.append(get_word_form(map_to_words[tens], tag)) + + if hundredths != 0: + if tens == 0 and units == 0: + words.append(get_word_form(map_to_words[hundredths * 100], tag)) + else: + words.append(get_word_form(number_words[hundredths * 100], '')) + + return ' '.join(reversed(words)) + + +def num2words(text, tag='', ordinal=False): + """Convert a number to words. + + Args: + text (str): Number to convert, given as a string of digits. + tag (str, optional): Morphological tag. Defaults to ''. + ordinal (bool, optional): Whether to produce the ordinal form. + Defaults to False. + + Returns: + str: The number written out as words, inflected with the given tag.
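+ Example: num2words('123') returns 'sto dwadzieścia trzy' (cardinal, no inflection requested).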
+ + """ + i = 0 + words = [] + number = int(text) + + if ordinal: + zeros = trailing_zeros(number) + zeros = 3 * math.floor(zeros / 3) + if zeros > 2 and 0 < len(text) - zeros <= 3: + number = number // 10 ** zeros + if number == 1: + words = '' + else: + words = three_digit_to_words(str(number), 'numcomp') + words += get_word_form(ordinal_large_numbers[zeros], tag) + return words + + if len(text) <= 3 or number == 0: + return three_digit_to_words(text, tag, ordinal) + + while number > 0: + remainder = number % 1000 + if i == 0: + triple = three_digit_to_words(remainder, tag, ordinal) + else: + triple = three_digit_to_words(remainder) + number = number // 1000 + if remainder == 0 and number != 0: + i += 3 + continue + + if i == 0: + words.append(triple) + else: + if remainder == 1: + tag = 'subst:sg:nom:m3' + elif remainder % 10 in [2, 3, 4]: + tag = 'subst:pl:nom:m3' + else: + tag = 'subst:pl:gen:m3' + form = get_word_form(large_numbers[i], tag) + if remainder == 1: + words.append(form) + else: + words.append(triple + ' ' + form) + i += 3 + return ' '.join(list(reversed(words))) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a21467591223c961f8cd6ec0ce92c273640e05aa --- /dev/null +++ b/src/utils.py @@ -0,0 +1,211 @@ +"""Module for useful functions.""" +from enum import Enum + +import morfeusz2 + + +class TokenType(Enum): + """Type of token.""" + + NUMBER = 1 + SPECIAL_CHARACTER = 2 + PUNCTUATION = 3 + CURRENCY = 4 + + +class NumberPlural(Enum): + """Type of number indicating what the word suffix will be. + + E.g: + SINGULAR 1$ - jeden dolar + SEVERAL (2-4) 2$ - dwa dolary + MANY (5+) 7$ - siedem dolarów + """ + + SINGULAR = 0 + SEVERAL = 1 + MANY = 2 + + +def to_number_plural(number): + """Convert a number to enumerate type, that indicates word suffix. + + Args: + number (int or string): Number to be converted. + + Returns: + NumberPlural: Enumerate, which indicates what the end of the word + will be. + + """ + number = int(number) + if number == 1: + return NumberPlural.SINGULAR + elif 2 <= number <= 4: + return NumberPlural.SEVERAL + else: + return NumberPlural.MANY + + +def is_simple_number(tokens, special_types): + """Checks if list of tokens creates a simple number. + + Simple number contains only digits and spaces between groups of three. + + Args: + tokens (list): List of tokens. + special_types (list): Types of tokens. + + Returns: + bool: Return True if joined tokens are simple number otherwise False. + + """ + numbers = [n for i, n in enumerate(tokens) + if special_types[i] == TokenType.NUMBER] + return (all([len(t) == 3 for t in numbers[1:]]) and + all([(s.isdigit() or s == ' ') for s in tokens])) + + +def is_fraction(tokens, decimal=False): + """Check is list of tokens are 2 numbers splitted by slash or dot. + + Args: + tokens (list): List of tokens. + decimal (bool, optional): If True delimiter is dot otherwise slash '/'. + Defaults to False. + + Returns: + bool: Return True if tokens are fraction otherwise False. + + """ + if len(tokens) < 3: + return False + delimiter = '.' if decimal else '/' + splitted = ''.join(tokens).split(delimiter) + return ((len(splitted) == 2) and + tokens.count(delimiter) == 1 and + all([(s.isdigit() or s in ' /.') for s in tokens])) + + +def trailing_zeros(number): + """Count trailing zeros in number. + + Returns: + int: Return number of trailing zeros. 
+ + """ + manipulandum = str(number) + return len(manipulandum) - len(manipulandum.rstrip('0')) + + +def search_form(forms, tag): + """Search for the correct form of word from all those returned by Morfeusz. + + Args: + forms (list of tuples): Tags and variations of words returned + by Morfeusz. + tag (str): The tag of the word whose form is being searched for. + + Returns: + str: Word properly conjugated with the given tag or None if not found. + + """ + for form in forms: + form_categories = [x.split('.') for x in form[2].split(':')] + gramm_categ_enum = enumerate(tag) + if all((c in form_categories[i] for i, c in gramm_categ_enum)): + return form[0] + return None + + +def get_word_form(text, tag): + """Change the word in the appropriate form with given morphological tag. + + Args: + text (str): Word to be changed. + tag (str): Morphological tag. + + Returns: + str: Word changed with given morphological tag. + + """ + if not tag: + return text + + morf = morfeusz2.Morfeusz() + all_forms = morf.generate(text) + + tag = tag.split(':') + forms = [x for x in all_forms if x[2].split(':')[0] == tag[0]] + form = search_form(forms, tag) + + if form: + return form + if len(tag) > 4: + tag = tag[:4] + form = search_form(forms, tag) + + if form: + return form + else: + return text + + +def subtract_from_first(list_of_tuples, offset): + """Subtract from every first element in tuples that make up list.""" + list_of_tuples = (list_of_tuples[0] - offset, *list_of_tuples[1:]) + return list_of_tuples + + +def check_and_replace(string_builder, find, replace, filtered_tokens): + """Check for matches in list and replace them with given tokens. + + Remove replaced tokens from `filtered_tokens` to to avoid double processing. + + Args: + string_builder (list of str): List of all words. + find (list of str): Tokens to be replaced. + replace (list of str): Words that will replace `find` tokens in + `string_builder`. + filtered_tokens (list of tuples): List of tokens and their features. + + Returns: + (list of str, list of tuples): Pair: list of words with replaced matched + tokens and filtered list of tokens and their feature with deleted + items that have been replaced. 
+ + """ + if not find or not replace: + return string_builder, filtered_tokens + + new_builder = string_builder.copy() + max_lenght = max(map(len, find)) + for i, token in enumerate(string_builder): + if not find: + break + to_remove = [i] + check = token + j = i + 1 + if check in find: + new_builder[i] = ''.join(replace[find.index(check)]) + filtered_tokens = list(filter(lambda x: x[0] != i, filtered_tokens)) + del find[0], replace[0] + continue + if check[0] != find[0][:len(check[0])]: + continue + while len(check) < max_lenght and j < len(string_builder): + check += string_builder[j] + to_remove.append(j) + if check in find: + index = find.index(check) + new_builder = new_builder[:i] + replace[index] + if j + 1 < len(string_builder): + new_builder += string_builder[j + 1:] + filtered_tokens = list(filter(lambda x: x[0] not in to_remove, + filtered_tokens)) + find.pop(index) + replace.pop(index) + if not find: + return new_builder, filtered_tokens + j += 1 + return new_builder, filtered_tokens diff --git a/src/wordifier.py b/src/wordifier.py index 8f1f2adc0e5eca0a47fb3cd1ad37e07e844eb790..0f4ed21312c13c03bfaf7c186221b427598e9ddd 100644 --- a/src/wordifier.py +++ b/src/wordifier.py @@ -1,255 +1,149 @@ """Implementation of wordifier functionality.""" -import morfeusz2 import re +import json +from itertools import islice + +from src.utils import is_simple_number, subtract_from_first, trailing_zeros, \ + check_and_replace, TokenType, NumberPlural, to_number_plural, is_fraction +from src.num2words import num2words +from src.date2words import date2words class Wordifier: - """Class used to edit sentences based on options.""" - - _num_list = [ - { - '0': 'zero', - '1': 'jeden', - '2': 'dwa', - '3': 'trzy', - '4': 'cztery', - '5': 'pięć', - '6': 'sześć', - '7': 'siedem', - '8': 'osiem', - '9': 'dziewięć' - }, - { - '10': 'dziesięć', - '11': 'jedenaście', - '12': 'dwanaście', - '13': 'trzynaście', - '14': 'czternaście', - '15': 'piętnaście', - '16': 'szesnaście', - '17': 'siedemnaście', - '18': 'osiemnaście', - '19': 'dziewiętnaście', - '20': 'dwadzieścia', - '30': 'trzydzieści', - '40': 'czterdzieści', - '50': 'pięćdziesiąt', - '60': 'sześćdziesiąt', - '70': 'siedemdziesiąt', - '80': 'osiemdziesiąt', - '90': 'dziewięćdziesiąt' + """Class for generating words from special characters or numbers.""" + + date_regex = re.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' 
+ r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' + r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' + r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)' + r'|Gru(?:|dzie[nń]|dnia))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I + ) + decimal_fraction_regex = re.compile(r'\d+[ ]?(\.)[ ]?\d+') + + number_punctuation = ' .,' + following_type = { + TokenType.NUMBER: [TokenType.NUMBER, TokenType.SPECIAL_CHARACTER, + TokenType.CURRENCY], + TokenType.SPECIAL_CHARACTER: [TokenType.SPECIAL_CHARACTER, + TokenType.NUMBER], + TokenType.CURRENCY: [] + } + + _denominator_tag = { + NumberPlural.SINGULAR: { + 'default': 'adj:sg:nom:f', + ('acc', 'dat', 'gen', 'loc'): { + ('f'): 'adj:sg:acc:f' + } }, - { - '1': 'sto', - '2': 'dwieście', - '3': 'trzysta', - '4': 'czterysta', - '5': 'pięćset', - '6': 'sześćset', - '7': 'siedemset', - '8': 'osiemset', - '9': 'dziewięćset' + NumberPlural.SEVERAL: { + 'default': 'adj:pl:acc:f', + ('dat'): { + ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f' + }, + ('gen', 'loc'): { + ('m1', 'm2', 'm3', 'f', 'n'): 'adj:pl:acc:m1' + }, + ('nom', 'voc'): { + ('m1'): 'adj:pl:acc:m1' + } }, - { - 3: 'tysiąc', - 6: 'milion', - 9: 'miliard', - 12: 'bilion', - 15: 'biliard', - 18: 'trylion', - 21: 'tryliard', - 24: 'kwadrylion', - 27: 'kwadryliard', - 30: 'kwintylion', - 33: 'kwintyliard', - 36: 'sekstylion', - 39: 'sekstyliard', - 42: 'septylion', - 45: 'septyliard', - 48: 'oktylion', - 51: 'oktyliard', - 54: 'nonilion', - 57: 'noniliard', - 60: 'decylion', - 63: 'decyliard', - 66: 'undecylion', - 69: 'undecyliard', - 72: 'duodecylion', - 75: 'duodecyliard', - 100: 'googol', - 600: 'centylion', - 603: 'centyliard' + NumberPlural.MANY: { + 'default': 'adj:pl:acc:m1', + ('acc', 'nom', 'voc'): { + ('m1'): 'adj:sg:dat:f' + }, + ('gen', 'dat', 'inst', 'loc'): { + ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f' + } } - ] - - _adj_list = [ - { - '0': 'zerowy', - '1': 'pierwszy', - '2': 'drugi', - '3': 'trzeci', - '4': 'czwarty', - '5': 'piąty', - '6': 'szósty', - '7': 'siódmy', - '8': 'ósmy', - '9': 'dziewiąty' - }, - { - '10': 'dziesiąty', - '11': 'jedenasty', - '12': 'dwunasty', - '13': 'trzynasty', - '14': 'czternasty', - '15': 'piętnasty', - '16': 'szesnasty', - '17': 'siedemnasty', - '18': 'osiemnasty', - '19': 'dziewiętnasty', - '20': 'dwudziesty', - '30': 'trzydziesty', - '40': 'czterdziesty', - '50': 'pięćdziesiąty', - '60': 'sześćdziesiąty', - '70': 'siedemdziesiąty', - '80': 'osiemdziesiąty', - '90': 'dziewięćdziesiąty' - }, - { - '1': 'setny', - '2': 'dwusetny', - '3': 'trzechsetny', - '4': 'czterechsetny', - '5': 'pięćsetny', - '6': 'sześćsetny', - '7': 'siedemsetny', - '8': 'osiemsetny', - '9': 'dziewięćsetny' - }, - { - 3: 'tysięczny', - 6: 'milionowy', - 9: 'miliardowy', - 12: 'bilionowy' - } - ] - - _script_translator = [ - { - '\u2070': '0', # SUPERSCRIPT ZERO - '\u00B9': '1', # SUPERSCRIPT ONE - '\u00B2': '2', # SUPERSCRIPT TWO - '\u00B3': '3', # SUPERSCRIPT THREE - '\u2074': '4', # SUPERSCRIPT FOUR - '\u2075': '5', # SUPERSCRIPT FIVE - '\u2076': '6', # SUPERSCRIPT SIX - '\u2077': '7', # SUPERSCRIPT SEVEN - '\u2078': '8', # SUPERSCRIPT EIGHT - '\u2079': '9', # SUPERSCRIPT NINE - }, - { - '\u2080': '0', # SUBSCRIPT ZERO - '\u2081': '1', # SUBSCRIPT ONE - '\u2082': '2', # SUBSCRIPT TWO - '\u2083': '3', # SUBSCRIPT THREE - '\u2084': '4', # 
SUBSCRIPT FOUR - '\u2085': '5', # SUBSCRIPT FIVE - '\u2086': '6', # SUBSCRIPT SIX - '\u2087': '7', # SUBSCRIPT SEVEN - '\u2088': '8', # SUBSCRIPT EIGHT - '\u2089': '9' # SUBSCRIPT NINE - }, - { - '\u00BC': '1/4', # VULGAR FRACTION ONE QUARTER - '\u00BD': '1/2', # VULGAR FRACTION ONE HALF - '\u00BE': '3/4', # VULGAR FRACTION THREE QUARTERS - } - ] + } + + special_character_numbers_map = { + '+': 'plus', + '-': 'minus', + '/': 'przez', + '*': 'razy', + '%': 'procent', + '&': 'ampersand', + '=': 'równa się', + '^': 'do potęgi', + '#': 'numer' + } + special_character_map = { + '+': 'plus', + '-': '-', + '/': 'ukośnik', + '%': 'procent', + '&': 'i', + '=': 'równa się', + '^': 'kareta', + '#': 'kratka' + } def __init__(self): """Class initialization.""" - self._morf = morfeusz2.Morfeusz() self.unmarshallers = { 'chunk': lambda *args: '\n', 'sentence': lambda *args: self._process_sent_tree(*args), } - self._one_dict = dict() - self._create_one_dict() - self._special_list = [] - self._special_dict = { - 'number': - lambda *args: self._get_number(*args), - 'superscript': - lambda *args: self._get_superscript(*args), - 'subscript': - lambda *args: self._get_subscript(*args), - '/': - lambda *args: '/', - 'number/': - lambda *args: self._get_number_slash(*args), - 'number/subscript': - lambda *args: self._get_number_sub_fraction(*args), - 'number/number': - lambda *args: self._get_number_number_fraction(*args), - 'superscript/': - lambda *args: self._get_superscript_slash(*args), - 'superscript/subscript': - lambda *args: self._get_script_fraction(*args), - 'superscript/number': - lambda *args: self._get_super_number_fraction(*args), - 'fraction': - lambda *args: self._get_fraction(*args), - 'scientific': - lambda *args: self._get_scientific(*args), - 'dot': - lambda *args: self._get_dot(*args), - '^': - lambda *args: '^', - 'number^': - lambda *args: self._get_number_hat(*args), - 'number^number': - lambda *args: self._get_number_to_number(*args), - 'number^superscript': - lambda *args: self._get_number_to_super(*args) - } - - def _create_one_dict(self): - for word in self._morf.generate('jeden'): - self._one_dict[word[0]] = True + with open('data/currencies.json', 'r') as currency_file: + self._currencies = json.load(currency_file) + self._wordify_tokens = [] def _process_sent_tree(self, sentence_subtree): string_builder = [] + tags = [] tok_id = 0 for elem in sentence_subtree: if elem.tag == 'tok': - tok = self._process_single_tok(tok_id, elem) - string_builder.append(tok) + token, tag = self._process_single_tok(tok_id, elem) + string_builder.append(token) string_builder.append(' ') + tags.append(tag) tok_id += 2 elif elem.tag == 'ns': tok_id -= 1 string_builder.pop() else: raise Exception('Unrecognized tag inside sentence: ' + elem.tag) - return self._process_sentence(string_builder) + return self._process_sentence(string_builder, tags) + + def _get_denominator_tag(self, nominator_plural, nom_case, nom_gender=None): + if nom_case == 'default' or nom_gender is None: + return self._denominator_tag[nominator_plural]['default'] + + for cases, value in self._denominator_tag[nominator_plural].items(): + if cases == 'default': + continue + if nom_case in cases: + for genders, tag in value.items(): + if nom_gender in genders: + return tag + return self._denominator_tag[nominator_plural]['default'] def _special_type(self, text): - if text == '/' or text == '\u002F': - return '/' - elif re.match(r'\d+\.\d+', text): - return 'dot' - elif re.match(r'\d+\^\d+', text): - return 'scientific' - elif text == 
r'^': - return '^' - elif all(char in self._script_translator[0] for char in text): - return 'superscript' - elif all(char in self._script_translator[1] for char in text): - return 'subscript' - elif all(char in self._script_translator[2] for char in text): - return 'fraction' + if text in self.special_character_map: + return TokenType.SPECIAL_CHARACTER + elif text in self._currencies: + return TokenType.CURRENCY elif text.isdigit(): - return 'number' + return TokenType.NUMBER return None def _process_single_tok(self, tok_id, tok_subtree): @@ -261,185 +155,7 @@ class Wordifier: elif elem.tag == 'lex': tag = self._process_lex(elem) word = self._process_word(tok_id, text, tag) - return word - - def _return_large_part(self, num, digit, tag=None, word_text=None): - if word_text: - last_word = word_text.split(' ')[-1] - if last_word == 'dwa' \ - or last_word == 'trzy' \ - or last_word == 'cztery': - tag = 'subst:pl:nom:m3' - else: - tag = 'subst:pl:gen:m3' - return self._return_number(num, 3, digit, tag, digit) - return self._return_number(num, 3, digit, tag) - - def _return_number(self, num, pos, digit, tag=None, key=None): - if tag: - return self._get_correct_form( - text=self._num_list[pos][digit], - tag=tag, - key=key - ) if num else self._get_correct_form( - text=self._adj_list[pos][digit], - tag=tag, - key=key - ) - return self._num_list[pos][digit] if num else self._adj_list[pos][digit] - - def _handle_two_digits(self, from_, to_, num, text, tag=None): - text = text[from_:to_] - if len(text) >= 2: - if text[-2] == '0': - return self._return_number(num, 0, text[-1], tag) - elif text[-2] == '1': - return self._return_number(num, 1, text[-2:], tag) - else: - if text[-1] == '0': - return self._return_number(num, 1, text[-2:], tag) - return self._return_number(num, 1, text[-2] + '0', tag) +\ - ' ' + self._return_number(num, 0, text[-1], tag) - elif len(text) >= 1: - return self._return_number(num, 0, text[-1], tag) - else: - return '' - - def _handle_three_digits(self, from_, to_, num, text, tag=None): - text = text[from_:to_] - if len(text) >= 3: - string = '' - if len(text) > 3: - string = ' ' - if text[-3] != '0': - if text[-2:] == '00': - return string + self._return_number(num, 2, text[-3], tag) - return string + self._return_number(num, 2, text[-3], tag) \ - + ' ' + self._handle_two_digits(from_, to_, num, text, tag) - if len(text) >= 2 and text[-2:] == '00': - return self._return_number(num, 0, text[-1], tag) - return self._handle_two_digits(from_, to_, num, text, tag) - - def _replace_correct_from(self, key, tag, base): - flex = self._get_correct_form(base, tag).lstrip(base) - return self._num_list[3][key] + flex - - def _handle_numbers(self, text, tag=None, word_text='', num=None): - if num is None and tag: - tag_list = tag.split(':') - num = not (tag_list[0] == 'adj' or tag_list[0] == 'subst') - elif num is None: - num = True - length = len(text) - if length <= 3: - word_text = word_text + self._handle_three_digits( - 0, - None, - num, - text, - tag - ) - elif length == 0: - raise Exception('Fragment recognized as number is empty!') - else: - new_text = text - digits = len(new_text) - int(len(new_text) / 3) * 3 - if digits != 0: - new_word = self._handle_two_digits( - 0, - digits, - num, - new_text, - tag - ) - if new_word in self._one_dict: - word_text = self._return_large_part( - num, - int(len(new_text) / 3) * 3, - tag, - None - ) - else: - word_text = new_word + ' ' + self._return_large_part( - num, - int(len(new_text) / 3) * 3, - tag, - new_word - ) - new_text = 
new_text[digits:] - if len(new_text.rstrip('0')) != 0: - word_text += ' ' - else: - return word_text - for k in reversed(range(0, int(len(new_text) / 3))): - key = k * 3 - new_word = self._handle_three_digits(0, 3, num, new_text, tag) - if new_word in self._one_dict and key != 0: - word_text += self._return_large_part(num, key, tag, None) - else: - word_text += new_word - if key != 0: - word_text += ' ' + self._return_large_part( - num, - key, - tag, - word_text - ) - new_text = new_text[3:] - if len(new_text.rstrip('0')) == 0: - return word_text - word_text += ' ' - return word_text - - def _replace_using(self, key, word_text, tag, base): - text_split = word_text.split(' ') - text_split[-1] = self._replace_correct_from(key, tag, base) - return ' '.join(text_split) - - def _correct_large_number(self, num, word_text, tag, key): - trailing_zeros = key - if not num and trailing_zeros >= 12: - if int(trailing_zeros / 3) * 2 != int(trailing_zeros / 6): - word_text = self._replace_using( - key, - word_text, - tag, - 'miliardowy') - else: - word_text = self._replace_using( - key, - word_text, - tag, - 'bilionowy' - ) - elif num and trailing_zeros >= 27: - if int(trailing_zeros / 3) * 2 != int(trailing_zeros / 6): - word_text = self._replace_using(key, word_text, tag, 'biliard') - elif trailing_zeros >= 54: - word_text = self._replace_using(key, word_text, tag, 'bilion') - else: - word_text = self._get_correct_form(word_text, tag) - else: - word_text = self._get_correct_form(word_text, tag) - return word_text - - def _get_correct_form(self, text, tag, key=None): - if tag is None: - return text - if key: - return self._correct_large_number(True, text, tag, key) - text_split = text.split(' ') - generated = self._morf.generate(text_split[-1]) - for form in generated: - is_correct = True - form_tag = form[2].split(':') - for i, t in enumerate(tag.split(':')): - if t not in form_tag[i].split('.'): - is_correct = False - break - if is_correct: - text_split[-1] = form[0] - return ' '.join(text_split) + return word, tag def _process_word(self, tok_id, text, tag): self._add_special(tok_id, text, tag) @@ -448,7 +164,7 @@ class Wordifier: def _add_special(self, tok_id, text, tag): s_type = self._special_type(text) if s_type: - self._special_list.append((tok_id, text, tag, s_type)) + self._wordify_tokens.append((tok_id, text, tag, s_type)) return text def _process_lex(self, lex_subtree): @@ -462,401 +178,290 @@ class Wordifier: raise Exception('Lex tag had no ctag inside!') return tag - def _get_number(self, string_builder, id_, tag, length): - if length > 1: - words = '' - j = length - i = 0 - while j > 0: - if string_builder[id_ + i] != ' ': - j -= 1 - i -= 1 - i += 1 - for j in range(0, length): - if string_builder[id_ + i] == ' ': - i += 1 - if len(string_builder[id_ + i]) <= 3: - if all(len(elem) == 3 or elem == ' ' for elem - in string_builder[id_ + i:id_ + 1]): - return words + self._handle_numbers( - text=''.join( - string_builder[id_ + i:id_ + 1]) - .replace(' ', ''), - tag=tag - ) - words += self._handle_numbers( - text=string_builder[id_ + i], - tag=tag - ) + ' ' - i += 1 - return words.rstrip() + def _handle_fraction(self, tokens, tags): + """Generate words from fraction splitted by slash '/'. + + Args: + tokens (list of str): List that contains numbers separated by + slash '/'. + + Returns: + str: Fraction as words. 
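+
+        Example:
+            Illustrative only (actual forms come from num2words and the
+            grammatical tag, which are assumptions here): tokens
+            ['3', '/', '4'] with a leading tag such as 'num:pl:nom:f'
+            are expected to give 'trzy czwarte', while very long
+            denominators fall back to the
+            '<numerator> przez <denominator>' reading.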
+ + """ + text = ''.join(tokens) + numerator, denominator = text.split('/') + tag_num = tags[0] + remainder = to_number_plural(int(numerator) % 10) + + tag_case, tag_gender = tag_num.split(':')[2:4] + tag_den = self._get_denominator_tag(remainder, tag_case, tag_gender) + + zeros = trailing_zeros(denominator) + if len(denominator) < 4 or \ + (zeros > 2 and 0 < len(denominator) - zeros <= 3): + return num2words(numerator, tag_num) + ' ' + \ + num2words(denominator, tag_den, True) else: - return self._handle_numbers(text=string_builder[id_], tag=tag) - - def _get_superscript(self, string_builder, id_, tag, length): - words = '' - i = 1 - length - new_text = [] - for j in range(0, length): - if string_builder[id_ + i] == ' ': - i += 1 - words += self._handle_numbers( - text=''.join(new_text), - tag=tag - ) + ' ' - for char in string_builder[id_ + i]: - new_text.append(self._script_translator[0][char]) - i += 1 - words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' - return words.rstrip() - - def _get_subscript(self, string_builder, id_, tag, length): - words = '' - i = 1 - length - for j in range(0, length): - if string_builder[id_ + i] == ' ': + return num2words(numerator) + ' przez ' + \ + num2words(denominator) + + def _handle_decimal_fraction(self, tokens): + """Generate words from decimal fraction splitted by dot. + + Args: + tokens (list of str): List that contains numbers separated by dot. + + Returns: + str: Decimal fraction as words. + + """ + text = ''.join(tokens) + number, numerator = text.split('.') + number = number.replace(' ', '') + tag_num = 'adj:sg:nom:f' if int(numerator) == 1 else 'num:pl:nom:f' + denominator = str(10 ** len(numerator)) + remainder = to_number_plural(int(numerator) % 10) + tag_den = self._get_denominator_tag(remainder, 'default') + if int(number) == 0: + return num2words(numerator, tag_num) + ' ' + \ + num2words(denominator, tag_den, True) + else: + return num2words(number) + ' i ' + \ + num2words(numerator, tag_num) + ' ' + \ + num2words(denominator, tag_den, True) + + def _check_decimal_fraction(self, tokens): + """Checks whether given list of tokens starts with decimal fraction. + + If contains fraction generate words from whole fraction otherwise + generate words from first number. + + Args: + tokens (list of str): List of tokens with number at the beginning. + + Returns: + str: Tokens that form a fraction or number. + int: The number of tokens that make up the fraction. 
+ + """ + match = self.decimal_fraction_regex.search(''.join(tokens[:5])) + if match and match.start() == 0: + tokens_match = tokens[0] + i = 1 + while tokens_match != match.group(0): + tokens_match += tokens[i] i += 1 - new_text = [] - for char in string_builder[id_ + i]: - new_text.append(self._script_translator[1][char]) - words += self._handle_numbers(text=''.join(new_text), tag=tag) + ' ' + return match.group(0), i - 1 + else: + return tokens[0], 0 + + def _handle_mixed_types(self, tokens, special_types, tags): + last_number_plural = NumberPlural.SINGULAR + if TokenType.NUMBER in special_types: + special_character_map = self.special_character_numbers_map + else: + special_character_map = self.special_character_map + i = 0 + iter_special_types = iter(special_types) + for token_type in iter_special_types: + if token_type == TokenType.SPECIAL_CHARACTER: + if tokens[i] in special_character_map: + tokens[i] = special_character_map[tokens[i]] + else: + tokens[i] = '' + elif token_type == TokenType.PUNCTUATION: + if tokens[i] == ' ': + tokens[i] = '' + elif token_type == TokenType.NUMBER: + number, skip = self._check_decimal_fraction(tokens[i:]) + if skip > 0: + words = self._handle_decimal_fraction(number) + if int(''.join(number).split('.')[0]) == 0: + last_number_plural = NumberPlural.FRACTION + else: + last_number_plural = NumberPlural.MANY + else: + words = num2words(number) + last_number_plural = to_number_plural(number) + tokens = tokens[:i] + [words] + tokens[i + skip + 1:] + if skip != 0: + next(islice(iter_special_types, skip - 1, skip), '') + elif token_type == TokenType.CURRENCY: + suffix = last_number_plural.value + tokens[i] = self._currencies[tokens[i]][suffix] i += 1 - return words.rstrip() - - def _get_number_slash(self, string_builder, id_, tag, length): - return self._get_number(string_builder, id_, tag, length - 1) + ' /' - - def _get_superscript_slash(self, string_builder, id_, tag, length): - return self._get_superscript( - string_builder=string_builder, - id_=id_, - tag=tag, - length=length - 1 - ) + ' /' - - def _handle_fraction(self, numerator, denominator, tag): - num = None - tag_list = tag.split(':') - if numerator == '1': - de_tag = 'adj:sg:' + tag_list[2] + ':f' - num_tag = 'adj:sg:' + tag_list[2] + ':f' - num = True - elif numerator == '2' or numerator == '3' or numerator == '4': - de_tag = 'adj:sg:' + tag_list[2] + ':n' - num_tag = 'num:pl:' + tag_list[2] + ':f' - elif numerator[-1] == '0': - de_tag = 'adj:sg:' + tag_list[2] + ':n' - num_tag = None + text = ' '.join([w for w in tokens if w != '']) + return text + + def _get_as_words(self, tokens, tags, special_types): + """Convert special tokens and numbers to words. + + Args: + tokens (list of str): List of tokens. + special_types (list of TokenType): Types of tokens. + + Returns: + str : Joined tokens converted to words. 
+ + """ + if is_simple_number(tokens, special_types): + numbers = ''.join([n for i, n in enumerate(tokens) + if special_types[i] == TokenType.NUMBER]) + return num2words(''.join(numbers), tags[-1]) + elif is_fraction(tokens): + return self._handle_fraction(tokens, tags) + elif is_fraction(tokens, decimal=True): + return self._handle_decimal_fraction(tokens) else: - de_tag = 'subst:pl:gen:' + tag_list[3] - num_tag = None - return self._get_correct_form( - text=self._handle_numbers(text=numerator, tag=num_tag, num=num), - tag=num_tag - ) + ' ' + self._get_correct_form( - text=self._handle_numbers(text=denominator, tag=de_tag), - tag=de_tag - ) - - def _get_fraction(self, string_builder, id_, tag, length): - string = '' - i = 1 - length - for j in range(length): - words = self._script_translator[2][string_builder[id_ + i + j]]\ - .split('/') - string += self._handle_fraction(words[0], words[1], tag) + ' ' - return string.rstrip(' ') - - def _translate_script( - self, - string_builder, - id_, - length, - superscript, - until=None, - from_=None - ): - result = '' - idx = 0 if superscript else 1 - i = 1 - length - p = 0 - if from_ is not None: - for p in range(length): - word = string_builder[id_ + p + i] - if word == from_: - i += p + 1 + return self._handle_mixed_types(tokens, special_types, tags) + + def _check_number_multipart(self, index, next_id, string_builder): + """Check if the next token is continuation of number with actual token. + + Args: + index (int): Actual token id. + next_id (int): Next token id. + string_builder (list of str): List of all words. + + Returns: + bool: Is next token continuation of a number. + + """ + return next_id == index + 1 or \ + (index + 2 == next_id and + string_builder[index + 1] in self.number_punctuation) + + def _join_tokens(self, token, string_builder): + """Combine tokens that form multi-part formulas. + + Args: + tokens (list of tuple): List of tokens and their features. + Every element contains index, word, morphological tag and + token type. + string_builder (list of str): List of all words. + + Returns: + list of tuple: List of joined tokens and their features. 
+ + """ + joined_tokens = [] + iter_wordify_tokens = enumerate(iter(self._wordify_tokens)) + for i, (index, token, tag, token_type) in iter_wordify_tokens: + j = i + 1 + tokens = [token] + tags = [tag] + special_types = [token_type] + start_id = index + + while j < len(self._wordify_tokens): + next_id, next_token, next_tag, \ + next_special_type = self._wordify_tokens[j] + if not self._check_number_multipart(index, next_id, + string_builder): + break + if next_special_type in self.following_type[token_type]: + if index + 2 == next_id: + tokens.append(string_builder[index + 1]) + special_types.append(TokenType.PUNCTUATION) + tags.append('') + tokens.append(next_token) + tags.append(next_tag) + special_types.append(next_special_type) + else: break - for j in range(length - p): - word = string_builder[id_ + j + i] - if until and word == until: - break - if word in self._script_translator[idx]: - result += self._script_translator[idx][word] - else: - break - return result - - def _get_script_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=True, - until='/' - ), - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=False, - from_='/' - ), - tag - ) - - def _get_super_number_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=True, - until='/' - ), - string_builder[id_], - tag - ) - - def _get_number_sub_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - string_builder[id_ - 2], - self._translate_script( - string_builder=string_builder, - id_=id_, - length=length, - superscript=False, - from_='/' - ), - tag - ) - - def _get_number_number_fraction(self, string_builder, id_, tag, length): - return self._handle_fraction( - string_builder[id_ - 2], - string_builder[id_], - tag - ) - - def _get_dot(self, string_builder, id_, tag, length): - word = ''.join(string_builder[id_ + 1 - length:id_ + 1]) - numbers = word.split('.') - return self._handle_numbers(numbers[0]) + ' i ' \ - + self._handle_fraction( - numerator=numbers[1], - denominator=str(10 ** int(len(numbers[1]))), - tag=tag - ) - - def _handle_additional_numbers( - self, - string_builder, - id_, - until, - tag, - length - ): - number = [] - i = 1 - length - for j in range(length): - word = string_builder[id_ + i + j] - if word == until: - break - number.append(word) - length = len(number) - if length > 1: - return self._handle_numbers( - text=' '.join(number[0:length]).rstrip(), - tag=tag - ) - return '' - - def _handle_powers(self, first_number, second_number, tag=None): - if first_number == '10': - return self._handle_numbers( - text='1' + '0' * int(second_number), - tag=tag, - num=True - ) - return self._handle_numbers(first_number) + ' do potęgi ' \ - + self._handle_numbers(second_number, 'adj:sg:gen:f') - - def _get_number_to_number(self, string_builder, id_, tag, length): - text = self._handle_additional_numbers( - string_builder, - id_, - '^', - tag, - length - ) - j = 0 - i = 1 - length - for k in range(length): - if string_builder[id_ + k + i] == '^': - j = k + 1 + i - break - if j < 0 or j >= length: - return text - if text: - text += ' ' - return text + self._handle_powers( - string_builder[id_ + j - 2], - string_builder[id_ + j], - tag - ) - - def _get_number_to_super(self, string_builder, 
id_, tag, length): - text = self._handle_additional_numbers( - string_builder, - id_, - '^', - tag, - length - ) - j = 0 - i = 1 - length - for k in range(length): - if string_builder[id_ + k + i] == '^': - j = k + 1 + i - break - if j == 0 or j >= length: - return text - if text: - text += ' ' - second_number = self._translate_script( - string_builder=string_builder, - id_=id_ + j, - length=length - j, - superscript=True - ) - return text + self._handle_powers( - string_builder[id_ + j - 2], - second_number, - tag - ) - - def _get_scientific(self, string_builder, id_, tag, length): - words = string_builder[id_].split('^') - return self._handle_powers(words[0], words[1], tag) - - def _get_number_hat(self, string_builder, id_, tag, length): - return self._get_number(string_builder, id_, tag, length - 1) + ' ^' - - def _get_as_words(self, id_, string_builder, tag, length, s_type): - if s_type in self._special_dict: - return self._special_dict[s_type](string_builder, id_, tag, length) - return '' - - @staticmethod - def _check_if_multipart(current_stype, s_type): - return ((current_stype == 'number' or - current_stype == 'superscript') and - s_type == '/') or\ - ((current_stype == 'superscript/' or - current_stype == 'number/') and - (s_type == 'number' or s_type == 'subscript')) or\ - (current_stype == s_type and - (s_type == 'number' or s_type == 'subscript' or - s_type == 'superscript')) or\ - ((current_stype == 'superscript/subscript' or - current_stype == 'number/subscript') and - s_type == 'subscript') or\ - (current_stype == 'number' and s_type == '^') or\ - (current_stype == 'number^' and - (s_type == 'number' or s_type == 'superscript')) - - @staticmethod - def _check_if_number_continuation(current_stype, s_type): - return not ((current_stype == s_type and - (s_type == 'number' or s_type == 'subscript' or - s_type == 'superscript')) or - ((current_stype == 'superscript/subscript' or - current_stype == 'number/subscript') and - s_type == 'subscript')) - - def _handle_special(self, string_builder): - if self._special_list: - it = iter(self._special_list) - id_, text, tag, s_type = next(it) - current_tag = tag - current_stype = s_type - current_id = id_ - length = 1 - for id_, text, tag, s_type in it: - if self._check_if_multipart(current_stype, s_type): - if id_ == current_id + 1 or ( - id_ == current_id + 2 and s_type == 'number' and - string_builder[current_id + 1] == ' '): - length += 1 - if self._check_if_number_continuation( - current_stype, - s_type - ): - current_stype += s_type - current_tag = tag - current_id = id_ - continue - new_text = self._get_as_words( - id_=current_id, - string_builder=string_builder, - tag=current_tag, - length=length, - s_type=current_stype - ) - string_builder = self._replace_string_in_builder( - string_builder=string_builder, - current_id=current_id, - length=length, - new_text=new_text - ) - length = 1 - current_tag = tag - current_stype = s_type - current_id = id_ - new_text = self._get_as_words( - id_=current_id, - string_builder=string_builder, - tag=current_tag, - length=length, - s_type=current_stype - ) - string_builder = self._replace_string_in_builder( - string_builder=string_builder, - current_id=current_id, - length=length, - new_text=new_text - ) - self._special_list.clear() - return string_builder - @staticmethod - def _replace_string_in_builder( - string_builder, - current_id, - length, - new_text - ): - j = current_id - i = length - while i > 0: - if not (string_builder[j] == ' ' or string_builder[j] == ''): - i -= 1 - 
string_builder[j] = '' - j -= 1 - string_builder[current_id] = new_text + next(iter_wordify_tokens) + index = next_id + token_type = next_special_type + j += 1 + joined_tokens.append((start_id, tokens, tags, special_types)) + return joined_tokens + + def _handle_special_types(self, string_builder): + """Convert special tokens to words and replace them in string builder. + + Args: + string_builder (list of str]): List of all words. + + Returns: + list of str: Return updated string builder with special tokens + replaced by words. + + """ + wordify_tokens = self._join_tokens(self._wordify_tokens, string_builder) + enum_special = enumerate(wordify_tokens) + for i, special_token in enum_special: + index, tokens, tags, token_type = special_token + words = self._get_as_words(tokens, tags, token_type) + no_tokens = len(tokens) + string_builder = string_builder[:index] + [words] + \ + string_builder[index + no_tokens:] + offset = no_tokens - 1 + wordify_tokens[i + 1:] = [subtract_from_first(x, offset) + for x in wordify_tokens[i + 1:]] + self._wordify_tokens.clear() return string_builder - def _process_sentence(self, string_builder): - string_builder = self._handle_special(string_builder) - string_builder[0] = string_builder[0].capitalize() + def _get_match_tag(self, match, string_builder, tags): + match = match.group(0) + j = 0 + for i, word in enumerate(string_builder): + if match.startswith(word): + acc = word + match_tags = [tags[j]] + tmp = j + while i < len(string_builder) - 1 and len(acc) < len(match): + i += 1 + acc += string_builder[i] + if acc != match[:len(acc)]: + break + if string_builder[i] != ' ': + j += 1 + match_tags.append(tags[j]) + j = tmp + if acc == match: + return match_tags + if word != ' ': + j += 1 + return [] + + def _handle_regexes(self, string_builder, tags): + """Check for regexes in the given builder and replace them with words. + + Args: + string_builder (list of str): List of all words. + + Returns: + list of str: Updated string builder with matches replaced by words. + + """ + sentence = ''.join(string_builder) + matches = list(self.date_regex.finditer(sentence)) + if not matches: + return string_builder + replace = [] + for match in matches: + date_tags = self._get_match_tag(match, string_builder, tags) + replace.append(date2words(match, date_tags)) + matches = list(map(lambda m: m.group(0), matches)) + builder, self._wordify_tokens = check_and_replace(string_builder, + matches, replace, + self._wordify_tokens) + return builder + + def _process_sentence(self, string_builder, tags): + """Process a sentence and replace special tokens (eg. numbers) words. + + Args: + string_builder (list of str): List of all words. + + Returns: + str: Sentece with replaced special tokens. 
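+
+        Example:
+            Illustrative: a builder for 'Mam 3/4 tortu' is expected to be
+            rendered as 'Mam trzy czwarte tortu'; dates matched by
+            date_regex are expanded by date2words before the remaining
+            special tokens are converted.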
+ + """ + string_builder = self._handle_regexes(string_builder, tags) + string_builder = self._handle_special_types(string_builder) + if string_builder[0] and not string_builder[0][0].isupper(): + string_builder[0] = string_builder[0].capitalize() return ''.join(string_builder) diff --git a/src/worker.py b/src/worker.py index 45fccfedf1993cae249fc3406ad8f6fc729f4230..8dfe2f8c957b0a0072886f4e6138a944ce9fe9f6 100755 --- a/src/worker.py +++ b/src/worker.py @@ -4,7 +4,7 @@ import logging import nlp_ws from src.wordifier import Wordifier -from src.ccl_handler import Ccl_handler +from src.ccl_handler import CCLHandler _log = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class Worker(nlp_ws.NLPWorker): """One time static initialisation.""" def process(self, input_file, task_options, output_file): - """A.""" + """Processing an input file and generating tokens converted to words.""" wordifier = Wordifier() - ccl_handler = Ccl_handler(input_file) + ccl_handler = CCLHandler(input_file) ccl_handler.process(output_file, wordifier.unmarshallers) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/test_num2words.py b/tests/test_num2words.py new file mode 100644 index 0000000000000000000000000000000000000000..23b06e6ff93dc325dbe96083d3f84ec20c845907 --- /dev/null +++ b/tests/test_num2words.py @@ -0,0 +1,136 @@ +import unittest +from parameterized import parameterized, param + +from src.num2words import num2words + + +class TestNum2Words(unittest.TestCase): + single_tag = 'adj:sg:nom:f' + several_tag = 'adj:pl:acc:f' + many_tag = 'adj:pl:acc:m1' + + @parameterized.expand([ + param('0', 'zero'), + param('08', 'osiem'), + param('12', 'dwanaście'), + param('23', 'dwadzieścia trzy'), + param('48', 'czterdzieści osiem'), + param('187', 'sto osiemdziesiąt siedem'), + param('249', 'dwieście czterdzieści dziewięć'), + param('600', 'sześćset'), + param('720', 'siedemset dwadzieścia'), + param('304', 'trzysta cztery'), + + param('1000', 'tysiąc'), + param('425000', 'czterysta dwadzieścia pięć tysięcy'), + param('102000', 'sto dwa tysiące'), + param('390000', 'trzysta dziewięćdziesiąt tysięcy'), + param('701000', 'siedemset jeden tysięcy'), + param('993999', 'dziewięćset dziewięćdziesiąt trzy tysiące ' + 'dziewięćset dziewięćdziesiąt dziewięć'), + param('1000642', 'milion sześćset czterdzieści dwa'), + param('2001003', 'dwa miliony tysiąc trzy'), + param('18456000', 'osiemnaście milionów ' + 'czterysta pięćdziesiąt sześć tysięcy'), + param('1000000000', 'miliard') + ]) + def test_numbers(self, number, words): + self.assertEqual(num2words(number), words) + + @parameterized.expand([ + param('0', 'zerowy', ordinal=True), + param('1', 'pierwszy', ordinal=True), + param('10', 'dziesiąty', ordinal=True), + param('15', 'piętnasty', ordinal=True), + param('31', 'trzydziesty pierwszy', ordinal=True), + param('70', 'siedemdziesiąty', ordinal=True), + param('099', 'dziewięćdziesiąty dziewiąty', ordinal=True), + param('100', 'setny', ordinal=True), + param('102', 'sto drugi', ordinal=True), + param('183', 'sto osiemdziesiąty trzeci', ordinal=True), + param('201', 'dwieście pierwszy', ordinal=True), + + param('1000', 'tysięczny', ordinal=True), + param('1005', 'tysiąc piąty', ordinal=True), + param('2000', 'dwutysięczny', ordinal=True), + param('2020', 'dwa tysiące dwudziesty', ordinal=True), + param('10000', 'dziesięciotysięczny', ordinal=True), + param('100856', 'sto tysięcy osiemset pięćdziesiąty 
szósty', + ordinal=True), + param('1000000', 'milionowy', ordinal=True), + param('1002003', 'milion dwa tysiące trzeci', ordinal=True), + param('1948052296', 'miliard dziewięćset czterdzieści osiem milionów ' + 'pięćdziesiąt dwa tysiące ' + 'dwieście dziewięćdziesiąty szósty', ordinal=True), + ]) + def test_ordinal_numbers(self, number, words, ordinal): + self.assertEqual(num2words(number, ordinal=ordinal), words) + + @parameterized.expand([ + ('1', 'adj:sg:nom:f', 'jedna'), + ('2', 'num:pl:nom:f', 'dwie') + ]) + def test_numbers_numerator(self, number, tag, words): + self.assertEqual(num2words(number, tag), words) + + @parameterized.expand([ + param('1', 'pierwsza'), + param('2', 'druga'), + param('5', 'piąta'), + param('10', 'dziesiąta'), + param('31', 'trzydziesta pierwsza'), + param('100', 'setna'), + param('102', 'sto druga'), + param('512', 'pięćset dwunasta'), + param('600', 'sześćsetna'), + + param('1000', 'tysięczna'), + param('2002', 'dwa tysiące druga'), + param('3000', 'trzytysięczna'), + param('1000000000', 'miliardowa'), + param('1473022977', 'miliard czterysta siedemdziesiąt trzy miliony ' + 'dwadzieścia dwa tysiące dziewięćset siedemdziesiąta siódma'), + ]) + def test_single_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.single_tag, ordinal), words) + + @parameterized.expand([ + param('3', 'trzecie'), + param('6', 'szóste'), + param('10', 'dziesiąte'), + param('47', 'czterdzieste siódme'), + param('100', 'setne'), + param('101', 'sto pierwsze'), + param('300', 'trzechsetne'), + param('981', 'dziewięćset osiemdziesiąte pierwsze'), + + param('1000', 'tysięczne'), + param('8000', 'ośmiotysięczne'), + param('10000', 'dziesięciotysięczne'), + param('100000', 'stutysięczne'), + param('1000115376708', 'bilion sto piętnaście milionów ' + 'trzysta siedemdziesiąt sześć tysięcy siedemset ósme'), + ]) + def test_several_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.several_tag, ordinal), words) + + @parameterized.expand([ + param('4', 'czwartych'), + param('8', 'ósmych'), + param('10', 'dziesiątych'), + param('69', 'sześćdziesiątych dziewiątych'), + param('100', 'setnych'), + param('212', 'dwieście dwunastych'), + param('700', 'siedemsetnych'), + param('901', 'dziewięćset pierwszych'), + + param('1000', 'tysięcznych'), + param('6000', 'sześciotysięcznych'), + param('10000', 'dziesięciotysięcznych'), + param('1000000', 'milionowych'), + param('238055017238', 'dwieście trzydzieści osiem miliardów ' + 'pięćdziesiąt pięć milionów siedemnaście tysięcy ' + 'dwieście trzydziestych ósmych'), + ]) + def test_many_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.many_tag, ordinal), words) diff --git a/tox.ini b/tox.ini index 1516042f9a295eeb21fef06a36f317ec805e370e..67d5403ab50027aa81fac8d52de8b1d10379e086 100755 --- a/tox.ini +++ b/tox.ini @@ -40,5 +40,5 @@ max-line-length = 80 # D410 Missing blank line after section # D411 Missing blank line before section ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 -match-dir = ^(?!\.tox|venv).* +match-dir = ^(?!\.tox|venv|tests).* match = ^(?!setup).*\.py \ No newline at end of file
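
A quick usage sketch for the number-to-words helper exercised by the tests
above. This is a minimal, illustrative snippet, not part of the patch: it
assumes the module is importable as src.num2words and that num2words has the
signature num2words(number, tag=None, ordinal=False) implied by
tests/test_num2words.py; the expected strings are copied from those tests.

    # Hypothetical usage example; expected outputs come from the
    # parameterized test cases in tests/test_num2words.py.
    from src.num2words import num2words

    assert num2words('23') == 'dwadzieścia trzy'
    assert num2words('31', ordinal=True) == 'trzydziesty pierwszy'
    assert num2words('1', 'adj:sg:nom:f') == 'jedna'  # feminine numerator form
    assert num2words('2', 'num:pl:nom:f') == 'dwie'   # feminine numerator form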