diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f622468222a6d36fc394b19a7a0de058fa0f40ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ +.vscode \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100755 index 0000000000000000000000000000000000000000..811491d2847c21836f8d5d156d70fceb848f1526 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,49 @@ +image: "clarinpl/python:3.6" +cache: + paths: + - .tox +stages: + - check_style + - test + - build + +pep8: + stage: check_style + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e pep8 + +docstyle: + stage: check_style + before_script: + - pip install tox==2.9.1 + script: + - tox -v -e docstyle + +test: + stage: test + image: "docker:18.09.7" + services: + - "docker:18.09.7-dind" + script: + - docker build -t clarinpl/wordifier . + - docker run --rm + -v "$(pwd)/requirements-dev.txt:/home/worker/requirements-dev.txt" + -v "$(pwd)/tests:/home/worker/tests" + clarinpl/wordifier + sh -c 'pip3 install -r requirements-dev.txt ; nose2 -v tests' + +build: + stage: build + image: "docker:18.09.7" + only: + - master + services: + - "docker:18.09.7-dind" + script: + - docker build -t clarinpl/wordifier . 
+ - echo $DOCKER_PASSWORD > pass.txt + - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin + - rm pass.txt + - docker push clarinpl/wordifier diff --git a/Dockerfile b/Dockerfile new file mode 100755 index 0000000000000000000000000000000000000000..2dfcce57a1c900a55566eb90d6797c6bc40d4001 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM clarinpl/python:3.6 + +WORKDIR /home/worker + +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN update-alternatives --set python /usr/bin/python3.6 + +RUN apt-get update && apt-get install -y morfeusz2 + +RUN wget -O morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl http://download.sgjp.pl/morfeusz/20200913/Linux/18.04/64/morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl + +RUN python3.6 -m pip install morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl + +COPY ./src ./src +COPY ./main.py . +COPY ./requirements.txt . +COPY ./data ./data + +RUN python3.6 -m pip install -r requirements.txt + +CMD ["python3.6", "main.py", "service"] \ No newline at end of file diff --git a/README.md b/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3741fc1f3740628f08f560b5c2d6186bb09a2019 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,15 @@ +# Wordifier + +A service that expands abbreviations into full texts. The following modules are implemented at this time: +- verbal notation of digits, numbers, decimal and ordinary fractions (with separators '.' and '/') +- verbal notation of simple equations with addition, subtraction, multiplication and division +- verbal notation of dates + - recognizing different ways to write dates. 
+ - 25.12.2010 or 25,12,12 (day/month, day/month, year) + - 2009-08-30 or 20 08 30 (year, day/month, day/month) + - 12 Jan 2010 or 31 Jan 1998 (day, month, year) + - Mar 12 (month, year) + - Dec 15 (month, day) + - April 30 2000 (month, day, year) +- replacement of currency symbols with words +- verbal notation of special characters (%, &, #, ^, =, +, -, /) \ No newline at end of file diff --git a/config.ini b/config.ini new file mode 100755 index 0000000000000000000000000000000000000000..284524556ac7624660962bb8d6db7da28ef41903 --- /dev/null +++ b/config.ini @@ -0,0 +1,20 @@ +[service] +tool = wordifier + +root = /samba/requests/ +rabbit_host = rabbitmq +rabbit_user = test +rabbit_password = test +queue_prefix = nlp_ + +[tool] +workers_number = 5 +processed_lines = 1000 + +[logging] +port = 9998 +local_log_level = INFO + +[logging_levels] +__main__ = INFO + diff --git a/data/currencies.json b/data/currencies.json new file mode 100644 index 0000000000000000000000000000000000000000..2a29d8155da515c8e4f1fcdaca4342e64ac148c2 --- /dev/null +++ b/data/currencies.json @@ -0,0 +1,1514 @@ +{ + "$": [ + "dolar", + "dolary", + "dolarów", + "dolara" + ], + "USD": [ + "dolar amerykański", + "dolary amerykańskie", + "dolarów amerykańskich", + "dolara amerykańskiego" + ], + "CA$": [ + "dolar kanadyjski", + "dolary kanadyjskie", + "dolarów kanadyjskich", + "dolara kanadyjskiego" + ], + "CAD": [ + "dolar kanadyjski", + "dolary kanadyjskie", + "dolarów kanadyjskich", + "dolara kanadyjskiego" + ], + "\u20ac": [ + "Euro", + "Euro", + "Euro", + "Euro" + ], + "EUR": [ + "Euro", + "Euro", + "Euro", + "Euro" + ], + "\u062f.\u0625.\u200f": [ + "Dirham Zjednoczonych Emiratów Arabskich", + "Dirhamy Zjednoczonych Emiratów Arabskich", + "Dirhamów Zjednoczonych Emiratów Arabskich", + "Dirhama Zjednoczonych 
Emiratów Arabskich" + ], + "\u060b": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "Af": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "AFN": [ + "Afgani", + "Afgani", + "Afgani", + "Afgani" + ], + "Lek": [ + "lek", + "leki", + "leków", + "leka" + ], + "ALL": [ + "lek", + "leki", + "leków", + "leka" + ], + "\u0564\u0580.": [ + "armeński dram", + "armeńskie dramy", + "armeńskich dramów", + "armeńskiego drama" + ], + "AMD": [ + "armeński dram", + "armeńskie dramy", + "armeńskich dramów", + "armeńskiego drama" + ], + "AR$": [ + "argentyńskie peso", + "argentyńskie pesos", + "argentyńskich pesos", + "argentyńskiego peso" + ], + "ARS": [ + "argentyńskie peso", + "argentyńskie pesos", + "argentyńskich pesos", + "argentyńskiego peso" + ], + "AU$": [ + "dolar australijski", + "dolary australijskie", + "dolarów australijskich", + "dolara australijskiego" + ], + "AUD": [ + "dolar australijski", + "dolary australijskie", + "dolarów australijskich", + "dolara australijskiego" + ], + "\u043c\u0430\u043d.": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "man.": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "AZN": [ + "manat azerbejdżański", + "manaty azerbejdżańskie", + "manatów azerbejdżańskich", + "manata azerbejdżańskiego" + ], + "KM": [ + "marka zamienna", + "marki zamienne", + "marek zamiennych", + "marki zamiennej" + ], + "BAM": [ + "marka zamienna", + "marki zamienne", + "marek zamiennych", + "marki zamiennej" + ], + "\u09f3": [ + "taka", + "taka", + "taka", + "taka" + ], + "Tk": [ + "taka", + "taka", + "taka", + "taka" + ], + "BDT": [ + "taka", + "taka", + "taka", + "taka" + ], + "\u043b\u0432.": [ + "lew", + "lewy", + "lewów", + "lewa" + ], + "BGN": [ + "lew", + "lewy", + "lewów", + "lewa" + ], + "\u062f.\u0628.\u200f": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara 
bahrajskiego" + ], + "BD": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "BHD": [ + "dinar bahjraski", + "dinary bahrajskie", + "dinarów bahrajskich", + "dinara bahrajskiego" + ], + "FBu": [ + "frank burundyjski", + "franki burundyjskie", + "franków burundyjskich", + "franka burundyjskiego" + ], + "BIF": [ + "frank burundyjski", + "franki burundyjskie", + "franków burundyjskich", + "franka burundyjskiego" + ], + "BN$": [ + "dolar brunejski", + "dolary brunejskie", + "dolarów brunejskich", + "dolara brunejskiego" + ], + "BND": [ + "dolar brunejski", + "dolary brunejskie", + "dolarów brunejskich", + "dolara brunejskiego" + ], + "Bs": [ + "boliviano", + "bolivianos", + "bolivianos", + "boliviano" + ], + "BOB": [ + "boliviano", + "bolivianos", + "bolivianos", + "boliviano" + ], + "R$": [ + "real brazylijski", + "reale brazylijskie", + "realów brazylijskich", + "reala brazylijskiego" + ], + "BRL": [ + "real brazylijski", + "reale brazylijskie", + "realów brazylijskich", + "reala brazylijskiego" + ], + "P": [ + "pula", + "pula", + "pula", + "pula" + ], + "BWP": [ + "pula", + "pula", + "pula", + "pula" + ], + "\u0440\u0443\u0431.": [ + "rubel białoruski", + "ruble białoruskie", + "rubli białoruskich", + "rubla białoruskiego" + ], + "Br": [ + "birr", + "birry", + "birrów", + "birra" + ], + "BYN": [ + "rubel białoruski", + "ruble białoruskie", + "rubli białoruskich", + "rubla białoruskiego" + ], + "BZ$": [ + "dolar belizeński", + "dolary belizeńskie", + "dolarów belizeńskich", + "dolara belizeńskiego" + ], + "BZD": [ + "dolar belizeński", + "dolary belizeńskie", + "dolarów belizeńskich", + "dolara belizeńskiego" + ], + "FrCD": [ + "frank kongijski", + "franki kongijskie", + "franków kongijskich", + "franka kongijskiego" + ], + "CDF": [ + "frank kongijski", + "franki kongijskie", + "franków kongijskich", + "franka kongijskiego" + ], + "CHF": [ + "frank szwajcarski", + "franki szwajcarskie", + "franków szwajcarskich", + 
"franka szwajcarskiego" + ], + "CL$": [ + "peso chilijskie", + "peso chilijskie", + "pesos chilijskich", + "peso chilijskiego" + ], + "CLP": [ + "peso chilijskie", + "peso chilijskie", + "pesos chilijskich", + "peso chilijskiego" + ], + "CN\u00a5": [ + "yuan", + "yuan", + "yuan", + "yuan" + ], + "CNY": [ + "yuan", + "yuan", + "yuan", + "yuan" + ], + "CO$": [ + "peso kolumbijskie", + "peso kolumbijskie", + "pesos kolumbijskich", + "peso kolumbijskiego" + ], + "COP": [ + "peso kolumbijskie", + "peso kolumbijskie", + "pesos kolumbijskich", + "peso kolumbijskiego" + ], + "\u20a1": [ + "colón kostarykański", + "colóny kostarytańskie", + "colónów kostarytańskich", + "colóna kostaryńskiego" + ], + "CRC": [ + "colón kostarykański", + "colóny kostarytańskie", + "colónów kostarytańskich", + "colóna kostaryńskiego" + ], + "CV$": [ + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka" + ], + "CVE": [ + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka", + "escudo Zielonego Przylądka" + ], + "K\u010d": [ + "czeska korona", + "czeskie korony", + "czeskich koron", + "czeskiej korony" + ], + "CZK": [ + "czeska korona", + "czeskie korony", + "czeskich koron", + "czeskiej korony" + ], + "Fdj": [ + "frank dżibutyjski", + "franki dżibutyjskie", + "franków dżibutyjskich", + "franka dżibutyjskiego" + ], + "DJF": [ + "frank dżibutyjski", + "franki dżibutyjskie", + "franków dżibutyjskich", + "franka dżibutyjskiego" + ], + "kr": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "Dkr": [ + "korona duńska", + "korony duńskie", + "koron duńskich", + "korony duńskiej" + ], + "DKK": [ + "korona duńska", + "korony duńskie", + "koron duńskich", + "korony duńskiej" + ], + "RD$": [ + "peso dominikańskie", + "peso dominikańskie", + "pesos dominikańskich", + "peso dominikańskiego" + ], + "DOP": [ + "peso dominikańskie", + "peso 
dominikańskie", + "pesos dominikańskich", + "peso dominikańskiego" + ], + "\u062f.\u062c.\u200f": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "DA": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "DZD": [ + "dinar algierski", + "dinary algierskie", + "dinarów algierskich", + "dinara algierskiego" + ], + "Ekr": [ + "korona estońska", + "korony estońskie", + "koron estońskich", + "korony estońskiej" + ], + "EEK": [ + "korona estońska", + "korony estońskie", + "koron estońskich", + "korony estońskiej" + ], + "\u062c.\u0645.\u200f": [ + "funt egipski", + "funty egipskie", + "funtów egipskich", + "funta egipskiego" + ], + "EGP": [ + "funt egipski", + "funty egipskie", + "funtów egipskich", + "funta egipskiego" + ], + "Nfk": [ + "nakfa", + "nakfy", + "nakf", + "nakfy" + ], + "ERN": [ + "nakfa", + "nakfy", + "nakf", + "nakfy" + ], + "ETB": [ + "birr", + "birry", + "birrów", + "birra" + ], + "\u00a3": [ + "funt szterling", + "funty szterling", + "funtów szterling", + "funta szterlinga" + ], + "GBP": [ + "funt szterling", + "funty szterling", + "funtów szterling", + "funta szterlinga" + ], + "GEL": [ + "lari", + "lari", + "lari", + "lari" + ], + "GH\u20b5": [ + "cedi", + "cedi", + "cedi", + "cedi" + ], + "GHS": [ + "cedi", + "cedi", + "cedi", + "cedi" + ], + "FG": [ + "frank gwinejski", + "franki gwinejskie", + "franków gwinejskich", + "franka gwinejskiego" + ], + "GNF": [ + "frank gwinejski", + "franki gwinejskie", + "franków gwinejskich", + "franka gwinejskiego" + ], + "Q": [ + "quetzal", + "quetzale", + "quetzali", + "quetzala" + ], + "GTQ": [ + "quetzal", + "quetzale", + "quetzali", + "quetzala" + ], + "HK$": [ + "dolar hongkoński", + "dolary hongkońskie", + "dolarów hongkońskich", + "dolara hongkońskiego" + ], + "HKD": [ + "dolar hongkoński", + "dolary hongkońskie", + "dolarów hongkońskich", + "dolara hongkońskiego" + ], + "L": [ + "lempira", + "lempiry", + 
"lempir", + "lempira" + ], + "HNL": [ + "lempira", + "lempiry", + "lempir", + "lempira" + ], + "kn": [ + "kuna", + "kuny", + "kun", + "kuny" + ], + "HRK": [ + "kuna", + "kuny", + "kun", + "kuny" + ], + "Ft": [ + "forint", + "forinty", + "forintów", + "forinta" + ], + "HUF": [ + "forint", + "forinty", + "forintów", + "forinta" + ], + "Rp": [ + "rupia indonezyjska", + "rupie indonezyjske", + "rupii indonezyjskych", + "rupii indonezyjskiej" + ], + "IDR": [ + "rupia indonezyjska", + "rupie indonezyjske", + "rupii indonezyjskych", + "rupii indonezyjskiej" + ], + "\u20aa": [ + "nowy izraelski szekel", + "nowe izraelskie szekle", + "nowych izraelskich szekli", + "nowego izraelskiego szekla" + ], + "ILS": [ + "nowy izraelski szekel", + "nowe izraelskie szekle", + "nowych izraelskich szekli", + "nowego izraelskiego szekla" + ], + "\u099f\u0995\u09be": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "Rs": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "INR": [ + "rupia indyjska", + "rupie indyjskie", + "rupii indyjskich", + "rupii indyjskiej" + ], + "\u062f.\u0639.\u200f": [ + "dinar iracki", + "dinary irackie", + "dinarów irackich", + "dinara irackiego" + ], + "IQD": [ + "dinar iracki", + "dinary irackie", + "dinarów irackich", + "dinara irackiego" + ], + "\ufdfc": [ + "rial irański", + "riale irańskie", + "riali irańskich", + "riala irańskiego" + ], + "IRR": [ + "rial irański", + "riale irańskie", + "riali irańskich", + "riala irańskiego" + ], + "Ikr": [ + "korona islandzka", + "korony islandzkie", + "koron islandzkich", + "korony islandzkiej" + ], + "ISK": [ + "korona islandzka", + "korony islandzkie", + "koron islandzkich", + "korony islandzkiej" + ], + "J$": [ + "dolar jamajski", + "dolary jamajskie", + "dolarów jamajskich", + "dolara jamajskiego" + ], + "JMD": [ + "dolar jamajski", + "dolary jamajskie", + "dolarów jamajskich", + "dolara jamajskiego" + ], + "\u062f.\u0623.\u200f": 
[ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "JD": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "JOD": [ + "dinar jordański", + "dinary jordańskie", + "dinarów jordańskich", + "dinara jordańskiego" + ], + "\uffe5": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "\u00a5": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "JPY": [ + "jen", + "jeny", + "jenów", + "jena" + ], + "Ksh": [ + "szyling kenijski", + "szylingi kenijskie", + "szylingów kenijskich", + "szylinga kenijskiego" + ], + "KES": [ + "szyling kenijski", + "szylingi kenijskie", + "szylingów kenijskich", + "szylinga kenijskiego" + ], + "\u17db": [ + "riel kambodżański", + "riele kambodżańskie", + "rieli kambodżańskich", + "riela kambodzańskiego" + ], + "KHR": [ + "riel kambodżański", + "riele kambodżańskie", + "rieli kambodżańskich", + "riela kambodzańskiego" + ], + "FC": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "CF": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "KMF": [ + "frank Komorów", + "franki Komorów", + "franków Komorów", + "franka Komorów" + ], + "\u20a9": [ + "won południowokoreański", + "wony południowokoreańskie", + "wonów południowokoreańskich", + "wona południowokoreańskiego" + ], + "KRW": [ + "won południowokoreański", + "wony południowokoreańskie", + "wonów południowokoreańskich", + "wona południowokoreańskiego" + ], + "\u062f.\u0643.\u200f": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "KD": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "KWD": [ + "dinar kuwejcki", + "dinary kuwejckie", + "dinarów kuwejckich", + "dinara kuwejckiego" + ], + "\u0442\u04a3\u0433.": [ + "tenge", + "tenge", + "tenge", + "tenge" + ], + "KZT": [ + "tenge", + "tenge", + "tenge", + "tenge" + ], + 
"\u0644.\u0644.\u200f": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "LB\u00a3": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "LBP": [ + "funt libański", + "funty libańskie", + "funtów libańskich", + "funta libańskiego" + ], + "SL Re": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "SLRs": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "LKR": [ + "rupia lankijska", + "rupie lankijskie", + "rupii lankijskich", + "rupii lankijskiej" + ], + "Lt": [ + "lit", + "lity", + "litów", + "lita" + ], + "LTL": [ + "lit", + "lity", + "litów", + "lita" + ], + "Ls": [ + "łat", + "łaty", + "łatów", + "łata" + ], + "LVL": [ + "łat", + "łaty", + "łatów", + "łata" + ], + "\u062f.\u0644.\u200f": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "LD": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "LYD": [ + "dinar libijski", + "dinary libijskie", + "dinarów libijskich", + "dinara libijskiego" + ], + "\u062f.\u0645.\u200f": [ + "dirham marokański", + "dirhamy marokańskie", + "dirhamów marokańskich", + "dirhama marokańskiego" + ], + "MAD": [ + "dirham marokański", + "dirhamy marokańskie", + "dirhamów marokańskich", + "dirhama marokańskiego" + ], + "MDL": [ + "Lej Mołdawii", + "Leje Mołdawii", + "Lei Mołdawii", + "Leja Mołdawii" + ], + "MGA": [ + "ariary", + "ariary", + "ariary", + "ariary" + ], + "MKD": [ + "denar macedoński", + "denary macedońskie", + "denarów macedońskich", + "denara macedońskiego" + ], + "K": [ + "kiat", + "kiaty", + "kiatów", + "kiata" + ], + "MMK": [ + "kiat", + "kiaty", + "kiatów", + "kiata" + ], + "MOP$": [ + "pataca", + "pataca", + "pataca", + "pataca" + ], + "MOP": [ + "pataca", + "pataca", + "pataca", + "pataca" + ], + "MURs": [ + "rupia Mauritiusu", + "rupie 
Mauritiusu", + "rupii Mauritiusu", + "rupii Mauritiusu" + ], + "MUR": [ + "rupia Mauritiusu", + "rupie Mauritiusu", + "rupii Mauritiusu", + "rupii Mauritiusu" + ], + "MX$": [ + "peso meksykańskie", + "peso meksykańskie", + "pesos meksykańskich", + "peso meksykańskiego" + ], + "MXN": [ + "peso meksykańskie", + "peso meksykańskie", + "pesos meksykańskich", + "peso meksykańskiego" + ], + "RM": [ + "ringgit", + "ringgit", + "ringgitów", + "ringgita" + ], + "MYR": [ + "ringgit", + "ringgit", + "ringgitów", + "ringgita" + ], + "MTn": [ + "metical", + "meticale", + "meticali", + "meticala" + ], + "MZN": [ + "metical", + "meticale", + "meticali", + "meticala" + ], + "N$": [ + "dolar namibijski", + "dolare namibijskie", + "dolarów namibijskich", + "dolara namibijskiego" + ], + "NAD": [ + "dolar namibijski", + "dolare namibijskie", + "dolarów namibijskich", + "dolara namibijskiego" + ], + "\u20a6": [ + "naira", + "naire", + "nair", + "naira" + ], + "NGN": [ + "naira", + "naire", + "nair", + "naira" + ], + "C$": [ + "cordoba oro", + "cordoby", + "córdob", + "cordoby" + ], + "NIO": [ + "cordoba oro", + "cordoby", + "córdob", + "cordoby" + ], + "Nkr": [ + "korona norweska", + "korony norweskie", + "koron norweskich", + "korony norweskiej" + ], + "NOK": [ + "korona norweska", + "korony norweskie", + "koron norweskich", + "korony norweskiej" + ], + "\u0928\u0947\u0930\u0942": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NPRs": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NPR": [ + "rupia nepalska", + "rupie nepalskie", + "rupii nepalskich", + "rupii nepalskiej" + ], + "NZ$": [ + "dolar nowozelandzki", + "dolary nowozelandzkie", + "dolarów nowozelandzkich", + "dolara nowozelandzkiego" + ], + "NZD": [ + "dolar nowozelandzki", + "dolary nowozelandzkie", + "dolarów nowozelandzkich", + "dolara nowozelandzkiego" + ], + "\u0631.\u0639.\u200f": [ + "rial omański", + "riale omańskie", + 
"riali omańskich", + "riala omańskiego" + ], + "OMR": [ + "rial omański", + "riale omańskie", + "riali omańskich", + "riala omańskiego" + ], + "B/.": [ + "balboa", + "balboa", + "balboa", + "balboa" + ], + "PAB": [ + "balboa", + "balboa", + "balboa", + "balboa" + ], + "S/.": [ + "sol", + "sole", + "soli", + "sola" + ], + "PEN": [ + "sol", + "sole", + "soli", + "sola" + ], + "\u20b1": [ + "peso filipińskie", + "peso filipińskie", + "pesos filipińskich", + "peso filipińskiego" + ], + "PHP": [ + "peso filipińskie", + "peso filipińskie", + "pesos filipińskich", + "peso filipińskiego" + ], + "\u20a8": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "PKRs": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "PKR": [ + "rupia pakistańska", + "rupie pakistańskie", + "rupii pakistańskich", + "rupii pakistańskiej" + ], + "z\u0142": [ + "złoty", + "złote", + "złotych", + "złotego" + ], + "PLN": [ + "złoty", + "złote", + "złotych", + "złotego" + ], + "\u20b2": [ + "guarani", + "guarani", + "guarani", + "guarani" + ], + "PYG": [ + "guarani", + "guarani", + "guarani", + "guarani" + ], + "\u0631.\u0642.\u200f": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "QR": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "QAR": [ + "rial katarski", + "riale katarskie", + "riali katarskich", + "riala katarskiego" + ], + "RON": [ + "lej rumuński", + "leje rumuńskie", + "lei rumuńsich", + "leja rumuńskiego" + ], + "\u0434\u0438\u043d.": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "din.": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "RSD": [ + "dinar serbski", + "dinary serbskie", + "dinarów serbskich", + "dinara serbskiego" + ], + "\u20bd.": [ + "rubel rosyjski", + "ruble rosyjskie", + "ruble 
rosyjskie", + "rubla rosyjskiego" + ], + "RUB": [ + "rubel rosyjski", + "ruble rosyjskie", + "ruble rosyjskie", + "rubla rosyjskiego" + ], + "FR": [ + "frank rwandyjski", + "franki rwandyjskie", + "franków rwandyjskich", + "franka rwandyjskiego" + ], + "RWF": [ + "frank rwandyjski", + "franki rwandyjskie", + "franków rwandyjskich", + "franka rwandyjskiego" + ], + "\u0631.\u0633.\u200f": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SR": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SAR": [ + "rial saudyjski", + "riale saudyjskie", + "riali saudyjskich", + "riala saudyjskiego" + ], + "SDG": [ + "funt sudański", + "funty sudańskie", + "funtów sudańskich", + "funta sudańskiego" + ], + "Skr": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "SEK": [ + "szwedzka korona", + "szwedzkie korony", + "szwedzkich koron", + "szwedzkiej korony" + ], + "S$": [ + "dolar singapurski", + "dolary singapurskie", + "dolarów singapurskich", + "dolara singapurskiego" + ], + "SGD": [ + "dolar singapurski", + "dolary singapurskie", + "dolarów singapurskich", + "dolara singapurskiego" + ], + "Ssh": [ + "szyling somalijski", + "szylingi somalijskie", + "szylingów somalijskich", + "szylinga somalijskiego" + ], + "SOS": [ + "szyling somalijski", + "szylingi somalijskie", + "szylingów somalijskich", + "szylinga somalijskiego" + ], + "\u0644.\u0633.\u200f": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "SY\u00a3": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "SYP": [ + "funt syryjski", + "funty syryjskie", + "funtów syryjskich", + "funta syryjskiego" + ], + "\u0e3f": [ + "bat tajlandzki", + "baty tajlandzkie", + "batów tajlandzkich", + "bata tajlandzkiego" + ], + "THB": [ + "bat tajlandzki", + "baty tajlandzkie", + "batów tajlandzkich", + "bata 
tajlandzkiego" + ], + "\u062f.\u062a.\u200f": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "DT": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "TND": [ + "dinar tunezyjski", + "dinary tunezyjskie", + "dinarów tunezyjskich", + "dinara tunezyjskiego" + ], + "T$": [ + "pa'anga", + "pa'anga", + "pa'anga", + "pa'anga" + ], + "TOP": [ + "pa'anga", + "pa'anga", + "pa'anga", + "pa'anga" + ], + "TL": [ + "lira turecka", + "liry tureckie", + "lir tureckich", + "liry tureckiej" + ], + "TRY": [ + "lira turecka", + "liry tureckie", + "lir tureckich", + "liry tureckiej" + ], + "TT$": [ + "dolar Trynidadu i Tobago", + "dolary Trynidadu i Tobago", + "dolarów Trynidadu i Tobago", + "dolara Trynidadu i Tobago" + ], + "TTD": [ + "dolar Trynidadu i Tobago", + "dolary Trynidadu i Tobago", + "dolarów Trynidadu i Tobago", + "dolara Trynidadu i Tobago" + ], + "NT$": [ + "dolar tajwański", + "dolary tajwańskie", + "dolarów tajwańskich", + "dolara tajwańskiego" + ], + "TWD": [ + "dolar tajwański", + "dolary tajwańskie", + "dolarów tajwańskich", + "dolara tajwańskiego" + ], + "TSh": [ + "szyling tanzański", + "szylingi tanzańskie", + "szylingów tanzańskich", + "szylinga tanzańskiego" + ], + "TZS": [ + "szyling tanzański", + "szylingi tanzańskie", + "szylingów tanzańskich", + "szylinga tanzańskiego" + ], + "\u20b4": [ + "hrywna", + "hrywny", + "hrywien", + "hrywny" + ], + "UAH": [ + "hrywna", + "hrywny", + "hrywien", + "hrywny" + ], + "USh": [ + "szyling ugandyjski", + "szylingi ugandyjskie", + "szylingów ugandyjskich", + "szylinga ugandyjskiego" + ], + "UGX": [ + "szyling ugandyjski", + "szylingi ugandyjskie", + "szylingów ugandyjskich", + "szylinga ugandyjskiego" + ], + "$U": [ + "peso urugwajskie", + "peso urugwajskie", + "pesos urugwajskie", + "peso urugwajskiego" + ], + "UYU": [ + "peso urugwajskie", + "peso urugwajskie", + "pesos urugwajskie", + "peso urugwajskiego" + 
], + "UZS": [ + "sum", + "sumy", + "sumów", + "suma" + ], + "Bs.F.": [ + "boliwar", + "boliwary", + "boliwarów", + "boliwara" + ], + "VEF": [ + "boliwar", + "boliwary", + "boliwarów", + "boliwara" + ], + "\u20ab": [ + "dong", + "dongi", + "dongów", + "donga" + ], + "VND": [ + "dong", + "dongi", + "dongów", + "donga" + ], + "FCFA": [ + "środkowoafrykański frank CFA", + "środkowoafrykańskie franki CFA", + "środkowoafrykańskich franków CFA", + "środkowoafrykańskiego franka CFA" + ], + "XAF": [ + "środkowoafrykański frank CFA", + "środkowoafrykańskie franki CFA", + "środkowoafrykańskich franków CFA", + "środkowoafrykańskiego franka CFA" + ], + "CFA": [ + "frank CFA Afryki Zachodniej", + "franki CFA Afryki Zachodniej", + "franków CFA Afryki Zachodniej", + "franka CFA Afryki Zachodniej" + ], + "XOF": [ + "frank CFA Afryki Zachodniej", + "franki CFA Afryki Zachodniej", + "franków CFA Afryki Zachodniej", + "franka CFA Afryki Zachodniej" + ], + "\u0631.\u064a.\u200f": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "YR": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "YER": [ + "rial jemeński", + "riale jemeńskie", + "riali jemeńskich", + "riala jemeńskiego" + ], + "R": [ + "rand", + "randy", + "randów", + "randa" + ], + "ZAR": [ + "rand", + "randy", + "randów", + "randa" + ], + "ZK": [ + "kwacha zambijska", + "kwacha zambijskie", + "kwacha zambijskich", + "kwacha zambijskiego" + ], + "ZMK": [ + "kwacha zambijska", + "kwacha zambijskie", + "kwacha zambijskich", + "kwacha zambijskiego" + ], + "ZWL$": [ + "dolar Zimbabwe", + "dolary Zimbabwe", + "dolarów Zimbabwe", + "dolara Zimbabwe" + ], + "ZWL": [ + "dolar Zimbabwe", + "dolary Zimbabwe", + "dolarów Zimbabwe", + "dolara Zimbabwe" + ] +} \ No newline at end of file diff --git a/data/numbers.json b/data/numbers.json new file mode 100644 index 0000000000000000000000000000000000000000..65cae973fa5aa242c6320cec6ecc6f4dbf4efc2e --- 
/dev/null +++ b/data/numbers.json @@ -0,0 +1,116 @@ +{ + "number_words": { + "0": "zero", + "1": "jeden", + "2": "dwa", + "3": "trzy", + "4": "cztery", + "5": "pięć", + "6": "sześć", + "7": "siedem", + "8": "osiem", + "9": "dziewięć", + "10": "dziesięć", + "11": "jedenaście", + "12": "dwanaście", + "13": "trzynaście", + "14": "czternaście", + "15": "piętnaście", + "16": "szesnaście", + "17": "siedemnaście", + "18": "osiemnaście", + "19": "dziewiętnaście", + "20": "dwadzieścia", + "30": "trzydzieści", + "40": "czterdzieści", + "50": "pięćdziesiąt", + "60": "sześćdziesiąt", + "70": "siedemdziesiąt", + "80": "osiemdziesiąt", + "90": "dziewięćdziesiąt", + "100": "sto", + "200": "dwieście", + "300": "trzysta", + "400": "czterysta", + "500": "pięćset", + "600": "sześćset", + "700": "siedemset", + "800": "osiemset", + "900": "dziewięćset" + }, + "ordinal_number_words": { + "0": "zerowy", + "1": "pierwszy", + "2": "drugi", + "3": "trzeci", + "4": "czwarty", + "5": "piąty", + "6": "szósty", + "7": "siódmy", + "8": "ósmy", + "9": "dziewiąty", + "10": "dziesiąty", + "11": "jedenasty", + "12": "dwunasty", + "13": "trzynasty", + "14": "czternasty", + "15": "piętnasty", + "16": "szesnasty", + "17": "siedemnasty", + "18": "osiemnasty", + "19": "dziewiętnasty", + "20": "dwudziesty", + "30": "trzydziesty", + "40": "czterdziesty", + "50": "pięćdziesiąty", + "60": "sześćdziesiąty", + "70": "siedemdziesiąty", + "80": "osiemdziesiąty", + "90": "dziewięćdziesiąty", + "100": "setny", + "200": "dwusetny", + "300": "trzechsetny", + "400": "czterechsetny", + "500": "pięćsetny", + "600": "sześćsetny", + "700": "siedemsetny", + "800": "osiemsetny", + "900": "dziewięćsetny" + }, + "large_numbers": { + "3": "tysiąc", + "6": "milion", + "9": "miliard", + "12": "bilion", + "15": "biliard", + "18": "trylion", + "21": "tryliard", + "24": "kwadrylion", + "27": "kwadryliard", + "30": "kwintylion", + "33": "kwintyliard", + "36": "sekstylion", + "39": "sekstyliard", + "42": "septylion", + "45": 
"septyliard", + "48": "oktylion", + "51": "oktyliard", + "54": "nonilion", + "57": "noniliard", + "60": "decylion", + "63": "decyliard", + "66": "undecylion", + "69": "undecyliard", + "72": "duodecylion", + "75": "duodecyliard", + "100": "googol", + "600": "centylion", + "603": "centyliard" + }, + "ordinal_large_numbers": { + "3": "tysięczny", + "6": "milionowy", + "9": "miliardowy", + "12": "bilionowy" + } +} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100755 index 0000000000000000000000000000000000000000..98462cca104e8ad985120f4938b88a67acb1ccc8 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,16 @@ +version: '3' +services: + wordifier: + container_name: clarin_wordifier + build: ./ + working_dir: /home/worker + command: + - python3.6 main.py service + environment: + - PYTHONUNBUFFERED=0 + volumes: + - '/samba:/samba' + - './config.ini:/home/worker/config.ini' + - './src:/home/worker/src' + - './tests:/home/worker/tests' + - './main.py:/home/worker/main.py' diff --git a/main.py b/main.py new file mode 100755 index 0000000000000000000000000000000000000000..ccb9f30badc8e2d2ae1b746a19260c4e7009471e --- /dev/null +++ b/main.py @@ -0,0 +1,34 @@ +"""Implementation of wordifier service.""" +import argparse +import nlp_ws +from src.worker import Worker + + +def get_args(): + """Gets command line arguments.""" + parser = argparse.ArgumentParser(description="wordifier") + + subparsers = parser.add_subparsers(dest="mode") + subparsers.required = True + + subparsers.add_parser( + "service", + help="Run as a service") + + return parser.parse_args() + + +def main(): + """Runs the program.""" + args = get_args() + + generators = { + "service": lambda: nlp_ws.NLPService.main(Worker), + } + + gen_fn = generators.get(args.mode, lambda: None) + gen_fn() + + +if __name__ == "__main__": + main() diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 
"""Implementation of ccl reading functionality."""
from xml.etree.ElementTree import iterparse


class CCLHandler:
    """Stream a CCL (XML) file through per-tag unmarshallers."""

    def __init__(self, ccl_file_name):
        """Initialize CCLHandler with a filename.

        Args:
            ccl_file_name (str): Path of the CCL (XML) file to read.

        """
        self._file_name = ccl_file_name

    def process(self, output_file, unmarshallers):
        """Process xml tags using unmarshallers and save in output_file.

        The input is parsed incrementally; every time an element is closed
        its tag is looked up in `unmarshallers` and, when a callable is
        registered, the string it returns is appended to the output.

        Args:
            output_file (str): Path of the file to write (overwritten).
            unmarshallers (dict): Maps a tag name to a callable taking the
                closed element and returning the text to write.

        """
        # Distinct names for the handles: the original rebound
        # `output_file` from the path to the open file object.
        with open(self._file_name, 'r', encoding='utf-8') as source, \
                open(output_file, 'w', encoding='utf-8') as sink:
            for _event, elem in iterparse(source):
                unmarshal = unmarshallers.get(elem.tag, None)
                if not unmarshal:
                    # Unhandled elements must stay intact: they are still
                    # needed when their handled ancestor is closed.
                    continue
                sink.write(unmarshal(elem))
                # Free the subtree once it has been written out.
                elem.clear()
"""Module for converting dates to words."""
import unicodedata

from babel import Locale

from src.num2words import num2words

date_tags = ['sg:gen:m3']


def check_none(token):
    """If token is none then convert to empty list otherwise return token."""
    if not token:
        return []
    return token


def _fold(text):
    """Lower-case `text` and strip combining diacritics (e.g. 'ń' -> 'n')."""
    decomposed = unicodedata.normalize('NFD', text.lower())
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def month_name_expansion(month):
    """Expand month abbreviation or change form.

    Names are compared with diacritics stripped, because the date regex
    also accepts spellings such as "paz", "styczen" or "wrzesnia".

    Args:
        month (str): Month abbreviation (three letters) or full name.

    Returns:
        str: Full month name in genitive case, or the lower-cased input
            when it is not a known Polish month name.

    """
    months = Locale('pl').months
    genitive = months['format']['wide']
    if len(month) == 3:
        sources = [months['format']['abbreviated']]
    else:
        sources = [genitive, months['stand-alone']['wide']]

    wanted = _fold(month)
    for names in sources:
        for number, name in names.items():
            if _fold(name) == wanted:
                return genitive[number]
    # Unknown name: leave it as it is instead of raising ValueError.
    return month.lower()


def date2words(date_match, tags=None):
    """Convert a date to list of words.

    The returned list keeps one item per original token (numbers, month
    name and every single punctuation character), so it can replace the
    matched tokens one for one.

    Args:
        date_match (re.Match): Date match with the named groups of
            `Wordifier.date_regex`.
        tags (list of str, optional): Morphological tags of the matched
            tokens. Only the first one is used. Defaults to None.

    Returns:
        list of str: List of words representing date.

    """
    if tags and ":".join(tags[0].split(":")[1:4]) in date_tags:
        corrected_tag = tags[0]
    else:
        corrected_tag = None

    if date_match['day_or_month_year']:
        day_month1 = num2words(date_match['day_month1'], corrected_tag,
                               ordinal=True)
        day_month2 = num2words(date_match['day_month2'], corrected_tag,
                               ordinal=True)
        year = num2words(date_match['year1'], corrected_tag, ordinal=True)

        # split punctuation into single characters and remove if None
        return [day_month1, *check_none(date_match['punct1']),
                day_month2, *check_none(date_match['punct2']), year]

    if date_match['year_month_or_day']:
        day_month3 = num2words(date_match['day_month3'], ordinal=True)
        day_month4 = num2words(date_match['day_month4'], ordinal=True)
        year = num2words(date_match['year2'], ordinal=True)

        # split punctuation into single characters and remove if None
        return [year, *check_none(date_match['punct3']), day_month3,
                *check_none(date_match['punct4']), day_month4]

    if not date_match['month_in_words']:
        return ['']

    # A day written after the month takes precedence over one before it.
    day = date_match['day2'] or date_match['day1']
    if day:
        day = num2words(day, corrected_tag, ordinal=True)

    year = ''
    if date_match['year3']:
        year = num2words(date_match['year3'], corrected_tag, ordinal=True)
    if date_match['year4']:
        year = num2words(date_match['year4'], corrected_tag, ordinal=True)

    if not day and not year:
        # A bare month name is left untouched.
        return [date_match['month']]

    month = month_name_expansion(date_match['month'])

    # split punctuation into single characters and remove if None
    if date_match['day2']:
        date_order = [month, *check_none(date_match['punct7']),
                      day, *check_none(date_match['punct8'])]
    elif date_match['day1']:
        date_order = [day, *check_none(date_match['punct5']),
                      month, *check_none(date_match['punct6'])]
    else:
        # Month followed directly by a year: keep the separator so the
        # words are not glued together and the token count is preserved.
        date_order = [month, *check_none(date_match['punct6'])]
    if year:
        date_order = date_order + [year]
    return [item if item else '' for item in date_order]
"""Module for converting numbers to words."""
import math
import json

from src.utils import get_word_form, trailing_zeros

# The data file contains Polish diacritics, so the encoding is explicit
# instead of depending on the platform default.
with open('data/numbers.json', 'r', encoding='utf-8') as numbers_file:
    numbers_dict = json.load(numbers_file)
    number_words = {int(k): v for k, v in numbers_dict['number_words'].items()}
    ordinal_number_words = {int(k): v for k, v
                            in numbers_dict['ordinal_number_words'].items()}
    large_numbers = {int(k): v for k, v
                     in numbers_dict['large_numbers'].items()}
    ordinal_large_numbers = {int(k): v for k, v
                             in numbers_dict['ordinal_large_numbers'].items()}


def three_digit_to_words(text, tag='', ordinal=False):
    """Convert three digits numbers to words with given tag. Util function.

    Args:
        text (str or int): Number in the range 0-999.
        tag (str, optional): Morphological tag. Defaults to ''.
        ordinal (bool, optional): Use ordinal words. Defaults to False.

    Returns:
        str: The number as words.

    """
    map_to_words = ordinal_number_words if ordinal else number_words

    number = int(text)
    if number == 0:
        return get_word_form(map_to_words[number], tag)

    units = number % 10
    tens = number % 100 - units
    hundreds = number // 100

    # Words are collected from the least significant part upwards and
    # reversed at the end.
    words = []
    if 0 < tens + units <= 20:
        # 1-20 have dedicated words.
        words.append(get_word_form(map_to_words[tens + units], tag))
    else:
        if units != 0:
            words.append(get_word_form(map_to_words[units], tag))
        if tens != 0:
            words.append(get_word_form(map_to_words[tens], tag))

    if hundreds != 0:
        if tens == 0 and units == 0:
            words.append(get_word_form(map_to_words[hundreds * 100], tag))
        else:
            # Inside a compound only the last parts are inflected; the
            # hundreds stay cardinal ("sto drugi").
            words.append(get_word_form(number_words[hundreds * 100], ''))

    return ' '.join(reversed(words))


def num2words(text, tag='', ordinal=False):
    """Converts a number to words.

    Args:
        text (str): Number written with digits only.
        tag (str, optional): Morphological tag. Defaults to ''.
        ordinal (bool, optional): If word should be derived from ordinal number.
            Defaults to False.

    Returns:
        str: Returns number as words with given tag.

    """
    number = int(text)

    if ordinal:
        # Round numbers such as 2000 or 10 000 form a single ordinal word
        # ("dwutysięczny"). Larger magnitudes without an ordinal entry
        # fall through to the generic path instead of raising KeyError.
        zeros = 3 * math.floor(trailing_zeros(number) / 3)
        if zeros in ordinal_large_numbers and 0 < len(text) - zeros <= 3:
            leading = number // 10 ** zeros
            if leading == 1:
                prefix = ''
            else:
                prefix = three_digit_to_words(str(leading), 'numcomp')
            return prefix + get_word_form(ordinal_large_numbers[zeros], tag)

    if len(text) <= 3 or number == 0:
        return three_digit_to_words(text, tag, ordinal)

    words = []
    exponent = 0
    while number > 0:
        remainder = number % 1000
        number = number // 1000
        if remainder == 0 and number != 0:
            # Empty group: nothing is pronounced for it.
            exponent += 3
            continue

        if exponent == 0:
            # Only the last group carries the requested tag / ordinal form.
            words.append(three_digit_to_words(remainder, tag, ordinal))
        else:
            if remainder == 1:
                form_tag = 'subst:sg:nom:m3'
            elif (remainder % 10 in (2, 3, 4) and
                  remainder % 100 not in (12, 13, 14)):
                form_tag = 'subst:pl:nom:m3'
            else:
                # Includes 12-14: "dwanaście tysięcy", not "tysiące".
                form_tag = 'subst:pl:gen:m3'
            form = get_word_form(large_numbers[exponent], form_tag)
            if remainder == 1:
                # "tysiąc", not "jeden tysiąc".
                words.append(form)
            else:
                words.append(three_digit_to_words(remainder) + ' ' + form)
        exponent += 3
    return ' '.join(reversed(words))
"""Module for useful functions."""
from enum import Enum


class TokenType(Enum):
    """Type of token."""

    NUMBER = 1
    SPECIAL_CHARACTER = 2
    PUNCTUATION = 3
    CURRENCY = 4


class NumberPlural(Enum):
    """Type of number indicating what the word suffix will be.

    The value is the index of the matching form in the currency lists.

    E.g:
        SINGULAR 1$ - jeden dolar
        SEVERAL (2-4) 2$ - dwa dolary
        MANY (5+) 7$ - siedem dolarów
        FRACTION 0.5$ - pięć dziesiątych dolara
    """

    SINGULAR = 0
    SEVERAL = 1
    MANY = 2
    FRACTION = 3


def to_number_plural(number):
    """Convert a number to enumerate type, that indicates word suffix.

    Follows the Polish rule: 1 is singular, numbers ending in 2-4 (but not
    12-14) take the "several" form, everything else the "many" form.

    Args:
        number (int or string): Number to be converted.

    Returns:
        NumberPlural: Enumerate, which indicates what the end of the word
            will be.

    """
    number = int(number)
    if number == 1:
        return NumberPlural.SINGULAR
    if 2 <= number % 10 <= 4 and not 12 <= number % 100 <= 14:
        return NumberPlural.SEVERAL
    return NumberPlural.MANY


def is_simple_number(tokens, special_types):
    """Checks if list of tokens creates a simple number.

    Simple number contains only digits and spaces between groups of three.

    Args:
        tokens (list): List of tokens.
        special_types (list): Types of tokens.

    Returns:
        bool: Return True if joined tokens are simple number otherwise False.

    """
    numbers = [token for token, token_type in zip(tokens, special_types)
               if token_type == TokenType.NUMBER]
    # Every group after the first one has to be a full thousands group.
    return (all(len(group) == 3 for group in numbers[1:]) and
            all(s.isdigit() or s == ' ' for s in tokens))


def is_fraction(tokens, decimal=False):
    """Check is list of tokens are 2 numbers splitted by slash or dot.

    Args:
        tokens (list): List of tokens.
        decimal (bool, optional): If True delimiter is dot otherwise slash '/'.
            Defaults to False.

    Returns:
        bool: Return True if tokens are fraction otherwise False.

    """
    if len(tokens) < 3:
        return False
    delimiter = '.' if decimal else '/'
    if tokens.count(delimiter) != 1:
        return False
    if not all(s.isdigit() or s in ' /.' for s in tokens):
        return False
    parts = ''.join(tokens).split(delimiter)
    # Both sides must be real numbers; an empty or non-numeric side
    # (e.g. "/ 5" or "1.5/2") cannot be converted later on.
    return (len(parts) == 2 and
            all(part.replace(' ', '').isdigit() for part in parts))


def trailing_zeros(number):
    """Count trailing zeros in number.

    Args:
        number (int or str): Number to inspect.

    Returns:
        int: Return number of trailing zeros.

    """
    digits = str(number)
    return len(digits) - len(digits.rstrip('0'))


def search_form(forms, tag):
    """Search for the correct form of word from all those returned by Morfeusz.

    Args:
        forms (list of tuples): Tags and variations of words returned
            by Morfeusz.
        tag (list of str): The tag of the word whose form is being searched
            for, already split on ':'.

    Returns:
        str: Word properly conjugated with the given tag or None if not found.

    """
    for form in forms:
        # Each position of a Morfeusz tag may list alternatives with '.'.
        form_categories = [x.split('.') for x in form[2].split(':')]
        if len(tag) > len(form_categories):
            # The form has fewer categories than requested: no match.
            continue
        if all(c in form_categories[i] for i, c in enumerate(tag)):
            return form[0]
    return None


def get_word_form(text, tag):
    """Change the word in the appropriate form with given morphological tag.

    Args:
        text (str): Word to be changed.
        tag (str): Morphological tag.

    Returns:
        str: Word changed with given morphological tag, or `text` itself
            when no tag is given or no matching form exists.

    """
    if not tag:
        return text

    # Imported lazily so the pure helpers of this module stay importable
    # without the native Morfeusz binding.
    import morfeusz2

    morf = morfeusz2.Morfeusz()
    all_forms = morf.generate(text)

    tag = tag.split(':')
    forms = [x for x in all_forms if x[2].split(':')[0] == tag[0]]
    form = search_form(forms, tag)
    if not form and len(tag) > 4:
        # Retry with the first four categories only.
        form = search_form(forms, tag[:4])
    return form if form else text


def subtract_from_first(list_of_tuples, offset):
    """Return a copy of one tuple with `offset` subtracted from its first item.

    Despite the parameter name, the argument is a single tuple.
    """
    return (list_of_tuples[0] - offset, *list_of_tuples[1:])


def check_and_replace(string_builder, find, replace, filtered_tokens):
    """Check for matches in list and replace them with given tokens.

    Remove replaced tokens from `filtered_tokens` to avoid double processing.
    The input lists are not modified.

    Args:
        string_builder (list of str): List of all words.
        find (list of str): Texts to be replaced, in order of appearance.
            A text may span several consecutive tokens.
        replace (list of list of str): Words that will replace `find` tokens
            in `string_builder`; expected to have as many items as the
            tokens they replace.
        filtered_tokens (list of tuples): List of tokens and their features.

    Returns:
        (list of str, list of tuples): Pair: list of words with replaced matched
            tokens and filtered list of tokens and their feature with deleted
            items that have been replaced.

    """
    if not find or not replace:
        return string_builder, filtered_tokens

    # Work on copies: entries are removed as soon as they are matched.
    find = list(find)
    replace = list(replace)
    new_builder = string_builder.copy()
    max_length = max(map(len, find))
    for i, token in enumerate(string_builder):
        if not find:
            break
        if token in find:
            # Single-token match; drop the entry that was actually matched.
            index = find.index(token)
            new_builder[i] = ''.join(replace[index])
            filtered_tokens = [x for x in filtered_tokens if x[0] != i]
            del find[index], replace[index]
            continue
        if token[:1] != find[0][:1]:
            # Cannot be the beginning of the next expected match.
            continue
        consumed = [i]
        check = token
        j = i + 1
        while len(check) < max_length and j < len(string_builder):
            check += string_builder[j]
            consumed.append(j)
            if check in find:
                index = find.index(check)
                new_builder = new_builder[:i] + replace[index]
                if j + 1 < len(string_builder):
                    new_builder += string_builder[j + 1:]
                filtered_tokens = [x for x in filtered_tokens
                                   if x[0] not in consumed]
                find.pop(index)
                replace.pop(index)
                if not find:
                    return new_builder, filtered_tokens
            j += 1
    return new_builder, filtered_tokens
"""Implementation of wordifier functionality."""
import re
import json
from itertools import islice

from src.utils import is_simple_number, subtract_from_first, trailing_zeros, \
    check_and_replace, TokenType, NumberPlural, to_number_plural, is_fraction
from src.num2words import num2words
from src.date2words import date2words


class Wordifier:
    """Class for generating words from special characters or numbers."""

    # Three alternatives: day/month/year, year/month/day and dates with
    # the month written as a word (optionally with a day and/or a year).
    date_regex = re.compile(
        r'\b(?P<day_or_month_year>'
        r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
        r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
        r'(?P<year1>\d{4}|\d{2}))\b|'

        r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
        r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
        r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'

        r'(?P<month_in_words>'
        r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
        r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|'
        r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)'
        r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)'
        # "topada" (was "stopada"): otherwise "listopada" never matches.
        r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)'
        r'|Gru(?:|dzie[nń]|dnia))\b'
        r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
        r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
        r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I
    )
    decimal_fraction_regex = re.compile(r'\d+[ ]?(\.)[ ]?\d+')

    # Characters allowed between two parts of one multi-token expression.
    number_punctuation = ' .,'
    # Token types that may continue an expression after a given type.
    following_type = {
        TokenType.NUMBER: [TokenType.NUMBER, TokenType.SPECIAL_CHARACTER,
                           TokenType.CURRENCY],
        TokenType.SPECIAL_CHARACTER: [TokenType.SPECIAL_CHARACTER,
                                      TokenType.NUMBER],
        TokenType.CURRENCY: []
    }

    # Index of the currency form used after a fraction below one
    # (fourth entry of every list in currencies.json, e.g. "dolara").
    _FRACTION_FORM = 3

    # plural of the numerator -> cases -> genders -> tag of denominator
    _denominator_tag = {
        NumberPlural.SINGULAR: {
            'default': 'adj:sg:nom:f',
            ('acc', 'dat', 'gen', 'loc'): {
                ('f',): 'adj:sg:acc:f'
            }
        },
        NumberPlural.SEVERAL: {
            'default': 'adj:pl:acc:f',
            ('dat',): {
                ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f'
            },
            ('gen', 'loc'): {
                ('m1', 'm2', 'm3', 'f', 'n'): 'adj:pl:acc:m1'
            },
            ('nom', 'voc'): {
                ('m1',): 'adj:pl:acc:m1'
            }
        },
        NumberPlural.MANY: {
            'default': 'adj:pl:acc:m1',
            ('acc', 'nom', 'voc'): {
                ('m1',): 'adj:sg:dat:f'
            },
            ('gen', 'dat', 'inst', 'loc'): {
                ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f'
            }
        }
    }

    # Readings used when the expression contains a number.
    special_character_numbers_map = {
        '+': 'plus',
        '-': 'minus',
        '/': 'przez',
        '*': 'razy',
        '%': 'procent',
        '&': 'ampersand',
        '=': 'równa się',
        '^': 'do potęgi',
        '#': 'numer'
    }
    # Readings used otherwise; its keys also define which tokens count
    # as special characters.
    special_character_map = {
        '+': 'plus',
        '-': '-',
        '/': 'ukośnik',
        '%': 'procent',
        '&': 'i',
        '=': 'równa się',
        '^': 'kareta',
        '#': 'kratka'
    }

    def __init__(self):
        """Class initialization."""
        self.unmarshallers = {
            'chunk': lambda *args: '\n',
            'sentence': lambda *args: self._process_sent_tree(*args),
        }
        # The file contains non-ASCII names, so the encoding is explicit.
        with open('data/currencies.json', 'r',
                  encoding='utf-8') as currency_file:
            self._currencies = json.load(currency_file)
        # (index, text, tag, TokenType) of the tokens to convert in the
        # sentence being processed.
        self._wordify_tokens = []

    def _process_sent_tree(self, sentence_subtree):
        """Convert a `sentence` element to text with special tokens as words.

        Every token is followed by a ' ' entry in the builder unless a
        `ns` (no space) element comes after it.
        """
        string_builder = []
        tags = []
        tok_id = 0
        for elem in sentence_subtree:
            if elem.tag == 'tok':
                token, tag = self._process_single_tok(tok_id, elem)
                string_builder.append(token)
                string_builder.append(' ')
                tags.append(tag)
                tok_id += 2
            elif elem.tag == 'ns':
                tok_id -= 1
                string_builder.pop()
            else:
                raise Exception('Unrecognized tag inside sentence: ' + elem.tag)
        return self._process_sentence(string_builder, tags)

    def _get_denominator_tag(self, nominator_plural, nom_case, nom_gender=None):
        """Return the tag of a denominator for the numerator's features."""
        by_case = self._denominator_tag[nominator_plural]
        if nom_case == 'default' or nom_gender is None:
            return by_case['default']

        for cases, by_gender in by_case.items():
            if cases == 'default' or nom_case not in cases:
                continue
            for genders, tag in by_gender.items():
                if nom_gender in genders:
                    return tag
        return by_case['default']

    def _special_type(self, text):
        """Return the TokenType of `text` or None for an ordinary token."""
        if text in self.special_character_map:
            return TokenType.SPECIAL_CHARACTER
        elif text in self._currencies:
            return TokenType.CURRENCY
        elif text.isdigit():
            return TokenType.NUMBER
        return None

    def _process_single_tok(self, tok_id, tok_subtree):
        """Read text and tag of a `tok` element and register special tokens."""
        text = ''
        tag = ''
        for elem in tok_subtree:
            if elem.tag == 'orth':
                text = elem.text
            elif elem.tag == 'lex':
                tag = self._process_lex(elem)
        word = self._process_word(tok_id, text, tag)
        return word, tag

    def _process_word(self, tok_id, text, tag):
        """Register the token if it is special and return its text."""
        self._add_special(tok_id, text, tag)
        return text

    def _add_special(self, tok_id, text, tag):
        """Remember the token for later conversion when it is special."""
        s_type = self._special_type(text)
        if s_type:
            self._wordify_tokens.append((tok_id, text, tag, s_type))
        return text

    def _process_lex(self, lex_subtree):
        """Return the `ctag` text of a `lex` element.

        Raises:
            Exception: On a child other than `ctag`/`base` or when no
                non-empty `ctag` is present.

        """
        tag = ''
        for elem in lex_subtree:
            if elem.tag == 'ctag':
                tag = elem.text
            elif elem.tag != 'base':
                raise Exception('Unrecognized tag inside lex: ' + elem.tag)
        if tag == '':
            raise Exception('Lex tag had no ctag inside!')
        return tag

    @staticmethod
    def _plural_of(number):
        """Return the NumberPlural of a whole number (Polish rules).

        The number is reduced to a one-digit representative first, so the
        result does not depend on how `to_number_plural` treats numbers
        above nine.
        """
        value = int(number)
        if value != 1:
            few = 2 <= value % 10 <= 4 and not 12 <= value % 100 <= 14
            value = value % 10 if few else 0
        return to_number_plural(value)

    def _handle_fraction(self, tokens, tags):
        """Generate words from fraction splitted by slash '/'.

        Args:
            tokens (list of str): List that contains numbers separated by
                slash '/'.
            tags (list of str): Tags of the tokens; the first one belongs
                to the numerator.

        Returns:
            str: Fraction as words.

        """
        text = ''.join(tokens).replace(' ', '')
        numerator, denominator = text.split('/')
        tag_num = tags[0]
        plural = self._plural_of(numerator)

        # Tags with fewer than four parts fall back to the default tag.
        tag_parts = tag_num.split(':')
        tag_case = tag_parts[2] if len(tag_parts) > 2 else 'default'
        tag_gender = tag_parts[3] if len(tag_parts) > 3 else None
        tag_den = self._get_denominator_tag(plural, tag_case, tag_gender)

        zeros = trailing_zeros(denominator)
        if len(denominator) < 4 or \
                (zeros > 2 and 0 < len(denominator) - zeros <= 3):
            # Short or round denominator: read it as an ordinal.
            return num2words(numerator, tag_num) + ' ' + \
                num2words(denominator, tag_den, True)
        else:
            return num2words(numerator) + ' przez ' + \
                num2words(denominator)

    def _handle_decimal_fraction(self, tokens):
        """Generate words from decimal fraction splitted by dot.

        Args:
            tokens (list of str or str): Numbers separated by dot.

        Returns:
            str: Decimal fraction as words.

        """
        text = ''.join(tokens).replace(' ', '')
        number, numerator = text.split('.')
        tag_num = 'adj:sg:nom:f' if int(numerator) == 1 else 'num:pl:nom:f'
        denominator = str(10 ** len(numerator))
        plural = self._plural_of(numerator)
        tag_den = self._get_denominator_tag(plural, 'default')
        fraction = num2words(numerator, tag_num) + ' ' + \
            num2words(denominator, tag_den, True)
        if int(number) == 0:
            return fraction
        return num2words(number) + ' i ' + fraction

    def _check_decimal_fraction(self, tokens):
        """Checks whether given list of tokens starts with decimal fraction.

        Args:
            tokens (list of str): List of tokens with number at the beginning.

        Returns:
            str: Text of the fraction, or the first token when the list
                does not start with a fraction.
            int: Number of extra tokens (after the first one) that make up
                the fraction; 0 when there is no fraction.

        """
        match = self.decimal_fraction_regex.search(''.join(tokens[:5]))
        if match and match.start() == 0:
            tokens_match = tokens[0]
            i = 1
            while tokens_match != match.group(0):
                tokens_match += tokens[i]
                i += 1
            return match.group(0), i - 1
        else:
            return tokens[0], 0

    def _handle_mixed_types(self, tokens, special_types, tags):
        """Convert an expression mixing numbers, characters and currencies.

        Args:
            tokens (list of str): Tokens of the expression.
            special_types (list of TokenType): Types of the tokens.
            tags (list of str): Tags of the tokens (unused).

        Returns:
            str: The expression as space separated words.

        """
        # Index of the currency form matching the last number read.
        currency_form = NumberPlural.SINGULAR.value
        if TokenType.NUMBER in special_types:
            special_character_map = self.special_character_numbers_map
        else:
            special_character_map = self.special_character_map
        i = 0
        iter_special_types = iter(special_types)
        for token_type in iter_special_types:
            if token_type == TokenType.SPECIAL_CHARACTER:
                tokens[i] = special_character_map.get(tokens[i], '')
            elif token_type == TokenType.PUNCTUATION:
                if tokens[i] == ' ':
                    tokens[i] = ''
            elif token_type == TokenType.NUMBER:
                number, skip = self._check_decimal_fraction(tokens[i:])
                if skip > 0:
                    words = self._handle_decimal_fraction(number)
                    if int(number.split('.')[0]) == 0:
                        currency_form = self._FRACTION_FORM
                    else:
                        currency_form = NumberPlural.MANY.value
                else:
                    words = num2words(number)
                    currency_form = self._plural_of(number).value
                # The fraction collapses into one entry; skip the types
                # of the tokens it consumed to stay aligned.
                tokens = tokens[:i] + [words] + tokens[i + skip + 1:]
                if skip != 0:
                    next(islice(iter_special_types, skip - 1, skip), '')
            elif token_type == TokenType.CURRENCY:
                tokens[i] = self._currencies[tokens[i]][currency_form]
            i += 1
        return ' '.join([w for w in tokens if w != ''])

    def _get_as_words(self, tokens, tags, special_types):
        """Convert special tokens and numbers to words.

        Args:
            tokens (list of str): List of tokens.
            tags (list of str): Tags of the tokens.
            special_types (list of TokenType): Types of tokens.

        Returns:
            str : Joined tokens converted to words.

        """
        if is_simple_number(tokens, special_types):
            digits = ''.join([n for i, n in enumerate(tokens)
                              if special_types[i] == TokenType.NUMBER])
            return num2words(digits, tags[-1])
        elif is_fraction(tokens):
            return self._handle_fraction(tokens, tags)
        elif is_fraction(tokens, decimal=True):
            return self._handle_decimal_fraction(tokens)
        else:
            return self._handle_mixed_types(tokens, special_types, tags)

    def _check_number_multipart(self, index, next_id, string_builder):
        """Check if the next token is continuation of number with actual token.

        Args:
            index (int): Actual token id.
            next_id (int): Next token id.
            string_builder (list of str): List of all words.

        Returns:
            bool: Is next token continuation of a number.

        """
        return next_id == index + 1 or \
            (index + 2 == next_id and
             string_builder[index + 1] in self.number_punctuation)

    def _join_tokens(self, token, string_builder):
        """Combine tokens that form multi-part formulas.

        Args:
            token (list of tuple): Unused; the tokens are always read from
                `self._wordify_tokens`. Every element contains index, word,
                morphological tag and token type.
            string_builder (list of str): List of all words.

        Returns:
            list of tuple: List of joined tokens and their features.

        """
        joined_tokens = []
        iter_wordify_tokens = enumerate(iter(self._wordify_tokens))
        for i, (index, text, tag, token_type) in iter_wordify_tokens:
            j = i + 1
            tokens = [text]
            tags = [tag]
            special_types = [token_type]
            start_id = index

            while j < len(self._wordify_tokens):
                next_id, next_token, next_tag, \
                    next_special_type = self._wordify_tokens[j]
                if not self._check_number_multipart(index, next_id,
                                                    string_builder):
                    break
                if next_special_type not in self.following_type[token_type]:
                    break
                if index + 2 == next_id:
                    # Keep the separator between the two parts.
                    tokens.append(string_builder[index + 1])
                    special_types.append(TokenType.PUNCTUATION)
                    tags.append('')
                tokens.append(next_token)
                tags.append(next_tag)
                special_types.append(next_special_type)

                # The joined token must not start a group of its own.
                next(iter_wordify_tokens)
                index = next_id
                token_type = next_special_type
                j += 1
            joined_tokens.append((start_id, tokens, tags, special_types))
        return joined_tokens

    def _handle_special_types(self, string_builder):
        """Convert special tokens to words and replace them in string builder.

        Args:
            string_builder (list of str): List of all words.

        Returns:
            list of str: Return updated string builder with special tokens
                replaced by words.

        """
        wordify_tokens = self._join_tokens(self._wordify_tokens, string_builder)
        for i, special_token in enumerate(wordify_tokens):
            index, tokens, tags, token_type = special_token
            words = self._get_as_words(tokens, tags, token_type)
            no_tokens = len(tokens)
            string_builder = string_builder[:index] + [words] + \
                string_builder[index + no_tokens:]
            # The builder got shorter: shift the indices of what follows.
            offset = no_tokens - 1
            wordify_tokens[i + 1:] = [subtract_from_first(x, offset)
                                      for x in wordify_tokens[i + 1:]]
        self._wordify_tokens.clear()
        return string_builder

    def _get_match_tag(self, match, string_builder, tags):
        """Return tags of the tokens covered by `match`, [] if not aligned."""
        match = match.group(0)
        j = 0  # index into `tags`; spaces in the builder have no tag
        for i, word in enumerate(string_builder):
            if match.startswith(word):
                acc = word
                match_tags = [tags[j]]
                tmp = j
                while i < len(string_builder) - 1 and len(acc) < len(match):
                    i += 1
                    acc += string_builder[i]
                    if acc != match[:len(acc)]:
                        break
                    if string_builder[i] != ' ':
                        j += 1
                        match_tags.append(tags[j])
                j = tmp
                if acc == match:
                    return match_tags
            if word != ' ':
                j += 1
        return []

    def _handle_regexes(self, string_builder, tags):
        """Check for regexes in the given builder and replace them with words.

        Args:
            string_builder (list of str): List of all words.
            tags (list of str): Tags of the tokens in the builder.

        Returns:
            list of str: Updated string builder with matches replaced by words.

        """
        sentence = ''.join(string_builder)
        matches = list(self.date_regex.finditer(sentence))
        if not matches:
            return string_builder
        replace = []
        for match in matches:
            date_tags = self._get_match_tag(match, string_builder, tags)
            replace.append(date2words(match, date_tags))
        matches = [m.group(0) for m in matches]
        builder, self._wordify_tokens = check_and_replace(string_builder,
                                                          matches, replace,
                                                          self._wordify_tokens)
        return builder

    def _process_sentence(self, string_builder, tags):
        """Process a sentence and replace special tokens (eg. numbers) words.

        Args:
            string_builder (list of str): List of all words.
            tags (list of str): Tags of the tokens in the builder.

        Returns:
            str: Sentence with replaced special tokens.

        """
        string_builder = self._handle_regexes(string_builder, tags)
        string_builder = self._handle_special_types(string_builder)
        if not string_builder:
            # Empty sentence: nothing to capitalize.
            return ''
        if string_builder[0] and not string_builder[0][0].isupper():
            string_builder[0] = string_builder[0].capitalize()
        return ''.join(string_builder)
+ + Returns: + str: Sentece with replaced special tokens. + + """ + string_builder = self._handle_regexes(string_builder, tags) + string_builder = self._handle_special_types(string_builder) + if string_builder[0] and not string_builder[0][0].isupper(): + string_builder[0] = string_builder[0].capitalize() + return ''.join(string_builder) diff --git a/src/worker.py b/src/worker.py new file mode 100755 index 0000000000000000000000000000000000000000..8dfe2f8c957b0a0072886f4e6138a944ce9fe9f6 --- /dev/null +++ b/src/worker.py @@ -0,0 +1,24 @@ +"""Implementation of nlp_worker.""" +import logging + +import nlp_ws + +from src.wordifier import Wordifier +from src.ccl_handler import CCLHandler + + +_log = logging.getLogger(__name__) + + +class Worker(nlp_ws.NLPWorker): + """Implements nlp_worker for tokenizer service.""" + + @classmethod + def static_init(cls, config): + """One time static initialisation.""" + + def process(self, input_file, task_options, output_file): + """Processing an input file and generating tokens converted to words.""" + wordifier = Wordifier() + ccl_handler = CCLHandler(input_file) + ccl_handler.process(output_file, wordifier.unmarshallers) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/test_num2words.py b/tests/test_num2words.py new file mode 100644 index 0000000000000000000000000000000000000000..23b06e6ff93dc325dbe96083d3f84ec20c845907 --- /dev/null +++ b/tests/test_num2words.py @@ -0,0 +1,136 @@ +import unittest +from parameterized import parameterized, param + +from src.num2words import num2words + + +class TestNum2Words(unittest.TestCase): + single_tag = 'adj:sg:nom:f' + several_tag = 'adj:pl:acc:f' + many_tag = 'adj:pl:acc:m1' + + @parameterized.expand([ + param('0', 'zero'), + param('08', 'osiem'), + param('12', 'dwanaście'), + param('23', 'dwadzieścia trzy'), + param('48', 'czterdzieści osiem'), + param('187', 
'sto osiemdziesiąt siedem'), + param('249', 'dwieście czterdzieści dziewięć'), + param('600', 'sześćset'), + param('720', 'siedemset dwadzieścia'), + param('304', 'trzysta cztery'), + + param('1000', 'tysiąc'), + param('425000', 'czterysta dwadzieścia pięć tysięcy'), + param('102000', 'sto dwa tysiące'), + param('390000', 'trzysta dziewięćdziesiąt tysięcy'), + param('701000', 'siedemset jeden tysięcy'), + param('993999', 'dziewięćset dziewięćdziesiąt trzy tysiące ' + 'dziewięćset dziewięćdziesiąt dziewięć'), + param('1000642', 'milion sześćset czterdzieści dwa'), + param('2001003', 'dwa miliony tysiąc trzy'), + param('18456000', 'osiemnaście milionów ' + 'czterysta pięćdziesiąt sześć tysięcy'), + param('1000000000', 'miliard') + ]) + def test_numbers(self, number, words): + self.assertEqual(num2words(number), words) + + @parameterized.expand([ + param('0', 'zerowy', ordinal=True), + param('1', 'pierwszy', ordinal=True), + param('10', 'dziesiąty', ordinal=True), + param('15', 'piętnasty', ordinal=True), + param('31', 'trzydziesty pierwszy', ordinal=True), + param('70', 'siedemdziesiąty', ordinal=True), + param('099', 'dziewięćdziesiąty dziewiąty', ordinal=True), + param('100', 'setny', ordinal=True), + param('102', 'sto drugi', ordinal=True), + param('183', 'sto osiemdziesiąty trzeci', ordinal=True), + param('201', 'dwieście pierwszy', ordinal=True), + + param('1000', 'tysięczny', ordinal=True), + param('1005', 'tysiąc piąty', ordinal=True), + param('2000', 'dwutysięczny', ordinal=True), + param('2020', 'dwa tysiące dwudziesty', ordinal=True), + param('10000', 'dziesięciotysięczny', ordinal=True), + param('100856', 'sto tysięcy osiemset pięćdziesiąty szósty', + ordinal=True), + param('1000000', 'milionowy', ordinal=True), + param('1002003', 'milion dwa tysiące trzeci', ordinal=True), + param('1948052296', 'miliard dziewięćset czterdzieści osiem milionów ' + 'pięćdziesiąt dwa tysiące ' + 'dwieście dziewięćdziesiąty szósty', ordinal=True), + ]) + def 
test_ordinal_numbers(self, number, words, ordinal): + self.assertEqual(num2words(number, ordinal=ordinal), words) + + @parameterized.expand([ + ('1', 'adj:sg:nom:f', 'jedna'), + ('2', 'num:pl:nom:f', 'dwie') + ]) + def test_numbers_numerator(self, number, tag, words): + self.assertEqual(num2words(number, tag), words) + + @parameterized.expand([ + param('1', 'pierwsza'), + param('2', 'druga'), + param('5', 'piąta'), + param('10', 'dziesiąta'), + param('31', 'trzydziesta pierwsza'), + param('100', 'setna'), + param('102', 'sto druga'), + param('512', 'pięćset dwunasta'), + param('600', 'sześćsetna'), + + param('1000', 'tysięczna'), + param('2002', 'dwa tysiące druga'), + param('3000', 'trzytysięczna'), + param('1000000000', 'miliardowa'), + param('1473022977', 'miliard czterysta siedemdziesiąt trzy miliony ' + 'dwadzieścia dwa tysiące dziewięćset siedemdziesiąta siódma'), + ]) + def test_single_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.single_tag, ordinal), words) + + @parameterized.expand([ + param('3', 'trzecie'), + param('6', 'szóste'), + param('10', 'dziesiąte'), + param('47', 'czterdzieste siódme'), + param('100', 'setne'), + param('101', 'sto pierwsze'), + param('300', 'trzechsetne'), + param('981', 'dziewięćset osiemdziesiąte pierwsze'), + + param('1000', 'tysięczne'), + param('8000', 'ośmiotysięczne'), + param('10000', 'dziesięciotysięczne'), + param('100000', 'stutysięczne'), + param('1000115376708', 'bilion sto piętnaście milionów ' + 'trzysta siedemdziesiąt sześć tysięcy siedemset ósme'), + ]) + def test_several_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.several_tag, ordinal), words) + + @parameterized.expand([ + param('4', 'czwartych'), + param('8', 'ósmych'), + param('10', 'dziesiątych'), + param('69', 'sześćdziesiątych dziewiątych'), + param('100', 'setnych'), + param('212', 'dwieście dwunastych'), + param('700', 'siedemsetnych'), + 
param('901', 'dziewięćset pierwszych'), + + param('1000', 'tysięcznych'), + param('6000', 'sześciotysięcznych'), + param('10000', 'dziesięciotysięcznych'), + param('1000000', 'milionowych'), + param('238055017238', 'dwieście trzydzieści osiem miliardów ' + 'pięćdziesiąt pięć milionów siedemnaście tysięcy ' + 'dwieście trzydziestych ósmych'), + ]) + def test_many_numbers_denominator(self, number, words, ordinal=True): + self.assertEqual(num2words(number, self.many_tag, ordinal), words) diff --git a/tox.ini b/tox.ini new file mode 100755 index 0000000000000000000000000000000000000000..67d5403ab50027aa81fac8d52de8b1d10379e086 --- /dev/null +++ b/tox.ini @@ -0,0 +1,44 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 +basepython = python3 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python3 +commands = + pydocstyle --verbose {posargs} + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 +match-dir = ^(?!\.tox|venv|tests).* +match = ^(?!setup).*\.py \ No newline at end of file