Skip to content
Snippets Groups Projects
Commit 3d9dd875 authored by Mateusz Gniewkowski's avatar Mateusz Gniewkowski
Browse files

Merge branch 'develop' into 'master'

Develop

See merge request !2
parents 4da073b8 83bc71de
Branches
No related tags found
1 merge request!2Develop
Pipeline #2546 passed
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.vscode
\ No newline at end of file
image: "clarinpl/python:3.6"
cache:
paths:
- .tox
stages:
- check_style
- test
- build
pep8:
stage: check_style
before_script:
- pip install tox==2.9.1
script:
- tox -v -e pep8
docstyle:
stage: check_style
before_script:
- pip install tox==2.9.1
script:
- tox -v -e docstyle
test:
stage: test
image: "docker:18.09.7"
services:
- "docker:18.09.7-dind"
script:
- docker build -t clarinpl/wordifier .
- docker run --rm
-v "$(pwd)/requirements-dev.txt:/home/worker/requirements-dev.txt"
-v "$(pwd)/tests:/home/worker/tests"
clarinpl/wordifier
sh -c 'pip3 install -r requirements-dev.txt ; nose2 -v tests'
build:
stage: build
image: "docker:18.09.7"
only:
- master
services:
- "docker:18.09.7-dind"
script:
- docker build -t clarinpl/wordifier .
- echo $DOCKER_PASSWORD > pass.txt
- cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
- rm pass.txt
- docker push clarinpl/wordifier
FROM clarinpl/python:3.6
WORKDIR /home/worker

# Make 'python3' point explicitly at Python 3.6.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1
# NOTE(review): this sets the 'python' alternative although only 'python3'
# was registered above -- confirm the base image registers 'python', or the
# command may fail.
RUN update-alternatives --set python /usr/bin/python3.6

# Morfeusz 2 morphological analyser: system package plus the prebuilt
# CPython 3.6 wheel (not available on PyPI).
RUN apt-get update && apt-get install -y morfeusz2
RUN wget -O morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl http://download.sgjp.pl/morfeusz/20200913/Linux/18.04/64/morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl
RUN python3.6 -m pip install morfeusz2-1.9.16-cp36-cp36m-linux_x86_64.whl

COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
COPY ./data ./data
RUN python3.6 -m pip install -r requirements.txt

# Run the worker as a service by default.
CMD ["python3.6", "main.py", "service"]
\ No newline at end of file
# Wordifier
A service that expands abbreviations into full text. The following modules are implemented at this time:
- verbal notation of digits, numbers, decimal and ordinary fractions (with separators '.' and '/')
- verbal notation of simple equations with addition, subtraction, multiplication and division
- verbal notation of dates
- recognizing different ways to write dates.
- 25.12.2010 or 25,12,12 (day/month, day/month, year)
- 2009-08-30 or 20 08 30 (year, day/month, day/month)
- 12 Jan 2010 or 31 Jan 1998 (day, month, year)
- Mar 12 (month, year)
- Dec 15 (day, month)
- April 30 2000 (month, day, year)
- replace currency symbols with words
- write special characters (%, &, #, ^, =, +, -, /) in words
\ No newline at end of file
[service]
tool = wordifier
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix = nlp_
[tool]
workers_number = 5
processed_lines = 1000
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
This diff is collapsed.
{
"number_words": {
"0": "zero",
"1": "jeden",
"2": "dwa",
"3": "trzy",
"4": "cztery",
"5": "pięć",
"6": "sześć",
"7": "siedem",
"8": "osiem",
"9": "dziewięć",
"10": "dziesięć",
"11": "jedenaście",
"12": "dwanaście",
"13": "trzynaście",
"14": "czternaście",
"15": "piętnaście",
"16": "szesnaście",
"17": "siedemnaście",
"18": "osiemnaście",
"19": "dziewiętnaście",
"20": "dwadzieścia",
"30": "trzydzieści",
"40": "czterdzieści",
"50": "pięćdziesiąt",
"60": "sześćdziesiąt",
"70": "siedemdziesiąt",
"80": "osiemdziesiąt",
"90": "dziewięćdziesiąt",
"100": "sto",
"200": "dwieście",
"300": "trzysta",
"400": "czterysta",
"500": "pięćset",
"600": "sześćset",
"700": "siedemset",
"800": "osiemset",
"900": "dziewięćset"
},
"ordinal_number_words": {
"0": "zerowy",
"1": "pierwszy",
"2": "drugi",
"3": "trzeci",
"4": "czwarty",
"5": "piąty",
"6": "szósty",
"7": "siódmy",
"8": "ósmy",
"9": "dziewiąty",
"10": "dziesiąty",
"11": "jedenasty",
"12": "dwunasty",
"13": "trzynasty",
"14": "czternasty",
"15": "piętnasty",
"16": "szesnasty",
"17": "siedemnasty",
"18": "osiemnasty",
"19": "dziewiętnasty",
"20": "dwudziesty",
"30": "trzydziesty",
"40": "czterdziesty",
"50": "pięćdziesiąty",
"60": "sześćdziesiąty",
"70": "siedemdziesiąty",
"80": "osiemdziesiąty",
"90": "dziewięćdziesiąty",
"100": "setny",
"200": "dwusetny",
"300": "trzechsetny",
"400": "czterechsetny",
"500": "pięćsetny",
"600": "sześćsetny",
"700": "siedemsetny",
"800": "osiemsetny",
"900": "dziewięćsetny"
},
"large_numbers": {
"3": "tysiąc",
"6": "milion",
"9": "miliard",
"12": "bilion",
"15": "biliard",
"18": "trylion",
"21": "tryliard",
"24": "kwadrylion",
"27": "kwadryliard",
"30": "kwintylion",
"33": "kwintyliard",
"36": "sekstylion",
"39": "sekstyliard",
"42": "septylion",
"45": "septyliard",
"48": "oktylion",
"51": "oktyliard",
"54": "nonilion",
"57": "noniliard",
"60": "decylion",
"63": "decyliard",
"66": "undecylion",
"69": "undecyliard",
"72": "duodecylion",
"75": "duodecyliard",
"100": "googol",
"600": "centylion",
"603": "centyliard"
},
"ordinal_large_numbers": {
"3": "tysięczny",
"6": "milionowy",
"9": "miliardowy",
"12": "bilionowy"
}
}
\ No newline at end of file
version: '3'
services:
wordifier:
container_name: clarin_wordifier
build: ./
working_dir: /home/worker
command:
- python3.6 main.py service
environment:
- PYTHONUNBUFFERED=0
volumes:
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './tests:/home/worker/tests'
- './main.py:/home/worker/main.py'
main.py 0 → 100755
"""Implementation of wordifier service."""
import argparse
import nlp_ws
from src.worker import Worker
def get_args():
    """Parse and return the command line arguments."""
    arg_parser = argparse.ArgumentParser(description="wordifier")
    mode_parsers = arg_parser.add_subparsers(dest="mode")
    # Force the user to pick a mode explicitly.
    mode_parsers.required = True
    mode_parsers.add_parser("service", help="Run as a service")
    return arg_parser.parse_args()
def main():
    """Run the program in the mode selected on the command line."""
    args = get_args()
    runners = {
        "service": lambda: nlp_ws.NLPService.main(Worker),
    }
    # Unknown modes are a no-op (argparse already rejects them anyway).
    runners.get(args.mode, lambda: None)()


if __name__ == "__main__":
    main()
parameterized==0.8.1
nose2==0.10.0
\ No newline at end of file
nlp-ws
Babel==2.8.0
\ No newline at end of file
"""Implementation of ccl reading functionality."""
from xml.etree.ElementTree import iterparse
class CCLHandler:
    """Streams a CCL (XML) document through tag-specific unmarshallers."""

    def __init__(self, ccl_file_name):
        """Remember the path of the CCL file to be processed."""
        self._file_name = ccl_file_name

    def process(self, output_file, unmarshallers):
        """Parse the input XML and write unmarshalled elements to a file.

        Args:
            output_file (str): Path the converted text is written to.
            unmarshallers (dict): Maps an XML tag name to a callable that
                turns the finished element into a string.
        """
        with open(self._file_name, 'r', encoding='utf-8') as source, \
                open(output_file, 'w', encoding='utf-8') as sink:
            for _, element in iterparse(source):
                handler = unmarshallers.get(element.tag)
                if handler:
                    sink.write(handler(element))
                # Free the element tree as we go to keep memory flat.
                element.clear()
"""Module for converting dates to words."""
from babel import Locale
from src.num2words import num2words
# Tag suffixes (number:case:gender) for which the matched word's tag is
# applied when inflecting date words; see date2words().
date_tags = ['sg:gen:m3']
def check_none(token):
    """Return ``token`` unchanged when truthy, otherwise an empty list."""
    return token if token else []
def month_name_expansion(month):
    """Expand a month abbreviation or normalize a full month name.

    Args:
        month (str): Month abbreviation or full name.

    Returns:
        str: Full month name in genitive case.
    """
    # Polish month abbreviations are exactly three characters long.
    is_abbreviation = len(month) == 3
    locale = Locale('pl')
    month = month.lower()
    if is_abbreviation:
        abbreviated = locale.months['format']['abbreviated']
        # Babel keys months 1-12, so the list position + 1 is the key.
        position = list(abbreviated.values()).index(month) + 1
        return locale.months['format']['wide'][position]
    for month_format in ['format', 'stand-alone']:
        wide = locale.months[month_format]['wide']
        if month in list(wide.values()):
            position = list(wide.values()).index(month) + 1
            # Always map to the 'format' (genitive) wide form.
            month = locale.months['format']['wide'][position]
    return month
def date2words(date_match, tags=None):
    """Convert a matched date to a list of words.

    Args:
        date_match (re.Match): Match of the date regex with named groups
            for day/month/year parts and their separators.
        tags (list of str, optional): Morphological tags of the matched
            words; only the first one is considered. Defaults to None.

    Returns:
        list of str: List of words representing date.
    """
    # Apply the caller's tag only when its number:case:gender part is one
    # of the supported date tags (singular genitive masculine).
    if tags and ":".join(tags[0].split(":")[1:4]) in date_tags:
        corrected_tag = tags[0]
    else:
        corrected_tag = None
    if date_match['day_or_month_year']:
        # Digits only: day/month, day/month, year.
        day_month1 = num2words(date_match['day_month1'], corrected_tag,
                               ordinal=True)
        day_month2 = num2words(date_match['day_month2'], corrected_tag,
                               ordinal=True)
        year = num2words(date_match['year1'], corrected_tag, ordinal=True)
        # split punctuation into single characters and remove if None
        date_order = [day_month1, *check_none(date_match['punct1']),
                      day_month2, *check_none(date_match['punct2']), year]
    elif date_match['year_month_or_day']:
        # Digits only: year first, then day/month, day/month.
        day_month3 = num2words(date_match['day_month3'], ordinal=True)
        day_month4 = num2words(date_match['day_month4'], ordinal=True)
        year = num2words(date_match['year2'], ordinal=True)
        # split punctuation into single characters and remove if None
        date_order = [year, *check_none(date_match['punct3']), day_month3,
                      *check_none(date_match['punct4']), day_month4]
    elif date_match['month_in_words']:
        # Month written as a (possibly abbreviated) word; day and year
        # may appear on either side of it.
        day = date_match['day1']
        if date_match['day2']:
            day = date_match['day2']
        if day:
            day = num2words(day, corrected_tag, ordinal=True)
        year = ''
        if date_match['year3']:
            year = num2words(date_match['year3'], corrected_tag, ordinal=True)
        if date_match['year4']:
            year = num2words(date_match['year4'], corrected_tag, ordinal=True)
        if not day and not year:
            # A bare month name is left untouched.
            return [date_match['month']]
        else:
            month = month_name_expansion(date_match['month'])
            # split punctuation into single characters and remove if None
            if date_match['day2']:
                date_order = [month, *check_none(date_match['punct7']),
                              day, *check_none(date_match['punct8'])]
            elif date_match['day1']:
                date_order = [day, *check_none(date_match['punct5']),
                              month, *check_none(date_match['punct6'])]
            else:
                date_order = [month]
            if year:
                date_order = date_order + [year]
            # Normalize any falsy entries to empty strings.
            date_order = list(map(lambda x: x if x else '', date_order))
    else:
        date_order = ['']
    return date_order
"""Module for converting numbers to words."""
import math
import json
from src.utils import get_word_form, trailing_zeros
# The number-word tables are loaded once at import time from the bundled
# JSON file; its keys are strings, so they are converted to int for lookups.
with open('data/numbers.json', 'r') as numbers_file:
    numbers_dict = json.load(numbers_file)
# Cardinal words: 0-20, round tens and round hundreds.
number_words = {int(k): v for k, v in numbers_dict['number_words'].items()}
# Ordinal counterparts of ``number_words``.
ordinal_number_words = {int(k): v for k, v
                        in numbers_dict['ordinal_number_words'].items()}
# Scale words keyed by power of ten (3 -> 'tysiąc', 6 -> 'milion', ...).
large_numbers = {int(k): v for k, v
                 in numbers_dict['large_numbers'].items()}
# Ordinal scale words (3 -> 'tysięczny', ...).
ordinal_large_numbers = {int(k): v for k, v
                         in numbers_dict['ordinal_large_numbers'].items()}
def three_digit_to_words(text, tag='', ordinal=False):
    """Convert a number of up to three digits to words with a given tag."""
    lookup = ordinal_number_words if ordinal else number_words
    value = int(text)
    if value == 0:
        return get_word_form(lookup[value], tag)
    ones = value % 10
    tens = value % 100 - ones
    hundreds_digit = value // 100
    parts = []
    if hundreds_digit != 0:
        if tens == 0 and ones == 0:
            # An exact hundred carries the requested (possibly ordinal) form.
            parts.append(get_word_form(lookup[hundreds_digit * 100], tag))
        else:
            # Otherwise the hundreds word stays in its base cardinal form.
            parts.append(get_word_form(number_words[hundreds_digit * 100], ''))
    if 0 < tens + ones <= 20:
        # 1-20 are single dictionary words (including the teens).
        parts.append(get_word_form(lookup[tens + ones], tag))
    else:
        if tens != 0:
            parts.append(get_word_form(lookup[tens], tag))
        if ones != 0:
            parts.append(get_word_form(lookup[ones], tag))
    return ' '.join(parts)
def num2words(text, tag='', ordinal=False):
    """Convert a number written as digits to Polish words.

    Args:
        text (str): Number as a string of digits (any length).
        tag (str, optional): Morphological tag for the resulting words.
            Defaults to ''.
        ordinal (bool, optional): If word should be derived from ordinal
            number. Defaults to False.

    Returns:
        str: Returns number as words with given tag.
    """
    i = 0
    words = []
    number = int(text)
    if ordinal:
        # Round values such as 2000 are read as one compound ordinal
        # ('dwutysięczny'): a prefix glued to an ordinal scale word.
        zeros = trailing_zeros(number)
        zeros = 3 * math.floor(zeros / 3)
        if zeros > 2 and 0 < len(text) - zeros <= 3:
            number = number // 10 ** zeros
            if number == 1:
                words = ''
            else:
                # 'numcomp' yields the compound prefix form (e.g. 'dwu').
                words = three_digit_to_words(str(number), 'numcomp')
            words += get_word_form(ordinal_large_numbers[zeros], tag)
            return words
    if len(text) <= 3 or number == 0:
        return three_digit_to_words(text, tag, ordinal)
    while number > 0:
        remainder = number % 1000
        if i == 0:
            # Only the lowest group carries the caller's tag/ordinal form.
            triple = three_digit_to_words(remainder, tag, ordinal)
        else:
            triple = three_digit_to_words(remainder)
        number = number // 1000
        if remainder == 0 and number != 0:
            # Skip empty groups, e.g. the thousands group of 1000642.
            i += 3
            continue
        if i == 0:
            words.append(triple)
        else:
            # Grammatical form of the scale word depends on the group value:
            # 1 -> singular ('tysiąc'), 2-4 -> nominative plural
            # ('tysiące'), everything else -> genitive plural ('tysięcy').
            # BUGFIX: the teens 12-14 take the genitive plural as well
            # ('dwanaście tysięcy', not 'dwanaście tysiące'), so they must
            # be excluded from the 2-4 branch.
            if remainder == 1:
                tag = 'subst:sg:nom:m3'
            elif (remainder % 10 in [2, 3, 4]
                    and remainder % 100 not in [12, 13, 14]):
                tag = 'subst:pl:nom:m3'
            else:
                tag = 'subst:pl:gen:m3'
            form = get_word_form(large_numbers[i], tag)
            if remainder == 1:
                # A bare scale word: 'tysiąc', not 'jeden tysiąc'.
                words.append(form)
            else:
                words.append(triple + ' ' + form)
        i += 3
    return ' '.join(list(reversed(words)))
"""Module for useful functions."""
from enum import Enum
import morfeusz2
class TokenType(Enum):
    """Type of token recognized for wordification."""

    NUMBER = 1             # token made of digits only
    SPECIAL_CHARACTER = 2  # symbol such as %, &, #, ^, =, +, -, /
    PUNCTUATION = 3        # separator (space/dot/comma) between number parts
    CURRENCY = 4           # currency symbol found in the currencies data file
class NumberPlural(Enum):
    """Plural category of a number, selecting the word suffix.

    E.g:
    SINGULAR 1$ - jeden dolar
    SEVERAL (2-4) 2$ - dwa dolary
    MANY (5+) 7$ - siedem dolarów
    """

    SINGULAR = 0
    SEVERAL = 1
    MANY = 2


def to_number_plural(number):
    """Map a number onto its :class:`NumberPlural` category.

    Args:
        number (int or string): Number to be classified.

    Returns:
        NumberPlural: Category deciding what the end of the word will be.
    """
    value = int(number)
    if value == 1:
        return NumberPlural.SINGULAR
    if 2 <= value <= 4:
        return NumberPlural.SEVERAL
    return NumberPlural.MANY
def is_simple_number(tokens, special_types):
    """Check if a list of tokens forms a simple number.

    A simple number contains only digits, optionally with spaces between
    groups of three.

    Args:
        tokens (list): List of tokens.
        special_types (list): Types of tokens.

    Returns:
        bool: True if the joined tokens are a simple number, else False.
    """
    number_tokens = [tok for tok, tok_type in zip(tokens, special_types)
                     if tok_type == TokenType.NUMBER]
    # Every group after the first must have exactly three digits.
    grouped_by_three = all(len(tok) == 3 for tok in number_tokens[1:])
    digits_or_spaces = all(tok.isdigit() or tok == ' ' for tok in tokens)
    return grouped_by_three and digits_or_spaces
def is_fraction(tokens, decimal=False):
    """Check whether the tokens are two numbers split by a slash or a dot.

    Args:
        tokens (list): List of tokens.
        decimal (bool, optional): If True the delimiter is a dot, otherwise
            a slash '/'. Defaults to False.

    Returns:
        bool: True if the tokens form a fraction, otherwise False.
    """
    if len(tokens) < 3:
        return False
    separator = '.' if decimal else '/'
    halves = ''.join(tokens).split(separator)
    if len(halves) != 2:
        return False
    if tokens.count(separator) != 1:
        return False
    return all(tok.isdigit() or tok in ' /.' for tok in tokens)
def trailing_zeros(number):
    """Count the trailing zeros of a number.

    Returns:
        int: Number of trailing zeros in the decimal representation.
    """
    digits = str(number)
    return len(digits) - len(digits.rstrip('0'))
def search_form(forms, tag):
    """Search for the correct word form among those returned by Morfeusz.

    Args:
        forms (list of tuples): Tags and variations of words returned
            by Morfeusz; the third element is a colon-separated tag whose
            parts may hold dot-separated alternatives.
        tag (list of str): Grammatical categories of the wanted form.

    Returns:
        str: Word properly conjugated with the given tag or None if not
        found.
    """
    for candidate in forms:
        categories = [part.split('.') for part in candidate[2].split(':')]
        if all(wanted in categories[position]
               for position, wanted in enumerate(tag)):
            return candidate[0]
    return None
def get_word_form(text, tag):
    """Change the word into the form matching a morphological tag.

    Args:
        text (str): Word to be changed.
        tag (str): Morphological tag (colon-separated categories).

    Returns:
        str: Word inflected according to the tag, or the unchanged ``text``
        when the tag is empty or no matching form is found.
    """
    if not tag:
        return text
    morf = morfeusz2.Morfeusz()
    all_forms = morf.generate(text)
    tag = tag.split(':')
    # Keep only the forms whose part of speech matches the requested one.
    forms = [x for x in all_forms if x[2].split(':')[0] == tag[0]]
    form = search_form(forms, tag)
    if form is None and len(tag) > 4:
        # Retry with the first four categories only; trailing categories
        # often prevent an exact match.
        form = search_form(forms, tag[:4])
    # BUGFIX: previously the function fell off the end and returned None
    # when a tag longer than four categories had no match; always fall
    # back to the unchanged text instead.
    return form if form is not None else text
def subtract_from_first(list_of_tuples, offset):
    """Return the tuple with ``offset`` subtracted from its first element."""
    first, *rest = list_of_tuples
    return (first - offset, *rest)
def check_and_replace(string_builder, find, replace, filtered_tokens):
    """Check for matches in list and replace them with given tokens.

    Remove replaced tokens from `filtered_tokens` to avoid double
    processing.

    Note: ``find`` and ``replace`` are mutated in place (matched entries
    are removed as they are consumed).

    Args:
        string_builder (list of str): List of all words.
        find (list of str): Tokens to be replaced.
        replace (list of str): Words that will replace `find` tokens in
            `string_builder`.
        filtered_tokens (list of tuples): List of tokens and their features.

    Returns:
        (list of str, list of tuples): Pair: list of words with replaced
            matched tokens and filtered list of tokens and their features
            with deleted items that have been replaced.
    """
    if not find or not replace:
        return string_builder, filtered_tokens
    new_builder = string_builder.copy()
    # Longest match to look for; bounds the multi-token accumulation below.
    max_lenght = max(map(len, find))
    for i, token in enumerate(string_builder):
        if not find:
            break
        to_remove = [i]
        check = token
        j = i + 1
        if check in find:
            # Single-token match: replace in place.
            new_builder[i] = ''.join(replace[find.index(check)])
            filtered_tokens = list(filter(lambda x: x[0] != i, filtered_tokens))
            del find[0], replace[0]
            continue
        # NOTE(review): this only compares the FIRST character of the token
        # against the first pattern -- possibly intended to be a prefix test
        # of the whole token against every pattern; confirm.
        if check[0] != find[0][:len(check[0])]:
            continue
        # Accumulate following tokens until a multi-token match is found.
        while len(check) < max_lenght and j < len(string_builder):
            check += string_builder[j]
            to_remove.append(j)
            if check in find:
                index = find.index(check)
                # Splice the replacement words over the matched span.
                new_builder = new_builder[:i] + replace[index]
                if j + 1 < len(string_builder):
                    new_builder += string_builder[j + 1:]
                filtered_tokens = list(filter(lambda x: x[0] not in to_remove,
                                              filtered_tokens))
                find.pop(index)
                replace.pop(index)
                if not find:
                    return new_builder, filtered_tokens
            j += 1
    return new_builder, filtered_tokens
"""Implementation of wordifier functionality."""
import re
import json
from itertools import islice
from src.utils import is_simple_number, subtract_from_first, trailing_zeros, \
check_and_replace, TokenType, NumberPlural, to_number_plural, is_fraction
from src.num2words import num2words
from src.date2words import date2words
class Wordifier:
    """Class for generating words from special characters or numbers."""

    # Matches dates written as digits (day-month-year or year-month-day)
    # or with a Polish month name/abbreviation; the named groups are
    # consumed by src.date2words.date2words().
    date_regex = re.compile(
        r'\b(?P<day_or_month_year>'
        r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
        r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
        r'(?P<year1>\d{4}|\d{2}))\b|'
        r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
        r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
        r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
        r'(?P<month_in_words>'
        r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
        r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|'
        r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)'
        r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)'
        # NOTE(review): 'Lis(?:|topad|stopada)' matches 'lisstopada' but
        # never the genitive 'listopada' -- the second alternative should
        # likely be 'topada'; confirm and fix.
        r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)'
        r'|Gru(?:|dzie[nń]|dnia))\b'
        r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
        r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
        r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I
    )
    # Two digit runs separated by an (optionally spaced) dot.
    decimal_fraction_regex = re.compile(r'\d+[ ]?(\.)[ ]?\d+')
    # Characters that may appear between parts of a multi-token number.
    number_punctuation = ' .,'
    # Which token types may directly follow a given type when joining a
    # multi-part expression (e.g. a number may be followed by a currency).
    following_type = {
        TokenType.NUMBER: [TokenType.NUMBER, TokenType.SPECIAL_CHARACTER,
                           TokenType.CURRENCY],
        TokenType.SPECIAL_CHARACTER: [TokenType.SPECIAL_CHARACTER,
                                      TokenType.NUMBER],
        TokenType.CURRENCY: []
    }
    # Tag of the denominator word of a fraction, chosen by the numerator's
    # plural class and optionally its case/gender.
    _denominator_tag = {
        NumberPlural.SINGULAR: {
            'default': 'adj:sg:nom:f',
            ('acc', 'dat', 'gen', 'loc'): {
                ('f'): 'adj:sg:acc:f'
            }
        },
        NumberPlural.SEVERAL: {
            'default': 'adj:pl:acc:f',
            ('dat'): {
                ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f'
            },
            ('gen', 'loc'): {
                ('m1', 'm2', 'm3', 'f', 'n'): 'adj:pl:acc:m1'
            },
            ('nom', 'voc'): {
                ('m1'): 'adj:pl:acc:m1'
            }
        },
        NumberPlural.MANY: {
            'default': 'adj:pl:acc:m1',
            ('acc', 'nom', 'voc'): {
                ('m1'): 'adj:sg:dat:f'
            },
            ('gen', 'dat', 'inst', 'loc'): {
                ('m1', 'm2', 'm3', 'f', 'n'): 'adj:sg:dat:f'
            }
        }
    }
    # Words used for symbols that appear next to numbers (arithmetic
    # reading, e.g. '/' -> 'przez').
    special_character_numbers_map = {
        '+': 'plus',
        '-': 'minus',
        '/': 'przez',
        '*': 'razy',
        '%': 'procent',
        '&': 'ampersand',
        '=': 'równa się',
        '^': 'do potęgi',
        '#': 'numer'
    }
    # Words used for symbols that appear without numbers.
    special_character_map = {
        '+': 'plus',
        '-': '-',
        '/': 'ukośnik',
        '%': 'procent',
        '&': 'i',
        '=': 'równa się',
        '^': 'kareta',
        '#': 'kratka'
    }

    def __init__(self):
        """Class initialization."""
        # Callbacks used by CCLHandler: sentences are rewritten, chunk
        # boundaries become newlines.
        self.unmarshallers = {
            'chunk': lambda *args: '\n',
            'sentence': lambda *args: self._process_sent_tree(*args),
        }
        with open('data/currencies.json', 'r') as currency_file:
            self._currencies = json.load(currency_file)
        # Tokens of the current sentence to be wordified:
        # (token_id, text, tag, TokenType) tuples.
        self._wordify_tokens = []

    def _process_sent_tree(self, sentence_subtree):
        """Collect words and tags of one sentence and wordify it."""
        string_builder = []
        tags = []
        tok_id = 0
        for elem in sentence_subtree:
            if elem.tag == 'tok':
                token, tag = self._process_single_tok(tok_id, elem)
                string_builder.append(token)
                string_builder.append(' ')
                tags.append(tag)
                # Each token occupies two builder slots (word + space).
                tok_id += 2
            elif elem.tag == 'ns':
                # 'ns' means "no space" before the next token: drop it.
                tok_id -= 1
                string_builder.pop()
            else:
                raise Exception('Unrecognized tag inside sentence: ' + elem.tag)
        return self._process_sentence(string_builder, tags)

    def _get_denominator_tag(self, nominator_plural, nom_case, nom_gender=None):
        """Pick the denominator tag of a fraction.

        Falls back to the plural class' default when the numerator's case
        or gender is not listed in ``_denominator_tag``.
        """
        if nom_case == 'default' or nom_gender is None:
            return self._denominator_tag[nominator_plural]['default']
        for cases, value in self._denominator_tag[nominator_plural].items():
            if cases == 'default':
                continue
            if nom_case in cases:
                for genders, tag in value.items():
                    if nom_gender in genders:
                        return tag
        return self._denominator_tag[nominator_plural]['default']

    def _special_type(self, text):
        """Classify a token, or return None for ordinary words."""
        if text in self.special_character_map:
            return TokenType.SPECIAL_CHARACTER
        elif text in self._currencies:
            return TokenType.CURRENCY
        elif text.isdigit():
            return TokenType.NUMBER
        return None

    def _process_single_tok(self, tok_id, tok_subtree):
        """Extract the orth text and ctag of a single <tok> element."""
        text = ''
        tag = ''
        for elem in tok_subtree:
            if elem.tag == 'orth':
                text = elem.text
            elif elem.tag == 'lex':
                tag = self._process_lex(elem)
        word = self._process_word(tok_id, text, tag)
        return word, tag

    def _process_word(self, tok_id, text, tag):
        """Register the token for wordification; return it unchanged."""
        self._add_special(tok_id, text, tag)
        return text

    def _add_special(self, tok_id, text, tag):
        """Remember the token if it is a number, symbol or currency."""
        s_type = self._special_type(text)
        if s_type:
            self._wordify_tokens.append((tok_id, text, tag, s_type))
        return text

    def _process_lex(self, lex_subtree):
        """Read the morphological ctag out of a <lex> element."""
        tag = ''
        for elem in lex_subtree:
            if elem.tag == 'ctag':
                tag = elem.text
            elif elem.tag != 'base':
                raise Exception('Unrecognized tag inside lex: ' + elem.tag)
        if tag == '':
            raise Exception('Lex tag had no ctag inside!')
        return tag

    def _handle_fraction(self, tokens, tags):
        """Generate words from fraction splitted by slash '/'.

        Args:
            tokens (list of str): List that contains numbers separated by
                slash '/'.
            tags (list of str): Morphological tags of the tokens; the
                first one describes the numerator.

        Returns:
            str: Fraction as words.
        """
        text = ''.join(tokens)
        numerator, denominator = text.split('/')
        tag_num = tags[0]
        remainder = to_number_plural(int(numerator) % 10)
        tag_case, tag_gender = tag_num.split(':')[2:4]
        tag_den = self._get_denominator_tag(remainder, tag_case, tag_gender)
        zeros = trailing_zeros(denominator)
        # Ordinal denominators are only readable for short or round values
        # (e.g. 3/1000); otherwise read the fraction as 'X przez Y'.
        if len(denominator) < 4 or \
                (zeros > 2 and 0 < len(denominator) - zeros <= 3):
            return num2words(numerator, tag_num) + ' ' + \
                num2words(denominator, tag_den, True)
        else:
            return num2words(numerator) + ' przez ' + \
                num2words(denominator)

    def _handle_decimal_fraction(self, tokens):
        """Generate words from decimal fraction splitted by dot.

        Args:
            tokens (list of str): List that contains numbers separated by
                dot.

        Returns:
            str: Decimal fraction as words.
        """
        text = ''.join(tokens)
        number, numerator = text.split('.')
        number = number.replace(' ', '')
        tag_num = 'adj:sg:nom:f' if int(numerator) == 1 else 'num:pl:nom:f'
        # Denominator is the power of ten matching the fraction's length.
        denominator = str(10 ** len(numerator))
        remainder = to_number_plural(int(numerator) % 10)
        tag_den = self._get_denominator_tag(remainder, 'default')
        if int(number) == 0:
            return num2words(numerator, tag_num) + ' ' + \
                num2words(denominator, tag_den, True)
        else:
            return num2words(number) + ' i ' + \
                num2words(numerator, tag_num) + ' ' + \
                num2words(denominator, tag_den, True)

    def _check_decimal_fraction(self, tokens):
        """Checks whether given list of tokens starts with decimal fraction.

        If it contains a fraction, return the whole fraction text,
        otherwise return the first number.

        Args:
            tokens (list of str): List of tokens with number at the
                beginning.

        Returns:
            str: Tokens that form a fraction or number.
            int: The number of extra tokens that make up the fraction.
        """
        match = self.decimal_fraction_regex.search(''.join(tokens[:5]))
        if match and match.start() == 0:
            tokens_match = tokens[0]
            i = 1
            # Count how many tokens the matched fraction consumes.
            while tokens_match != match.group(0):
                tokens_match += tokens[i]
                i += 1
            return match.group(0), i - 1
        else:
            return tokens[0], 0

    def _handle_mixed_types(self, tokens, special_types, tags):
        """Convert a mixed run of numbers/symbols/currencies to words."""
        last_number_plural = NumberPlural.SINGULAR
        # Symbols are read arithmetically when numbers are present.
        if TokenType.NUMBER in special_types:
            special_character_map = self.special_character_numbers_map
        else:
            special_character_map = self.special_character_map
        i = 0
        iter_special_types = iter(special_types)
        for token_type in iter_special_types:
            if token_type == TokenType.SPECIAL_CHARACTER:
                if tokens[i] in special_character_map:
                    tokens[i] = special_character_map[tokens[i]]
                else:
                    tokens[i] = ''
            elif token_type == TokenType.PUNCTUATION:
                if tokens[i] == ' ':
                    tokens[i] = ''
            elif token_type == TokenType.NUMBER:
                number, skip = self._check_decimal_fraction(tokens[i:])
                if skip > 0:
                    words = self._handle_decimal_fraction(number)
                    if int(''.join(number).split('.')[0]) == 0:
                        # NOTE(review): NumberPlural has no FRACTION member
                        # in src.utils -- this line raises AttributeError
                        # when reached; confirm the intended plural class
                        # (and the matching currencies.json entry).
                        last_number_plural = NumberPlural.FRACTION
                    else:
                        last_number_plural = NumberPlural.MANY
                else:
                    words = num2words(number)
                    last_number_plural = to_number_plural(number)
                tokens = tokens[:i] + [words] + tokens[i + skip + 1:]
                if skip != 0:
                    # Advance the iterator past the tokens merged above.
                    next(islice(iter_special_types, skip - 1, skip), '')
            elif token_type == TokenType.CURRENCY:
                # Currency entries hold one word per NumberPlural value.
                suffix = last_number_plural.value
                tokens[i] = self._currencies[tokens[i]][suffix]
            i += 1
        text = ' '.join([w for w in tokens if w != ''])
        return text

    def _get_as_words(self, tokens, tags, special_types):
        """Convert special tokens and numbers to words.

        Args:
            tokens (list of str): List of tokens.
            tags (list of str): Morphological tags of the tokens.
            special_types (list of TokenType): Types of tokens.

        Returns:
            str : Joined tokens converted to words.
        """
        if is_simple_number(tokens, special_types):
            numbers = ''.join([n for i, n in enumerate(tokens)
                               if special_types[i] == TokenType.NUMBER])
            return num2words(''.join(numbers), tags[-1])
        elif is_fraction(tokens):
            return self._handle_fraction(tokens, tags)
        elif is_fraction(tokens, decimal=True):
            return self._handle_decimal_fraction(tokens)
        else:
            return self._handle_mixed_types(tokens, special_types, tags)

    def _check_number_multipart(self, index, next_id, string_builder):
        """Check if the next token is continuation of number with actual token.

        Args:
            index (int): Actual token id.
            next_id (int): Next token id.
            string_builder (list of str): List of all words.

        Returns:
            bool: Is next token continuation of a number.
        """
        return next_id == index + 1 or \
            (index + 2 == next_id and
             string_builder[index + 1] in self.number_punctuation)

    def _join_tokens(self, token, string_builder):
        """Combine tokens that form multi-part formulas.

        Args:
            token: Unused; the method iterates ``self._wordify_tokens``
                directly (the caller passes that same list in).
            string_builder (list of str): List of all words.

        Returns:
            list of tuple: List of joined tokens and their features.
                Every element contains start index, words, morphological
                tags and token types.
        """
        joined_tokens = []
        iter_wordify_tokens = enumerate(iter(self._wordify_tokens))
        for i, (index, token, tag, token_type) in iter_wordify_tokens:
            j = i + 1
            tokens = [token]
            tags = [tag]
            special_types = [token_type]
            start_id = index
            while j < len(self._wordify_tokens):
                next_id, next_token, next_tag, \
                    next_special_type = self._wordify_tokens[j]
                if not self._check_number_multipart(index, next_id,
                                                    string_builder):
                    break
                if next_special_type in self.following_type[token_type]:
                    if index + 2 == next_id:
                        # Keep the separator (space/dot/comma) in between.
                        tokens.append(string_builder[index + 1])
                        special_types.append(TokenType.PUNCTUATION)
                        tags.append('')
                    tokens.append(next_token)
                    tags.append(next_tag)
                    special_types.append(next_special_type)
                else:
                    break
                # Consume the joined token from the outer iteration too.
                next(iter_wordify_tokens)
                index = next_id
                token_type = next_special_type
                j += 1
            joined_tokens.append((start_id, tokens, tags, special_types))
        return joined_tokens

    def _handle_special_types(self, string_builder):
        """Convert special tokens to words and replace them in string builder.

        Args:
            string_builder (list of str): List of all words.

        Returns:
            list of str: Return updated string builder with special tokens
                replaced by words.
        """
        wordify_tokens = self._join_tokens(self._wordify_tokens, string_builder)
        enum_special = enumerate(wordify_tokens)
        for i, special_token in enum_special:
            index, tokens, tags, token_type = special_token
            words = self._get_as_words(tokens, tags, token_type)
            no_tokens = len(tokens)
            string_builder = string_builder[:index] + [words] + \
                string_builder[index + no_tokens:]
            # Replacing several tokens with one word shifts the start
            # indices of all the remaining entries.
            offset = no_tokens - 1
            wordify_tokens[i + 1:] = [subtract_from_first(x, offset)
                                      for x in wordify_tokens[i + 1:]]
        self._wordify_tokens.clear()
        return string_builder

    def _get_match_tag(self, match, string_builder, tags):
        """Collect the tags of the builder words covered by a regex match."""
        match = match.group(0)
        j = 0  # index into ``tags``; spaces in the builder carry no tag
        for i, word in enumerate(string_builder):
            if match.startswith(word):
                acc = word
                match_tags = [tags[j]]
                tmp = j
                # Extend the candidate span until it covers the whole match.
                while i < len(string_builder) - 1 and len(acc) < len(match):
                    i += 1
                    acc += string_builder[i]
                    if acc != match[:len(acc)]:
                        break
                    if string_builder[i] != ' ':
                        j += 1
                        match_tags.append(tags[j])
                j = tmp
                if acc == match:
                    return match_tags
            if word != ' ':
                j += 1
        return []

    def _handle_regexes(self, string_builder, tags):
        """Check for regexes in the given builder and replace them with words.

        Args:
            string_builder (list of str): List of all words.
            tags (list of str): Morphological tags of the words.

        Returns:
            list of str: Updated string builder with matches replaced by
                words.
        """
        sentence = ''.join(string_builder)
        matches = list(self.date_regex.finditer(sentence))
        if not matches:
            return string_builder
        replace = []
        for match in matches:
            date_tags = self._get_match_tag(match, string_builder, tags)
            replace.append(date2words(match, date_tags))
        matches = list(map(lambda m: m.group(0), matches))
        # Tokens consumed by a date must not be wordified again later.
        builder, self._wordify_tokens = check_and_replace(string_builder,
                                                          matches, replace,
                                                          self._wordify_tokens)
        return builder

    def _process_sentence(self, string_builder, tags):
        """Process a sentence and replace special tokens (eg. numbers).

        Args:
            string_builder (list of str): List of all words.
            tags (list of str): Morphological tags of the words.

        Returns:
            str: Sentence with replaced special tokens.
        """
        string_builder = self._handle_regexes(string_builder, tags)
        string_builder = self._handle_special_types(string_builder)
        # Keep the sentence capitalized after replacements at its start.
        if string_builder[0] and not string_builder[0][0].isupper():
            string_builder[0] = string_builder[0].capitalize()
        return ''.join(string_builder)
"""Implementation of nlp_worker."""
import logging
import nlp_ws
from src.wordifier import Wordifier
from src.ccl_handler import CCLHandler
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
    """NLP worker that converts special tokens in a CCL file into words."""

    @classmethod
    def static_init(cls, config):
        """Perform one-time static initialisation (nothing to set up)."""

    def process(self, input_file, task_options, output_file):
        """Read ``input_file``, wordify its tokens and write ``output_file``."""
        handler = CCLHandler(input_file)
        handler.process(output_file, Wordifier().unmarshallers)
import unittest
from parameterized import parameterized, param
from src.num2words import num2words
class TestNum2Words(unittest.TestCase):
    """Unit tests for the Polish ``num2words`` conversion."""

    # Morphological tags used to request agreement of denominators.
    single_tag = 'adj:sg:nom:f'
    several_tag = 'adj:pl:acc:f'
    many_tag = 'adj:pl:acc:m1'

    @parameterized.expand([
        ('0', 'zero'),
        ('08', 'osiem'),
        ('12', 'dwanaście'),
        ('23', 'dwadzieścia trzy'),
        ('48', 'czterdzieści osiem'),
        ('187', 'sto osiemdziesiąt siedem'),
        ('249', 'dwieście czterdzieści dziewięć'),
        ('600', 'sześćset'),
        ('720', 'siedemset dwadzieścia'),
        ('304', 'trzysta cztery'),
        ('1000', 'tysiąc'),
        ('425000', 'czterysta dwadzieścia pięć tysięcy'),
        ('102000', 'sto dwa tysiące'),
        ('390000', 'trzysta dziewięćdziesiąt tysięcy'),
        ('701000', 'siedemset jeden tysięcy'),
        ('993999', 'dziewięćset dziewięćdziesiąt trzy tysiące '
                   'dziewięćset dziewięćdziesiąt dziewięć'),
        ('1000642', 'milion sześćset czterdzieści dwa'),
        ('2001003', 'dwa miliony tysiąc trzy'),
        ('18456000', 'osiemnaście milionów '
                     'czterysta pięćdziesiąt sześć tysięcy'),
        ('1000000000', 'miliard'),
    ])
    def test_numbers(self, number, words):
        """Cardinal numbers are spelled out correctly."""
        self.assertEqual(num2words(number), words)

    @parameterized.expand([
        param('0', 'zerowy', ordinal=True),
        param('1', 'pierwszy', ordinal=True),
        param('10', 'dziesiąty', ordinal=True),
        param('15', 'piętnasty', ordinal=True),
        param('31', 'trzydziesty pierwszy', ordinal=True),
        param('70', 'siedemdziesiąty', ordinal=True),
        param('099', 'dziewięćdziesiąty dziewiąty', ordinal=True),
        param('100', 'setny', ordinal=True),
        param('102', 'sto drugi', ordinal=True),
        param('183', 'sto osiemdziesiąty trzeci', ordinal=True),
        param('201', 'dwieście pierwszy', ordinal=True),
        param('1000', 'tysięczny', ordinal=True),
        param('1005', 'tysiąc piąty', ordinal=True),
        param('2000', 'dwutysięczny', ordinal=True),
        param('2020', 'dwa tysiące dwudziesty', ordinal=True),
        param('10000', 'dziesięciotysięczny', ordinal=True),
        param('100856', 'sto tysięcy osiemset pięćdziesiąty szósty',
              ordinal=True),
        param('1000000', 'milionowy', ordinal=True),
        param('1002003', 'milion dwa tysiące trzeci', ordinal=True),
        param('1948052296', 'miliard dziewięćset czterdzieści osiem milionów '
              'pięćdziesiąt dwa tysiące '
              'dwieście dziewięćdziesiąty szósty', ordinal=True),
    ])
    def test_ordinal_numbers(self, number, words, ordinal):
        """Ordinal numbers are spelled out correctly."""
        self.assertEqual(num2words(number, ordinal=ordinal), words)

    @parameterized.expand([
        ('1', 'adj:sg:nom:f', 'jedna'),
        ('2', 'num:pl:nom:f', 'dwie'),
    ])
    def test_numbers_numerator(self, number, tag, words):
        """Numerators agree with the supplied morphological tag."""
        self.assertEqual(num2words(number, tag), words)

    @parameterized.expand([
        ('1', 'pierwsza'),
        ('2', 'druga'),
        ('5', 'piąta'),
        ('10', 'dziesiąta'),
        ('31', 'trzydziesta pierwsza'),
        ('100', 'setna'),
        ('102', 'sto druga'),
        ('512', 'pięćset dwunasta'),
        ('600', 'sześćsetna'),
        ('1000', 'tysięczna'),
        ('2002', 'dwa tysiące druga'),
        ('3000', 'trzytysięczna'),
        ('1000000000', 'miliardowa'),
        ('1473022977', 'miliard czterysta siedemdziesiąt trzy miliony '
         'dwadzieścia dwa tysiące dziewięćset siedemdziesiąta siódma'),
    ])
    def test_single_numbers_denominator(self, number, words, ordinal=True):
        """Denominators in singular feminine form."""
        self.assertEqual(num2words(number, self.single_tag, ordinal), words)

    @parameterized.expand([
        ('3', 'trzecie'),
        ('6', 'szóste'),
        ('10', 'dziesiąte'),
        ('47', 'czterdzieste siódme'),
        ('100', 'setne'),
        ('101', 'sto pierwsze'),
        ('300', 'trzechsetne'),
        ('981', 'dziewięćset osiemdziesiąte pierwsze'),
        ('1000', 'tysięczne'),
        ('8000', 'ośmiotysięczne'),
        ('10000', 'dziesięciotysięczne'),
        ('100000', 'stutysięczne'),
        ('1000115376708', 'bilion sto piętnaście milionów '
         'trzysta siedemdziesiąt sześć tysięcy siedemset ósme'),
    ])
    def test_several_numbers_denominator(self, number, words, ordinal=True):
        """Denominators in plural (paucal) form."""
        self.assertEqual(num2words(number, self.several_tag, ordinal), words)

    @parameterized.expand([
        ('4', 'czwartych'),
        ('8', 'ósmych'),
        ('10', 'dziesiątych'),
        ('69', 'sześćdziesiątych dziewiątych'),
        ('100', 'setnych'),
        ('212', 'dwieście dwunastych'),
        ('700', 'siedemsetnych'),
        ('901', 'dziewięćset pierwszych'),
        ('1000', 'tysięcznych'),
        ('6000', 'sześciotysięcznych'),
        ('10000', 'dziesięciotysięcznych'),
        ('1000000', 'milionowych'),
        ('238055017238', 'dwieście trzydzieści osiem miliardów '
         'pięćdziesiąt pięć milionów siedemnaście tysięcy '
         'dwieście trzydziestych ósmych'),
    ])
    def test_many_numbers_denominator(self, number, words, ordinal=True):
        """Denominators in plural genitive form."""
        self.assertEqual(num2words(number, self.many_tag, ordinal), words)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment