From df4a5f01986dbc25b2a455c2b5c5c77f9aad9737 Mon Sep 17 00:00:00 2001 From: NRopiak <norbert.ropiak@pwr.edu.pl> Date: Thu, 29 Oct 2020 18:58:20 +0100 Subject: [PATCH 1/5] code refactor, regex fix + date regex --- .gitignore | 138 ++++++++++++++++++++ requirements.txt | 4 +- src/anonymizer.py | 317 ++++++++++++++-------------------------------- src/generators.py | 188 +++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 220 deletions(-) create mode 100644 .gitignore create mode 100644 src/generators.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5391d87 --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index dae0fc5..9022646 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ -nlp-ws \ No newline at end of file +nlp-ws +regex==2020.10.28 +Babel==2.8.0 \ No newline at end of file diff --git a/src/anonymizer.py b/src/anonymizer.py index e412456..a00b578 100644 --- a/src/anonymizer.py +++ b/src/anonymizer.py @@ -1,12 +1,59 @@ """Implementation of anonymizer functionality.""" -import re -from string import punctuation, ascii_lowercase, ascii_uppercase, digits import random +from src.generators import generate_pseudo_email, generate_pseudo_phone_number, \ + generate_pseudo_user, generate_pseudo_website, generate_phone_number_tag, generate_pseudo_date + +import regex class Anonymizer: """Class used to edit sentences based on options.""" + email_regex = regex.compile( + r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*' + r'@)' + r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' + r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', regex.I + ) + user_regex = regex.compile(r'\B(?P<username>\@[\w\-]+)') + _website_exceptions = ['m.in'] + website_regex = regex.compile( + r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(_website_exceptions)) + + r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' + r'(?P<auth>\S+(?::\S*)?@)?' 
+ r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' + r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' + r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' + r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' + r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' + r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' + r'|' + r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' + r'[a-z0-9\u00a1-\uffff]\.)+)' + r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' + r'(?P<port>:\d{2,5})?' + r'(?P<path>[/?#]\S*)?)', + regex.UNICODE | regex.I + ) + phone_number_regex = regex.compile( + r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' + r'(?P<number>(\d[- ]??){9,10})' + ) + date_regex = regex.compile( + r'\b(?P<day_or_month_year>(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})(?P<punct3>[ \t\-\./,]{1,2})' + r'(?P<day_month3>[0-3]?\d)(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|Kwi(?:|ecie[nń]|etnia)' + r'|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)|Gru(?:|dzie[nń]|dnia))\b' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' 
+ r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2}))?)', regex.I + ) + _file_to_liner_dispatch = { 'nam_liv_person': 'person_first_nam', 'nam_liv_person_last': 'person_last_nam', @@ -33,9 +80,19 @@ class Anonymizer: self._mail_token = '[MAIL]' self._user_token = '@[USER]' self._website_token = '[WWW]' + self._digits_token = '[DIGITS]' + self._date_token = '[DATE]' self._default_token = '[INNE]' self._form_dict = dict() self._pseudo_ann_list = list() + + self._category_anonymisation = { # Order is important, first more specific + 'user': (self.user_regex, self._user_token, generate_pseudo_user), + 'email': (self.email_regex, self._mail_token, generate_pseudo_email), + 'website': (self.website_regex, self._website_token, generate_pseudo_website), + 'date': (self.date_regex, self._date_token, generate_pseudo_date), + 'phone_number': (self.phone_number_regex, self._digits_token, generate_pseudo_phone_number), + } self._load_file() def _load_file(self, file_name='wiki.txt'): @@ -80,15 +137,8 @@ class Anonymizer: def _process_sentence(self, string_builder): string_builder = self._handle_pseudo_ann(string_builder) - return self._anonoymize_phone_number( - self._anonoymize_website( - self._anonoymize_user( - self._anonoymize_email( - ''.join(string_builder) - ) - ) - ) - ) + sentence = ''.join(string_builder) + return self._anonymize(sentence) def _process_word(self, id, text, tag, ann): for annotation in ann: @@ -194,220 +244,49 @@ class Anonymizer: chan = ann_subtree.attrib["chan"] return chan, value - @staticmethod - def _get_random_character(digit=False, upper=False): - return random.choice(digits) if digit \ - else random.choice(ascii_uppercase) \ - if upper else random.choice(ascii_lowercase) - - @staticmethod - def _generate_pseudo_email(email): - new_mail = [] - it = iter(email) - top_domain_len = email.rfind('.') - i = 0 - for char in it: - if char == '@': - new_mail.append(char) - i += 1 - break - elif char in punctuation: - new_mail.append(char) - else: - 
new_mail.append( - Anonymizer._get_random_character( - char.isdigit(), - char.isupper() - ) - ) - i += 1 - for char in it: - if char == '.': - if i == top_domain_len: - new_mail.append(char) - break - new_mail.append(char) - elif char in punctuation: - new_mail.append(char) - else: - new_mail.append( - Anonymizer._get_random_character( - char.isdigit(), - char.isupper() - ) - ) - i += 1 - for char in it: - new_mail.append(char) - return r''.join(new_mail) - - @staticmethod - def _generate_pseudo_user(user): - it = iter(user) - new_user = [] - for char in it: - if char in punctuation: - new_user.append(char) - else: - new_user.append( - Anonymizer._get_random_character( - char.isdigit(), - char.isupper() - ) - ) - return r''.join(new_user) - - @staticmethod - def _generate_pseudo_website(link): - it = iter(link) - new_link = [] - if link[0:4].lower() == 'http': - slashes = 0 - for char in it: - if char == '/': - slashes += 1 - new_link.append(char) - if slashes == 2: - break - for char in it: - if char == '/': - new_link.append(char) - break - else: - new_link.append(char) - for char in it: - if char in punctuation: - new_link.append(char) - else: - new_link.append( - Anonymizer._get_random_character( - char.isdigit(), - char.isupper() - ) - ) - return r''.join(new_link) - - @staticmethod - def _generate_pseudo_phone_number(number): - new_number = [] - length = len(number) - it = iter(number) - if number[0] == '+': - how_many = length - 9 - for j in range(0, how_many): - new_number.append(next(it)) - elif number[0] == '0' and number[1] == '0' \ - and number[length - 10] == ' ': - for j in range(0, length - 10): - new_number.append(next(it)) - elif number[0] == '(' and number[1] == '0' and number[4] == ')': - for j in range(0, 2): - new_number.append(next(it)) - for char in it: - if char.isdigit(): - new_number.append(Anonymizer._get_random_character(digit=True)) - else: - new_number.append(char) - return r''.join(new_number) - - def _generate_phone_number_tag(self, 
number): - new_number = number.split(' ') - for i in range(len(new_number)): - new_number[i] = self._default_token - return r' '.join(new_number) - - def _anonoymize_email(self, sentence): - """Handles removal/changing of emails addresses.""" - email_regex = r'[\w\.-]+@[\w\.-]+\.\w{2,4}' + def _anonymize(self, sentence): if self._method == 'delete': - sentence = re.sub(email_regex, '', sentence) + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) elif self._method == 'tag': - sentence = re.sub(email_regex, self._mail_token, sentence) + sentence = self._tagging(sentence) elif self._method == 'pseudo': - matches = re.findall(email_regex, sentence) - for match in matches: - sentence = re.sub( - re.escape(match), - self._generate_pseudo_email(match), - sentence - ) + sentence = self._pseudonymization(sentence) return sentence - def _anonoymize_user(self, sentence): - """Handles removal/change of users.""" - user_regex = r'\B\@([\w\-]+)' - if self._method == 'delete': - sentence = re.sub(user_regex, '', sentence) - elif self._method == 'tag': - sentence = re.sub(user_regex, self._user_token, sentence) - elif self._method == 'pseudo': - matches = re.findall(user_regex, sentence) - for match in matches: - sentence = re.sub( - re.escape(match), - self._generate_pseudo_user(match), - sentence - ) - return sentence + def _tagging(self, sentence): + for category in self._category_anonymisation: + pattern, token, _ = self._category_anonymisation[category] - def _anonoymize_website(self, sentence): - """Handles removal/change of links.""" - link_regex = r'(((h|H)(t|T)(t|T)(p|P)(s|S)?:\/\/(?:www\.|(?!www)))?' 
\ - r'[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]+\.(?:(?!(h|H)' \ - r'(t|T)(t|T)(p|P)(s|S)?:\/\/))[^\s]{2,}|www\.[a-zA-Z0-9]' \ - r'[a-zA-Z0-9-]+[a-zA-Z0-9]\.(?:(?!(h|H)(t|T)(t|T)(p|P)' \ - r'(s|S)?:\/\/))[^\s]{2,}|(h|H)(t|T)(t|T)(p|P)(s|S)?:\/\/' \ - r'(?:www\.|(?!www))[a-zA-Z0-9]+\.(?:(?!(h|H)(t|T)(t|T)' \ - r'(p|P)(s|S)?:\/\/))[^\s]{2,}|www\.[a-zA-Z0-9]+\.' \ - r'(?:(?!(h|H)(t|T)(t|T)(p|P)(s|S)?:\/\/))[^\s]{2,})' - if self._method == 'delete': - sentence = re.sub(link_regex, '', sentence) - elif self._method == 'tag': - sentence = re.sub(link_regex, self._website_token, sentence) - elif self._method == 'pseudo': - matches = re.findall(link_regex, sentence) - for match in matches: - for val in match: - if val != '': - match = val - break - sentence = re.sub( - re.escape(match), - self._generate_pseudo_website(match), - sentence - ) + if category == 'phone_number': + matches = [m for m in pattern.finditer(sentence)] + for match in matches: + tag = generate_phone_number_tag(match.groupdict(''), token) + replace_match = match.group(0) + sentence = regex.sub(regex.escape( + replace_match), tag, sentence) + else: + sentence = regex.sub(pattern, token, sentence) return sentence - def _anonoymize_phone_number(self, sentence): - """Handles removal/change of links.""" - phone_number_regex = r'(((\+[1-9]\d{0,2}|00[1-9]\d{0,2}) ?)?(\d{9}))' \ - r'|((\+[1-9]\d{0,2} |00[1-9]\d{0,2} )?' 
\ - r'(\d{3} \d{3} \d{3}))|(\(0\d{2}\) \d{2} \d{2} ' \ - r'\d{3})|(\(\d{2}\) \d{2} \d{3} \d{2})' - if self._method == 'delete': - sentence = re.sub(phone_number_regex, '', sentence) - elif self._method == 'tag': - matches = re.findall(phone_number_regex, sentence) - for match in matches: - for val in match: - if val != '': - match = val - break - sentence = re.sub( - re.escape(match), - self._generate_phone_number_tag(match), - sentence - ) - elif self._method == 'pseudo': - matches = re.findall(phone_number_regex, sentence) - for match in matches: - for val in match: - if val != '': - match = val - break - sentence = re.sub( - re.escape(match), - self._generate_pseudo_phone_number(match), - sentence - ) + def _pseudonymization(self, sentence): + sentence_after_regex = sentence + to_replace = [] + for category in self._category_anonymisation: + pattern, _, generator = self._category_anonymisation[category] + for match in pattern.finditer(sentence_after_regex): + if not match: + continue + to_replace.append((match, generator)) + sentence_after_regex = regex.sub( + regex.escape(match.group(0)), '', sentence_after_regex) + + for match, generator in to_replace: + replace_match = match.group(0) + pseudo_string = generator(match.groupdict('')) + sentence = regex.sub( + regex.escape(replace_match), + pseudo_string, + sentence + ) return sentence diff --git a/src/generators.py b/src/generators.py new file mode 100644 index 0000000..401d450 --- /dev/null +++ b/src/generators.py @@ -0,0 +1,188 @@ +import re +import random +import calendar +from string import punctuation, ascii_lowercase, ascii_uppercase, digits +from datetime import datetime +from babel import Locale + + +def get_random_character(digit=False, upper=False): + return random.choice(digits) if digit \ + else random.choice(ascii_uppercase) \ + if upper else random.choice(ascii_lowercase) + + +def pseudonymize_string(sentence: str, leave_chars: str = ''): + if not sentence: + return '' + pseudonymized = '' + for 
char in sentence: + if char in leave_chars: + pseudonymized += char + else: + pseudonymized += get_random_character( + char.isdigit(), + char.isupper()) + return pseudonymized + + +def generate_pseudo_email(email_match): + local_part = email_match['local_part'] + domain = email_match['domain'] + top_level_domain = email_match['tld'] + new_email = pseudonymize_string(local_part, punctuation + '@') + new_email += pseudonymize_string(domain, punctuation) + return new_email + top_level_domain + + +def generate_pseudo_user(user_match): + username = user_match['username'][1:] + new_username = pseudonymize_string(username) + return '@' + new_username + + +def generate_pseudo_website(website_match): + protocol = website_match['protocol'] + auth = website_match['auth'] + host = website_match['host'] + top_level_domain = website_match['tld'] + port = website_match['port'] + path = website_match['path'] + new_website = protocol + new_website += pseudonymize_string(auth, punctuation) + new_website += host + new_website += top_level_domain + new_website += pseudonymize_string(port, punctuation) + new_website += pseudonymize_string(path, punctuation) + return new_website + + +def generate_pseudo_phone_number(number_match): + country_code = number_match['country_code'] + phone_number = number_match['number'] + new_phone_number = country_code + \ + pseudonymize_string(phone_number, [' ', '-']) + return new_phone_number + + +def generate_phone_number_tag(number_match, default_token): + splitted_number = re.split('([- ])', ''.join(number_match.values())) + new_number = '' + for part in splitted_number: + if part in [' ', '-']: + new_number += part + else: + new_number += default_token + return ''.join(new_number) + + +def random_year(year_match): + if not year_match: + return '' + popular_years_probability = 0.8 + actual_year = datetime.now().year + if len(year_match) == 2: + if random.random() < popular_years_probability: + year = "{:02d}".format(random.randint(actual_year - 40, + 
actual_year + 5) % 100) + else: + year = "{:02d}".format(random.randint(0, 99)) + else: + if random.random() < popular_years_probability: + year = random.randint(actual_year - 100, + actual_year + 10) + else: + year = random.randint(1400, datetime.now().year + 100) + return str(year) + + +def random_day(month, year): + if not year: + year = datetime.now().year + dates = calendar.Calendar().itermonthdates(int(year), int(month)) + return random.choice([date.day for date in dates if date.month == int(month)]) + + +def random_date(day_no_digits, month_no_digits, year_match): + year = random_year(year_match) + + month = random.randint(1, 12) if month_no_digits == 2 \ + else random.randint(1, 9) + month = f'{month:02}' if month_no_digits == 2 else str(month) + day = random_day(month, year) if day_no_digits == 2 \ + else random.randint(1, 9) + day = f'{day:02}' if day_no_digits == 2 else str(day) + + return day, month, year + + +def month_number2text(month_number: int, abbr: bool, case='genitive'): + locale = Locale('pl') + if case == 'genitive': + months = locale.months['format'] + elif case == 'nominative': + months = locale.months['stand-alone'] + else: + months = locale.months['format'] + + if abbr: + months = months['abbreviated'] + else: + months = months['wide'] + + return months[month_number] + + +def generate_pseudo_date(date_match): + date = '' + if date_match['day_or_month_year']: + no_digits = (len(date_match['day_month1']), + len(date_match['day_month2'])) + if int(date_match['day_month2']) > 12: + no_digits = (len(date_match['day_month2']), + len(date_match['day_month1'])) + day, month, year = random_date(no_digits[0], no_digits[1], + date_match['year1']) + + date_order = [day, date_match['punct1'], + month, date_match['punct2'], year] + if int(date_match['day_month2']) > 12: + date_order[0], date_order[2] = date_order[2], date_order[0] + date = ''.join(date_order) + elif date_match['year_month_or_day']: + no_digits = (len(date_match['day_month4']), + 
len(date_match['day_month3'])) + if int(date_match['day_month3']) > 12: + no_digits = (len(date_match['day_month3']), + len(date_match['day_month4'])) + day, month, year = random_date(no_digits[0], no_digits[1], + date_match['year2']) + + date_order = [year, date_match['punct3'], month, + date_match['punct4'], day] + if int(date_match['day_month3']) > 12: + date_order[2], date_order[4] = date_order[4], date_order[2] + date = ''.join(date_order) + elif date_match['month_in_words']: + day = len(date_match['day1']) + if date_match['day2']: + day = len(date_match['day2']) + day, month, year = random_date(day, 2, date_match['year3']) + + abbr = len(date_match['month']) == 3 + locale = Locale('pl') + if date_match['month'] in locale.months['format']['wide'].values(): + case = 'nominative' + else: + case = 'genitive' + month = month_number2text(int(month), abbr, case) + + date_order = [day, date_match['punct5'], + month, date_match['punct6'], year] + if date_match['day2']: + date_order = [month, date_match['punct6'], + day, date_match['punct7'], year] + date = ''.join(date_order) + else: + date = '' + return date -- GitLab From e0dea71eb9bcbb6fbd28d33071b1d22e0ac379d3 Mon Sep 17 00:00:00 2001 From: NRopiak <norbert.ropiak@pwr.edu.pl> Date: Fri, 30 Oct 2020 16:15:07 +0100 Subject: [PATCH 2/5] fix pylint add docstrings --- src/anonymizer.py | 50 +++++++++++-------- src/generators.py | 119 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 142 insertions(+), 27 deletions(-) diff --git a/src/anonymizer.py b/src/anonymizer.py index a00b578..311aae0 100644 --- a/src/anonymizer.py +++ b/src/anonymizer.py @@ -1,7 +1,8 @@ """Implementation of anonymizer functionality.""" import random -from src.generators import generate_pseudo_email, generate_pseudo_phone_number, \ - generate_pseudo_user, generate_pseudo_website, generate_phone_number_tag, generate_pseudo_date +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, 
generate_pseudo_website, + generate_phone_number_tag, generate_pseudo_date) import regex @@ -10,8 +11,8 @@ class Anonymizer: """Class used to edit sentences based on options.""" email_regex = regex.compile( - r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*' - r'@)' + r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' + r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)' r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', regex.I ) @@ -40,16 +41,22 @@ class Anonymizer: r'(?P<number>(\d[- ]??){9,10})' ) date_regex = regex.compile( - r'\b(?P<day_or_month_year>(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' - r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})(?P<year1>\d{4}|\d{2}))\b|' + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' - r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})(?P<punct3>[ \t\-\./,]{1,2})' - r'(?P<day_month3>[0-3]?\d)(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' - r'(?P<month_in_words>(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' - r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|Kwi(?:|ecie[nń]|etnia)' - r'|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' - r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)|Gru(?:|dzie[nń]|dnia))\b' + r'(?P<month_in_words>' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' 
+ r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' + r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' + r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)' + r'|Gru(?:|dzie[nń]|dnia))\b' r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2}))?)', regex.I ) @@ -85,13 +92,18 @@ class Anonymizer: self._default_token = '[INNE]' self._form_dict = dict() self._pseudo_ann_list = list() - - self._category_anonymisation = { # Order is important, first more specific - 'user': (self.user_regex, self._user_token, generate_pseudo_user), - 'email': (self.email_regex, self._mail_token, generate_pseudo_email), - 'website': (self.website_regex, self._website_token, generate_pseudo_website), - 'date': (self.date_regex, self._date_token, generate_pseudo_date), - 'phone_number': (self.phone_number_regex, self._digits_token, generate_pseudo_phone_number), + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number), } self._load_file() diff --git a/src/generators.py b/src/generators.py index 401d450..391801b 100644 --- a/src/generators.py +++ b/src/generators.py @@ -1,3 +1,4 @@ +"""Implementation of pseudonimization for different token categories.""" import re import random import calendar @@ -6,13 +7,29 @@ from datetime import datetime from babel import Locale -def get_random_character(digit=False, upper=False): +def get_random_character(digit: bool = False, upper: bool = False): + """Generate random character. 
+ + Args: + digit (bool): Return random single digit. + upper (bool): Return uppercase character. + + """ return random.choice(digits) if digit \ else random.choice(ascii_uppercase) \ if upper else random.choice(ascii_lowercase) def pseudonymize_string(sentence: str, leave_chars: str = ''): + """Change characters in string. + + Uppercase character for uppercase, lowercase for lowercase, digit for digit. + + Args: + sentence (str): Sentence to pseudonimize. + leave_chars (str): Characters that should remain unchanged e.g ' -()'. + + """ if not sentence: return '' pseudonymized = '' @@ -26,7 +43,13 @@ def pseudonymize_string(sentence: str, leave_chars: str = ''): return pseudonymized -def generate_pseudo_email(email_match): +def generate_pseudo_email(email_match: str): + """Generate pseudonimized email based on matched email in text. + + Args: + email_match: Matched email. + + """ local_part = email_match['local_part'] domain = email_match['domain'] top_level_domain = email_match['tld'] @@ -36,12 +59,24 @@ def generate_pseudo_email(email_match): def generate_pseudo_user(user_match): + """Generate pseudonimized user based on matched user in text. + + Args: + user_match: Matched user. + + """ username = user_match['username'][1:] new_username = pseudonymize_string(username) return '@' + new_username def generate_pseudo_website(website_match): + """Generate pseudonimized website based on matched website in text. + + Args: + website_match: Matched website. + + """ protocol = website_match['protocol'] auth = website_match['auth'] host = website_match['host'] @@ -58,6 +93,12 @@ def generate_pseudo_website(website_match): def generate_pseudo_phone_number(number_match): + """Generate pseudonimized phone number based on matched phone number in text. + + Args: + number_match: Matched phone number string. 
+ + """ country_code = number_match['country_code'] phone_number = number_match['number'] new_phone_number = country_code + \ @@ -66,6 +107,16 @@ def generate_pseudo_phone_number(number_match): def generate_phone_number_tag(number_match, default_token): + """Generate tag for every splitted set of digits. + + Delimiters in phone number: '-', ' ' + e.g 123 456-789 -> [TOKEN] [TOKEN]-[TOKEN] + + Args: + number_match: Matched phone number string. + default_token (str): Token that should replace digits. + + """ splitted_number = re.split('([- ])', ''.join(number_match.values())) new_number = '' for part in splitted_number: @@ -77,6 +128,15 @@ def generate_phone_number_tag(number_match, default_token): def random_year(year_match): + """Generate random year. + + Generate random year based on the number of digits in year match. + Prefer years close to an actual year with a fixed probability. + + Args: + year_match: Year in date match. + + """ if not year_match: return '' popular_years_probability = 0.8 @@ -92,18 +152,39 @@ def random_year(year_match): year = random.randint(actual_year - 100, actual_year + 10) else: - year = random.randint(1400, datetime.now().year + 100) + year = random.randint(1000, datetime.now().year + 100) return str(year) def random_day(month, year): + """Generate random day. + + Generate random day in the month and year previously drawn. + + Args: + month: The month in which the day will be drawn. + year: The year in which the day will be drawn. + + """ if not year: year = datetime.now().year - dates = calendar.Calendar().itermonthdates(int(year), int(month)) - return random.choice([date.day for date in dates if date.month == int(month)]) + month = int(month) + year = int(year) + dates = calendar.Calendar().itermonthdates(year, month) + return random.choice([date.day for date in dates if date.month == month]) + + +def random_date(day_no_digits: int, month_no_digits: int, year_match): + """Generate random date. 
+ Generate random day based on the number of digits in day and month and matched year. -def random_date(day_no_digits, month_no_digits, year_match): + Args: + day_no_digits (int): The number of digits in day match. + month_no_digits (int): The number of digits in month match. + year_match: Year in date match. + + """ year = random_year(year_match) month = random.randint(1, 12) if month_no_digits == 2 \ @@ -116,7 +197,18 @@ def random_date(day_no_digits, month_no_digits, year_match): return day, month, year -def month_number2text(month_number: int, abbr: bool, case='genitive'): +def month_number2text(month_number: int, abbr: bool, case: str = 'genitive'): + """Return the name of the month in words. + + Generate the month name from its number. + The method could return the abbreviation form and name in the nominative or genitive case. + + Args: + month_number (int): Number of the month. + abbr (bool): Return abbreviation form. + case (str): Return the name of the month in the given case. + + """ locale = Locale('pl') if case == 'genitive': months = locale.months['format'] @@ -134,6 +226,17 @@ def month_number2text(month_number: int, abbr: bool, case='genitive'): def generate_pseudo_date(date_match): + """Pseudonymize matched date. + + Generate the pseudonymized based on matched data in text. + This method will return the date in the format day-month-year + or year-month-day if the second number in date match is smaller than 13. + Otherwise, the position of the day and month will be swapped. + + Args: + date_match: Matched date. 
+ + """ date = '' if date_match['day_or_month_year']: no_digits = (len(date_match['day_month1']), @@ -172,7 +275,7 @@ def generate_pseudo_date(date_match): abbr = len(date_match['month']) == 3 locale = Locale('pl') if date_match['month'] in locale.months['format']['wide'].values(): - case = 'nominative' + case = 'nominative' else: case = 'genitive' month = month_number2text(int(month), abbr, case) -- GitLab From 280cf79ed724b4abff25cceee7f6551cbc192738 Mon Sep 17 00:00:00 2001 From: NRopiak <norbert.ropiak@pwr.edu.pl> Date: Fri, 30 Oct 2020 16:22:20 +0100 Subject: [PATCH 3/5] fix pylint --- src/generators.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/generators.py b/src/generators.py index 391801b..d4d6a8a 100644 --- a/src/generators.py +++ b/src/generators.py @@ -177,7 +177,8 @@ def random_day(month, year): def random_date(day_no_digits: int, month_no_digits: int, year_match): """Generate random date. - Generate random day based on the number of digits in day and month and matched year. + Generate random day based on the number of digits in day and month + and also matched year. Args: day_no_digits (int): The number of digits in day match. @@ -200,8 +201,9 @@ def random_date(day_no_digits: int, month_no_digits: int, year_match): def month_number2text(month_number: int, abbr: bool, case: str = 'genitive'): """Return the name of the month in words. - Generate the month name from its number. - The method could return the abbreviation form and name in the nominative or genitive case. + Generate the month name from its number. + The method could return the abbreviation form and name in the nominative + or genitive case. Args: month_number (int): Number of the month. @@ -229,7 +231,7 @@ def generate_pseudo_date(date_match): """Pseudonymize matched date. Generate the pseudonymized based on matched data in text. 
- This method will return the date in the format day-month-year + This method will return the date in the format day-month-year or year-month-day if the second number in date match is smaller than 13. Otherwise, the position of the day and month will be swapped. -- GitLab From 79025639fcf465f10e1c0b1d7d110a4c0d403030 Mon Sep 17 00:00:00 2001 From: NRopiak <norbert.ropiak@pwr.edu.pl> Date: Fri, 30 Oct 2020 16:33:21 +0100 Subject: [PATCH 4/5] update basepython to python3.6 in tox.ini --- tox.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index 1516042..8d8d0fb 100644 --- a/tox.ini +++ b/tox.ini @@ -5,14 +5,14 @@ skipsdist = True [testenv:pep8] deps = flake8 -basepython = python3 +basepython = python3.6 commands = flake8 {posargs} [testenv:docstyle] deps = pydocstyle -basepython = python3 +basepython = python3.6 commands = pydocstyle --verbose {posargs} -- GitLab From 2f865af2dd0ce14d97a34cc0432545fd3def71a1 Mon Sep 17 00:00:00 2001 From: NRopiak <norbert.ropiak@pwr.edu.pl> Date: Thu, 11 Mar 2021 13:34:04 +0100 Subject: [PATCH 5/5] update date regex and fix pep8 --- .gitignore | 4 +++- README.md | 18 ++++++++++++++++++ src/anonymizer.py | 11 ++++++----- src/ccl_handler.py | 20 ++++++++++---------- src/generators.py | 37 ++++++++++++++++++++++++++----------- src/worker.py | 4 ++-- 6 files changed, 65 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index 5391d87..baad420 100644 --- a/.gitignore +++ b/.gitignore @@ -135,4 +135,6 @@ dmypy.json .pytype/ # Cython debug symbols -cython_debug/ \ No newline at end of file +cython_debug/ + +.vscode \ No newline at end of file diff --git a/README.md b/README.md index c3784e7..919e632 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,22 @@ # Anonymizer +Service that automatically anonymizes text for polish language. 
+ +Anonymizer works in 3 modes. When sensitive data is detected, it performs one of the following operations: +- delete - sensitive data is deleted +- tag - sensitive data is replaced by the category tag it belongs to +- pseudo (pseudonymization) - sensitive data is replaced by another object in the same category + +### Examples: +- Delete + - Spotkałem się dzisiaj z Janem Kowalskim. + - Spotkałem się dzisiaj z . +- Tag + - Spotkałem się dzisiaj z Janem Kowalskim. + - Spotkałem się dzisiaj z [OSOBA] [OSOBA]. +- Pseudonymization + - Spotkałem się dzisiaj z Janem Kowalskim. + - Spotkałem się dzisiaj z Stefanem Michlem. + Liner2 should use model 5nam. tekst->any2txt->morphodita->liner2->anonymizer diff --git a/src/anonymizer.py b/src/anonymizer.py index 311aae0..38ecf34 100644 --- a/src/anonymizer.py +++ b/src/anonymizer.py @@ -57,8 +57,9 @@ class Anonymizer: r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)' r'|Gru(?:|dzie[nń]|dnia))\b' - r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' 
- r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2}))?)', regex.I + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', regex.I ) _file_to_liner_dispatch = { @@ -153,9 +154,9 @@ class Anonymizer: return self._anonymize(sentence) def _process_word(self, id, text, tag, ann): - for annotation in ann: - if annotation[1] != 0: - text = self._handle_annotated(id, text, tag, annotation[0]) + for chan, value in ann: + if value != 0: + text = self._handle_annotated(id, text, tag, chan) break return text diff --git a/src/ccl_handler.py b/src/ccl_handler.py index a61dd89..99664b1 100644 --- a/src/ccl_handler.py +++ b/src/ccl_handler.py @@ -2,19 +2,19 @@ from xml.etree.ElementTree import iterparse -class Ccl_handler: +class CCLHandler: """Implements reading ccl for anonymizer service.""" def __init__(self, ccl_file_name): - """Initialize ccl_handler with a filename.""" + """Initialize CCLHandler with a filename.""" self._file_name = ccl_file_name - def process(self, output_file, unmarshallers): + def process(self, output_filename, unmarshallers): """Process xml tags using unmarshallers and save in output_file.""" - with open(output_file, 'w', encoding='utf-8') as out: - with open(self._file_name, 'r', encoding='utf-8') as f: - for event, elem in iterparse(f): - unmarshal = unmarshallers.get(elem.tag, None) - if unmarshal: - out.write(unmarshal(elem)) - elem.clear() + with open(self._file_name, 'r', encoding='utf-8') as input_file, \ + open(output_filename, 'w', encoding='utf-8') as output_file: + for event, elem in iterparse(input_file): + unmarshal = unmarshallers.get(elem.tag, None) + if unmarshal: + output_file.write(unmarshal(elem)) + elem.clear() diff --git a/src/generators.py b/src/generators.py index d4d6a8a..2d4a147 100644 --- a/src/generators.py +++ b/src/generators.py @@ -269,24 +269,39 @@ def generate_pseudo_date(date_match): 
date_order[2], date_order[4] = date_order[4], date_order[2] date = ''.join(date_order) elif date_match['month_in_words']: - day = len(date_match['day1']) - if date_match['day2']: - day = len(date_match['day2']) - day, month, year = random_date(day, 2, date_match['year3']) + if date_match['day1']: + day_len = len(date_match['day1']) + elif date_match['day2']: + day_len = len(date_match['day2']) + else: + day_len = 0 + + if date_match['year3']: + year_match = date_match['year3'] + elif date_match['year4']: + year_match = date_match['year4'] + else: + year_match = '' + day, month, year = random_date(day_len, 2, year_match) abbr = len(date_match['month']) == 3 locale = Locale('pl') if date_match['month'] in locale.months['format']['wide'].values(): - case = 'nominative' - else: case = 'genitive' + else: + case = 'nominative' month = month_number2text(int(month), abbr, case) - date_order = [day, date_match['punct5'], - month, date_match['punct6'], year] - if date_match['day2']: - date_order = [month, date_match['punct6'], - day, date_match['punct7'], year] + if date_match['day1']: + date_order = [day, date_match['punct5'], + month, date_match['punct6']] + elif date_match['day2']: + date_order = [month, date_match['punct7'], + day, date_match['punct8']] + else: + date_order = [month] + if date_match['year3'] or date_match['year4']: + date_order += [year] date = ''.join(date_order) else: date = '' diff --git a/src/worker.py b/src/worker.py index 08903f8..de27b90 100644 --- a/src/worker.py +++ b/src/worker.py @@ -5,7 +5,7 @@ import nlp_ws from src.anonymizer import Anonymizer -from src.ccl_handler import Ccl_handler +from src.ccl_handler import CCLHandler _log = logging.getLogger(__name__) @@ -24,5 +24,5 @@ class Worker(nlp_ws.NLPWorker): replaces selected tokens with a random token that """ anon = Anonymizer(task_options) - ccl_handler = Ccl_handler(input_file) + ccl_handler = CCLHandler(input_file) ccl_handler.process(output_file, anon.unmarshallers) -- GitLab