"""Implementation of anonymizer functionality."""
# Reconstructed from the post-image of patch 773f801141c8 against
# src/anonymizer.py ("Added some emails and users handling.",
# Bartłomiej Koptyra, 2020-08-07).
import random
import re
from string import ascii_lowercase, ascii_uppercase, punctuation


class Anonymizer:
    """Class used to edit sentences based on options.

    The ``method`` entry of ``task_options`` selects how e-mail addresses
    and ``@user`` mentions found in a token are handled:

    * ``'delete'`` (default) - the match is removed,
    * ``'tag'`` - the match is replaced by a fixed placeholder,
    * ``'pseudo'`` - the match is replaced by random letters that keep the
      original length, letter case and punctuation.

    Any other value leaves tokens untouched.
    """

    # The TLD part is open-ended (``{2,}``): capping it at four characters
    # made the pattern match only a prefix of addresses with longer TLDs and
    # left the remainder of the address in the text.
    _EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+\.\w{2,}')
    _MENTION_PATTERN = re.compile(r'\B\@([\w\-]+)')

    def __init__(self, task_options):
        """Read the anonymization method and set the placeholder tokens.

        Args:
            task_options: Mapping with an optional ``'method'`` key.
        """
        self._method = task_options.get('method', 'delete')
        self._mail_token = '[MAIL]'
        self._user_token = '@[USER]'

    def process(self):
        """Unfinished entry point (work in progress)."""
        # NOTE(review): `ctag` is not defined anywhere in this module, so
        # calling this method raises NameError. The intended input is not
        # visible here, so the body is left as found - TODO implement.
        if ctag == 'ign':
            # check whether it is a nick and then an e-mail
            # check whether it is a proper name like mBank? not sure
            print()

    @staticmethod
    def _get_random_chatacter(upper=False):
        """Return one random ASCII letter, upper-case when *upper* is true."""
        return random.choice(ascii_uppercase if upper else ascii_lowercase)

    def _scramble(self, text):
        """Replace every non-punctuation character of *text* by a random letter.

        Punctuation is kept in place; every other character (letters, digits,
        ...) becomes a random ASCII letter, upper-case only where the source
        character was upper-case.
        """
        return ''.join(
            char if char in punctuation
            else self._get_random_chatacter(char.isupper())
            for char in text
        )

    def _generate_pseudo_email(self, email):
        """Return a random look-alike of *email*.

        The local part and the host labels are scrambled, while ``@``, all
        punctuation and the top-level domain (the text after the last dot of
        the domain) are kept verbatim, so the result has the same length and
        shape as the input.

        Args:
            email: Address to pseudonymize.

        Returns:
            The pseudonymized address. Input without ``@`` is scrambled as a
            whole; a domain without a dot is scrambled entirely.
        """
        local, at_sign, domain = email.partition('@')
        if not at_sign:
            return self._scramble(email)
        host, dot, top_level = domain.rpartition('.')
        if dot:
            # Keep the TLD readable; only the host labels are randomized.
            domain = self._scramble(host) + dot + top_level
        else:
            domain = self._scramble(domain)
        return self._scramble(local) + at_sign + domain

    def _generate_pseudo_user(self, user):
        """Return a random look-alike of the mention *user*.

        The first character (the leading ``@`` of a mention) is kept and the
        rest is scrambled. An empty string is returned unchanged.
        """
        if not user:
            return user
        return user[0] + self._scramble(user[1:])

    def _anonoymize_email(self, token):
        """Handles removal/changing of emails addresses.

        Args:
            token: Text that may contain e-mail addresses.

        Returns:
            *token* with every address deleted, tagged or pseudonymized
            according to the configured method; text outside the matches is
            never modified.
        """
        if self._method == 'delete':
            return self._EMAIL_PATTERN.sub('', token)
        if self._method == 'tag':
            return self._EMAIL_PATTERN.sub(self._mail_token, token)
        if self._method == 'pseudo':
            return self._EMAIL_PATTERN.sub(
                lambda match: self._generate_pseudo_email(match.group()),
                token,
            )
        return token

    def _anonoymize_user(self, token):
        """Handles removal/change of users.

        Args:
            token: Text that may contain ``@user`` mentions.

        Returns:
            *token* with every mention deleted, tagged or pseudonymized
            according to the configured method; text outside the matches is
            never modified.
        """
        if self._method == 'delete':
            return self._MENTION_PATTERN.sub('', token)
        if self._method == 'tag':
            return self._MENTION_PATTERN.sub(self._user_token, token)
        if self._method == 'pseudo':
            return self._MENTION_PATTERN.sub(
                lambda match: self._generate_pseudo_user(match.group()),
                token,
            )
        return token