From 8503609eb6fcfa95c13ab68a746e7c6a27334ecc Mon Sep 17 00:00:00 2001 From: Norbert Ropiak <norbert.ropiak@pwr.edu.pl> Date: Thu, 10 Jun 2021 11:40:19 +0000 Subject: [PATCH] Support for English and Russian languages --- .gitignore | 3 +- Dockerfile | 3 +- dictionaries/en_dict.txt | 129 +++++++++++ wiki.txt => dictionaries/pl_dict.txt | 0 dictionaries/ru_dict.txt | 84 +++++++ src/__init__.py | 0 src/anonymizer.py | 305 -------------------------- src/anonymizers/english_anonymizer.py | 192 ++++++++++++++++ src/anonymizers/polish_anonymizer.py | 200 +++++++++++++++++ src/anonymizers/russian_anonymizer.py | 191 ++++++++++++++++ src/base_anonymizer.py | 164 ++++++++++++++ src/generators.py | 20 +- src/utils.py | 14 ++ src/worker.py | 15 +- 14 files changed, 1001 insertions(+), 319 deletions(-) create mode 100644 dictionaries/en_dict.txt rename wiki.txt => dictionaries/pl_dict.txt (100%) create mode 100644 dictionaries/ru_dict.txt create mode 100644 src/__init__.py delete mode 100644 src/anonymizer.py create mode 100644 src/anonymizers/english_anonymizer.py create mode 100644 src/anonymizers/polish_anonymizer.py create mode 100644 src/anonymizers/russian_anonymizer.py create mode 100644 src/base_anonymizer.py create mode 100644 src/utils.py diff --git a/.gitignore b/.gitignore index baad420..f1d48cf 100644 --- a/.gitignore +++ b/.gitignore @@ -137,4 +137,5 @@ dmypy.json # Cython debug symbols cython_debug/ -.vscode \ No newline at end of file +.vscode +*.ipynb diff --git a/Dockerfile b/Dockerfile index 836cdd0..62a552b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,8 @@ WORKDIR /home/worker COPY ./src ./src COPY ./main.py . COPY ./requirements.txt . -COPY ./wiki.txt . +COPY ./dictionaries . + RUN python3.6 -m pip install -r requirements.txt diff --git a/dictionaries/en_dict.txt b/dictionaries/en_dict.txt new file mode 100644 index 0000000..744500b --- /dev/null +++ b/dictionaries/en_dict.txt @@ -0,0 +1,129 @@ +PERSON FIRST_M John +PERSON FIRST_M Liam +PERSON FIRST_M Noah +PERSON FIRST_M Oliver +PERSON FIRST_M Elijah +PERSON FIRST_M William +PERSON FIRST_M James +PERSON FIRST_M Benjamin +PERSON FIRST_M Lucas +PERSON FIRST_M Henry +PERSON FIRST_M Alexander +PERSON FIRST_M Logan +PERSON FIRST_F Emma +PERSON FIRST_F Olivia +PERSON FIRST_F Ava +PERSON FIRST_F Isabella +PERSON FIRST_F Sophia +PERSON FIRST_F Charlotte +PERSON FIRST_F Mia +PERSON FIRST_F Amelia +PERSON FIRST_F Harper +PERSON FIRST_F Evelyn +PERSON FIRST_F Abigail +PERSON LAST Smith +PERSON LAST Johnson +PERSON LAST Williams +PERSON LAST Brown +PERSON LAST Jones +PERSON LAST Garcia +PERSON LAST Miller +PERSON LAST Davis +PERSON LAST Rodriguez +PERSON LAST Martinez +PERSON LAST Hernandez +PERSON LAST Lopez +PERSON LAST Gonzales +PERSON LAST Wilson +PERSON LAST Anderson +GPE COUNTRY Poland +GPE COUNTRY France +GPE COUNTRY China +GPE COUNTRY India +GPE COUNTRY United States +GPE COUNTRY Indonesia +GPE COUNTRY Brazil +GPE COUNTRY Germany +GPE COUNTRY Egypt +GPE COUNTRY United Kingdom +GPE COUNTRY Thailand +GPE COUNTRY South Africa +GPE COUNTRY Spain +GPE COUNTRY Argentina +GPE COUNTRY Italy +GPE COUNTRY Canada +GPE CITY New York +GPE CITY Los Angeles +GPE CITY Tokyo +GPE CITY Delhi +GPE CITY Shanghai +GPE CITY Mexico City +GPE CITY Cairo +GPE CITY Mumbai +GPE CITY Beijing +GPE CITY Dhaka +GPE CITY Osaka +GPE CITY Karachi +GPE CITY Buenos Aires +GPE CITY Rio de Janeiro +GPE CITY Paris +GPE CITY Madrid +GPE CITY Toronto +GPE CITY Barcelona +GPE CITY Warsaw +GPE STATE Arizona +GPE STATE California +GPE STATE New Mexico +GPE STATE Alabama
+GPE STATE Florida +GPE STATE New York +GPE STATE Oklahoma +GPE STATE Texas +FAC STREET First Street +FAC STREET Second Street +FAC STREET Third Street +FAC STREET Fourth Street +FAC STREET Fifth Street +FAC STREET Park Street +FAC STREET Main Street +FAC STREET Oak Street +FAC STREET Pine Street +FAC STREET Maple Street +FAC STREET Cedar Street +FAC STREET Washington Street +FAC STREET Lincoln Street +FAC STREET Church Street +FAC AVENUE First Avenue +FAC AVENUE Second Avenue +FAC AVENUE Third Avenue +FAC AVENUE Fourth Avenue +FAC AVENUE Park Avenue +FAC AVENUE Fifth Avenue +FAC AVENUE Main Avenue +FAC AVENUE Oak Avenue +FAC AVENUE Pine Avenue +FAC AVENUE Maple Avenue +FAC AVENUE Cedar Avenue +FAC AVENUE Washington Avenue +FAC AVENUE Lincoln Avenue +FAC AVENUE Church Avenue +FAC HIGHWAY Route 66 +FAC HIGHWAY Highway 12 +FAC HIGHWAY Great River Road +FAC HIGHWAY Blue Ridge Parkway +FAC HIGHWAY Pacific Coast Highway +FAC HIGHWAY Overseas Highway +FAC HIGHWAY Going-to-the-Sun Road +FAC AIRPORT Guangzhou Baiyun International Airport +FAC AIRPORT Hartsfield–Jackson Atlanta International Airport +FAC AIRPORT Chengdu Shuangliu International Airport +FAC AIRPORT Dallas/Fort Worth International Airport +FAC AIRPORT Shenzhen Bao'an International Airport +FAC AIRPORT Tokyo Haneda Airport +FAC AIRPORT Indira Gandhi International Airport +FAC AIRPORT Los Angeles International Airport +FAC AIRPORT O'Hare International Airport +FAC AIRPORT Istanbul Airport +FAC AIRPORT Charles de Gaulle Airport +FAC AIRPORT Heathrow Airport +FAC AIRPORT Mexico City International Airport diff --git a/wiki.txt b/dictionaries/pl_dict.txt similarity index 100% rename from wiki.txt rename to dictionaries/pl_dict.txt diff --git a/dictionaries/ru_dict.txt b/dictionaries/ru_dict.txt new file mode 100644 index 0000000..c2006d8 --- /dev/null +++ b/dictionaries/ru_dict.txt @@ -0,0 +1,84 @@ +PERSON FIRST_M Артём +PERSON FIRST_M Александр +PERSON FIRST_M Роман +PERSON FIRST_M Евгений +PERSON FIRST_M Иван +PERSON FIRST_M Максим +PERSON FIRST_M Денис +PERSON FIRST_M Алексей +PERSON FIRST_M Дмитрий +PERSON FIRST_M Даниил +PERSON FIRST_M Сергей +PERSON FIRST_M Николай +PERSON FIRST_F София +PERSON FIRST_F Анастасия +PERSON FIRST_F Виктория +PERSON FIRST_F Ксения +PERSON FIRST_F Арина +PERSON FIRST_F Елизавета +PERSON FIRST_F Аделина +PERSON FIRST_F Ирина +PERSON FIRST_F Елена +PERSON FIRST_F Полина +PERSON FIRST_F Дарья +PERSON LAST Иванов +PERSON LAST Петров +PERSON LAST Сидоров +PERSON LAST Смирнов +PERSON LAST Волков +PERSON LAST Фёдоров +PERSON LAST Попов +PERSON LAST Семёнов +PERSON LAST Михайлов +PERSON LAST Егоров +PERSON LAST Ленков +PERSON LAST Васильев +PERSON LAST Николаев +PERSON LAST Морозов +PERSON LAST Степанов +LOCATION COUNTRY Польша +LOCATION COUNTRY Франция +LOCATION COUNTRY Китай +LOCATION COUNTRY Индия +LOCATION COUNTRY Соединённые Штаты Америки +LOCATION COUNTRY Индонезия +LOCATION COUNTRY Бразилия +LOCATION COUNTRY Германия +LOCATION COUNTRY Египет +LOCATION COUNTRY Великобритания +LOCATION COUNTRY Таиланд +LOCATION COUNTRY Южная Африка +LOCATION COUNTRY Испания +LOCATION COUNTRY Аргентина +LOCATION COUNTRY Италия +LOCATION COUNTRY Канада +LOCATION CITY New York +LOCATION CITY Los Angeles +LOCATION CITY Tokyo +LOCATION CITY Delhi +LOCATION CITY Shanghai +LOCATION CITY Mexico City +LOCATION CITY Cairo +LOCATION CITY Mumbai +LOCATION CITY Beijing +LOCATION CITY Dhaka +LOCATION CITY Osaka +LOCATION CITY Karachi +LOCATION CITY Buenos Aires
+LOCATION CITY Rio de Janeiro +LOCATION CITY Paris +LOCATION CITY Madrid +LOCATION CITY Toronto +LOCATION CITY Barcelona +LOCATION CITY Warsaw +LOCATION STREET Советская +LOCATION STREET Октябрьская +LOCATION STREET Ленина +LOCATION STREET Гагарина +LOCATION STREET Маркса +LOCATION STREET Английская +LOCATION STREET Кадырова +LOCATION STREET Пушкина +LOCATION STREET Победы +LOCATION STREET Первомайская +LOCATION STREET Мира \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/anonymizer.py b/src/anonymizer.py deleted file mode 100644 index 38ecf34..0000000 --- a/src/anonymizer.py +++ /dev/null @@ -1,305 +0,0 @@ -"""Implementation of anonymizer functionality.""" -import random -from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, - generate_pseudo_user, generate_pseudo_website, - generate_phone_number_tag, generate_pseudo_date) - -import regex - - -class Anonymizer: - """Class used to edit sentences based on options.""" - - email_regex = regex.compile( - r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' - r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)' - r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' - r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', regex.I - ) - user_regex = regex.compile(r'\B(?P<username>\@[\w\-]+)') - _website_exceptions = ['m.in'] - website_regex = regex.compile( - r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(_website_exceptions)) + - r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' - r'(?P<auth>\S+(?::\S*)?@)?' - r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' - r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' - r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' - r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' - r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' - r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' - r'|' - r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' - r'[a-z0-9\u00a1-\uffff]\.)+)' - r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' - r'(?P<port>:\d{2,5})?' - r'(?P<path>[/?#]\S*)?)', - regex.UNICODE | regex.I - ) - phone_number_regex = regex.compile( - r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' - r'(?P<number>(\d[- ]??){9,10})' - ) - date_regex = regex.compile( - r'\b(?P<day_or_month_year>' - r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' - r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' - r'(?P<year1>\d{4}|\d{2}))\b|' - - r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' - r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' - r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' - - r'(?P<month_in_words>' - r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
- r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' - r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' - r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' - r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)' - r'|Gru(?:|dzie[nń]|dnia))\b' - r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' - r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' - r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', regex.I - ) - - _file_to_liner_dispatch = { - 'nam_liv_person': 'person_first_nam', - 'nam_liv_person_last': 'person_last_nam', - 'nam_fac_road': 'road_nam', - 'nam_loc_gpe_city': 'city_nam', - 'nam_org_group_team': 'country_nam' - } - - _liner_to_tag_dispatch = { - 'person_first_nam': '[OSOBA]', - 'person_last_nam': '[OSOBA]', - 'road_nam': '[MIEJSCE]', - 'city_nam': '[MIEJSCE]', - 'country_nam': '[MIEJSCE]' - } - - def __init__(self, task_options): - """Initialize anonymizer with task_options.""" - self.unmarshallers = { - 'chunk': lambda *args: '\n', - 'sentence': lambda *args: self._process_sent_tree(*args), - } - self._method = task_options.get('method', 'delete') - self._mail_token = '[MAIL]' - self._user_token = '@[USER]' - self._website_token = '[WWW]' - self._digits_token = '[DIGITS]' - self._date_token = '[DATE]' - self._default_token = '[INNE]' - self._form_dict = dict() - self._pseudo_ann_list = list() - # Order is important, first more specific - self._category_anonymisation = { - 'user': (self.user_regex, self._user_token, - generate_pseudo_user), - 'email': (self.email_regex, self._mail_token, - generate_pseudo_email), - 'website': (self.website_regex, self._website_token, - generate_pseudo_website), - 'date': (self.date_regex, self._date_token, - generate_pseudo_date), - 'phone_number': (self.phone_number_regex, self._digits_token, - generate_pseudo_phone_number), - } - self._load_file() - - def _load_file(self, file_name='wiki.txt'): - with open(file_name, 'r', encoding='utf-8') as f: - for line in f.readlines(): - l_list = line.split() - cat = l_list[0] - if cat in self._file_to_liner_dispatch: - cat_name = self._file_to_liner_dispatch[cat] - length = int((len(l_list) - 2) / 2) - gen_name = ' '.join(l_list[(1 + length):(1 + 2 * length)]) - flx_name = ' '.join(l_list[1:(1 + length)]) - flex = l_list[-1] - if cat_name not in self._form_dict: - self._form_dict[cat_name] = dict() - if length not in self._form_dict[cat_name]: - self._form_dict[cat_name][length] = dict() - if gen_name not in self._form_dict[cat_name][length]: - self._form_dict[cat_name][length][gen_name] = dict() - self._form_dict[cat_name][length][gen_name][flex] = flx_name - for cat in self._form_dict: - for length in self._form_dict[cat]: - self._form_dict[cat][length] = list( - self._form_dict[cat][length].items() - ) - - def _process_sent_tree(self, sentence_subtree): - string_builder = [] - id = 0 - for elem in sentence_subtree: - if elem.tag == 'tok': - tok = self._process_single_tok(id, elem) - string_builder.append(tok) - string_builder.append(' ') - id += 2 - elif elem.tag == 'ns': - id -= 1 - string_builder.pop() - else: - raise Exception('Unrecognized tag inside sentence: ' + elem.tag) - return self._process_sentence(string_builder) - - def _process_sentence(self, string_builder): - string_builder = self._handle_pseudo_ann(string_builder) - sentence = ''.join(string_builder) - return self._anonymize(sentence) - - def _process_word(self, id, text, tag, ann): - for chan, value in ann: - if value != 0: - text = self._handle_annotated(id,
text, tag, chan) - break - return text - - def _handle_annotated(self, id, text, tag, ann): - if self._method == 'delete': - return '' - elif self._method == 'tag': - if ann in self._liner_to_tag_dispatch: - return self._liner_to_tag_dispatch[ann] - elif self._method == 'pseudo': - if ann in self._form_dict: - self._pseudo_ann_list.append((id, text, tag, ann)) - return text - - def _handle_pseudo_ann(self, string_builder): - if self._pseudo_ann_list: - it = iter(self._pseudo_ann_list) - id, text, tag, ann = next(it) - current_tag = tag - current_ann = ann - current_id = id - length = 1 - for id, text, tag, ann in it: - if current_ann == ann and (ann != 'person_first_nam' and - ann != 'person_last_nam'): - if id == current_id + 2: - length += 1 - current_tag = tag - current_id = id - continue - new_text = self._get_pseudo_ann( - ann=current_ann, - tag=current_tag, - length=length - ) - for t in new_text.split(' '): - string_builder[current_id - 2 * (length - 1)] = t - length -= 1 - length = 1 - current_tag = tag - current_ann = ann - current_id = id - new_text = self._get_pseudo_ann(current_ann, current_tag, length) - toks = new_text.split(' ') - for i in range(length): - if i < len(toks): - string_builder[current_id - 2 * (length - 1)] = toks[i] - else: - string_builder[current_id - 2 * (length - 1)] = '' - if string_builder[current_id - 2 * (length - 1) + 1] == ' ': - string_builder[current_id - 2 * (length - 1) + 1] = '' - length -= 1 - self._pseudo_ann_list.clear() - return string_builder - - def _get_pseudo_ann(self, ann, tag, length): - while length not in self._form_dict[ann] and length > 0: - length -= 1 - if length == 0: - return '' - new_tag = ':'.join(tag.split(':')[1:4]) - for i in range(0, 10): - random_entry = random.choice(self._form_dict[ann][length]) - if new_tag in random_entry[1]: - return random_entry[1][new_tag] - if new_tag == 'ign': - return random_entry[0] - random_entry = random.choice(self._form_dict[ann][length]) - return random_entry[0] - - def _process_single_tok(self, id, tok_subtree): - text = '' - tag = '' - ann = [] - for elem in tok_subtree: - if elem.tag == 'orth': - text = elem.text - elif elem.tag == 'lex': - tag = self._process_lex(elem) - elif elem.tag == 'ann': - ann.append(self._process_ann(elem)) - word = self._process_word(id, text, tag, ann) - return word - - def _process_lex(self, lex_subtree): - tag = '' - for elem in lex_subtree: - if elem.tag == 'ctag': - tag = elem.text - elif elem.tag != 'base': - raise Exception('Unrecognized tag inside lex: ' + elem.tag) - if tag == '': - raise Exception('Lex tag had no ctag inside!') - return tag - - def _process_ann(self, ann_subtree): - value = int(ann_subtree.text) - chan = ann_subtree.attrib["chan"] - return chan, value - - def _anonymize(self, sentence): - if self._method == 'delete': - for pattern, _, _ in self._category_anonymisation.values(): - sentence = regex.sub(pattern, '', sentence) - elif self._method == 'tag': - sentence = self._tagging(sentence) - elif self._method == 'pseudo': - sentence = self._pseudonymization(sentence) - return sentence - - def _tagging(self, sentence): - for category in self._category_anonymisation: - pattern, token, _ = self._category_anonymisation[category] - - if category == 'phone_number': - matches = [m for m in pattern.finditer(sentence)] - for match in matches: - tag = generate_phone_number_tag(match.groupdict(''), token) - replace_match = match.group(0) - sentence = regex.sub(regex.escape( - replace_match), tag, sentence) - else: - sentence = 
regex.sub(pattern, token, sentence) - return sentence - - def _pseudonymization(self, sentence): - sentence_after_regex = sentence - to_replace = [] - for category in self._category_anonymisation: - pattern, _, generator = self._category_anonymisation[category] - for match in pattern.finditer(sentence_after_regex): - if not match: - continue - to_replace.append((match, generator)) - sentence_after_regex = regex.sub( - regex.escape(match.group(0)), '', sentence_after_regex) - - for match, generator in to_replace: - replace_match = match.group(0) - pseudo_string = generator(match.groupdict('')) - sentence = regex.sub( - regex.escape(replace_match), - pseudo_string, - sentence - ) - return sentence diff --git a/src/anonymizers/english_anonymizer.py b/src/anonymizers/english_anonymizer.py new file mode 100644 index 0000000..6638942 --- /dev/null +++ b/src/anonymizers/english_anonymizer.py @@ -0,0 +1,192 @@ +"""Implementation of anonymizer functionality for English language.""" +import math +import random + +import regex + + +from src.utils import consume +from src.ccl_handler import CCLHandler +from src.base_anonymizer import BaseAnonymizer +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, generate_pseudo_website, + generate_pseudo_date) + + +class EnglishAnonymizer(BaseAnonymizer): + """Class with an anonymization implementation for the English language.""" + + skip_ann = ['CARDINAL', 'LAW', 'DATE', 'QUANTITY', 'TIME', 'EVENT'] + date_regex = regex.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|' + r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)' + r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b' + r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' + r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?' 
+ r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', regex.I + ) + + spacy_tag_map = { + 'PERSON': '[PERSON]', + 'GPE': '[LOCATION]', + 'FAC': '[LOCATION]', + } + + def __init__(self, task_options): + """Initialize anonymizer with base regexes.""" + super().__init__(task_options) + self.lang = task_options.get('language', 'en') + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user, {}), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email, {}), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website, {}), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date, {'lang': self.lang}), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number, {}), + } + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._load_dict_file() + + def _load_dict_file(self, filename='en_dict.txt'): + with open(filename, 'r', encoding='utf-8') as f: + for line in f.readlines(): + l_list = line.strip('\n').split() + cat, subtype = l_list[0], l_list[1] + length = len(l_list[2:]) + text = ' '.join(l_list[2:]) + if cat not in self._form_dict: + self._form_dict[cat] = {} + if subtype not in self._form_dict[cat]: + self._form_dict[cat][subtype] = [] + self._form_dict[cat][subtype].append((text, length)) + + def _handle_annotated(self, id, text, tag, ann): + if self._method == 'delete': + return '' + elif self._method == 'tag': + if ann in self.spacy_tag_map: + return self.spacy_tag_map[ann] + elif self._method == 'pseudo': + if ann in self.spacy_tag_map: + self._pseudo_ann_list.append((id, text, tag, ann)) + return text + + def _process_sentence(self, string_builder): + string_builder = self._handle_pseudo_ann(string_builder) + sentence = ''.join(string_builder) + return self._anonymize(sentence) + + def _get_pseudo_ann(self, ann, text, length): + new_text = [] + if ann == 'PERSON': + gen = random.choice(['FIRST_M', 'FIRST_F']) + name_length = length - 1 if length > 1 else 1 + while name_length > 0: + names = [p for p in self._form_dict['PERSON'][gen] + if p[1] <= name_length] + random_name = random.choice(names) + name_length -= random_name[1] + new_text.append(random_name[0]) + if length > 1: + last_name = random.choice(self._form_dict['PERSON']['LAST']) + new_text.append(last_name[0]) + elif ann == 'GPE': + found = False + for _, values in self._form_dict['GPE'].items(): + if ' '.join(text) in [v[0] for v in values]: + new_text = [random.choice(values)[0]] + found = True + if not found: + new_text = [random.choice(self._form_dict['GPE']['CITY'])[0]] + else: + new_text = text + return ' '.join(new_text) + + def _handle_pseudo_ann(self, string_builder): + if not self._pseudo_ann_list: + return string_builder + shifted_id = 0 + pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list)) + for i, (id_, text, _, ann) in pseudo_ann_iter: + if ann in self.skip_ann: + continue + j = i + 1 + id_ += shifted_id + start_id = id_ + ann_len = 1 + phrase = [text] + skip_tokens = 1 + while j < len(self._pseudo_ann_list): + next_id, next_text, _, next_ann = self._pseudo_ann_list[j] + next_id += shifted_id + if ann != next_ann: + break + if next_id == id_ + 1 and string_builder[next_id] == '-': + skip_tokens += 1 + elif next_id == id_ + 1 and string_builder[id_] == '-': + ann_len += 1 + skip_tokens += 1 + phrase.append(next_text) + elif next_id == id_ + 2 and
string_builder[id_ + 1] == ' ': + ann_len += 1 + skip_tokens += 2 + phrase.append(next_text) + else: + break + id_ = next_id + j += 1 + new_text = self._get_pseudo_ann( + ann=ann, + text=phrase, + length=ann_len + ) + new_text = regex.split('( )', new_text) + string_builder = string_builder[:start_id] + new_text + \ + string_builder[start_id + skip_tokens:] + if ann_len > 1: + consume(pseudo_ann_iter, ann_len - 1) + if math.ceil(len(new_text) / 2) != ann_len: + shifted_id += len(new_text) - (ann_len * 2) + 1 + self._pseudo_ann_list.clear() + return string_builder + + def _anonymize(self, sentence): + if self._method == 'delete': + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) + elif self._method == 'tag': + sentence = self._tagging(sentence) + elif self._method == 'pseudo': + sentence = self._pseudonymization(sentence) + return sentence + + def process(self, input_filename, output_filename): + """Anonymize the file in CCL format to the resulting file in plain text. + + Args: + input_filename (str): Input filename in CCL format. \ + Text tagged and processed with spacy NER. + output_filename (str): Output filename. + + """ + ccl_handler = CCLHandler(input_filename) + ccl_handler.process(output_filename, self.unmarshallers) diff --git a/src/anonymizers/polish_anonymizer.py b/src/anonymizers/polish_anonymizer.py new file mode 100644 index 0000000..2706ef2 --- /dev/null +++ b/src/anonymizers/polish_anonymizer.py @@ -0,0 +1,200 @@ +"""Implementation of anonymizer functionality for Polish language.""" +import math +import regex +import random + + +from src.utils import consume +from src.base_anonymizer import BaseAnonymizer +from src.ccl_handler import CCLHandler +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, generate_pseudo_website, + generate_pseudo_date) + + +class PolishAnonymizer(BaseAnonymizer): + """Class with an anonymization implementation for the Polish language.""" + + date_regex = regex.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' 
+ r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' + r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' + r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)' + r'|Gru(?:|dzie[nń]|dnia))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', regex.I + ) + + _file_to_liner_dispatch = { + 'nam_liv_person': 'person_first_nam', + 'nam_liv_person_last': 'person_last_nam', + 'nam_fac_road': 'road_nam', + 'nam_loc_gpe_city': 'city_nam', + 'nam_org_group_team': 'country_nam' + } + + _liner_to_tag_dispatch = { + 'person_first_nam': '[OSOBA]', + 'person_last_nam': '[OSOBA]', + 'road_nam': '[MIEJSCE]', + 'city_nam': '[MIEJSCE]', + 'country_nam': '[MIEJSCE]' + } + + def __init__(self, task_options): + """Initialize anonymizer with base regexes.""" + super().__init__(task_options) + self.lang = task_options.get('language', 'pl') + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user, {}), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email, {}), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website, {}), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date, {'lang': self.lang}), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number, {}), + } + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._form_dict = dict() + self._pseudo_ann_list = list() + self._load_file() + + def _load_file(self, filename='pl_dict.txt'): + with open(filename, 'r', encoding='utf-8') as f: + for line in f.readlines(): + l_list = line.split() + cat = l_list[0] + if cat in self._file_to_liner_dispatch: + cat_name = self._file_to_liner_dispatch[cat] + length = int((len(l_list) - 2) / 2) + gen_name = ' '.join(l_list[(1 + length):(1 + 2 * length)]) + flx_name = ' '.join(l_list[1:(1 + length)]) + flex = l_list[-1] + if cat_name not in self._form_dict: + self._form_dict[cat_name] = dict() + if length not in self._form_dict[cat_name]: + self._form_dict[cat_name][length] = dict() + if gen_name not in self._form_dict[cat_name][length]: + self._form_dict[cat_name][length][gen_name] = dict() + self._form_dict[cat_name][length][gen_name][flex] = flx_name + for cat in self._form_dict: + for length in self._form_dict[cat]: + self._form_dict[cat][length] = list( + self._form_dict[cat][length].items() + ) + + def _handle_annotated(self, id, text, tag, ann): + if self._method == 'delete': + return '' + elif self._method == 'tag': + if ann in self._liner_to_tag_dispatch: + return self._liner_to_tag_dispatch[ann] + elif self._method == 'pseudo': + if ann in self._form_dict: + self._pseudo_ann_list.append((id, text, tag, ann)) + return text + + def _process_sentence(self, string_builder): + string_builder = self._handle_pseudo_ann(string_builder) + sentence = ''.join(string_builder) + return self._anonymize(sentence) + + def _get_pseudo_ann(self, ann, tag, length): + while length not in self._form_dict[ann] and length > 0: + length -= 1 + if length == 0: + return '' + new_tag = ':'.join(tag.split(':')[1:4]) + for _ in range(0, 10): + random_entry = random.choice(self._form_dict[ann][length]) + if new_tag in random_entry[1]: + return random_entry[1][new_tag]
+ if new_tag == 'ign': + return random_entry[0] + random_entry = random.choice(self._form_dict[ann][length]) + return random_entry[0] + + def _handle_pseudo_ann(self, string_builder): + if not self._pseudo_ann_list: + return string_builder + shifted_id = 0 + pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list)) + for i, (id_, _, tag, ann) in pseudo_ann_iter: + j = i + 1 + start_id = id_ = id_ + shifted_id + ann_len = 1 + skip_tokens = 1 + while j < len(self._pseudo_ann_list): + next_id, _, _, next_ann = self._pseudo_ann_list[j] + next_id += shifted_id + if ann != next_ann or (ann == 'person_first_nam' or + ann == 'person_last_nam'): + break + if next_id == id_ + 1 and string_builder[next_id] == '-': + skip_tokens += 1 + elif next_id == id_ + 1 and string_builder[id_] == '-': + ann_len += 1 + skip_tokens += 1 + elif next_id == id_ + 2 and string_builder[id_ + 1] == ' ': + ann_len += 1 + skip_tokens += 2 + else: + break + id_ = next_id + j += 1 + new_text = self._get_pseudo_ann( + ann=ann, + tag=tag, + length=ann_len + ) + new_text = regex.split('( )', new_text) + string_builder = string_builder[:start_id] + new_text + \ + string_builder[start_id + skip_tokens:] + if ann_len > 1: + consume(pseudo_ann_iter, ann_len - 1) + if math.ceil(len(new_text) / 2) != ann_len: + shifted_id += len(new_text) - (ann_len * 2) + 1 + self._pseudo_ann_list.clear() + return string_builder + + def _anonymize(self, sentence): + if self._method == 'delete': + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) + elif self._method == 'tag': + sentence = self._tagging(sentence) + elif self._method == 'pseudo': + sentence = self._pseudonymization(sentence) + return sentence + + def process(self, input_filename, output_filename): + """Anonymize the file in CCL format to the resulting file in plain text. + + Args: + input_filename (str): Input filename in CCL format. \ + Text tagged and processed with LINER. + output_filename (str): Output filename. + + """ + ccl_handler = CCLHandler(input_filename) + ccl_handler.process(output_filename, self.unmarshallers) diff --git a/src/anonymizers/russian_anonymizer.py b/src/anonymizers/russian_anonymizer.py new file mode 100644 index 0000000..2b9bef6 --- /dev/null +++ b/src/anonymizers/russian_anonymizer.py @@ -0,0 +1,191 @@ +"""Implementation of anonymizer functionality for Russian language.""" +import math +import random + +import regex + + +from src.utils import consume +from src.ccl_handler import CCLHandler +from src.base_anonymizer import BaseAnonymizer +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, generate_pseudo_website, + generate_pseudo_date) + + +class RussianAnonymizer(BaseAnonymizer): + """Class with an anonymization implementation for the Russian language.""" + + date_regex = regex.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+ r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|' + r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|' + r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|' + r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?' + r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', regex.I + ) + + spacy_tag_map = { + 'PER': '[PERSON]', + 'LOC': '[LOCATION]', + } + + def __init__(self, task_options): + """Initialize anonymizer with base regexes.""" + super().__init__(task_options) + self.lang = task_options.get('language', 'ru') + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user, {}), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email, {}), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website, {}), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date, {'lang': self.lang}), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number, {}), + } + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._load_dict_file() + + def _load_dict_file(self, filename='ru_dict.txt'): + with open(filename, 'r', encoding='utf-8') as f: + for line in f.readlines(): + l_list = line.strip('\n').split() + cat, subtype = l_list[0], l_list[1] + length = len(l_list[2:]) + text = ' '.join(l_list[2:]) + if cat not in self._form_dict: + self._form_dict[cat] = {} + if subtype not in self._form_dict[cat]: + self._form_dict[cat][subtype] = [] + self._form_dict[cat][subtype].append((text, length)) + + def _handle_annotated(self, id, text, tag, ann): + if self._method == 'delete': + return '' + elif self._method == 'tag': + if ann in self.spacy_tag_map: + return self.spacy_tag_map[ann] + elif self._method == 'pseudo': + if ann in self.spacy_tag_map: + self._pseudo_ann_list.append((id, text, tag, ann)) + return text + + def _process_sentence(self, string_builder): + string_builder = self._handle_pseudo_ann(string_builder) + sentence = ''.join(string_builder) + return self._anonymize(sentence) + + def _get_pseudo_ann(self, ann, text, length): + new_text = [] + if ann == 'PER': + gen = random.choice(['FIRST_M', 'FIRST_F']) + name_length = length - 1 if length > 1 else 1 + while name_length > 0: + names = [p for p in self._form_dict['PERSON'][gen] + if p[1] <= name_length] + random_name = random.choice(names) + name_length -= random_name[1] + new_text.append(random_name[0]) + if length > 1: + last_name = random.choice(self._form_dict['PERSON']['LAST']) + new_text.append(last_name[0]) + elif ann == 'LOC': + found = False + for _, values in self._form_dict['LOCATION'].items(): + if ' '.join(text) in [v[0] for v in values]: + new_text = [random.choice(values)[0]] + found = True + if not found: + default_loc = self._form_dict['LOCATION']['CITY'] + new_text = [random.choice(default_loc)[0]] + else: + new_text = text + return ' '.join(new_text) + + def _handle_pseudo_ann(self, string_builder): + if not self._pseudo_ann_list: + return string_builder + shifted_id = 0 + pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list)) + for i, (id_, text, _, ann) in pseudo_ann_iter: + j = i + 1 + start_id = id_ = id_ + shifted_id + ann_len = 1 + phrase = [text] + skip_tokens = 1 + while j <
len(self._pseudo_ann_list): + next_id, next_text, _, next_ann = self._pseudo_ann_list[j] + next_id += shifted_id + if ann != next_ann: + break + if next_id == id_ + 1 and string_builder[next_id] == '-': + skip_tokens += 1 + elif next_id == id_ + 1 and string_builder[id_] == '-': + ann_len += 1 + skip_tokens += 1 + phrase.append(next_text) + elif next_id == id_ + 2 and string_builder[id_ + 1] == ' ': + ann_len += 1 + skip_tokens += 2 + phrase.append(next_text) + else: + break + id_ = next_id + j += 1 + new_text = self._get_pseudo_ann( + ann=ann, + text=phrase, + length=ann_len + ) + new_text = regex.split('( )', new_text) + string_builder = string_builder[:start_id] + new_text + \ + string_builder[start_id + skip_tokens:] + if ann_len > 1: + consume(pseudo_ann_iter, ann_len - 1) + if math.ceil(len(new_text) / 2) != ann_len: + shifted_id += len(new_text) - (ann_len * 2) + 1 + self._pseudo_ann_list.clear() + return string_builder + + def _anonymize(self, sentence): + if self._method == 'delete': + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) + elif self._method == 'tag': + sentence = self._tagging(sentence) + elif self._method == 'pseudo': + sentence = self._pseudonymization(sentence) + return sentence + + def process(self, input_filename, output_filename): + """Anonymize the file in CCL format to the resulting file in plain text. + + Args: + input_filename (str): Input filename in CCL format. \ + Text tagged and processed with spacy NER. + output_filename (str): Output filename. + + """ + ccl_handler = CCLHandler(input_filename) + ccl_handler.process(output_filename, self.unmarshallers) diff --git a/src/base_anonymizer.py b/src/base_anonymizer.py new file mode 100644 index 0000000..fd62de5 --- /dev/null +++ b/src/base_anonymizer.py @@ -0,0 +1,164 @@ +"""Abstract description of anonymizer including base regexes.""" +import regex +from abc import ABC, abstractmethod +from src.generators import generate_phone_number_tag + + +class BaseAnonymizer(ABC): + """Base abstract class for anonymization.""" + + email_regex = regex.compile( + r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' + r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)' + r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' + r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', regex.I + ) + user_regex = regex.compile(r'\B(?P<username>\@[\w\-]+)') + _website_exceptions = ['m.in'] + website_regex = regex.compile( + r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(_website_exceptions)) + + r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' + r'(?P<auth>\S+(?::\S*)?@)?' + r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' + r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' + r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' + r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' + r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' + r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' + r'|' + r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' + r'[a-z0-9\u00a1-\uffff]\.)+)' + r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' + r'(?P<port>:\d{2,5})?' + r'(?P<path>[/?#]\S*)?)', + regex.UNICODE | regex.I + ) + phone_number_regex = regex.compile( + r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' 
+ r'(?P<number>(\d[- ]??){9,10})' + ) + + def __init__(self, task_options): + """Initialize anonymizer with chosen method and default tokens.""" + self._mail_token = '[MAIL]' + self._user_token = '@[USER]' + self._website_token = '[WWW]' + self._digits_token = '[DIGITS]' + self._date_token = '[DATE]' + self._default_token = '[INNE]' + + self._method = task_options.get('method', 'delete') + + self._category_anonymisation = {} + self._form_dict = {} + self._pseudo_ann_list = [] + + def _process_lex(self, lex_subtree): + tag = '' + for elem in lex_subtree: + if elem.tag == 'ctag': + tag = elem.text + elif elem.tag != 'base': + raise Exception('Unrecognized tag inside lex: ' + elem.tag) + if tag == '': + raise Exception('Lex tag had no ctag inside!') + return tag + + def _tagging(self, sentence): + for category in self._category_anonymisation: + pattern, token, _, _ = self._category_anonymisation[category] + + if category == 'phone_number': + matches = [m for m in pattern.finditer(sentence)] + for match in matches: + tag = generate_phone_number_tag(match.groupdict(''), token) + replace_match = match.group(0) + sentence = regex.sub(regex.escape(replace_match), + tag, sentence) + else: + sentence = regex.sub(pattern, token, sentence) + return sentence + + def _pseudonymization(self, sentence): + sentence_after_regex = sentence + to_replace = [] + for category in self._category_anonymisation: + pattern, _, generator, args = self._category_anonymisation[category] + for match in pattern.finditer(sentence_after_regex): + if not match: + continue + to_replace.append((match, generator, args)) + sentence_after_regex = regex.sub(regex.escape(match.group(0)), + '', sentence_after_regex) + + for match, generator, args in to_replace: + replace_match = match.group(0) + pseudo_string = generator(match.groupdict(''), **args) + sentence = regex.sub( + regex.escape(replace_match), + pseudo_string, + sentence + ) + return sentence + + def _process_ann(self, ann_subtree): + value = int(ann_subtree.text) + chan = ann_subtree.attrib["chan"] + return chan, value + + def _process_single_tok(self, id, tok_subtree): + text = '' + tag = '' + ann = [] + for elem in tok_subtree: + if elem.tag == 'orth': + text = elem.text + elif elem.tag == 'lex': + tag = self._process_lex(elem) + elif elem.tag == 'ann': + ann.append(self._process_ann(elem)) + word = self._process_word(id, text, tag, ann) + return word + + def _process_word(self, id, text, tag, ann): + for chan, value in ann: + if value != 0: + text = self._handle_annotated(id, text, tag, chan) + break + return text + + def _process_sent_tree(self, sentence_subtree): + string_builder = [] + id = 0 + for elem in sentence_subtree: + if elem.tag == 'tok': + tok = self._process_single_tok(id, elem) + string_builder.append(tok) + string_builder.append(' ') + id += 2 + elif elem.tag == 'ns': + id -= 1 + string_builder.pop() + else: + raise Exception('Unrecognized tag inside sentence: ' + elem.tag) + return self._process_sentence(string_builder) + + @abstractmethod + def _handle_annotated(self, id, text, tag, ann): + pass + + @abstractmethod + def _process_sentence(self, string_builder): + pass + + @abstractmethod + def process(self, input_filename, output_filename): + """Anonymize the text in a file input_filename and save the anonymized \ + output text to a file output_filename. 
+ + Args: + input_filename (str): Input filename. + output_filename (str): Output filename. + + """ + pass diff --git a/src/generators.py b/src/generators.py index 2d4a147..606aabc 100644 --- a/src/generators.py +++ b/src/generators.py @@ -198,7 +198,8 @@ def random_date(day_no_digits: int, month_no_digits: int, year_match): return day, month, year -def month_number2text(month_number: int, abbr: bool, case: str = 'genitive'): +def month_number2text(month_number: int, abbr: bool, case: str = 'genitive', + lang='pl'): """Return the name of the month in words. Generate the month name from its number. @@ -209,9 +210,10 @@ def month_number2text(month_number: int, abbr: bool, case: str = 'genitive'): month_number (int): Number of the month. abbr (bool): Return abbreviation form. case (str): Return the name of the month in the given case. + lang (str): The language which is used to generate text. """ - locale = Locale('pl') + locale = Locale(lang) if case == 'genitive': months = locale.months['format'] elif case == 'nominative': @@ -227,7 +229,7 @@ return months[month_number] -def generate_pseudo_date(date_match): +def generate_pseudo_date(date_match, lang='pl'): """Pseudonymize matched date. Generate the pseudonymized based on matched data in text. @@ -237,6 +239,7 @@ Args: date_match: Matched date. + lang: The language which is used to generate the date. """ date = '' @@ -278,29 +281,32 @@ if date_match['year3']: year_match = date_match['year3'] - elif date_match['year4']: + elif lang != 'en' and date_match['year4']: year_match = date_match['year4'] else: year_match = '' day, month, year = random_date(day_len, 2, year_match) abbr = len(date_match['month']) == 3 - locale = Locale('pl') + locale = Locale(lang) if date_match['month'] in locale.months['format']['wide'].values(): case = 'genitive' else: case = 'nominative' - month = month_number2text(int(month), abbr, case) + month = month_number2text(int(month), abbr, case, lang=lang) if date_match['day1']: date_order = [day, date_match['punct5'], month, date_match['punct6']] + elif date_match['day2'] and lang == 'en': + date_order = [month, date_match['punct7'], + day, date_match['punct6']] elif date_match['day2']: date_order = [month, date_match['punct7'], day, date_match['punct8']] else: date_order = [month] - if date_match['year3'] or date_match['year4']: + if date_match['year3'] or (lang != 'en' and date_match['year4']): date_order += [year] date = ''.join(date_order) else: diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..81cc67f --- /dev/null +++ b/src/utils.py @@ -0,0 +1,14 @@ +"""Module for useful functions.""" + +import itertools + + +def consume(iterative, n): + """Consume n elements from an iterable object. + + Args: + iterative (iter): Python iterable object. + n (int): Number of elements to consume.
+ + """ + next(itertools.islice(iterative, n - 1, n), None) diff --git a/src/worker.py b/src/worker.py index de27b90..6cbc166 100644 --- a/src/worker.py +++ b/src/worker.py @@ -4,8 +4,9 @@ import logging import nlp_ws -from src.anonymizer import Anonymizer -from src.ccl_handler import CCLHandler +from src.anonymizers.polish_anonymizer import PolishAnonymizer +from src.anonymizers.english_anonymizer import EnglishAnonymizer +from src.anonymizers.russian_anonymizer import RussianAnonymizer _log = logging.getLogger(__name__) @@ -23,6 +24,10 @@ class Worker(nlp_ws.NLPWorker): 'tag' replaces selected tokens with arbitrary tags, 'pseudo' replaces selected tokens with a random token that """ - anon = Anonymizer(task_options) - ccl_handler = CCLHandler(input_file) - ccl_handler.process(output_file, anon.unmarshallers) + lang = task_options.get('language', 'pl') + anonymizers = {'pl': PolishAnonymizer, + 'en': EnglishAnonymizer, + 'ru': RussianAnonymizer + } + anon = anonymizers.get(lang, PolishAnonymizer)(task_options) + anon.process(input_file, output_file) -- GitLab