Commit 1677a12c authored by Szymon Ciombor's avatar Szymon Ciombor

Merge branch 'multilanguage' into 'master'

Support for the English and Russian languages

See merge request !5
parents 53be7c15 8503609e
Pipeline #3079 passed with stages
in 1 minute and 59 seconds
......@@ -137,4 +137,5 @@ dmypy.json
# Cython debug symbols
cython_debug/
.vscode
\ No newline at end of file
.vscode
*.ipynb
......@@ -4,7 +4,8 @@ WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
COPY ./wiki.txt .
COPY ./dictionaries .
RUN python3.6 -m pip install -r requirements.txt
......
PERSON FIRST_M John
PERSON FIRST_M Liam
PERSON FIRST_M Noah
PERSON FIRST_M Olivier
PERSON FIRST_M Elijah
PERSON FIRST_M William
PERSON FIRST_M James
PERSON FIRST_M Benjamin
PERSON FIRST_M Lucas
PERSON FIRST_M Henry
PERSON FIRST_M Alexander
PERSON FIRST_M Logan
PERSON FIRST_F Emma
PERSON FIRST_F Olivia
PERSON FIRST_F Ava
PERSON FIRST_F Isabella
PERSON FIRST_F Sophia
PERSON FIRST_F Charlotte
PERSON FIRST_F Mia
PERSON FIRST_F Amelia
PERSON FIRST_F Harper
PERSON FIRST_F Evelyn
PERSON FIRST_F Abigail
PERSON LAST Smith
PERSON LAST Johnson
PERSON LAST Williams
PERSON LAST Brown
PERSON LAST Jones
PERSON LAST Garcia
PERSON LAST Miller
PERSON LAST Davis
PERSON LAST Rodriguez
PERSON LAST Martinez
PERSON LAST Hernandez
PERSON LAST Lopez
PERSON LAST Gonzales
PERSON LAST Wilson
PERSON LAST Anderson
GPE COUNTRY Poland
GPE COUNTRY France
GPE COUNTRY China
GPE COUNTRY India
GPE COUNTRY United States
GPE COUNTRY Indonesia
GPE COUNTRY Brazil
GPE COUNTRY Germany
GPE COUNTRY Egypt
GPE COUNTRY United Kingdom
GPE COUNTRY Thailand
GPE COUNTRY South Africa
GPE COUNTRY Spain
GPE COUNTRY Argentina
GPE COUNTRY Italy
GPE COUNTRY Canada
GPE CITY New York
GPE CITY Los Angeles
GPE CITY Tokyo
GPE CITY Delhi
GPE CITY Shanghai
GPE CITY Mexico City
GPE CITY Cairo
GPE CITY Mumbai
GPE CITY Beijing
GPE CITY Dhaka
GPE CITY Osaka
GPE CITY Karachi
GPE CITY Buenos Aires
GPE CITY Rio de Janeiro
GPE CITY Paris
GPE CITY Madrid
GPE CITY Toronto
GPE CITY Barcelona
GPE CITY Warsaw
GPE STATE Arizona
GPE STATE California
GPE STATE New Mexico
GPE STATE Alabama
GPE STATE Florida
GPE STATE New York
GPE STATE Oklahoma
GPE STATE Texas
FAC STREET First Street
FAC STREET Second Street
FAC STREET Third Street
FAC STREET Fourth Street
FAC STREET Fifth Street
FAC STREET Park Street
FAC STREET Main Street
FAC STREET Oak Street
FAC STREET Pine Street
FAC STREET Maple Street
FAC STREET Cedar Street
FAC STREET Washington Street
FAC STREET Lincoln Street
FAC STREET Church Street
FAC AVENUE First Avenue
FAC AVENUE Second Avenue
FAC AVENUE Third Avenue
FAC AVENUE Fourth Avenue
FAC AVENUE Park Avenue
FAC AVENUE Fifth Avenue
FAC AVENUE Main Avenue
FAC AVENUE Oak Avenue
FAC AVENUE Pine Avenue
FAC AVENUE Maple Avenue
FAC AVENUE Cedar Avenue
FAC AVENUE Washington Avenue
FAC AVENUE Lincoln Avenue
FAC AVENUE Church Avenue
FAC HIGHWAY Route 66
FAC HIGHWAY Highway 12
FAC HIGHWAY Great River Road
FAC HIGHWAY Blue Ridge Parkway
FAC HIGHWAY Pacific Coast Highway
FAC HIGHWAY Overseas Highway
FAC HIGHWAY Going-to-the-Sun Road
FAC AIRPORT Guangzhou Baiyun International Airport
FAC AIRPORT Hartsfield–Jackson Atlanta International Airport
FAC AIRPORT Chengdu Shuangliu International Airport
FAC AIRPORT Dallas/Fort Worth International Airport
FAC AIRPORT Shenzhen Bao'an International Airport
FAC AIRPORT Tokyo Haneda Airport
FAC AIRPORT Indira Gandhi International Airport
FAC AIRPORT Los Angeles International Airport
FAC AIRPORT O'Hare International Airport
FAC AIRPORT Istanbul Airport
FAC AIRPORT Charles de Gaulle Airport
FAC AIRPORT Heathrow Airport
FAC AIRPORT Mexico City International Airport
PERSON FIRST_M Артём
PERSON FIRST_M Александр
PERSON FIRST_M Роман
PERSON FIRST_M Евгений
PERSON FIRST_M Иван
PERSON FIRST_M Максим
PERSON FIRST_M Денис
PERSON FIRST_M Алексей
PERSON FIRST_M Дмитрий
PERSON FIRST_M Даниил
PERSON FIRST_M Сергей
PERSON FIRST_M Николай
PERSON FIRST_F София
PERSON FIRST_F Анастасия
PERSON FIRST_F Виктория
PERSON FIRST_F Ксения
PERSON FIRST_F Арина
PERSON FIRST_F Елизавета
PERSON FIRST_F Аделина
PERSON FIRST_F Ирина
PERSON FIRST_F Елена
PERSON FIRST_F Полина
PERSON FIRST_F Дарья
PERSON LAST Иванов
PERSON LAST Петров
PERSON LAST Сидоров
PERSON LAST Смирнов
PERSON LAST Волков
PERSON LAST Фёдоров
PERSON LAST Попов
PERSON LAST Семёнов
PERSON LAST Михайлов
PERSON LAST Егоров
PERSON LAST Ленков
PERSON LAST Васильев
PERSON LAST Николаев
PERSON LAST Морозов
PERSON LAST Степанов
LOCATION COUNTRY Польша
LOCATION COUNTRY Франция
LOCATION COUNTRY Китай
LOCATION COUNTRY Индия
LOCATION COUNTRY Соединённые Штаты Америки
LOCATION COUNTRY Индонезия
LOCATION COUNTRY Бразилия
LOCATION COUNTRY Германия
LOCATION COUNTRY Египет
LOCATION COUNTRY Великобритания
LOCATION COUNTRY Таиланд
LOCATION COUNTRY Южная Африка
LOCATION COUNTRY Испания
LOCATION COUNTRY Аргентина
LOCATION COUNTRY Италия
LOCATION COUNTRY Канада
LOCATION CITY New York
LOCATION CITY Los Angeles
LOCATION CITY Tokyo
LOCATION CITY Delhi
LOCATION CITY Shanghai
LOCATION CITY Mexico City
LOCATION CITY Cairo
LOCATION CITY Mumbai
LOCATION CITY Beijing
LOCATION CITY Dhaka
LOCATION CITY Osaka
LOCATION CITY Karachi
LOCATION CITY Buenos Aires
LOCATION CITY Rio de Janeiro
LOCATION CITY Paris
LOCATION CITY Madrid
LOCATION CITY Toronto
LOCATION CITY Barcelona
LOCATION CITY Warsaw
LOCATION STREET Советская
LOCATION STREET Октябрьская
LOCATION STREET Ленина
LOCATION STREET Гагарина
LOCATION STREET Маркса
LOCATION STREET Английская
LOCATION STREET Кадырова
LOCATION STREET Пушкина
LOCATION STREET Победы
LOCATION STREET Первомайская
LOCATION STREET Мира
\ No newline at end of file
"""Implementation of anonymizer functionality for English language."""
import math
import random
import regex
from src.utils import consume
from src.ccl_handler import CCLHandler
from src.base_anonymizer import BaseAnonymizer
from src.generators import (generate_pseudo_email, generate_pseudo_phone_number,
generate_pseudo_user, generate_pseudo_website,
generate_pseudo_date)
class EnglishAnonymizer(BaseAnonymizer):
    """Anonymization implementation for the English language.

    Deletes, tags, or pseudonymizes personal information found either by
    spaCy NER annotations or by the regex categories inherited from
    ``BaseAnonymizer``.
    """

    # NER labels that carry no personal data; left untouched during
    # pseudonymization.
    skip_ann = ['CARDINAL', 'LAW', 'DATE', 'QUANTITY', 'TIME', 'EVENT']

    # Dates written as digits (day/month/year in several orders, 2- or
    # 4-digit years) or with an English month name, e.g. "12 Jan 2020".
    date_regex = regex.compile(
        r'\b(?P<day_or_month_year>'
        r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
        r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
        r'(?P<year1>\d{4}|\d{2}))\b|'
        r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
        r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
        r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
        r'(?P<month_in_words>'
        r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
        r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|'
        r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)'
        r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b'
        r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?'
        r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?'
        r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', regex.I
    )

    # spaCy entity label -> replacement tag used by the 'tag' method.
    spacy_tag_map = {
        'PERSON': '[PERSON]',
        'GPE': '[LOCATION]',
        'FAC': '[LOCATION]',
    }

    def __init__(self, task_options):
        """Initialize anonymizer with base regexes.

        Args:
            task_options (dict): Task configuration. Reads 'language'
                (defaults to 'en'); remaining keys are consumed by
                ``BaseAnonymizer``.
        """
        super().__init__(task_options)
        self.lang = task_options.get('language', 'en')
        # Order is important, first more specific
        self._category_anonymisation = {
            'user': (self.user_regex, self._user_token,
                     generate_pseudo_user, {}),
            'email': (self.email_regex, self._mail_token,
                      generate_pseudo_email, {}),
            'website': (self.website_regex, self._website_token,
                        generate_pseudo_website, {}),
            'date': (self.date_regex, self._date_token,
                     generate_pseudo_date, {'lang': self.lang}),
            'phone_number': (self.phone_number_regex, self._digits_token,
                             generate_pseudo_phone_number, {}),
        }
        self.unmarshallers = {
            'chunk': lambda *args: '\n',
            'sentence': lambda *args: self._process_sent_tree(*args),
        }
        self._load_dict_file()

    def _load_dict_file(self, filename='en_dict.txt'):
        """Load replacement forms into ``self._form_dict``.

        Each line has the shape 'CATEGORY SUBTYPE TEXT...'; the stored
        value is a (text, word_count) tuple.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                l_list = line.strip('\n').split()
                # Fix: skip blank or malformed lines instead of raising
                # IndexError on l_list[0]/l_list[1].
                if len(l_list) < 3:
                    continue
                cat, subtype = l_list[0], l_list[1]
                length = len(l_list[2:])
                text = ' '.join(l_list[2:])
                self._form_dict.setdefault(cat, {}) \
                    .setdefault(subtype, []).append((text, length))

    def _handle_annotated(self, id, text, tag, ann):
        """Handle one NER-annotated token according to the chosen method.

        Returns '' for 'delete', the mapped tag for 'tag'; for 'pseudo'
        the token is queued in ``_pseudo_ann_list`` and returned as-is.
        """
        if self._method == 'delete':
            return ''
        elif self._method == 'tag':
            if ann in self.spacy_tag_map:
                return self.spacy_tag_map[ann]
        elif self._method == 'pseudo':
            if ann in self.spacy_tag_map:
                self._pseudo_ann_list.append((id, text, tag, ann))
        return text

    def _process_sentence(self, string_builder):
        """Splice in pseudonyms, join the tokens, and anonymize the result."""
        string_builder = self._handle_pseudo_ann(string_builder)
        sentence = ''.join(string_builder)
        return self._anonymize(sentence)

    def _get_pseudo_ann(self, ann, text, length):
        """Return replacement text for an annotated phrase.

        Args:
            ann (str): spaCy entity label ('PERSON', 'GPE', 'FAC', ...).
            text (list[str]): Tokens of the original phrase.
            length (int): Number of words in the phrase.
        """
        new_text = []
        if ann == 'PERSON':
            gen = random.choice(['FIRST_M', 'FIRST_F'])
            # Reserve one word for the last name in multiword phrases.
            name_length = length - 1 if length > 1 else 1
            while name_length > 0:
                names = [p for p in self._form_dict['PERSON'][gen]
                         if p[1] <= name_length]
                random_name = random.choice(names)
                name_length -= random_name[1]
                new_text.append(random_name[0])
            if length > 1:
                last_name = random.choice(self._form_dict['PERSON']['LAST'])
                new_text.append(last_name[0])
        elif ann == 'GPE':
            found = False
            phrase = ' '.join(text)
            for _, values in self._form_dict['GPE'].items():
                # Fix: values holds (text, length) tuples, so the original
                # `phrase in values` membership test could never match;
                # compare against the stored text instead.
                if any(phrase == value[0] for value in values):
                    new_text = [random.choice(values)[0]]
                    found = True
            if not found:
                new_text = [random.choice(self._form_dict['GPE']['CITY'])[0]]
        else:
            # Fix: wrap in a list — joining a bare string below would
            # insert a space between every character.
            new_text = [' '.join(text)]
        return ' '.join(new_text)

    def _handle_pseudo_ann(self, string_builder):
        """Replace queued annotated tokens with pseudonyms in the builder.

        Merges adjacent tokens of the same annotation (joined by ' ' or
        '-') into one phrase, generates a replacement, splices it into
        ``string_builder``, and tracks the index shift caused by length
        differences between phrase and replacement.
        """
        if not self._pseudo_ann_list:
            return string_builder
        shifted_id = 0
        pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list))
        for i, (id_, text, _, ann) in pseudo_ann_iter:
            if ann in self.skip_ann:
                continue
            j = i + 1
            id_ += shifted_id
            start_id = id_
            ann_len = 1
            phrase = [text]
            skip_tokens = 1
            while j < len(self._pseudo_ann_list):
                next_id, next_text, _, next_ann = self._pseudo_ann_list[j]
                next_id += shifted_id
                if ann != next_ann:
                    break
                if next_id == id_ + 1 and string_builder[next_id] == '-':
                    # Hyphen separator token: consume it, no new word.
                    skip_tokens += 1
                elif next_id == id_ + 1 and string_builder[id_] == '-':
                    ann_len += 1
                    skip_tokens += 1
                    phrase.append(next_text)
                elif next_id == id_ + 2 and string_builder[id_ + 1] == ' ':
                    ann_len += 1
                    skip_tokens += 2
                    phrase.append(next_text)
                else:
                    break
                id_ = next_id
                j += 1
            new_text = self._get_pseudo_ann(
                ann=ann,
                text=phrase,
                length=ann_len
            )
            new_text = regex.split('( )', new_text)
            string_builder = string_builder[:start_id] + new_text + \
                string_builder[start_id + skip_tokens:]
            if ann_len > 1:
                consume(pseudo_ann_iter, ann_len - 1)
            if math.ceil(len(new_text) / 2) != ann_len:
                shifted_id += len(new_text) - (ann_len * 2) + 1
        self._pseudo_ann_list.clear()
        return string_builder

    def _anonymize(self, sentence):
        """Apply the configured regex-category anonymization to a sentence."""
        if self._method == 'delete':
            # Fix: category values are 4-tuples (pattern, token, generator,
            # kwargs); the original 3-name unpacking raised ValueError here.
            for pattern, *_ in self._category_anonymisation.values():
                sentence = regex.sub(pattern, '', sentence)
        elif self._method == 'tag':
            sentence = self._tagging(sentence)
        elif self._method == 'pseudo':
            sentence = self._pseudonymization(sentence)
        return sentence

    def process(self, input_filename, output_filename):
        """Anonymize the file in CCL format to the resulting file in plain text.

        Args:
            input_filename (str): Input filename in CCL format. \
                Text tagged and processed with spacy NER.
            output_filename (str): Output filename.
        """
        ccl_handler = CCLHandler(input_filename)
        ccl_handler.process(output_filename, self.unmarshallers)
"""Implementation of anonymizer functionality for Russian language."""
import math
import random
import regex
from src.utils import consume
from src.ccl_handler import CCLHandler
from src.base_anonymizer import BaseAnonymizer
from src.generators import (generate_pseudo_email, generate_pseudo_phone_number,
generate_pseudo_user, generate_pseudo_website,
generate_pseudo_date)
class RussianAnonymizer(BaseAnonymizer):
    """Class with an anonymization implementation for the Russian language."""

    # Dates written as digits (day/month/year in several orders, 2- or
    # 4-digit years) or with a Russian month name in several grammatical
    # cases, e.g. "12 января 2020". The leading negative lookahead and the
    # trailing lookbehind prevent a bare month abbreviation from matching
    # on its own.
    date_regex = regex.compile(
        r'\b(?P<day_or_month_year>'
        r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
        r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
        r'(?P<year1>\d{4}|\d{2}))\b|'
        r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
        r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
        r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
        r'(?P<month_in_words>'
        r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)'
        r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
        r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|'
        r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|'
        r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|'
        r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b'
        r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
        r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
        r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?'
        r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', regex.I
    )

    # spaCy entity label -> replacement tag used by the 'tag' method.
    # NOTE(review): the Russian model uses PER/LOC labels, unlike the
    # English PERSON/GPE/FAC labels.
    spacy_tag_map = {
        'PER': '[PERSON]',
        'LOC': '[LOCATION]',
    }
def __init__(self, task_options):
"""Initialize anonymizer with base regexes."""
super().__init__(task_options)
self.lang = task_options.get('language', 'ru')
# Order is important, first more specific
self._category_anonymisation = {
'user': (self.user_regex, self._user_token,
generate_pseudo_user, {}),
'email': (self.email_regex, self._mail_token,
generate_pseudo_email, {}),
'website': (self.website_regex, self._website_token,
generate_pseudo_website, {}),
'date': (self.date_regex, self._date_token,
generate_pseudo_date, {'lang': self.lang}),
'phone_number': (self.phone_number_regex, self._digits_token,
generate_pseudo_phone_number, {}),
}
self.unmarshallers = {
'chunk': lambda *args: '\n',
'sentence': lambda *args: self._process_sent_tree(*args),
}
self._load_dict_file()
def _load_dict_file(self, filename='ru_dict.txt'):
with open(filename, 'r', encoding='utf-8') as f:
for line in f.readlines():
l_list = line.strip('\n').split()
cat, subtype = l_list[0], l_list[1]
length = len(l_list[2:])
text = ' '.join(l_list[2:])
if cat not in self._form_dict:
self._form_dict[cat] = {}
if subtype not in self._form_dict[cat]:
self._form_dict[cat][subtype] = []
self._form_dict[cat][subtype].append((text, length))
def _handle_annotated(self, id, text, tag, ann):
if self._method == 'delete':
return ''
elif self._method == 'tag':
if ann in self.spacy_tag_map:
return self.spacy_tag_map[ann]
elif self._method == 'pseudo':
if ann in self.spacy_tag_map:
self._pseudo_ann_list.append((id, text, tag, ann))
return text
def _process_sentence(self, string_builder):
string_builder = self._handle_pseudo_ann(string_builder)
sentence = ''.join(string_builder)
return self._anonymize(sentence)
def _get_pseudo_ann(self, ann, text, length):
new_text = []
if ann == 'PER':
gen = random.choice(['FIRST_M', 'FIRST_F'])
name_length = length - 1 if length > 1 else 1
while name_length > 0:
names = [p for p in self._form_dict['PERSON'][gen]
if p[1] <= name_length]
random_name = random.choice(names)
name_length -= random_name[1]
new_text.append(random_name[0])
if length > 1:
last_name = random.choice(self._form_dict['PERSON']['LAST'])
new_text.append(last_name[0])
elif ann == 'LOC':
found = False
for _, values in self._form_dict['LOCATION'].items():
if ' '.join(text) in values:
new_text = [random.choice(values)[0]]
found = True
if not found:
default_loc = self._form_dict['LOCATION']['CITY']
new_text = [random.choice(default_loc)[0]]
else:
new_text = ' '.join(text)
return ' '.join(new_text)
    def _handle_pseudo_ann(self, string_builder):
        """Replace queued annotated tokens with pseudonyms in the builder.

        Walks ``_pseudo_ann_list``, merges adjacent tokens of the same
        annotation (joined by ' ' or '-') into one phrase, generates a
        replacement, and splices it into ``string_builder`` while tracking
        the index shift caused by length differences between the original
        phrase and its replacement.
        """
        if not self._pseudo_ann_list:
            return string_builder
        # Running offset between original token ids and current positions
        # in string_builder, updated after each splice.
        shifted_id = 0
        pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list))
        for i, (id_, text, _, ann) in pseudo_ann_iter:
            j = i + 1
            start_id = id_ + shifted_id
            ann_len = 1          # number of words in the merged phrase
            phrase = [text]
            skip_tokens = 1      # builder entries consumed by the phrase
            while j < len(self._pseudo_ann_list):
                next_id, next_text, _, next_ann = self._pseudo_ann_list[j]
                next_id += shifted_id
                if ann != next_ann:
                    break
                if next_id == id_ + 1 and string_builder[next_id] == '-':
                    # Hyphen separator token: consume it, no new word.
                    skip_tokens += 1
                elif next_id == id_ + 1 and string_builder[id_] == '-':
                    ann_len += 1
                    skip_tokens += 1
                    phrase.append(next_text)
                elif next_id == id_ + 2 and string_builder[id_ + 1] == ' ':
                    # Space-separated continuation of the same annotation.
                    ann_len += 1
                    skip_tokens += 2
                    phrase.append(next_text)
                else:
                    break
                id_ = next_id
                j += 1
            new_text = self._get_pseudo_ann(
                ann=ann,
                text=phrase,
                length=ann_len
            )
            # Keep the single-space separators as their own list entries,
            # mirroring the builder's token/separator layout.
            new_text = regex.split('( )', new_text)
            string_builder = string_builder[:start_id] + new_text + \
                string_builder[start_id + skip_tokens:]
            if ann_len > 1:
                # Skip the list entries already merged into this phrase.
                consume(pseudo_ann_iter, ann_len - 1)
            if math.ceil(len(new_text) / 2) != ann_len:
                # Replacement word count differs from the original phrase:
                # shift all subsequent ids accordingly.
                shifted_id += len(new_text) - (ann_len * 2) + 1
        self._pseudo_ann_list.clear()
        return string_builder
def _anonymize(self, sentence):
if self._method == 'delete':
for pattern, _, _ in self._category_anonymisation.values():
sentence = regex.sub(pattern, '', sentence)
elif self._method == 'tag':
sentence = self._tagging(sentence)
elif self._method == 'pseudo':