From 8503609eb6fcfa95c13ab68a746e7c6a27334ecc Mon Sep 17 00:00:00 2001 From: Norbert Ropiak <norbert.ropiak@pwr.edu.pl> Date: Thu, 10 Jun 2021 11:40:19 +0000 Subject: [PATCH] Support for English and Russian languages --- .gitignore | 3 +- Dockerfile | 3 +- dictionaries/en_dict.txt | 129 +++++++++++ wiki.txt => dictionaries/pl_dict.txt | 0 dictionaries/ru_dict.txt | 84 +++++++ src/__init__.py | 0 src/anonymizer.py | 305 -------------------------- src/anonymizers/english_anonymizer.py | 192 ++++++++++++++++ src/anonymizers/polish_anonymizer.py | 200 +++++++++++++++++ src/anonymizers/russian_anonymizer.py | 191 ++++++++++++++++ src/base_anonymizer.py | 164 ++++++++++++++ src/generators.py | 20 +- src/utils.py | 14 ++ src/worker.py | 15 +- 14 files changed, 1001 insertions(+), 319 deletions(-) create mode 100644 dictionaries/en_dict.txt rename wiki.txt => dictionaries/pl_dict.txt (100%) create mode 100644 dictionaries/ru_dict.txt create mode 100644 src/__init__.py delete mode 100644 src/anonymizer.py create mode 100644 src/anonymizers/english_anonymizer.py create mode 100644 src/anonymizers/polish_anonymizer.py create mode 100644 src/anonymizers/russian_anonymizer.py create mode 100644 src/base_anonymizer.py create mode 100644 src/utils.py diff --git a/.gitignore b/.gitignore index baad420..f1d48cf 100644 --- a/.gitignore +++ b/.gitignore @@ -137,4 +137,5 @@ dmypy.json # Cython debug symbols cython_debug/ -.vscode \ No newline at end of file +.vscode +*.ipynb diff --git a/Dockerfile b/Dockerfile index 836cdd0..62a552b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,8 @@ WORKDIR /home/worker COPY ./src ./src COPY ./main.py . COPY ./requirements.txt . -COPY ./wiki.txt . +COPY ./dictionaries . + RUN python3.6 -m pip install -r requirements.txt diff --git a/dictionaries/en_dict.txt b/dictionaries/en_dict.txt new file mode 100644 index 0000000..744500b --- /dev/null +++ b/dictionaries/en_dict.txt @@ -0,0 +1,129 @@ +PERSON FIRST_M John +PERSON FIRST_M Liam +PERSON FIRST_M Noah +PERSON FIRST_M Oliver +PERSON FIRST_M Elijah +PERSON FIRST_M William +PERSON FIRST_M James +PERSON FIRST_M Benjamin +PERSON FIRST_M Lucas +PERSON FIRST_M Henry +PERSON FIRST_M Alexander +PERSON FIRST_M Logan +PERSON FIRST_F Emma +PERSON FIRST_F Olivia +PERSON FIRST_F Ava +PERSON FIRST_F Isabella +PERSON FIRST_F Sophia +PERSON FIRST_F Charlotte +PERSON FIRST_F Mia +PERSON FIRST_F Amelia +PERSON FIRST_F Harper +PERSON FIRST_F Evelyn +PERSON FIRST_F Abigail +PERSON LAST Smith +PERSON LAST Johnson +PERSON LAST Williams +PERSON LAST Brown +PERSON LAST Jones +PERSON LAST Garcia +PERSON LAST Miller +PERSON LAST Davis +PERSON LAST Rodriguez +PERSON LAST Martinez +PERSON LAST Hernandez +PERSON LAST Lopez +PERSON LAST Gonzales +PERSON LAST Wilson +PERSON LAST Anderson +GPE COUNTRY Poland +GPE COUNTRY France +GPE COUNTRY China +GPE COUNTRY India +GPE COUNTRY United States +GPE COUNTRY Indonesia +GPE COUNTRY Brazil +GPE COUNTRY Germany +GPE COUNTRY Egypt +GPE COUNTRY United Kingdom +GPE COUNTRY Thailand +GPE COUNTRY South Africa +GPE COUNTRY Spain +GPE COUNTRY Argentina +GPE COUNTRY Italy +GPE COUNTRY Canada +GPE CITY New York +GPE CITY Los Angeles +GPE CITY Tokyo +GPE CITY Delhi +GPE CITY Shanghai +GPE CITY Mexico City +GPE CITY Cairo +GPE CITY Mumbai +GPE CITY Beijing +GPE CITY Dhaka +GPE CITY Osaka +GPE CITY Karachi +GPE CITY Buenos Aires +GPE CITY Rio de Janeiro +GPE CITY Paris +GPE CITY Madrid +GPE CITY Toronto +GPE CITY Barcelona +GPE CITY Warsaw +GPE STATE Arizona +GPE STATE California +GPE STATE New Mexico +GPE STATE Alabama
+GPE STATE Florida +GPE STATE New York +GPE STATE Oklahoma +GPE STATE Texas +FAC STREET First Street +FAC STREET Second Street +FAC STREET Third Street +FAC STREET Fourth Street +FAC STREET Fifth Street +FAC STREET Park Street +FAC STREET Main Street +FAC STREET Oak Street +FAC STREET Pine Street +FAC STREET Maple Street +FAC STREET Cedar Street +FAC STREET Washington Street +FAC STREET Lincoln Street +FAC STREET Church Street +FAC AVENUE First Avenue +FAC AVENUE Second Avenue +FAC AVENUE Third Avenue +FAC AVENUE Fourth Avenue +FAC AVENUE Park Avenue +FAC AVENUE Fifth Avenue +FAC AVENUE Main Avenue +FAC AVENUE Oak Avenue +FAC AVENUE Pine Avenue +FAC AVENUE Maple Avenue +FAC AVENUE Cedar Avenue +FAC AVENUE Washington Avenue +FAC AVENUE Lincoln Avenue +FAC AVENUE Church Avenue +FAC HIGHWAY Route 66 +FAC HIGHWAY Highway 12 +FAC HIGHWAY Great River Road +FAC HIGHWAY Blue Ridge Parkway +FAC HIGHWAY Pacific Coast Highway +FAC HIGHWAY Overseas Highway +FAC HIGHWAY Going-to-the-Sun Road +FAC AIRPORT Guangzhou Baiyun International Airport +FAC AIRPORT Hartsfield–Jackson Atlanta International Airport +FAC AIRPORT Chengdu Shuangliu International Airport +FAC AIRPORT Dallas/Fort Worth International Airport +FAC AIRPORT Shenzhen Bao'an International Airport +FAC AIRPORT Tokyo Haneda Airport +FAC AIRPORT Indira Gandhi International Airport +FAC AIRPORT Los Angeles International Airport +FAC AIRPORT O'Hare International Airport +FAC AIRPORT Istanbul Airport +FAC AIRPORT Charles de Gaulle Airport +FAC AIRPORT Heathrow Airport +FAC AIRPORT Mexico City International Airport diff --git a/wiki.txt b/dictionaries/pl_dict.txt similarity index 100% rename from wiki.txt rename to dictionaries/pl_dict.txt diff --git a/dictionaries/ru_dict.txt b/dictionaries/ru_dict.txt new file mode 100644 index 0000000..c2006d8 --- /dev/null +++ b/dictionaries/ru_dict.txt @@ -0,0 +1,84 @@ +PERSON FIRST_M Артём +PERSON FIRST_M Александр +PERSON FIRST_M Роман +PERSON FIRST_M Евгений +PERSON FIRST_M Иван +PERSON FIRST_M Максим +PERSON FIRST_M Денис +PERSON FIRST_M Алексей +PERSON FIRST_M Дмитрий +PERSON FIRST_M Даниил +PERSON FIRST_M Сергей +PERSON FIRST_M Николай +PERSON FIRST_F София +PERSON FIRST_F Анастасия +PERSON FIRST_F Виктория +PERSON FIRST_F Ксения +PERSON FIRST_F Арина +PERSON FIRST_F Елизавета +PERSON FIRST_F Аделина +PERSON FIRST_F Ирина +PERSON FIRST_F Елена +PERSON FIRST_F Полина +PERSON FIRST_F Дарья +PERSON LAST Иванов +PERSON LAST Петров +PERSON LAST Сидоров +PERSON LAST Смирнов +PERSON LAST Волков +PERSON LAST Фёдоров +PERSON LAST Попов +PERSON LAST Семёнов +PERSON LAST Михайлов +PERSON LAST Егоров +PERSON LAST Ленков +PERSON LAST Васильев +PERSON LAST Николаев +PERSON LAST Морозов +PERSON LAST Степанов +LOCATION COUNTRY Польша +LOCATION COUNTRY Франция +LOCATION COUNTRY Китай +LOCATION COUNTRY Индия +LOCATION COUNTRY Соединённые Штаты Америки +LOCATION COUNTRY Индонезия +LOCATION COUNTRY Бразилия +LOCATION COUNTRY Германия +LOCATION COUNTRY Египет +LOCATION COUNTRY Великобритания +LOCATION COUNTRY Таиланд +LOCATION COUNTRY Южная Африка +LOCATION COUNTRY Испания +LOCATION COUNTRY Аргентина +LOCATION COUNTRY Италия +LOCATION COUNTRY Канада +LOCATION CITY New York +LOCATION CITY Los Angeles +LOCATION CITY Tokyo +LOCATION CITY Delhi +LOCATION CITY Shanghai +LOCATION CITY Mexico City +LOCATION CITY Cairo +LOCATION CITY Mumbai +LOCATION CITY Beijing +LOCATION CITY Dhaka +LOCATION CITY Osaka +LOCATION CITY Karachi +LOCATION CITY Buenos Aires
+LOCATION CITY Rio de Janeiro +LOCATION CITY Paris +LOCATION CITY Madrid +LOCATION CITY Toronto +LOCATION CITY Barcelona +LOCATION CITY Warsaw +LOCATION STREET Советская +LOCATION STREET Октябрьская +LOCATION STREET Ленина +LOCATION STREET Гагарина +LOCATION STREET Маркса +LOCATION STREET Английская +LOCATION STREET Кадырова +LOCATION STREET Пушкина +LOCATION STREET Победы +LOCATION STREET Первомайская +LOCATION STREET Мира \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/anonymizer.py b/src/anonymizer.py deleted file mode 100644 index 38ecf34..0000000 --- a/src/anonymizer.py +++ /dev/null @@ -1,305 +0,0 @@ -"""Implementation of anonymizer functionality.""" -import random -from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, - generate_pseudo_user, generate_pseudo_website, - generate_phone_number_tag, generate_pseudo_date) - -import regex - - -class Anonymizer: - """Class used to edit sentences based on options.""" - - email_regex = regex.compile( - r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' - r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)' - r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' - r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', regex.I - ) - user_regex = regex.compile(r'\B(?P<username>\@[\w\-]+)') - _website_exceptions = ['m.in'] - website_regex = regex.compile( - r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(_website_exceptions)) + - r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' - r'(?P<auth>\S+(?::\S*)?@)?' - r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' - r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' - r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' - r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' - r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' - r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' - r'|' - r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' - r'[a-z0-9\u00a1-\uffff]\.)+)' - r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' - r'(?P<port>:\d{2,5})?' - r'(?P<path>[/?#]\S*)?)', - regex.UNICODE | regex.I - ) - phone_number_regex = regex.compile( - r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' - r'(?P<number>(\d[- ]??){9,10})' - ) - date_regex = regex.compile( - r'\b(?P<day_or_month_year>' - r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' - r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' - r'(?P<year1>\d{4}|\d{2}))\b|' - - r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' - r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' - r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' - - r'(?P<month_in_words>' - r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
- r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' - r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' - r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' - r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|stopada)' - r'|Gru(?:|dzie[nń]|dnia))\b' - r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' - r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' - r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', regex.I - ) - - _file_to_liner_dispatch = { - 'nam_liv_person': 'person_first_nam', - 'nam_liv_person_last': 'person_last_nam', - 'nam_fac_road': 'road_nam', - 'nam_loc_gpe_city': 'city_nam', - 'nam_org_group_team': 'country_nam' - } - - _liner_to_tag_dispatch = { - 'person_first_nam': '[OSOBA]', - 'person_last_nam': '[OSOBA]', - 'road_nam': '[MIEJSCE]', - 'city_nam': '[MIEJSCE]', - 'country_nam': '[MIEJSCE]' - } - - def __init__(self, task_options): - """Initialize anonymizer with task_options.""" - self.unmarshallers = { - 'chunk': lambda *args: '\n', - 'sentence': lambda *args: self._process_sent_tree(*args), - } - self._method = task_options.get('method', 'delete') - self._mail_token = '[MAIL]' - self._user_token = '@[USER]' - self._website_token = '[WWW]' - self._digits_token = '[DIGITS]' - self._date_token = '[DATE]' - self._default_token = '[INNE]' - self._form_dict = dict() - self._pseudo_ann_list = list() - # Order is important, first more specific - self._category_anonymisation = { - 'user': (self.user_regex, self._user_token, - generate_pseudo_user), - 'email': (self.email_regex, self._mail_token, - generate_pseudo_email), - 'website': (self.website_regex, self._website_token, - generate_pseudo_website), - 'date': (self.date_regex, self._date_token, - generate_pseudo_date), - 'phone_number': (self.phone_number_regex, self._digits_token, - generate_pseudo_phone_number), - } - self._load_file() - - def _load_file(self, file_name='wiki.txt'): - with open(file_name, 'r', encoding='utf-8') as f: - for line in f.readlines(): - l_list = line.split() - cat = l_list[0] - if cat in self._file_to_liner_dispatch: - cat_name = self._file_to_liner_dispatch[cat] - length = int((len(l_list) - 2) / 2) - gen_name = ' '.join(l_list[(1 + length):(1 + 2 * length)]) - flx_name = ' '.join(l_list[1:(1 + length)]) - flex = l_list[-1] - if cat_name not in self._form_dict: - self._form_dict[cat_name] = dict() - if length not in self._form_dict[cat_name]: - self._form_dict[cat_name][length] = dict() - if gen_name not in self._form_dict[cat_name][length]: - self._form_dict[cat_name][length][gen_name] = dict() - self._form_dict[cat_name][length][gen_name][flex] = flx_name - for cat in self._form_dict: - for length in self._form_dict[cat]: - self._form_dict[cat][length] = list( - self._form_dict[cat][length].items() - ) - - def _process_sent_tree(self, sentence_subtree): - string_builder = [] - id = 0 - for elem in sentence_subtree: - if elem.tag == 'tok': - tok = self._process_single_tok(id, elem) - string_builder.append(tok) - string_builder.append(' ') - id += 2 - elif elem.tag == 'ns': - id -= 1 - string_builder.pop() - else: - raise Exception('Unrecognized tag inside sentence: ' + elem.tag) - return self._process_sentence(string_builder) - - def _process_sentence(self, string_builder): - string_builder = self._handle_pseudo_ann(string_builder) - sentence = ''.join(string_builder) - return self._anonymize(sentence) - - def _process_word(self, id, text, tag, ann): - for chan, value in ann: - if value != 0: - text = self._handle_annotated(id,
text, tag, chan) - break - return text - - def _handle_annotated(self, id, text, tag, ann): - if self._method == 'delete': - return '' - elif self._method == 'tag': - if ann in self._liner_to_tag_dispatch: - return self._liner_to_tag_dispatch[ann] - elif self._method == 'pseudo': - if ann in self._form_dict: - self._pseudo_ann_list.append((id, text, tag, ann)) - return text - - def _handle_pseudo_ann(self, string_builder): - if self._pseudo_ann_list: - it = iter(self._pseudo_ann_list) - id, text, tag, ann = next(it) - current_tag = tag - current_ann = ann - current_id = id - length = 1 - for id, text, tag, ann in it: - if current_ann == ann and (ann != 'person_first_nam' and - ann != 'person_last_nam'): - if id == current_id + 2: - length += 1 - current_tag = tag - current_id = id - continue - new_text = self._get_pseudo_ann( - ann=current_ann, - tag=current_tag, - length=length - ) - for t in new_text.split(' '): - string_builder[current_id - 2 * (length - 1)] = t - length -= 1 - length = 1 - current_tag = tag - current_ann = ann - current_id = id - new_text = self._get_pseudo_ann(current_ann, current_tag, length) - toks = new_text.split(' ') - for i in range(length): - if i < len(toks): - string_builder[current_id - 2 * (length - 1)] = toks[i] - else: - string_builder[current_id - 2 * (length - 1)] = '' - if string_builder[current_id - 2 * (length - 1) + 1] == ' ': - string_builder[current_id - 2 * (length - 1) + 1] = '' - length -= 1 - self._pseudo_ann_list.clear() - return string_builder - - def _get_pseudo_ann(self, ann, tag, length): - while length not in self._form_dict[ann] and length > 0: - length -= 1 - if length == 0: - return '' - new_tag = ':'.join(tag.split(':')[1:4]) - for i in range(0, 10): - random_entry = random.choice(self._form_dict[ann][length]) - if new_tag in random_entry[1]: - return random_entry[1][new_tag] - if new_tag == 'ign': - return random_entry[0] - random_entry = random.choice(self._form_dict[ann][length]) - return random_entry[0] - - def _process_single_tok(self, id, tok_subtree): - text = '' - tag = '' - ann = [] - for elem in tok_subtree: - if elem.tag == 'orth': - text = elem.text - elif elem.tag == 'lex': - tag = self._process_lex(elem) - elif elem.tag == 'ann': - ann.append(self._process_ann(elem)) - word = self._process_word(id, text, tag, ann) - return word - - def _process_lex(self, lex_subtree): - tag = '' - for elem in lex_subtree: - if elem.tag == 'ctag': - tag = elem.text - elif elem.tag != 'base': - raise Exception('Unrecognized tag inside lex: ' + elem.tag) - if tag == '': - raise Exception('Lex tag had no ctag inside!') - return tag - - def _process_ann(self, ann_subtree): - value = int(ann_subtree.text) - chan = ann_subtree.attrib["chan"] - return chan, value - - def _anonymize(self, sentence): - if self._method == 'delete': - for pattern, _, _ in self._category_anonymisation.values(): - sentence = regex.sub(pattern, '', sentence) - elif self._method == 'tag': - sentence = self._tagging(sentence) - elif self._method == 'pseudo': - sentence = self._pseudonymization(sentence) - return sentence - - def _tagging(self, sentence): - for category in self._category_anonymisation: - pattern, token, _ = self._category_anonymisation[category] - - if category == 'phone_number': - matches = [m for m in pattern.finditer(sentence)] - for match in matches: - tag = generate_phone_number_tag(match.groupdict(''), token) - replace_match = match.group(0) - sentence = regex.sub(regex.escape( - replace_match), tag, sentence) - else: - sentence = 
regex.sub(pattern, token, sentence) - return sentence - - def _pseudonymization(self, sentence): - sentence_after_regex = sentence - to_replace = [] - for category in self._category_anonymisation: - pattern, _, generator = self._category_anonymisation[category] - for match in pattern.finditer(sentence_after_regex): - if not match: - continue - to_replace.append((match, generator)) - sentence_after_regex = regex.sub( - regex.escape(match.group(0)), '', sentence_after_regex) - - for match, generator in to_replace: - replace_match = match.group(0) - pseudo_string = generator(match.groupdict('')) - sentence = regex.sub( - regex.escape(replace_match), - pseudo_string, - sentence - ) - return sentence diff --git a/src/anonymizers/english_anonymizer.py b/src/anonymizers/english_anonymizer.py new file mode 100644 index 0000000..6638942 --- /dev/null +++ b/src/anonymizers/english_anonymizer.py @@ -0,0 +1,192 @@ +"""Implementation of anonymizer functionality for English language.""" +import math +import random + +import regex + + +from src.utils import consume +from src.ccl_handler import CCLHandler +from src.base_anonymizer import BaseAnonymizer +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, generate_pseudo_website, + generate_pseudo_date) + + +class EnglishAnonymizer(BaseAnonymizer): + """Class with an anonymization implementation for the English language.""" + + skip_ann = ['CARDINAL', 'LAW', 'DATE', 'QUANTITY', 'TIME', 'EVENT'] + date_regex = regex.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|' + r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)' + r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b' + r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' + r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?' 
+ r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', regex.I + ) + + spacy_tag_map = { + 'PERSON': '[PERSON]', + 'GPE': '[LOCATION]', + 'FAC': '[LOCATION]', + } + + def __init__(self, task_options): + """Initialize anonymizer with base regexes.""" + super().__init__(task_options) + self.lang = task_options.get('language', 'en') + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user, {}), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email, {}), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website, {}), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date, {'lang': self.lang}), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number, {}), + } + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._load_dict_file() + + def _load_dict_file(self, filename='en_dict.txt'): + with open(filename, 'r', encoding='utf-8') as f: + for line in f.readlines(): + l_list = line.strip('\n').split() + cat, subtype = l_list[0], l_list[1] + length = len(l_list[2:]) + text = ' '.join(l_list[2:]) + if cat not in self._form_dict: + self._form_dict[cat] = {} + if subtype not in self._form_dict[cat]: + self._form_dict[cat][subtype] = [] + self._form_dict[cat][subtype].append((text, length)) + + def _handle_annotated(self, id, text, tag, ann): + if self._method == 'delete': + return '' + elif self._method == 'tag': + if ann in self.spacy_tag_map: + return self.spacy_tag_map[ann] + elif self._method == 'pseudo': + if ann in self.spacy_tag_map: + self._pseudo_ann_list.append((id, text, tag, ann)) + return text + + def _process_sentence(self, string_builder): + string_builder = self._handle_pseudo_ann(string_builder) + sentence = ''.join(string_builder) + return self._anonymize(sentence) + + def _get_pseudo_ann(self, ann, text, length): + new_text = [] + if ann == 'PERSON': + gen = random.choice(['FIRST_M', 'FIRST_F']) + name_length = length - 1 if length > 1 else 1 + while name_length > 0: + names = [p for p in self._form_dict['PERSON'][gen] + if p[1] <= name_length] + random_name = random.choice(names) + name_length -= random_name[1] + new_text.append(random_name[0]) + if length > 1: + last_name = random.choice(self._form_dict['PERSON']['LAST']) + new_text.append(last_name[0]) + elif ann == 'GPE': + found = False + for _, values in self._form_dict['GPE'].items(): + if ' '.join(text) in [v[0] for v in values]: + new_text = [random.choice(values)[0]] + found = True + if not found: + new_text = [random.choice(self._form_dict['GPE']['CITY'])[0]] + else: + new_text = text + return ' '.join(new_text) + + def _handle_pseudo_ann(self, string_builder): + if not self._pseudo_ann_list: + return string_builder + shifted_id = 0 + pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list)) + for i, (id_, text, _, ann) in pseudo_ann_iter: + if ann in self.skip_ann: + continue + j = i + 1 + id_ += shifted_id + start_id = id_ + ann_len = 1 + phrase = [text] + skip_tokens = 1 + while j < len(self._pseudo_ann_list): + next_id, next_text, _, next_ann = self._pseudo_ann_list[j] + next_id += shifted_id + if ann != next_ann: + break + if next_id == id_ + 1 and string_builder[next_id] == '-': + skip_tokens += 1 + elif next_id == id_ + 1 and string_builder[id_] == '-': + ann_len += 1 + skip_tokens += 1 + phrase.append(next_text) + elif next_id == id_ + 2 and
string_builder[id_ + 1] == ' ': + ann_len += 1 + skip_tokens += 2 + phrase.append(next_text) + else: + break + id_ = next_id + j += 1 + new_text = self._get_pseudo_ann( + ann=ann, + text=phrase, + length=ann_len + ) + new_text = regex.split('( )', new_text) + string_builder = string_builder[:start_id] + new_text + \ + string_builder[start_id + skip_tokens:] + if ann_len > 1: + consume(pseudo_ann_iter, ann_len - 1) + if math.ceil(len(new_text) / 2) != ann_len: + shifted_id += len(new_text) - (ann_len * 2) + 1 + self._pseudo_ann_list.clear() + return string_builder + + def _anonymize(self, sentence): + if self._method == 'delete': + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) + elif self._method == 'tag': + sentence = self._tagging(sentence) + elif self._method == 'pseudo': + sentence = self._pseudonymization(sentence) + return sentence + + def process(self, input_filename, output_filename): + """Anonymize the file in CCL format to the resulting file in plain text. + + Args: + input_filename (str): Input filename in CCL format. \ + Text tagged and processed with spacy NER. + output_filename (str): Output filename. + + """ + ccl_handler = CCLHandler(input_filename) + ccl_handler.process(output_filename, self.unmarshallers) diff --git a/src/anonymizers/polish_anonymizer.py b/src/anonymizers/polish_anonymizer.py new file mode 100644 index 0000000..2706ef2 --- /dev/null +++ b/src/anonymizers/polish_anonymizer.py @@ -0,0 +1,200 @@ +"""Implementation of anonymizer functionality for Polish language.""" +import math +import regex +import random + + +from src.utils import consume +from src.base_anonymizer import BaseAnonymizer +from src.ccl_handler import CCLHandler +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, generate_pseudo_website, + generate_pseudo_date) + + +class PolishAnonymizer(BaseAnonymizer): + """Class with an anonymization implementation for the Polish language.""" + + date_regex = regex.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' 
+ r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' + r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' + r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)' + r'|Gru(?:|dzie[nń]|dnia))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', regex.I + ) + + _file_to_liner_dispatch = { + 'nam_liv_person': 'person_first_nam', + 'nam_liv_person_last': 'person_last_nam', + 'nam_fac_road': 'road_nam', + 'nam_loc_gpe_city': 'city_nam', + 'nam_org_group_team': 'country_nam' + } + + _liner_to_tag_dispatch = { + 'person_first_nam': '[OSOBA]', + 'person_last_nam': '[OSOBA]', + 'road_nam': '[MIEJSCE]', + 'city_nam': '[MIEJSCE]', + 'country_nam': '[MIEJSCE]' + } + + def __init__(self, task_options): + """Initialize anonymizer with base regexes.""" + super().__init__(task_options) + self.lang = task_options.get('language', 'pl') + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user, {}), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email, {}), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website, {}), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date, {'lang': self.lang}), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number, {}), + } + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._form_dict = dict() + self._pseudo_ann_list = list() + self._load_file() + + def _load_file(self, filename='pl_dict.txt'): + with open(filename, 'r', encoding='utf-8') as f: + for line in f.readlines(): + l_list = line.split() + cat = l_list[0] + if cat in self._file_to_liner_dispatch: + cat_name = self._file_to_liner_dispatch[cat] + length = int((len(l_list) - 2) / 2) + gen_name = ' '.join(l_list[(1 + length):(1 + 2 * length)]) + flx_name = ' '.join(l_list[1:(1 + length)]) + flex = l_list[-1] + if cat_name not in self._form_dict: + self._form_dict[cat_name] = dict() + if length not in self._form_dict[cat_name]: + self._form_dict[cat_name][length] = dict() + if gen_name not in self._form_dict[cat_name][length]: + self._form_dict[cat_name][length][gen_name] = dict() + self._form_dict[cat_name][length][gen_name][flex] = flx_name + for cat in self._form_dict: + for length in self._form_dict[cat]: + self._form_dict[cat][length] = list( + self._form_dict[cat][length].items() + ) + + def _handle_annotated(self, id, text, tag, ann): + if self._method == 'delete': + return '' + elif self._method == 'tag': + if ann in self._liner_to_tag_dispatch: + return self._liner_to_tag_dispatch[ann] + elif self._method == 'pseudo': + if ann in self._form_dict: + self._pseudo_ann_list.append((id, text, tag, ann)) + return text + + def _process_sentence(self, string_builder): + string_builder = self._handle_pseudo_ann(string_builder) + sentence = ''.join(string_builder) + return self._anonymize(sentence) + + def _get_pseudo_ann(self, ann, tag, length): + while length not in self._form_dict[ann] and length > 0: + length -= 1 + if length == 0: + return '' + new_tag = ':'.join(tag.split(':')[1:4]) + for _ in range(0, 10): + random_entry = random.choice(self._form_dict[ann][length]) + if new_tag in random_entry[1]: + return random_entry[1][new_tag]
+ if new_tag == 'ign': + return random_entry[0] + random_entry = random.choice(self._form_dict[ann][length]) + return random_entry[0] + + def _handle_pseudo_ann(self, string_builder): + if not self._pseudo_ann_list: + return string_builder + shifted_id = 0 + pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list)) + for i, (id_, _, tag, ann) in pseudo_ann_iter: + j = i + 1 + start_id = id_ = id_ + shifted_id + ann_len = 1 + skip_tokens = 1 + while j < len(self._pseudo_ann_list): + next_id, _, _, next_ann = self._pseudo_ann_list[j] + next_id += shifted_id + if ann != next_ann or (ann == 'person_first_nam' or + ann == 'person_last_nam'): + break + if next_id == id_ + 1 and string_builder[next_id] == '-': + skip_tokens += 1 + elif next_id == id_ + 1 and string_builder[id_] == '-': + ann_len += 1 + skip_tokens += 1 + elif next_id == id_ + 2 and string_builder[id_ + 1] == ' ': + ann_len += 1 + skip_tokens += 2 + else: + break + id_ = next_id + j += 1 + new_text = self._get_pseudo_ann( + ann=ann, + tag=tag, + length=ann_len + ) + new_text = regex.split('( )', new_text) + string_builder = string_builder[:start_id] + new_text + \ + string_builder[start_id + skip_tokens:] + if ann_len > 1: + consume(pseudo_ann_iter, ann_len - 1) + if math.ceil(len(new_text) / 2) != ann_len: + shifted_id += len(new_text) - (ann_len * 2) + 1 + self._pseudo_ann_list.clear() + return string_builder + + def _anonymize(self, sentence): + if self._method == 'delete': + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) + elif self._method == 'tag': + sentence = self._tagging(sentence) + elif self._method == 'pseudo': + sentence = self._pseudonymization(sentence) + return sentence + + def process(self, input_filename, output_filename): + """Anonymize the file in CCL format to the resulting file in plain text. + + Args: + input_filename (str): Input filename in CCL format. \ + Text tagged and processed with LINER. + output_filename (str): Output filename. + + """ + ccl_handler = CCLHandler(input_filename) + ccl_handler.process(output_filename, self.unmarshallers) diff --git a/src/anonymizers/russian_anonymizer.py b/src/anonymizers/russian_anonymizer.py new file mode 100644 index 0000000..2b9bef6 --- /dev/null +++ b/src/anonymizers/russian_anonymizer.py @@ -0,0 +1,191 @@ +"""Implementation of anonymizer functionality for Russian language.""" +import math +import random + +import regex + + +from src.utils import consume +from src.ccl_handler import CCLHandler +from src.base_anonymizer import BaseAnonymizer +from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, + generate_pseudo_user, generate_pseudo_website, + generate_pseudo_date) + + +class RussianAnonymizer(BaseAnonymizer): + """Class with an anonymization implementation for the Russian language.""" + + date_regex = regex.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+ r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|' + r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|' + r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|' + r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?' + r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', regex.I + ) + + spacy_tag_map = { + 'PER': '[PERSON]', + 'LOC': '[LOCATION]', + } + + def __init__(self, task_options): + """Initialize anonymizer with base regexes.""" + super().__init__(task_options) + self.lang = task_options.get('language', 'ru') + # Order is important, first more specific + self._category_anonymisation = { + 'user': (self.user_regex, self._user_token, + generate_pseudo_user, {}), + 'email': (self.email_regex, self._mail_token, + generate_pseudo_email, {}), + 'website': (self.website_regex, self._website_token, + generate_pseudo_website, {}), + 'date': (self.date_regex, self._date_token, + generate_pseudo_date, {'lang': self.lang}), + 'phone_number': (self.phone_number_regex, self._digits_token, + generate_pseudo_phone_number, {}), + } + self.unmarshallers = { + 'chunk': lambda *args: '\n', + 'sentence': lambda *args: self._process_sent_tree(*args), + } + self._load_dict_file() + + def _load_dict_file(self, filename='ru_dict.txt'): + with open(filename, 'r', encoding='utf-8') as f: + for line in f.readlines(): + l_list = line.strip('\n').split() + cat, subtype = l_list[0], l_list[1] + length = len(l_list[2:]) + text = ' '.join(l_list[2:]) + if cat not in self._form_dict: + self._form_dict[cat] = {} + if subtype not in self._form_dict[cat]: + self._form_dict[cat][subtype] = [] + self._form_dict[cat][subtype].append((text, length)) + + def _handle_annotated(self, id, text, tag, ann): + if self._method == 'delete': + return '' + elif self._method == 'tag': + if ann in self.spacy_tag_map: + return self.spacy_tag_map[ann] + elif self._method == 'pseudo': + if ann in self.spacy_tag_map: + self._pseudo_ann_list.append((id, text, tag, ann)) + return text + + def _process_sentence(self, string_builder): + string_builder = self._handle_pseudo_ann(string_builder) + sentence = ''.join(string_builder) + return self._anonymize(sentence) + + def _get_pseudo_ann(self, ann, text, length): + new_text = [] + if ann == 'PER': + gen = random.choice(['FIRST_M', 'FIRST_F']) + name_length = length - 1 if length > 1 else 1 + while name_length > 0: + names = [p for p in self._form_dict['PERSON'][gen] + if p[1] <= name_length] + random_name = random.choice(names) + name_length -= random_name[1] + new_text.append(random_name[0]) + if length > 1: + last_name = random.choice(self._form_dict['PERSON']['LAST']) + new_text.append(last_name[0]) + elif ann == 'LOC': + found = False + for _, values in self._form_dict['LOCATION'].items(): + if ' '.join(text) in [v[0] for v in values]: + new_text = [random.choice(values)[0]] + found = True + if not found: + default_loc = self._form_dict['LOCATION']['CITY'] + new_text = [random.choice(default_loc)[0]] + else: + new_text = text + return ' '.join(new_text) + + def _handle_pseudo_ann(self, string_builder): + if not self._pseudo_ann_list: + return string_builder + shifted_id = 0 + pseudo_ann_iter = enumerate(iter(self._pseudo_ann_list)) + for i, (id_, text, _, ann) in pseudo_ann_iter: + j = i + 1 + start_id = id_ = id_ + shifted_id + ann_len = 1 + phrase = [text] + skip_tokens = 1 + while j <
len(self._pseudo_ann_list): + next_id, next_text, _, next_ann = self._pseudo_ann_list[j] + next_id += shifted_id + if ann != next_ann: + break + if next_id == id_ + 1 and string_builder[next_id] == '-': + skip_tokens += 1 + elif next_id == id_ + 1 and string_builder[id_] == '-': + ann_len += 1 + skip_tokens += 1 + phrase.append(next_text) + elif next_id == id_ + 2 and string_builder[id_ + 1] == ' ': + ann_len += 1 + skip_tokens += 2 + phrase.append(next_text) + else: + break + id_ = next_id + j += 1 + new_text = self._get_pseudo_ann( + ann=ann, + text=phrase, + length=ann_len + ) + new_text = regex.split('( )', new_text) + string_builder = string_builder[:start_id] + new_text + \ + string_builder[start_id + skip_tokens:] + if ann_len > 1: + consume(pseudo_ann_iter, ann_len - 1) + if math.ceil(len(new_text) / 2) != ann_len: + shifted_id += len(new_text) - (ann_len * 2) + 1 + self._pseudo_ann_list.clear() + return string_builder + + def _anonymize(self, sentence): + if self._method == 'delete': + for pattern, _, _ in self._category_anonymisation.values(): + sentence = regex.sub(pattern, '', sentence) + elif self._method == 'tag': + sentence = self._tagging(sentence) + elif self._method == 'pseudo': + sentence = self._pseudonymization(sentence) + return sentence + + def process(self, input_filename, output_filename): + """Anonymize the file in CCL format to the resulting file in plain text. + + Args: + input_filename (str): Input filename in CCL format. \ + Text tagged and processed with spacy NER. + output_filename (str): Output filename. + + """ + ccl_handler = CCLHandler(input_filename) + ccl_handler.process(output_filename, self.unmarshallers) diff --git a/src/base_anonymizer.py b/src/base_anonymizer.py new file mode 100644 index 0000000..fd62de5 --- /dev/null +++ b/src/base_anonymizer.py @@ -0,0 +1,164 @@ +"""Abstract description of anonymizer including base regexes.""" +import regex +from abc import ABC, abstractmethod +from src.generators import generate_phone_number_tag + + +class BaseAnonymizer(ABC): + """Base abstract class for anonymization.""" + + email_regex = regex.compile( + r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' + r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)' + r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' + r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', regex.I + ) + user_regex = regex.compile(r'\B(?P<username>\@[\w\-]+)') + _website_exceptions = ['m.in'] + website_regex = regex.compile( + r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(_website_exceptions)) + + r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' + r'(?P<auth>\S+(?::\S*)?@)?' + r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' + r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' + r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' + r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' + r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' + r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' + r'|' + r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' + r'[a-z0-9\u00a1-\uffff]\.)+)' + r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' + r'(?P<port>:\d{2,5})?' + r'(?P<path>[/?#]\S*)?)', + regex.UNICODE | regex.I + ) + phone_number_regex = regex.compile( + r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' 
+ r'(?P<number>(\d[- ]??){9,10})' + ) + + def __init__(self, task_options): + """Initialize anonymizer with chosen method and default tokens.""" + self._mail_token = '[MAIL]' + self._user_token = '@[USER]' + self._website_token = '[WWW]' + self._digits_token = '[DIGITS]' + self._date_token = '[DATE]' + self._default_token = '[INNE]' + + self._method = task_options.get('method', 'delete') + + self._category_anonymisation = {} + self._form_dict = {} + self._pseudo_ann_list = [] + + def _process_lex(self, lex_subtree): + tag = '' + for elem in lex_subtree: + if elem.tag == 'ctag': + tag = elem.text + elif elem.tag != 'base': + raise Exception('Unrecognized tag inside lex: ' + elem.tag) + if tag == '': + raise Exception('Lex tag had no ctag inside!') + return tag + + def _tagging(self, sentence): + for category in self._category_anonymisation: + pattern, token, _, _ = self._category_anonymisation[category] + + if category == 'phone_number': + matches = [m for m in pattern.finditer(sentence)] + for match in matches: + tag = generate_phone_number_tag(match.groupdict(''), token) + replace_match = match.group(0) + sentence = regex.sub(regex.escape(replace_match), + tag, sentence) + else: + sentence = regex.sub(pattern, token, sentence) + return sentence + + def _pseudonymization(self, sentence): + sentence_after_regex = sentence + to_replace = [] + for category in self._category_anonymisation: + pattern, _, generator, args = self._category_anonymisation[category] + for match in pattern.finditer(sentence_after_regex): + if not match: + continue + to_replace.append((match, generator, args)) + sentence_after_regex = regex.sub(regex.escape(match.group(0)), + '', sentence_after_regex) + + for match, generator, args in to_replace: + replace_match = match.group(0) + pseudo_string = generator(match.groupdict(''), **args) + sentence = regex.sub( + regex.escape(replace_match), + pseudo_string, + sentence + ) + return sentence + + def _process_ann(self, ann_subtree): + value = int(ann_subtree.text) + chan = ann_subtree.attrib["chan"] + return chan, value + + def _process_single_tok(self, id, tok_subtree): + text = '' + tag = '' + ann = [] + for elem in tok_subtree: + if elem.tag == 'orth': + text = elem.text + elif elem.tag == 'lex': + tag = self._process_lex(elem) + elif elem.tag == 'ann': + ann.append(self._process_ann(elem)) + word = self._process_word(id, text, tag, ann) + return word + + def _process_word(self, id, text, tag, ann): + for chan, value in ann: + if value != 0: + text = self._handle_annotated(id, text, tag, chan) + break + return text + + def _process_sent_tree(self, sentence_subtree): + string_builder = [] + id = 0 + for elem in sentence_subtree: + if elem.tag == 'tok': + tok = self._process_single_tok(id, elem) + string_builder.append(tok) + string_builder.append(' ') + id += 2 + elif elem.tag == 'ns': + id -= 1 + string_builder.pop() + else: + raise Exception('Unrecognized tag inside sentence: ' + elem.tag) + return self._process_sentence(string_builder) + + @abstractmethod + def _handle_annotated(self, id, text, tag, ann): + pass + + @abstractmethod + def _process_sentence(self, string_builder): + pass + + @abstractmethod + def process(self, input_filename, output_filename): + """Anonymize the text in a file input_filename and save the anonymized \ + output text to a file output_filename. 
+ + Args: + input_filename (str): Input filename. + output_filename (str): Output filename. + + """ + pass diff --git a/src/generators.py b/src/generators.py index 2d4a147..606aabc 100644 --- a/src/generators.py +++ b/src/generators.py @@ -198,7 +198,8 @@ def random_date(day_no_digits: int, month_no_digits: int, year_match): return day, month, year -def month_number2text(month_number: int, abbr: bool, case: str = 'genitive'): +def month_number2text(month_number: int, abbr: bool, case: str = 'genitive', + lang='pl'): """Return the name of the month in words. Generate the month name from its number. @@ -209,9 +210,10 @@ def month_number2text(month_number: int, abbr: bool, case: str = 'genitive'): month_number (int): Number of the month. abbr (bool): Return abbreviation form. case (str): Return the name of the month in the given case. + lang (str): The language which is used to generate text. """ - locale = Locale('pl') + locale = Locale(lang) if case == 'genitive': months = locale.months['format'] elif case == 'nominative': @@ -227,7 +229,7 @@ return months[month_number] -def generate_pseudo_date(date_match): +def generate_pseudo_date(date_match, lang='pl'): """Pseudonymize matched date. Generate the pseudonymized based on matched data in text. @@ -237,6 +239,7 @@ Args: date_match: Matched date. + lang: The language which is used to generate the date. """ date = '' @@ -278,29 +281,32 @@ if date_match['year3']: year_match = date_match['year3'] - elif date_match['year4']: + elif lang != 'en' and date_match['year4']: year_match = date_match['year4'] else: year_match = '' day, month, year = random_date(day_len, 2, year_match) abbr = len(date_match['month']) == 3 - locale = Locale('pl') + locale = Locale(lang) if date_match['month'] in locale.months['format']['wide'].values(): case = 'genitive' else: case = 'nominative' - month = month_number2text(int(month), abbr, case) + month = month_number2text(int(month), abbr, case, lang=lang) if date_match['day1']: date_order = [day, date_match['punct5'], month, date_match['punct6']] + elif date_match['day2'] and lang == 'en': + date_order = [month, date_match['punct7'], + day, date_match['punct6']] elif date_match['day2']: date_order = [month, date_match['punct7'], day, date_match['punct8']] else: date_order = [month] - if date_match['year3'] or date_match['year4']: + if date_match['year3'] or (lang != 'en' and date_match['year4']): date_order += [year] date = ''.join(date_order) else: diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..81cc67f --- /dev/null +++ b/src/utils.py @@ -0,0 +1,14 @@ +"""Module for useful functions.""" + +import itertools + + +def consume(iterative, n): + """Consume n elements from an iterable object. + + Args: + iterative (iter): Python iterable object. + n (int): Number of elements to consume.
+ + """ + next(itertools.islice(iterative, n - 1, n), None) diff --git a/src/worker.py b/src/worker.py index de27b90..6cbc166 100644 --- a/src/worker.py +++ b/src/worker.py @@ -4,8 +4,9 @@ import logging import nlp_ws -from src.anonymizer import Anonymizer -from src.ccl_handler import CCLHandler +from src.anonymizers.polish_anonymizer import PolishAnonymizer +from src.anonymizers.english_anonymizer import EnglishAnonymizer +from src.anonymizers.russian_anonymizer import RussianAnonymizer _log = logging.getLogger(__name__) @@ -23,6 +24,10 @@ class Worker(nlp_ws.NLPWorker): 'tag' replaces selected tokens with arbitrary tags, 'pseudo' replaces selected tokens with a random token that """ - anon = Anonymizer(task_options) - ccl_handler = CCLHandler(input_file) - ccl_handler.process(output_file, anon.unmarshallers) + lang = task_options.get('language', 'pl') + anonymizers = {'pl': PolishAnonymizer, + 'en': EnglishAnonymizer, + 'ru': RussianAnonymizer + } + anon = anonymizers.get(lang, PolishAnonymizer)(task_options) + anon.process(input_file, output_file) -- GitLab