From 8bc5de430e139b8c453bf0450c60bc632f9a678e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Mon, 19 Dec 2022 12:18:27 +0100 Subject: [PATCH] [WIP] more refactoring & unit testing --- .../marek_kowalski_pojechal_do_wroclawia.ccl | 30 ++++---- requirements.txt | 3 +- src/anonymizers/english_anonymizer.py | 2 +- src/anonymizers/polish_anonymizer.py | 2 +- src/anonymizers/russian_anonymizer.py | 2 +- src/ccl_parser.py | 70 +++++++++++++++++++ src/detectors/date/date.py | 2 +- src/detectors/date/en.py | 5 +- src/detectors/date/pl.py | 5 +- src/detectors/date/ru.py | 5 +- src/detectors/email/email.py | 5 +- src/detectors/ner/__init__.py | 1 + src/detectors/ner/ner.py | 8 +++ src/detectors/ner/pl_liner_n5.py | 33 +++++++++ src/detectors/phone/phone.py | 5 +- src/detectors/url/url.py | 5 +- src/detectors/user/user.py | 5 +- src/dictionaries/__init__.py | 0 src/dictionaries/pl_ner_replacements.py | 46 ++++++++++++ src/entity_types.py | 14 ++++ src/string_replacements.py | 27 +++++++ src/suppressors/__init__.py | 1 + src/suppressors/order_based.py | 27 +++++++ src/tag_anonimization.py | 40 +++++++++++ src/utils.py | 14 ---- src/utils/__init__.py | 1 + src/utils/ner_pl_n5_mapping.py | 9 +++ src/utils/utils.py | 33 +++++++++ tests/detectors/date/test_en.py | 13 ++-- tests/detectors/date/test_pl.py | 6 +- tests/detectors/date/test_ru.py | 6 +- tests/detectors/email/test_email.py | 3 +- tests/detectors/ner/__init__.py | 0 tests/detectors/ner/test_pl_liner_n5.py | 21 ++++++ tests/detectors/phone/test_phone.py | 3 +- tests/detectors/url/test_url.py | 7 +- tests/detectors/user/test_user.py | 3 +- tests/dictionaries/__init__.py | 0 .../dictionaries/test_pl_ner_replacements.py | 38 ++++++++++ tests/suppressors/test_order_based.py | 16 +++++ tests/test_ccl_parser.py | 60 ++++++++++++++++ tests/test_string_replacements.py | 20 ++++++ tests/test_tag_anonimization.py | 17 +++++ 43 files changed, 552 insertions(+), 61 deletions(-) create mode 100644 src/ccl_parser.py create mode 100644 src/detectors/ner/__init__.py create mode 100644 src/detectors/ner/ner.py create mode 100644 src/detectors/ner/pl_liner_n5.py create mode 100644 src/dictionaries/__init__.py create mode 100644 src/dictionaries/pl_ner_replacements.py create mode 100644 src/entity_types.py create mode 100644 src/string_replacements.py create mode 100644 src/suppressors/__init__.py create mode 100644 src/suppressors/order_based.py create mode 100644 src/tag_anonimization.py delete mode 100644 src/utils.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/ner_pl_n5_mapping.py create mode 100644 src/utils/utils.py create mode 100644 tests/detectors/ner/__init__.py create mode 100644 tests/detectors/ner/test_pl_liner_n5.py create mode 100644 tests/dictionaries/__init__.py create mode 100644 tests/dictionaries/test_pl_ner_replacements.py create mode 100644 tests/suppressors/test_order_based.py create mode 100644 tests/test_ccl_parser.py create mode 100644 tests/test_string_replacements.py create mode 100644 tests/test_tag_anonimization.py diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl index b19c400..f1459ba 100644 --- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl +++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl @@ -7,39 +7,45 @@ <orth>Marek</orth> <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> -
<ann chan="nam_liv" head="1">1</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam" head="1">1</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>Kowalski</orth> <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex> - <ann chan="nam_liv">1</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam" head="1">1</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>pojechał</orth> <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>do</orth> <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>Wrocławia</orth> <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc" head="1">1</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam" head="1">1</ann> </tok> <ns/> <tok> <orth>.</orth> <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> </sentence> </chunk> diff --git a/requirements.txt b/requirements.txt index f7260eb..3923df9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --index-url https://pypi.clarin-pl.eu/simple/ nlp-ws regex==2020.10.28 -Babel==2.8.0 \ No newline at end of file +Babel==2.8.0 +bitarray==2.6.1 \ No newline at end of file diff --git a/src/anonymizers/english_anonymizer.py b/src/anonymizers/english_anonymizer.py index 61f29b1..0961c24 100644 --- a/src/anonymizers/english_anonymizer.py +++ b/src/anonymizers/english_anonymizer.py @@ -5,7 +5,7 @@ import random import regex -from src.utils import consume +from src.utils.utils import consume from src.ccl_handler import CCLHandler from src.base_anonymizer import BaseAnonymizer from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, diff --git a/src/anonymizers/polish_anonymizer.py b/src/anonymizers/polish_anonymizer.py index 60f9c50..f725254 100644 --- a/src/anonymizers/polish_anonymizer.py +++ b/src/anonymizers/polish_anonymizer.py @@ -4,7 +4,7 @@ import regex import random -from src.utils import consume +from src.utils.utils import consume from src.base_anonymizer import BaseAnonymizer from src.ccl_handler import CCLHandler from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, diff --git a/src/anonymizers/russian_anonymizer.py b/src/anonymizers/russian_anonymizer.py index 32c0a91..d9e6c07 100644 --- a/src/anonymizers/russian_anonymizer.py +++ b/src/anonymizers/russian_anonymizer.py @@ -5,7 +5,7 @@ import random import regex -from src.utils import consume +from src.utils.utils import consume from src.ccl_handler import CCLHandler from src.base_anonymizer import BaseAnonymizer from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, diff --git a/src/ccl_parser.py b/src/ccl_parser.py new file mode 100644 index 0000000..41e6971 --- /dev/null +++ b/src/ccl_parser.py @@ -0,0 +1,70 @@ +from typing import Dict, Any, List, Tuple +from
lxml import etree +from collections import defaultdict + +def parse_ccl(ccl: str) -> Tuple[str, Dict[str, List[Tuple[int, int, str]]]]: + """ + Parses the CCL XML format and returns the original text and annotations. + + Annotations are returned as a dictionary with keys being annotation channels + and values being lists of tuples (start, end, word) where: + * start is the index of the first character of the word + * end is the index one past the last character of the word (exclusive) + * word is a word or a group of words (in case of multiword tokens) + + :param ccl: CCL XML + :return: (text, annotations) + """ + ccl_tree = etree.fromstring(ccl.strip().encode('utf-8')) + + results = defaultdict(list) + text = "" + + # First token is assumed to not have space before it + last_was_ns = True + + tokens = ccl_tree.xpath("//ns | //tok") + for token in tokens: + if token.tag == 'tok': + if not last_was_ns: + text += " " + + word = token.xpath('./orth')[0].text + start = len(text) + end = start + len(word) + + for lex in token.xpath('./lex'): + if lex.attrib['disamb'] == "1": + ctag = lex.xpath('./ctag')[0] + results["ctag"].append((start, end, ctag.text)) + + break + + for ann in token.xpath('./ann'): + is_present = int(ann.text) == 1 + if not is_present: + continue + + channel = ann.attrib['chan'] + is_head = "head" in ann.attrib and ann.attrib['head'] == "1" + + if is_head: + results[channel].append((start, end, word)) + else: + if last_was_ns: + new_word = results[channel][-1][2] + word + else: + new_word = results[channel][-1][2] + " " + word + + old_start = results[channel][-1][0] + + results[channel][-1] = (old_start, end, new_word) + + last_was_ns = False + text += word + elif token.tag == 'ns': + last_was_ns = True + + return text, results + + \ No newline at end of file diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py index 85e34b4..2f1f132 100644 --- a/src/detectors/date/date.py +++ b/src/detectors/date/date.py @@ -10,7 +10,7 @@ def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py index 594e663..a716bc1 100644 --- a/src/detectors/date/en.py +++ b/src/detectors/date/en.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes EN_DATES_REGEX = re.compile( r'\b(?P<day_or_month_year>' @@ -26,11 +27,11 @@ def detect_dates_en(text: str) -> List[Tuple[int, int, str]]: Detects English dates in the text.
:param text: the text to be searched :type text: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = EN_DATES_REGEX.finditer(text) dates = [] for match in matches: - dates.append((match.start(), match.end(), match.group())) + dates.append((match.start(), match.end(), EntityTypes.DATE)) return dates \ No newline at end of file diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index 7001b9f..02abfdd 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes PL_DATES_REGEX = re.compile( r'\b(?P<day_or_month_year>' @@ -29,11 +30,11 @@ def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]: Detects Polish dates in the text. :param text: the text to be searched :type text: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = PL_DATES_REGEX.finditer(text) dates = [] for match in matches: - dates.append((match.start(), match.end(), match.group())) + dates.append((match.start(), match.end(), EntityTypes.DATE)) return dates \ No newline at end of file diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py index 91017c8..4100717 100644 --- a/src/detectors/date/ru.py +++ b/src/detectors/date/ru.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes RU_DATES_REGEX = re.compile( r'\b(?P<day_or_month_year>' @@ -29,11 +30,11 @@ def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]: Detects Russian dates in the text.
:param text: the text to be searched :type text: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = RU_DATES_REGEX.finditer(text) dates = [] for match in matches: - dates.append((match.start(), match.end(), match.group())) + dates.append((match.start(), match.end(), EntityTypes.DATE)) return dates \ No newline at end of file diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py index a0637ec..82e1756 100644 --- a/src/detectors/email/email.py +++ b/src/detectors/email/email.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes EMAIL_REGEX = re.compile( r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' @@ -16,11 +17,11 @@ def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_email) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = EMAIL_REGEX.finditer(text) emails = [] for match in matches: - emails.append((match.start(), match.end(), match.group())) + emails.append((match.start(), match.end(), EntityTypes.EMAIL)) return emails \ No newline at end of file diff --git a/src/detectors/ner/__init__.py b/src/detectors/ner/__init__.py new file mode 100644 index 0000000..9f8aefd --- /dev/null +++ b/src/detectors/ner/__init__.py @@ -0,0 +1 @@ +from src.detectors.ner.ner import detect_ner \ No newline at end of file diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py new file mode 100644 index 0000000..18c5622 --- /dev/null +++ b/src/detectors/ner/ner.py @@ -0,0 +1,8 @@ +from typing import List, Tuple +from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 + +def detect_ner(ccl_annotations, language) -> List[Tuple[int, int, str]]: + if language == 'pl': + return detect_ner_pl_liner_n5(ccl_annotations) + else: + raise NotImplementedError \ No newline at end of file diff --git a/src/detectors/ner/pl_liner_n5.py b/src/detectors/ner/pl_liner_n5.py new file mode 100644 index 0000000..c494d13 --- /dev/null +++ b/src/detectors/ner/pl_liner_n5.py @@ -0,0 +1,33 @@ +from typing import List, Tuple, Dict +from src.utils.utils import subdict +from src.entity_types import EntityTypes +from src.utils.ner_pl_n5_mapping import NER_PL_N5_MAPPING + +def detect_ner_pl_liner_n5( + ccl_annotations: Dict[str, List[Tuple[int, int, str]]] +) -> List[Tuple[int, int, str]]: + """ + Detects NER entities in the text based on the liner_n5 NER ontology.
+ + :param ccl_annotations: a dictionary of NER annotations + :type ccl_annotations: Dict[str, List[Tuple[int, int, str]]] + :return: a list of tuples containing (start, end, entity_type) + :rtype: List[Tuple[int, int, str]] + """ + names = subdict( + ccl_annotations, + [ + "nam_liv_person", + "nam_liv_person_last", + "nam_fac_road", + "nam_loc_gpe_city", + "nam_org_group_team", + ], + all_must_be_present=False, + ) + + return [ + (start, end, NER_PL_N5_MAPPING.get(entity_type, EntityTypes.OTHER)) + for entity_type, entity in names.items() + for start, end, _ in entity + ] diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py index 49abeb5..8ab3d65 100644 --- a/src/detectors/phone/phone.py +++ b/src/detectors/phone/phone.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes PHONE_NUMBER_REGEX = re.compile( r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' @@ -14,11 +15,11 @@ def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]] :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = PHONE_NUMBER_REGEX.finditer(text) phone_numbers = [] for match in matches: - phone_numbers.append((match.start(), match.end(), match.group())) + phone_numbers.append((match.start(), match.end(), EntityTypes.PHONE_NUMBER)) return phone_numbers \ No newline at end of file diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py index 2ca1fec..70b8ba8 100644 --- a/src/detectors/url/url.py +++ b/src/detectors/url/url.py @@ -2,6 +2,7 @@ import regex as re from typing import List, Tuple from .pl import URL_REGEX_PL from .common import generate_url_regex +from src.entity_types import EntityTypes def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: """ @@ -10,7 +11,7 @@ def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_url) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ if language == "pl": @@ -21,6 +22,6 @@ def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: matches = url_regex.finditer(text) urls = [] for match in matches: - urls.append((match.start(), match.end(), match.group())) + urls.append((match.start(), match.end(), EntityTypes.URL)) return urls \ No newline at end of file diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py index 4d8f035..d588a25 100644 --- a/src/detectors/user/user.py +++ b/src/detectors/user/user.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)') @@ -10,11 +11,11 @@ def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_user) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = USER_REGEX.finditer(text) users = [] for match in matches: - users.append((match.start(), match.end(), match.group())) + users.append((match.start(), match.end(), EntityTypes.USER)) return users \ No
newline at end of file diff --git a/src/dictionaries/__init__.py b/src/dictionaries/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/dictionaries/pl_ner_replacements.py b/src/dictionaries/pl_ner_replacements.py new file mode 100644 index 0000000..77e7e87 --- /dev/null +++ b/src/dictionaries/pl_ner_replacements.py @@ -0,0 +1,46 @@ +from typing import Dict, List, Optional +from collections import defaultdict +from src.entity_types import EntityTypes + +def load_pl_ner_replacements_dictionary(path: str, ner_mapping: Optional[Dict[str, str]] = None) -> Dict[str, Dict[str, Dict[str, str]]]: + """ + Loads a dictionary that maps named entity tags to lemmas to part-of-speech tags to words. + + The dictionary is a nested defaultdict, so if a key is not found, an empty defaultdict is returned. + + The dictionary is stored in a tab-separated file, where each line has the following format: + + <ner_tag> <word> <lemma> <pos_tag> + + Example: + + OSOBA Andrzejowi Andrzej subst:sg:dat:m1 + OSOBA Andrzej Andrzej subst:sg:m1:imperf + OSOBA Kasia Kasia subst:sg:f:imperf + MIEJSCE Wrocław Wrocław subst:sg:m2:imperf + MIEJSCE Warszawa Warszawa subst:sg:f:imperf + MIEJSCE Kraków Kraków subst:sg:m2:imperf + + Parameters + ---------- + path : str + Path to the dictionary file. + + Returns + ------- + Dict[str, Dict[str, Dict[str, str]]] + Nested defaultdict that maps named entity tags to lemmas to part-of-speech tags to words. + """ + + replacement_dictionary = defaultdict(lambda: defaultdict(dict)) + with open(path, "r", encoding="utf-8") as file: + for line in file: + line = line.strip() + ner_tag, word, lemma, pos_tag = line.split("\t") + + if ner_mapping is not None: + ner_tag = ner_mapping.get(ner_tag, EntityTypes.OTHER) + + replacement_dictionary[ner_tag][lemma][pos_tag] = word + + return replacement_dictionary \ No newline at end of file diff --git a/src/entity_types.py b/src/entity_types.py new file mode 100644 index 0000000..ed0496b --- /dev/null +++ b/src/entity_types.py @@ -0,0 +1,14 @@ +class EntityTypes: + NAME = "name" + SURNAME = "surname" + STREET_NAME = "street_name" + CITY = "city" + COUNTRY = "country" + PHONE_NUMBER = "phone_number" + URL = "url" + USER = "user" + EMAIL = "email" + DATE = "date" + TIN = "tin" # Tax Identification Number + KRS = "krs" # National Court Register + OTHER = "other" \ No newline at end of file diff --git a/src/string_replacements.py b/src/string_replacements.py new file mode 100644 index 0000000..33c426d --- /dev/null +++ b/src/string_replacements.py @@ -0,0 +1,27 @@ +from typing import List, Tuple + +def replace(original_string: str, replacements: List[Tuple[int, int, str]]): + """ + Replaces substrings in a string. + + Parameters + ---------- + original_string : str + The original string. + replacements : List[Tuple[int, int, str]] + A list of tuples containing (start, end, replacement). + + Returns + ------- + str + The string with replacements applied.
+ """ + + replacements = sorted(replacements, key=lambda x: x[0]) + + delta = 0 + for replacement in replacements: + original_string = original_string[:replacement[0] + delta] + replacement[2] + original_string[replacement[1] + delta:] + delta += len(replacement[2]) - (replacement[1] - replacement[0]) + + return original_string \ No newline at end of file diff --git a/src/suppressors/__init__.py b/src/suppressors/__init__.py new file mode 100644 index 0000000..e9cc16f --- /dev/null +++ b/src/suppressors/__init__.py @@ -0,0 +1 @@ +from src.suppressors.order_based import suppress_order_based \ No newline at end of file diff --git a/src/suppressors/order_based.py b/src/suppressors/order_based.py new file mode 100644 index 0000000..8488465 --- /dev/null +++ b/src/suppressors/order_based.py @@ -0,0 +1,27 @@ +from typing import List, Tuple, Dict +from bitarray import bitarray + +def suppress_order_based(annotations: List[Tuple[int, int, str]]) -> List[Tuple[int, int, str]]: + """If two annotations overlap, the first one int the list is kept. + + Args: + annotations (List[Tuple[int, int, str]]): List of annotations. + + Returns: + List[Tuple[int, int, str]]: List of annotations with overlapping + annotations removed. + + """ + annotations = annotations + bitarray_size = max([end for _, end, _ in annotations]) + bitarray_ = bitarray(bitarray_size) + bitarray_.setall(False) + + result = [] + + for start, end, entity_type in annotations: + if not bitarray_[start:end].any(): + bitarray_[start:end] = True + result.append((start, end, entity_type)) + + return result diff --git a/src/tag_anonimization.py b/src/tag_anonimization.py new file mode 100644 index 0000000..89e1a10 --- /dev/null +++ b/src/tag_anonimization.py @@ -0,0 +1,40 @@ +from typing import List, Tuple +from collections import defaultdict +from src.entity_types import EntityTypes +from src.string_replacements import replace + +def replace_with_tags(text: str, detections: List[Tuple[int, int, str]]) -> str: + """Replace entities with tags. + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + str: Text with entities replaced with tags. + + """ + + tags_map = { + EntityTypes.NAME: "[OSOBA]", + EntityTypes.SURNAME: "[OSOBA]", + EntityTypes.STREET_NAME: "[MIEJSCE]", + EntityTypes.CITY: "[MIEJSCE]", + EntityTypes.COUNTRY: "[MIEJSCE]", + EntityTypes.PHONE_NUMBER: "[DIGITS]", + EntityTypes.URL: "[WWW]", + EntityTypes.USER: "@[USER]", + EntityTypes.EMAIL: "[MAIL]", + EntityTypes.DATE: "[DATE]", + EntityTypes.TIN: "[DIGITS]", + EntityTypes.KRS: "[DIGITS]", + } + + result = [ + (start, end, tags_map.get(entity_type, "[OTHER]")) + for start, end, entity_type in detections + ] + + return replace(text, result) + + \ No newline at end of file diff --git a/src/utils.py b/src/utils.py deleted file mode 100644 index 81cc67f..0000000 --- a/src/utils.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Module for useful functions.""" - -import itertools - - -def consume(iterative, n): - """Consume n elements from iterative object. - - Args: - iterative (iter): Python iterative object. - n (int): Number of elements to consume. 
- - """ - next(itertools.islice(iterative, n - 1, n), None) diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..a8b0bd1 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1 @@ +from src.utils.utils import * \ No newline at end of file diff --git a/src/utils/ner_pl_n5_mapping.py b/src/utils/ner_pl_n5_mapping.py new file mode 100644 index 0000000..0b857b5 --- /dev/null +++ b/src/utils/ner_pl_n5_mapping.py @@ -0,0 +1,9 @@ +from src.entity_types import EntityTypes + +NER_PL_N5_MAPPING = { + "nam_liv_person": EntityTypes.NAME, + "nam_liv_person_last": EntityTypes.SURNAME, + "nam_fac_road": EntityTypes.STREET_NAME, + "nam_loc_gpe_city": EntityTypes.CITY, + "nam_org_group_team": EntityTypes.COUNTRY, +} \ No newline at end of file diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..c0035e6 --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,33 @@ +"""Module for useful functions.""" + +import itertools + + +def consume(iterative, n): + """Consume n elements from iterative object. + + Args: + iterative (iter): Python iterative object. + n (int): Number of elements to consume. + + """ + next(itertools.islice(iterative, n - 1, n), None) + + +def subdict(dictionary, keys, all_must_be_present=True): + """Return a subdictionary of dictionary containing only keys. + + Args: + dictionary (dict): Dictionary to take a subdictionary from. + keys (list): List of keys to take from dictionary. + all_must_be_present (bool): If True, all keys must be present in + dictionary. If False, only keys that are present are returned. + + Returns: + dict: Subdictionary of dictionary containing only keys. + + """ + if all_must_be_present: + return {key: dictionary[key] for key in keys} + else: + return {key: dictionary[key] for key in keys if key in dictionary} \ No newline at end of file diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py index 429ee2a..8104a83 100644 --- a/tests/detectors/date/test_en.py +++ b/tests/detectors/date/test_en.py @@ -1,16 +1,17 @@ from src.detectors.date.en import detect_dates_en +from src.entity_types import EntityTypes + def test_detect_dates_en(): # Check en-us text = "On 1.01.2022, I sold my cat. On April 5, 2021, I bought a dog." found_dates = detect_dates_en(text) - - assert found_dates == [(3,12,"1.01.2022"), (32,45, "April 5, 2021")] - + + assert found_dates == [(3, 12, EntityTypes.DATE), (32, 45, EntityTypes.DATE)] + # Check en-gb # TODO: Following test fails. Fix it. # text = "On 1.01.2022 I sold the cat. On 5th April 2021 I bought a dog." # found_dates = detect_dates_en(text) - - # assert found_dates == [(3,12,"1.01.2022"), (32,46, "5th April 2021")] - \ No newline at end of file + + # assert found_dates == [(3,12, EntityTypes.DATE), (32,46, EntityTypes.DATE)] diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py index a441c36..2942163 100644 --- a/tests/detectors/date/test_pl.py +++ b/tests/detectors/date/test_pl.py @@ -1,7 +1,9 @@ from src.detectors.date.pl import detect_dates_pl +from src.entity_types import EntityTypes + def test_detect_dates_pl(): text = "W dniu 1.01.2022 sprzedaÅ‚em kota. 5 kwietnia 2021 roku kupiÅ‚em psa." 
found_dates = detect_dates_pl(text) - - assert found_dates == [(7,16,"1.01.2022"), (34,49, "5 kwietnia 2021")] \ No newline at end of file + + assert found_dates == [(7, 16, EntityTypes.DATE), (34, 49, EntityTypes.DATE)] diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py index 44e9805..5b90d29 100644 --- a/tests/detectors/date/test_ru.py +++ b/tests/detectors/date/test_ru.py @@ -1,7 +1,9 @@ from src.detectors.date.ru import detect_dates_ru +from src.entity_types import EntityTypes + def test_detect_dates_pl(): text = "1.01.2022 я продал кошку. 5 апреля 2021 я купил собаку." found_dates = detect_dates_ru(text) - - assert found_dates == [(0,9,"1.01.2022"), (26,39, "5 апреля 2021")] \ No newline at end of file + + assert found_dates == [(0, 9, EntityTypes.DATE), (26, 39, EntityTypes.DATE)] diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py index 05b3e63..6be224f 100644 --- a/tests/detectors/email/test_email.py +++ b/tests/detectors/email/test_email.py @@ -1,7 +1,8 @@ from src.detectors.email import detect_emails +from src.entity_types import EntityTypes def test_detect_emails(): text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" found_emails = detect_emails(text, "en") - assert found_emails == [(12, 30, "arkadiusz@borek.pw"), (53, 78, "arkadiusz.dump@pwr.edu.pl")] \ No newline at end of file + assert found_emails == [(12, 30, EntityTypes.EMAIL), (53, 78, EntityTypes.EMAIL)] \ No newline at end of file diff --git a/tests/detectors/ner/__init__.py b/tests/detectors/ner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/ner/test_pl_liner_n5.py b/tests/detectors/ner/test_pl_liner_n5.py new file mode 100644 index 0000000..ab14e41 --- /dev/null +++ b/tests/detectors/ner/test_pl_liner_n5.py @@ -0,0 +1,21 @@ +from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 +from src.entity_types import EntityTypes + +def test_detect_names_pl_liner_n5(): + ccl_annotations = { + 'nam_liv_person': [(10, 16, 'Marian'), (100, 109, 'Magdalena')], + 'nam_liv_person_last': [(30, 35, 'Nowak')], + 'nam_loc_gpe_city': [(50, 59, 'Wrocławiu')], + 'some_other_annotation': [(120, 124, 'zowd')], + } + + result = detect_ner_pl_liner_n5(ccl_annotations) + + expected = [ + (10, 16, EntityTypes.NAME), + (100, 109, EntityTypes.NAME), + (30, 35, EntityTypes.SURNAME), + (50, 59, EntityTypes.CITY), + ] + + assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py index b2efe23..733f263 100644 --- a/tests/detectors/phone/test_phone.py +++ b/tests/detectors/phone/test_phone.py @@ -1,7 +1,8 @@ from src.detectors.phone.phone import detect_phone_numbers +from src.entity_types import EntityTypes def test_detect_phone_numbers(): text = "My phone number is +48 123 456 789. My friend's number is 123456789."
found_phone_numbers = detect_phone_numbers(text, "en") - assert found_phone_numbers == [(19, 34, '+48 123 456 789'), (58, 67, '123456789')] \ No newline at end of file + assert found_phone_numbers == [(19, 34, EntityTypes.PHONE_NUMBER), (58, 67, EntityTypes.PHONE_NUMBER)] \ No newline at end of file diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py index ad22f69..3d50e4d 100644 --- a/tests/detectors/url/test_url.py +++ b/tests/detectors/url/test_url.py @@ -1,10 +1,11 @@ from src.detectors.url import detect_urls +from src.entity_types import EntityTypes def test_detect_urls(): text = "This is a test for www.google.com. Make sure to go to https://www.google.com" found_urls = detect_urls(text, "en") - assert found_urls == [(19, 33, 'www.google.com'), (54, 76, 'https://www.google.com')] + assert found_urls == [(19, 33, EntityTypes.URL), (54, 76, EntityTypes.URL)] def test_detect_urls_pl(): text = "m.in. https://www.google.com" @@ -12,5 +13,5 @@ found_urls_pl = detect_urls(text, "pl") found_urls_en = detect_urls(text, "en") # m.in is a valid shortcut for między innymi in Polish. It should not be detected as a URL. - assert found_urls_pl == [(6, 28, 'https://www.google.com')] - assert found_urls_en == [(0, 4, "m.in"), (6, 28, 'https://www.google.com')] \ No newline at end of file + assert found_urls_pl == [(6, 28, EntityTypes.URL)] + assert found_urls_en == [(0, 4, EntityTypes.URL), (6, 28, EntityTypes.URL)] \ No newline at end of file diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py index b198f71..0ae3c9e 100644 --- a/tests/detectors/user/test_user.py +++ b/tests/detectors/user/test_user.py @@ -1,7 +1,8 @@ from src.detectors.user.user import detect_users +from src.entity_types import EntityTypes def test_detect_users(): text = "My username is @john_smith. My friend's username is @jane_doe."
found_users = detect_users(text, "en") - assert found_users == [(15, 26, '@john_smith'), (52, 61, '@jane_doe')] \ No newline at end of file + assert found_users == [(15, 26, EntityTypes.USER), (52, 61, EntityTypes.USER)] \ No newline at end of file diff --git a/tests/dictionaries/__init__.py b/tests/dictionaries/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/dictionaries/test_pl_ner_replacements.py b/tests/dictionaries/test_pl_ner_replacements.py new file mode 100644 index 0000000..a694d2e --- /dev/null +++ b/tests/dictionaries/test_pl_ner_replacements.py @@ -0,0 +1,38 @@ +from src.dictionaries.pl_ner_replacements import load_pl_ner_replacements_dictionary +from tempfile import NamedTemporaryFile + +def test_load_pl_ner_replacements_dictionary(): + with NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as file: + file.write("OSOBA\tAndrzejowi\tAndrzej\tsubst:sg:dat:m1\n") + file.write("OSOBA\tAndrzej\tAndrzej\tsubst:sg:m1:imperf\n") + file.write("OSOBA\tKasia\tKasia\tsubst:sg:f:imperf\n") + file.write("MIEJSCE\tWrocław\tWrocław\tsubst:sg:m2:imperf\n") + file.write("MIEJSCE\tWarszawa\tWarszawa\tsubst:sg:f:imperf\n") + file.write("MIEJSCE\tKraków\tKraków\tsubst:sg:m2:imperf\n") + + path = file.name + + dictionary = load_pl_ner_replacements_dictionary(path) + + assert dictionary == { + "OSOBA": { + "Andrzej": { + "subst:sg:dat:m1": "Andrzejowi", + "subst:sg:m1:imperf": "Andrzej" + }, + "Kasia": { + "subst:sg:f:imperf": "Kasia" + } + }, + "MIEJSCE": { + "Wrocław": { + "subst:sg:m2:imperf": "Wrocław" + }, + "Warszawa": { + "subst:sg:f:imperf": "Warszawa" + }, + "Kraków": { + "subst:sg:m2:imperf": "Kraków" + } + } + } \ No newline at end of file diff --git a/tests/suppressors/test_order_based.py b/tests/suppressors/test_order_based.py new file mode 100644 index 0000000..8cf35b9 --- /dev/null +++ b/tests/suppressors/test_order_based.py @@ -0,0 +1,16 @@ +from src.suppressors.order_based import suppress_order_based + +def test_suppress_order_based(): + annotations = [ + (10, 16, "Marian"), + (10, 18, "Marianna"), + (30, 35, "Nowak"), + (50, 59, "Wrocławiu"), + ] + result = suppress_order_based(annotations) + expected = [ + (10, 16, "Marian"), + (30, 35, "Nowak"), + (50, 59, "Wrocławiu"), + ] + assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/test_ccl_parser.py b/tests/test_ccl_parser.py new file mode 100644 index 0000000..e140edc --- /dev/null +++ b/tests/test_ccl_parser.py @@ -0,0 +1,60 @@ +from src.ccl_parser import parse_ccl + +example_ccl = """<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk type="p" id="ch1"> + <sentence id="s1"> + <tok> + <orth>Marek</orth> + <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="nam_liv" head="1">1</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>Kowalski</orth> + <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="nam_liv">1</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>pojechał</orth> + <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>do</orth> + <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>Wrocławia</orth> + <lex
disamb="1"><base>WrocÅ‚aw</base><ctag>subst:sg:gen:m3</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc" head="1">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +""" + +def test_parse_ccl(): + text, annotations = parse_ccl(example_ccl) + + assert text == "Marek Kowalski pojechaÅ‚ do WrocÅ‚awia." + + assert set(annotations.keys()) == set(["nam_liv", "nam_loc", "ctag"]) + + assert annotations["nam_liv"] == [(0, 14, "Marek Kowalski")] + assert annotations["nam_loc"] == [(27, 36, "WrocÅ‚awia")] + assert annotations["ctag"] == [(0, 5, "subst:sg:nom:m1"), (6, 14, "subst:sg:nom:m1"), (15, 23, "praet:sg:m1:perf"), (24, 26, "prep:gen"), (27, 36, "subst:sg:gen:m3"), (36, 37, "interp")] \ No newline at end of file diff --git a/tests/test_string_replacements.py b/tests/test_string_replacements.py new file mode 100644 index 0000000..f44644d --- /dev/null +++ b/tests/test_string_replacements.py @@ -0,0 +1,20 @@ +from src.string_replacements import replace + +def test_replace(): + text = "Ala ma kota" + replacements = [(0, 3, "Andrzej"), (7, 11, "psa")] + + expected = "Andrzej ma psa" + + result = replace(text, replacements) + + assert result == expected + +def test_replace_out_of_order(): + text = "Ala ma kota" + replacements = [(7, 11, "psa"), (0, 3, "Andrzej")] + + expected = "Andrzej ma psa" + result = replace(text, replacements) + + assert result == expected \ No newline at end of file diff --git a/tests/test_tag_anonimization.py b/tests/test_tag_anonimization.py new file mode 100644 index 0000000..3bfd374 --- /dev/null +++ b/tests/test_tag_anonimization.py @@ -0,0 +1,17 @@ + +from src.tag_anonimization import replace_with_tags +from src.entity_types import EntityTypes + +def test_replace_with_tags(): + text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" + detections = [ + (0, 3, EntityTypes.NAME), + (4, 14, EntityTypes.SURNAME), + (28, 38, EntityTypes.DATE), + (42, 51, EntityTypes.CITY), + ] + + result = replace_with_tags(text, detections) + expected = "[OSOBA] [OSOBA] urodziÅ‚a sie [DATE] we [MIEJSCE]" + + assert result == expected \ No newline at end of file -- GitLab