From 1eeb5a983e40ce8573da3a26122fd6790f96aa29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Mon, 12 Dec 2022 16:07:00 +0100
Subject: [PATCH] [WIP] - Refactoring + unit testing

---
 .../marek_kowalski_pojechal_do_wroclawia.ccl  | 32 ++++++++-------
 src/base_anonymizer.py                        |  3 ++
 src/detectors/__init__.py                     |  0
 src/detectors/date/__init__.py                |  1 +
 src/detectors/date/date.py                    | 23 +++++++++++
 src/detectors/date/en.py                      | 36 +++++++++++++++++
 src/detectors/date/pl.py                      | 39 +++++++++++++++++++
 src/detectors/date/ru.py                      | 39 +++++++++++++++++++
 src/detectors/email/__init__.py               |  1 +
 src/detectors/email/email.py                  | 26 +++++++++++++
 src/detectors/phone/__init__.py               |  1 +
 src/detectors/phone/phone.py                  | 24 ++++++++++++
 src/detectors/url/__init__.py                 |  1 +
 src/detectors/url/common.py                   | 24 ++++++++++++
 src/detectors/url/pl.py                       |  5 +++
 src/detectors/url/url.py                      | 26 +++++++++++++
 src/detectors/user/__init__.py                |  1 +
 src/detectors/user/user.py                    | 20 ++++++++++
 tests/__init__.py                             |  0
 tests/detectors/__init__.py                   |  0
 tests/detectors/date/__init__.py              |  0
 tests/detectors/date/test_en.py               | 16 ++++++++
 tests/detectors/date/test_pl.py               |  7 ++++
 tests/detectors/date/test_ru.py               |  7 ++++
 tests/detectors/email/__init__.py             |  0
 tests/detectors/email/test_email.py           |  7 ++++
 tests/detectors/phone/__init__.py             |  0
 tests/detectors/phone/test_phone.py           |  7 ++++
 tests/detectors/url/__init__.py               |  0
 tests/detectors/url/test_url.py               | 16 ++++++++
 tests/detectors/user/__init__.py              |  0
 tests/detectors/user/test_user.py             |  7 ++++
 32 files changed, 354 insertions(+), 15 deletions(-)
 create mode 100644 src/detectors/__init__.py
 create mode 100644 src/detectors/date/__init__.py
 create mode 100644 src/detectors/date/date.py
 create mode 100644 src/detectors/date/en.py
 create mode 100644 src/detectors/date/pl.py
 create mode 100644 src/detectors/date/ru.py
 create mode 100644 src/detectors/email/__init__.py
 create mode 100644 src/detectors/email/email.py
 create mode 100644 src/detectors/phone/__init__.py
 create mode 100644 src/detectors/phone/phone.py
 create mode 100644 src/detectors/url/__init__.py
 create mode 100644 src/detectors/url/common.py
 create mode 100644 src/detectors/url/pl.py
 create mode 100644 src/detectors/url/url.py
 create mode 100644 src/detectors/user/__init__.py
 create mode 100644 src/detectors/user/user.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/detectors/__init__.py
 create mode 100644 tests/detectors/date/__init__.py
 create mode 100644 tests/detectors/date/test_en.py
 create mode 100644 tests/detectors/date/test_pl.py
 create mode 100644 tests/detectors/date/test_ru.py
 create mode 100644 tests/detectors/email/__init__.py
 create mode 100644 tests/detectors/email/test_email.py
 create mode 100644 tests/detectors/phone/__init__.py
 create mode 100644 tests/detectors/phone/test_phone.py
 create mode 100644 tests/detectors/url/__init__.py
 create mode 100644 tests/detectors/url/test_url.py
 create mode 100644 tests/detectors/user/__init__.py
 create mode 100644 tests/detectors/user/test_user.py

diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
index d8db042..b19c400 100644
--- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
+++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
@@ -7,37 +7,39 @@
     <orth>Marek</orth>
     <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex>
     <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex>
-    <ann chan="person_first_nam" head="1">1</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv" head="1">1</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>Kowalski</orth>
     <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam" head="1">1</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">1</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>pojechał</orth>
     <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>do</orth>
     <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>Wrocławia</orth>
     <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam" head="1">1</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc" head="1">1</ann>
+   </tok>
+   <ns/>
+   <tok>
+    <orth>.</orth>
+    <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
   </sentence>
  </chunk>
diff --git a/src/base_anonymizer.py b/src/base_anonymizer.py
index fd62de5..a863322 100644
--- a/src/base_anonymizer.py
+++ b/src/base_anonymizer.py
@@ -4,6 +4,9 @@ from abc import ABC, abstractmethod
 from src.generators import generate_phone_number_tag
 
 
+# NOTE: the "@username" mention pattern r'\B(?P<username>\@[\w\-]+)' used to be
+# compiled here with its result discarded; see USER_REGEX in src/detectors/user/user.py.
+
 class BaseAnonymizer(ABC):
     """Base abstract class for anonymization."""
 
diff --git a/src/detectors/__init__.py b/src/detectors/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/detectors/date/__init__.py b/src/detectors/date/__init__.py
new file mode 100644
index 0000000..2c5b35b
--- /dev/null
+++ b/src/detectors/date/__init__.py
@@ -0,0 +1 @@
+from src.detectors.date.date import find_dates
\ No newline at end of file
diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py
new file mode 100644
index 0000000..85e34b4
--- /dev/null
+++ b/src/detectors/date/date.py
@@ -0,0 +1,23 @@
+from typing import List, Tuple
+from .en import detect_dates_en
+from .pl import detect_dates_pl
+from .ru import detect_dates_ru
+
+def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]:
+    """
+    Locates date expressions in the text with a language-specific detector.
+
+    Any language code other than "pl" or "ru" (including unknown ones) is
+    handled by the English detector.
+
+    :param text: the text to be searched
+    :param language: language code of the text ("en", "pl" or "ru")
+    :return: a list of tuples containing (start, end, detected_date)
+    """
+    if language == "pl":
+        detector = detect_dates_pl
+    elif language == "ru":
+        detector = detect_dates_ru
+    else:
+        detector = detect_dates_en
+    return detector(text)
\ No newline at end of file
diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py
new file mode 100644
index 0000000..594e663
--- /dev/null
+++ b/src/detectors/date/en.py
@@ -0,0 +1,36 @@
+import regex as re
+from typing import List, Tuple
+
+EN_DATES_REGEX = re.compile(  # English date patterns; case-insensitive (re.I)
+    r'\b(?P<day_or_month_year>'  # 1) all-numeric, year last, e.g. "1.01.2022"
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'  # the year has four or two digits
+    # 2) all-numeric, year first: year, then two one- or two-digit numbers.
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
+    # 3) month written as a word, e.g. "April 5, 2021".
+    r'(?P<month_in_words>'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'  # optional day before the month
+    r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|'  # abbreviated or full month name
+    r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)'
+    r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b'
+    r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?'  # optional day after the month
+    r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?'  # optional year
+    r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I  # must not end in a bare 3-letter abbreviation
+)
+
+def detect_dates_en(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every English date expression found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :return: one (start, end, detected_date) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in EN_DATES_REGEX.finditer(text)
+    ]
\ No newline at end of file
diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py
new file mode 100644
index 0000000..7001b9f
--- /dev/null
+++ b/src/detectors/date/pl.py
@@ -0,0 +1,39 @@
+import regex as re
+from typing import List, Tuple
+
+PL_DATES_REGEX = re.compile(  # Polish date patterns; case-insensitive (re.I)
+    r'\b(?P<day_or_month_year>'  # 1) all-numeric, year last, e.g. "1.01.2022"
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'  # the year has four or two digits
+    # 2) all-numeric, year first: year, then two one- or two-digit numbers.
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
+    # 3) month written as a word, e.g. "5 kwietnia 2021".
+    r'(?P<month_in_words>'
+    r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)'  # may not begin with a bare 3-letter abbreviation
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'  # optional day before the month
+    r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|'  # abbreviation or a longer form of the name
+    r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)'
+    r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)'
+    r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)'
+    r'|Gru(?:|dzie[nń]|dnia))\b'
+    r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'  # optional tail: either a day ...
+    r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'  # ... followed by a year,
+    r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I  # or a year on its own
+)
+
+def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every Polish date expression found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :return: one (start, end, detected_date) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (hit.start(), hit.end(), hit.group())
+        for hit in PL_DATES_REGEX.finditer(text)
+    ]
\ No newline at end of file
diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py
new file mode 100644
index 0000000..91017c8
--- /dev/null
+++ b/src/detectors/date/ru.py
@@ -0,0 +1,39 @@
+import regex as re
+from typing import List, Tuple
+
+RU_DATES_REGEX = re.compile(  # Russian date patterns; case-insensitive (re.I)
+    r'\b(?P<day_or_month_year>'  # 1) all-numeric, year last, e.g. "1.01.2022"
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'  # the year has four or two digits
+    # 2) all-numeric, year first: year, then two one- or two-digit numbers.
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
+    # 3) month written as a word, e.g. "5 апреля 2021".
+    r'(?P<month_in_words>'
+    r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)'  # may not begin with a bare 3-letter abbreviation
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'  # optional day before the month
+    r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|'  # abbreviation or a longer form of the name
+    r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|'
+    r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|'
+    r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b'
+    r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'  # optional tail: either a day ...
+    r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'  # ... followed by a year,
+    r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?'  # or a year on its own
+    r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', re.I  # must not end in a bare 3-letter abbreviation
+)
+
+def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every Russian date expression found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :return: one (start, end, detected_date) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (occurrence.start(), occurrence.end(), occurrence.group())
+        for occurrence in RU_DATES_REGEX.finditer(text)
+    ]
\ No newline at end of file
diff --git a/src/detectors/email/__init__.py b/src/detectors/email/__init__.py
new file mode 100644
index 0000000..58050bc
--- /dev/null
+++ b/src/detectors/email/__init__.py
@@ -0,0 +1 @@
+from src.detectors.email.email import detect_emails
\ No newline at end of file
diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py
new file mode 100644
index 0000000..a0637ec
--- /dev/null
+++ b/src/detectors/email/email.py
@@ -0,0 +1,26 @@
+import regex as re
+from typing import List, Tuple
+
+EMAIL_REGEX = re.compile(  # e-mail address pattern; case-insensitive (re.I)
+    r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+'  # first run of allowed local-part characters
+    r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)'  # further dot-separated runs; the group includes the "@"
+    r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)'  # one or more labels, each followed by "."
+    r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', re.I  # final label, without a trailing dot
+)
+
+
+def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every e-mail address found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text (not used by this detector)
+    :type language: str
+    :return: one (start, end, detected_email) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in EMAIL_REGEX.finditer(text)
+    ]
\ No newline at end of file
diff --git a/src/detectors/phone/__init__.py b/src/detectors/phone/__init__.py
new file mode 100644
index 0000000..e30518d
--- /dev/null
+++ b/src/detectors/phone/__init__.py
@@ -0,0 +1 @@
+from src.detectors.phone.phone import detect_phone_numbers
\ No newline at end of file
diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py
new file mode 100644
index 0000000..49abeb5
--- /dev/null
+++ b/src/detectors/phone/phone.py
@@ -0,0 +1,24 @@
+import regex as re
+from typing import List, Tuple
+
+PHONE_NUMBER_REGEX = re.compile(  # optional country code followed by the number itself
+    r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?'  # "00" + 1-2 digits, or 2-3 "+"/digit chars (optionally parenthesised) with an optional "-"/" " after
+    r'(?P<number>(\d[- ]??){9,10})'  # 9 or 10 digits, each optionally followed by "-" or " "
+)
+
+
+def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every phone number found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text (not used by this detector)
+    :type language: str
+    :return: one (start, end, detected_phone_number) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (hit.start(), hit.end(), hit.group())
+        for hit in PHONE_NUMBER_REGEX.finditer(text)
+    ]
\ No newline at end of file
diff --git a/src/detectors/url/__init__.py b/src/detectors/url/__init__.py
new file mode 100644
index 0000000..72b8dc6
--- /dev/null
+++ b/src/detectors/url/__init__.py
@@ -0,0 +1 @@
+from src.detectors.url.url import detect_urls
diff --git a/src/detectors/url/common.py b/src/detectors/url/common.py
new file mode 100644
index 0000000..9d39241
--- /dev/null
+++ b/src/detectors/url/common.py
@@ -0,0 +1,45 @@
+import regex as re
+from typing import List, Tuple
+
+def generate_url_regex(exeptions: List[str]) -> "re.Pattern":
+    """
+    Builds the compiled regular expression used to detect URLs.
+
+    :param exeptions: literal strings that look like URLs but must never be
+        reported as one (e.g. the Polish abbreviation "m.in"); may be empty
+    :type exeptions: List[str]
+    :return: compiled, case-insensitive pattern exposing the named groups
+        protocol, auth, host, tld, port and path
+    :rtype: re.Pattern
+    """
+    skip_clause = ''
+    if exeptions:
+        # Entries are escaped so that e.g. the dot in "m.in" matches a dot only.
+        # The clause is left out for an empty list: an empty alternative would
+        # match at every word boundary and could let (*SKIP)(*FAIL) discard
+        # URL candidates that start there.
+        skip_clause = r'\b(?:{})\b(*SKIP)(*FAIL)|'.format(
+            '|'.join(re.escape(exeption) for exeption in exeptions)
+        )
+
+    return re.compile(
+        skip_clause +
+        r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?'
+        r'(?P<auth>\S+(?::\S*)?@)?'
+        r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})'  # IPv4 branch: reject 10.x.x.x and 127.x.x.x
+        r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})'  # reject 169.254.x.x and 192.168.x.x
+        r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'  # reject 172.16.x.x - 172.31.x.x
+        r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
+        r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
+        r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
+        r'|'  # host-name branch: one or more dot-terminated labels
+        r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?'
+        r'[a-z0-9\u00a1-\uffff]\.)+)'
+        r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)'
+        r'(?P<port>:\d{2,5})?'
+        r'(?P<path>[/?#]\S*)?)',
+        re.UNICODE | re.I
+    )
+
+# URL pattern built without any exception strings.
+URL_REGEX_GENERAL = generate_url_regex([])
\ No newline at end of file
diff --git a/src/detectors/url/pl.py b/src/detectors/url/pl.py
new file mode 100644
index 0000000..5d1a9ed
--- /dev/null
+++ b/src/detectors/url/pl.py
@@ -0,0 +1,5 @@
+from .common import generate_url_regex
+
+PL_URL_REGEX_EXEPTIONS = ["m.in"]  # "m.in." abbreviates "między innymi" and must not be reported as a URL
+
+URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS)  # URL pattern built with the exceptions listed above
\ No newline at end of file
diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py
new file mode 100644
index 0000000..2ca1fec
--- /dev/null
+++ b/src/detectors/url/url.py
@@ -0,0 +1,26 @@
+import regex as re
+from typing import List, Tuple
+from .pl import URL_REGEX_PL
+from .common import generate_url_regex
+
+def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every URL found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text; "pl" selects URL_REGEX_PL
+    :type language: str
+    :return: one (start, end, detected_url) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    # NOTE(review): for any language other than "pl" the language code itself
+    # (a str) is passed to generate_url_regex, whose parameter is annotated as a
+    # list of exception strings, and generate_url_regex is called again on every
+    # invocation -- confirm whether an exception-free, precompiled pattern was intended.
+    pattern = URL_REGEX_PL if language == "pl" else generate_url_regex(language)
+
+    return [
+        (found.start(), found.end(), found.group())
+        for found in pattern.finditer(text)
+    ]
\ No newline at end of file
diff --git a/src/detectors/user/__init__.py b/src/detectors/user/__init__.py
new file mode 100644
index 0000000..3ba0c10
--- /dev/null
+++ b/src/detectors/user/__init__.py
@@ -0,0 +1 @@
+from src.detectors.user.user import detect_users
\ No newline at end of file
diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py
new file mode 100644
index 0000000..4d8f035
--- /dev/null
+++ b/src/detectors/user/user.py
@@ -0,0 +1,20 @@
+import regex as re
+from typing import List, Tuple
+
+USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)')  # "@" plus word characters/hyphens; \B rejects an "@" directly preceded by a word character
+
+def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Collects every "@username" mention found in the text.
+
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text (not used by this detector)
+    :type language: str
+    :return: one (start, end, detected_user) tuple per match, in match order
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (mention.start(), mention.end(), mention.group())
+        for mention in USER_REGEX.finditer(text)
+    ]
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/__init__.py b/tests/detectors/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/date/__init__.py b/tests/detectors/date/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py
new file mode 100644
index 0000000..429ee2a
--- /dev/null
+++ b/tests/detectors/date/test_en.py
@@ -0,0 +1,16 @@
+from src.detectors.date.en import detect_dates_en
+
+def test_detect_dates_en():
+    """detect_dates_en reports numeric and month-name dates with their offsets."""
+    # en-US style: month name before the day.
+    sample_us = "On 1.01.2022, I sold my cat. On April 5, 2021, I bought a dog."
+    expected_us = [(3, 12, "1.01.2022"), (32, 45, "April 5, 2021")]
+    assert detect_dates_en(sample_us) == expected_us
+
+    # en-GB style: ordinal day before the month name.
+    # TODO: Following test fails. Fix it.
+    # (kept disabled until the detector handles this form)
+    # sample_gb = "On 1.01.2022 I sold the cat. On 5th April 2021 I bought a dog."
+    # expected_gb = [(3, 12, "1.01.2022"), (32, 46, "5th April 2021")]
+    # assert detect_dates_en(sample_gb) == expected_gb
+    
\ No newline at end of file
diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py
new file mode 100644
index 0000000..a441c36
--- /dev/null
+++ b/tests/detectors/date/test_pl.py
@@ -0,0 +1,7 @@
+from src.detectors.date.pl import detect_dates_pl
+
+def test_detect_dates_pl():
+    """Numeric and month-name Polish dates are reported with their offsets."""
+    sample = "W dniu 1.01.2022 sprzedałem kota. 5 kwietnia 2021 roku kupiłem psa."
+    expected = [(7, 16, "1.01.2022"), (34, 49, "5 kwietnia 2021")]
+    assert detect_dates_pl(sample) == expected
\ No newline at end of file
diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py
new file mode 100644
index 0000000..44e9805
--- /dev/null
+++ b/tests/detectors/date/test_ru.py
@@ -0,0 +1,7 @@
+from src.detectors.date.ru import detect_dates_ru
+
+def test_detect_dates_ru():
+    """Numeric and month-name Russian dates are reported with their offsets."""
+    sample = "1.01.2022 я продал кошку. 5 апреля 2021 я купил собаку."
+    expected = [(0, 9, "1.01.2022"), (26, 39, "5 апреля 2021")]
+    assert detect_dates_ru(sample) == expected
\ No newline at end of file
diff --git a/tests/detectors/email/__init__.py b/tests/detectors/email/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py
new file mode 100644
index 0000000..05b3e63
--- /dev/null
+++ b/tests/detectors/email/test_email.py
@@ -0,0 +1,7 @@
+from src.detectors.email import detect_emails
+
+def test_detect_emails():
+    """Both e-mail addresses in the sample are reported with their offsets."""
+    sample = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl"
+    expected = [(12, 30, "arkadiusz@borek.pw"), (53, 78, "arkadiusz.dump@pwr.edu.pl")]
+    assert detect_emails(sample, "en") == expected
\ No newline at end of file
diff --git a/tests/detectors/phone/__init__.py b/tests/detectors/phone/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py
new file mode 100644
index 0000000..b2efe23
--- /dev/null
+++ b/tests/detectors/phone/test_phone.py
@@ -0,0 +1,7 @@
+from src.detectors.phone.phone import detect_phone_numbers
+
+def test_detect_phone_numbers():
+    """Numbers with and without a country code are reported with their offsets."""
+    sample = "My phone number is +48 123 456 789. My friend's number is 123456789."
+    expected = [(19, 34, '+48 123 456 789'), (58, 67, '123456789')]
+    assert detect_phone_numbers(sample, "en") == expected
\ No newline at end of file
diff --git a/tests/detectors/url/__init__.py b/tests/detectors/url/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py
new file mode 100644
index 0000000..ad22f69
--- /dev/null
+++ b/tests/detectors/url/test_url.py
@@ -0,0 +1,16 @@
+from src.detectors.url import detect_urls
+
+def test_detect_urls():
+    """URLs with and without a protocol are reported with their offsets."""
+    sample = "This is a test for www.google.com. Make sure to go to https://www.google.com"
+    expected = [(19, 33, 'www.google.com'), (54, 76, 'https://www.google.com')]
+    assert detect_urls(sample, "en") == expected
+    
+def test_detect_urls_pl():
+    """The Polish detector skips the abbreviation "m.in"; the English one reports it."""
+    sample = "m.in. https://www.google.com"
+    google = (6, 28, 'https://www.google.com')
+
+    # m.in is a valid shortcut for między innymi in Polish. It should not be detected as a URL.
+    assert detect_urls(sample, "pl") == [google]
+    assert detect_urls(sample, "en") == [(0, 4, "m.in"), google]
\ No newline at end of file
diff --git a/tests/detectors/user/__init__.py b/tests/detectors/user/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py
new file mode 100644
index 0000000..b198f71
--- /dev/null
+++ b/tests/detectors/user/test_user.py
@@ -0,0 +1,7 @@
+from src.detectors.user.user import detect_users
+
+def test_detect_users():
+    """Both "@username" mentions in the sample are reported with their offsets."""
+    sample = "My username is @john_smith. My friend's username is @jane_doe."
+    expected = [(15, 26, '@john_smith'), (52, 61, '@jane_doe')]
+    assert detect_users(sample, "en") == expected
\ No newline at end of file
-- 
GitLab