[WIP] - Refactoring + unit testing

1eeb5a98 · Michał Pogoda · c7a903ad · 1eeb5a98 · 1eeb5a98 · 1eeb5a98
Commit 1eeb5a98 authored 2 years ago by Michał Pogoda
--- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
+++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
@@ -7,37 +7,39 @@
    <orth>Marek</orth>
    <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex>
    <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex>
-    <ann chan="person_first_nam" head="1">1</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv" head="1">1</ann>
+    <ann chan="nam_loc">0</ann>
   </tok>
   <tok>
    <orth>Kowalski</orth>
    <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam" head="1">1</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">1</ann>
+    <ann chan="nam_loc">0</ann>
   </tok>
   <tok>
    <orth>pojechał</orth>
    <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
   </tok>
   <tok>
    <orth>do</orth>
    <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
   </tok>
   <tok>
    <orth>Wrocławia</orth>
    <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam" head="1">1</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc" head="1">1</ann>
+   </tok>
+   <ns/>
+   <tok>
+    <orth>.</orth>
+    <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
   </tok>
  </sentence>
 </chunk>

--- a/src/base_anonymizer.py
+++ b/src/base_anonymizer.py
@@ -4,6 +4,9 @@ from abc import ABC, abstractmethod
 from src.generators import generate_phone_number_tag


+regex.compile(r'\B(?P<username>\@[\w\-]+)')
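+# Matches an "@" followed by word characters or hyphens (group "username"); NOTE(review): the compiled pattern above is not bound to a name.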
+
 class BaseAnonymizer(ABC):
    """Base abstract class for anonymization."""


--- a/src/detectors/__init__.py
+++ b/src/detectors/__init__.py
--- a/src/detectors/date/__init__.py
+++ b/src/detectors/date/__init__.py
+from src.detectors.date.date import find_dates
\ No newline at end of file
--- a/src/detectors/date/date.py
+++ b/src/detectors/date/date.py
+from typing import List, Tuple
+from .en import detect_dates_en
+from .pl import detect_dates_pl
+from .ru import detect_dates_ru
+
+def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]:
+    """
+    Locates date expressions in the text with a language-specific detector.
+    Any language other than "pl" or "ru" is handled by the English detector.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language code of the text ("en", "pl" or "ru")
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    if language == "pl":
+        detector = detect_dates_pl
+    elif language == "ru":
+        detector = detect_dates_ru
+    else:
+        # "en" and every unrecognised language code use the English detector
+        detector = detect_dates_en
+    return detector(text)
\ No newline at end of file
--- a/src/detectors/date/en.py
+++ b/src/detectors/date/en.py
+import regex as re
+from typing import List, Tuple
+
+# Case-insensitive pattern with three alternatives:
+#   1. day_or_month_year  - two 1-2 digit numbers and a 2- or 4-digit year,
+#                           e.g. "12.03.2021"
+#   2. year_month_or_day  - a 2- or 4-digit year followed by two 1-2 digit
+#                           numbers, e.g. "2021-03-12"
+#   3. month_in_words     - an English month name or its 3-letter abbreviation
+#                           with an optional day before/after it and an
+#                           optional year
+# Separators are one or two characters out of space, tab, "-", ".", "/", ",".
+EN_DATES_REGEX = re.compile(
+    # alternative 1: numeric, year last
+    r'\b(?P<day_or_month_year>'
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'

+    # alternative 2: numeric, year first
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'

+    # alternative 3: month written as a word
+    r'(?P<month_in_words>'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+    r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|'
+    r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)'
+    r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b'
+    r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?'
+    r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?'
+    # the lookbehind rejects a match that ends in a stand-alone 3-letter
+    # abbreviation (note that "may" is also in this list)
+    r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I
+)
+
+def detect_dates_en(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Scans the text for dates written in English conventions.
+    :param text: the text to be searched
+    :type text: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in EN_DATES_REGEX.finditer(text)
+    ]
\ No newline at end of file
--- a/src/detectors/date/pl.py
+++ b/src/detectors/date/pl.py
+import regex as re
+from typing import List, Tuple
+
+# Case-insensitive pattern with three alternatives:
+#   1. day_or_month_year  - two 1-2 digit numbers and a 2- or 4-digit year
+#   2. year_month_or_day  - a 2- or 4-digit year followed by two 1-2 digit
+#                           numbers
+#   3. month_in_words     - a Polish month name (several inflected forms) or
+#                           its 3-letter abbreviation, with an optional day in
+#                           front and an optional "day + year" or "year" tail
+# Separators are one or two characters out of space, tab, "-", ".", "/", ",".
+PL_DATES_REGEX = re.compile(
+    # alternative 1: numeric, year last
+    r'\b(?P<day_or_month_year>'
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'

+    # alternative 2: numeric, year first
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'

+    # alternative 3: month written as a word
+    r'(?P<month_in_words>'
+    # the lookahead rejects a match that starts at a stand-alone 3-letter
+    # abbreviation
+    # NOTE(review): the list has "paz" but not "paź", although the month
+    # group below accepts both spellings - confirm whether that is intended
+    r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+    r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|'
+    r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)'
+    r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)'
+    r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)'
+    r'|Gru(?:|dzie[nń]|dnia))\b'
+    # optional tail: either "day2 + year4" (both required) or "year3" alone
+    r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
+    r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
+    r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I
+)
+
+def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Scans the text for dates written in Polish conventions.
+    :param text: the text to be searched
+    :type text: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in PL_DATES_REGEX.finditer(text)
+    ]
\ No newline at end of file
--- a/src/detectors/date/ru.py
+++ b/src/detectors/date/ru.py
+import regex as re
+from typing import List, Tuple
+
+# Case-insensitive pattern with three alternatives:
+#   1. day_or_month_year  - two 1-2 digit numbers and a 2- or 4-digit year
+#   2. year_month_or_day  - a 2- or 4-digit year followed by two 1-2 digit
+#                           numbers
+#   3. month_in_words     - a Russian month name (several inflected forms) or
+#                           its 3-letter abbreviation, with an optional day in
+#                           front and an optional "day + year" or "year" tail
+# Separators are one or two characters out of space, tab, "-", ".", "/", ",".
+RU_DATES_REGEX = re.compile(
+    # alternative 1: numeric, year last
+    r'\b(?P<day_or_month_year>'
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'

+    # alternative 2: numeric, year first
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'

+    # alternative 3: month written as a word
+    r'(?P<month_in_words>'
+    # the lookahead rejects a match that starts at a stand-alone 3-letter
+    # abbreviation
+    r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+    r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|'
+    r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|'
+    r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|'
+    r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b'
+    # optional tail: either "day2 + year4" (both required) or "year3" alone
+    r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
+    r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
+    r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?'
+    # the lookbehind rejects a match that ends in a stand-alone 3-letter
+    # abbreviation
+    r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', re.I
+)
+
+def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Scans the text for dates written in Russian conventions.
+    :param text: the text to be searched
+    :type text: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in RU_DATES_REGEX.finditer(text)
+    ]
\ No newline at end of file
--- a/src/detectors/email/__init__.py
+++ b/src/detectors/email/__init__.py
+from src.detectors.email.email import detect_emails
\ No newline at end of file
--- a/src/detectors/email/email.py
+++ b/src/detectors/email/email.py
+import regex as re
+from typing import List, Tuple
+
+# Case-insensitive e-mail pattern made of three named groups:
+#   local_part - dot-separated runs of letters, digits and the listed
+#                punctuation; the group also captures the trailing "@"
+#   domain     - one or more labels, each followed by a dot; a label starts
+#                and ends with a letter or digit and may contain hyphens
+#   tld        - the final label, with the same shape as a domain label
+EMAIL_REGEX = re.compile(
+    r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+'
+    r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)'
+    r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)'
+    r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', re.I
+)
+
+
+def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Scans the text for e-mail addresses.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text (not used by the matching)
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_email)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in EMAIL_REGEX.finditer(text)
+    ]
\ No newline at end of file
--- a/src/detectors/phone/__init__.py
+++ b/src/detectors/phone/__init__.py
+from src.detectors.phone.phone import detect_phone_numbers
\ No newline at end of file
--- a/src/detectors/phone/phone.py
+++ b/src/detectors/phone/phone.py
+import regex as re
+from typing import List, Tuple
+
+# Phone number pattern made of two named groups:
+#   country_code - optional; either "00" plus one or two digits (the first one
+#                  non-zero), or 2-3 characters out of digits and "+",
+#                  optionally wrapped in parentheses and optionally followed
+#                  by a single hyphen or space
+#   number       - 9 or 10 digits, each optionally followed by a single
+#                  hyphen or space
+PHONE_NUMBER_REGEX = re.compile(
+    r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?'
+    r'(?P<number>(\d[- ]??){9,10})'
+)
+
+
+def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Scans the text for phone numbers.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text (not used by the matching)
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_phone_number)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in PHONE_NUMBER_REGEX.finditer(text)
+    ]
\ No newline at end of file
--- a/src/detectors/url/__init__.py
+++ b/src/detectors/url/__init__.py
+from src.detectors.url.url import detect_urls
--- a/src/detectors/url/common.py
+++ b/src/detectors/url/common.py
+import regex as re
+from typing import List, Tuple
+
+def generate_url_regex(exeptions: List[str]) -> "re.Pattern":
+    """
+    Builds the compiled, case-insensitive URL pattern.
+    :param exeptions: literal words that must never be reported as URLs
+        (e.g. abbreviations that contain a dot); may be empty
+    :type exeptions: List[str]
+    :return: the compiled URL pattern
+    :rtype: re.Pattern
+    """
+    url_pattern = (
+        r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?'
+        r'(?P<auth>\S+(?::\S*)?@)?'
+        r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})'
+        r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})'
+        r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'
+        r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
+        r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
+        r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
+        r'|'
+        r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?'
+        r'[a-z0-9\u00a1-\uffff]\.)+)'
+        r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)'
+        r'(?P<port>:\d{2,5})?'
+        r'(?P<path>[/?#]\S*)?)'
+    )
+    flags = re.UNICODE | re.I
+    if not exeptions:
+        # Without exceptions the skip clause would degenerate to
+        # r'\b(?:)\b(*SKIP)(*FAIL)|', which fails every match attempt that
+        # starts at a word boundary, so it is left out entirely.
+        return re.compile(url_pattern, flags)
+    # Exceptions are plain words, not patterns: escape them so that e.g. the
+    # dot in "m.in" only matches a literal dot.
+    skip_clause = r'\b(?:{})\b(*SKIP)(*FAIL)|'.format(
+        '|'.join(re.escape(word) for word in exeptions)
+    )
+    return re.compile(skip_clause + url_pattern, flags)
+
+
+# Pattern without any language-specific exceptions.
+URL_REGEX_GENERAL = generate_url_regex([])
\ No newline at end of file
--- a/src/detectors/url/pl.py
+++ b/src/detectors/url/pl.py
+from .common import generate_url_regex
+
+PL_URL_REGEX_EXEPTIONS = ["m.in"]
+
+URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS)
\ No newline at end of file
--- a/src/detectors/url/url.py
+++ b/src/detectors/url/url.py
+import regex as re
+from typing import List, Tuple
+from .pl import URL_REGEX_PL
+from .common import URL_REGEX_GENERAL, generate_url_regex
+
+def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects urls in the text.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text; "pl" selects the pattern with
+        Polish exceptions, any other value selects the general pattern
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_url)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    # Both patterns are compiled once at import time. The language code must
+    # not be passed to generate_url_regex(): it expects a list of exception
+    # words, and a string such as "en" would be joined into "e|n".
+    url_regex = URL_REGEX_PL if language == "pl" else URL_REGEX_GENERAL
+    return [
+        (match.start(), match.end(), match.group())
+        for match in url_regex.finditer(text)
+    ]
\ No newline at end of file
--- a/src/detectors/user/__init__.py
+++ b/src/detectors/user/__init__.py
+from src.detectors.user.user import detect_users
\ No newline at end of file
--- a/src/detectors/user/user.py
+++ b/src/detectors/user/user.py
+import regex as re
+from typing import List, Tuple
+
+# Matches a user mention: "@" followed by one or more word characters or
+# hyphens (named group "username"). The leading \B requires that the "@" is
+# not directly preceded by a word character, so the "@" inside "name@host"
+# does not start a match.
+USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)')
+
+def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Scans the text for user mentions.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text (not used by the matching)
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_user)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    return [
+        (found.start(), found.end(), found.group())
+        for found in USER_REGEX.finditer(text)
+    ]
\ No newline at end of file
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/detectors/__init__.py
+++ b/tests/detectors/__init__.py