Skip to content
Snippets Groups Projects
Commit 1eeb5a98 authored by Michał Pogoda's avatar Michał Pogoda
Browse files

[WIP] - Refactoring + unit testing

parent c7a903ad
2 merge requests!10Anonimizer v2,!7Better coverage
Pipeline #6757 failed with stage
in 27 seconds
Showing
with 287 additions and 15 deletions
......@@ -7,37 +7,39 @@
<orth>Marek</orth>
<lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex>
<lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex>
<ann chan="person_first_nam" head="1">1</ann>
<ann chan="person_last_nam">0</ann>
<ann chan="city_nam">0</ann>
<ann chan="nam_liv" head="1">1</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>Kowalski</orth>
<lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex>
<ann chan="person_first_nam">0</ann>
<ann chan="person_last_nam" head="1">1</ann>
<ann chan="city_nam">0</ann>
<ann chan="nam_liv">1</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>pojechał</orth>
<lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex>
<ann chan="person_first_nam">0</ann>
<ann chan="person_last_nam">0</ann>
<ann chan="city_nam">0</ann>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>do</orth>
<lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex>
<ann chan="person_first_nam">0</ann>
<ann chan="person_last_nam">0</ann>
<ann chan="city_nam">0</ann>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc">0</ann>
</tok>
<tok>
<orth>Wrocławia</orth>
<lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex>
<ann chan="person_first_nam">0</ann>
<ann chan="person_last_nam">0</ann>
<ann chan="city_nam" head="1">1</ann>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc" head="1">1</ann>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
<ann chan="nam_liv">0</ann>
<ann chan="nam_loc">0</ann>
</tok>
</sentence>
</chunk>
......
......@@ -4,6 +4,9 @@ from abc import ABC, abstractmethod
from src.generators import generate_phone_number_tag
regex.compile(r'\B(?P<username>\@[\w\-]+)')
# This regex detects the following
class BaseAnonymizer(ABC):
"""Base abstract class for anonymization."""
......
from src.detectors.date.date import find_dates
\ No newline at end of file
from typing import List, Tuple
from .en import detect_dates_en
from .pl import detect_dates_pl
from .ru import detect_dates_ru
def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]:
    """
    Locates dates in the text using the detector registered for the language.

    Languages without a registered detector are handled by the English one.

    :param text: the text to be searched
    :type text: str
    :param language: the language of the text ("en", "pl" or "ru")
    :type language: str
    :return: a list of tuples containing (start, end, detected_date)
    :rtype: List[Tuple[int, int, str]]
    """
    detectors_by_language = dict(
        en=detect_dates_en,
        pl=detect_dates_pl,
        ru=detect_dates_ru,
    )
    try:
        detector = detectors_by_language[language]
    except KeyError:
        # Unknown language code: fall back to the English detector.
        detector = detect_dates_en
    return detector(text)
\ No newline at end of file
import regex as re
from typing import List, Tuple
# Case-insensitive pattern for English dates. Three top-level alternatives:
#   day_or_month_year - two numbers ([0-3]?\d) followed by a 2- or 4-digit year
#   year_month_or_day - a 2- or 4-digit year followed by two such numbers
#   month_in_words    - a month name or three-letter abbreviation, with an
#                       optional day before or after it and an optional year
# In the numeric alternatives the parts are separated by one or two characters
# out of space, tab, "-", ".", "/" and ",".
EN_DATES_REGEX = re.compile(
r'\b(?P<day_or_month_year>'
r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
r'(?P<year1>\d{4}|\d{2}))\b|'
r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
r'(?P<month_in_words>'
# Optional day (and up to two separator characters) in front of the month.
r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|'
r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)'
r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b'
# Optional day after the month, then an optional year.
r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?'
r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?'
# Negative lookbehind: the match may not end directly after a standalone
# three-letter abbreviation (this list includes "may").
r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I
)
def detect_dates_en(text: str) -> List[Tuple[int, int, str]]:
    """
    Collects every non-overlapping EN_DATES_REGEX match found in the text.

    :param text: the text to be searched
    :type text: str
    :return: a list of tuples containing (start, end, detected_date)
    :rtype: List[Tuple[int, int, str]]
    """
    return [
        (found.start(), found.end(), found.group())
        for found in EN_DATES_REGEX.finditer(text)
    ]
\ No newline at end of file
import regex as re
from typing import List, Tuple
# Case-insensitive pattern for Polish dates. Three top-level alternatives:
#   day_or_month_year - two numbers ([0-3]?\d) followed by a 2- or 4-digit year
#   year_month_or_day - a 2- or 4-digit year followed by two such numbers
#   month_in_words    - a month name (several inflected forms) or its
#                       three-letter abbreviation, with an optional day in
#                       front and an optional "day + year" or "year" tail
# In the numeric alternatives the parts are separated by one or two characters
# out of space, tab, "-", ".", "/" and ",".
PL_DATES_REGEX = re.compile(
r'\b(?P<day_or_month_year>'
r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
r'(?P<year1>\d{4}|\d{2}))\b|'
r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
r'(?P<month_in_words>'
# Negative lookahead: this alternative may not begin directly on a
# standalone three-letter abbreviation (the list uses ASCII "paz").
r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)'
r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|'
r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)'
r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)'
r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)'
r'|Gru(?:|dzie[nń]|dnia))\b'
# Optional tail: either a day followed by a year, or a year on its own.
r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I
)
def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]:
    """
    Collects every non-overlapping PL_DATES_REGEX match found in the text.

    :param text: the text to be searched
    :type text: str
    :return: a list of tuples containing (start, end, detected_date)
    :rtype: List[Tuple[int, int, str]]
    """
    return [
        (hit.start(), hit.end(), hit.group())
        for hit in PL_DATES_REGEX.finditer(text)
    ]
\ No newline at end of file
import regex as re
from typing import List, Tuple
# Case-insensitive pattern for Russian dates. Three top-level alternatives:
#   day_or_month_year - two numbers ([0-3]?\d) followed by a 2- or 4-digit year
#   year_month_or_day - a 2- or 4-digit year followed by two such numbers
#   month_in_words    - a month name (several inflected forms) or its
#                       three-letter abbreviation, with an optional day in
#                       front and an optional "day + year" or "year" tail
# In the numeric alternatives the parts are separated by one or two characters
# out of space, tab, "-", ".", "/" and ",".
RU_DATES_REGEX = re.compile(
r'\b(?P<day_or_month_year>'
r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
r'(?P<year1>\d{4}|\d{2}))\b|'
r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
r'(?P<month_in_words>'
# Negative lookahead: this alternative may not begin directly on a
# standalone three-letter abbreviation.
r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)'
r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|'
r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|'
r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|'
r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b'
# Optional tail: either a day followed by a year, or a year on its own.
r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?'
# Negative lookbehind: the match may not end directly after a standalone
# three-letter abbreviation either.
r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', re.I
)
def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]:
    """
    Collects every non-overlapping RU_DATES_REGEX match found in the text.

    :param text: the text to be searched
    :type text: str
    :return: a list of tuples containing (start, end, detected_date)
    :rtype: List[Tuple[int, int, str]]
    """
    return [
        (occurrence.start(), occurrence.end(), occurrence.group())
        for occurrence in RU_DATES_REGEX.finditer(text)
    ]
\ No newline at end of file
from src.detectors.email.email import detect_emails
\ No newline at end of file
import regex as re
from typing import List, Tuple
# Case-insensitive e-mail pattern made of three named groups:
#   local_part - dot-separated runs of letters, digits and the listed
#                punctuation, including the trailing "@"
#   domain     - one or more labels, each followed by a dot; a label starts
#                and ends with a letter or digit and may contain hyphens
#   tld        - the final label (same label rules, no trailing dot)
EMAIL_REGEX = re.compile(
r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+'
r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)'
r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)'
r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', re.I
)
def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]:
    """
    Collects every non-overlapping EMAIL_REGEX match found in the text.

    :param text: the text to be searched
    :type text: str
    :param language: the language of the text (not used by this detector)
    :type language: str
    :return: a list of tuples containing (start, end, detected_email)
    :rtype: List[Tuple[int, int, str]]
    """
    return [
        (found.start(), found.end(), found.group())
        for found in EMAIL_REGEX.finditer(text)
    ]
\ No newline at end of file
from src.detectors.phone.phone import detect_phone_numbers
\ No newline at end of file
import regex as re
from typing import List, Tuple
# Phone number pattern made of two named groups:
#   country_code - optional; either "00" plus one or two digits (the first one
#                  1-9), or two to three characters out of digits and "+",
#                  optionally wrapped in parentheses and optionally followed
#                  by a single "-" or space (lazy)
#   number       - nine or ten digits, each optionally followed by a single
#                  "-" or space (lazy)
PHONE_NUMBER_REGEX = re.compile(
r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?'
r'(?P<number>(\d[- ]??){9,10})'
)
def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]]:
    """
    Collects every non-overlapping PHONE_NUMBER_REGEX match found in the text.

    :param text: the text to be searched
    :type text: str
    :param language: the language of the text (not used by this detector)
    :type language: str
    :return: a list of tuples containing (start, end, detected_phone_number)
    :rtype: List[Tuple[int, int, str]]
    """
    return [
        (hit.start(), hit.end(), hit.group())
        for hit in PHONE_NUMBER_REGEX.finditer(text)
    ]
\ No newline at end of file
from src.detectors.url.url import detect_urls
import regex as re
from typing import List, Tuple
def generate_url_regex(exeptions: List[str]) -> "re.Pattern":
    """
    Builds the compiled, case-insensitive URL detection pattern.

    The pattern exposes the named groups ``protocol``, ``auth``, ``host``,
    ``tld``, ``port`` and ``path``.

    Every string in ``exeptions`` is treated as literal text: when it occurs
    as a whole word it is consumed and rejected through ``(*SKIP)(*FAIL)``,
    so it is never reported as (part of) a URL.

    :param exeptions: literal tokens that must never be detected as URLs
    :type exeptions: List[str]
    :return: the compiled URL pattern
    :rtype: re.Pattern
    """
    skip_prefix = ''
    if exeptions:
        # Escape each token so that e.g. the dot in "m.in" only matches a
        # real dot instead of any character.
        escaped_tokens = '|'.join(re.escape(token) for token in exeptions)
        skip_prefix = r'\b(?:{})\b(*SKIP)(*FAIL)|'.format(escaped_tokens)
    # With no exceptions the skip branch is left out entirely: an empty
    # group would succeed at every word boundary and (*SKIP)(*FAIL) would
    # then reject any URL starting there.
    return re.compile(
        skip_prefix +
        r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?'
        r'(?P<auth>\S+(?::\S*)?@)?'
        # Host, first form: a dotted IPv4 address outside the private and
        # link-local ranges excluded by the three lookaheads.
        r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})'
        r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})'
        r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'
        r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
        r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
        r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
        r'|'
        # Host, second form: one or more dot-terminated domain labels.
        r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?'
        r'[a-z0-9\u00a1-\uffff]\.)+)'
        r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)'
        r'(?P<port>:\d{2,5})?'
        r'(?P<path>[/?#]\S*)?)',
        re.UNICODE | re.I
    )


# Language-independent URL pattern (no exception tokens).
URL_REGEX_GENERAL = generate_url_regex([])
\ No newline at end of file
from .common import generate_url_regex
# Tokens that must not be reported as URLs in Polish text even though they
# look like "host.tld" ("m.in" is presumably the abbreviation of
# "między innymi" - TODO confirm).
PL_URL_REGEX_EXEPTIONS = ["m.in"]
# URL pattern for Polish text: the shared URL regex built with the
# exception list above.
URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS)
\ No newline at end of file
import regex as re
from typing import List, Tuple
from .pl import URL_REGEX_PL
from .common import generate_url_regex
def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]:
    """
    Detects urls in the text.

    Polish text is searched with URL_REGEX_PL; every other language uses the
    language-independent URL_REGEX_GENERAL.

    :param text: the text to be searched
    :type text: str
    :param language: the language of the text
    :type language: str
    :return: a list of tuples containing (start, end, detected_url)
    :rtype: List[Tuple[int, int, str]]
    """
    if language == "pl":
        url_regex = URL_REGEX_PL
    else:
        # Use the precompiled general pattern. Passing the language code to
        # generate_url_regex would treat its characters as exception tokens
        # ('|'.join("en") -> "e|n") and rebuild the pattern on every call.
        from .common import URL_REGEX_GENERAL
        url_regex = URL_REGEX_GENERAL
    return [
        (match.start(), match.end(), match.group())
        for match in url_regex.finditer(text)
    ]
\ No newline at end of file
from src.detectors.user.user import detect_users
\ No newline at end of file
import regex as re
from typing import List, Tuple
# User mention pattern: "@" followed by one or more word characters or
# hyphens, captured as "username". The leading \B requires that the "@" is
# not directly preceded by a word character, so the "@" inside "name@host"
# does not start a match.
USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)')
def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]:
    """
    Collects every non-overlapping USER_REGEX match found in the text.

    :param text: the text to be searched
    :type text: str
    :param language: the language of the text (not used by this detector)
    :type language: str
    :return: a list of tuples containing (start, end, detected_user)
    :rtype: List[Tuple[int, int, str]]
    """
    return [
        (mention.start(), mention.end(), mention.group())
        for mention in USER_REGEX.finditer(text)
    ]
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment