From 1eeb5a983e40ce8573da3a26122fd6790f96aa29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Mon, 12 Dec 2022 16:07:00 +0100 Subject: [PATCH] [WIP] - Refactoring + unit testing --- .../marek_kowalski_pojechal_do_wroclawia.ccl | 32 ++++++++------- src/base_anonymizer.py | 3 ++ src/detectors/__init__.py | 0 src/detectors/date/__init__.py | 1 + src/detectors/date/date.py | 23 +++++++++++ src/detectors/date/en.py | 36 +++++++++++++++++ src/detectors/date/pl.py | 39 +++++++++++++++++++ src/detectors/date/ru.py | 39 +++++++++++++++++++ src/detectors/email/__init__.py | 1 + src/detectors/email/email.py | 26 +++++++++++++ src/detectors/phone/__init__.py | 1 + src/detectors/phone/phone.py | 24 ++++++++++++ src/detectors/url/__init__.py | 1 + src/detectors/url/common.py | 24 ++++++++++++ src/detectors/url/pl.py | 5 +++ src/detectors/url/url.py | 26 +++++++++++++ src/detectors/user/__init__.py | 1 + src/detectors/user/user.py | 20 ++++++++++ tests/__init__.py | 0 tests/detectors/__init__.py | 0 tests/detectors/date/__init__.py | 0 tests/detectors/date/test_en.py | 16 ++++++++ tests/detectors/date/test_pl.py | 7 ++++ tests/detectors/date/test_ru.py | 7 ++++ tests/detectors/email/__init__.py | 0 tests/detectors/email/test_email.py | 7 ++++ tests/detectors/phone/__init__.py | 0 tests/detectors/phone/test_phone.py | 7 ++++ tests/detectors/url/__init__.py | 0 tests/detectors/url/test_url.py | 16 ++++++++ tests/detectors/user/__init__.py | 0 tests/detectors/user/test_user.py | 7 ++++ 32 files changed, 354 insertions(+), 15 deletions(-) create mode 100644 src/detectors/__init__.py create mode 100644 src/detectors/date/__init__.py create mode 100644 src/detectors/date/date.py create mode 100644 src/detectors/date/en.py create mode 100644 src/detectors/date/pl.py create mode 100644 src/detectors/date/ru.py create mode 100644 src/detectors/email/__init__.py create mode 100644 src/detectors/email/email.py create mode 100644 
src/detectors/phone/__init__.py create mode 100644 src/detectors/phone/phone.py create mode 100644 src/detectors/url/__init__.py create mode 100644 src/detectors/url/common.py create mode 100644 src/detectors/url/pl.py create mode 100644 src/detectors/url/url.py create mode 100644 src/detectors/user/__init__.py create mode 100644 src/detectors/user/user.py create mode 100644 tests/__init__.py create mode 100644 tests/detectors/__init__.py create mode 100644 tests/detectors/date/__init__.py create mode 100644 tests/detectors/date/test_en.py create mode 100644 tests/detectors/date/test_pl.py create mode 100644 tests/detectors/date/test_ru.py create mode 100644 tests/detectors/email/__init__.py create mode 100644 tests/detectors/email/test_email.py create mode 100644 tests/detectors/phone/__init__.py create mode 100644 tests/detectors/phone/test_phone.py create mode 100644 tests/detectors/url/__init__.py create mode 100644 tests/detectors/url/test_url.py create mode 100644 tests/detectors/user/__init__.py create mode 100644 tests/detectors/user/test_user.py diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl index d8db042..b19c400 100644 --- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl +++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl @@ -7,37 +7,39 @@ <orth>Marek</orth> <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> - <ann chan="person_first_nam" head="1">1</ann> - <ann chan="person_last_nam">0</ann> - <ann chan="city_nam">0</ann> + <ann chan="nam_liv" head="1">1</ann> + <ann chan="nam_loc">0</ann> </tok> <tok> <orth>Kowalski</orth> <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex> - <ann chan="person_first_nam">0</ann> - <ann chan="person_last_nam" head="1">1</ann> - <ann chan="city_nam">0</ann> + <ann chan="nam_liv">1</ann> + <ann chan="nam_loc">0</ann> 
</tok> <tok> <orth>pojechał</orth> <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> - <ann chan="person_first_nam">0</ann> - <ann chan="person_last_nam">0</ann> - <ann chan="city_nam">0</ann> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> </tok> <tok> <orth>do</orth> <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> - <ann chan="person_first_nam">0</ann> - <ann chan="person_last_nam">0</ann> - <ann chan="city_nam">0</ann> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> </tok> <tok> <orth>Wrocławia</orth> <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex> - <ann chan="person_first_nam">0</ann> - <ann chan="person_last_nam">0</ann> - <ann chan="city_nam" head="1">1</ann> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc" head="1">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> </tok> </sentence> </chunk> diff --git a/src/base_anonymizer.py b/src/base_anonymizer.py index fd62de5..a863322 100644 --- a/src/base_anonymizer.py +++ b/src/base_anonymizer.py @@ -4,6 +4,9 @@ from abc import ABC, abstractmethod from src.generators import generate_phone_number_tag +regex.compile(r'\B(?P<username>\@[\w\-]+)') +# This regex detects @-prefixed usernames (an "@" followed by word characters or hyphens), e.g. @john_smith + class BaseAnonymizer(ABC): """Base abstract class for anonymization.""" diff --git a/src/detectors/__init__.py b/src/detectors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/detectors/date/__init__.py b/src/detectors/date/__init__.py new file mode 100644 index 0000000..2c5b35b --- /dev/null +++ b/src/detectors/date/__init__.py @@ -0,0 +1 @@ +from src.detectors.date.date import find_dates \ No newline at end of file diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py new file mode 100644 index 0000000..85e34b4 --- /dev/null +++ b/src/detectors/date/date.py @@ -0,0 +1,23 @@ +from typing import 
List, Tuple +from .en import detect_dates_en +from .pl import detect_dates_pl +from .ru import detect_dates_ru + +def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]: + """ + Finds dates in the text. + :param text: the text to be searched + :type text: str + :param language: the language of the text + :type language: str + :return: a list of tuples containing (start, end, detected_date) + :rtype: List[Tuple[int, int, str]] + """ + + language_processors = { + "en": detect_dates_en, + "pl": detect_dates_pl, + "ru": detect_dates_ru + } + + return language_processors.get(language, detect_dates_en)(text) \ No newline at end of file diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py new file mode 100644 index 0000000..594e663 --- /dev/null +++ b/src/detectors/date/en.py @@ -0,0 +1,36 @@ +import regex as re +from typing import List, Tuple + +EN_DATES_REGEX = re.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|' + r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)' + r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b' + r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' + r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?' + r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I +) + +def detect_dates_en(text: str) -> List[Tuple[int, int, str]]: + """ + Detects English dates in the text. 
+ :param text: the text to be searched + :type text: str + :return: a list of tuples containing (start, end, detected_date) + :rtype: List[Tuple[int, int, str]] + """ + matches = EN_DATES_REGEX.finditer(text) + dates = [] + for match in matches: + dates.append((match.start(), match.end(), match.group())) + return dates \ No newline at end of file diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py new file mode 100644 index 0000000..7001b9f --- /dev/null +++ b/src/detectors/date/pl.py @@ -0,0 +1,39 @@ +import regex as re +from typing import List, Tuple + +PL_DATES_REGEX = re.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Sty(?:|cze[nÅ„]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' + r'Kwi(?:|ecie[nÅ„]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' + r'|Sie(?:|rpie[nÅ„]|rpnia)|Wrz(?:|esie[nÅ„]|e[Å›s]nia)' + r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)' + r'|Gru(?:|dzie[nÅ„]|dnia))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I +) + +def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]: + """ + Detects Polish dates in the text. 
+ :param text: the text to be searched + :type text: str + :return: a list of tuples containing (start, end, detected_date) + :rtype: List[Tuple[int, int, str]] + """ + matches = PL_DATES_REGEX.finditer(text) + dates = [] + for match in matches: + dates.append((match.start(), match.end(), match.group())) + return dates \ No newline at end of file diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py new file mode 100644 index 0000000..91017c8 --- /dev/null +++ b/src/detectors/date/ru.py @@ -0,0 +1,39 @@ +import regex as re +from typing import List, Tuple + +RU_DATES_REGEX = re.compile( + r'\b(?P<day_or_month_year>' + r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' + r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' + r'(?P<year1>\d{4}|\d{2}))\b|' + + r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' + r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' + r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' + + r'(?P<month_in_words>' + r'(?!\b(Янв|Фев|Мар|Ðпр|Май|Июн|Июл|Ðвг|Сен|Окт|ÐоÑ|Дек)\b)' + r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' + r'\b(?P<month>Янв(?:|ар[ьеÑ])|Фев(?:|рал[ьеÑ])|Мар(?:|Ñ‚|те|та)|' + r'Ðпр(?:|ел[ьеÑ])|Ма[йеÑ]|Июн(?:|[ьеÑ])|Июл(?:|[ьеÑ])|' + r'Ðвг(?:|уÑÑ‚|уÑÑ‚[еа])|Сен(?:|Ñ‚Ñбр[ьеÑ])|Окт(?:|Ñбр[ьеÑ])|' + r'ÐоÑ(?:|бр[ьеÑ])|Дек(?:|абр[ьеÑ]))\b' + r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' + r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' + r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?' + r'(?<!\b(Янв|Фев|Мар|Ðпр|Май|Июн|Июл|Ðвг|Сен|Окт|ÐоÑ|Дек)\b))', re.I +) + +def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]: + """ + Detects Russian dates in the text. 
+ :param text: the text to be searched + :type text: str + :return: a list of tuples containing (start, end, detected_date) + :rtype: List[Tuple[int, int, str]] + """ + matches = RU_DATES_REGEX.finditer(text) + dates = [] + for match in matches: + dates.append((match.start(), match.end(), match.group())) + return dates \ No newline at end of file diff --git a/src/detectors/email/__init__.py b/src/detectors/email/__init__.py new file mode 100644 index 0000000..58050bc --- /dev/null +++ b/src/detectors/email/__init__.py @@ -0,0 +1 @@ +from src.detectors.email.email import detect_emails \ No newline at end of file diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py new file mode 100644 index 0000000..a0637ec --- /dev/null +++ b/src/detectors/email/email.py @@ -0,0 +1,26 @@ +import regex as re +from typing import List, Tuple + +EMAIL_REGEX = re.compile( + r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' + r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)' + r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)' + r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', re.I +) + + +def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]: + """ + Detects emails in the text. 
+ :param text: the text to be searched + :type text: str + :param language: the language of the text + :type language: str + :return: a list of tuples containing (start, end, detected_email) + :rtype: List[Tuple[int, int, str]] + """ + matches = EMAIL_REGEX.finditer(text) + emails = [] + for match in matches: + emails.append((match.start(), match.end(), match.group())) + return emails \ No newline at end of file diff --git a/src/detectors/phone/__init__.py b/src/detectors/phone/__init__.py new file mode 100644 index 0000000..e30518d --- /dev/null +++ b/src/detectors/phone/__init__.py @@ -0,0 +1 @@ +from src.detectors.phone.phone import detect_phone_numbers \ No newline at end of file diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py new file mode 100644 index 0000000..49abeb5 --- /dev/null +++ b/src/detectors/phone/phone.py @@ -0,0 +1,24 @@ +import regex as re +from typing import List, Tuple + +PHONE_NUMBER_REGEX = re.compile( + r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' + r'(?P<number>(\d[- ]??){9,10})' +) + + +def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]]: + """ + Detects phone numbers in the text. 
+ :param text: the text to be searched + :type text: str + :param language: the language of the text + :type language: str + :return: a list of tuples containing (start, end, detected_phone_number) + :rtype: List[Tuple[int, int, str]] + """ + matches = PHONE_NUMBER_REGEX.finditer(text) + phone_numbers = [] + for match in matches: + phone_numbers.append((match.start(), match.end(), match.group())) + return phone_numbers \ No newline at end of file diff --git a/src/detectors/url/__init__.py b/src/detectors/url/__init__.py new file mode 100644 index 0000000..72b8dc6 --- /dev/null +++ b/src/detectors/url/__init__.py @@ -0,0 +1 @@ +from src.detectors.url.url import detect_urls diff --git a/src/detectors/url/common.py b/src/detectors/url/common.py new file mode 100644 index 0000000..9d39241 --- /dev/null +++ b/src/detectors/url/common.py @@ -0,0 +1,24 @@ +import regex as re +from typing import List, Tuple + +def generate_url_regex(exeptions: List[str]) -> str: + return re.compile( + r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(exeptions)) + + r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' + r'(?P<auth>\S+(?::\S*)?@)?' + r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' + r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' + r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' + r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' + r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' + r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' + r'|' + r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' + r'[a-z0-9\u00a1-\uffff]\.)+)' + r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' + r'(?P<port>:\d{2,5})?' 
+ r'(?P<path>[/?#]\S*)?)', + re.UNICODE | re.I + ) + +URL_REGEX_GENERAL = generate_url_regex([]) \ No newline at end of file diff --git a/src/detectors/url/pl.py b/src/detectors/url/pl.py new file mode 100644 index 0000000..5d1a9ed --- /dev/null +++ b/src/detectors/url/pl.py @@ -0,0 +1,5 @@ +from .common import generate_url_regex + +PL_URL_REGEX_EXEPTIONS = ["m.in"] + +URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS) \ No newline at end of file diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py new file mode 100644 index 0000000..2ca1fec --- /dev/null +++ b/src/detectors/url/url.py @@ -0,0 +1,26 @@ +import regex as re +from typing import List, Tuple +from .pl import URL_REGEX_PL +from .common import generate_url_regex + +def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: + """ + Detects urls in the text. + :param text: the text to be searched + :type text: str + :param language: the language of the text + :type language: str + :return: a list of tuples containing (start, end, detected_url) + :rtype: List[Tuple[int, int, str]] + """ + if language == "pl": + url_regex = URL_REGEX_PL + else: + url_regex = generate_url_regex(language) + + matches = url_regex.finditer(text) + urls = [] + for match in matches: + urls.append((match.start(), match.end(), match.group())) + + return urls \ No newline at end of file diff --git a/src/detectors/user/__init__.py b/src/detectors/user/__init__.py new file mode 100644 index 0000000..3ba0c10 --- /dev/null +++ b/src/detectors/user/__init__.py @@ -0,0 +1 @@ +from src.detectors.user.user import detect_users \ No newline at end of file diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py new file mode 100644 index 0000000..4d8f035 --- /dev/null +++ b/src/detectors/user/user.py @@ -0,0 +1,20 @@ +import regex as re +from typing import List, Tuple + +USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)') + +def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]: 
+ """ + Detects users in the text. + :param text: the text to be searched + :type text: str + :param language: the language of the text + :type language: str + :return: a list of tuples containing (start, end, detected_user) + :rtype: List[Tuple[int, int, str]] + """ + matches = USER_REGEX.finditer(text) + users = [] + for match in matches: + users.append((match.start(), match.end(), match.group())) + return users \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/__init__.py b/tests/detectors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/date/__init__.py b/tests/detectors/date/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py new file mode 100644 index 0000000..429ee2a --- /dev/null +++ b/tests/detectors/date/test_en.py @@ -0,0 +1,16 @@ +from src.detectors.date.en import detect_dates_en + +def test_detect_dates_en(): + # Check en-us + text = "On 1.01.2022, I sold my cat. On April 5, 2021, I bought a dog." + found_dates = detect_dates_en(text) + + assert found_dates == [(3,12,"1.01.2022"), (32,45, "April 5, 2021")] + + # Check en-gb + # TODO: Following test fails. Fix it. + # text = "On 1.01.2022 I sold the cat. On 5th April 2021 I bought a dog." + # found_dates = detect_dates_en(text) + + # assert found_dates == [(3,12,"1.01.2022"), (32,46, "5th April 2021")] + \ No newline at end of file diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py new file mode 100644 index 0000000..a441c36 --- /dev/null +++ b/tests/detectors/date/test_pl.py @@ -0,0 +1,7 @@ +from src.detectors.date.pl import detect_dates_pl + +def test_detect_dates_pl(): + text = "W dniu 1.01.2022 sprzedaÅ‚em kota. 5 kwietnia 2021 roku kupiÅ‚em psa." 
+ found_dates = detect_dates_pl(text) + + assert found_dates == [(7,16,"1.01.2022"), (34,49, "5 kwietnia 2021")] \ No newline at end of file diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py new file mode 100644 index 0000000..44e9805 --- /dev/null +++ b/tests/detectors/date/test_ru.py @@ -0,0 +1,7 @@ +from src.detectors.date.ru import detect_dates_ru + +def test_detect_dates_pl(): + text = "1.01.2022 Ñ Ð¿Ñ€Ð¾Ð´Ð°Ð» кошку. 5 Ð°Ð¿Ñ€ÐµÐ»Ñ 2021 Ñ ÐºÑƒÐ¿Ð¸Ð» Ñобаку." + found_dates = detect_dates_ru(text) + + assert found_dates == [(0,9,"1.01.2022"), (26,39, "5 Ð°Ð¿Ñ€ÐµÐ»Ñ 2021")] \ No newline at end of file diff --git a/tests/detectors/email/__init__.py b/tests/detectors/email/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py new file mode 100644 index 0000000..05b3e63 --- /dev/null +++ b/tests/detectors/email/test_email.py @@ -0,0 +1,7 @@ +from src.detectors.email import detect_emails + +def test_detect_emails(): + text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" + found_emails = detect_emails(text, "en") + + assert found_emails == [(12, 30, "arkadiusz@borek.pw"), (53, 78, "arkadiusz.dump@pwr.edu.pl")] \ No newline at end of file diff --git a/tests/detectors/phone/__init__.py b/tests/detectors/phone/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py new file mode 100644 index 0000000..b2efe23 --- /dev/null +++ b/tests/detectors/phone/test_phone.py @@ -0,0 +1,7 @@ +from src.detectors.phone.phone import detect_phone_numbers + +def test_detect_phone_numbers(): + text = "My phone number is +48 123 456 789. My friend's number is 123456789." 
+ found_phone_numbers = detect_phone_numbers(text, "en") + + assert found_phone_numbers == [(19, 34, '+48 123 456 789'), (58, 67, '123456789')] \ No newline at end of file diff --git a/tests/detectors/url/__init__.py b/tests/detectors/url/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py new file mode 100644 index 0000000..ad22f69 --- /dev/null +++ b/tests/detectors/url/test_url.py @@ -0,0 +1,16 @@ +from src.detectors.url import detect_urls + +def test_detect_urls(): + text = "This is a test for www.google.com. Make sure to go to https://www.google.com" + found_urls = detect_urls(text, "en") + + assert found_urls == [(19, 33, 'www.google.com'), (54, 76, 'https://www.google.com')] + +def test_detect_urls_pl(): + text = "m.in. https://www.google.com" + found_urls_pl = detect_urls(text, "pl") + found_urls_en = detect_urls(text, "en") + + # m.in. is the Polish abbreviation of "między innymi" ("among others"). It should not be detected as a URL. + assert found_urls_pl == [(6, 28, 'https://www.google.com')] + assert found_urls_en == [(0, 4, "m.in"), (6, 28, 'https://www.google.com')] \ No newline at end of file diff --git a/tests/detectors/user/__init__.py b/tests/detectors/user/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py new file mode 100644 index 0000000..b198f71 --- /dev/null +++ b/tests/detectors/user/test_user.py @@ -0,0 +1,7 @@ +from src.detectors.user.user import detect_users + +def test_detect_users(): + text = "My username is @john_smith. My friend's username is @jane_doe." + found_users = detect_users(text, "en") + + assert found_users == [(15, 26, '@john_smith'), (52, 61, '@jane_doe')] \ No newline at end of file -- GitLab