Review notes (text placed before the first "diff --git" header; git apply
skips it):
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Line counts agree with the hunk headers, but the
  indentation of the .ccl lines and the whitespace inside blank lines are
  assumptions -- TODO confirm against the real files before applying.
* src/base_anonymizer.py: hunk dropped. It only added a bare
  regex.compile(...) expression whose result was discarded (the same pattern
  is USER_REGEX in src/detectors/user/user.py) and an unfinished comment.
* src/detectors/url/common.py: exceptions are escaped, the (*SKIP)(*FAIL)
  prefix is omitted for an empty exception list, and the return annotation
  no longer claims str.
* src/detectors/url/url.py: non-Polish text now uses URL_REGEX_GENERAL
  instead of generate_url_regex(language), which treated the language code
  as the exception list.
* src/detectors/phone/phone.py: docstring said "detected_date".
* tests/detectors/date/test_ru.py: test was named test_detect_dates_pl.
* "index" lines are omitted for the four changed files above because their
  new blob hashes are not known.

diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
index d8db042b7f3caf0b2957d27b2d6c34e91dfc9117..b19c4001e36a636590731e21ea65858e61f31073 100644
--- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
+++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl
@@ -7,37 +7,39 @@
     <orth>Marek</orth>
     <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex>
     <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex>
-    <ann chan="person_first_nam" head="1">1</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv" head="1">1</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>Kowalski</orth>
     <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam" head="1">1</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">1</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>pojechał</orth>
     <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>do</orth>
     <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam">0</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
    <tok>
     <orth>Wrocławia</orth>
     <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex>
-    <ann chan="person_first_nam">0</ann>
-    <ann chan="person_last_nam">0</ann>
-    <ann chan="city_nam" head="1">1</ann>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc" head="1">1</ann>
+   </tok>
+   <ns/>
+   <tok>
+    <orth>.</orth>
+    <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
+    <ann chan="nam_liv">0</ann>
+    <ann chan="nam_loc">0</ann>
    </tok>
   </sentence>
  </chunk>
diff --git a/src/detectors/__init__.py b/src/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/detectors/date/__init__.py b/src/detectors/date/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c5b35bc8b4eed33dffff4b062fbf6cb37187049
--- /dev/null
+++ b/src/detectors/date/__init__.py
@@ -0,0 +1 @@
+from src.detectors.date.date import find_dates
\ No newline at end of file
diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py
new file mode 100644
index 0000000000000000000000000000000000000000..85e34b4dac03d9adfd0a32d679a50ce2ba0dce21
--- /dev/null
+++ b/src/detectors/date/date.py
@@ -0,0 +1,23 @@
+from typing import List, Tuple
+from .en import detect_dates_en
+from .pl import detect_dates_pl
+from .ru import detect_dates_ru
+
+def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]:
+    """
+    Finds dates in the text.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+
+    language_processors = {
+        "en": detect_dates_en,
+        "pl": detect_dates_pl,
+        "ru": detect_dates_ru
+    }
+
+    return language_processors.get(language, detect_dates_en)(text)
\ No newline at end of file
diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py
new file mode 100644
index 0000000000000000000000000000000000000000..594e663cb67fed4573da82741d2353c2aff7861d
--- /dev/null
+++ b/src/detectors/date/en.py
@@ -0,0 +1,36 @@
+import regex as re
+from typing import List, Tuple
+
+EN_DATES_REGEX = re.compile(
+    r'\b(?P<day_or_month_year>'
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'
+
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
+
+    r'(?P<month_in_words>'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+    r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|'
+    r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)'
+    r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b'
+    r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?'
+    r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?'
+    r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I
+)
+
+def detect_dates_en(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects English dates in the text.
+    :param text: the text to be searched
+    :type text: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    matches = EN_DATES_REGEX.finditer(text)
+    dates = []
+    for match in matches:
+        dates.append((match.start(), match.end(), match.group()))
+    return dates
\ No newline at end of file
diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7001b9f5b82f622dec9a03ca370740cc4a2ae9c0
--- /dev/null
+++ b/src/detectors/date/pl.py
@@ -0,0 +1,39 @@
+import regex as re
+from typing import List, Tuple
+
+PL_DATES_REGEX = re.compile(
+    r'\b(?P<day_or_month_year>'
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'
+
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
+
+    r'(?P<month_in_words>'
+    r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+    r'\b(?P<month>Sty(?:|cze[nń]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|'
+    r'Kwi(?:|ecie[nń]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)'
+    r'|Sie(?:|rpie[nń]|rpnia)|Wrz(?:|esie[nń]|e[śs]nia)'
+    r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)'
+    r'|Gru(?:|dzie[nń]|dnia))\b'
+    r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
+    r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
+    r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I
+)
+
+def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects Polish dates in the text.
+    :param text: the text to be searched
+    :type text: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    matches = PL_DATES_REGEX.finditer(text)
+    dates = []
+    for match in matches:
+        dates.append((match.start(), match.end(), match.group()))
+    return dates
\ No newline at end of file
diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py
new file mode 100644
index 0000000000000000000000000000000000000000..91017c8c32e44cfd66d8bb94692cd207e6551309
--- /dev/null
+++ b/src/detectors/date/ru.py
@@ -0,0 +1,39 @@
+import regex as re
+from typing import List, Tuple
+
+RU_DATES_REGEX = re.compile(
+    r'\b(?P<day_or_month_year>'
+    r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})'
+    r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})'
+    r'(?P<year1>\d{4}|\d{2}))\b|'
+
+    r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})'
+    r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)'
+    r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|'
+
+    r'(?P<month_in_words>'
+    r'(?!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b)'
+    r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?'
+    r'\b(?P<month>Янв(?:|ар[ьея])|Фев(?:|рал[ьея])|Мар(?:|т|те|та)|'
+    r'Апр(?:|ел[ьея])|Ма[йея]|Июн(?:|[ьея])|Июл(?:|[ьея])|'
+    r'Авг(?:|уст|уст[еа])|Сен(?:|тябр[ьея])|Окт(?:|ябр[ьея])|'
+    r'Ноя(?:|бр[ьея])|Дек(?:|абр[ьея]))\b'
+    r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))'
+    r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|'
+    r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?'
+    r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', re.I
+)
+
+def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects Russian dates in the text.
+    :param text: the text to be searched
+    :type text: str
+    :return: a list of tuples containing (start, end, detected_date)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    matches = RU_DATES_REGEX.finditer(text)
+    dates = []
+    for match in matches:
+        dates.append((match.start(), match.end(), match.group()))
+    return dates
\ No newline at end of file
diff --git a/src/detectors/email/__init__.py b/src/detectors/email/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..58050bc5531df68ba7ed8597be022326571632c5
--- /dev/null
+++ b/src/detectors/email/__init__.py
@@ -0,0 +1 @@
+from src.detectors.email.email import detect_emails
\ No newline at end of file
diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0637ec67652440e3e2fb7c961c78ff80d55ecfe
--- /dev/null
+++ b/src/detectors/email/email.py
@@ -0,0 +1,26 @@
+import regex as re
+from typing import List, Tuple
+
+EMAIL_REGEX = re.compile(
+    r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+'
+    r'(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)'
+    r'(?P<domain>(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+)'
+    r'(?P<tld>[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)', re.I
+)
+
+
+def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects emails in the text.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_email)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    matches = EMAIL_REGEX.finditer(text)
+    emails = []
+    for match in matches:
+        emails.append((match.start(), match.end(), match.group()))
+    return emails
\ No newline at end of file
diff --git a/src/detectors/phone/__init__.py b/src/detectors/phone/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e30518ddfea1425e77e56db0634e457a21cc21fe
--- /dev/null
+++ b/src/detectors/phone/__init__.py
@@ -0,0 +1 @@
+from src.detectors.phone.phone import detect_phone_numbers
\ No newline at end of file
diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py
new file mode 100644
--- /dev/null
+++ b/src/detectors/phone/phone.py
@@ -0,0 +1,24 @@
+import regex as re
+from typing import List, Tuple
+
+PHONE_NUMBER_REGEX = re.compile(
+    r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?'
+    r'(?P<number>(\d[- ]??){9,10})'
+)
+
+
+def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects phone numbers in the text.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_phone_number)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    matches = PHONE_NUMBER_REGEX.finditer(text)
+    phone_numbers = []
+    for match in matches:
+        phone_numbers.append((match.start(), match.end(), match.group()))
+    return phone_numbers
\ No newline at end of file
diff --git a/src/detectors/url/__init__.py b/src/detectors/url/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..72b8dc675cbb54f91189f3450bd16bb45c9b75b1
--- /dev/null
+++ b/src/detectors/url/__init__.py
@@ -0,0 +1 @@
+from src.detectors.url.url import detect_urls
diff --git a/src/detectors/url/common.py b/src/detectors/url/common.py
new file mode 100644
--- /dev/null
+++ b/src/detectors/url/common.py
@@ -0,0 +1,44 @@
+import regex as re
+from typing import List, Tuple
+
+# URL pattern: optional protocol and auth part, then either an IPv4 address
+# outside the 10/127/169.254/172.16-31/192.168 ranges or a host name,
+# followed by a TLD and an optional port and path.
+_URL_PATTERN = (
+    r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?'
+    r'(?P<auth>\S+(?::\S*)?@)?'
+    r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})'
+    r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})'
+    r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})'
+    r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
+    r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}'
+    r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'
+    r'|'
+    r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?'
+    r'[a-z0-9\u00a1-\uffff]\.)+)'
+    r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)'
+    r'(?P<port>:\d{2,5})?'
+    r'(?P<path>[/?#]\S*)?)'
+)
+
+
+def generate_url_regex(exeptions: List[str]) -> "re.Pattern":
+    """
+    Builds the compiled URL regex, optionally skipping given expressions.
+    :param exeptions: literal strings (e.g. the abbreviation "m.in") that
+        must not be reported as URLs
+    :type exeptions: List[str]
+    :return: the compiled, case-insensitive URL pattern
+    :rtype: re.Pattern
+    """
+    pattern = _URL_PATTERN
+    if exeptions:
+        # Escape each entry so that "." only matches a literal dot. The
+        # prefix is left out for an empty list because '|'.join([]) is '',
+        # which would leave an empty group matching at every word boundary.
+        skipped = '|'.join(re.escape(exeption) for exeption in exeptions)
+        pattern = r'\b(?:{})\b(*SKIP)(*FAIL)|'.format(skipped) + pattern
+    return re.compile(pattern, re.UNICODE | re.I)
+
+
+URL_REGEX_GENERAL = generate_url_regex([])
diff --git a/src/detectors/url/pl.py b/src/detectors/url/pl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d1a9edd62b640216f32c84d4aa59195058b2846
--- /dev/null
+++ b/src/detectors/url/pl.py
@@ -0,0 +1,5 @@
+from .common import generate_url_regex
+
+PL_URL_REGEX_EXEPTIONS = ["m.in"]
+
+URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS)
\ No newline at end of file
diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py
new file mode 100644
--- /dev/null
+++ b/src/detectors/url/url.py
@@ -0,0 +1,24 @@
+import regex as re
+from typing import List, Tuple
+from .pl import URL_REGEX_PL
+from .common import URL_REGEX_GENERAL, generate_url_regex
+
+def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects urls in the text.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_url)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    # Use the patterns compiled at import time. generate_url_regex expects a
+    # list of exceptions, so a language code such as "en" passed to it would
+    # be joined character by character into the alternation "e|n".
+    url_regex = URL_REGEX_PL if language == "pl" else URL_REGEX_GENERAL
+
+    return [
+        (match.start(), match.end(), match.group())
+        for match in url_regex.finditer(text)
+    ]
diff --git a/src/detectors/user/__init__.py b/src/detectors/user/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ba0c10ddd3561d0f4d232f522ee0f67a3a1fa2c
--- /dev/null
+++ b/src/detectors/user/__init__.py
@@ -0,0 +1 @@
+from src.detectors.user.user import detect_users
\ No newline at end of file
diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d8f0352540682829b8cd154e227c48c38110e90
--- /dev/null
+++ b/src/detectors/user/user.py
@@ -0,0 +1,20 @@
+import regex as re
+from typing import List, Tuple
+
+USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)')
+
+def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]:
+    """
+    Detects users in the text.
+    :param text: the text to be searched
+    :type text: str
+    :param language: the language of the text
+    :type language: str
+    :return: a list of tuples containing (start, end, detected_user)
+    :rtype: List[Tuple[int, int, str]]
+    """
+    matches = USER_REGEX.finditer(text)
+    users = []
+    for match in matches:
+        users.append((match.start(), match.end(), match.group()))
+    return users
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/__init__.py b/tests/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/date/__init__.py b/tests/detectors/date/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py
new file mode 100644
index 0000000000000000000000000000000000000000..429ee2ab0cd6ef71af64494cb34c4b467075323c
--- /dev/null
+++ b/tests/detectors/date/test_en.py
@@ -0,0 +1,16 @@
+from src.detectors.date.en import detect_dates_en
+
+def test_detect_dates_en():
+    # Check en-us
+    text = "On 1.01.2022, I sold my cat. On April 5, 2021, I bought a dog."
+    found_dates = detect_dates_en(text)
+
+    assert found_dates == [(3,12,"1.01.2022"), (32,45, "April 5, 2021")]
+
+    # Check en-gb
+    # TODO: Following test fails. Fix it.
+    # text = "On 1.01.2022 I sold the cat. On 5th April 2021 I bought a dog."
+    # found_dates = detect_dates_en(text)
+
+    # assert found_dates == [(3,12,"1.01.2022"), (32,46, "5th April 2021")]
+    
\ No newline at end of file
diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py
new file mode 100644
index 0000000000000000000000000000000000000000..a441c3684f7a6fbf7721c5a4b79854f4f36a4106
--- /dev/null
+++ b/tests/detectors/date/test_pl.py
@@ -0,0 +1,7 @@
+from src.detectors.date.pl import detect_dates_pl
+
+def test_detect_dates_pl():
+    text = "W dniu 1.01.2022 sprzedałem kota. 5 kwietnia 2021 roku kupiłem psa."
+    found_dates = detect_dates_pl(text)
+
+    assert found_dates == [(7,16,"1.01.2022"), (34,49, "5 kwietnia 2021")]
\ No newline at end of file
diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py
new file mode 100644
--- /dev/null
+++ b/tests/detectors/date/test_ru.py
@@ -0,0 +1,7 @@
+from src.detectors.date.ru import detect_dates_ru
+
+def test_detect_dates_ru():
+    text = "1.01.2022 я продал кошку. 5 апреля 2021 я купил собаку."
+    found_dates = detect_dates_ru(text)
+
+    assert found_dates == [(0,9,"1.01.2022"), (26,39, "5 апреля 2021")]
\ No newline at end of file
diff --git a/tests/detectors/email/__init__.py b/tests/detectors/email/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py
new file mode 100644
index 0000000000000000000000000000000000000000..05b3e63fc1fc1496e3dd468b6305d29040ff9077
--- /dev/null
+++ b/tests/detectors/email/test_email.py
@@ -0,0 +1,7 @@
+from src.detectors.email import detect_emails
+
+def test_detect_emails():
+    text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl"
+    found_emails = detect_emails(text, "en")
+
+    assert found_emails == [(12, 30, "arkadiusz@borek.pw"), (53, 78, "arkadiusz.dump@pwr.edu.pl")]
\ No newline at end of file
diff --git a/tests/detectors/phone/__init__.py b/tests/detectors/phone/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2efe2366531a3f183bf06b526343d1b431ed673
--- /dev/null
+++ b/tests/detectors/phone/test_phone.py
@@ -0,0 +1,7 @@
+from src.detectors.phone.phone import detect_phone_numbers
+
+def test_detect_phone_numbers():
+    text = "My phone number is +48 123 456 789. My friend's number is 123456789."
+    found_phone_numbers = detect_phone_numbers(text, "en")
+
+    assert found_phone_numbers == [(19, 34, '+48 123 456 789'), (58, 67, '123456789')]
\ No newline at end of file
diff --git a/tests/detectors/url/__init__.py b/tests/detectors/url/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad22f6968833aa58da27d87dfea5ff363aa6f8b8
--- /dev/null
+++ b/tests/detectors/url/test_url.py
@@ -0,0 +1,16 @@
+from src.detectors.url import detect_urls
+
+def test_detect_urls():
+    text = "This is a test for www.google.com. Make sure to go to https://www.google.com"
+    found_urls = detect_urls(text, "en")
+
+    assert found_urls == [(19, 33, 'www.google.com'), (54, 76, 'https://www.google.com')]
+
+def test_detect_urls_pl():
+    text = "m.in. https://www.google.com"
+    found_urls_pl = detect_urls(text, "pl")
+    found_urls_en = detect_urls(text, "en")
+
+    # m.in is a valid shortcut for między innymi in Polish. It should not be detected as a URL.
+    assert found_urls_pl == [(6, 28, 'https://www.google.com')]
+    assert found_urls_en == [(0, 4, "m.in"), (6, 28, 'https://www.google.com')]
\ No newline at end of file
diff --git a/tests/detectors/user/__init__.py b/tests/detectors/user/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py
new file mode 100644
index 0000000000000000000000000000000000000000..b198f71653965c69d101c64d08941c67dad3c844
--- /dev/null
+++ b/tests/detectors/user/test_user.py
@@ -0,0 +1,7 @@
+from src.detectors.user.user import detect_users
+
+def test_detect_users():
+    text = "My username is @john_smith. My friend's username is @jane_doe."
+    found_users = detect_users(text, "en")
+
+    assert found_users == [(15, 26, '@john_smith'), (52, 61, '@jane_doe')]
\ No newline at end of file