From c814a2c1ad346c5ae79ec37e772295cd52dee768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Thu, 19 Jan 2023 08:47:36 +0100 Subject: [PATCH] Fix utils.py beeing deleted in last commit, rename annotations to detections --- src/annotations/__init__.py | 2 - src/detections/__init__.py | 2 + src/{annotations => detections}/date.py | 4 +- .../annotation.py => detections/detection.py} | 26 +++---- src/detectors/date/date.py | 6 +- src/detectors/date/en.py | 6 +- src/detectors/date/pl.py | 6 +- src/detectors/date/ru.py | 6 +- src/detectors/date/utils.py | 72 +++++++++---------- src/detectors/email/email.py | 8 +-- src/detectors/interface.py | 4 +- src/detectors/ner/ner.py | 4 +- src/detectors/ner/pl_liner_n5.py | 6 +- src/detectors/phone/phone.py | 8 +-- src/detectors/url/url.py | 8 +-- src/detectors/user/user.py | 8 +-- src/dictionaries/morphosyntactic/interface.py | 4 +- src/dictionaries/morphosyntactic/pl_ner.py | 16 ++--- src/mappings/ner_pl_n5_mapping.py | 22 +++--- src/replacers/date_replacer.py | 36 +++++----- src/replacers/delete_replacer.py | 6 +- src/replacers/email_replacer.py | 10 +-- src/replacers/interface.py | 6 +- src/replacers/ner_replacer.py | 8 +-- src/replacers/tag_replacer.py | 56 +++++++-------- src/replacers/user_replacer.py | 10 +-- src/utils/__init__.py | 0 src/utils/utils.py | 33 +++++++++ tests/detectors/date/test_en.py | 24 +++---- tests/detectors/date/test_pl.py | 24 +++---- tests/detectors/date/test_ru.py | 24 +++---- tests/detectors/email/test_email.py | 4 +- tests/detectors/ner/test_pl_liner_n5.py | 10 +-- tests/detectors/phone/test_phone.py | 4 +- tests/detectors/url/test_url.py | 8 +-- tests/detectors/user/test_user.py | 4 +- .../morphosyntactic/test_pl_ner.py | 20 +++--- tests/pipeline/test_default.py | 4 +- tests/replacers/test_date_replacer.py | 22 +++--- tests/replacers/test_email_replacer.py | 18 ++--- tests/replacers/test_ner_replacer.py | 22 +++--- tests/replacers/test_tag_replacer.py | 
10 +-- tests/replacers/test_user_replacer.py | 18 ++--- 43 files changed, 316 insertions(+), 283 deletions(-) delete mode 100644 src/annotations/__init__.py create mode 100644 src/detections/__init__.py rename src/{annotations => detections}/date.py (91%) rename src/{annotations/annotation.py => detections/detection.py} (73%) create mode 100644 src/utils/__init__.py create mode 100644 src/utils/utils.py diff --git a/src/annotations/__init__.py b/src/annotations/__init__.py deleted file mode 100644 index cf65f5a..0000000 --- a/src/annotations/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from src.annotations.annotation import * -from src.annotations.date import * \ No newline at end of file diff --git a/src/detections/__init__.py b/src/detections/__init__.py new file mode 100644 index 0000000..a26fabc --- /dev/null +++ b/src/detections/__init__.py @@ -0,0 +1,2 @@ +from src.detections.detection import * +from src.detections.date import * \ No newline at end of file diff --git a/src/annotations/date.py b/src/detections/date.py similarity index 91% rename from src/annotations/date.py rename to src/detections/date.py index 502004c..4ae956b 100644 --- a/src/annotations/date.py +++ b/src/detections/date.py @@ -1,7 +1,7 @@ -from src.annotations.annotation import Annotation +from src.detections.detection import Detection from typing import List, Tuple, Optional -class DateAnnotation(Annotation): +class DateDetection(Detection): class AnnotationPart: TWO_DIGITS_DAY = "DD" ONE_DIGIT_DAY = "D" diff --git a/src/annotations/annotation.py b/src/detections/detection.py similarity index 73% rename from src/annotations/annotation.py rename to src/detections/detection.py index 69e6f50..59d712d 100644 --- a/src/annotations/annotation.py +++ b/src/detections/detection.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from typing import Optional @dataclass -class Annotation: +class Detection: def __init__(self, type_name: str) -> None: self._type_name = type_name @@ -18,50 +18,50 @@ 
class MorphosyntacticInfoMixin: def morpho_tag(self) -> str: return self._morpho_tag -class NameAnnotation(MorphosyntacticInfoMixin, Annotation): +class NameDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="name") -class SurnameAnnotation(MorphosyntacticInfoMixin, Annotation): +class SurnameDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="surname") -class StreetNameAnnotation(MorphosyntacticInfoMixin, Annotation): +class StreetNameDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="street_name") -class CityAnnotation(MorphosyntacticInfoMixin, Annotation): +class CityDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="city") -class CountryAnnotation(MorphosyntacticInfoMixin, Annotation): +class CountryDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="country") -class PhoneNumberAnnotation(Annotation): +class PhoneNumberDetection(Detection): def __init__(self) -> None: super().__init__("phone_number") -class UrlAnnotation(Annotation): +class UrlDetection(Detection): def __init__(self) -> None: super().__init__("url") -class UserAnnotation(Annotation): +class UserDetection(Detection): def __init__(self) -> None: super().__init__("user") -class EmailAnnotation(Annotation): +class EmailDetection(Detection): def __init__(self) -> None: super().__init__("email") -class TINAnnotation(Annotation): # Tax Identification Number +class TINDetection(Detection): # Tax Identification Number def __init__(self) -> None: 
super().__init__("tin") -class KRSAnnotation(Annotation): # National Court Register +class KRSDetection(Detection): # National Court Register def __init__(self) -> None: super().__init__("krs") -class OtherAnnotation(Annotation): # Non standard entity +class OtherDetection(Detection): # Non standard entity def __init__(self) -> None: super().__init__("other") \ No newline at end of file diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py index b8a8ac2..c232ace 100644 --- a/src/detectors/date/date.py +++ b/src/detectors/date/date.py @@ -2,7 +2,7 @@ from typing import List, Dict, Any, Tuple from .en import detect_dates_en from .pl import detect_dates_pl from .ru import detect_dates_ru -from src.annotations import Annotation, DateAnnotation +from src.detections import Detection, DateDetection from src.detectors.interface import Detector @@ -12,13 +12,13 @@ class DateDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, DateAnnotation]]: + ) -> List[Tuple[int, int, DateDetection]]: return find_dates(text, self._language) def find_dates( text: str, language: str = "pl" -) -> List[Tuple[int, int, DateAnnotation]]: +) -> List[Tuple[int, int, DateDetection]]: """ Finds dates in the text. :param text: the text to be searched diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py index 5c8467e..142c12b 100644 --- a/src/detectors/date/en.py +++ b/src/detectors/date/en.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Tuple -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format @@ -24,7 +24,7 @@ EN_DATES_REGEX = re.compile( r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I ) -def detect_dates_en(text: str) -> List[Tuple[int, int, DateAnnotation]]: +def detect_dates_en(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects English dates in the text. 
:param text: the text to be searched @@ -36,5 +36,5 @@ def detect_dates_en(text: str) -> List[Tuple[int, int, DateAnnotation]]: dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateAnnotation(format))) + dates.append((match.start(), match.end(), DateDetection(format))) return dates \ No newline at end of file diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index a16ac47..e4bbf45 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Tuple -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format @@ -27,7 +27,7 @@ PL_DATES_REGEX = re.compile( r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I ) -def detect_dates_pl(text: str) -> List[Tuple[int, int, DateAnnotation]]: +def detect_dates_pl(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects Polish dates in the text. 
:param text: the text to be searched @@ -40,5 +40,5 @@ def detect_dates_pl(text: str) -> List[Tuple[int, int, DateAnnotation]]: dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateAnnotation(format))) + dates.append((match.start(), match.end(), DateDetection(format))) return dates \ No newline at end of file diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py index aacdf2f..02f2ed9 100644 --- a/src/detectors/date/ru.py +++ b/src/detectors/date/ru.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Tuple -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format @@ -27,7 +27,7 @@ RU_DATES_REGEX = re.compile( r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', re.I ) -def detect_dates_ru(text: str) -> List[Tuple[int, int, DateAnnotation]]: +def detect_dates_ru(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects Russian dates in the text. 
:param text: the text to be searched @@ -39,6 +39,6 @@ def detect_dates_ru(text: str) -> List[Tuple[int, int, DateAnnotation]]: dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateAnnotation(format))) + dates.append((match.start(), match.end(), DateDetection(format))) return dates \ No newline at end of file diff --git a/src/detectors/date/utils.py b/src/detectors/date/utils.py index 9d7bab2..9dbf8f2 100644 --- a/src/detectors/date/utils.py +++ b/src/detectors/date/utils.py @@ -1,104 +1,104 @@ from typing import List, Tuple -from src.annotations import DateAnnotation, Optional +from src.detections import DateDetection, Optional -def _parse_day_or_month(re_entry) -> List[Tuple[int, int, DateAnnotation]]: +def _parse_day_or_month(re_entry) -> List[Tuple[int, int, DateDetection]]: assert re_entry["day_or_month_year"] is not None result = [] if re_entry["day_month1"] is not None: if len(re_entry["day_month1"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month1"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) if len(re_entry["day_month2"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, 
re_entry["punct1"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) elif "day_month2" in re_entry: if len(re_entry["day_month2"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct1"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) if "year1" in re_entry: if len(re_entry["year1"]) == 2: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year1"])) else: - result.append((DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year1"])) + result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year1"])) return result -def _parse_year_month_or_day(re_entry) -> List[Tuple[int, int, DateAnnotation]]: +def _parse_year_month_or_day(re_entry) -> List[Tuple[int, int, DateDetection]]: assert re_entry["year_month_or_day"] is not None result = [] if "year2" in re_entry: if len(re_entry["year2"]) == 2: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year2"])) else: - result.append((DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year2"])) + result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct3"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct3"])) if "day_month3" in re_entry: if len(re_entry["day_month3"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + 
re_entry["day_month3"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month3"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month3"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct4"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month3"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct4"])) if "day_month4" in re_entry: if len(re_entry["day_month4"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month4"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month4"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month4"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month4"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct4"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct4"])) return result -def _parse_month_in_words(re_entry) -> List[Tuple[DateAnnotation.AnnotationPart, str]]: +def _parse_month_in_words(re_entry) -> List[Tuple[DateDetection.AnnotationPart, str]]: assert re_entry["month_in_words"] is not None result = [] if re_entry["day1"] is not None: if len(re_entry["day1"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day1"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day1"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct5"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct5"])) if re_entry["month"] is not None: - result.append((DateAnnotation.AnnotationPart.TEXT_MONTH, re_entry["month"])) + 
result.append((DateDetection.AnnotationPart.TEXT_MONTH, re_entry["month"])) if re_entry["day1"] is None: - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct7"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct7"])) else: - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct6"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct6"])) if re_entry["day2"] is not None: if len(re_entry["day2"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day2"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct6"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day2"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct6"])) if re_entry["year3"] is not None: if len(re_entry["year3"]) == 2: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year3"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year3"])) else: - result.append((DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year3"])) + result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year3"])) return result -def _parse_date_to_format(re_entry) -> Optional[List[Tuple[DateAnnotation.AnnotationPart, str]]]: +def _parse_date_to_format(re_entry) -> Optional[List[Tuple[DateDetection.AnnotationPart, str]]]: if re_entry["day_or_month_year"] is not None: result = _parse_day_or_month(re_entry) elif re_entry["year_month_or_day"] is not None: diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py index 2e0075d..f14d9fc 100644 --- a/src/detectors/email/email.py +++ b/src/detectors/email/email.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Dict, Any, Tuple -from src.annotations import 
EmailAnnotation +from src.detections import EmailDetection from src.detectors.interface import Detector @@ -10,7 +10,7 @@ class EmailDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, EmailAnnotation]]: + ) -> List[Tuple[int, int, EmailDetection]]: return detect_emails(text) @@ -23,7 +23,7 @@ EMAIL_REGEX = re.compile( ) -def detect_emails(text: str) -> List[Tuple[int, int, EmailAnnotation]]: +def detect_emails(text: str) -> List[Tuple[int, int, EmailDetection]]: """ Detects emails in the text. :param text: the text to be searched @@ -36,5 +36,5 @@ def detect_emails(text: str) -> List[Tuple[int, int, EmailAnnotation]]: matches = EMAIL_REGEX.finditer(text) emails = [] for match in matches: - emails.append((match.start(), match.end(), EmailAnnotation())) + emails.append((match.start(), match.end(), EmailDetection())) return emails diff --git a/src/detectors/interface.py b/src/detectors/interface.py index b32cd06..325d6d6 100644 --- a/src/detectors/interface.py +++ b/src/detectors/interface.py @@ -1,9 +1,9 @@ from typing import List, Dict, Any, Tuple -from src.annotations import Annotation +from src.detections import Detection class Detector: def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, Annotation]]: + ) -> List[Tuple[int, int, Detection]]: raise NotImplementedError diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py index e5611c2..6c4ae8a 100644 --- a/src/detectors/ner/ner.py +++ b/src/detectors/ner/ner.py @@ -1,7 +1,7 @@ from typing import List, Dict, Any, Tuple from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 from src.detectors.interface import Detector -from src.annotations import Annotation +from src.detections import Detection class NerDetector(Detector): @@ -15,7 +15,7 @@ class NerDetector(Detector): def detect_ner( - ccl_annotations: Dict[str, List[Tuple[int, int, Annotation]]], language: str + 
ccl_annotations: Dict[str, List[Tuple[int, int, Detection]]], language: str ) -> List[Tuple[int, int, str]]: if language == "pl": return detect_ner_pl_liner_n5(ccl_annotations) diff --git a/src/detectors/ner/pl_liner_n5.py b/src/detectors/ner/pl_liner_n5.py index 0ea13a9..d51cfa9 100644 --- a/src/detectors/ner/pl_liner_n5.py +++ b/src/detectors/ner/pl_liner_n5.py @@ -1,10 +1,10 @@ from typing import List, Tuple, Dict from src.utils.utils import subdict -from src.annotations import OtherAnnotation, Annotation +from src.detections import OtherDetection, Detection from src.mappings.ner_pl_n5_mapping import NER_PL_N5_MAPPING def detect_ner_pl_liner_n5( - ccl_annotations: Dict[str, List[Tuple[int, int, Annotation]]] + ccl_annotations: Dict[str, List[Tuple[int, int, Detection]]] ) -> List[Tuple[int, int, str]]: """ Detects ner entities in the text based on liner_n5 NER ontology. @@ -21,7 +21,7 @@ def detect_ner_pl_liner_n5( ) return [ - (start, end, NER_PL_N5_MAPPING.get(entity_type, OtherAnnotation)()) + (start, end, NER_PL_N5_MAPPING.get(entity_type, OtherDetection)()) for entity_type, entity in names.items() for start, end, _ in entity ] diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py index ca88264..861a5e1 100644 --- a/src/detectors/phone/phone.py +++ b/src/detectors/phone/phone.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Dict, Any, Tuple -from src.annotations import PhoneNumberAnnotation +from src.detections import PhoneNumberDetection from src.detectors.interface import Detector @@ -10,7 +10,7 @@ class PhoneNumberDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, PhoneNumberAnnotation]]: + ) -> List[Tuple[int, int, PhoneNumberDetection]]: return detect_phone_numbers(text) @@ -20,7 +20,7 @@ PHONE_NUMBER_REGEX = re.compile( ) -def detect_phone_numbers(text: str) -> List[Tuple[int, int, PhoneNumberAnnotation]]: +def detect_phone_numbers(text: str) 
-> List[Tuple[int, int, PhoneNumberDetection]]: """ Detects phone numbers in the text. :param text: the text to be searched @@ -33,5 +33,5 @@ def detect_phone_numbers(text: str) -> List[Tuple[int, int, PhoneNumberAnnotatio matches = PHONE_NUMBER_REGEX.finditer(text) phone_numbers = [] for match in matches: - phone_numbers.append((match.start(), match.end(), PhoneNumberAnnotation())) + phone_numbers.append((match.start(), match.end(), PhoneNumberDetection())) return phone_numbers diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py index 63c83db..ac67b2d 100644 --- a/src/detectors/url/url.py +++ b/src/detectors/url/url.py @@ -2,7 +2,7 @@ import regex as re from typing import List, Dict, Any, Tuple from .pl import URL_REGEX_PL from .common import generate_url_regex -from src.annotations import UrlAnnotation +from src.detections import UrlDetection from src.detectors.interface import Detector @@ -12,11 +12,11 @@ class UrlDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, UrlAnnotation]]: + ) -> List[Tuple[int, int, UrlDetection]]: return detect_urls(text, self._language) -def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlAnnotation]]: +def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlDetection]]: """ Detects urls in the text. 
:param text: the text to be searched @@ -34,6 +34,6 @@ def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlAnnotation] matches = url_regex.finditer(text) urls = [] for match in matches: - urls.append((match.start(), match.end(), UrlAnnotation())) + urls.append((match.start(), match.end(), UrlDetection())) return urls diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py index ca8d483..3a891ab 100644 --- a/src/detectors/user/user.py +++ b/src/detectors/user/user.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Dict, Any, Tuple -from src.annotations import UserAnnotation +from src.detections import UserDetection from src.detectors.interface import Detector @@ -10,14 +10,14 @@ class UserDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, UserAnnotation]]: + ) -> List[Tuple[int, int, UserDetection]]: return detect_users(text) USER_REGEX = re.compile(r"\B(?P<username>\@[\w\-]+)") -def detect_users(text: str) -> List[Tuple[int, int, UserAnnotation]]: +def detect_users(text: str) -> List[Tuple[int, int, UserDetection]]: """ Detects users in the text. 
:param text: the text to be searched @@ -30,5 +30,5 @@ def detect_users(text: str) -> List[Tuple[int, int, UserAnnotation]]: matches = USER_REGEX.finditer(text) users = [] for match in matches: - users.append((match.start(), match.end(), UserAnnotation())) + users.append((match.start(), match.end(), UserDetection())) return users diff --git a/src/dictionaries/morphosyntactic/interface.py b/src/dictionaries/morphosyntactic/interface.py index fe4a938..3f8a66b 100644 --- a/src/dictionaries/morphosyntactic/interface.py +++ b/src/dictionaries/morphosyntactic/interface.py @@ -1,8 +1,8 @@ -from src.annotations import Annotation +from src.detections import Detection from typing import Optional class MorphosyntacticDictionary: - def get_random_replacement(self, original_entry: Annotation) -> Optional[str]: + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: """ Returns a random replacement for the original entry """ diff --git a/src/dictionaries/morphosyntactic/pl_ner.py b/src/dictionaries/morphosyntactic/pl_ner.py index 3f861a6..d25beae 100644 --- a/src/dictionaries/morphosyntactic/pl_ner.py +++ b/src/dictionaries/morphosyntactic/pl_ner.py @@ -1,6 +1,6 @@ from typing import Dict, List, Optional, Tuple, Type from collections import defaultdict -from src.annotations import Annotation, OtherAnnotation, MorphosyntacticInfoMixin +from src.detections import Detection, OtherDetection, MorphosyntacticInfoMixin from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary import random @@ -9,8 +9,8 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): def __init__( self, dictionary_path: Optional[str] = None, - annotation_mapping: Optional[Dict[str, Type[Annotation]]] = None, - list: Optional[List[Tuple[Annotation, str, str, str]]] = None, + annotation_mapping: Optional[Dict[str, Type[Detection]]] = None, + list: Optional[List[Tuple[Detection, str, str, str]]] = None, always_replace=True, ) -> None: super().__init__() @@ 
-26,18 +26,18 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): raise ValueError("Either dictionary_path or list must be provided.") def _from_file( - self, path_to_dictionary: str, annotation_mapping: Dict[str, Type[Annotation]] + self, path_to_dictionary: str, annotation_mapping: Dict[str, Type[Detection]] ) -> None: self._dictionary = load_pl_ner_replacements_dictionary( path_to_dictionary, annotation_mapping ) - def _from_list(self, list: List[Tuple[Annotation, str, str, str]]) -> None: + def _from_list(self, list: List[Tuple[Detection, str, str, str]]) -> None: self._dictionary = defaultdict(lambda: defaultdict(dict)) for annotation, word, lemma, morpho_tag in list: self._dictionary[annotation][morpho_tag][lemma] = word - def get_random_replacement(self, original_entry: Annotation) -> Optional[str]: + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: original_entry_type = type(original_entry) result = None @@ -65,7 +65,7 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): def load_pl_ner_replacements_dictionary( - path: str, ner_mapping: Optional[Dict[str, Type[Annotation]]] = None + path: str, ner_mapping: Optional[Dict[str, Type[Detection]]] = None ) -> Dict[str, Dict[str, Dict[str, str]]]: """ Loads a dictionary that maps named entity tags to lemmas to part-of-speech tags to words. 
@@ -103,7 +103,7 @@ def load_pl_ner_replacements_dictionary( ner_tag, word, lemma, morpho_tag = line.split("\t") if ner_mapping is not None: - ner_tag = ner_mapping.get(ner_tag, OtherAnnotation)() + ner_tag = ner_mapping.get(ner_tag, OtherDetection)() replacement_dictionary[ner_tag][morpho_tag][lemma] = word diff --git a/src/mappings/ner_pl_n5_mapping.py b/src/mappings/ner_pl_n5_mapping.py index a14d9ce..77d5f13 100644 --- a/src/mappings/ner_pl_n5_mapping.py +++ b/src/mappings/ner_pl_n5_mapping.py @@ -1,15 +1,15 @@ -from src.annotations import ( - NameAnnotation, - SurnameAnnotation, - StreetNameAnnotation, - CityAnnotation, - CountryAnnotation, +from src.detections import ( + NameDetection, + SurnameDetection, + StreetNameDetection, + CityDetection, + CountryDetection, ) NER_PL_N5_MAPPING = { - "person_first_nam": NameAnnotation, - "person_last_nam": SurnameAnnotation, - "road_nam": StreetNameAnnotation, - "city_nam": CityAnnotation, - "country_nam": CountryAnnotation, + "person_first_nam": NameDetection, + "person_last_nam": SurnameDetection, + "road_nam": StreetNameDetection, + "city_nam": CityDetection, + "country_nam": CountryDetection, } diff --git a/src/replacers/date_replacer.py b/src/replacers/date_replacer.py index 7f2d681..ecf09b5 100644 --- a/src/replacers/date_replacer.py +++ b/src/replacers/date_replacer.py @@ -1,7 +1,7 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - DateAnnotation, +from src.detections import ( + Detection, + DateDetection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -27,7 +27,7 @@ class DateReplacer(ReplacerInterface): def __init__(self): pass - def replace(self, text: str, detections: List[Tuple[int, int, Annotation]]) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] @@ 
-36,46 +36,46 @@ class DateReplacer(ReplacerInterface): for item in detections: start, end, detection = item - if isinstance(detection, DateAnnotation): + if isinstance(detection, DateDetection): replacement = [] if detection.format is not None: format = detection.format else: format = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), ] if text[start:end] in already_replaced: replacement = already_replaced[text[start:end]] else: for entry in format: - if entry[0] == DateAnnotation.AnnotationPart.TWO_DIGITS_DAY: + if entry[0] == DateDetection.AnnotationPart.TWO_DIGITS_DAY: random_day = random.randint(1, 28) replacement.append(str(random_day).zfill(2)) - elif entry[0] == DateAnnotation.AnnotationPart.ONE_DIGIT_DAY: + elif entry[0] == DateDetection.AnnotationPart.ONE_DIGIT_DAY: random_day = random.randint(1, 28) replacement.append(str(random_day)) - elif entry[0] == DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH: + elif entry[0] == DateDetection.AnnotationPart.TWO_DIGIT_MONTH: random_month = random.randint(1, 12) replacement.append(str(random_month).zfill(2)) - elif entry[0] == DateAnnotation.AnnotationPart.ONE_DIGIT_MONTH: + elif entry[0] == DateDetection.AnnotationPart.ONE_DIGIT_MONTH: random_month = random.randint(1, 12) replacement.append(str(random_month)) - elif entry[0] == DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR: + elif entry[0] == DateDetection.AnnotationPart.FOUR_DIGIT_YEAR: random_year = random.randint(1900, 2020) replacement.append(str(random_year)) - elif entry[0] == 
DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR: + elif entry[0] == DateDetection.AnnotationPart.TWO_DIGIT_YEAR: random_year = random.randint(0, 99) replacement.append(str(random_year).zfill(2)) - elif entry[0] == DateAnnotation.AnnotationPart.TEXT_MONTH: + elif entry[0] == DateDetection.AnnotationPart.TEXT_MONTH: random_month = random.randint(1, 12) month_name = months_map[random_month] replacement.append(month_name) - elif entry[0] == DateAnnotation.AnnotationPart.OTHER: + elif entry[0] == DateDetection.AnnotationPart.OTHER: replacement.append(entry[1]) replacement = "".join(replacement) diff --git a/src/replacers/delete_replacer.py b/src/replacers/delete_replacer.py index 218873b..9a01908 100644 --- a/src/replacers/delete_replacer.py +++ b/src/replacers/delete_replacer.py @@ -1,5 +1,5 @@ from typing import List, Tuple -from src.annotations import Annotation +from src.detections import Detection from src.string_replacements import replace from src.replacers.interface import ReplacerInterface @@ -9,8 +9,8 @@ class DeleteReplacer(ReplacerInterface): pass def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: result = [ (start, end, "") diff --git a/src/replacers/email_replacer.py b/src/replacers/email_replacer.py index 48bf871..104f53e 100644 --- a/src/replacers/email_replacer.py +++ b/src/replacers/email_replacer.py @@ -1,7 +1,7 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - EmailAnnotation, +from src.detections import ( + Detection, + EmailDetection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -18,7 +18,7 @@ class EmailReplacer(ReplacerInterface): def __init__(self): pass - def replace(self, text: str, detections: List[Tuple[int, int, Annotation]]) -> Tuple[str, 
List[Tuple[int, int, Annotation]]]: + def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] @@ -27,7 +27,7 @@ class EmailReplacer(ReplacerInterface): for item in detections: start, end, detection = item - if isinstance(detection, EmailAnnotation): + if isinstance(detection, EmailDetection): if text[start:end] not in already_replaced: already_replaced[text[start:end]] = random_email() diff --git a/src/replacers/interface.py b/src/replacers/interface.py index fcaa21e..f4ed59f 100644 --- a/src/replacers/interface.py +++ b/src/replacers/interface.py @@ -1,13 +1,13 @@ from abc import ABC, abstractmethod from typing import List, Tuple -from src.annotations import Annotation +from src.detections import Detection class ReplacerInterface(ABC): @abstractmethod def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: """Replace detected entities in text with anonimized version. 
Args: diff --git a/src/replacers/ner_replacer.py b/src/replacers/ner_replacer.py index edb10b6..214f0b7 100644 --- a/src/replacers/ner_replacer.py +++ b/src/replacers/ner_replacer.py @@ -1,6 +1,6 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, +from src.detections import ( + Detection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -12,8 +12,8 @@ class NERReplacer(ReplacerInterface): self._dictionary = dictionary def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] diff --git a/src/replacers/tag_replacer.py b/src/replacers/tag_replacer.py index 366934d..1f8d898 100644 --- a/src/replacers/tag_replacer.py +++ b/src/replacers/tag_replacer.py @@ -1,18 +1,18 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - NameAnnotation, - SurnameAnnotation, - StreetNameAnnotation, - CityAnnotation, - CountryAnnotation, - PhoneNumberAnnotation, - UrlAnnotation, - UserAnnotation, - EmailAnnotation, - DateAnnotation, - TINAnnotation, - KRSAnnotation, +from src.detections import ( + Detection, + NameDetection, + SurnameDetection, + StreetNameDetection, + CityDetection, + CountryDetection, + PhoneNumberDetection, + UrlDetection, + UserDetection, + EmailDetection, + DateDetection, + TINDetection, + KRSDetection, ) from src.string_replacements import replace from src.replacers.interface import ReplacerInterface @@ -21,23 +21,23 @@ from src.replacers.interface import ReplacerInterface class TagReplacer(ReplacerInterface): def __init__(self): self.tags_map = { - NameAnnotation: "[OSOBA]", - SurnameAnnotation: "[OSOBA]", - StreetNameAnnotation: "[MIEJSCE]", - CityAnnotation: "[MIEJSCE]", - CountryAnnotation: "[MIEJSCE]", - 
PhoneNumberAnnotation: "[DIGITS]", - UrlAnnotation: "[WWW]", - UserAnnotation: "@[USER]", - EmailAnnotation: "[MAIL]", - DateAnnotation: "[DATE]", - TINAnnotation: "[DIGITS]", - KRSAnnotation: "[DIGITS]", + NameDetection: "[OSOBA]", + SurnameDetection: "[OSOBA]", + StreetNameDetection: "[MIEJSCE]", + CityDetection: "[MIEJSCE]", + CountryDetection: "[MIEJSCE]", + PhoneNumberDetection: "[DIGITS]", + UrlDetection: "[WWW]", + UserDetection: "@[USER]", + EmailDetection: "[MAIL]", + DateDetection: "[DATE]", + TINDetection: "[DIGITS]", + KRSDetection: "[DIGITS]", } def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: result = [ (start, end, self.tags_map.get(type(entity_type), "[OTHER]")) diff --git a/src/replacers/user_replacer.py b/src/replacers/user_replacer.py index 66aeaf4..87fb931 100644 --- a/src/replacers/user_replacer.py +++ b/src/replacers/user_replacer.py @@ -1,7 +1,7 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - UserAnnotation, +from src.detections import ( + Detection, + UserDetection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -11,7 +11,7 @@ class UserReplacer(ReplacerInterface): def __init__(self): pass - def replace(self, text: str, detections: List[Tuple[int, int, Annotation]]) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] @@ -20,7 +20,7 @@ class UserReplacer(ReplacerInterface): for item in detections: start, end, detection = item - if isinstance(detection, UserAnnotation): + if isinstance(detection, UserDetection): if text[start:end] not in already_replaced: username = "@" + 
generate_username(1)[0] already_replaced[text[start:end]] = username diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..23e1435 --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,33 @@ +"""Module for useful functions.""" + +import itertools + + +def consume(iterative, n): + """Consume n elements from iterative object. + + Args: + iterative (iter): Python iterative object. + n (int): Number of elements to consume. + + """ + next(itertools.islice(iterative, n - 1, n), None) + + +def subdict(dictionary, keys, all_must_be_present=True): + """Return a subdictionary of dictionary containing only keys. + + Args: + dictionary (dict): Dictionary to take a subdictionary from. + keys (list): List of keys to take from dictionary. + all_must_be_present (bool): If True, all keys must be present in + dictionary. If False, only keys that are present are returned. + + Returns: + dict: Subdictionary of dictionary containing only keys. 
+ + """ + if all_must_be_present: + return {key: dictionary[key] for key in keys} + else: + return {key: dictionary[key] for key in keys if key in dictionary} diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py index 704ce37..ee62ea8 100644 --- a/tests/detectors/date/test_en.py +++ b/tests/detectors/date/test_en.py @@ -1,4 +1,4 @@ -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_en(): @@ -9,22 +9,22 @@ def test_detect_dates_en(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") ] format_date2 = [ - (DateAnnotation.AnnotationPart.TEXT_MONTH, "April"), - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, ", "), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), + (DateDetection.AnnotationPart.TEXT_MONTH, "April"), + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, ", "), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(3, 12, DateAnnotation(format_date1)), (32, 45, DateAnnotation(format_date2))] + assert found_dates == [(3, 12, DateDetection(format_date1)), 
(32, 45, DateDetection(format_date2))] # Check en-gb # TODO: Following test fails. Fix it. diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py index 077240d..bfe159a 100644 --- a/tests/detectors/date/test_pl.py +++ b/tests/detectors/date/test_pl.py @@ -1,4 +1,4 @@ -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_pl(): @@ -8,19 +8,19 @@ def test_detect_dates_pl(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") ] format_date2 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.TEXT_MONTH, "kwietnia"), - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.TEXT_MONTH, "kwietnia"), + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(7, 16, DateAnnotation(format_date1)), (34, 49, DateAnnotation(format_date2))] \ No newline at end of file + assert found_dates == [(7, 16, DateDetection(format_date1)), (34, 49, 
DateDetection(format_date2))] \ No newline at end of file diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py index 5269b94..72a9f89 100644 --- a/tests/detectors/date/test_ru.py +++ b/tests/detectors/date/test_ru.py @@ -1,4 +1,4 @@ -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.date import DateDetector @@ -9,19 +9,19 @@ def test_detect_dates_pl(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") ] format_date2 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "05"), - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.TEXT_MONTH, "апрелÑ"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.TEXT_MONTH, "апрелÑ"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(0, 9, DateAnnotation(format_date1)), (26, 39, DateAnnotation(format_date2))] + assert found_dates == [(0, 9, DateDetection(format_date1)), (26, 39, DateDetection(format_date2))] diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py 
index 95ecb06..982d983 100644 --- a/tests/detectors/email/test_email.py +++ b/tests/detectors/email/test_email.py @@ -1,4 +1,4 @@ -from src.annotations import EmailAnnotation +from src.detections import EmailDetection from src.detectors.email import EmailDetector def test_detect_emails(): @@ -7,4 +7,4 @@ def test_detect_emails(): text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" found_emails = detector.detect(text, dict()) - assert found_emails == [(12, 30, EmailAnnotation()), (53, 78, EmailAnnotation())] \ No newline at end of file + assert found_emails == [(12, 30, EmailDetection()), (53, 78, EmailDetection())] \ No newline at end of file diff --git a/tests/detectors/ner/test_pl_liner_n5.py b/tests/detectors/ner/test_pl_liner_n5.py index 544dc00..7af941c 100644 --- a/tests/detectors/ner/test_pl_liner_n5.py +++ b/tests/detectors/ner/test_pl_liner_n5.py @@ -1,4 +1,4 @@ -from src.annotations import NameAnnotation, SurnameAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, CityDetection from src.detectors.ner import NerDetector def test_detect_names_pl_liner_n5(): @@ -14,10 +14,10 @@ def test_detect_names_pl_liner_n5(): result = detector.detect("", ccl_annotations) expected = [ - (10, 16, NameAnnotation()), - (100, 109, NameAnnotation()), - (30, 35, SurnameAnnotation()), - (50, 59, CityAnnotation()), + (10, 16, NameDetection()), + (100, 109, NameDetection()), + (30, 35, SurnameDetection()), + (50, 59, CityDetection()), ] assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py index ad3bc59..5ada397 100644 --- a/tests/detectors/phone/test_phone.py +++ b/tests/detectors/phone/test_phone.py @@ -1,4 +1,4 @@ -from src.annotations import PhoneNumberAnnotation +from src.detections import PhoneNumberDetection from src.detectors.phone import PhoneNumberDetector def test_detect_phone_numbers(): @@ -7,4 +7,4 @@ 
def test_detect_phone_numbers(): text = "My phone number is +48 123 456 789. My friend's number is 123456789." found_phone_numbers = detector.detect(text, dict()) - assert found_phone_numbers == [(19, 34, PhoneNumberAnnotation()), (58, 67, PhoneNumberAnnotation())] \ No newline at end of file + assert found_phone_numbers == [(19, 34, PhoneNumberDetection()), (58, 67, PhoneNumberDetection())] \ No newline at end of file diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py index 44d14ff..4e5c02f 100644 --- a/tests/detectors/url/test_url.py +++ b/tests/detectors/url/test_url.py @@ -1,5 +1,5 @@ from src.detectors.url import UrlDetector -from src.annotations import UrlAnnotation +from src.detections import UrlDetection def test_detect_urls(): detector = UrlDetector("en") @@ -7,7 +7,7 @@ def test_detect_urls(): text = "This is a test for www.google.com. Make sure to go to https://www.google.com" found_urls = detector.detect(text, dict()) - assert found_urls == [(19, 33, UrlAnnotation()), (54, 76, UrlAnnotation())] + assert found_urls == [(19, 33, UrlDetection()), (54, 76, UrlDetection())] def test_detect_urls_pl(): detector_en = UrlDetector("en") @@ -18,5 +18,5 @@ def test_detect_urls_pl(): found_urls_en = detector_en.detect(text, dict()) # m.in is a valid shortcut for miÄ™dzy innymi in Polish. It should not be detected as a URL. 
- assert found_urls_pl == [(6, 28, UrlAnnotation())] - assert found_urls_en == [(0, 4, UrlAnnotation()), (6, 28, UrlAnnotation())] \ No newline at end of file + assert found_urls_pl == [(6, 28, UrlDetection())] + assert found_urls_en == [(0, 4, UrlDetection()), (6, 28, UrlDetection())] \ No newline at end of file diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py index 028b1f4..c1b8bc3 100644 --- a/tests/detectors/user/test_user.py +++ b/tests/detectors/user/test_user.py @@ -1,5 +1,5 @@ from src.detectors.user import UserDetector -from src.annotations import UserAnnotation +from src.detections import UserDetection def test_detect_users(): detector = UserDetector() @@ -7,4 +7,4 @@ def test_detect_users(): text = "My username is @john_smith. My friend's username is @jane_doe." found_users = detector.detect(text, dict()) - assert found_users == [(15, 26, UserAnnotation()), (52, 61, UserAnnotation())] \ No newline at end of file + assert found_users == [(15, 26, UserDetection()), (52, 61, UserDetection())] \ No newline at end of file diff --git a/tests/dictionaries/morphosyntactic/test_pl_ner.py b/tests/dictionaries/morphosyntactic/test_pl_ner.py index 7d9e229..9786f4f 100644 --- a/tests/dictionaries/morphosyntactic/test_pl_ner.py +++ b/tests/dictionaries/morphosyntactic/test_pl_ner.py @@ -1,19 +1,19 @@ from src.dictionaries.morphosyntactic.pl_ner import PlNERMorphosyntacticDictionary -from src.annotations import NameAnnotation, CityAnnotation, SurnameAnnotation +from src.detections import NameDetection, CityDetection, SurnameDetection def test_pl_ner_morphosyntactic_dictionary(): dictionary = PlNERMorphosyntacticDictionary(list=[ - (NameAnnotation, "Andrzejowi", "Andrzej", "subst:sg:dat:m1"), - (NameAnnotation, "Andrzej", "Andrzej", "subst:sg:m1:imperf"), - (NameAnnotation, "Kasia", "Kasia", "subst:sg:f:imperf"), - (CityAnnotation, "WrocÅ‚aw", "WrocÅ‚aw", "subst:sg:m2:imperf"), - (CityAnnotation, "Warszawa", "Warszawa", 
"subst:sg:f:imperf"), - (CityAnnotation, "Kraków", "Kraków", "subst:sg:m2:imperf") + (NameDetection, "Andrzejowi", "Andrzej", "subst:sg:dat:m1"), + (NameDetection, "Andrzej", "Andrzej", "subst:sg:m1:imperf"), + (NameDetection, "Kasia", "Kasia", "subst:sg:f:imperf"), + (CityDetection, "WrocÅ‚aw", "WrocÅ‚aw", "subst:sg:m2:imperf"), + (CityDetection, "Warszawa", "Warszawa", "subst:sg:f:imperf"), + (CityDetection, "Kraków", "Kraków", "subst:sg:m2:imperf") ]) - example_name_1 = NameAnnotation(morpho_tag="subst:sg:dat:m1") - example_name_2 = NameAnnotation(morpho_tag="subst:sg:m1:imperf") - example_other = SurnameAnnotation(morpho_tag="subst:sg:m1:imperf") + example_name_1 = NameDetection(morpho_tag="subst:sg:dat:m1") + example_name_2 = NameDetection(morpho_tag="subst:sg:m1:imperf") + example_other = SurnameDetection(morpho_tag="subst:sg:m1:imperf") assert dictionary.get_random_replacement(example_name_1) == "Andrzejowi" assert dictionary.get_random_replacement(example_name_2) in ["Andrzej", "Kasia"] diff --git a/tests/pipeline/test_default.py b/tests/pipeline/test_default.py index 97acb46..19e8687 100644 --- a/tests/pipeline/test_default.py +++ b/tests/pipeline/test_default.py @@ -1,5 +1,5 @@ from src.pipeline.default import DefaultPipeline -from src.annotations import NameAnnotation +from src.detections import NameDetection from src.input_parsers.interface import InputParser from src.detectors.interface import Detector from src.suppressors.interface import Suppressor @@ -11,7 +11,7 @@ class MockInputParser(InputParser): class MockDetector(Detector): def detect(self, text, annotations): - return [(0, 3, NameAnnotation())] + return [(0, 3, NameDetection())] class MockSuppressor(Suppressor): def suppress(self, annotations): diff --git a/tests/replacers/test_date_replacer.py b/tests/replacers/test_date_replacer.py index 77ce093..f647f6e 100644 --- a/tests/replacers/test_date_replacer.py +++ b/tests/replacers/test_date_replacer.py @@ -1,14 +1,14 @@ from 
src.replacers.date_replacer import DateReplacer -from src.annotations import NameAnnotation, SurnameAnnotation, DateAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection def test_date_replacer(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameAnnotation()), - (4, 14, SurnameAnnotation()), - (28, 38, DateAnnotation()), - (42, 51, CityAnnotation()), + (0, 3, NameDetection()), + (4, 14, SurnameDetection()), + (28, 38, DateDetection()), + (42, 51, CityDetection()), ] replacer = DateReplacer() @@ -18,9 +18,9 @@ def test_date_replacer(): expected_text_beggining = "Ala Brzeszczot urodziÅ‚a sie " expected_text_ending = " we WrocÅ‚awiu" exptected_detections_left = [ - (0, 3, NameAnnotation()), - (4, 14, SurnameAnnotation()), - (len(result[0]) - 9, len(result[0]), CityAnnotation()), + (0, 3, NameDetection()), + (4, 14, SurnameDetection()), + (len(result[0]) - 9, len(result[0]), CityDetection()), ] assert result[0].startswith(expected_text_beggining) @@ -30,9 +30,9 @@ def test_date_replacer(): def test_date_replacer_same_date_same_replacement(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu. 05.05.2005 to jej urodziny. 06.05.2005 to nie jej urodziny." 
detections = [ - (28, 38, DateAnnotation()), - (53, 63, DateAnnotation()), - (81, 91, DateAnnotation()), + (28, 38, DateDetection()), + (53, 63, DateDetection()), + (81, 91, DateDetection()), ] replacer = DateReplacer() diff --git a/tests/replacers/test_email_replacer.py b/tests/replacers/test_email_replacer.py index a354f3e..664e043 100644 --- a/tests/replacers/test_email_replacer.py +++ b/tests/replacers/test_email_replacer.py @@ -1,13 +1,13 @@ from src.replacers.email_replacer import EmailReplacer -from src.annotations import DateAnnotation, CityAnnotation, UserAnnotation, EmailAnnotation +from src.detections import DateDetection, CityDetection, UserDetection, EmailDetection def test_email_replacer(): text = "zz@z.pl urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 7, EmailAnnotation()), - (21, 31, DateAnnotation()), - (35, 44, CityAnnotation()), + (0, 7, EmailDetection()), + (21, 31, DateDetection()), + (35, 44, CityDetection()), ] replacer = EmailReplacer() @@ -15,8 +15,8 @@ def test_email_replacer(): expected_text_ending = " urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ - (len(result[0]) - 23, len(result[0]) - 13, DateAnnotation()), - (len(result[0]) - 9, len(result[0]), CityAnnotation()), + (len(result[0]) - 23, len(result[0]) - 13, DateDetection()), + (len(result[0]) - 9, len(result[0]), CityDetection()), ] assert result[0].endswith(expected_text_ending) @@ -26,9 +26,9 @@ def test_email_replacer(): def test_email_replacer_same_email_same_replacement(): text = "zz@z.pl zz@z.pl aa@a.pl" detections = [ - (0, 7, EmailAnnotation()), - (8, 15, EmailAnnotation()), - (16, 22, EmailAnnotation()), + (0, 7, EmailDetection()), + (8, 15, EmailDetection()), + (16, 22, EmailDetection()), ] diff --git a/tests/replacers/test_ner_replacer.py b/tests/replacers/test_ner_replacer.py index fad3921..64c7de6 100644 --- a/tests/replacers/test_ner_replacer.py +++ b/tests/replacers/test_ner_replacer.py @@ -1,22 +1,22 @@ from 
src.replacers.ner_replacer import NERReplacer -from src.annotations import NameAnnotation, SurnameAnnotation, DateAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection from src.dictionaries.morphosyntactic.pl_ner import PlNERMorphosyntacticDictionary def test_ner_replacer(): dictionary = PlNERMorphosyntacticDictionary(list=[ - (NameAnnotation, "Andrzej", "Andrzej", "a"), - (NameAnnotation, "Kasi", "Kasia", "b"), - (SurnameAnnotation, "Kowalowi", "Kowal", "a"), - (SurnameAnnotation, "Kowal", "Kowal", "b"), + (NameDetection, "Andrzej", "Andrzej", "a"), + (NameDetection, "Kasi", "Kasia", "b"), + (SurnameDetection, "Kowalowi", "Kowal", "a"), + (SurnameDetection, "Kowal", "Kowal", "b"), ], always_replace=False) text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameAnnotation(morpho_tag="a")), - (4, 14, SurnameAnnotation(morpho_tag="b")), - (28, 38, DateAnnotation()), - (42, 51, CityAnnotation(morpho_tag="c")), + (0, 3, NameDetection(morpho_tag="a")), + (4, 14, SurnameDetection(morpho_tag="b")), + (28, 38, DateDetection()), + (42, 51, CityDetection(morpho_tag="c")), ] replacer = NERReplacer(dictionary) @@ -25,8 +25,8 @@ def test_ner_replacer(): expected_text = "Andrzej Kowal urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ - (27, 37, DateAnnotation()), - (41, 50, CityAnnotation(morpho_tag="c")), + (27, 37, DateDetection()), + (41, 50, CityDetection(morpho_tag="c")), ] assert result == (expected_text, exptected_detections_left) \ No newline at end of file diff --git a/tests/replacers/test_tag_replacer.py b/tests/replacers/test_tag_replacer.py index cd73090..4c5ce48 100644 --- a/tests/replacers/test_tag_replacer.py +++ b/tests/replacers/test_tag_replacer.py @@ -1,14 +1,14 @@ from src.replacers.tag_replacer import TagReplacer -from src.annotations import NameAnnotation, SurnameAnnotation, DateAnnotation, CityAnnotation +from src.detections import 
NameDetection, SurnameDetection, DateDetection, CityDetection def test_replace_with_tags(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameAnnotation()), - (4, 14, SurnameAnnotation()), - (28, 38, DateAnnotation()), - (42, 51, CityAnnotation()), + (0, 3, NameDetection()), + (4, 14, SurnameDetection()), + (28, 38, DateDetection()), + (42, 51, CityDetection()), ] replacer = TagReplacer() diff --git a/tests/replacers/test_user_replacer.py b/tests/replacers/test_user_replacer.py index 587835a..608f766 100644 --- a/tests/replacers/test_user_replacer.py +++ b/tests/replacers/test_user_replacer.py @@ -1,13 +1,13 @@ from src.replacers.user_replacer import UserReplacer -from src.annotations import DateAnnotation, CityAnnotation, UserAnnotation +from src.detections import DateDetection, CityDetection, UserDetection def test_user_replacer(): text = "@zzzz32 urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 7, UserAnnotation()), - (21, 31, DateAnnotation()), - (35, 44, CityAnnotation()), + (0, 7, UserDetection()), + (21, 31, DateDetection()), + (35, 44, CityDetection()), ] replacer = UserReplacer() @@ -15,8 +15,8 @@ def test_user_replacer(): expected_text_ending = " urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ - (len(result[0]) - 23, len(result[0]) - 13, DateAnnotation()), - (len(result[0]) - 9, len(result[0]), CityAnnotation()), + (len(result[0]) - 23, len(result[0]) - 13, DateDetection()), + (len(result[0]) - 9, len(result[0]), CityDetection()), ] assert result[0].endswith(expected_text_ending) @@ -26,9 +26,9 @@ def test_user_replacer(): def test_user_replacer_same_user_same_replacement(): text = "@zzzz32 @zzzz32 @aaaaa" detections = [ - (0, 7, UserAnnotation()), - (8, 15, UserAnnotation()), - (16, 22, UserAnnotation()), + (0, 7, UserDetection()), + (8, 15, UserDetection()), + (16, 22, UserDetection()), ] -- GitLab