diff --git a/src/annotations/__init__.py b/src/annotations/__init__.py deleted file mode 100644 index cf65f5ac040b2982da20bbfa869283fb1dd75c06..0000000000000000000000000000000000000000 --- a/src/annotations/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from src.annotations.annotation import * -from src.annotations.date import * \ No newline at end of file diff --git a/src/detections/__init__.py b/src/detections/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a26fabccac45f6a9034423e1ce94b45fa0d574de --- /dev/null +++ b/src/detections/__init__.py @@ -0,0 +1,2 @@ +from src.detections.detection import * +from src.detections.date import * \ No newline at end of file diff --git a/src/annotations/date.py b/src/detections/date.py similarity index 91% rename from src/annotations/date.py rename to src/detections/date.py index 502004c44492b295010c6ef7bb724a593b2d37c9..4ae956b7990a21039e6535eab4cf4944913acf71 100644 --- a/src/annotations/date.py +++ b/src/detections/date.py @@ -1,7 +1,7 @@ -from src.annotations.annotation import Annotation +from src.detections.detection import Detection from typing import List, Tuple, Optional -class DateAnnotation(Annotation): +class DateDetection(Detection): class AnnotationPart: TWO_DIGITS_DAY = "DD" ONE_DIGIT_DAY = "D" diff --git a/src/annotations/annotation.py b/src/detections/detection.py similarity index 73% rename from src/annotations/annotation.py rename to src/detections/detection.py index 69e6f500b8d1b57db756926ef890f40eb31f2687..59d712dc9b81abb2b77c98c192c3e612ffb74c9f 100644 --- a/src/annotations/annotation.py +++ b/src/detections/detection.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from typing import Optional @dataclass -class Annotation: +class Detection: def __init__(self, type_name: str) -> None: self._type_name = type_name @@ -18,50 +18,50 @@ class MorphosyntacticInfoMixin: def morpho_tag(self) -> str: return self._morpho_tag -class NameAnnotation(MorphosyntacticInfoMixin, Annotation): 
+class NameDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="name") -class SurnameAnnotation(MorphosyntacticInfoMixin, Annotation): +class SurnameDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="surname") -class StreetNameAnnotation(MorphosyntacticInfoMixin, Annotation): +class StreetNameDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="street_name") -class CityAnnotation(MorphosyntacticInfoMixin, Annotation): +class CityDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="city") -class CountryAnnotation(MorphosyntacticInfoMixin, Annotation): +class CountryDetection(MorphosyntacticInfoMixin, Detection): def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag, type_name="country") -class PhoneNumberAnnotation(Annotation): +class PhoneNumberDetection(Detection): def __init__(self) -> None: super().__init__("phone_number") -class UrlAnnotation(Annotation): +class UrlDetection(Detection): def __init__(self) -> None: super().__init__("url") -class UserAnnotation(Annotation): +class UserDetection(Detection): def __init__(self) -> None: super().__init__("user") -class EmailAnnotation(Annotation): +class EmailDetection(Detection): def __init__(self) -> None: super().__init__("email") -class TINAnnotation(Annotation): # Tax Identification Number +class TINDetection(Detection): # Tax Identification Number def __init__(self) -> None: super().__init__("tin") -class KRSAnnotation(Annotation): # National Court Register +class KRSDetection(Detection): # National Court Register def __init__(self) -> 
None: super().__init__("krs") -class OtherAnnotation(Annotation): # Non standard entity +class OtherDetection(Detection): # Non standard entity def __init__(self) -> None: super().__init__("other") \ No newline at end of file diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py index b8a8ac2735ad171f5f2802b53c1121daf692ac0b..c232ace36d438d45224ec31d3aefd503a9a23064 100644 --- a/src/detectors/date/date.py +++ b/src/detectors/date/date.py @@ -2,7 +2,7 @@ from typing import List, Dict, Any, Tuple from .en import detect_dates_en from .pl import detect_dates_pl from .ru import detect_dates_ru -from src.annotations import Annotation, DateAnnotation +from src.detections import Detection, DateDetection from src.detectors.interface import Detector @@ -12,13 +12,13 @@ class DateDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, DateAnnotation]]: + ) -> List[Tuple[int, int, DateDetection]]: return find_dates(text, self._language) def find_dates( text: str, language: str = "pl" -) -> List[Tuple[int, int, DateAnnotation]]: +) -> List[Tuple[int, int, DateDetection]]: """ Finds dates in the text. :param text: the text to be searched diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py index 5c8467eee8df78bcf9b4d158599b0f8a4508fc78..142c12b0a5c8826634ee7ec3e1f733627786cd1e 100644 --- a/src/detectors/date/en.py +++ b/src/detectors/date/en.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Tuple -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format @@ -24,7 +24,7 @@ EN_DATES_REGEX = re.compile( r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I ) -def detect_dates_en(text: str) -> List[Tuple[int, int, DateAnnotation]]: +def detect_dates_en(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects English dates in the text. 
:param text: the text to be searched @@ -36,5 +36,5 @@ def detect_dates_en(text: str) -> List[Tuple[int, int, DateAnnotation]]: dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateAnnotation(format))) + dates.append((match.start(), match.end(), DateDetection(format))) return dates \ No newline at end of file diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index a16ac477fad2f9683bbbda16b6ca6ed4c7482046..e4bbf45efdcbf478bc95815b06e9889fe183cdc9 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Tuple -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format @@ -27,7 +27,7 @@ PL_DATES_REGEX = re.compile( r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I ) -def detect_dates_pl(text: str) -> List[Tuple[int, int, DateAnnotation]]: +def detect_dates_pl(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects Polish dates in the text. 
:param text: the text to be searched @@ -40,5 +40,5 @@ def detect_dates_pl(text: str) -> List[Tuple[int, int, DateAnnotation]]: dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateAnnotation(format))) + dates.append((match.start(), match.end(), DateDetection(format))) return dates \ No newline at end of file diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py index aacdf2f7757c70f0350e48aa86b5d6346cfc7d36..02f2ed91c2e477e8e6fe79414bba72c1c3113d25 100644 --- a/src/detectors/date/ru.py +++ b/src/detectors/date/ru.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Tuple -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format @@ -27,7 +27,7 @@ RU_DATES_REGEX = re.compile( r'(?<!\b(Янв|Фев|Мар|Апр|Май|Июн|Июл|Авг|Сен|Окт|Ноя|Дек)\b))', re.I ) -def detect_dates_ru(text: str) -> List[Tuple[int, int, DateAnnotation]]: +def detect_dates_ru(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects Russian dates in the text. 
:param text: the text to be searched @@ -39,6 +39,6 @@ def detect_dates_ru(text: str) -> List[Tuple[int, int, DateAnnotation]]: dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateAnnotation(format))) + dates.append((match.start(), match.end(), DateDetection(format))) return dates \ No newline at end of file diff --git a/src/detectors/date/utils.py b/src/detectors/date/utils.py index 9d7bab25b405dbf2577c735c39dfb79ce9930b24..9dbf8f2f80ad2aa665b65af3986f1fe08a1c4269 100644 --- a/src/detectors/date/utils.py +++ b/src/detectors/date/utils.py @@ -1,104 +1,104 @@ from typing import List, Tuple -from src.annotations import DateAnnotation, Optional +from src.detections import DateDetection, Optional -def _parse_day_or_month(re_entry) -> List[Tuple[int, int, DateAnnotation]]: +def _parse_day_or_month(re_entry) -> List[Tuple[int, int, DateDetection]]: assert re_entry["day_or_month_year"] is not None result = [] if re_entry["day_month1"] is not None: if len(re_entry["day_month1"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month1"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) if len(re_entry["day_month2"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) - 
result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct1"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) elif "day_month2" in re_entry: if len(re_entry["day_month2"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct1"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) if "year1" in re_entry: if len(re_entry["year1"]) == 2: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year1"])) else: - result.append((DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year1"])) + result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year1"])) return result -def _parse_year_month_or_day(re_entry) -> List[Tuple[int, int, DateAnnotation]]: +def _parse_year_month_or_day(re_entry) -> List[Tuple[int, int, DateDetection]]: assert re_entry["year_month_or_day"] is not None result = [] if "year2" in re_entry: if len(re_entry["year2"]) == 2: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year2"])) else: - result.append((DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year2"])) + result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct3"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct3"])) if "day_month3" in re_entry: if len(re_entry["day_month3"]) == 1: - 
result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month3"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month3"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month3"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct4"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month3"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct4"])) if "day_month4" in re_entry: if len(re_entry["day_month4"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month4"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month4"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month4"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month4"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct4"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct4"])) return result -def _parse_month_in_words(re_entry) -> List[Tuple[DateAnnotation.AnnotationPart, str]]: +def _parse_month_in_words(re_entry) -> List[Tuple[DateDetection.AnnotationPart, str]]: assert re_entry["month_in_words"] is not None result = [] if re_entry["day1"] is not None: if len(re_entry["day1"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day1"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day1"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day1"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct5"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct5"])) if re_entry["month"] is not None: - 
result.append((DateAnnotation.AnnotationPart.TEXT_MONTH, re_entry["month"])) + result.append((DateDetection.AnnotationPart.TEXT_MONTH, re_entry["month"])) if re_entry["day1"] is None: - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct7"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct7"])) else: - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct6"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct6"])) if re_entry["day2"] is not None: if len(re_entry["day2"]) == 1: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day2"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day2"])) else: - result.append((DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, re_entry["day2"])) - result.append((DateAnnotation.AnnotationPart.OTHER, re_entry["punct6"])) + result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day2"])) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct6"])) if re_entry["year3"] is not None: if len(re_entry["year3"]) == 2: - result.append((DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year3"])) + result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year3"])) else: - result.append((DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year3"])) + result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year3"])) return result -def _parse_date_to_format(re_entry) -> Optional[List[Tuple[DateAnnotation.AnnotationPart, str]]]: +def _parse_date_to_format(re_entry) -> Optional[List[Tuple[DateDetection.AnnotationPart, str]]]: if re_entry["day_or_month_year"] is not None: result = _parse_day_or_month(re_entry) elif re_entry["year_month_or_day"] is not None: diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py index 2e0075d01e3d7e998136a0353beb1d02cd64e097..f14d9fc3d18b05de3c6efe8a7c6ef8d1d3b4d16b 100644 --- a/src/detectors/email/email.py 
+++ b/src/detectors/email/email.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Dict, Any, Tuple -from src.annotations import EmailAnnotation +from src.detections import EmailDetection from src.detectors.interface import Detector @@ -10,7 +10,7 @@ class EmailDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, EmailAnnotation]]: + ) -> List[Tuple[int, int, EmailDetection]]: return detect_emails(text) @@ -23,7 +23,7 @@ EMAIL_REGEX = re.compile( ) -def detect_emails(text: str) -> List[Tuple[int, int, EmailAnnotation]]: +def detect_emails(text: str) -> List[Tuple[int, int, EmailDetection]]: """ Detects emails in the text. :param text: the text to be searched @@ -36,5 +36,5 @@ def detect_emails(text: str) -> List[Tuple[int, int, EmailAnnotation]]: matches = EMAIL_REGEX.finditer(text) emails = [] for match in matches: - emails.append((match.start(), match.end(), EmailAnnotation())) + emails.append((match.start(), match.end(), EmailDetection())) return emails diff --git a/src/detectors/interface.py b/src/detectors/interface.py index b32cd06ed7fd5ecdfc4de8cc953f6991ee3f005c..325d6d6a813409b3794f3c74f66b54a4276ff409 100644 --- a/src/detectors/interface.py +++ b/src/detectors/interface.py @@ -1,9 +1,9 @@ from typing import List, Dict, Any, Tuple -from src.annotations import Annotation +from src.detections import Detection class Detector: def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, Annotation]]: + ) -> List[Tuple[int, int, Detection]]: raise NotImplementedError diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py index e5611c25d64bcbc9b9753659cd50b8b1e84024d9..6c4ae8ab87dbfc42624a81dadcc0689257c642bd 100644 --- a/src/detectors/ner/ner.py +++ b/src/detectors/ner/ner.py @@ -1,7 +1,7 @@ from typing import List, Dict, Any, Tuple from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 from 
src.detectors.interface import Detector -from src.annotations import Annotation +from src.detections import Detection class NerDetector(Detector): @@ -15,7 +15,7 @@ class NerDetector(Detector): def detect_ner( - ccl_annotations: Dict[str, List[Tuple[int, int, Annotation]]], language: str + ccl_annotations: Dict[str, List[Tuple[int, int, Detection]]], language: str ) -> List[Tuple[int, int, str]]: if language == "pl": return detect_ner_pl_liner_n5(ccl_annotations) diff --git a/src/detectors/ner/pl_liner_n5.py b/src/detectors/ner/pl_liner_n5.py index 0ea13a9def6e14a5ad815dd5ef45f24651df9700..d51cfa9b3da82f577da0b7e8dc68b30f48aa0caa 100644 --- a/src/detectors/ner/pl_liner_n5.py +++ b/src/detectors/ner/pl_liner_n5.py @@ -1,10 +1,10 @@ from typing import List, Tuple, Dict from src.utils.utils import subdict -from src.annotations import OtherAnnotation, Annotation +from src.detections import OtherDetection, Detection from src.mappings.ner_pl_n5_mapping import NER_PL_N5_MAPPING def detect_ner_pl_liner_n5( - ccl_annotations: Dict[str, List[Tuple[int, int, Annotation]]] + ccl_annotations: Dict[str, List[Tuple[int, int, Detection]]] ) -> List[Tuple[int, int, str]]: """ Detects ner entities in the text based on liner_n5 NER ontology. 
@@ -21,7 +21,7 @@ def detect_ner_pl_liner_n5( ) return [ - (start, end, NER_PL_N5_MAPPING.get(entity_type, OtherAnnotation)()) + (start, end, NER_PL_N5_MAPPING.get(entity_type, OtherDetection)()) for entity_type, entity in names.items() for start, end, _ in entity ] diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py index ca88264aa82030e7c6b1f92b31340882954d0284..861a5e1fd35ed42125173d2ba5bc3e284ccf1531 100644 --- a/src/detectors/phone/phone.py +++ b/src/detectors/phone/phone.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Dict, Any, Tuple -from src.annotations import PhoneNumberAnnotation +from src.detections import PhoneNumberDetection from src.detectors.interface import Detector @@ -10,7 +10,7 @@ class PhoneNumberDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, PhoneNumberAnnotation]]: + ) -> List[Tuple[int, int, PhoneNumberDetection]]: return detect_phone_numbers(text) @@ -20,7 +20,7 @@ PHONE_NUMBER_REGEX = re.compile( ) -def detect_phone_numbers(text: str) -> List[Tuple[int, int, PhoneNumberAnnotation]]: +def detect_phone_numbers(text: str) -> List[Tuple[int, int, PhoneNumberDetection]]: """ Detects phone numbers in the text. 
:param text: the text to be searched @@ -33,5 +33,5 @@ def detect_phone_numbers(text: str) -> List[Tuple[int, int, PhoneNumberAnnotatio matches = PHONE_NUMBER_REGEX.finditer(text) phone_numbers = [] for match in matches: - phone_numbers.append((match.start(), match.end(), PhoneNumberAnnotation())) + phone_numbers.append((match.start(), match.end(), PhoneNumberDetection())) return phone_numbers diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py index 63c83dbe2fd7f70f202014ec3462735abe1bfd47..ac67b2d1744b3787b0814726064510c0259241b2 100644 --- a/src/detectors/url/url.py +++ b/src/detectors/url/url.py @@ -2,7 +2,7 @@ import regex as re from typing import List, Dict, Any, Tuple from .pl import URL_REGEX_PL from .common import generate_url_regex -from src.annotations import UrlAnnotation +from src.detections import UrlDetection from src.detectors.interface import Detector @@ -12,11 +12,11 @@ class UrlDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, UrlAnnotation]]: + ) -> List[Tuple[int, int, UrlDetection]]: return detect_urls(text, self._language) -def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlAnnotation]]: +def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlDetection]]: """ Detects urls in the text. 
:param text: the text to be searched @@ -34,6 +34,6 @@ def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlAnnotation] matches = url_regex.finditer(text) urls = [] for match in matches: - urls.append((match.start(), match.end(), UrlAnnotation())) + urls.append((match.start(), match.end(), UrlDetection())) return urls diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py index ca8d483bf70c1157d30fdae6753406d412a0b0f7..3a891ab26b37fc8820bfbbe28975f9d4da47647c 100644 --- a/src/detectors/user/user.py +++ b/src/detectors/user/user.py @@ -1,6 +1,6 @@ import regex as re from typing import List, Dict, Any, Tuple -from src.annotations import UserAnnotation +from src.detections import UserDetection from src.detectors.interface import Detector @@ -10,14 +10,14 @@ class UserDetector(Detector): def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, UserAnnotation]]: + ) -> List[Tuple[int, int, UserDetection]]: return detect_users(text) USER_REGEX = re.compile(r"\B(?P<username>\@[\w\-]+)") -def detect_users(text: str) -> List[Tuple[int, int, UserAnnotation]]: +def detect_users(text: str) -> List[Tuple[int, int, UserDetection]]: """ Detects users in the text. 
:param text: the text to be searched @@ -30,5 +30,5 @@ def detect_users(text: str) -> List[Tuple[int, int, UserAnnotation]]: matches = USER_REGEX.finditer(text) users = [] for match in matches: - users.append((match.start(), match.end(), UserAnnotation())) + users.append((match.start(), match.end(), UserDetection())) return users diff --git a/src/dictionaries/morphosyntactic/interface.py b/src/dictionaries/morphosyntactic/interface.py index fe4a9381b3ac23f63468c41bc9eede89adb2f9ef..3f8a66bf2dd5241af2f026aaf4a4786ce72a872d 100644 --- a/src/dictionaries/morphosyntactic/interface.py +++ b/src/dictionaries/morphosyntactic/interface.py @@ -1,8 +1,8 @@ -from src.annotations import Annotation +from src.detections import Detection from typing import Optional class MorphosyntacticDictionary: - def get_random_replacement(self, original_entry: Annotation) -> Optional[str]: + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: """ Returns a random replacement for the original entry """ diff --git a/src/dictionaries/morphosyntactic/pl_ner.py b/src/dictionaries/morphosyntactic/pl_ner.py index 3f861a6cb871448c009fe8efcd71395de0aec671..d25beaee2b8b2914e48d45a08f5445b390b711fb 100644 --- a/src/dictionaries/morphosyntactic/pl_ner.py +++ b/src/dictionaries/morphosyntactic/pl_ner.py @@ -1,6 +1,6 @@ from typing import Dict, List, Optional, Tuple, Type from collections import defaultdict -from src.annotations import Annotation, OtherAnnotation, MorphosyntacticInfoMixin +from src.detections import Detection, OtherDetection, MorphosyntacticInfoMixin from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary import random @@ -9,8 +9,8 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): def __init__( self, dictionary_path: Optional[str] = None, - annotation_mapping: Optional[Dict[str, Type[Annotation]]] = None, - list: Optional[List[Tuple[Annotation, str, str, str]]] = None, + annotation_mapping: Optional[Dict[str, 
Type[Detection]]] = None, + list: Optional[List[Tuple[Detection, str, str, str]]] = None, always_replace=True, ) -> None: super().__init__() @@ -26,18 +26,18 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): raise ValueError("Either dictionary_path or list must be provided.") def _from_file( - self, path_to_dictionary: str, annotation_mapping: Dict[str, Type[Annotation]] + self, path_to_dictionary: str, annotation_mapping: Dict[str, Type[Detection]] ) -> None: self._dictionary = load_pl_ner_replacements_dictionary( path_to_dictionary, annotation_mapping ) - def _from_list(self, list: List[Tuple[Annotation, str, str, str]]) -> None: + def _from_list(self, list: List[Tuple[Detection, str, str, str]]) -> None: self._dictionary = defaultdict(lambda: defaultdict(dict)) for annotation, word, lemma, morpho_tag in list: self._dictionary[annotation][morpho_tag][lemma] = word - def get_random_replacement(self, original_entry: Annotation) -> Optional[str]: + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: original_entry_type = type(original_entry) result = None @@ -65,7 +65,7 @@ class PlNERMorphosyntacticDictionary(MorphosyntacticDictionary): def load_pl_ner_replacements_dictionary( - path: str, ner_mapping: Optional[Dict[str, Type[Annotation]]] = None + path: str, ner_mapping: Optional[Dict[str, Type[Detection]]] = None ) -> Dict[str, Dict[str, Dict[str, str]]]: """ Loads a dictionary that maps named entity tags to lemmas to part-of-speech tags to words. 
@@ -103,7 +103,7 @@ def load_pl_ner_replacements_dictionary( ner_tag, word, lemma, morpho_tag = line.split("\t") if ner_mapping is not None: - ner_tag = ner_mapping.get(ner_tag, OtherAnnotation)() + ner_tag = ner_mapping.get(ner_tag, OtherDetection)() replacement_dictionary[ner_tag][morpho_tag][lemma] = word diff --git a/src/mappings/ner_pl_n5_mapping.py b/src/mappings/ner_pl_n5_mapping.py index a14d9ce4deaabe0322129627e71b5ed6ccdd717f..77d5f13e3f754c93566cc820b22f68efaf29f3ff 100644 --- a/src/mappings/ner_pl_n5_mapping.py +++ b/src/mappings/ner_pl_n5_mapping.py @@ -1,15 +1,15 @@ -from src.annotations import ( - NameAnnotation, - SurnameAnnotation, - StreetNameAnnotation, - CityAnnotation, - CountryAnnotation, +from src.detections import ( + NameDetection, + SurnameDetection, + StreetNameDetection, + CityDetection, + CountryDetection, ) NER_PL_N5_MAPPING = { - "person_first_nam": NameAnnotation, - "person_last_nam": SurnameAnnotation, - "road_nam": StreetNameAnnotation, - "city_nam": CityAnnotation, - "country_nam": CountryAnnotation, + "person_first_nam": NameDetection, + "person_last_nam": SurnameDetection, + "road_nam": StreetNameDetection, + "city_nam": CityDetection, + "country_nam": CountryDetection, } diff --git a/src/replacers/date_replacer.py b/src/replacers/date_replacer.py index 7f2d6818b24f74d331924a01044c738162948044..ecf09b5f4bd88b974c1851e273480a04cf655012 100644 --- a/src/replacers/date_replacer.py +++ b/src/replacers/date_replacer.py @@ -1,7 +1,7 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - DateAnnotation, +from src.detections import ( + Detection, + DateDetection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -27,7 +27,7 @@ class DateReplacer(ReplacerInterface): def __init__(self): pass - def replace(self, text: str, detections: List[Tuple[int, int, Annotation]]) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + def replace(self, text: str, 
detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] @@ -36,46 +36,46 @@ class DateReplacer(ReplacerInterface): for item in detections: start, end, detection = item - if isinstance(detection, DateAnnotation): + if isinstance(detection, DateDetection): replacement = [] if detection.format is not None: format = detection.format else: format = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), ] if text[start:end] in already_replaced: replacement = already_replaced[text[start:end]] else: for entry in format: - if entry[0] == DateAnnotation.AnnotationPart.TWO_DIGITS_DAY: + if entry[0] == DateDetection.AnnotationPart.TWO_DIGITS_DAY: random_day = random.randint(1, 28) replacement.append(str(random_day).zfill(2)) - elif entry[0] == DateAnnotation.AnnotationPart.ONE_DIGIT_DAY: + elif entry[0] == DateDetection.AnnotationPart.ONE_DIGIT_DAY: random_day = random.randint(1, 28) replacement.append(str(random_day)) - elif entry[0] == DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH: + elif entry[0] == DateDetection.AnnotationPart.TWO_DIGIT_MONTH: random_month = random.randint(1, 12) replacement.append(str(random_month).zfill(2)) - elif entry[0] == DateAnnotation.AnnotationPart.ONE_DIGIT_MONTH: + elif entry[0] == DateDetection.AnnotationPart.ONE_DIGIT_MONTH: random_month = random.randint(1, 12) replacement.append(str(random_month)) - elif entry[0] == DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR: + elif entry[0] == 
DateDetection.AnnotationPart.FOUR_DIGIT_YEAR: random_year = random.randint(1900, 2020) replacement.append(str(random_year)) - elif entry[0] == DateAnnotation.AnnotationPart.TWO_DIGIT_YEAR: + elif entry[0] == DateDetection.AnnotationPart.TWO_DIGIT_YEAR: random_year = random.randint(0, 99) replacement.append(str(random_year).zfill(2)) - elif entry[0] == DateAnnotation.AnnotationPart.TEXT_MONTH: + elif entry[0] == DateDetection.AnnotationPart.TEXT_MONTH: random_month = random.randint(1, 12) month_name = months_map[random_month] replacement.append(month_name) - elif entry[0] == DateAnnotation.AnnotationPart.OTHER: + elif entry[0] == DateDetection.AnnotationPart.OTHER: replacement.append(entry[1]) replacement = "".join(replacement) diff --git a/src/replacers/delete_replacer.py b/src/replacers/delete_replacer.py index 218873b54e2fe1193f8c8e5a9ce052932d782047..9a0190848d68d26196a48258bfb03282d1b16f95 100644 --- a/src/replacers/delete_replacer.py +++ b/src/replacers/delete_replacer.py @@ -1,5 +1,5 @@ from typing import List, Tuple -from src.annotations import Annotation +from src.detections import Detection from src.string_replacements import replace from src.replacers.interface import ReplacerInterface @@ -9,8 +9,8 @@ class DeleteReplacer(ReplacerInterface): pass def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: result = [ (start, end, "") diff --git a/src/replacers/email_replacer.py b/src/replacers/email_replacer.py index 48bf8716aa8b0a2afd7334053492618ccda285c6..104f53e63762d522ec92dfb85300a107ebf88f6e 100644 --- a/src/replacers/email_replacer.py +++ b/src/replacers/email_replacer.py @@ -1,7 +1,7 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - EmailAnnotation, +from src.detections import ( + Detection, + EmailDetection, ) from 
src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -18,7 +18,7 @@ class EmailReplacer(ReplacerInterface): def __init__(self): pass - def replace(self, text: str, detections: List[Tuple[int, int, Annotation]]) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] @@ -27,7 +27,7 @@ class EmailReplacer(ReplacerInterface): for item in detections: start, end, detection = item - if isinstance(detection, EmailAnnotation): + if isinstance(detection, EmailDetection): if text[start:end] not in already_replaced: already_replaced[text[start:end]] = random_email() diff --git a/src/replacers/interface.py b/src/replacers/interface.py index fcaa21eb3172c1861312a939b38ae5891a8596a2..f4ed59f7d804b73771149256feba032613e1c8a4 100644 --- a/src/replacers/interface.py +++ b/src/replacers/interface.py @@ -1,13 +1,13 @@ from abc import ABC, abstractmethod from typing import List, Tuple -from src.annotations import Annotation +from src.detections import Detection class ReplacerInterface(ABC): @abstractmethod def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: """Replace detected entities in text with anonimized version. 
Args: diff --git a/src/replacers/ner_replacer.py b/src/replacers/ner_replacer.py index edb10b6c804b49317768e7c559d954ceec79aa62..214f0b78f530d0f526ab3d2e9d503373e15b29a8 100644 --- a/src/replacers/ner_replacer.py +++ b/src/replacers/ner_replacer.py @@ -1,6 +1,6 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, +from src.detections import ( + Detection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -12,8 +12,8 @@ class NERReplacer(ReplacerInterface): self._dictionary = dictionary def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] diff --git a/src/replacers/tag_replacer.py b/src/replacers/tag_replacer.py index 366934d4ea68ae93b9d8c5bfa0dc29af8458bc86..1f8d8987492dd4d728ca985302135902cd3e1371 100644 --- a/src/replacers/tag_replacer.py +++ b/src/replacers/tag_replacer.py @@ -1,18 +1,18 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - NameAnnotation, - SurnameAnnotation, - StreetNameAnnotation, - CityAnnotation, - CountryAnnotation, - PhoneNumberAnnotation, - UrlAnnotation, - UserAnnotation, - EmailAnnotation, - DateAnnotation, - TINAnnotation, - KRSAnnotation, +from src.detections import ( + Detection, + NameDetection, + SurnameDetection, + StreetNameDetection, + CityDetection, + CountryDetection, + PhoneNumberDetection, + UrlDetection, + UserDetection, + EmailDetection, + DateDetection, + TINDetection, + KRSDetection, ) from src.string_replacements import replace from src.replacers.interface import ReplacerInterface @@ -21,23 +21,23 @@ from src.replacers.interface import ReplacerInterface class TagReplacer(ReplacerInterface): def __init__(self): self.tags_map = { - NameAnnotation: "[OSOBA]", - 
SurnameAnnotation: "[OSOBA]", - StreetNameAnnotation: "[MIEJSCE]", - CityAnnotation: "[MIEJSCE]", - CountryAnnotation: "[MIEJSCE]", - PhoneNumberAnnotation: "[DIGITS]", - UrlAnnotation: "[WWW]", - UserAnnotation: "@[USER]", - EmailAnnotation: "[MAIL]", - DateAnnotation: "[DATE]", - TINAnnotation: "[DIGITS]", - KRSAnnotation: "[DIGITS]", + NameDetection: "[OSOBA]", + SurnameDetection: "[OSOBA]", + StreetNameDetection: "[MIEJSCE]", + CityDetection: "[MIEJSCE]", + CountryDetection: "[MIEJSCE]", + PhoneNumberDetection: "[DIGITS]", + UrlDetection: "[WWW]", + UserDetection: "@[USER]", + EmailDetection: "[MAIL]", + DateDetection: "[DATE]", + TINDetection: "[DIGITS]", + KRSDetection: "[DIGITS]", } def replace( - self, text: str, detections: List[Tuple[int, int, Annotation]] - ) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: result = [ (start, end, self.tags_map.get(type(entity_type), "[OTHER]")) diff --git a/src/replacers/user_replacer.py b/src/replacers/user_replacer.py index 66aeaf4d18317dbc253907b5ad8c98245066abe0..87fb931bac9c7ed8abdbbe1b3c6c7a0470dec6b5 100644 --- a/src/replacers/user_replacer.py +++ b/src/replacers/user_replacer.py @@ -1,7 +1,7 @@ from typing import List, Tuple -from src.annotations import ( - Annotation, - UserAnnotation, +from src.detections import ( + Detection, + UserDetection, ) from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface @@ -11,7 +11,7 @@ class UserReplacer(ReplacerInterface): def __init__(self): pass - def replace(self, text: str, detections: List[Tuple[int, int, Annotation]]) -> Tuple[str, List[Tuple[int, int, Annotation]]]: + def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] @@ -20,7 +20,7 @@ class UserReplacer(ReplacerInterface): for item in 
detections: start, end, detection = item - if isinstance(detection, UserAnnotation): + if isinstance(detection, UserDetection): if text[start:end] not in already_replaced: username = "@" + generate_username(1)[0] already_replaced[text[start:end]] = username diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..23e1435423c4fe3439a9b0ce4d118c35ace51f1a --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,33 @@ +"""Module for useful functions.""" + +import itertools + + +def consume(iterative, n): + """Consume n elements from iterative object. + + Args: + iterative (iter): Python iterative object. + n (int): Number of elements to consume. + + """ + next(itertools.islice(iterative, n - 1, n), None) + + +def subdict(dictionary, keys, all_must_be_present=True): + """Return a subdictionary of dictionary containing only keys. + + Args: + dictionary (dict): Dictionary to take a subdictionary from. + keys (list): List of keys to take from dictionary. + all_must_be_present (bool): If True, all keys must be present in + dictionary. If False, only keys that are present are returned. + + Returns: + dict: Subdictionary of dictionary containing only keys. 
+ + """ + if all_must_be_present: + return {key: dictionary[key] for key in keys} + else: + return {key: dictionary[key] for key in keys if key in dictionary} diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py index 704ce37775131badd561ec0bd99d637efea5cea2..ee62ea8fef7664928fc4c55351ae0b93c88fc485 100644 --- a/tests/detectors/date/test_en.py +++ b/tests/detectors/date/test_en.py @@ -1,4 +1,4 @@ -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_en(): @@ -9,22 +9,22 @@ def test_detect_dates_en(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") ] format_date2 = [ - (DateAnnotation.AnnotationPart.TEXT_MONTH, "April"), - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, ", "), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), + (DateDetection.AnnotationPart.TEXT_MONTH, "April"), + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, ", "), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(3, 12, DateAnnotation(format_date1)), (32, 45, 
DateAnnotation(format_date2))] + assert found_dates == [(3, 12, DateDetection(format_date1)), (32, 45, DateDetection(format_date2))] # Check en-gb # TODO: Following test fails. Fix it. diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py index 077240deaccd51678589d448e46a9d58bb2cf755..bfe159af91b7cc0f71953d20ca4f3810a00d1b09 100644 --- a/tests/detectors/date/test_pl.py +++ b/tests/detectors/date/test_pl.py @@ -1,4 +1,4 @@ -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_pl(): @@ -8,19 +8,19 @@ def test_detect_dates_pl(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") ] format_date2 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.TEXT_MONTH, "kwietnia"), - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.TEXT_MONTH, "kwietnia"), + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(7, 16, DateAnnotation(format_date1)), (34, 
49, DateAnnotation(format_date2))] \ No newline at end of file + assert found_dates == [(7, 16, DateDetection(format_date1)), (34, 49, DateDetection(format_date2))] \ No newline at end of file diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py index 5269b94bb9adc5d0fee2e9b51543d05474138963..72a9f89ca76e59c41d8109b7558bb921f2588b4c 100644 --- a/tests/detectors/date/test_ru.py +++ b/tests/detectors/date/test_ru.py @@ -1,4 +1,4 @@ -from src.annotations import DateAnnotation +from src.detections import DateDetection from src.detectors.date.date import DateDetector @@ -9,19 +9,19 @@ def test_detect_dates_pl(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateAnnotation.AnnotationPart.OTHER, "."), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), + (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") ] format_date2 = [ - (DateAnnotation.AnnotationPart.TWO_DIGITS_DAY, "05"), - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.TEXT_MONTH, "апрелÑ"), # Only supports two digits for now - (DateAnnotation.AnnotationPart.OTHER, " "), - (DateAnnotation.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.TEXT_MONTH, "апрелÑ"), # Only supports two digits for now + (DateDetection.AnnotationPart.OTHER, " "), + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(0, 9, DateAnnotation(format_date1)), (26, 39, 
DateAnnotation(format_date2))] + assert found_dates == [(0, 9, DateDetection(format_date1)), (26, 39, DateDetection(format_date2))] diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py index 95ecb061858e0d6890b68234ed2fcb5619133e2c..982d983b20044e53876c4e866d700787aedb343e 100644 --- a/tests/detectors/email/test_email.py +++ b/tests/detectors/email/test_email.py @@ -1,4 +1,4 @@ -from src.annotations import EmailAnnotation +from src.detections import EmailDetection from src.detectors.email import EmailDetector def test_detect_emails(): @@ -7,4 +7,4 @@ def test_detect_emails(): text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" found_emails = detector.detect(text, dict()) - assert found_emails == [(12, 30, EmailAnnotation()), (53, 78, EmailAnnotation())] \ No newline at end of file + assert found_emails == [(12, 30, EmailDetection()), (53, 78, EmailDetection())] \ No newline at end of file diff --git a/tests/detectors/ner/test_pl_liner_n5.py b/tests/detectors/ner/test_pl_liner_n5.py index 544dc00c56828c135ef1820d692f0753b9103fa9..7af941c2be5fbba2318863eb380aab895e549a72 100644 --- a/tests/detectors/ner/test_pl_liner_n5.py +++ b/tests/detectors/ner/test_pl_liner_n5.py @@ -1,4 +1,4 @@ -from src.annotations import NameAnnotation, SurnameAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, CityDetection from src.detectors.ner import NerDetector def test_detect_names_pl_liner_n5(): @@ -14,10 +14,10 @@ def test_detect_names_pl_liner_n5(): result = detector.detect("", ccl_annotations) expected = [ - (10, 16, NameAnnotation()), - (100, 109, NameAnnotation()), - (30, 35, SurnameAnnotation()), - (50, 59, CityAnnotation()), + (10, 16, NameDetection()), + (100, 109, NameDetection()), + (30, 35, SurnameDetection()), + (50, 59, CityDetection()), ] assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/detectors/phone/test_phone.py 
b/tests/detectors/phone/test_phone.py index ad3bc59623c1b3d3ad5046afad448cd0f2c33ad0..5ada3971eb399d2aceedbd30f1e14edbebb4cbd8 100644 --- a/tests/detectors/phone/test_phone.py +++ b/tests/detectors/phone/test_phone.py @@ -1,4 +1,4 @@ -from src.annotations import PhoneNumberAnnotation +from src.detections import PhoneNumberDetection from src.detectors.phone import PhoneNumberDetector def test_detect_phone_numbers(): @@ -7,4 +7,4 @@ def test_detect_phone_numbers(): text = "My phone number is +48 123 456 789. My friend's number is 123456789." found_phone_numbers = detector.detect(text, dict()) - assert found_phone_numbers == [(19, 34, PhoneNumberAnnotation()), (58, 67, PhoneNumberAnnotation())] \ No newline at end of file + assert found_phone_numbers == [(19, 34, PhoneNumberDetection()), (58, 67, PhoneNumberDetection())] \ No newline at end of file diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py index 44d14ff98a8c6f59caae72259420596ab34efcd0..4e5c02f369cccefc622f072142a47d8487b11bb4 100644 --- a/tests/detectors/url/test_url.py +++ b/tests/detectors/url/test_url.py @@ -1,5 +1,5 @@ from src.detectors.url import UrlDetector -from src.annotations import UrlAnnotation +from src.detections import UrlDetection def test_detect_urls(): detector = UrlDetector("en") @@ -7,7 +7,7 @@ def test_detect_urls(): text = "This is a test for www.google.com. Make sure to go to https://www.google.com" found_urls = detector.detect(text, dict()) - assert found_urls == [(19, 33, UrlAnnotation()), (54, 76, UrlAnnotation())] + assert found_urls == [(19, 33, UrlDetection()), (54, 76, UrlDetection())] def test_detect_urls_pl(): detector_en = UrlDetector("en") @@ -18,5 +18,5 @@ def test_detect_urls_pl(): found_urls_en = detector_en.detect(text, dict()) # m.in is a valid shortcut for miÄ™dzy innymi in Polish. It should not be detected as a URL. 
- assert found_urls_pl == [(6, 28, UrlAnnotation())] - assert found_urls_en == [(0, 4, UrlAnnotation()), (6, 28, UrlAnnotation())] \ No newline at end of file + assert found_urls_pl == [(6, 28, UrlDetection())] + assert found_urls_en == [(0, 4, UrlDetection()), (6, 28, UrlDetection())] \ No newline at end of file diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py index 028b1f496664e06acb84747acc3bef2b2ecdffd6..c1b8bc33a316dd912a3dfee9f2bbc286b5cae91d 100644 --- a/tests/detectors/user/test_user.py +++ b/tests/detectors/user/test_user.py @@ -1,5 +1,5 @@ from src.detectors.user import UserDetector -from src.annotations import UserAnnotation +from src.detections import UserDetection def test_detect_users(): detector = UserDetector() @@ -7,4 +7,4 @@ def test_detect_users(): text = "My username is @john_smith. My friend's username is @jane_doe." found_users = detector.detect(text, dict()) - assert found_users == [(15, 26, UserAnnotation()), (52, 61, UserAnnotation())] \ No newline at end of file + assert found_users == [(15, 26, UserDetection()), (52, 61, UserDetection())] \ No newline at end of file diff --git a/tests/dictionaries/morphosyntactic/test_pl_ner.py b/tests/dictionaries/morphosyntactic/test_pl_ner.py index 7d9e229a6e6833e232d97093db1d58699ec8665a..9786f4fadcec5643922487b9a9dd28cfed808788 100644 --- a/tests/dictionaries/morphosyntactic/test_pl_ner.py +++ b/tests/dictionaries/morphosyntactic/test_pl_ner.py @@ -1,19 +1,19 @@ from src.dictionaries.morphosyntactic.pl_ner import PlNERMorphosyntacticDictionary -from src.annotations import NameAnnotation, CityAnnotation, SurnameAnnotation +from src.detections import NameDetection, CityDetection, SurnameDetection def test_pl_ner_morphosyntactic_dictionary(): dictionary = PlNERMorphosyntacticDictionary(list=[ - (NameAnnotation, "Andrzejowi", "Andrzej", "subst:sg:dat:m1"), - (NameAnnotation, "Andrzej", "Andrzej", "subst:sg:m1:imperf"), - (NameAnnotation, "Kasia", "Kasia", 
"subst:sg:f:imperf"), - (CityAnnotation, "WrocÅ‚aw", "WrocÅ‚aw", "subst:sg:m2:imperf"), - (CityAnnotation, "Warszawa", "Warszawa", "subst:sg:f:imperf"), - (CityAnnotation, "Kraków", "Kraków", "subst:sg:m2:imperf") + (NameDetection, "Andrzejowi", "Andrzej", "subst:sg:dat:m1"), + (NameDetection, "Andrzej", "Andrzej", "subst:sg:m1:imperf"), + (NameDetection, "Kasia", "Kasia", "subst:sg:f:imperf"), + (CityDetection, "WrocÅ‚aw", "WrocÅ‚aw", "subst:sg:m2:imperf"), + (CityDetection, "Warszawa", "Warszawa", "subst:sg:f:imperf"), + (CityDetection, "Kraków", "Kraków", "subst:sg:m2:imperf") ]) - example_name_1 = NameAnnotation(morpho_tag="subst:sg:dat:m1") - example_name_2 = NameAnnotation(morpho_tag="subst:sg:m1:imperf") - example_other = SurnameAnnotation(morpho_tag="subst:sg:m1:imperf") + example_name_1 = NameDetection(morpho_tag="subst:sg:dat:m1") + example_name_2 = NameDetection(morpho_tag="subst:sg:m1:imperf") + example_other = SurnameDetection(morpho_tag="subst:sg:m1:imperf") assert dictionary.get_random_replacement(example_name_1) == "Andrzejowi" assert dictionary.get_random_replacement(example_name_2) in ["Andrzej", "Kasia"] diff --git a/tests/pipeline/test_default.py b/tests/pipeline/test_default.py index 97acb462e2d7bc525c753923d0d5a1f098ee417c..19e8687e729a4ed6aecbed3a3bbd6a98fdc39a54 100644 --- a/tests/pipeline/test_default.py +++ b/tests/pipeline/test_default.py @@ -1,5 +1,5 @@ from src.pipeline.default import DefaultPipeline -from src.annotations import NameAnnotation +from src.detections import NameDetection from src.input_parsers.interface import InputParser from src.detectors.interface import Detector from src.suppressors.interface import Suppressor @@ -11,7 +11,7 @@ class MockInputParser(InputParser): class MockDetector(Detector): def detect(self, text, annotations): - return [(0, 3, NameAnnotation())] + return [(0, 3, NameDetection())] class MockSuppressor(Suppressor): def suppress(self, annotations): diff --git a/tests/replacers/test_date_replacer.py 
b/tests/replacers/test_date_replacer.py index 77ce093764bfd93c6e370c18e578ef5cdcf15aea..f647f6e6a9a5cde29548a4202d4ec805f8f5c230 100644 --- a/tests/replacers/test_date_replacer.py +++ b/tests/replacers/test_date_replacer.py @@ -1,14 +1,14 @@ from src.replacers.date_replacer import DateReplacer -from src.annotations import NameAnnotation, SurnameAnnotation, DateAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection def test_date_replacer(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameAnnotation()), - (4, 14, SurnameAnnotation()), - (28, 38, DateAnnotation()), - (42, 51, CityAnnotation()), + (0, 3, NameDetection()), + (4, 14, SurnameDetection()), + (28, 38, DateDetection()), + (42, 51, CityDetection()), ] replacer = DateReplacer() @@ -18,9 +18,9 @@ def test_date_replacer(): expected_text_beggining = "Ala Brzeszczot urodziÅ‚a sie " expected_text_ending = " we WrocÅ‚awiu" exptected_detections_left = [ - (0, 3, NameAnnotation()), - (4, 14, SurnameAnnotation()), - (len(result[0]) - 9, len(result[0]), CityAnnotation()), + (0, 3, NameDetection()), + (4, 14, SurnameDetection()), + (len(result[0]) - 9, len(result[0]), CityDetection()), ] assert result[0].startswith(expected_text_beggining) @@ -30,9 +30,9 @@ def test_date_replacer(): def test_date_replacer_same_date_same_replacement(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu. 05.05.2005 to jej urodziny. 06.05.2005 to nie jej urodziny." 
detections = [ - (28, 38, DateAnnotation()), - (53, 63, DateAnnotation()), - (81, 91, DateAnnotation()), + (28, 38, DateDetection()), + (53, 63, DateDetection()), + (81, 91, DateDetection()), ] replacer = DateReplacer() diff --git a/tests/replacers/test_email_replacer.py b/tests/replacers/test_email_replacer.py index a354f3eb0223a9ccf92d67283ff6cea3a1656520..664e04304cefb059829f372dfc24d7ae48e7052e 100644 --- a/tests/replacers/test_email_replacer.py +++ b/tests/replacers/test_email_replacer.py @@ -1,13 +1,13 @@ from src.replacers.email_replacer import EmailReplacer -from src.annotations import DateAnnotation, CityAnnotation, UserAnnotation, EmailAnnotation +from src.detections import DateDetection, CityDetection, UserDetection, EmailDetection def test_email_replacer(): text = "zz@z.pl urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 7, EmailAnnotation()), - (21, 31, DateAnnotation()), - (35, 44, CityAnnotation()), + (0, 7, EmailDetection()), + (21, 31, DateDetection()), + (35, 44, CityDetection()), ] replacer = EmailReplacer() @@ -15,8 +15,8 @@ def test_email_replacer(): expected_text_ending = " urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ - (len(result[0]) - 23, len(result[0]) - 13, DateAnnotation()), - (len(result[0]) - 9, len(result[0]), CityAnnotation()), + (len(result[0]) - 23, len(result[0]) - 13, DateDetection()), + (len(result[0]) - 9, len(result[0]), CityDetection()), ] assert result[0].endswith(expected_text_ending) @@ -26,9 +26,9 @@ def test_email_replacer(): def test_email_replacer_same_email_same_replacement(): text = "zz@z.pl zz@z.pl aa@a.pl" detections = [ - (0, 7, EmailAnnotation()), - (8, 15, EmailAnnotation()), - (16, 22, EmailAnnotation()), + (0, 7, EmailDetection()), + (8, 15, EmailDetection()), + (16, 22, EmailDetection()), ] diff --git a/tests/replacers/test_ner_replacer.py b/tests/replacers/test_ner_replacer.py index fad3921700720c5a9392a81b5e49b2f44c741665..64c7de6293dff6f48cf3665f223967abeaaa8eec 100644 
--- a/tests/replacers/test_ner_replacer.py +++ b/tests/replacers/test_ner_replacer.py @@ -1,22 +1,22 @@ from src.replacers.ner_replacer import NERReplacer -from src.annotations import NameAnnotation, SurnameAnnotation, DateAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection from src.dictionaries.morphosyntactic.pl_ner import PlNERMorphosyntacticDictionary def test_ner_replacer(): dictionary = PlNERMorphosyntacticDictionary(list=[ - (NameAnnotation, "Andrzej", "Andrzej", "a"), - (NameAnnotation, "Kasi", "Kasia", "b"), - (SurnameAnnotation, "Kowalowi", "Kowal", "a"), - (SurnameAnnotation, "Kowal", "Kowal", "b"), + (NameDetection, "Andrzej", "Andrzej", "a"), + (NameDetection, "Kasi", "Kasia", "b"), + (SurnameDetection, "Kowalowi", "Kowal", "a"), + (SurnameDetection, "Kowal", "Kowal", "b"), ], always_replace=False) text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameAnnotation(morpho_tag="a")), - (4, 14, SurnameAnnotation(morpho_tag="b")), - (28, 38, DateAnnotation()), - (42, 51, CityAnnotation(morpho_tag="c")), + (0, 3, NameDetection(morpho_tag="a")), + (4, 14, SurnameDetection(morpho_tag="b")), + (28, 38, DateDetection()), + (42, 51, CityDetection(morpho_tag="c")), ] replacer = NERReplacer(dictionary) @@ -25,8 +25,8 @@ def test_ner_replacer(): expected_text = "Andrzej Kowal urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ - (27, 37, DateAnnotation()), - (41, 50, CityAnnotation(morpho_tag="c")), + (27, 37, DateDetection()), + (41, 50, CityDetection(morpho_tag="c")), ] assert result == (expected_text, exptected_detections_left) \ No newline at end of file diff --git a/tests/replacers/test_tag_replacer.py b/tests/replacers/test_tag_replacer.py index cd73090d67b259c5986f5a45b473496fe008728a..4c5ce485040c8d26baaf0966bc8d72cfb9b9698f 100644 --- a/tests/replacers/test_tag_replacer.py +++ b/tests/replacers/test_tag_replacer.py @@ -1,14 +1,14 @@ from 
src.replacers.tag_replacer import TagReplacer -from src.annotations import NameAnnotation, SurnameAnnotation, DateAnnotation, CityAnnotation +from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection def test_replace_with_tags(): text = "Ala Brzeszczot urodziła sie 05.05.2005 we Wrocławiu" detections = [ - (0, 3, NameAnnotation()), - (4, 14, SurnameAnnotation()), - (28, 38, DateAnnotation()), - (42, 51, CityAnnotation()), + (0, 3, NameDetection()), + (4, 14, SurnameDetection()), + (28, 38, DateDetection()), + (42, 51, CityDetection()), ] replacer = TagReplacer() diff --git a/tests/replacers/test_user_replacer.py b/tests/replacers/test_user_replacer.py index 587835a48b0ff900cb42f4900081e05c2f107903..608f766e44797d596e96a460d599fec726f34158 100644 --- a/tests/replacers/test_user_replacer.py +++ b/tests/replacers/test_user_replacer.py @@ -1,13 +1,13 @@ from src.replacers.user_replacer import UserReplacer -from src.annotations import DateAnnotation, CityAnnotation, UserAnnotation +from src.detections import DateDetection, CityDetection, UserDetection def test_user_replacer(): text = "@zzzz32 urodziła sie 05.05.2005 we Wrocławiu" detections = [ - (0, 7, UserAnnotation()), - (21, 31, DateAnnotation()), - (35, 44, CityAnnotation()), + (0, 7, UserDetection()), + (21, 31, DateDetection()), + (35, 44, CityDetection()), ] replacer = UserReplacer() @@ -15,8 +15,8 @@ def test_user_replacer(): expected_text_ending = " urodziła sie 05.05.2005 we Wrocławiu" exptected_detections_left = [ - (len(result[0]) - 23, len(result[0]) - 13, DateAnnotation()), - (len(result[0]) - 9, len(result[0]), CityAnnotation()), + (len(result[0]) - 23, len(result[0]) - 13, DateDetection()), + (len(result[0]) - 9, len(result[0]), CityDetection()), ] assert result[0].endswith(expected_text_ending) @@ -26,9 +26,9 @@ def test_user_replacer(): def test_user_replacer_same_user_same_replacement(): text = "@zzzz32 @zzzz32 @aaaaa" detections = [ - (0, 7, UserAnnotation()), - 
(8, 15, UserAnnotation()), - (16, 22, UserAnnotation()), + (0, 7, UserDetection()), + (8, 15, UserDetection()), + (16, 22, UserDetection()), ]