diff --git a/main.py b/main.py index 8869ebef5f905ac0292c2c54073b094c1245b4bc..0a3fee6bff7b6dc6ca04ec6da4e7a9f76f0b07c6 100644 --- a/main.py +++ b/main.py @@ -11,9 +11,7 @@ def get_args(): subparsers = parser.add_subparsers(dest="mode") subparsers.required = True - subparsers.add_parser( - "service", - help="Run as a service") + subparsers.add_parser("service", help="Run as a service") return parser.parse_args() diff --git a/scripts/cli.py b/scripts/cli.py index f8986e1885cf56ba4497adbe30cf76abb0e3f68c..8569ee889a12ef61bd874cf54ed0ed8c04f45878 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -15,7 +15,13 @@ def get_args(): parser.add_argument( "-l", "--language", help="Language of the input text", default="pl" ) - parser.add_argument("-m", "--method", help="Anonymization method", default="tag", choices=["delete", "tag", "pseudo"]) + parser.add_argument( + "-m", + "--method", + help="Anonymization method", + default="tag", + choices=["delete", "tag", "pseudo"], + ) parser.add_argument("input_file", help="Path to input file") parser.add_argument("output_file", help="Path to output file") diff --git a/src/annotation_mapping.py b/src/annotation_mapping.py index ef739627b4596b864533fb88f62ae5f1cb95bf50..3a06c9c2c517d1b4b6b2ae7f5e13c5913fe5211e 100644 --- a/src/annotation_mapping.py +++ b/src/annotation_mapping.py @@ -3,13 +3,14 @@ from typing import Dict, List, Tuple, TypeVar T1 = TypeVar("T1") T2 = TypeVar("T2") + def map_annotatios( ref_annotations: List[Tuple[int, int, T1]], all_annotations: Dict[str, List[Tuple[int, int, T2]]], target_columns: List[str], ) -> Dict[Tuple[int, int, T1], Dict[str, Tuple[int, int, T2]]]: """Map annotations from target columns to reference annotations. - + Example: >> ref_annotations = [(0, 3, "Andrzej"), (7, 11, "psa")] >> all_annotations = { @@ -34,7 +35,7 @@ def map_annotatios( Returns: Dict[Tuple[int, int, T1], Dict[str, Tuple[int, int, T2]]]: Mapped annotations. """ - + result = dict() index_map = dict() diff --git a/src/annotations/__init__.py b/src/annotations/__init__.py index d09a85296c51b29fceadd89c8ca736b3e6fddb33..c94f5eccfa58d5e8dde8fde0533b0aa78b19f548 100644 --- a/src/annotations/__init__.py +++ b/src/annotations/__init__.py @@ -1 +1 @@ -from src.annotations.annotations import * \ No newline at end of file +from src.annotations.annotations import * diff --git a/src/annotations/annotations.py b/src/annotations/annotations.py index fab8b102d1712e19de53abd80faba2a2d9dc92f1..c72270ccc24fc3fd8ad81130d9a81a4ff37b12ed 100644 --- a/src/annotations/annotations.py +++ b/src/annotations/annotations.py @@ -1,14 +1,17 @@ from dataclasses import dataclass + @dataclass class Annotation: def __hash__(self) -> int: return (type(self), *(self.__dict__.values())).__hash__() + class MorphosyntacticAnnotation(Annotation): def __init__(self, morphosyntactic_tag) -> None: self.morphosyntactic_tag = morphosyntactic_tag + class NerAnnotation(Annotation): def __init__(self, ner_type: str) -> None: - self.ner_type = ner_type \ No newline at end of file + self.ner_type = ner_type diff --git a/src/detections/__init__.py b/src/detections/__init__.py index 79b800319527d11e763d98f26104b2c6aca3a6c6..554c9a5ead5a6afa9d79063aa9f4aab454480c67 100644 --- a/src/detections/__init__.py +++ b/src/detections/__init__.py @@ -5,4 +5,4 @@ from src.utils.subclasses import get_sublcasses DETECTION_CLASSES_MAP = { detection_class.TYPE_NAME: detection_class for detection_class in get_sublcasses(Detection) -} \ No newline at end of file +} diff --git a/src/detections/date.py b/src/detections/date.py index 89642664c8eb80c99829af691b055c788f11e4c3..b9e9888c5b8e941c3df57c1e2e0eb0d4d27db019 100644 --- a/src/detections/date.py +++ b/src/detections/date.py @@ -1,8 +1,10 @@ from src.detections.detection import Detection from typing import List, Tuple, Optional + class DateDetection(Detection): TYPE_NAME = "date" + class AnnotationPart: TWO_DIGITS_DAY = "DD" ONE_DIGIT_DAY = "D" @@ -12,17 +14,19 @@ class DateDetection(Detection): TWO_DIGIT_YEAR = "YY" TEXT_MONTH = "MMM" OTHER = "OTHER" - - def __init__(self, format: Optional[List[Tuple[AnnotationPart, str]]] = None) -> None: + + def __init__( + self, format: Optional[List[Tuple[AnnotationPart, str]]] = None + ) -> None: """ The annotation representing a date value. :param format: the format of the date, e.g. [(AnnotationPart.TWO_DIGITS_DAY, "01"), (AnnotationPart.OTHER, ".") ...] :type format: Optional[List[Tuple[str, str]]] """ - + super().__init__() - + self.format = format - + def __eq__(self, other) -> bool: - return self.format == other.format and super().__eq__(other) \ No newline at end of file + return self.format == other.format and super().__eq__(other) diff --git a/src/detections/detection.py b/src/detections/detection.py index 6ac980efb31dd20c707f5dfa39e5d4867881d326..149eaf87296928e4d5d7f4c9b5392367170b3886 100644 --- a/src/detections/detection.py +++ b/src/detections/detection.py @@ -1,97 +1,114 @@ from dataclasses import dataclass from typing import Optional + @dataclass class Detection: TYPE_NAME = "detection" + def __init__(self) -> None: pass - + def __hash__(self) -> int: return (type(self), *(self.__dict__.values())).__hash__() - + + class MorphosyntacticInfoMixin: def __init__(self, morpho_tag: str, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self._morpho_tag = morpho_tag - + @property def morpho_tag(self) -> str: return self._morpho_tag - + + class NameDetection(MorphosyntacticInfoMixin, Detection): TYPE_NAME = "name" - + def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag) - + + class SurnameDetection(MorphosyntacticInfoMixin, Detection): TYPE_NAME = "surname" - + def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag) - + + class StreetNameDetection(MorphosyntacticInfoMixin, Detection): TYPE_NAME = "street_name" - + def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag) - + + class CityDetection(MorphosyntacticInfoMixin, Detection): TYPE_NAME = "city" - + def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag) - + + class CountryDetection(MorphosyntacticInfoMixin, Detection): TYPE_NAME = "country" - + def __init__(self, morpho_tag: Optional[str] = None) -> None: super().__init__(morpho_tag=morpho_tag) - + + class UrlDetection(Detection): TYPE_NAME = "url" - + def __init__(self) -> None: super().__init__() - + + class UserDetection(Detection): TYPE_NAME = "user" - + def __init__(self) -> None: super().__init__() - + + class EmailDetection(Detection): TYPE_NAME = "email" - + def __init__(self) -> None: super().__init__() - + + class NumberDetection(Detection): TYPE_NAME = "number" - + def __init__(self) -> None: - super().__init__() + super().__init__() + + class PhoneNumberDetection(NumberDetection): TYPE_NAME = "phone_number" - + def __init__(self) -> None: super().__init__() - -class TINDetection(Detection): # Tax Identification Number + + +class TINDetection(Detection): # Tax Identification Number TYPE_NAME = "tin" - + def __init__(self) -> None: super().__init__() - -class KRSDetection(Detection): # National Court Register + + +class KRSDetection(Detection): # National Court Register TYPE_NAME = "krs" - + def __init__(self) -> None: super().__init__() - -class OtherDetection(Detection): # Non standard entity + + +class OtherDetection(Detection): # Non standard entity TYPE_NAME = "other" - + def __init__(self) -> None: - super().__init__() \ No newline at end of file + super().__init__() diff --git a/src/detectors/date/__init__.py b/src/detectors/date/__init__.py index 45c281947d6faeb1199806be2e5be2b02ecd2bf8..bac0ba7d340262dcc9a6eb3afee0c4d5e37426d1 100644 --- a/src/detectors/date/__init__.py +++ b/src/detectors/date/__init__.py @@ -1 +1 @@ -from src.detectors.date.date import DateDetector \ No newline at end of file +from src.detectors.date.date import DateDetector diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py index c232ace36d438d45224ec31d3aefd503a9a23064..43544d9cc96ccd933ef429799e4f879daf781174 100644 --- a/src/detectors/date/date.py +++ b/src/detectors/date/date.py @@ -16,9 +16,7 @@ class DateDetector(Detector): return find_dates(text, self._language) -def find_dates( - text: str, language: str = "pl" -) -> List[Tuple[int, int, DateDetection]]: +def find_dates(text: str, language: str = "pl") -> List[Tuple[int, int, DateDetection]]: """ Finds dates in the text. :param text: the text to be searched diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py index 142c12b0a5c8826634ee7ec3e1f733627786cd1e..0f668b311e724e45005f36ad6ef74430deaf822b 100644 --- a/src/detectors/date/en.py +++ b/src/detectors/date/en.py @@ -5,25 +5,25 @@ from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format EN_DATES_REGEX = re.compile( - r'\b(?P<day_or_month_year>' - r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' - r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' - r'(?P<year1>\d{4}|\d{2}))\b|' - - r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' - r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' - r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' - - r'(?P<month_in_words>' - r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' - r'\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|' - r'Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)' - r'|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b' - r'(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?' - r'(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?' - r'(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))', re.I + r"\b(?P<day_or_month_year>" + r"(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})" + r"(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})" + r"(?P<year1>\d{4}|\d{2}))\b|" + r"\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})" + r"(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)" + r"(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|" + r"(?P<month_in_words>" + r"(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?" + r"\b(?P<month>Jan(?:|uary)|Feb(?:|ruary)|Mar(?:|ch)|" + r"Apr(?:|il)|May|Jun(?:|e)|Jul(?:|y)|Aug(?:|ust)|Sep(?:|tember)" + r"|Oct(?:|ober)|Nov(?:|ember)|Dec(?:|ember))\b" + r"(?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))?" + r"(?:(?P<punct6>[ \t\-\./,]{1,2})(?P<year3>\d{4}|\d{2}))?" + r"(?<!\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b))", + re.I, ) + def detect_dates_en(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects English dates in the text. @@ -37,4 +37,4 @@ def detect_dates_en(text: str) -> List[Tuple[int, int, DateDetection]]: for match in matches: format = _parse_date_to_format(match.groupdict()) dates.append((match.start(), match.end(), DateDetection(format))) - return dates \ No newline at end of file + return dates diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index e4bbf45efdcbf478bc95815b06e9889fe183cdc9..4c0537033d6b38215c2fc0a4eab53e023ccc6ced 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -5,28 +5,28 @@ from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format PL_DATES_REGEX = re.compile( - r'\b(?P<day_or_month_year>' - r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' - r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' - r'(?P<year1>\d{4}|\d{2}))\b|' - - r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' - r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' - r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' - - r'(?P<month_in_words>' - r'(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)' - r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' - r'\b(?P<month>Sty(?:|cze[nÅ„]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|' - r'Kwi(?:|ecie[nÅ„]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)' - r'|Sie(?:|rpie[nÅ„]|rpnia)|Wrz(?:|esie[nÅ„]|e[Å›s]nia)' - r'|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)' - r'|Gru(?:|dzie[nÅ„]|dnia))\b' - r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' - r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' - r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)', re.I + r"\b(?P<day_or_month_year>" + r"(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})" + r"(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})" + r"(?P<year1>\d{4}|\d{2}))\b|" + r"\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})" + r"(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)" + r"(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|" + r"(?P<month_in_words>" + r"(?!\b(sty|lut|mar|kwi|maj|cze|lip|sie|wrz|paz|lis|gru)\b)" + r"(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?" + r"\b(?P<month>Sty(?:|cze[nÅ„]|cznia)|Lut(?:|y|ego)|Mar(?:|zec|ca)|" + r"Kwi(?:|ecie[nÅ„]|etnia)|Maj(?:|a)|Cze(?:|rwiec|rwca)|Lip(?:|iec|ca)" + r"|Sie(?:|rpie[nÅ„]|rpnia)|Wrz(?:|esie[nÅ„]|e[Å›s]nia)" + r"|Pa[zź](?:|dziernik|dziernika)|Lis(?:|topad|topada)" + r"|Gru(?:|dzie[nÅ„]|dnia))\b" + r"((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))" + r"(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|" + r"(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?)", + re.I, ) + def detect_dates_pl(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects Polish dates in the text. @@ -35,10 +35,10 @@ def detect_dates_pl(text: str) -> List[Tuple[int, int, DateDetection]]: :return: a list of tuples containing (start, end, annotation) :rtype: List[Tuple[int, int, DateAnnotation]] """ - + matches = PL_DATES_REGEX.finditer(text) dates = [] for match in matches: format = _parse_date_to_format(match.groupdict()) dates.append((match.start(), match.end(), DateDetection(format))) - return dates \ No newline at end of file + return dates diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py index 02f2ed91c2e477e8e6fe79414bba72c1c3113d25..bda5b4ba9a52f025787d9845abcb97de6290397a 100644 --- a/src/detectors/date/ru.py +++ b/src/detectors/date/ru.py @@ -5,28 +5,28 @@ from src.detections import DateDetection from src.detectors.date.utils import _parse_date_to_format RU_DATES_REGEX = re.compile( - r'\b(?P<day_or_month_year>' - r'(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})' - r'(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})' - r'(?P<year1>\d{4}|\d{2}))\b|' - - r'\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})' - r'(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)' - r'(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|' - - r'(?P<month_in_words>' - r'(?!\b(Янв|Фев|Мар|Ðпр|Май|Июн|Июл|Ðвг|Сен|Окт|ÐоÑ|Дек)\b)' - r'(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?' - r'\b(?P<month>Янв(?:|ар[ьеÑ])|Фев(?:|рал[ьеÑ])|Мар(?:|Ñ‚|те|та)|' - r'Ðпр(?:|ел[ьеÑ])|Ма[йеÑ]|Июн(?:|[ьеÑ])|Июл(?:|[ьеÑ])|' - r'Ðвг(?:|уÑÑ‚|уÑÑ‚[еа])|Сен(?:|Ñ‚Ñбр[ьеÑ])|Окт(?:|Ñбр[ьеÑ])|' - r'ÐоÑ(?:|бр[ьеÑ])|Дек(?:|абр[ьеÑ]))\b' - r'((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))' - r'(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|' - r'(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?' - r'(?<!\b(Янв|Фев|Мар|Ðпр|Май|Июн|Июл|Ðвг|Сен|Окт|ÐоÑ|Дек)\b))', re.I + r"\b(?P<day_or_month_year>" + r"(?P<day_month1>[0-3]?\d)(?P<punct1>[ \t\-\./,]{1,2})" + r"(?P<day_month2>[0-3]?\d)(?P<punct2>[ \t\-\./,]{1,2})" + r"(?P<year1>\d{4}|\d{2}))\b|" + r"\b(?P<year_month_or_day>(?P<year2>\d{4}|\d{2})" + r"(?P<punct3>[ \t\-\./,]{1,2})(?P<day_month3>[0-3]?\d)" + r"(?P<punct4>[ \t\-\./,]{1,2})(?P<day_month4>[0-3]?\d))\b|" + r"(?P<month_in_words>" + r"(?!\b(Янв|Фев|Мар|Ðпр|Май|Июн|Июл|Ðвг|Сен|Окт|ÐоÑ|Дек)\b)" + r"(?:(?P<day1>[0-3]?\d)(?P<punct5>[ \t\-\./,]{0,2}))?" + r"\b(?P<month>Янв(?:|ар[ьеÑ])|Фев(?:|рал[ьеÑ])|Мар(?:|Ñ‚|те|та)|" + r"Ðпр(?:|ел[ьеÑ])|Ма[йеÑ]|Июн(?:|[ьеÑ])|Июл(?:|[ьеÑ])|" + r"Ðвг(?:|уÑÑ‚|уÑÑ‚[еа])|Сен(?:|Ñ‚Ñбр[ьеÑ])|Окт(?:|Ñбр[ьеÑ])|" + r"ÐоÑ(?:|бр[ьеÑ])|Дек(?:|абр[ьеÑ]))\b" + r"((?:(?P<punct7>[ \t\-\./,]{0,2})(?P<day2>[0-3]?\d))" + r"(?:(?P<punct8>[ \t\-\./,]{1,2})(?P<year4>\d{4}|\d{2}))|" + r"(?:(?P<punct6>[ \t\-\./,]{0,2})(?P<year3>\d{4}|\d{2})))?" + r"(?<!\b(Янв|Фев|Мар|Ðпр|Май|Июн|Июл|Ðвг|Сен|Окт|ÐоÑ|Дек)\b))", + re.I, ) + def detect_dates_ru(text: str) -> List[Tuple[int, int, DateDetection]]: """ Detects Russian dates in the text. @@ -40,5 +40,5 @@ def detect_dates_ru(text: str) -> List[Tuple[int, int, DateDetection]]: for match in matches: format = _parse_date_to_format(match.groupdict()) dates.append((match.start(), match.end(), DateDetection(format))) - - return dates \ No newline at end of file + + return dates diff --git a/src/detectors/date/utils.py b/src/detectors/date/utils.py index 9dbf8f2f80ad2aa665b65af3986f1fe08a1c4269..5e0846e781e143197076edd300560b7adbe575fc 100644 --- a/src/detectors/date/utils.py +++ b/src/detectors/date/utils.py @@ -1,104 +1,165 @@ from typing import List, Tuple from src.detections import DateDetection, Optional + def _parse_day_or_month(re_entry) -> List[Tuple[int, int, DateDetection]]: assert re_entry["day_or_month_year"] is not None result = [] - + if re_entry["day_month1"] is not None: if len(re_entry["day_month1"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month1"])) + result.append( + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "0" + re_entry["day_month1"], + ) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"])) + result.append( + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month1"]) + ) result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) - + if len(re_entry["day_month2"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) + result.append( + ( + DateDetection.AnnotationPart.TWO_DIGIT_MONTH, + "0" + re_entry["day_month2"], + ) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) - + result.append( + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"]) + ) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) elif "day_month2" in re_entry: if len(re_entry["day_month2"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month2"])) + result.append( + ( + DateDetection.AnnotationPart.TWO_DIGIT_MONTH, + "0" + re_entry["day_month2"], + ) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"])) - + result.append( + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month2"]) + ) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct1"])) - + if "year1" in re_entry: if len(re_entry["year1"]) == 2: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year1"])) - else: - result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year1"])) + result.append( + (DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year1"]) + ) + else: + result.append( + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year1"]) + ) return result + def _parse_year_month_or_day(re_entry) -> List[Tuple[int, int, DateDetection]]: assert re_entry["year_month_or_day"] is not None result = [] - + if "year2" in re_entry: if len(re_entry["year2"]) == 2: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year2"])) - else: - result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year2"])) - + result.append( + (DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year2"]) + ) + else: + result.append( + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year2"]) + ) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct3"])) - + if "day_month3" in re_entry: if len(re_entry["day_month3"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day_month3"])) + result.append( + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "0" + re_entry["day_month3"], + ) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month3"])) + result.append( + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day_month3"]) + ) result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct4"])) - + if "day_month4" in re_entry: if len(re_entry["day_month4"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "0" + re_entry["day_month4"])) + result.append( + ( + DateDetection.AnnotationPart.TWO_DIGIT_MONTH, + "0" + re_entry["day_month4"], + ) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month4"])) - + result.append( + (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, re_entry["day_month4"]) + ) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct4"])) - + return result + def _parse_month_in_words(re_entry) -> List[Tuple[DateDetection.AnnotationPart, str]]: assert re_entry["month_in_words"] is not None result = [] - + if re_entry["day1"] is not None: if len(re_entry["day1"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day1"])) + result.append( + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day1"]) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day1"])) - + result.append( + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day1"]) + ) + result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct5"])) - + if re_entry["month"] is not None: result.append((DateDetection.AnnotationPart.TEXT_MONTH, re_entry["month"])) - + if re_entry["day1"] is None: result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct7"])) else: result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct6"])) - + if re_entry["day2"] is not None: if len(re_entry["day2"]) == 1: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day2"])) + result.append( + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "0" + re_entry["day2"]) + ) else: - result.append((DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day2"])) + result.append( + (DateDetection.AnnotationPart.TWO_DIGITS_DAY, re_entry["day2"]) + ) result.append((DateDetection.AnnotationPart.OTHER, re_entry["punct6"])) - + if re_entry["year3"] is not None: if len(re_entry["year3"]) == 2: - result.append((DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year3"])) - else: - result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year3"])) - + result.append( + (DateDetection.AnnotationPart.TWO_DIGIT_YEAR, re_entry["year3"]) + ) + else: + result.append( + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year3"]) + ) + return result -def _parse_date_to_format(re_entry) -> Optional[List[Tuple[DateDetection.AnnotationPart, str]]]: + +def _parse_date_to_format( + re_entry, +) -> Optional[List[Tuple[DateDetection.AnnotationPart, str]]]: if re_entry["day_or_month_year"] is not None: result = _parse_day_or_month(re_entry) elif re_entry["year_month_or_day"] is not None: @@ -107,5 +168,5 @@ def _parse_date_to_format(re_entry) -> Optional[List[Tuple[DateDetection.Annotat result = _parse_month_in_words(re_entry) else: result = None - - return result \ No newline at end of file + + return result diff --git a/src/detectors/email/__init__.py b/src/detectors/email/__init__.py index 524f295eb951e1e1949749daeeec17f78e31db52..5342f79add897f960219d4b6dc76dcab830cf8c6 100644 --- a/src/detectors/email/__init__.py +++ b/src/detectors/email/__init__.py @@ -1 +1 @@ -from src.detectors.email.email import EmailDetector \ No newline at end of file +from src.detectors.email.email import EmailDetector diff --git a/src/detectors/ner/__init__.py b/src/detectors/ner/__init__.py index aeb274727351ad300df16b94fb4d76a6aa5779d9..36250fbe2bae16d4adcbd065f0e93f2182208165 100644 --- a/src/detectors/ner/__init__.py +++ b/src/detectors/ner/__init__.py @@ -1 +1 @@ -from src.detectors.ner.ner import NerDetector \ No newline at end of file +from src.detectors.ner.ner import NerDetector diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py index f8fc1988ce6a0ed0bcb1b80a48b207573afc34f7..38e294b2da1f14a197632fe12dd0634cccccb69f 100644 --- a/src/detectors/ner/ner.py +++ b/src/detectors/ner/ner.py @@ -4,36 +4,39 @@ from src.detections import Detection, MorphosyntacticInfoMixin from src.annotations import Annotation, NerAnnotation, MorphosyntacticAnnotation from src.detections import DETECTION_CLASSES_MAP + class NerDetector(Detector): def __init__(self, detection_mapping: Dict[str, str], language: str = "pl") -> None: self._language = language self._detection_mapping = detection_mapping - + def detect( self, text: str, annotations: List[Tuple[int, int, Annotation]] ) -> List[Tuple[int, int, str]]: - + morpho_tags = dict() ner_detections = [] - + for annotation in annotations: start, end, annotation = annotation if isinstance(annotation, MorphosyntacticAnnotation): morpho_tags[(start, end)] = annotation.morphosyntactic_tag elif isinstance(annotation, NerAnnotation): ner_type = annotation.ner_type - + if ner_type in self._detection_mapping: - detection_class = DETECTION_CLASSES_MAP[self._detection_mapping[ner_type]] + detection_class = DETECTION_CLASSES_MAP[ + self._detection_mapping[ner_type] + ] ner_detections.append((start, end, detection_class)) - - result = [] + + result = [] for start, end, ner_detection in ner_detections: kwargs = dict() if issubclass(ner_detection, MorphosyntacticInfoMixin): if (start, end) in morpho_tags: kwargs["morpho_tag"] = morpho_tags[(start, end)] - - result.append((start, end, ner_detection(**kwargs))) - + + result.append((start, end, ner_detection(**kwargs))) + return result diff --git a/src/detectors/number/__init__.py b/src/detectors/number/__init__.py index 7d72f52460d34c7cc4475e531a8f29d6f60761fc..e72ac4677b6d8dafb613bbfa8bf2eef5a3d768d5 100644 --- a/src/detectors/number/__init__.py +++ b/src/detectors/number/__init__.py @@ -1 +1 @@ -from src.detectors.number.number import NumberDetector \ No newline at end of file +from src.detectors.number.number import NumberDetector diff --git a/src/detectors/number/number.py b/src/detectors/number/number.py index 3fbfa557b108148993032368b02072d8fa0181df..b55745c59afa836e5cfeae9abbf65e3020050002 100644 --- a/src/detectors/number/number.py +++ b/src/detectors/number/number.py @@ -8,6 +8,7 @@ NUMBER_REGEX = re.compile( re.I, ) + class NumberDetector(Detector): def __init__(self) -> None: super().__init__() @@ -17,9 +18,8 @@ class NumberDetector(Detector): ) -> List[Tuple[int, int, NumberDetection]]: NUMBER_REGEX.finditer(text) numbers = [] - + for number in numbers: numbers.append((number.start(), number.end(), NumberDetection())) - - return numbers \ No newline at end of file + return numbers diff --git a/src/detectors/phone/__init__.py b/src/detectors/phone/__init__.py index 4de9ace4cbbdb9517d6288b0766a172609c4da7e..acfec7713ad7d0d28d7d492774dd1461e937449a 100644 --- a/src/detectors/phone/__init__.py +++ b/src/detectors/phone/__init__.py @@ -1 +1 @@ -from src.detectors.phone.phone import PhoneNumberDetector \ No newline at end of file +from src.detectors.phone.phone import PhoneNumberDetector diff --git a/src/detectors/url/common.py b/src/detectors/url/common.py index 9d39241b467a147904f516fdc6f702d5603af558..cba0020f12cd3732c50c766f23360260b6a0cec3 100644 --- a/src/detectors/url/common.py +++ b/src/detectors/url/common.py @@ -1,24 +1,26 @@ import regex as re from typing import List, Tuple + def generate_url_regex(exeptions: List[str]) -> str: return re.compile( - r'\b(?:{})\b(*SKIP)(*FAIL)|'.format('|'.join(exeptions)) + - r'(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?' - r'(?P<auth>\S+(?::\S*)?@)?' - r'(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})' - r'(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})' - r'(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})' - r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])' - r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}' - r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))' - r'|' - r'((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?' - r'[a-z0-9\u00a1-\uffff]\.)+)' - r'(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)' - r'(?P<port>:\d{2,5})?' - r'(?P<path>[/?#]\S*)?)', - re.UNICODE | re.I + r"\b(?:{})\b(*SKIP)(*FAIL)|".format("|".join(exeptions)) + + r"(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?" + r"(?P<auth>\S+(?::\S*)?@)?" + r"(?P<host>(?!(?:10|127)(?:\.\d{1,3}){3})" + r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" + r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" + r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" + r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" + r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" + r"|" + r"((?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?" + r"[a-z0-9\u00a1-\uffff]\.)+)" + r"(?P<tld>[a-z\u00a1-\uffff]{2,}\.??)" + r"(?P<port>:\d{2,5})?" + r"(?P<path>[/?#]\S*)?)", + re.UNICODE | re.I, ) - -URL_REGEX_GENERAL = generate_url_regex([]) \ No newline at end of file + + +URL_REGEX_GENERAL = generate_url_regex([]) diff --git a/src/detectors/url/pl.py b/src/detectors/url/pl.py index 5d1a9edd62b640216f32c84d4aa59195058b2846..c4c790a6206a5787b736440e14a2b3823d05db3d 100644 --- a/src/detectors/url/pl.py +++ b/src/detectors/url/pl.py @@ -2,4 +2,4 @@ from .common import generate_url_regex PL_URL_REGEX_EXEPTIONS = ["m.in"] -URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS) \ No newline at end of file +URL_REGEX_PL = generate_url_regex(PL_URL_REGEX_EXEPTIONS) diff --git a/src/detectors/user/__init__.py b/src/detectors/user/__init__.py index 62039d3e1d7777eed6325f882038ae6734f68b69..fa43828ebdbd1c978b2e16db16ac6edda0170745 100644 --- a/src/detectors/user/__init__.py +++ b/src/detectors/user/__init__.py @@ -1 +1 @@ -from src.detectors.user.user import UserDetector \ No newline at end of file +from src.detectors.user.user import UserDetector diff --git a/src/dictionaries/morphosyntactic/__init__.py b/src/dictionaries/morphosyntactic/__init__.py index edce6740e7f1f08a266b935c1c18aa7f2b10d95e..d0142845628e39c70f2797e4bcb78b2cdb325f18 100644 --- a/src/dictionaries/morphosyntactic/__init__.py +++ b/src/dictionaries/morphosyntactic/__init__.py @@ -1 +1 @@ -from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary \ No newline at end of file +from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary diff --git a/src/dictionaries/morphosyntactic/interface.py b/src/dictionaries/morphosyntactic/interface.py index f8d9fa74bbc373dff16bbd8bc8f9dcb3d82fa545..2718189b72baec04831eeded2b4564d681c0046e 100644 --- a/src/dictionaries/morphosyntactic/interface.py +++ b/src/dictionaries/morphosyntactic/interface.py @@ -1,16 +1,16 @@ from src.detections import Detection from typing import Optional, List, Type + class MorphosyntacticDictionary: def get_supported_detection_classes(self) -> List[Type[Detection]]: """ Returns a list of supported detection classes """ raise NotImplementedError() - + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: """ Returns a random replacement for the original entry """ raise NotImplementedError() - \ No newline at end of file diff --git a/src/dictionaries/morphosyntactic/ner_file.py b/src/dictionaries/morphosyntactic/ner_file.py index fef454c285e5127178250a3bba2dddaab378b78e..90880b27066f55c0200d34580b391c0e228777f9 100644 --- a/src/dictionaries/morphosyntactic/ner_file.py +++ b/src/dictionaries/morphosyntactic/ner_file.py @@ -4,6 +4,7 @@ from src.detections import Detection, MorphosyntacticInfoMixin, DETECTION_CLASSE from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary import random + class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): def __init__( self, @@ -13,12 +14,10 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): super().__init__() self._dictionary = None self._always_replace = always_replace - + self._from_file(dictionary_path) - - def _from_file( - self, path_to_dictionary: str - ) -> None: + + def _from_file(self, path_to_dictionary: str) -> None: replacement_dictionary = defaultdict(lambda: defaultdict(dict)) with open(path_to_dictionary, "r", encoding="utf-8") as file: for line in file: @@ -32,10 +31,7 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): """ Returns a list of supported detection classes """ - return [ - DETECTION_CLASSES_MAP[name] - for name in self._dictionary.keys() - ] + return [DETECTION_CLASSES_MAP[name] for name in self._dictionary.keys()] def get_random_replacement(self, original_entry: Detection) -> Optional[str]: original_entry_type = type(original_entry) @@ -51,7 +47,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): and morpho_tag in self._dictionary[original_entry_type_name] ): result = random.choice( - list(self._dictionary[original_entry_type_name][morpho_tag].values()) + list( + self._dictionary[original_entry_type_name][morpho_tag].values() + ) ) if result is None and self._always_replace: @@ -62,4 +60,4 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): list(self._dictionary[random_type][random_tag].values()) ) - return result \ No newline at end of file + return result diff --git a/src/dictionaries/morphosyntactic/ner_file_nkjp.py b/src/dictionaries/morphosyntactic/ner_file_nkjp.py index 05bb0e1a087a194100a038bbc3eee12023df9b24..987128cb39c758774706a191a64769cbca1b7bd7 100644 --- a/src/dictionaries/morphosyntactic/ner_file_nkjp.py +++ b/src/dictionaries/morphosyntactic/ner_file_nkjp.py @@ -6,9 +6,11 @@ from src.dictionaries.morphosyntactic.ner_file import NERFileMorphosyntacticDict class NERFileNKJPMorphosyntacticDictionary(NERFileMorphosyntacticDictionary): - def __init__(self, dictionary_path: Optional[str] = None, always_replace=True) -> None: + def __init__( + self, dictionary_path: Optional[str] = None, always_replace=True + ) -> None: super().__init__(dictionary_path, always_replace) - + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: original_entry_type = type(original_entry) original_entry_type_name = original_entry_type.TYPE_NAME diff --git a/src/input_parsers/ccl.py b/src/input_parsers/ccl.py index 8983f109a290569b8b6e8869c0273b7a46852f03..81763269628b7c55a0a245bdb23446c18caeaee8 100644 --- a/src/input_parsers/ccl.py +++ b/src/input_parsers/ccl.py @@ -1,16 +1,18 @@ from typing import Dict, List, Tuple from lxml import etree from collections import defaultdict -# from src.annotation_types_old import + +# from src.annotation_types_old import from src.input_parsers.interface import InputParser from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation + class CCLInputParser(InputParser): def __init__( self, ) -> None: super().__init__() - + def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]: """Parse CCL string into text and annotations. @@ -21,55 +23,57 @@ class CCLInputParser(InputParser): Returns: Tuple[str, Dict[str, List[Tuple[int, int, Annotation]]]]: Text and annotations. - """ - ccl_tree = etree.fromstring(content.strip().encode('utf-8')) - + """ + ccl_tree = etree.fromstring(content.strip().encode("utf-8")) + results = [] text = "" - + ner_annotations = [] morphosyntactic_annotations = [] - + # First token is assumed to not have space before it last_was_ns = True - + tokens = ccl_tree.xpath("//ns | //tok") - for token in tokens: - if token.tag == 'tok': + for token in tokens: + if token.tag == "tok": if not last_was_ns: text += " " - word = token.xpath('./orth')[0].text + word = token.xpath("./orth")[0].text start = len(text) end = start + len(word) - for lex in token.xpath('./lex'): - if lex.attrib['disamb'] == "1": - ctag = lex.xpath('./ctag')[0] - morphosyntactic_annotations.append((start, end, MorphosyntacticAnnotation(ctag.text))) - + for lex in token.xpath("./lex"): + if lex.attrib["disamb"] == "1": + ctag = lex.xpath("./ctag")[0] + morphosyntactic_annotations.append( + (start, end, MorphosyntacticAnnotation(ctag.text)) + ) + break - - for ann in token.xpath('./ann'): + + for ann in token.xpath("./ann"): is_present = int(ann.text) == 1 if not is_present: continue - - channel = ann.attrib['chan'] - is_head = "head" in ann.attrib and ann.attrib['head'] == "1" - + + channel = ann.attrib["chan"] + is_head = "head" in ann.attrib and ann.attrib["head"] == "1" + if is_head: ner_annotations.append((start, end, NerAnnotation(channel))) - else: + else: old_start = ner_annotations[-1][0] - + ner_annotations[-1] = (old_start, end, ner_annotations[-1][2]) - + last_was_ns = False text += word - elif token.tag == 'ns': + elif token.tag == "ns": last_was_ns = True - + results = ner_annotations + morphosyntactic_annotations - - return text, results \ No newline at end of file + + return text, results diff --git a/src/input_parsers/interface.py b/src/input_parsers/interface.py index ca8224e4c4ebce02019dd76612002b4b44a52dbb..192fa2f80709997e5a6029a218192f2fc1567a04 100644 --- a/src/input_parsers/interface.py +++ b/src/input_parsers/interface.py @@ -1,5 +1,6 @@ from typing import Dict, List, Tuple, Any + class InputParser: def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Any]]]: """Parse input string into text and annotations. @@ -14,4 +15,3 @@ class InputParser: Tuple[str, Dict[str, List[Tuple[int, int, Any]]]]: Text and annotations. """ pass - diff --git a/src/input_parsers/wiktor_ner.py b/src/input_parsers/wiktor_ner.py index 4c1666865a4cd65050c91a04cd6197cab9f33c90..463f32f0a0815b63edd7ac5218ef7128c3f87bc3 100644 --- a/src/input_parsers/wiktor_ner.py +++ b/src/input_parsers/wiktor_ner.py @@ -2,14 +2,16 @@ from typing import Dict, List, Tuple from lxml import etree import json from collections import defaultdict -# from src.annotation_types_old import + +# from src.annotation_types_old import from src.input_parsers.interface import InputParser from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation + class WiktorNERInputParser(InputParser): def __init__(self) -> None: super().__init__() - + def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]: """Parse wiktorner file into text and annotations. @@ -22,32 +24,39 @@ class WiktorNERInputParser(InputParser): Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations. """ content_parsed = json.loads(content) - + if "text" in content_parsed: - text = content_parsed['text'] - else: + text = content_parsed["text"] + else: text = "" - + annotations = [] - + # Morphosyntactic annotations if "tokens" in content_parsed: - for token in content_parsed['tokens']: + for token in content_parsed["tokens"]: if "position" in token: - token_start, token_end = token['position'] + token_start, token_end = token["position"] if "lexemes" in token: - for lexeme in token['lexemes']: - if "disamb" in lexeme and lexeme['disamb'] == True: + for lexeme in token["lexemes"]: + if "disamb" in lexeme and lexeme["disamb"] == True: if "mstag" in lexeme: - annotations.append((token_start, token_end, MorphosyntacticAnnotation(lexeme['mstag']))) - + annotations.append( + ( + token_start, + token_end, + MorphosyntacticAnnotation(lexeme["mstag"]), + ) + ) + # NER annotations if "entities" in content_parsed: - for entity in content_parsed['entities']: + for entity in content_parsed["entities"]: if "positions" in entity: - entity_start, entity_end = entity['positions'] + entity_start, entity_end = entity["positions"] if "type" in entity: - annotations.append((entity_start, entity_end, NerAnnotation(entity['type']))) - - - return text, annotations \ No newline at end of file + annotations.append( + (entity_start, entity_end, NerAnnotation(entity["type"])) + ) + + return text, annotations diff --git a/src/pipeline/default.py b/src/pipeline/default.py index 33865f5738f699daa0fcdae62626da86477c4fe0..8850a28ce928c5b44d95d3248605cc7ab3da48c5 100644 --- a/src/pipeline/default.py +++ b/src/pipeline/default.py @@ -20,7 +20,7 @@ class DefaultPipeline(Pipeline): self._replacers = replacers def run(self, input) -> str: - with open(input, 'r') as f: + with open(input, "r") as f: content = f.read() parsed_input = self._input_parser.parse(content) diff --git a/src/pipeline/interface.py b/src/pipeline/interface.py index ebd8fc90674c4abcb722a9a6e9d7c5ade3b3aaba..ed87fd653c7b721f5a960b2da8bdd459ba212bb0 100644 --- a/src/pipeline/interface.py +++ b/src/pipeline/interface.py @@ -1,3 +1,3 @@ class Pipeline: def run(self, input) -> str: - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/src/pipeline/sequential_jsonl.py b/src/pipeline/sequential_jsonl.py index bb35e1d11f5283dbe5385c3aae95e58aa7266c0a..2bc796a706ce52de25cae2f72dc8988c20e14e0b 100644 --- a/src/pipeline/sequential_jsonl.py +++ b/src/pipeline/sequential_jsonl.py @@ -6,6 +6,7 @@ from src.replacers.interface import ReplacerInterface from src.input_parsers.interface import InputParser import json + class SequentialJSONLPipeline(Pipeline): def __init__( self, @@ -21,7 +22,7 @@ class SequentialJSONLPipeline(Pipeline): def run(self, input) -> str: result = [] - with open(input, 'r') as f: + with open(input, "r") as f: for line in f.readlines(): if line.strip() == "": continue @@ -29,7 +30,9 @@ class SequentialJSONLPipeline(Pipeline): detected_entities = [] for detector_name, detector in self._detectors.items(): - detected_entities += detector.detect(parsed_input[0], parsed_input[1]) + detected_entities += detector.detect( + parsed_input[0], parsed_input[1] + ) annotaitons_cleaned = self._suppressor.suppress(detected_entities) @@ -41,5 +44,5 @@ class SequentialJSONLPipeline(Pipeline): ) result.append({"text": replaced_input}) - + return "\n".join([json.dumps(item, ensure_ascii=False) for item in result]) diff --git a/src/replacers/__init__.py b/src/replacers/__init__.py index e652b43a5eb05b90771f255ef4595d079e7507b2..508b92d1344fa9b506cffe2a078ae9a40b953cc0 100644 --- a/src/replacers/__init__.py +++ b/src/replacers/__init__.py @@ -1,2 +1,2 @@ from src.replacers.interface import ReplacerInterface -from src.replacers.tag_replacer import TagReplacer \ No newline at end of file +from src.replacers.tag_replacer import TagReplacer diff --git a/src/replacers/date_replacer.py b/src/replacers/date_replacer.py index ecf09b5f4bd88b974c1851e273480a04cf655012..1ad698996f3a7212fadc94e75629f6c77c8bbf5a 100644 --- a/src/replacers/date_replacer.py +++ b/src/replacers/date_replacer.py @@ -23,19 +23,22 @@ months_map = { 12: "grudnia", } + class DateReplacer(ReplacerInterface): def __init__(self): pass - - def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: + + def replace( + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] - + already_replaced = dict() - + for item in detections: start, end, detection = item - + if isinstance(detection, DateDetection): replacement = [] if detection.format is not None: @@ -43,12 +46,12 @@ class DateReplacer(ReplacerInterface): else: format = [ (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), - (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), - (DateDetection.AnnotationPart.OTHER, "."), + (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), ] - + if text[start:end] in already_replaced: replacement = already_replaced[text[start:end]] else: @@ -77,12 +80,12 @@ class DateReplacer(ReplacerInterface): replacement.append(month_name) elif entry[0] == DateDetection.AnnotationPart.OTHER: replacement.append(entry[1]) - + replacement = "".join(replacement) already_replaced[text[start:end]] = replacement - + replacements.append((start, end, replacement)) else: not_processed.append(item) - - return replace_and_update(text, replacements, not_processed) \ No newline at end of file + + return replace_and_update(text, replacements, not_processed) diff --git a/src/replacers/delete_replacer.py b/src/replacers/delete_replacer.py index 9a0190848d68d26196a48258bfb03282d1b16f95..b51413bc5ebd41d0c167f9b7bb9beea6a8cbd357 100644 --- a/src/replacers/delete_replacer.py +++ b/src/replacers/delete_replacer.py @@ -11,10 +11,7 @@ class DeleteReplacer(ReplacerInterface): def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: - - result = [ - (start, end, "") - for start, end, _ in detections - ] - return replace(text, result), [] \ No newline at end of file + result = [(start, end, "") for start, end, _ in detections] + + return replace(text, result), [] diff --git a/src/replacers/email_replacer.py b/src/replacers/email_replacer.py index 104f53e63762d522ec92dfb85300a107ebf88f6e..cecba4d15af853536f878979e1aa31fd7864a1ff 100644 --- a/src/replacers/email_replacer.py +++ b/src/replacers/email_replacer.py @@ -8,31 +8,36 @@ from src.replacers.interface import ReplacerInterface import random import string + def random_char(char_num): - return ''.join(random.choice(string.ascii_letters) for _ in range(char_num)) + return "".join(random.choice(string.ascii_letters) for _ in range(char_num)) + def random_email(): - return random_char(7)+"@gmail.com" + return random_char(7) + "@gmail.com" + class EmailReplacer(ReplacerInterface): def __init__(self): pass - - def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: + + def replace( + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] - + already_replaced = dict() - + for item in detections: start, end, detection = item - + if isinstance(detection, EmailDetection): if text[start:end] not in already_replaced: already_replaced[text[start:end]] = random_email() - + replacements.append((start, end, already_replaced[text[start:end]])) else: not_processed.append(item) - - return replace_and_update(text, replacements, not_processed) \ No newline at end of file + + return replace_and_update(text, replacements, not_processed) diff --git a/src/replacers/interface.py b/src/replacers/interface.py index f4ed59f7d804b73771149256feba032613e1c8a4..e2621c92aeb7ae8e1833fef621b1db8d22a6e4c0 100644 --- a/src/replacers/interface.py +++ b/src/replacers/interface.py @@ -15,8 +15,8 @@ class ReplacerInterface(ABC): detections (List[Tuple[int, int, str]]): List of detections. Returns: - Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities - replaced with anonimized version and list of detections that were + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were not processed by this replacer. """ pass diff --git a/src/replacers/ner_replacer.py b/src/replacers/ner_replacer.py index 6804451d53d9d20cd920e0261d954f40fa572964..56aee0d6ec2fca2ecadbf2de65806dfc50669fa5 100644 --- a/src/replacers/ner_replacer.py +++ b/src/replacers/ner_replacer.py @@ -23,15 +23,15 @@ class NERReplacer(ReplacerInterface): if type(item[2]) not in self._dictionary.get_supported_detection_classes(): not_processed.append(item) continue - + start, end, detection = item - + key = (text[start:end], type(detection)) - + if key not in already_replaced: replacement = self._dictionary.get_random_replacement(detection) already_replaced[key] = replacement - + if already_replaced[key] is None: not_processed.append(item) else: diff --git a/src/replacers/number_replacer.py b/src/replacers/number_replacer.py index d0c8f9e295c40b63b634aeb379c6ea33f8fd2710..14eaa25546c8aabed205be71c8e57b62da7c9b2d 100644 --- a/src/replacers/number_replacer.py +++ b/src/replacers/number_replacer.py @@ -8,25 +8,29 @@ from src.replacers.interface import ReplacerInterface import random import string + def randomize_digits_in_text(text: str) -> str: result = "" - + for c in text: if c.isdigit(): result += random.choice(string.digits) else: result += c - + return result + class NumberReplacer(ReplacerInterface): def __init__(self): pass - - def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: + + def replace( + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] - + already_replaced = dict() for item in detections: @@ -34,10 +38,12 @@ class NumberReplacer(ReplacerInterface): if isinstance(detection, NumberDetection): if text[start:end] not in already_replaced: - already_replaced[text[start:end]] = randomize_digits_in_text(text[start:end]) - + already_replaced[text[start:end]] = randomize_digits_in_text( + text[start:end] + ) + replacements.append((start, end, already_replaced[text[start:end]])) else: not_processed.append(item) - - return replace_and_update(text, replacements, not_processed) \ No newline at end of file + + return replace_and_update(text, replacements, not_processed) diff --git a/src/replacers/tag_replacer.py b/src/replacers/tag_replacer.py index 1f8d8987492dd4d728ca985302135902cd3e1371..f27745559563418c453e9e279e96138a021a0345 100644 --- a/src/replacers/tag_replacer.py +++ b/src/replacers/tag_replacer.py @@ -38,10 +38,10 @@ class TagReplacer(ReplacerInterface): def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: - + result = [ (start, end, self.tags_map.get(type(entity_type), "[OTHER]")) for start, end, entity_type in detections ] - return replace(text, result), [] \ No newline at end of file + return replace(text, result), [] diff --git a/src/replacers/user_replacer.py b/src/replacers/user_replacer.py index 87fb931bac9c7ed8abdbbe1b3c6c7a0470dec6b5..97206d8a127ddacf697325234e3ab4ed43da450f 100644 --- a/src/replacers/user_replacer.py +++ b/src/replacers/user_replacer.py @@ -7,26 +7,29 @@ from src.string_replacements import replace_and_update from src.replacers.interface import ReplacerInterface from random_username.generate import generate_username + class UserReplacer(ReplacerInterface): def __init__(self): pass - - def replace(self, text: str, detections: List[Tuple[int, int, Detection]]) -> Tuple[str, List[Tuple[int, int, Detection]]]: + + def replace( + self, text: str, detections: List[Tuple[int, int, Detection]] + ) -> Tuple[str, List[Tuple[int, int, Detection]]]: replacements = [] not_processed = [] - + already_replaced = dict() - + for item in detections: start, end, detection = item - + if isinstance(detection, UserDetection): if text[start:end] not in already_replaced: username = "@" + generate_username(1)[0] already_replaced[text[start:end]] = username - + replacements.append((start, end, already_replaced[text[start:end]])) else: not_processed.append(item) - - return replace_and_update(text, replacements, not_processed) \ No newline at end of file + + return replace_and_update(text, replacements, not_processed) diff --git a/src/string_replacements.py b/src/string_replacements.py index 528b5b54d895c2fcaee9176976e915c3820e517a..5b5f584a5e336a59000339f4be5c103d10084653 100644 --- a/src/string_replacements.py +++ b/src/string_replacements.py @@ -4,7 +4,7 @@ from typing import List, Tuple, Any, TypeVar def replace(original_string: str, replacements: List[Tuple[int, int, str]]) -> str: """ Replaces substrings in a string. - + !!! Important: This function assumes that there are no overlapping annotations. Parameters @@ -42,8 +42,8 @@ def replace_and_update( replacements: List[Tuple[int, int, str]], other_annotations: List[Tuple[int, int, _T]], ) -> Tuple[str, List[Tuple[int, int, _T]]]: - """ Replaces substrings in a string and updates other annotations to match new string. - + """Replaces substrings in a string and updates other annotations to match new string. + !!! Important: This function assumes that there are no overlapping annotations. Parameters @@ -60,21 +60,23 @@ def replace_and_update( Tuple[str, List[Tuple[int, int, Any]]] The string with replacements applied and other annotations with new positions. """ - + joined_list = [] for replacement in replacements: joined_list.append((replacement[0], replacement[1], replacement[2], True)) for other_annotation in other_annotations: - joined_list.append((other_annotation[0], other_annotation[1], other_annotation[2], False)) + joined_list.append( + (other_annotation[0], other_annotation[1], other_annotation[2], False) + ) annotations = sorted(joined_list, key=lambda x: x[0]) - + new_other_annotations = [] delta = 0 for annotation in annotations: is_replacement = annotation[3] - + if is_replacement: original_string = ( original_string[: annotation[0] + delta] @@ -83,6 +85,8 @@ def replace_and_update( ) delta += len(annotation[2]) - (annotation[1] - annotation[0]) else: - new_other_annotations.append((annotation[0] + delta, annotation[1] + delta, annotation[2])) + new_other_annotations.append( + (annotation[0] + delta, annotation[1] + delta, annotation[2]) + ) - return original_string, new_other_annotations \ No newline at end of file + return original_string, new_other_annotations diff --git a/src/suppressors/__init__.py b/src/suppressors/__init__.py index e9cc16fb2c596e511388d7cee8698c89f1f44989..4d23afe3fdc385ae84a7a26987be4f8777a0a96a 100644 --- a/src/suppressors/__init__.py +++ b/src/suppressors/__init__.py @@ -1 +1 @@ -from src.suppressors.order_based import suppress_order_based \ No newline at end of file +from src.suppressors.order_based import suppress_order_based diff --git a/src/suppressors/interface.py b/src/suppressors/interface.py index 7fbf543b9847883b1342ad6cb154ac97a9f6530c..565c6ac3a3ec8cd68045497158fa4ac052f46361 100644 --- a/src/suppressors/interface.py +++ b/src/suppressors/interface.py @@ -1,7 +1,10 @@ from typing import List, Tuple, Any + class Suppressor: - def suppress(self, annotations: List[Tuple[int, int, Any]]) -> List[Tuple[int, int, Any]]: + def suppress( + self, annotations: List[Tuple[int, int, Any]] + ) -> List[Tuple[int, int, Any]]: """Suppresses annotations on overlappment. Args: @@ -12,4 +15,4 @@ class Suppressor: annotations removed. """ - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/src/suppressors/order_based.py b/src/suppressors/order_based.py index 2be556940b0517c521354f59cf41b56c8040c989..1a6438f7e7b907711348cfb11aed3d63d5135040 100644 --- a/src/suppressors/order_based.py +++ b/src/suppressors/order_based.py @@ -2,14 +2,20 @@ from typing import List, Tuple, Dict, Any from bitarray import bitarray from src.suppressors.interface import Suppressor + class OrderBasedSuppressor(Suppressor): def __init__(self) -> None: super().__init__() - - def suppress(self, annotations: List[Tuple[int, int, Any]]) -> List[Tuple[int, int, Any]]: + + def suppress( + self, annotations: List[Tuple[int, int, Any]] + ) -> List[Tuple[int, int, Any]]: return suppress_order_based(annotations) -def suppress_order_based(annotations: List[Tuple[int, int, Any]]) -> List[Tuple[int, int, Any]]: + +def suppress_order_based( + annotations: List[Tuple[int, int, Any]] +) -> List[Tuple[int, int, Any]]: """If two annotations overlap, the first one int the list is kept. Args: @@ -22,17 +28,17 @@ def suppress_order_based(annotations: List[Tuple[int, int, Any]]) -> List[Tuple[ """ if len(annotations) == 0: return annotations - + annotations = annotations bitarray_size = max([end for _, end, _ in annotations]) bitarray_ = bitarray(bitarray_size) bitarray_.setall(False) - + result = [] - + for start, end, entity_type in annotations: if not bitarray_[start:end].any(): bitarray_[start:end] = True result.append((start, end, entity_type)) - + return result diff --git a/src/utils/subclasses.py b/src/utils/subclasses.py index 7a7f580a3587e1e5e613a911350a837ed3fe2e89..8f28e81f0fa892ead81ffefd59c93a2374ef85c5 100644 --- a/src/utils/subclasses.py +++ b/src/utils/subclasses.py @@ -3,4 +3,4 @@ def get_sublcasses(cls): for subclass in cls.__subclasses__(): subclasses.append(subclass) subclasses.extend(get_sublcasses(subclass)) - return subclasses \ No newline at end of file + return subclasses diff --git a/src/worker.py b/src/worker.py index d7b27d770aaf6a3a40e216291cb2be4600de5166..aedf29ca64c3b8aa5c1578d77460e83500de32ab 100644 --- a/src/worker.py +++ b/src/worker.py @@ -10,26 +10,27 @@ _log = logging.getLogger(__name__) class Worker(nlp_ws.NLPWorker): """Implements nlp_worker for anonymizer service.""" + def __init__(self) -> None: self._last_config = None self._pipeline = None super().__init__() - + def _prepare_pipeline(self, task_options): - language = task_options.get('language', 'pl') - replace_method = task_options.get('method', 'tag') - + language = task_options.get("language", "pl") + replace_method = task_options.get("method", "tag") + overrides = [ "language=" + language, "replacers=" + replace_method, ] - + config_hash = hash(tuple(overrides)) if self._last_config != config_hash: with initialize(config_path="./config"): cfg = compose(config_name="config", overrides=overrides) self._pipeline = instantiate(cfg["pipeline"]) - + return self._pipeline def process(self, input_file, task_options, output_file): @@ -44,7 +45,7 @@ class Worker(nlp_ws.NLPWorker): language - 'pl' - language of the input text. As of now only Polish is supported. """ pipeline = self._prepare_pipeline(task_options) - - with open(output_file, 'w', encoding='utf-8') as f: + + with open(output_file, "w", encoding="utf-8") as f: result = pipeline.run(input_file) f.write(result) diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py index ee62ea8fef7664928fc4c55351ae0b93c88fc485..07d4bbdb077a6c92ddd1b8e1c1383bfcbaa1720b 100644 --- a/tests/detectors/date/test_en.py +++ b/tests/detectors/date/test_en.py @@ -1,30 +1,40 @@ from src.detections import DateDetection from src.detectors.date.date import DateDetector + def test_detect_dates_en(): detector = DateDetector("en") - + # Check en-us text = "On 1.01.2022, I sold my cat. On April 5, 2021, I bought a dog." found_dates = detector.detect(text, dict()) - + format_date1 = [ - (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "01", + ), # Only supports two digits for now (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), (DateDetection.AnnotationPart.OTHER, "."), - (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022"), ] - + format_date2 = [ (DateDetection.AnnotationPart.TEXT_MONTH, "April"), (DateDetection.AnnotationPart.OTHER, " "), - (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "05", + ), # Only supports two digits for now (DateDetection.AnnotationPart.OTHER, ", "), (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - assert found_dates == [(3, 12, DateDetection(format_date1)), (32, 45, DateDetection(format_date2))] + assert found_dates == [ + (3, 12, DateDetection(format_date1)), + (32, 45, DateDetection(format_date2)), + ] # Check en-gb # TODO: Following test fails. Fix it. diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py index bfe159af91b7cc0f71953d20ca4f3810a00d1b09..9ddcc586e3effec1217f90b910e56b9730581664 100644 --- a/tests/detectors/date/test_pl.py +++ b/tests/detectors/date/test_pl.py @@ -1,6 +1,7 @@ from src.detections import DateDetection from src.detectors.date.date import DateDetector + def test_detect_dates_pl(): detector = DateDetector("pl") @@ -8,19 +9,28 @@ def test_detect_dates_pl(): found_dates = detector.detect(text, dict()) format_date1 = [ - (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "01", + ), # Only supports two digits for now (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), (DateDetection.AnnotationPart.OTHER, "."), - (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022"), ] - + format_date2 = [ - (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), # Only supports two digits for now + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "05", + ), # Only supports two digits for now (DateDetection.AnnotationPart.OTHER, " "), (DateDetection.AnnotationPart.TEXT_MONTH, "kwietnia"), (DateDetection.AnnotationPart.OTHER, " "), (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - - assert found_dates == [(7, 16, DateDetection(format_date1)), (34, 49, DateDetection(format_date2))] \ No newline at end of file + + assert found_dates == [ + (7, 16, DateDetection(format_date1)), + (34, 49, DateDetection(format_date2)), + ] diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py index 72a9f89ca76e59c41d8109b7558bb921f2588b4c..1ae3d0bee158df4438c6f52283500417cb86efd1 100644 --- a/tests/detectors/date/test_ru.py +++ b/tests/detectors/date/test_ru.py @@ -4,24 +4,33 @@ from src.detectors.date.date import DateDetector def test_detect_dates_pl(): detector = DateDetector("ru") - + text = "1.01.2022 Ñ Ð¿Ñ€Ð¾Ð´Ð°Ð» кошку. 5 Ð°Ð¿Ñ€ÐµÐ»Ñ 2021 Ñ ÐºÑƒÐ¿Ð¸Ð» Ñобаку." found_dates = detector.detect(text, dict()) - + format_date1 = [ - (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), # Only supports two digits for now + ( + DateDetection.AnnotationPart.TWO_DIGITS_DAY, + "01", + ), # Only supports two digits for now (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), (DateDetection.AnnotationPart.OTHER, "."), - (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022") + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2022"), ] - + format_date2 = [ (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "05"), (DateDetection.AnnotationPart.OTHER, " "), - (DateDetection.AnnotationPart.TEXT_MONTH, "апрелÑ"), # Only supports two digits for now + ( + DateDetection.AnnotationPart.TEXT_MONTH, + "апрелÑ", + ), # Only supports two digits for now (DateDetection.AnnotationPart.OTHER, " "), (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2021"), ] - - assert found_dates == [(0, 9, DateDetection(format_date1)), (26, 39, DateDetection(format_date2))] + + assert found_dates == [ + (0, 9, DateDetection(format_date1)), + (26, 39, DateDetection(format_date2)), + ] diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py index 982d983b20044e53876c4e866d700787aedb343e..7f61359dbe9af62fd1b3f16d8bc586756b1c0a9f 100644 --- a/tests/detectors/email/test_email.py +++ b/tests/detectors/email/test_email.py @@ -1,10 +1,13 @@ from src.detections import EmailDetection from src.detectors.email import EmailDetector + def test_detect_emails(): detector = EmailDetector() - - text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" + + text = ( + "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" + ) found_emails = detector.detect(text, dict()) - - assert found_emails == [(12, 30, EmailDetection()), (53, 78, EmailDetection())] \ No newline at end of file + + assert found_emails == [(12, 30, EmailDetection()), (53, 78, EmailDetection())] diff --git a/tests/detectors/ner/test_ner.py b/tests/detectors/ner/test_ner.py index 28b9bf3e62c03a2a08701b4fa3f8e3b8e1e33ff1..33a4329b9bcd993bc558f79d04d772c0977ea219 100644 --- a/tests/detectors/ner/test_ner.py +++ b/tests/detectors/ner/test_ner.py @@ -2,13 +2,16 @@ from src.annotations import NerAnnotation, MorphosyntacticAnnotation from src.detections import NameDetection, SurnameDetection, CityDetection from src.detectors.ner import NerDetector + def test_ner_detector(): - detector = NerDetector(detection_mapping={ - "person_first_nam": "name", - "person_last_nam": "surname", - "city_nam": "city", - }) - + detector = NerDetector( + detection_mapping={ + "person_first_nam": "name", + "person_last_nam": "surname", + "city_nam": "city", + } + ) + annotations = [ (10, 16, NerAnnotation("person_first_nam")), (100, 109, NerAnnotation("person_first_nam")), @@ -20,14 +23,14 @@ def test_ner_detector(): (30, 35, MorphosyntacticAnnotation("3")), (120, 124, MorphosyntacticAnnotation("some_other_morphosyntactic_annotation")), ] - + result = detector.detect("", annotations) - + expected = [ - (10, 16, NameDetection(morpho_tag="1")), + (10, 16, NameDetection(morpho_tag="1")), (100, 109, NameDetection(morpho_tag="2")), (30, 35, SurnameDetection(morpho_tag="3")), (50, 59, CityDetection(morpho_tag=None)), ] - - assert set(result) == set(expected) \ No newline at end of file + + assert set(result) == set(expected) diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py index 5ada3971eb399d2aceedbd30f1e14edbebb4cbd8..2e89a55a35516c305c338d44f91bd36677104d8b 100644 --- a/tests/detectors/phone/test_phone.py +++ b/tests/detectors/phone/test_phone.py @@ -1,10 +1,14 @@ from src.detections import PhoneNumberDetection from src.detectors.phone import PhoneNumberDetector + def test_detect_phone_numbers(): detector = PhoneNumberDetector() - + text = "My phone number is +48 123 456 789. My friend's number is 123456789." found_phone_numbers = detector.detect(text, dict()) - - assert found_phone_numbers == [(19, 34, PhoneNumberDetection()), (58, 67, PhoneNumberDetection())] \ No newline at end of file + + assert found_phone_numbers == [ + (19, 34, PhoneNumberDetection()), + (58, 67, PhoneNumberDetection()), + ] diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py index 4e5c02f369cccefc622f072142a47d8487b11bb4..c47071e437db9c0f6a8459f4664fac140915237f 100644 --- a/tests/detectors/url/test_url.py +++ b/tests/detectors/url/test_url.py @@ -1,22 +1,26 @@ from src.detectors.url import UrlDetector from src.detections import UrlDetection + def test_detect_urls(): detector = UrlDetector("en") - - text = "This is a test for www.google.com. Make sure to go to https://www.google.com" + + text = ( + "This is a test for www.google.com. Make sure to go to https://www.google.com" + ) found_urls = detector.detect(text, dict()) - + assert found_urls == [(19, 33, UrlDetection()), (54, 76, UrlDetection())] - + + def test_detect_urls_pl(): detector_en = UrlDetector("en") detector_pl = UrlDetector("pl") - - text = "m.in. https://www.google.com" + + text = "m.in. https://www.google.com" found_urls_pl = detector_pl.detect(text, dict()) found_urls_en = detector_en.detect(text, dict()) - + # m.in is a valid shortcut for miÄ™dzy innymi in Polish. It should not be detected as a URL. assert found_urls_pl == [(6, 28, UrlDetection())] - assert found_urls_en == [(0, 4, UrlDetection()), (6, 28, UrlDetection())] \ No newline at end of file + assert found_urls_en == [(0, 4, UrlDetection()), (6, 28, UrlDetection())] diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py index c1b8bc33a316dd912a3dfee9f2bbc286b5cae91d..56220d6623a927bdaebb5f6d2d733326a0191722 100644 --- a/tests/detectors/user/test_user.py +++ b/tests/detectors/user/test_user.py @@ -1,10 +1,11 @@ from src.detectors.user import UserDetector from src.detections import UserDetection + def test_detect_users(): detector = UserDetector() - + text = "My username is @john_smith. My friend's username is @jane_doe." found_users = detector.detect(text, dict()) - - assert found_users == [(15, 26, UserDetection()), (52, 61, UserDetection())] \ No newline at end of file + + assert found_users == [(15, 26, UserDetection()), (52, 61, UserDetection())] diff --git a/tests/dictionaries/morphosyntactic/test_ner_file.py b/tests/dictionaries/morphosyntactic/test_ner_file.py index c4e20707ed0fa985176f3733fa9a7a81f67db34a..abe02cb18a83e321d6335680f6543c933a84a84c 100644 --- a/tests/dictionaries/morphosyntactic/test_ner_file.py +++ b/tests/dictionaries/morphosyntactic/test_ner_file.py @@ -2,26 +2,36 @@ from src.dictionaries.morphosyntactic.ner_file import NERFileMorphosyntacticDict from src.detections import NameDetection, CityDetection, SurnameDetection from tempfile import NamedTemporaryFile + def test_ner_file_morphosyntactic_dictionary(): with NamedTemporaryFile() as file: - file.writelines([ - "name Andrzejowi Andrzej a\n".encode("utf-8"), - "name Andrzej Andrzej b\n".encode("utf-8"), - "name Kasia Kasia c\n".encode("utf-8"), - "city WrocÅ‚aw WrocÅ‚aw d\n".encode("utf-8"), - "city Warszawa Warszawa c\n".encode("utf-8"), - "city Kraków Kraków d\n".encode("utf-8") - ]) + file.writelines( + [ + "name Andrzejowi Andrzej a\n".encode("utf-8"), + "name Andrzej Andrzej b\n".encode("utf-8"), + "name Kasia Kasia c\n".encode("utf-8"), + "city WrocÅ‚aw WrocÅ‚aw d\n".encode("utf-8"), + "city Warszawa Warszawa c\n".encode("utf-8"), + "city Kraków Kraków d\n".encode("utf-8"), + ] + ) file.flush() - + dictionary = NERFileMorphosyntacticDictionary(file.name) - + example_name_1 = NameDetection(morpho_tag="a") example_name_2 = NameDetection(morpho_tag="b") example_other = SurnameDetection(morpho_tag="c") - + assert dictionary.get_random_replacement(example_name_1) == "Andrzejowi" assert dictionary.get_random_replacement(example_name_2) == "Andrzej" - + # If no good replacement is present, it should choose a random one - assert dictionary.get_random_replacement(example_other) in ["Andrzejowi" ,"Andrzej", "Kasia", "WrocÅ‚aw", "Warszawa", "Kraków"] \ No newline at end of file + assert dictionary.get_random_replacement(example_other) in [ + "Andrzejowi", + "Andrzej", + "Kasia", + "WrocÅ‚aw", + "Warszawa", + "Kraków", + ] diff --git a/tests/dictionaries/morphosyntactic/test_ner_file_nkjp.py b/tests/dictionaries/morphosyntactic/test_ner_file_nkjp.py index 799a30a5b00adcbdeb78e83d2aa24763b23a8e2f..f886b552bb33f498c9e94303bfe6f792ee6c1ab8 100644 --- a/tests/dictionaries/morphosyntactic/test_ner_file_nkjp.py +++ b/tests/dictionaries/morphosyntactic/test_ner_file_nkjp.py @@ -1,27 +1,39 @@ -from src.dictionaries.morphosyntactic.ner_file_nkjp import NERFileNKJPMorphosyntacticDictionary +from src.dictionaries.morphosyntactic.ner_file_nkjp import ( + NERFileNKJPMorphosyntacticDictionary, +) from src.detections import NameDetection, CityDetection, SurnameDetection from tempfile import NamedTemporaryFile + def test_ner_file_nkjp_morphosyntactic_dictionary(): with NamedTemporaryFile() as file: - file.writelines([ - "name Andrzejowi Andrzej sg:dat:m1\n".encode("utf-8"), - "name Andrzej Andrzej sg:m1:imperf\n".encode("utf-8"), - "name Kasia Kasia sg:f:imperf\n".encode("utf-8"), - "city WrocÅ‚aw WrocÅ‚aw sg:m2:imperf\n".encode("utf-8"), - "city Warszawa Warszawa sg:f:imperf\n".encode("utf-8"), - "city Kraków Kraków sg:m2:imperf\n".encode("utf-8") - ]) + file.writelines( + [ + "name Andrzejowi Andrzej sg:dat:m1\n".encode("utf-8"), + "name Andrzej Andrzej sg:m1:imperf\n".encode("utf-8"), + "name Kasia Kasia sg:f:imperf\n".encode("utf-8"), + "city WrocÅ‚aw WrocÅ‚aw sg:m2:imperf\n".encode("utf-8"), + "city Warszawa Warszawa sg:f:imperf\n".encode("utf-8"), + "city Kraków Kraków sg:m2:imperf\n".encode("utf-8"), + ] + ) file.flush() - + dictionary = NERFileNKJPMorphosyntacticDictionary(file.name) - + example_name_1 = NameDetection(morpho_tag="subst:sg:dat:m1") example_name_2 = NameDetection(morpho_tag="subst:sg:m1:imperf") example_other = SurnameDetection(morpho_tag="subst:sg:m1:imperf") - + assert dictionary.get_random_replacement(example_name_1) == "Andrzejowi" assert dictionary.get_random_replacement(example_name_2) in ["Andrzej", "Kasia"] - + # If no good replacement is present, it should choose a random one - assert dictionary.get_random_replacement(example_other) in ["Andrzejowi" ,"Andrzej", "Kasia", "WrocÅ‚aw", "Warszawa", "Kraków"] \ No newline at end of file + assert dictionary.get_random_replacement(example_other) in [ + "Andrzejowi", + "Andrzej", + "Kasia", + "WrocÅ‚aw", + "Warszawa", + "Kraków", + ] diff --git a/tests/input_parsers/test_ccl.py b/tests/input_parsers/test_ccl.py index 53021da885b9e0ee3961994bdb51487da42a40b6..80cc4f2f3e78cd37345a2c85bbe7bf0666f0b323 100644 --- a/tests/input_parsers/test_ccl.py +++ b/tests/input_parsers/test_ccl.py @@ -51,20 +51,21 @@ example_ccl = """<?xml version="1.0" encoding="UTF-8"?> </chunkList> """ + def test_ccl_input_parser(): parser = CCLInputParser() - + text, annotations = parser.parse(example_ccl) assert text == "Marek Kowalski pojechaÅ‚ do WrocÅ‚awia." assert len(annotations) == 8 - + assert (0, 14, NerAnnotation("nam_liv")) in annotations assert (27, 36, NerAnnotation("nam_loc")) in annotations - + assert (0, 5, MorphosyntacticAnnotation("subst:sg:nom:m1")) in annotations assert (6, 14, MorphosyntacticAnnotation("subst:sg:nom:m1")) in annotations assert (15, 23, MorphosyntacticAnnotation("praet:sg:m1:perf")) in annotations assert (24, 26, MorphosyntacticAnnotation("prep:gen")) in annotations assert (27, 36, MorphosyntacticAnnotation("subst:sg:gen:m3")) in annotations - assert (36, 37, MorphosyntacticAnnotation("interp")) in annotations \ No newline at end of file + assert (36, 37, MorphosyntacticAnnotation("interp")) in annotations diff --git a/tests/input_parsers/test_wiktor_ner.py b/tests/input_parsers/test_wiktor_ner.py index 271227c7cc47cd8b7650a523188e471e7506d57b..25e928b4dfc4010dbbb5446da7bd3ae24a5e5f11 100644 --- a/tests/input_parsers/test_wiktor_ner.py +++ b/tests/input_parsers/test_wiktor_ner.py @@ -95,20 +95,21 @@ example_json = """{ ] }""" + def test_wiktor_ner_input_parser(): parser = WiktorNERInputParser() - + text, annotations = parser.parse(example_json) assert text == "Marek Kowalski pojechaÅ‚ do WrocÅ‚awia." assert len(annotations) == 8 - + assert (0, 14, NerAnnotation("nam_liv")) in annotations assert (27, 36, NerAnnotation("nam_loc")) in annotations - + assert (0, 5, MorphosyntacticAnnotation("subst:sg:nom:m1")) in annotations assert (6, 14, MorphosyntacticAnnotation("subst:sg:nom:m1")) in annotations assert (15, 23, MorphosyntacticAnnotation("praet:sg:m1:perf")) in annotations assert (24, 26, MorphosyntacticAnnotation("prep:gen")) in annotations assert (27, 36, MorphosyntacticAnnotation("subst:sg:gen:m3")) in annotations - assert (36, 37, MorphosyntacticAnnotation("interp")) in annotations \ No newline at end of file + assert (36, 37, MorphosyntacticAnnotation("interp")) in annotations diff --git a/tests/pipeline/test_default.py b/tests/pipeline/test_default.py index a524eefca06bc5aa7bc8199372a6f32c8511c34c..9131792659d08863627d692740fbda104513d825 100644 --- a/tests/pipeline/test_default.py +++ b/tests/pipeline/test_default.py @@ -6,30 +6,35 @@ from src.suppressors.interface import Suppressor from src.replacers.interface import ReplacerInterface from tempfile import NamedTemporaryFile + class MockInputParser(InputParser): def parse(self, content): return "ala ma kota", {} - + + class MockDetector(Detector): def detect(self, text, annotations): return [(0, 3, NameDetection())] - + + class MockSuppressor(Suppressor): def suppress(self, annotations): return annotations - + + class MockReplacer(ReplacerInterface): def replace(self, text, annotations): return "zbigniew ma kota", annotations + def test_default_pipeline(): # TODO: Prepare mocks that will better test the pipeline pipeline = DefaultPipeline( MockInputParser(), {"mock_detector": MockDetector()}, MockSuppressor(), - {"mock_replacer": MockReplacer()} + {"mock_replacer": MockReplacer()}, ) - + with NamedTemporaryFile() as f: - assert pipeline.run(f.name) == "zbigniew ma kota" \ No newline at end of file + assert pipeline.run(f.name) == "zbigniew ma kota" diff --git a/tests/pipeline/test_sequential_jsonl.py b/tests/pipeline/test_sequential_jsonl.py index e94b335beb14cce9fa1a63e47f9ad611be7beb43..ddbf45f73024398ddc65faaae94a3455c367d22f 100644 --- a/tests/pipeline/test_sequential_jsonl.py +++ b/tests/pipeline/test_sequential_jsonl.py @@ -6,34 +6,39 @@ from src.suppressors.interface import Suppressor from src.replacers.interface import ReplacerInterface from tempfile import NamedTemporaryFile + class MockInputParser(InputParser): def parse(self, content): return "ala ma kota", {} - + + class MockDetector(Detector): def detect(self, text, annotations): return [(0, 3, NameDetection())] - + + class MockSuppressor(Suppressor): def suppress(self, annotations): return annotations - + + class MockReplacer(ReplacerInterface): def replace(self, text, annotations): return "zbigniew ma kota", annotations + def test_sequential_jsonl_pipeline(): # TODO: Prepare mocks that will better test the pipeline pipeline = SequentialJSONLPipeline( MockInputParser(), {"mock_detector": MockDetector()}, MockSuppressor(), - {"mock_replacer": MockReplacer()} + {"mock_replacer": MockReplacer()}, ) - + with NamedTemporaryFile() as f: f.write(b'{"text": "ala ma kota"}\n{"text": "ala ma kota"}') f.flush() - result = pipeline.run(f.name) - - assert result == '{"text": "zbigniew ma kota"}\n{"text": "zbigniew ma kota"}' \ No newline at end of file + result = pipeline.run(f.name) + + assert result == '{"text": "zbigniew ma kota"}\n{"text": "zbigniew ma kota"}' diff --git a/tests/replacers/test_date_replacer.py b/tests/replacers/test_date_replacer.py index f647f6e6a9a5cde29548a4202d4ec805f8f5c230..08117016b1b1569946a559e2d99cb70606d1fe8e 100644 --- a/tests/replacers/test_date_replacer.py +++ b/tests/replacers/test_date_replacer.py @@ -1,20 +1,20 @@ - from src.replacers.date_replacer import DateReplacer from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection + def test_date_replacer(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameDetection()), + (0, 3, NameDetection()), (4, 14, SurnameDetection()), (28, 38, DateDetection()), (42, 51, CityDetection()), ] - + replacer = DateReplacer() - + result = replacer.replace(text, detections) - + expected_text_beggining = "Ala Brzeszczot urodziÅ‚a sie " expected_text_ending = " we WrocÅ‚awiu" exptected_detections_left = [ @@ -22,11 +22,12 @@ def test_date_replacer(): (4, 14, SurnameDetection()), (len(result[0]) - 9, len(result[0]), CityDetection()), ] - + assert result[0].startswith(expected_text_beggining) assert result[0].endswith(expected_text_ending) assert result[1] == exptected_detections_left - + + def test_date_replacer_same_date_same_replacement(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu. 05.05.2005 to jej urodziny. 06.05.2005 to nie jej urodziny." detections = [ @@ -34,10 +35,10 @@ def test_date_replacer_same_date_same_replacement(): (53, 63, DateDetection()), (81, 91, DateDetection()), ] - + replacer = DateReplacer() - + result = replacer.replace(text, detections) - + assert result[0][29:39] == result[0][54:64] - assert result[1] == [] \ No newline at end of file + assert result[1] == [] diff --git a/tests/replacers/test_email_replacer.py b/tests/replacers/test_email_replacer.py index 664e04304cefb059829f372dfc24d7ae48e7052e..e86b8c9266dee17e6aa4b3f65361e9711fd6f8f3 100644 --- a/tests/replacers/test_email_replacer.py +++ b/tests/replacers/test_email_replacer.py @@ -1,43 +1,43 @@ - from src.replacers.email_replacer import EmailReplacer from src.detections import DateDetection, CityDetection, UserDetection, EmailDetection + def test_email_replacer(): text = "zz@z.pl urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 7, EmailDetection()), + (0, 7, EmailDetection()), (21, 31, DateDetection()), (35, 44, CityDetection()), ] - + replacer = EmailReplacer() result = replacer.replace(text, detections) - + expected_text_ending = " urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ (len(result[0]) - 23, len(result[0]) - 13, DateDetection()), (len(result[0]) - 9, len(result[0]), CityDetection()), ] - + assert result[0].endswith(expected_text_ending) - assert result[0][0:-len(expected_text_ending)] != "zz@z.pl" + assert result[0][0 : -len(expected_text_ending)] != "zz@z.pl" assert result[1] == exptected_detections_left - + + def test_email_replacer_same_email_same_replacement(): text = "zz@z.pl zz@z.pl aa@a.pl" detections = [ - (0, 7, EmailDetection()), + (0, 7, EmailDetection()), (8, 15, EmailDetection()), (16, 22, EmailDetection()), - ] - + replacer = EmailReplacer() result = replacer.replace(text, detections) - + old_emails = text.split() new_emails = result[0].split() - + assert old_emails[0] != new_emails[0] assert new_emails[0] == new_emails[1] - assert result[1] == [] \ No newline at end of file + assert result[1] == [] diff --git a/tests/replacers/test_ner_replacer.py b/tests/replacers/test_ner_replacer.py index b68608fdcd0b7af0541317ef1b9030e70548a05a..8b7fc6561296b78ae80b99d3333930d08b167d09 100644 --- a/tests/replacers/test_ner_replacer.py +++ b/tests/replacers/test_ner_replacer.py @@ -1,37 +1,39 @@ - from src.replacers.ner_replacer import NERReplacer from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection from src.dictionaries.morphosyntactic.ner_file import NERFileMorphosyntacticDictionary from tempfile import NamedTemporaryFile + def test_ner_replacer(): with NamedTemporaryFile() as file: - file.writelines([ - "name Andrzej Andrzej a\n".encode("utf-8"), - "name Kasi Kasia b\n".encode("utf-8"), - "surname Kowalowi Kowal a\n".encode("utf-8"), - "surname Kowal Kowal b\n".encode("utf-8"), - ]) + file.writelines( + [ + "name Andrzej Andrzej a\n".encode("utf-8"), + "name Kasi Kasia b\n".encode("utf-8"), + "surname Kowalowi Kowal a\n".encode("utf-8"), + "surname Kowal Kowal b\n".encode("utf-8"), + ] + ) file.flush() - + dictionary = NERFileMorphosyntacticDictionary(file.name, always_replace=False) - + text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameDetection(morpho_tag="a")), + (0, 3, NameDetection(morpho_tag="a")), (4, 14, SurnameDetection(morpho_tag="b")), (28, 38, DateDetection()), (42, 51, CityDetection(morpho_tag="c")), ] - + replacer = NERReplacer(dictionary) - + result = replacer.replace(text, detections) - + expected_text = "Andrzej Kowal urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ (27, 37, DateDetection()), (41, 50, CityDetection(morpho_tag="c")), ] - - assert result == (expected_text, exptected_detections_left) \ No newline at end of file + + assert result == (expected_text, exptected_detections_left) diff --git a/tests/replacers/test_tag_replacer.py b/tests/replacers/test_tag_replacer.py index 4c5ce485040c8d26baaf0966bc8d72cfb9b9698f..b003e58f82c98c00ca36e16cac90da30ccbed75b 100644 --- a/tests/replacers/test_tag_replacer.py +++ b/tests/replacers/test_tag_replacer.py @@ -1,21 +1,21 @@ - from src.replacers.tag_replacer import TagReplacer from src.detections import NameDetection, SurnameDetection, DateDetection, CityDetection + def test_replace_with_tags(): text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 3, NameDetection()), + (0, 3, NameDetection()), (4, 14, SurnameDetection()), (28, 38, DateDetection()), (42, 51, CityDetection()), ] - + replacer = TagReplacer() - + result = replacer.replace(text, detections) - + expected_text = "[OSOBA] [OSOBA] urodziÅ‚a sie [DATE] we [MIEJSCE]" exptected_detections_left = [] - - assert result == (expected_text, exptected_detections_left) \ No newline at end of file + + assert result == (expected_text, exptected_detections_left) diff --git a/tests/replacers/test_user_replacer.py b/tests/replacers/test_user_replacer.py index 608f766e44797d596e96a460d599fec726f34158..8afaa51e808491a954077e13af3610c93540d6ae 100644 --- a/tests/replacers/test_user_replacer.py +++ b/tests/replacers/test_user_replacer.py @@ -1,43 +1,43 @@ - from src.replacers.user_replacer import UserReplacer from src.detections import DateDetection, CityDetection, UserDetection + def test_user_replacer(): text = "@zzzz32 urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ - (0, 7, UserDetection()), + (0, 7, UserDetection()), (21, 31, DateDetection()), (35, 44, CityDetection()), ] - + replacer = UserReplacer() result = replacer.replace(text, detections) - + expected_text_ending = " urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" exptected_detections_left = [ (len(result[0]) - 23, len(result[0]) - 13, DateDetection()), (len(result[0]) - 9, len(result[0]), CityDetection()), ] - + assert result[0].endswith(expected_text_ending) - assert result[0][0:-len(expected_text_ending)] != "@zzzz32" + assert result[0][0 : -len(expected_text_ending)] != "@zzzz32" assert result[1] == exptected_detections_left - + + def test_user_replacer_same_user_same_replacement(): text = "@zzzz32 @zzzz32 @aaaaa" detections = [ - (0, 7, UserDetection()), + (0, 7, UserDetection()), (8, 15, UserDetection()), (16, 22, UserDetection()), - ] - + replacer = UserReplacer() result = replacer.replace(text, detections) - + old_users = text.split() new_users = result[0].split() - + assert old_users[0] != new_users[0] assert new_users[0] == new_users[1] - assert result[1] == [] \ No newline at end of file + assert result[1] == [] diff --git a/tests/suppressors/test_order_based.py b/tests/suppressors/test_order_based.py index 8cf35b9e92809dd12327c9f55fda9bdd9e16459b..f6d854d5f778a2d3d20141cbdd8606e37c7a7b5e 100644 --- a/tests/suppressors/test_order_based.py +++ b/tests/suppressors/test_order_based.py @@ -1,5 +1,6 @@ from src.suppressors.order_based import suppress_order_based + def test_supress_order_based(): annotations = [ (10, 16, "Marian"), @@ -13,4 +14,4 @@ def test_supress_order_based(): (30, 35, "Nowak"), (50, 59, "WrocÅ‚awiu"), ] - assert set(result) == set(expected) \ No newline at end of file + assert set(result) == set(expected) diff --git a/tests/test_annotation_mapping.py b/tests/test_annotation_mapping.py index 42b4bb2a143862cd8850601a280f66fc26b2df15..a43133df461e227012f1163a67a12fa9fd06eccc 100644 --- a/tests/test_annotation_mapping.py +++ b/tests/test_annotation_mapping.py @@ -1,5 +1,6 @@ from src.annotation_mapping import map_annotatios + def test_map_annotations(): ref_annotations = [(0, 3, "Andrzej"), (7, 11, "psa")] all_annotations = { diff --git a/tests/test_string_replacements.py b/tests/test_string_replacements.py index 6bea546a251d97e9739adfa0121388eb699b9778..384d0d6641b16e25fc4c9b5479207d5b15993792 100644 --- a/tests/test_string_replacements.py +++ b/tests/test_string_replacements.py @@ -1,34 +1,38 @@ from src.string_replacements import replace, replace_and_update + def test_replace(): text = "Ala ma kota" replacements = [(0, 3, "Andrzej"), (7, 11, "psa")] - + expected = "Andrzej ma psa" - + result = replace(text, replacements) - + assert result == expected - + + def test_replace_out_of_order(): text = "Ala ma kota" replacements = [(7, 11, "psa"), (0, 3, "Andrzej")] - + expected = "Andrzej ma psa" result = replace(text, replacements) - + assert result == expected - - + + def test_replace_and_update(): text = "Ala ma kota kropka" replacements = [(0, 3, "Andrzej"), (7, 11, "psa")] other_annotations = [(4, 6, "ma"), (12, 18, "kropka")] - + expected_text = "Andrzej ma psa kropka" expected_other_annotations = [(8, 10, "ma"), (15, 21, "kropka")] - - result_text, result_other_annotations = replace_and_update(text, replacements, other_annotations) - + + result_text, result_other_annotations = replace_and_update( + text, replacements, other_annotations + ) + assert result_text == expected_text - assert result_other_annotations == expected_other_annotations \ No newline at end of file + assert result_other_annotations == expected_other_annotations diff --git a/utility/NELex2_to_wiki.py b/utility/NELex2_to_wiki.py index 4dbf13db3b0bc8ac3d691f8348ae3c81e0de5871..704e900e549fb7b7e5cb35e266e1b030d8de239a 100644 --- a/utility/NELex2_to_wiki.py +++ b/utility/NELex2_to_wiki.py @@ -4,30 +4,30 @@ Requires morfeusz2 to be installed. """ import morfeusz2 + morf = morfeusz2.Morfeusz(expand_tags=True) _file_to_liner_dispatch = { - 'nam_liv_person': 'person_first_nam', - 'nam_liv_person_last': 'person_last_nam', - 'nam_fac_road': 'road_nam', - 'nam_loc_gpe_city': 'city_nam', - 'nam_org_group_team': 'country_nam' + "nam_liv_person": "person_first_nam", + "nam_liv_person_last": "person_last_nam", + "nam_fac_road": "road_nam", + "nam_loc_gpe_city": "city_nam", + "nam_org_group_team": "country_nam", } -_allowed_genders = ['f', 'm1', 'm2', 'm3', 'n'] +_allowed_genders = ["f", "m1", "m2", "m3", "n"] def _create_wiki(): - with open('wiki.txt', 'wt+', encoding='utf-8') as f: + with open("wiki.txt", "wt+", encoding="utf-8") as f: _add_gender(f) _last_names(f) def _add_gender( - output, - file_name='nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt' + output, file_name="nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt" ): - with open(file_name, 'r', encoding='utf-8') as f: + with open(file_name, "r", encoding="utf-8") as f: _form_dict = dict() for line in f: l_list = line.split() @@ -35,8 +35,8 @@ def _add_gender( if cat in _file_to_liner_dispatch: cat_name = cat length = int((len(l_list) - 2) / 2) - gen_name = ' '.join(l_list[(1 + length):(1 + 2 * length)]) - flx_name = ' '.join(l_list[1:(1 + length)]) + gen_name = " ".join(l_list[(1 + length) : (1 + 2 * length)]) + flx_name = " ".join(l_list[1 : (1 + length)]) flex = l_list[-1] if cat_name not in _form_dict: _form_dict[cat_name] = dict() @@ -46,56 +46,68 @@ def _add_gender( _form_dict[cat_name][length][gen_name] = dict() if flex not in _form_dict[cat_name][length][gen_name]: _form_dict[cat_name][length][gen_name][flex] = flx_name - name = gen_name.split(' ')[0] + name = gen_name.split(" ")[0] generate = morf.generate(name) - flex_split = generate[0][2].split(':') + flex_split = generate[0][2].split(":") if len(flex_split) > 3: gender = flex_split[3] - new_flex = flex + ':' + gender - output.write(cat + '\t' + flx_name + '\t' + - gen_name + '\t' + new_flex + '\n') + new_flex = flex + ":" + gender + output.write( + cat + + "\t" + + flx_name + + "\t" + + gen_name + + "\t" + + new_flex + + "\n" + ) def _last_names(output): dict_list = list() - with open('nelexicon2/extra/wikipedia-liner2.txt', - 'rt', - encoding='utf-8' - ) as f: + with open("nelexicon2/extra/wikipedia-liner2.txt", "rt", encoding="utf-8") as f: for line in f: line = line.strip() - line_l = line.split('\t') - if line_l[0] == 'nam_liv_person_last': + line_l = line.split("\t") + if line_l[0] == "nam_liv_person_last": line_l = line_l[1] - line_l.split(' ') + line_l.split(" ") line_len = len(line_l) if type(line_l) == list() and line_len > 1: dictionary = dict() for word in line_l: gen = morf.generate(word) for w in gen: - tag_list = w[2].split(':') + tag_list = w[2].split(":") if len(tag_list) > 3: - tag = tag_list[1] + ':' + tag_list[2] + tag = tag_list[1] + ":" + tag_list[2] if tag not in dictionary: dictionary[tag] = w[0] else: - dictionary[tag] += ' ' + w[0] + dictionary[tag] += " " + w[0] for key in dictionary: - if len(dictionary[key].split(' ')) == line_len: + if len(dictionary[key].split(" ")) == line_len: d = dictionary[key] dict_list.append(d) else: word = line_l[0] if type(line_l) == list() else line_l generate = morf.generate(word) for g in generate: - if len(g) > 4 and 'nazwisko' in g[3]: + if len(g) > 4 and "nazwisko" in g[3]: dict_list.append(g) for word in dict_list: d = word - line = 'nam_liv_person_last' + '\t' + d[0].split(':')[0] +\ - '\t' + d[1].split(':')[0] + '\t' + ':'.join(d[2].split(':')[1:]) - output.write(line + '\n') + line = ( + "nam_liv_person_last" + + "\t" + + d[0].split(":")[0] + + "\t" + + d[1].split(":")[0] + + "\t" + + ":".join(d[2].split(":")[1:]) + ) + output.write(line + "\n") _create_wiki()