"""Module for replacing dates with anonimized version.""" import random from typing import List, Tuple from src.detections import DateDetection, Detection from src.replacers.interface import ReplacerInterface from src.string_replacements import replace_and_update # TODO: Add support for other languages months_map = { 1: "stycznia", 2: "lutego", 3: "marca", 4: "kwietnia", 5: "maja", 6: "czerwca", 7: "lipca", 8: "sierpnia", 9: "września", 10: "października", 11: "listopada", 12: "grudnia", } class DateReplacer(ReplacerInterface): """Class for replacing dates with anonimized version.""" def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: """Replace detected dates in text with anonimized version. Eg.: I was born on 01.01.2020 -> I was born on 22.11.2069 Args: text (str): Text to be processed. detections (List[Tuple[int, int, str]]): List of detections. Returns: Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities replaced with anonimized version and list of detections that were not processed by this replacer. """ replacements = [] not_processed = [] already_replaced = dict() for item in detections: start, end, detection = item if isinstance(detection, DateDetection): replacement = [] if detection.format is not None: format = detection.format else: format = [ (DateDetection.AnnotationPart.TWO_DIGITS_DAY, "01"), (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.TWO_DIGIT_MONTH, "01"), (DateDetection.AnnotationPart.OTHER, "."), (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), ] if text[start:end] in already_replaced: replacement = already_replaced[text[start:end]] else: for entry in format: if entry[0] == DateDetection.AnnotationPart.TWO_DIGITS_DAY: random_day = random.randint(1, 28) replacement.append(str(random_day).zfill(2)) elif entry[0] == DateDetection.AnnotationPart.ONE_DIGIT_DAY: random_day = random.randint(1, 28) replacement.append(str(random_day)) elif entry[0] == DateDetection.AnnotationPart.TWO_DIGIT_MONTH: random_month = random.randint(1, 12) replacement.append(str(random_month).zfill(2)) elif entry[0] == DateDetection.AnnotationPart.ONE_DIGIT_MONTH: random_month = random.randint(1, 12) replacement.append(str(random_month)) elif entry[0] == DateDetection.AnnotationPart.FOUR_DIGIT_YEAR: random_year = random.randint(1900, 2020) replacement.append(str(random_year)) elif entry[0] == DateDetection.AnnotationPart.TWO_DIGIT_YEAR: random_year = random.randint(0, 99) replacement.append(str(random_year).zfill(2)) elif entry[0] == DateDetection.AnnotationPart.TEXT_MONTH: random_month = random.randint(1, 12) month_name = months_map[random_month] replacement.append(month_name) elif entry[0] == DateDetection.AnnotationPart.OTHER: replacement.append(entry[1]) else: raise ValueError(f"Unknown format entry: {entry}") replacement = "".join(replacement) already_replaced[text[start:end]] = replacement replacements.append((start, end, replacement)) else: not_processed.append(item) return replace_and_update(text, replacements, not_processed)