"""Tests for the Polish date detector."""

from src.detections import DateDetection
from src.detectors.date.date import DateDetector

# Short alias: the expected formats below reference this enum on every line.
_Part = DateDetection.AnnotationPart


def test_detect_dates_pl():
    """Detect a numeric and a textual-month date in one Polish sentence.

    Each expected entry is ``(start, end, DateDetection)`` where ``start`` and
    ``end`` are character offsets into ``text`` (``end`` is exclusive:
    ``text[7:16] == "1.01.2022"``).
    """
    detector = DateDetector("pl")

    text = "W dniu 1.01.2022 sprzedałem kota. 5 kwietnia 2021 roku kupiłem psa."
    found_dates = detector.detect(text, {})

    # "1.01.2022" -- the day is expected zero-padded ("01") although the text
    # contains a single digit.
    numeric_date_format = [
        (_Part.TWO_DIGITS_DAY, "01"),  # Only supports two digits for now
        (_Part.OTHER, "."),
        (_Part.TWO_DIGIT_MONTH, "01"),
        (_Part.OTHER, "."),
        (_Part.FOUR_DIGIT_YEAR, "2022"),
    ]

    # "5 kwietnia 2021" -- the trailing " roku" is not part of the span.
    textual_date_format = [
        (_Part.TWO_DIGITS_DAY, "05"),  # Only supports two digits for now
        (_Part.OTHER, " "),
        (_Part.TEXT_MONTH, "kwietnia"),
        (_Part.OTHER, " "),
        (_Part.FOUR_DIGIT_YEAR, "2021"),
    ]

    assert found_dates == [
        (7, 16, DateDetection(numeric_date_format)),
        (34, 49, DateDetection(textual_date_format)),
    ]


def test_date_with_different_punctuations():
    """Detect a date whose separators differ ("." then ",")."""
    # There is discussion about whether we should even detect such cases as
    # dates... However, for now we do, and if we find cases where that is
    # problematic, this definitely could be changed.
    detector = DateDetector("pl")

    text = "1.01,2022"
    found_dates = detector.detect(text, {})

    mixed_separator_format = [
        (_Part.TWO_DIGITS_DAY, "01"),
        (_Part.OTHER, "."),
        (_Part.TWO_DIGIT_MONTH, "01"),
        (_Part.OTHER, ","),
        (_Part.FOUR_DIGIT_YEAR, "2022"),
    ]

    # The date is the entire 9-character input, so the span is (0, 9). The
    # previous expectation of (7, 16) pointed past the end of the text.
    assert found_dates == [
        (0, len(text), DateDetection(mixed_separator_format)),
    ]