diff --git a/config/detectors/ner_kpwr_ext.yaml b/config/detectors/ner_kpwr_ext.yaml index 3ef710fd7ea4209bcc1ad26e80a521097706c46b..447120451124eb8c06ac49fa57c691cecbf45c0c 100644 --- a/config/detectors/ner_kpwr_ext.yaml +++ b/config/detectors/ner_kpwr_ext.yaml @@ -1,82 +1,7 @@ +defaults: + - /ner_mapping@ner.detection_mapping: kpwr_n82 + - _self_ + ner: _target_: src.detectors.ner.NerDetector - language: ${language} - detection_mapping: - "nam_liv_person_first": "name" - "nam_liv_person_last": "surname" - "nam_fac_road": "street_name" - "nam_loc_gpe_city": "city" - "nam_loc_gpe_country": "country" - "nam_loc_gpe_subdivision": "location" - "nam_loc_gpe_admin1": "location" - "nam_loc_gpe_district": "location" - "nam_loc_gpe_admin3": "location" - "nam_loc_gpe_admin2": "location" - "nam_loc_gpe_conurbation": "location" - "nam_loc_country_region": "location" - "nam_loc": "location" - "nam_fac_bridge": "location" - "nam_fac_goe": "location" - "nam_loc_land_mountain": "location" - "nam_loc_land_island": "location" - "nam_loc_land": "location" - "nam_loc_land_peak": "location" - "nam_loc_land_continent": "location" - "nam_loc_land_region": "location" - "nam_loc_historical_region": "location" - "nam_fac_park": "location" - "nam_fac_square": "street" - "nam_fac_goe_stop": "street" - "nam_adj_country": "country" - "nam_num_phone": "phone_number" - "nam_num_house": "number" - "nam_liv_person": "name" - "nam_adj_person": "surname" - "nam_adj_city": "city" - "nam_pro_title_document": "title" - "nam_pro_title_book": "title" - "nam_pro_title_article": "title" - "nam_pro_title": "title" - "nam_pro_title_song": "title" - "nam_pro_title_treaty": "title" - "nam_pro_title_album": "title" - "nam_pro_title_tv": "title" - "nam_loc_hydronym": "hydronym" - "nam_loc_hydronym_sea": "hydronym" - "nam_loc_hydronym_lake": "hydronym" - "nam_loc_hydronym_river": "hydronym" - "nam_loc_hydronym_ocean": "hydronym" - "nam_oth_www": "url" - "nam_pro_award": "proper_name" - "nam_pro_media_radio": "proper_name" - "nam_pro_media_tv": "proper_name" - "nam_pro_media_web": "proper_name" - "nam_pro_software_game": "proper_name" - "nam_pro_model_car": "proper_name" - "nam_pro_media": "proper_name" - "nam_pro_vehicle": "proper_name" - "nam_pro_brand": "proper_name" - "nam_pro_media_periodic": "proper_name" - "nam_pro_software": "proper_name" - "nam_pro": "proper_name" - "nam_eve_human_cultural": "proper_name" - "nam_eve_human_holiday": "proper_name" - "nam_eve_human": "proper_name" - "nam_eve": "proper_name" - "nam_eve_human_sport": "proper_name" - "nam_fac_system": "proper_name" - "nam_oth_tech": "proper_name" - "nam_oth_currency": "proper_name" - "nam_oth_position": "proper_name" - "nam_oth": "serial_number" - "nam_oth_data_format": "serial_number" - "nam_oth_license": "serial_number" - "nam_org_group_team": "organization_name" - "nam_org_company": "organization_name" - "nam_org_group": "organization_name" - "nam_org_political_party": "organization_name" - "nam_org_group_band": "organization_name" - "nam_org_nation": "organization_name" - "nam_org_organization_sub": "organization_name" - "nam_org_institution": "organization_name" - "nam_org_organization": "organization_name" + language: ${language} \ No newline at end of file diff --git a/config/detectors/ner_n5.yaml b/config/detectors/ner_n5.yaml index 27894c494cd3b3bdc96cda03ab029820c54377a7..d95883039560f390f5dd6a8abf1a7197d1d60264 100644 --- a/config/detectors/ner_n5.yaml +++ b/config/detectors/ner_n5.yaml @@ -1,9 +1,7 @@ +defaults: + - /ner_mapping@ner.detection_mapping: n5 + - _self_ + ner: _target_: src.detectors.ner.NerDetector - language: ${language} - detection_mapping: - "person_first_nam": "name" - "person_last_nam": "surname" - "road_nam": "street_name" - "city_nam": "city" - "country_nam": "country" \ No newline at end of file + language: ${language} \ No newline at end of file diff --git a/config/dictionary/pl_ner_morphosyntactic_dictionary.yaml b/config/dictionary/pl_ner_morphosyntactic_dictionary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39c98cbcd139b6833f1052157e0aba1d1555730e --- /dev/null +++ b/config/dictionary/pl_ner_morphosyntactic_dictionary.yaml @@ -0,0 +1,3 @@ +_target_: src.dictionaries.morphosyntactic.ner_file.NERFileMorphosyntacticDictionary +dictionary_path: ${paths.dictionaries_path}/pl_ext_dict.txt + \ No newline at end of file diff --git a/config/ner_mapping/kpwr_n82.yaml b/config/ner_mapping/kpwr_n82.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d4e8481bfc1ec7af27cd8ceb6698c027ea34513 --- /dev/null +++ b/config/ner_mapping/kpwr_n82.yaml @@ -0,0 +1,78 @@ +"nam_liv_person_first": "name" +"nam_liv_person_last": "surname" +"nam_fac_road": "street_name" +"nam_loc_gpe_city": "city" +"nam_loc_gpe_country": "country" +"nam_loc_gpe_subdivision": "location" +"nam_loc_gpe_admin1": "location" +"nam_loc_gpe_district": "location" +"nam_loc_gpe_admin3": "location" +"nam_loc_gpe_admin2": "location" +"nam_loc_gpe_conurbation": "location" +"nam_loc_country_region": "location" +"nam_loc": "location" +"nam_fac_bridge": "location" +"nam_fac_goe": "location" +"nam_loc_land_mountain": "location" +"nam_loc_land_island": "location" +"nam_loc_land": "location" +"nam_loc_land_peak": "location" +"nam_loc_land_continent": "location" +"nam_loc_land_region": "location" +"nam_loc_historical_region": "location" +"nam_fac_park": "location" +"nam_fac_square": "street" +"nam_fac_goe_stop": "street" +"nam_adj_country": "country" +"nam_num_phone": "phone_number" +"nam_num_house": "number" +"nam_liv_person": "name" +"nam_adj_person": "surname" +"nam_adj_city": "city" +"nam_pro_title_document": "title" +"nam_pro_title_book": "title" +"nam_pro_title_article": "title" +"nam_pro_title": "title" +"nam_pro_title_song": "title" +"nam_pro_title_treaty": "title" +"nam_pro_title_album": "title" +"nam_pro_title_tv": "title" +"nam_loc_hydronym": "hydronym" +"nam_loc_hydronym_sea": "hydronym" +"nam_loc_hydronym_lake": "hydronym" +"nam_loc_hydronym_river": "hydronym" +"nam_loc_hydronym_ocean": "hydronym" +"nam_oth_www": "url" +"nam_pro_award": "proper_name" +"nam_pro_media_radio": "proper_name" +"nam_pro_media_tv": "proper_name" +"nam_pro_media_web": "proper_name" +"nam_pro_software_game": "proper_name" +"nam_pro_model_car": "proper_name" +"nam_pro_media": "proper_name" +"nam_pro_vehicle": "proper_name" +"nam_pro_brand": "proper_name" +"nam_pro_media_periodic": "proper_name" +"nam_pro_software": "proper_name" +"nam_pro": "proper_name" +"nam_eve_human_cultural": "proper_name" +"nam_eve_human_holiday": "proper_name" +"nam_eve_human": "proper_name" +"nam_eve": "proper_name" +"nam_eve_human_sport": "proper_name" +"nam_fac_system": "proper_name" +"nam_oth_tech": "proper_name" +"nam_oth_currency": "proper_name" +"nam_oth_position": "proper_name" +"nam_oth": "serial_number" +"nam_oth_data_format": "serial_number" +"nam_oth_license": "serial_number" +"nam_org_group_team": "organization_name" +"nam_org_company": "organization_name" +"nam_org_group": "organization_name" +"nam_org_political_party": "organization_name" +"nam_org_group_band": "organization_name" +"nam_org_nation": "organization_name" +"nam_org_organization_sub": "organization_name" +"nam_org_institution": "organization_name" +"nam_org_organization": "organization_name" \ No newline at end of file diff --git a/config/ner_mapping/n5.yaml b/config/ner_mapping/n5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..738494c200d744a6a06b5acd017a6dfa75c53b33 --- /dev/null +++ b/config/ner_mapping/n5.yaml @@ -0,0 +1,5 @@ +"person_first_nam": "name" +"person_last_nam": "surname" +"road_nam": "street_name" +"city_nam": "city" +"country_nam": "country" \ No newline at end of file diff --git a/config/replacers/ner.yaml b/config/replacers/ner.yaml index 1bfe6ffc55e54cc1f838395ee2834b31f49baa39..fbb55b34fd10a97fd459c21b2a69343be614af41 100644 --- a/config/replacers/ner.yaml +++ b/config/replacers/ner.yaml @@ -1,6 +1,5 @@ +defaults: + - /dictionary@ner.dictionary: pl_ner_morphosyntactic_dictionary + ner: _target_: src.replacers.ner_replacer.NERReplacer - dictionary: - _target_: src.dictionaries.morphosyntactic.ner_file.NERFileMorphosyntacticDictionary - dictionary_path: ${paths.dictionaries_path}/pl_ext_dict.txt - \ No newline at end of file diff --git a/print_config.py b/print_config.py new file mode 100644 index 0000000000000000000000000000000000000000..64222f0596a45ea45aef034594c4273c7c9d2114 --- /dev/null +++ b/print_config.py @@ -0,0 +1,13 @@ +import hydra +from omegaconf import OmegaConf +import json + +@hydra.main(config_path="config", config_name="config") +def main(cfg): + cfg_resolved = OmegaConf.to_container(cfg, resolve=True) + cfg_resolved_json = json.dumps(cfg_resolved, indent=4) + + print(cfg_resolved_json) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index 3c7434ae84a84ccf11a606b35836eb4c7d822276..f315d58ee4bb04057de4eba00ce908c39c7651e9 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -1,6 +1,7 @@ import regex as re from typing import List, Tuple from src.detections import DateDetection +from src.suppressors.order_based import OrderBasedSuppressor from src.detectors.date.utils import parse_date_to_format @@ -48,6 +49,7 @@ def detect_dates_pl(text: str) -> List[Tuple[int, int, DateDetection]]: for match in PL_YEAR_REGEX.finditer(text): format = parse_date_to_format(match.groupdict()) - dates.append((match.start(), match.end(), DateDetection(format))) - return dates + dates.append((match.start("year"), match.end("year"), DateDetection(format))) + + return OrderBasedSuppressor().suppress(dates) diff --git a/src/detectors/date/utils.py b/src/detectors/date/utils.py index eb78c11f4aaecda7f15a7dda5d49ecf4ede87e43..64a9eb5f60805f31eaa4c521969ea720e071f818 100644 --- a/src/detectors/date/utils.py +++ b/src/detectors/date/utils.py @@ -166,8 +166,6 @@ def _parse_year_only(re_entry) -> List[Tuple[DateDetection.AnnotationPart, str]] else: result.append((DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, re_entry["year"])) - result.append((DateDetection.AnnotationPart.OTHER, re_entry["addon"])) - return result def parse_date_to_format( diff --git a/tests/integration/ccl_configuration/test_ccl_configuration.py b/tests/integration/ccl_configuration/test_ccl_configuration.py index 12f8a09d8a21aa5c7016150fd470563547642971..357bb32f3fe73db6b059c1a23076d24690c1e79c 100644 --- a/tests/integration/ccl_configuration/test_ccl_configuration.py +++ b/tests/integration/ccl_configuration/test_ccl_configuration.py @@ -14,4 +14,4 @@ def test_ccl_configuration(): "./tests/integration/ccl_configuration/marek_kowalski_pojechal_do_wroclawia.ccl" ) - assert result == "[OSOBA] [OSOBA] (numer telefonu [DIGITS]) miesza we [MIEJSCE]" + assert result == "[OSOBA] [OSOBA] (numer telefonu [CYFRY]) miesza we [MIEJSCE]" diff --git a/tests/unit/detectors/date/test_pl.py b/tests/unit/detectors/date/test_pl.py index 7ed1373801447d0d4549420889db3dc1f424a59d..be0392fe04392aade3aebd4db1ce969bf1e20640 100644 --- a/tests/unit/detectors/date/test_pl.py +++ b/tests/unit/detectors/date/test_pl.py @@ -79,4 +79,18 @@ def test_28_czerwca_1847(): assert found_dates == [ (0, 15, DateDetection(format_date)), + ] + +def test_year_only(): + detector = DateDetector("pl") + + text = "W 2020 roku kupiłem psa." + found_dates = detector.detect(text, dict()) + + format_date = [ + (DateDetection.AnnotationPart.FOUR_DIGIT_YEAR, "2020"), + ] + + assert found_dates == [ + (2, 6, DateDetection(format_date)), ] \ No newline at end of file diff --git a/tests/unit/replacers/test_tag_replacer.py b/tests/unit/replacers/test_tag_replacer.py index b003e58f82c98c00ca36e16cac90da30ccbed75b..a13439479861c9a3a903b897f31a8df84a7943f3 100644 --- a/tests/unit/replacers/test_tag_replacer.py +++ b/tests/unit/replacers/test_tag_replacer.py @@ -15,7 +15,7 @@ def test_replace_with_tags(): result = replacer.replace(text, detections) - expected_text = "[OSOBA] [OSOBA] urodziła sie [DATE] we [MIEJSCE]" + expected_text = "[OSOBA] [OSOBA] urodziła sie [DATA] we [MIEJSCE]" exptected_detections_left = [] assert result == (expected_text, exptected_detections_left)