From 8bc5de430e139b8c453bf0450c60bc632f9a678e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Mon, 19 Dec 2022 12:18:27 +0100 Subject: [PATCH] [WIP] more refactoring & unit testing --- .../marek_kowalski_pojechal_do_wroclawia.ccl | 30 ++++---- requirements.txt | 3 +- src/anonymizers/english_anonymizer.py | 2 +- src/anonymizers/polish_anonymizer.py | 2 +- src/anonymizers/russian_anonymizer.py | 2 +- src/ccl_parser.py | 70 +++++++++++++++++++ src/detectors/date/date.py | 2 +- src/detectors/date/en.py | 5 +- src/detectors/date/pl.py | 5 +- src/detectors/date/ru.py | 5 +- src/detectors/email/email.py | 5 +- src/detectors/ner/__init__.py | 1 + src/detectors/ner/ner.py | 8 +++ src/detectors/ner/pl_liner_n5.py | 33 +++++++++ src/detectors/phone/phone.py | 5 +- src/detectors/url/url.py | 5 +- src/detectors/user/user.py | 5 +- src/dictionaries/__init__.py | 0 src/dictionaries/pl_ner_replacements.py | 46 ++++++++++++ src/entity_types.py | 14 ++++ src/string_replacements.py | 27 +++++++ src/suppressors/__init__.py | 1 + src/suppressors/order_based.py | 27 +++++++ src/tag_anonimization.py | 40 +++++++++++ src/utils.py | 14 ---- src/utils/__init__.py | 1 + src/utils/ner_pl_n5_mapping.py | 9 +++ src/utils/utils.py | 33 +++++++++ tests/detectors/date/test_en.py | 13 ++-- tests/detectors/date/test_pl.py | 6 +- tests/detectors/date/test_ru.py | 6 +- tests/detectors/email/test_email.py | 3 +- tests/detectors/ner/__init__.py | 0 tests/detectors/ner/test_pl_liner_n5.py | 21 ++++++ tests/detectors/phone/test_phone.py | 3 +- tests/detectors/url/test_url.py | 7 +- tests/detectors/user/test_user.py | 3 +- tests/dictionaries/__init__.py | 0 .../dictionaries/test_pl_ner_replacements.py | 38 ++++++++++ tests/suppressors/test_order_based.py | 16 +++++ tests/test_ccl_parser.py | 60 ++++++++++++++++ tests/test_string_replacements.py | 20 ++++++ tests/test_tag_anonimization.py | 17 +++++ 43 files changed, 552 insertions(+), 61 deletions(-) create mode 100644 src/ccl_parser.py create mode 100644 src/detectors/ner/__init__.py create mode 100644 src/detectors/ner/ner.py create mode 100644 src/detectors/ner/pl_liner_n5.py create mode 100644 src/dictionaries/__init__.py create mode 100644 src/dictionaries/pl_ner_replacements.py create mode 100644 src/entity_types.py create mode 100644 src/string_replacements.py create mode 100644 src/suppressors/__init__.py create mode 100644 src/suppressors/order_based.py create mode 100644 src/tag_anonimization.py delete mode 100644 src/utils.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/ner_pl_n5_mapping.py create mode 100644 src/utils/utils.py create mode 100644 tests/detectors/ner/__init__.py create mode 100644 tests/detectors/ner/test_pl_liner_n5.py create mode 100644 tests/dictionaries/__init__.py create mode 100644 tests/dictionaries/test_pl_ner_replacements.py create mode 100644 tests/suppressors/test_order_based.py create mode 100644 tests/test_ccl_parser.py create mode 100644 tests/test_string_replacements.py create mode 100644 tests/test_tag_anonimization.py diff --git a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl index b19c400..f1459ba 100644 --- a/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl +++ b/example_inputs/marek_kowalski_pojechal_do_wroclawia.ccl @@ -7,39 +7,45 @@ <orth>Marek</orth> <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> -
<ann chan="nam_liv" head="1">1</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam" head="1">1</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>Kowalski</orth> <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex> - <ann chan="nam_liv">1</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam" head="1">1</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>pojechał</orth> <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>do</orth> <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> <tok> <orth>Wrocławia</orth> <lex disamb="1"><base>Wrocław</base><ctag>subst:sg:gen:m3</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc" head="1">1</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam" head="1">1</ann> </tok> <ns/> <tok> <orth>.</orth> <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> - <ann chan="nam_liv">0</ann> - <ann chan="nam_loc">0</ann> + <ann chan="person_first_nam">0</ann> + <ann chan="person_last_nam">0</ann> + <ann chan="city_nam">0</ann> </tok> </sentence> </chunk> diff --git a/requirements.txt b/requirements.txt index f7260eb..3923df9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ --index-url https://pypi.clarin-pl.eu/simple/ nlp-ws regex==2020.10.28 -Babel==2.8.0 \ No newline at end of file +Babel==2.8.0 +bitarray==2.6.1 \ No newline at end of file diff --git a/src/anonymizers/english_anonymizer.py b/src/anonymizers/english_anonymizer.py index 61f29b1..0961c24 100644 --- a/src/anonymizers/english_anonymizer.py +++ b/src/anonymizers/english_anonymizer.py @@ -5,7 +5,7 @@ import random import regex -from src.utils import consume +from src.utils.utils import consume from src.ccl_handler import CCLHandler from src.base_anonymizer import BaseAnonymizer from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, diff --git a/src/anonymizers/polish_anonymizer.py b/src/anonymizers/polish_anonymizer.py index 60f9c50..f725254 100644 --- a/src/anonymizers/polish_anonymizer.py +++ b/src/anonymizers/polish_anonymizer.py @@ -4,7 +4,7 @@ import regex import random -from src.utils import consume +from src.utils.utils import consume from src.base_anonymizer import BaseAnonymizer from src.ccl_handler import CCLHandler from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, diff --git a/src/anonymizers/russian_anonymizer.py b/src/anonymizers/russian_anonymizer.py index 32c0a91..d9e6c07 100644 --- a/src/anonymizers/russian_anonymizer.py +++ b/src/anonymizers/russian_anonymizer.py @@ -5,7 +5,7 @@ import random import regex -from src.utils import consume +from src.utils.utils import consume from src.ccl_handler import CCLHandler from src.base_anonymizer import BaseAnonymizer from src.generators import (generate_pseudo_email, generate_pseudo_phone_number, diff --git a/src/ccl_parser.py b/src/ccl_parser.py new file mode 100644 index 0000000..41e6971 --- /dev/null +++ b/src/ccl_parser.py @@ -0,0 +1,70 @@ +from typing import Dict, Any, List, Tuple +from
lxml import etree +from collections import defaultdict + +def parse_ccl(ccl: str) -> Tuple[str, Dict[str, List[Tuple[int, int, str]]]]: + """ + Parses the CCL XML format and returns the original text and annotations. + + Annotations are returned as a dictionary with keys being annotation channels + and values being lists of tuples (start, end, word) where: + * start is the index of the first character of the word + * end is the index one past the last character of the word (exclusive) + * word is a word or a group of words (in case of multiword tokens) + + :param ccl: CCL XML + :return: (text, annotations) + """ + ccl_tree = etree.fromstring(ccl.strip().encode('utf-8')) + + results = defaultdict(list) + text = "" + + # First token is assumed to not have space before it + last_was_ns = True + + tokens = ccl_tree.xpath("//ns | //tok") + for token in tokens: + if token.tag == 'tok': + if not last_was_ns: + text += " " + + word = token.xpath('./orth')[0].text + start = len(text) + end = start + len(word) + + for lex in token.xpath('./lex'): + if lex.attrib['disamb'] == "1": + ctag = lex.xpath('./ctag')[0] + results["ctag"].append((start, end, ctag.text)) + + break + + for ann in token.xpath('./ann'): + is_present = int(ann.text) == 1 + if not is_present: + continue + + channel = ann.attrib['chan'] + is_head = "head" in ann.attrib and ann.attrib['head'] == "1" + + if is_head: + results[channel].append((start, end, word)) + else: + if last_was_ns: + new_word = results[channel][-1][2] + word + else: + new_word = results[channel][-1][2] + " " + word + + old_start = results[channel][-1][0] + + results[channel][-1] = (old_start, end, new_word) + + last_was_ns = False + text += word + elif token.tag == 'ns': + last_was_ns = True + + return text, results + + \ No newline at end of file diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py index 85e34b4..2f1f132 100644 --- a/src/detectors/date/date.py +++ b/src/detectors/date/date.py @@ -10,7 +10,7 @@ def find_dates(text: str, language: str = "en") -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py index 594e663..a716bc1 100644 --- a/src/detectors/date/en.py +++ b/src/detectors/date/en.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes EN_DATES_REGEX = re.compile( r'\b(?P<day_or_month_year>' @@ -26,11 +27,11 @@ def detect_dates_en(text: str) -> List[Tuple[int, int, str]]: Detects English dates in the text.
:param text: the text to be searched :type text: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = EN_DATES_REGEX.finditer(text) dates = [] for match in matches: - dates.append((match.start(), match.end(), match.group())) + dates.append((match.start(), match.end(), EntityTypes.DATE)) return dates \ No newline at end of file diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index 7001b9f..02abfdd 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes PL_DATES_REGEX = re.compile( r'\b(?P<day_or_month_year>' @@ -29,11 +30,11 @@ def detect_dates_pl(text: str) -> List[Tuple[int, int, str]]: Detects Polish dates in the text. :param text: the text to be searched :type text: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = PL_DATES_REGEX.finditer(text) dates = [] for match in matches: - dates.append((match.start(), match.end(), match.group())) + dates.append((match.start(), match.end(), EntityTypes.DATE)) return dates \ No newline at end of file diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py index 91017c8..4100717 100644 --- a/src/detectors/date/ru.py +++ b/src/detectors/date/ru.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes RU_DATES_REGEX = re.compile( r'\b(?P<day_or_month_year>' @@ -29,11 +30,11 @@ def detect_dates_ru(text: str) -> List[Tuple[int, int, str]]: Detects Russian dates in the text.
:param text: the text to be searched :type text: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = RU_DATES_REGEX.finditer(text) dates = [] for match in matches: - dates.append((match.start(), match.end(), match.group())) + dates.append((match.start(), match.end(), EntityTypes.DATE)) return dates \ No newline at end of file diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py index a0637ec..82e1756 100644 --- a/src/detectors/email/email.py +++ b/src/detectors/email/email.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes EMAIL_REGEX = re.compile( r'(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+' @@ -16,11 +17,11 @@ def detect_emails(text: str, language: str) -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_email) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = EMAIL_REGEX.finditer(text) emails = [] for match in matches: - emails.append((match.start(), match.end(), match.group())) + emails.append((match.start(), match.end(), EntityTypes.EMAIL)) return emails \ No newline at end of file diff --git a/src/detectors/ner/__init__.py b/src/detectors/ner/__init__.py new file mode 100644 index 0000000..9f8aefd --- /dev/null +++ b/src/detectors/ner/__init__.py @@ -0,0 +1 @@ +from src.detectors.ner.ner import detect_ner \ No newline at end of file diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py new file mode 100644 index 0000000..18c5622 --- /dev/null +++ b/src/detectors/ner/ner.py @@ -0,0 +1,8 @@ +from typing import List, Tuple +from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 + +def detect_ner(ccl_annotations, language) -> List[Tuple[int, int, str]]: + if language == 'pl': + return detect_ner_pl_liner_n5(ccl_annotations) + else: + raise NotImplementedError \ No newline at end of file diff --git a/src/detectors/ner/pl_liner_n5.py b/src/detectors/ner/pl_liner_n5.py new file mode 100644 index 0000000..c494d13 --- /dev/null +++ b/src/detectors/ner/pl_liner_n5.py @@ -0,0 +1,33 @@ +from typing import List, Tuple, Dict +from src.utils.utils import subdict +from src.entity_types import EntityTypes +from src.utils.ner_pl_n5_mapping import NER_PL_N5_MAPPING + +def detect_ner_pl_liner_n5( + ccl_annotations: Dict[str, List[Tuple[int, int, str]]] +) -> List[Tuple[int, int, str]]: + """ + Detects NER entities in the text based on the liner_n5 NER ontology.
+ + :param ccl_annotations: a dictionary of NER annotations + :type ccl_annotations: Dict[str, List[Tuple[int, int, str]]] + :return: a list of tuples containing (start, end, entity_type) + :rtype: List[Tuple[int, int, str]] + """ + names = subdict( + ccl_annotations, + [ + "nam_liv_person", + "nam_liv_person_last", + "nam_fac_road", + "nam_loc_gpe_city", + "nam_org_group_team", + ], + all_must_be_present=False, + ) + + return [ + (start, end, NER_PL_N5_MAPPING.get(entity_type, EntityTypes.OTHER)) + for entity_type, entity in names.items() + for start, end, _ in entity + ] diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py index 49abeb5..8ab3d65 100644 --- a/src/detectors/phone/phone.py +++ b/src/detectors/phone/phone.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes PHONE_NUMBER_REGEX = re.compile( r'(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?' @@ -14,11 +15,11 @@ def detect_phone_numbers(text: str, language: str) -> List[Tuple[int, int, str]] :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_date) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = PHONE_NUMBER_REGEX.finditer(text) phone_numbers = [] for match in matches: - phone_numbers.append((match.start(), match.end(), match.group())) + phone_numbers.append((match.start(), match.end(), EntityTypes.PHONE_NUMBER)) return phone_numbers \ No newline at end of file diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py index 2ca1fec..70b8ba8 100644 --- a/src/detectors/url/url.py +++ b/src/detectors/url/url.py @@ -2,6 +2,7 @@ import regex as re from typing import List, Tuple from .pl import URL_REGEX_PL from .common import generate_url_regex +from src.entity_types import EntityTypes def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: """ @@ -10,7 +11,7 @@ def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_url) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ if language == "pl": @@ -21,6 +22,6 @@ def detect_urls(text: str, language: str) -> List[Tuple[int, int, str]]: matches = url_regex.finditer(text) urls = [] for match in matches: - urls.append((match.start(), match.end(), match.group())) + urls.append((match.start(), match.end(), EntityTypes.URL)) return urls \ No newline at end of file diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py index 4d8f035..d588a25 100644 --- a/src/detectors/user/user.py +++ b/src/detectors/user/user.py @@ -1,5 +1,6 @@ import regex as re from typing import List, Tuple +from src.entity_types import EntityTypes USER_REGEX = re.compile(r'\B(?P<username>\@[\w\-]+)') @@ -10,11 +11,11 @@ def detect_users(text: str, language: str) -> List[Tuple[int, int, str]]: :type text: str :param language: the language of the text :type language: str - :return: a list of tuples containing (start, end, detected_user) + :return: a list of tuples containing (start, end, entity_type) :rtype: List[Tuple[int, int, str]] """ matches = USER_REGEX.finditer(text) users = [] for match in matches: - users.append((match.start(), match.end(), match.group())) + users.append((match.start(), match.end(), EntityTypes.USER)) return users \ No
newline at end of file diff --git a/src/dictionaries/__init__.py b/src/dictionaries/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/dictionaries/pl_ner_replacements.py b/src/dictionaries/pl_ner_replacements.py new file mode 100644 index 0000000..77e7e87 --- /dev/null +++ b/src/dictionaries/pl_ner_replacements.py @@ -0,0 +1,46 @@ +from typing import Dict, List, Optional +from collections import defaultdict +from src.entity_types import EntityTypes + +def load_pl_ner_replacements_dictionary(path: str, ner_mapping: Optional[Dict[str, str]] = None) -> Dict[str, Dict[str, Dict[str, str]]]: + """ + Loads a dictionary that maps named entity tags to lemmas to part-of-speech tags to words. + + The dictionary is a nested defaultdict, so if a key is not found, an empty defaultdict is returned. + + The dictionary is stored in a tab-separated file, where each line has the following format: + + <ner_tag> <word> <lemma> <pos_tag> + + Example: + + OSOBA Andrzejowi Andrzej subst:sg:dat:m1 + OSOBA Andrzej Andrzej subst:sg:m1:imperf + OSOBA Kasia Kasia subst:sg:f:imperf + MIEJSCE Wrocław Wrocław subst:sg:m2:imperf + MIEJSCE Warszawa Warszawa subst:sg:f:imperf + MIEJSCE Kraków Kraków subst:sg:m2:imperf + + Parameters + ---------- + path : str + Path to the dictionary file. + + Returns + ------- + Dict[str, Dict[str, Dict[str, str]]] + Nested defaultdict that maps named entity tags to lemmas to part-of-speech tags to words. + """ + + replacement_dictionary = defaultdict(lambda: defaultdict(dict)) + with open(path, "r", encoding="utf-8") as file: + for line in file: + line = line.strip() + ner_tag, word, lemma, pos_tag = line.split("\t") + + if ner_mapping is not None: + ner_tag = ner_mapping.get(ner_tag, EntityTypes.OTHER) + + replacement_dictionary[ner_tag][lemma][pos_tag] = word + + return replacement_dictionary \ No newline at end of file diff --git a/src/entity_types.py b/src/entity_types.py new file mode 100644 index 0000000..ed0496b --- /dev/null +++ b/src/entity_types.py @@ -0,0 +1,14 @@ +class EntityTypes: + NAME = "name" + SURNAME = "surname" + STREET_NAME = "street_name" + CITY = "city" + COUNTRY = "country" + PHONE_NUMBER = "phone_number" + URL = "url" + USER = "user" + EMAIL = "email" + DATE = "date" + TIN = "tin" # Tax Identification Number + KRS = "krs" # National Court Register + OTHER = "other" \ No newline at end of file diff --git a/src/string_replacements.py b/src/string_replacements.py new file mode 100644 index 0000000..33c426d --- /dev/null +++ b/src/string_replacements.py @@ -0,0 +1,27 @@ +from typing import List, Tuple + +def replace(original_string: str, replacements: List[Tuple[int, int, str]]): + """ + Replaces substrings in a string. + + Parameters + ---------- + original_string : str + The original string. + replacements : List[Tuple[int, int, str]] + A list of tuples containing (start, end, replacement). + + Returns + ------- + str + The string with replacements applied.
+ """ + + replacements = sorted(replacements, key=lambda x: x[0]) + + delta = 0 + for replacement in replacements: + original_string = original_string[:replacement[0] + delta] + replacement[2] + original_string[replacement[1] + delta:] + delta += len(replacement[2]) - (replacement[1] - replacement[0]) + + return original_string \ No newline at end of file diff --git a/src/suppressors/__init__.py b/src/suppressors/__init__.py new file mode 100644 index 0000000..e9cc16f --- /dev/null +++ b/src/suppressors/__init__.py @@ -0,0 +1 @@ +from src.suppressors.order_based import suppress_order_based \ No newline at end of file diff --git a/src/suppressors/order_based.py b/src/suppressors/order_based.py new file mode 100644 index 0000000..8488465 --- /dev/null +++ b/src/suppressors/order_based.py @@ -0,0 +1,27 @@ +from typing import List, Tuple, Dict +from bitarray import bitarray + +def suppress_order_based(annotations: List[Tuple[int, int, str]]) -> List[Tuple[int, int, str]]: + """If two annotations overlap, the first one int the list is kept. + + Args: + annotations (List[Tuple[int, int, str]]): List of annotations. + + Returns: + List[Tuple[int, int, str]]: List of annotations with overlapping + annotations removed. + + """ + annotations = annotations + bitarray_size = max([end for _, end, _ in annotations]) + bitarray_ = bitarray(bitarray_size) + bitarray_.setall(False) + + result = [] + + for start, end, entity_type in annotations: + if not bitarray_[start:end].any(): + bitarray_[start:end] = True + result.append((start, end, entity_type)) + + return result diff --git a/src/tag_anonimization.py b/src/tag_anonimization.py new file mode 100644 index 0000000..89e1a10 --- /dev/null +++ b/src/tag_anonimization.py @@ -0,0 +1,40 @@ +from typing import List, Tuple +from collections import defaultdict +from src.entity_types import EntityTypes +from src.string_replacements import replace + +def replace_with_tags(text: str, detections: List[Tuple[int, int, str]]) -> str: + """Replace entities with tags. + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + str: Text with entities replaced with tags. + + """ + + tags_map = { + EntityTypes.NAME: "[OSOBA]", + EntityTypes.SURNAME: "[OSOBA]", + EntityTypes.STREET_NAME: "[MIEJSCE]", + EntityTypes.CITY: "[MIEJSCE]", + EntityTypes.COUNTRY: "[MIEJSCE]", + EntityTypes.PHONE_NUMBER: "[DIGITS]", + EntityTypes.URL: "[WWW]", + EntityTypes.USER: "@[USER]", + EntityTypes.EMAIL: "[MAIL]", + EntityTypes.DATE: "[DATE]", + EntityTypes.TIN: "[DIGITS]", + EntityTypes.KRS: "[DIGITS]", + } + + result = [ + (start, end, tags_map.get(entity_type, "[OTHER]")) + for start, end, entity_type in detections + ] + + return replace(text, result) + + \ No newline at end of file diff --git a/src/utils.py b/src/utils.py deleted file mode 100644 index 81cc67f..0000000 --- a/src/utils.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Module for useful functions.""" - -import itertools - - -def consume(iterative, n): - """Consume n elements from iterative object. - - Args: - iterative (iter): Python iterative object. - n (int): Number of elements to consume. 
- - """ - next(itertools.islice(iterative, n - 1, n), None) diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..a8b0bd1 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1 @@ +from src.utils.utils import * \ No newline at end of file diff --git a/src/utils/ner_pl_n5_mapping.py b/src/utils/ner_pl_n5_mapping.py new file mode 100644 index 0000000..0b857b5 --- /dev/null +++ b/src/utils/ner_pl_n5_mapping.py @@ -0,0 +1,9 @@ +from src.entity_types import EntityTypes + +NER_PL_N5_MAPPING = { + "nam_liv_person": EntityTypes.NAME, + "nam_liv_person_last": EntityTypes.SURNAME, + "nam_fac_road": EntityTypes.STREET_NAME, + "nam_loc_gpe_city": EntityTypes.CITY, + "nam_org_group_team": EntityTypes.COUNTRY, +} \ No newline at end of file diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..c0035e6 --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,33 @@ +"""Module for useful functions.""" + +import itertools + + +def consume(iterative, n): + """Consume n elements from iterative object. + + Args: + iterative (iter): Python iterative object. + n (int): Number of elements to consume. + + """ + next(itertools.islice(iterative, n - 1, n), None) + + +def subdict(dictionary, keys, all_must_be_present=True): + """Return a subdictionary of dictionary containing only keys. + + Args: + dictionary (dict): Dictionary to take a subdictionary from. + keys (list): List of keys to take from dictionary. + all_must_be_present (bool): If True, all keys must be present in + dictionary. If False, only keys that are present are returned. + + Returns: + dict: Subdictionary of dictionary containing only keys. + + """ + if all_must_be_present: + return {key: dictionary[key] for key in keys} + else: + return {key: dictionary[key] for key in keys if key in dictionary} \ No newline at end of file diff --git a/tests/detectors/date/test_en.py b/tests/detectors/date/test_en.py index 429ee2a..8104a83 100644 --- a/tests/detectors/date/test_en.py +++ b/tests/detectors/date/test_en.py @@ -1,16 +1,17 @@ from src.detectors.date.en import detect_dates_en +from src.entity_types import EntityTypes + def test_detect_dates_en(): # Check en-us text = "On 1.01.2022, I sold my cat. On April 5, 2021, I bought a dog." found_dates = detect_dates_en(text) - - assert found_dates == [(3,12,"1.01.2022"), (32,45, "April 5, 2021")] - + + assert found_dates == [(3, 12, EntityTypes.DATE), (32, 45, EntityTypes.DATE)] + # Check en-gb # TODO: Following test fails. Fix it. # text = "On 1.01.2022 I sold the cat. On 5th April 2021 I bought a dog." # found_dates = detect_dates_en(text) - - # assert found_dates == [(3,12,"1.01.2022"), (32,46, "5th April 2021")] - \ No newline at end of file + + # assert found_dates == [(3,12, EntityTypes.DATE), (32,46, EntityTypes.DATE)] diff --git a/tests/detectors/date/test_pl.py b/tests/detectors/date/test_pl.py index a441c36..2942163 100644 --- a/tests/detectors/date/test_pl.py +++ b/tests/detectors/date/test_pl.py @@ -1,7 +1,9 @@ from src.detectors.date.pl import detect_dates_pl +from src.entity_types import EntityTypes + def test_detect_dates_pl(): text = "W dniu 1.01.2022 sprzedaÅ‚em kota. 5 kwietnia 2021 roku kupiÅ‚em psa." 
found_dates = detect_dates_pl(text) - - assert found_dates == [(7,16,"1.01.2022"), (34,49, "5 kwietnia 2021")] \ No newline at end of file + + assert found_dates == [(7, 16, EntityTypes.DATE), (34, 49, EntityTypes.DATE)] diff --git a/tests/detectors/date/test_ru.py b/tests/detectors/date/test_ru.py index 44e9805..5b90d29 100644 --- a/tests/detectors/date/test_ru.py +++ b/tests/detectors/date/test_ru.py @@ -1,7 +1,9 @@ from src.detectors.date.ru import detect_dates_ru +from src.entity_types import EntityTypes + def test_detect_dates_pl(): text = "1.01.2022 я продал кошку. 5 апреля 2021 я купил собаку." found_dates = detect_dates_ru(text) - - assert found_dates == [(0,9,"1.01.2022"), (26,39, "5 апреля 2021")] \ No newline at end of file + + assert found_dates == [(0, 9, EntityTypes.DATE), (26, 39, EntityTypes.DATE)] diff --git a/tests/detectors/email/test_email.py b/tests/detectors/email/test_email.py index 05b3e63..6be224f 100644 --- a/tests/detectors/email/test_email.py +++ b/tests/detectors/email/test_email.py @@ -1,7 +1,8 @@ from src.detectors.email import detect_emails +from src.entity_types import EntityTypes def test_detect_emails(): text = "My email is arkadiusz@borek.pw. My friend's email is arkadiusz.dump@pwr.edu.pl" found_emails = detect_emails(text, "en") - assert found_emails == [(12, 30, "arkadiusz@borek.pw"), (53, 78, "arkadiusz.dump@pwr.edu.pl")] \ No newline at end of file + assert found_emails == [(12, 30, EntityTypes.EMAIL), (53, 78, EntityTypes.EMAIL)] \ No newline at end of file diff --git a/tests/detectors/ner/__init__.py b/tests/detectors/ner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detectors/ner/test_pl_liner_n5.py b/tests/detectors/ner/test_pl_liner_n5.py new file mode 100644 index 0000000..ab14e41 --- /dev/null +++ b/tests/detectors/ner/test_pl_liner_n5.py @@ -0,0 +1,21 @@ +from src.detectors.ner.pl_liner_n5 import detect_ner_pl_liner_n5 +from src.entity_types import EntityTypes + +def test_detect_names_pl_liner_n5(): + ccl_annotations = { + 'nam_liv_person': [(10, 16, 'Marian'), (100, 109, 'Magdalena')], + 'nam_liv_person_last': [(30, 35, 'Nowak')], + 'nam_loc_gpe_city': [(50, 59, 'Wrocławiu')], + 'some_other_annotation': [(120, 124, 'zowd')], + } + + result = detect_ner_pl_liner_n5(ccl_annotations) + + expected = [ + (10, 16, EntityTypes.NAME), + (100, 109, EntityTypes.NAME), + (30, 35, EntityTypes.SURNAME), + (50, 59, EntityTypes.CITY), + ] + + assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/detectors/phone/test_phone.py b/tests/detectors/phone/test_phone.py index b2efe23..733f263 100644 --- a/tests/detectors/phone/test_phone.py +++ b/tests/detectors/phone/test_phone.py @@ -1,7 +1,8 @@ from src.detectors.phone.phone import detect_phone_numbers +from src.entity_types import EntityTypes def test_detect_phone_numbers(): text = "My phone number is +48 123 456 789. My friend's number is 123456789."
found_phone_numbers = detect_phone_numbers(text, "en") - assert found_phone_numbers == [(19, 34, '+48 123 456 789'), (58, 67, '123456789')] \ No newline at end of file + assert found_phone_numbers == [(19, 34, EntityTypes.PHONE_NUMBER), (58, 67, EntityTypes.PHONE_NUMBER)] \ No newline at end of file diff --git a/tests/detectors/url/test_url.py b/tests/detectors/url/test_url.py index ad22f69..3d50e4d 100644 --- a/tests/detectors/url/test_url.py +++ b/tests/detectors/url/test_url.py @@ -1,10 +1,11 @@ from src.detectors.url import detect_urls +from src.entity_types import EntityTypes def test_detect_urls(): text = "This is a test for www.google.com. Make sure to go to https://www.google.com" found_urls = detect_urls(text, "en") - assert found_urls == [(19, 33, 'www.google.com'), (54, 76, 'https://www.google.com')] + assert found_urls == [(19, 33, EntityTypes.URL), (54, 76, EntityTypes.URL)] def test_detect_urls_pl(): text = "m.in. https://www.google.com" @@ -12,5 +13,5 @@ found_urls_pl = detect_urls(text, "pl") found_urls_en = detect_urls(text, "en") # m.in is a valid shortcut for między innymi in Polish. It should not be detected as a URL. - assert found_urls_pl == [(6, 28, 'https://www.google.com')] - assert found_urls_en == [(0, 4, "m.in"), (6, 28, 'https://www.google.com')] \ No newline at end of file + assert found_urls_pl == [(6, 28, EntityTypes.URL)] + assert found_urls_en == [(0, 4, EntityTypes.URL), (6, 28, EntityTypes.URL)] \ No newline at end of file diff --git a/tests/detectors/user/test_user.py b/tests/detectors/user/test_user.py index b198f71..0ae3c9e 100644 --- a/tests/detectors/user/test_user.py +++ b/tests/detectors/user/test_user.py @@ -1,7 +1,8 @@ from src.detectors.user.user import detect_users +from src.entity_types import EntityTypes def test_detect_users(): text = "My username is @john_smith. My friend's username is @jane_doe."
found_users = detect_users(text, "en") - assert found_users == [(15, 26, '@john_smith'), (52, 61, '@jane_doe')] \ No newline at end of file + assert found_users == [(15, 26, EntityTypes.USER), (52, 61, EntityTypes.USER)] \ No newline at end of file diff --git a/tests/dictionaries/__init__.py b/tests/dictionaries/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/dictionaries/test_pl_ner_replacements.py b/tests/dictionaries/test_pl_ner_replacements.py new file mode 100644 index 0000000..a694d2e --- /dev/null +++ b/tests/dictionaries/test_pl_ner_replacements.py @@ -0,0 +1,38 @@ +from src.dictionaries.pl_ner_replacements import load_pl_ner_replacements_dictionary +from tempfile import NamedTemporaryFile + +def test_load_pl_ner_replacements_dictionary(): + with NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as file: + file.write("OSOBA\tAndrzejowi\tAndrzej\tsubst:sg:dat:m1\n") + file.write("OSOBA\tAndrzej\tAndrzej\tsubst:sg:m1:imperf\n") + file.write("OSOBA\tKasia\tKasia\tsubst:sg:f:imperf\n") + file.write("MIEJSCE\tWrocław\tWrocław\tsubst:sg:m2:imperf\n") + file.write("MIEJSCE\tWarszawa\tWarszawa\tsubst:sg:f:imperf\n") + file.write("MIEJSCE\tKraków\tKraków\tsubst:sg:m2:imperf\n") + + path = file.name + + dictionary = load_pl_ner_replacements_dictionary(path) + + assert dictionary == { + "OSOBA": { + "Andrzej": { + "subst:sg:dat:m1": "Andrzejowi", + "subst:sg:m1:imperf": "Andrzej" + }, + "Kasia": { + "subst:sg:f:imperf": "Kasia" + } + }, + "MIEJSCE": { + "Wrocław": { + "subst:sg:m2:imperf": "Wrocław" + }, + "Warszawa": { + "subst:sg:f:imperf": "Warszawa" + }, + "Kraków": { + "subst:sg:m2:imperf": "Kraków" + } + } + } \ No newline at end of file diff --git a/tests/suppressors/test_order_based.py b/tests/suppressors/test_order_based.py new file mode 100644 index 0000000..8cf35b9 --- /dev/null +++ b/tests/suppressors/test_order_based.py @@ -0,0 +1,16 @@ +from src.suppressors.order_based import suppress_order_based + +def test_suppress_order_based(): + annotations = [ + (10, 16, "Marian"), + (10, 18, "Marianna"), + (30, 35, "Nowak"), + (50, 59, "Wrocławiu"), + ] + result = suppress_order_based(annotations) + expected = [ + (10, 16, "Marian"), + (30, 35, "Nowak"), + (50, 59, "Wrocławiu"), + ] + assert set(result) == set(expected) \ No newline at end of file diff --git a/tests/test_ccl_parser.py b/tests/test_ccl_parser.py new file mode 100644 index 0000000..e140edc --- /dev/null +++ b/tests/test_ccl_parser.py @@ -0,0 +1,60 @@ +from src.ccl_parser import parse_ccl + +example_ccl = """<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE chunkList SYSTEM "ccl.dtd"> +<chunkList> + <chunk type="p" id="ch1"> + <sentence id="s1"> + <tok> + <orth>Marek</orth> + <lex disamb="1"><base>Marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <lex disamb="1"><base>marek</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="nam_liv" head="1">1</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>Kowalski</orth> + <lex disamb="1"><base>Kowalski</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="nam_liv">1</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>pojechał</orth> + <lex disamb="1"><base>pojechać</base><ctag>praet:sg:m1:perf</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>do</orth> + <lex disamb="1"><base>do</base><ctag>prep:gen</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> + </tok> + <tok> + <orth>Wrocławia</orth> + <lex
disamb="1"><base>WrocÅ‚aw</base><ctag>subst:sg:gen:m3</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc" head="1">1</ann> + </tok> + <ns/> + <tok> + <orth>.</orth> + <lex disamb="1"><base>.</base><ctag>interp</ctag></lex> + <ann chan="nam_liv">0</ann> + <ann chan="nam_loc">0</ann> + </tok> + </sentence> + </chunk> +</chunkList> +""" + +def test_parse_ccl(): + text, annotations = parse_ccl(example_ccl) + + assert text == "Marek Kowalski pojechaÅ‚ do WrocÅ‚awia." + + assert set(annotations.keys()) == set(["nam_liv", "nam_loc", "ctag"]) + + assert annotations["nam_liv"] == [(0, 14, "Marek Kowalski")] + assert annotations["nam_loc"] == [(27, 36, "WrocÅ‚awia")] + assert annotations["ctag"] == [(0, 5, "subst:sg:nom:m1"), (6, 14, "subst:sg:nom:m1"), (15, 23, "praet:sg:m1:perf"), (24, 26, "prep:gen"), (27, 36, "subst:sg:gen:m3"), (36, 37, "interp")] \ No newline at end of file diff --git a/tests/test_string_replacements.py b/tests/test_string_replacements.py new file mode 100644 index 0000000..f44644d --- /dev/null +++ b/tests/test_string_replacements.py @@ -0,0 +1,20 @@ +from src.string_replacements import replace + +def test_replace(): + text = "Ala ma kota" + replacements = [(0, 3, "Andrzej"), (7, 11, "psa")] + + expected = "Andrzej ma psa" + + result = replace(text, replacements) + + assert result == expected + +def test_replace_out_of_order(): + text = "Ala ma kota" + replacements = [(7, 11, "psa"), (0, 3, "Andrzej")] + + expected = "Andrzej ma psa" + result = replace(text, replacements) + + assert result == expected \ No newline at end of file diff --git a/tests/test_tag_anonimization.py b/tests/test_tag_anonimization.py new file mode 100644 index 0000000..3bfd374 --- /dev/null +++ b/tests/test_tag_anonimization.py @@ -0,0 +1,17 @@ + +from src.tag_anonimization import replace_with_tags +from src.entity_types import EntityTypes + +def test_replace_with_tags(): + text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" + detections = [ + (0, 3, EntityTypes.NAME), + (4, 14, EntityTypes.SURNAME), + (28, 38, EntityTypes.DATE), + (42, 51, EntityTypes.CITY), + ] + + result = replace_with_tags(text, detections) + expected = "[OSOBA] [OSOBA] urodziÅ‚a sie [DATE] we [MIEJSCE]" + + assert result == expected \ No newline at end of file -- GitLab