From fbe7e76e463dca650f64445c4deaac4ad7e913b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl> Date: Mon, 19 Jun 2023 07:59:26 +0200 Subject: [PATCH] Green tox --- .gitignore | 3 +- .gitlab-ci.yml | 16 +- cli.py | 9 + config.yaml.template | 4 + main.py | 27 ++- print_config.py | 3 + scripts/winer_input.py | 12 +- src/annotations/__init__.py | 12 +- src/annotations/annotations.py | 22 ++- src/detections/__init__.py | 53 +++++- src/detections/date.py | 29 ++- src/detections/detection.py | 173 ++++++++++++------ src/detectors/date/__init__.py | 4 + src/detectors/date/date.py | 40 +++- src/detectors/date/en.py | 18 +- src/detectors/date/pl.py | 17 +- src/detectors/date/ru.py | 17 +- src/detectors/date/utils.py | 13 ++ src/detectors/email/__init__.py | 4 + src/detectors/email/email.py | 49 +++-- src/detectors/interface.py | 7 +- src/detectors/ner/__init__.py | 4 + src/detectors/ner/ner.py | 28 ++- src/detectors/number/__init__.py | 4 + src/detectors/number/number.py | 16 +- src/detectors/phone/__init__.py | 4 + src/detectors/phone/phone.py | 50 ++--- src/detectors/url/__init__.py | 4 + src/detectors/url/common.py | 17 +- src/detectors/url/pl.py | 2 + src/detectors/url/url.py | 59 +++--- src/detectors/user/__init__.py | 4 + src/detectors/user/user.py | 39 ++-- src/dictionaries/morphosyntactic/__init__.py | 7 +- src/dictionaries/morphosyntactic/interface.py | 31 +++- src/dictionaries/morphosyntactic/ner_file.py | 50 ++++- .../morphosyntactic/ner_file_nkjp.py | 69 ++++++- src/input_parsers/ccl.py | 48 ++++- src/input_parsers/interface.py | 17 +- src/input_parsers/wiktor_ner.py | 64 ++++++- src/pipeline/default.py | 29 ++- src/pipeline/interface.py | 14 +- src/pipeline/sequential_jsonl.py | 32 +++- src/replacers/__init__.py | 2 + src/replacers/date_replacer.py | 19 +- src/replacers/delete_replacer.py | 18 +- src/replacers/email_replacer.py | 27 ++- src/replacers/interface.py | 5 + src/replacers/ner_replacer.py | 25 +++ 
src/replacers/number_replacer.py | 23 ++- src/replacers/tag_replacer.py | 51 +++++- src/replacers/user_replacer.py | 19 +- src/string_replacements.py | 60 +++--- src/suppressors/__init__.py | 2 + src/suppressors/interface.py | 9 +- src/suppressors/order_based.py | 49 +++-- src/utils/subclasses.py | 6 - src/utils/utils.py | 18 ++ src/worker.py | 39 +++- .../test_ccl_configuration.py | 3 + .../test_wiktorner_jsonl_configuration.py | 15 +- ...iktorner_jsonl_txt_output_configuration.py | 29 ++- tests/unit/detectors/date/test_en.py | 2 + tests/unit/detectors/date/test_pl.py | 14 +- tests/unit/detectors/date/test_ru.py | 3 + tests/unit/detectors/email/test_email.py | 3 + tests/unit/detectors/ner/test_ner.py | 2 + tests/unit/detectors/phone/test_phone.py | 3 + tests/unit/detectors/url/test_url.py | 7 +- tests/unit/detectors/user/test_user.py | 3 + .../morphosyntactic/test_ner_file.py | 8 +- .../morphosyntactic/test_ner_file_nkjp.py | 10 +- tests/unit/input_parsers/test_ccl.py | 3 + tests/unit/input_parsers/test_wiktor_ner.py | 2 + tests/unit/pipeline/test_default.py | 19 +- tests/unit/pipeline/test_sequential_jsonl.py | 19 +- tests/unit/replacers/test_date_replacer.py | 12 +- tests/unit/replacers/test_email_replacer.py | 7 +- tests/unit/replacers/test_ner_replacer.py | 9 +- tests/unit/replacers/test_tag_replacer.py | 6 +- tests/unit/replacers/test_user_replacer.py | 4 + tests/unit/suppressors/test_order_based.py | 3 + tests/unit/test_string_replacements.py | 5 + tox.ini | 8 +- 84 files changed, 1306 insertions(+), 389 deletions(-) create mode 100644 config.yaml.template delete mode 100644 src/utils/subclasses.py diff --git a/.gitignore b/.gitignore index 7198980..82f3f84 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,5 @@ cython_debug/ *.ipynb /test.txt -/outputs \ No newline at end of file +/outputs +/config.yaml \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4eb96d5..5d7c443 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ 
-7,14 +7,14 @@ stages: - build before_script: - pip install tox==2.9.1 -# pep8: -# stage: check_style -# script: -# - tox -v -e pep8 -# docstyle: -# stage: check_style -# script: -# - tox -v -e docstyle +pep8: + stage: check_style + script: + - tox -v -e pep8 +docstyle: + stage: check_style + script: + - tox -v -e docstyle build_image: stage: build image: 'docker:18.09.7' diff --git a/cli.py b/cli.py index 1bc12e0..3398a4e 100644 --- a/cli.py +++ b/cli.py @@ -1,3 +1,12 @@ +"""Script for running the anonymizer from the command line. + +Example usage: +```bash +python3 cli.py input.ccl output.txt --replace-method tag --language pl \ + --configuration ccl +``` +""" + import argparse from src.worker import Worker diff --git a/config.yaml.template b/config.yaml.template new file mode 100644 index 0000000..7979f78 --- /dev/null +++ b/config.yaml.template @@ -0,0 +1,4 @@ +# Authorization for NLP WS client + +USERNAME: username +PASSWORD: password \ No newline at end of file diff --git a/main.py b/main.py index 42deb74..7b308f6 100644 --- a/main.py +++ b/main.py @@ -2,6 +2,7 @@ import logging import nlp_ws +from typing import Dict from src.worker import Worker @@ -9,30 +10,48 @@ _log = logging.getLogger(__name__) class AnonymizerWorker(nlp_ws.NLPWorker): - """Class implementing TextFixerWorker worker.""" + """NLP WS worker for anonymizer.""" @classmethod def static_init(cls, config): - """Initialize process.""" + """Initialize the class with configuration. + + Args: + config (dict): Configuration dictionary. (It's not strictly dict, + but it supports dict-like operations.) 
+ + """ cls._configuration = config.get("tool").get("configuration", "ccl") cls._default_language = config.get("tool").get("default_language", "pl") cls._default_replacer = config.get("tool").get("default_replacer", "tag") _log.info( - "AnonymizerWorker initialized with configuration: %s, default language: %s, default replacer: %s", + "AnonymizerWorker initialized with configuration:" + "%s, default language: %s, default replacer: %s", cls._configuration, cls._default_language, cls._default_replacer, ) def __init__(self): + """Initialize the worker instance.""" self._worker = Worker( configuration=self._configuration, default_language=self._default_language, default_replacer=self._default_replacer, ) - def process(self, input_file, task_options, output_file): + def process( + self, input_file: str, task_options: Dict[str, str], output_file: str + ) -> None: + """Process the input and save the result to the output path. + + Args: + input_file (str): Path to the input. + task_options (Dict[str, str]): Runtime configuration of the task. + output_file (str): Path to the output. + + """ self._worker.process(input_file, task_options, output_file) diff --git a/print_config.py b/print_config.py index a02740f..e5a83e1 100644 --- a/print_config.py +++ b/print_config.py @@ -1,3 +1,5 @@ +"""Utility script for printing Hydra config.""" + import json import hydra @@ -6,6 +8,7 @@ from omegaconf import OmegaConf @hydra.main(config_path="config", config_name="config") def main(cfg): + """Main function.""" cfg_resolved = OmegaConf.to_container(cfg, resolve=True) cfg_resolved_json = json.dumps(cfg_resolved, indent=4) diff --git a/scripts/winer_input.py b/scripts/winer_input.py index ec0726e..8d63b1e 100644 --- a/scripts/winer_input.py +++ b/scripts/winer_input.py @@ -1,6 +1,16 @@ +"""Get input for annotation from Winer. + +This utility script is used to get input for anonizmizer using WiNER as input. 
+You need to have config.yaml file in run directory with auth data for lpmn_client_biz. + +Example config.yaml: +USERNAME: username +PASSWORD: password +""" + import json -from lpmn_client_biz import Connection, IOType, Task, delete, download, upload +from lpmn_client_biz import Connection, IOType, Task, download, upload lpmn = [ "morphodita", diff --git a/src/annotations/__init__.py b/src/annotations/__init__.py index c94f5ec..ed1dc62 100644 --- a/src/annotations/__init__.py +++ b/src/annotations/__init__.py @@ -1 +1,11 @@ -from src.annotations.annotations import * +from src.annotations.annotations import ( + Annotation, + MorphosyntacticAnnotation, + NerAnnotation, +) + +__all__ = [ + "Annotation", + "MorphosyntacticAnnotation", + "NerAnnotation", +] diff --git a/src/annotations/annotations.py b/src/annotations/annotations.py index 008fa89..dbe0c3f 100644 --- a/src/annotations/annotations.py +++ b/src/annotations/annotations.py @@ -1,17 +1,37 @@ +"""Module for annotations. + +Annotations are used to mark parts of the text that are relevant for the task. +Annotations are usually extracted from previous parts of the pipeline and used +by the detectors to spot entities in the text. +""" + from dataclasses import dataclass @dataclass class Annotation: + """Interface for annotations.""" + def __hash__(self) -> int: + """Returns the hash of the annotation.""" return (type(self), *(self.__dict__.values())).__hash__() class MorphosyntacticAnnotation(Annotation): - def __init__(self, morphosyntactic_tag) -> None: + """Annotation for morphosyntactic tags.""" + + def __init__(self, morphosyntactic_tag: str) -> None: + """Initializes the annotation. + + Args: + morphosyntactic_tag (str): Morphosyntactic tag. 
+ + """ self.morphosyntactic_tag = morphosyntactic_tag @dataclass class NerAnnotation(Annotation): + """Annotation for named entities.""" + ner_type: str diff --git a/src/detections/__init__.py b/src/detections/__init__.py index 9126ffb..23a3978 100644 --- a/src/detections/__init__.py +++ b/src/detections/__init__.py @@ -1,8 +1,57 @@ -from src.detections.date import * -from src.detections.detection import * +from src.detections.date import DateDetection +from src.detections.detection import ( + Detection, + DetectionType, + CityDetection, + CountryDetection, + EmailDetection, + HydronymDetection, + KRSDetection, + LocationDetection, + MorphosyntacticInfoMixin, + NameDetection, + NumberDetection, + OrganizationNameDetection, + PhoneNumberDetection, + ProperNameDetection, + SerialNumberDetection, + StreetNameDetection, + OtherDetection, + SurnameDetection, + TINDetection, + TitleDetection, + UrlDetection, + UserDetection, +) from src.utils.subclasses import get_sublcasses DETECTION_CLASSES_MAP = { detection_class.TYPE_NAME: detection_class for detection_class in get_sublcasses(Detection) } + +__all__ = [ + "Detection", + "DetectionType", + "CityDetection", + "CountryDetection", + "EmailDetection", + "HydronymDetection", + "KRSDetection", + "LocationDetection", + "MorphosyntacticInfoMixin", + "NameDetection", + "NumberDetection", + "OrganizationNameDetection", + "PhoneNumberDetection", + "ProperNameDetection", + "SerialNumberDetection", + "StreetNameDetection", + "OtherDetection", + "SurnameDetection", + "TINDetection", + "TitleDetection", + "UrlDetection", + "UserDetection", + "DateDetection", +] diff --git a/src/detections/date.py b/src/detections/date.py index f5d01fd..8754dc5 100644 --- a/src/detections/date.py +++ b/src/detections/date.py @@ -1,12 +1,21 @@ +"""Module for date detection in text.""" + from typing import List, Optional, Tuple from src.detections.detection import Detection class DateDetection(Detection): + """Date detection in text. 
+ + Eg.: 12.05.2023 + """ + TYPE_NAME = "date" class AnnotationPart: + """Annotation part of date detection.""" + TWO_DIGITS_DAY = "DD" ONE_DIGIT_DAY = "D" TWO_DIGIT_MONTH = "MM" @@ -19,15 +28,25 @@ class DateDetection(Detection): def __init__( self, format: Optional[List[Tuple[AnnotationPart, str]]] = None ) -> None: - """ - The annotation representing a date value. - :param format: the format of the date, e.g. [(AnnotationPart.TWO_DIGITS_DAY, "01"), (AnnotationPart.OTHER, ".") ...] - :type format: Optional[List[Tuple[str, str]]] - """ + """Date detection initialization. + Args: + format (Optional[List[Tuple[AnnotationPart, str]]], optional): Format of + detected date. Defaults to None. + + """ super().__init__() self.format = format def __eq__(self, other) -> bool: + """Compare two date detections. + + Args: + other (DateDetection): date detection to compare with + + Returns: + bool: true if both detections are equal, false otherwise + + """ return self.format == other.format and super().__eq__(other) diff --git a/src/detections/detection.py b/src/detections/detection.py index 9070497..f527b21 100644 --- a/src/detections/detection.py +++ b/src/detections/detection.py @@ -1,153 +1,210 @@ +"""Module containg definions of simple detection types.""" + from dataclasses import dataclass -from typing import Optional @dataclass class Detection: + """Interface for detections. + + Should be used as base class for all detections. + It should not be used directly. For other detections see: OtherDetection class + """ + TYPE_NAME = "detection" def __hash__(self) -> int: + """Hash function for detection.""" return (type(self), *(self.__dict__.values())).__hash__() class MorphosyntacticInfoMixin: + """Mixin for detections with morphosyntactic information. + + Eg. detection of "Rolexes" can be thought of as "Rolex" + with morphosyntactic tag "NNS" (noun, plural). + """ + def __init__(self, morpho_tag: str, *args, **kwargs) -> None: + """Initialization of MorphosyntacticInfoMixin. 
+ + Args: + morpho_tag (str): Morphosyntactic tag of detection, eg. "NNS", + "sg:nom:m1" etc. + + """ super().__init__(*args, **kwargs) self._morpho_tag = morpho_tag @property def morpho_tag(self) -> str: + """Morphosyntactic tag of detection.""" return self._morpho_tag class NameDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "name" + """Class representing name detection. - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + Eg.: "John", "Mark" + """ + + TYPE_NAME = "name" class SurnameDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "surname" + """Class representing surname detection. + + Eg.: "Smith", "Johnson" + """ - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + TYPE_NAME = "surname" class LocationDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "location" + """Class representing location detection. - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + Eg.: "Park Narodowy BiaÅ‚owieski", "Tatry" + """ + + TYPE_NAME = "location" class OrganizationNameDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "organization_name" + """Class for organization name detection. + + Eg.: "Apple", "Microsoft" + """ - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + TYPE_NAME = "organization_name" class ProperNameDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "proper_name" + """Class representing proper name detection. - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + Eg.: "Rolex" + """ + + TYPE_NAME = "proper_name" class TitleDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "title" + """Class representing title detection. 
+ + Eg.: "Fast and Furious", "The Lord of the Rings" + """ - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + TYPE_NAME = "title" class HydronymDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "hydronym" + """Class representing hydronym detection. - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + Eg.: "WisÅ‚a", "Odra" + """ + + TYPE_NAME = "hydronym" class StreetNameDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "street_name" + """Class representing street name detection. + + Eg.: "MarszaÅ‚kowska", "KoÅ›ciuszki" + """ - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + TYPE_NAME = "street_name" class CityDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "city" + """Class representing city detection. - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + Eg.: "Warsaw", "Berlin" + """ + + TYPE_NAME = "city" class CountryDetection(MorphosyntacticInfoMixin, Detection): - TYPE_NAME = "country" + """Class representing country detection. + + Eg.: "Poland", "Germany" + """ - def __init__(self, morpho_tag: Optional[str] = None) -> None: - super().__init__(morpho_tag=morpho_tag) + TYPE_NAME = "country" class UrlDetection(Detection): - TYPE_NAME = "url" + """Class for url detection. - def __init__(self) -> None: - super().__init__() + Eg.: "https://www.google.com", "www.google.com" + """ + + TYPE_NAME = "url" class UserDetection(Detection): - TYPE_NAME = "user" + """Class for user detection. + + Eg.: "@bob25", "@angelica111" + """ - def __init__(self) -> None: - super().__init__() + TYPE_NAME = "user" class EmailDetection(Detection): - TYPE_NAME = "email" + """Class representing email detection. 
- def __init__(self) -> None: - super().__init__() + Eg.: bob@gmail.com + """ + + TYPE_NAME = "email" class NumberDetection(Detection): - TYPE_NAME = "number" + """Class for number detection. - def __init__(self) -> None: - super().__init__() + Eg.: "123", "123.456" + """ + + TYPE_NAME = "number" class PhoneNumberDetection(NumberDetection): - TYPE_NAME = "phone_number" + """Class for phone number detection. - def __init__(self) -> None: - super().__init__() + Eg.: "123-456-789", "+49 123 456 789" + """ + + TYPE_NAME = "phone_number" class TINDetection(Detection): # Tax Identification Number - TYPE_NAME = "tin" + """Class for NIP (Tax Identification Number) detection. + + Eg.: "123-456-32-18", "1234563218" + """ - def __init__(self) -> None: - super().__init__() + TYPE_NAME = "tin" class KRSDetection(Detection): # National Court Register - TYPE_NAME = "krs" + """Class for KRS (National Court Register) number detection. - def __init__(self) -> None: - super().__init__() + Eg.: "0000000123" + """ + + TYPE_NAME = "krs" class SerialNumberDetection(Detection): - TYPE_NAME = "serial_number" + """Serial number detection. 
- def __init__(self) -> None: - super().__init__() + Eg.: "AB1234567890" + """ + + TYPE_NAME = "serial_number" class OtherDetection(Detection): # Non standard entity - TYPE_NAME = "other" + """Detection of an entity that does not fit into other categories.""" - def __init__(self) -> None: - super().__init__() + TYPE_NAME = "other" diff --git a/src/detectors/date/__init__.py b/src/detectors/date/__init__.py index bac0ba7..c880ba6 100644 --- a/src/detectors/date/__init__.py +++ b/src/detectors/date/__init__.py @@ -1 +1,5 @@ from src.detectors.date.date import DateDetector + +__all__ = [ + "DateDetector", +] diff --git a/src/detectors/date/date.py b/src/detectors/date/date.py index 6ef4a20..d434ce2 100644 --- a/src/detectors/date/date.py +++ b/src/detectors/date/date.py @@ -1,3 +1,5 @@ +"""Module responsible for detecting dates in the text.""" + from typing import Any, Dict, List, Tuple from src.detections import DateDetection @@ -9,26 +11,46 @@ from .ru import detect_dates_ru class DateDetector(Detector): + """Detects dates in the text.""" + def __init__(self, language: str = "pl") -> None: + """Initializes the detector. + + Args: + language (str, optional): Language of analyzed texts. Defaults to "pl". + + """ self._language = language def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] ) -> List[Tuple[int, int, DateDetection]]: + """Finds dates in the text. + + Args: + text (str): The text to be searched + annotations (Dict[str, List[Tuple[int, int, Any]]]): Annotations of the text + + Returns: + List[Tuple[int, int, DateDetection]]: a list of tuples containing + (start, end, annotation) + + """ return find_dates(text, self._language) def find_dates(text: str, language: str = "pl") -> List[Tuple[int, int, DateDetection]]: - """ - Finds dates in the text. 
- :param text: the text to be searched - :type text: str - :param language: the language of the text - :type language: str - :return: a list of tuples containing (start, end, annotation) - :rtype: List[Tuple[int, int, Annotation]] - """ + """Finds dates in the text. + Args: + text (str): the text to be searched + language (str, optional): The language of the text. Defaults to "pl". + + Returns: + List[Tuple[int, int, DateDetection]]: a list of tuples containing + (start, end, annotation) + + """ language_processors = { "en": detect_dates_en, "pl": detect_dates_pl, diff --git a/src/detectors/date/en.py b/src/detectors/date/en.py index 22423da..12996a3 100644 --- a/src/detectors/date/en.py +++ b/src/detectors/date/en.py @@ -1,3 +1,5 @@ +"""Date detector for English dates.""" + from typing import List, Tuple import regex as re @@ -26,16 +28,20 @@ EN_DATES_REGEX = re.compile( def detect_dates_en(text: str) -> List[Tuple[int, int, DateDetection]]: - """ - Detects English dates in the text. - :param text: the text to be searched - :type text: str - :return: a list of tuples containing (start, end, annotation) - :rtype: List[Tuple[int, int, DateAnnotation]] + """Detects English dates in the text. 
+ + Args: + text (str): The text to be searched + + Returns: + List[Tuple[int, int, DateDetection]]: a list of tuples containing + (start, end, annotation) + """ matches = EN_DATES_REGEX.finditer(text) dates = [] for match in matches: date_format = parse_date_to_format(match.groupdict()) dates.append((match.start(), match.end(), DateDetection(date_format))) + return dates diff --git a/src/detectors/date/pl.py b/src/detectors/date/pl.py index 3000a75..6b6c6b1 100644 --- a/src/detectors/date/pl.py +++ b/src/detectors/date/pl.py @@ -1,3 +1,5 @@ +"""Date detector for Polish dates.""" + from typing import List, Tuple import regex as re @@ -32,12 +34,15 @@ PL_YEAR_REGEX = re.compile(r"(?<year>\d+)\s*(?<addon>roku?)") def detect_dates_pl(text: str) -> List[Tuple[int, int, DateDetection]]: - """ - Detects Polish dates in the text. - :param text: the text to be searched - :type text: str - :return: a list of tuples containing (start, end, annotation) - :rtype: List[Tuple[int, int, DateAnnotation]] + """Detects polish dates in the text. + + Args: + text (str): The text to be searched + + Returns: + List[Tuple[int, int, DateDetection]]: a list of tuples containing + (start, end, annotation) + """ matches = PL_DATES_REGEX.finditer(text) diff --git a/src/detectors/date/ru.py b/src/detectors/date/ru.py index 247d040..af8369b 100644 --- a/src/detectors/date/ru.py +++ b/src/detectors/date/ru.py @@ -1,3 +1,5 @@ +"""Date detector for Russian dates.""" + from typing import List, Tuple import regex as re @@ -29,12 +31,15 @@ RU_DATES_REGEX = re.compile( def detect_dates_ru(text: str) -> List[Tuple[int, int, DateDetection]]: - """ - Detects Russian dates in the text. - :param text: the text to be searched - :type text: str - :return: a list of tuples containing (start, end, annotation) - :rtype: List[Tuple[int, int, DateAnnotation]] + """Detects Russian dates in the text. 
+ + Args: + text (str): The text to be searched + + Returns: + List[Tuple[int, int, DateDetection]]: a list of tuples containing + (start, end, annotation) + """ matches = RU_DATES_REGEX.finditer(text) dates = [] diff --git a/src/detectors/date/utils.py b/src/detectors/date/utils.py index 356c185..05fc9fd 100644 --- a/src/detectors/date/utils.py +++ b/src/detectors/date/utils.py @@ -1,3 +1,5 @@ +"""Utility scripts for parsing date detections.""" + from typing import List, Tuple from src.detections import DateDetection, Optional @@ -173,6 +175,17 @@ def _parse_year_only(re_entry) -> List[Tuple[DateDetection.AnnotationPart, str]] def parse_date_to_format( re_entry, ) -> Optional[List[Tuple[DateDetection.AnnotationPart, str]]]: + """Parse date to format. + + TODO: This should be definietly refactored :( + + Args: + re_entry (_type_): regex entry + + Returns: + Optional[List[Tuple[DateDetection.AnnotationPart, str]]]: pared dataentry + + """ if re_entry.get("day_or_month_year", None) is not None: result = _parse_day_or_month(re_entry) elif re_entry.get("year_month_or_day", None) is not None: diff --git a/src/detectors/email/__init__.py b/src/detectors/email/__init__.py index 5342f79..05fcf54 100644 --- a/src/detectors/email/__init__.py +++ b/src/detectors/email/__init__.py @@ -1 +1,5 @@ from src.detectors.email.email import EmailDetector + +__all__ = [ + "EmailDetector", +] diff --git a/src/detectors/email/email.py b/src/detectors/email/email.py index e63b3a4..4092c28 100644 --- a/src/detectors/email/email.py +++ b/src/detectors/email/email.py @@ -1,3 +1,5 @@ +"""Module for the email detector.""" + from typing import Any, Dict, List, Tuple import regex as re @@ -6,16 +8,6 @@ from src.detections import EmailDetection from src.detectors.interface import Detector -class EmailDetector(Detector): - def __init__(self) -> None: - super().__init__() - - def detect( - self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, 
EmailDetection]]: - return detect_emails(text) - - EMAIL_REGEX = re.compile( r"(?P<local_part>[a-z0-9!#$%&\'*+/=?^_`{|}~-]+" r"(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*@)" @@ -25,18 +17,25 @@ EMAIL_REGEX = re.compile( ) -def detect_emails(text: str) -> List[Tuple[int, int, EmailDetection]]: - """ - Detects emails in the text. - :param text: the text to be searched - :type text: str - :param language: the language of the text - :type language: str - :return: a list of tuples containing (start, end, entity_type) - :rtype: List[Tuple[int, int, EmailAnnotation]] - """ - matches = EMAIL_REGEX.finditer(text) - emails = [] - for match in matches: - emails.append((match.start(), match.end(), EmailDetection())) - return emails +class EmailDetector(Detector): + """Detector for emails.""" + + def detect( + self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] + ) -> List[Tuple[int, int, EmailDetection]]: + """Detects emails in the text. + + Args: + text (str): the text to be searched + + Returns: + List[Tuple[int, int, EmailDetection]]: a list of tuples containing + (start, end, entity_type) + + """ + matches = EMAIL_REGEX.finditer(text) + emails = [] + for match in matches: + emails.append((match.start(), match.end(), EmailDetection())) + + return emails diff --git a/src/detectors/interface.py b/src/detectors/interface.py index ca68255..71cb852 100644 --- a/src/detectors/interface.py +++ b/src/detectors/interface.py @@ -1,3 +1,5 @@ +"""Interface for detectors.""" + from abc import ABC, abstractmethod from typing import Any, Dict, List, Tuple @@ -5,11 +7,13 @@ from src.detections import Detection class Detector(ABC): + """Interface for detectors.""" + @abstractmethod def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] ) -> List[Tuple[int, int, Detection]]: - """Detects entities in text + """Detects entities in text. Args: text (str): Text to be processed. 
@@ -17,4 +21,5 @@ class Detector(ABC): Returns: List[Tuple[int, int, Detection]]: List of detections. + """ diff --git a/src/detectors/ner/__init__.py b/src/detectors/ner/__init__.py index 36250fb..cfdfa9d 100644 --- a/src/detectors/ner/__init__.py +++ b/src/detectors/ner/__init__.py @@ -1 +1,5 @@ from src.detectors.ner.ner import NerDetector + +__all__ = [ + "NerDetector", +] diff --git a/src/detectors/ner/ner.py b/src/detectors/ner/ner.py index bff8891..6b4e900 100644 --- a/src/detectors/ner/ner.py +++ b/src/detectors/ner/ner.py @@ -1,20 +1,42 @@ +"""Module for named entity recognition detector.""" + from typing import Dict, List, Tuple -from src.annotations import (Annotation, MorphosyntacticAnnotation, - NerAnnotation) -from src.detections import (DETECTION_CLASSES_MAP, MorphosyntacticInfoMixin) +from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation +from src.detections import DETECTION_CLASSES_MAP, MorphosyntacticInfoMixin from src.detectors.interface import Detector class NerDetector(Detector): + """Detector for named entities.""" + def __init__(self, detection_mapping: Dict[str, str], language: str = "pl") -> None: + """Initialize detector. + + Args: + detection_mapping (Dict[str, str]): mapping from NER types to detection + classes. + language (str, optional): Language of detection. Defaults to "pl". + + """ self._language = language self._detection_mapping = detection_mapping def detect( self, text: str, annotations: List[Tuple[int, int, Annotation]] ) -> List[Tuple[int, int, str]]: + """Detect named entities in the text. + + NOTE: This is not NER detector. It extracts named entities from the annotations. 
+ + Args: + text (str): Text to be processed + annotations (List[Tuple[int, int, Annotation]]): Annotations of the text + + Returns: + List[Tuple[int, int, str]]: List of detected named entities + """ morpho_tags = dict() ner_detections = [] diff --git a/src/detectors/number/__init__.py b/src/detectors/number/__init__.py index e72ac46..cc3eb10 100644 --- a/src/detectors/number/__init__.py +++ b/src/detectors/number/__init__.py @@ -1 +1,5 @@ from src.detectors.number.number import NumberDetector + +__all__ = [ + "NumberDetector", +] diff --git a/src/detectors/number/number.py b/src/detectors/number/number.py index b3fed57..e297241 100644 --- a/src/detectors/number/number.py +++ b/src/detectors/number/number.py @@ -1,3 +1,5 @@ +"""Module for the number detector.""" + from typing import Any, Dict, List, Tuple import regex as re @@ -12,13 +14,23 @@ NUMBER_REGEX = re.compile( class NumberDetector(Detector): - def __init__(self) -> None: - super().__init__() + """Detector for numbers.""" def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] ) -> List[Tuple[int, int, NumberDetection]]: + """Detects numbers in the text. 
+ + Args: + text (str): the text to be searched + annotations (Dict[str, List[Tuple[int, int, Any]]]): Annotations of + the text + + Returns: + List[Tuple[int, int, NumberDetection]]: a list of tuples containing + (start, end, entity_type) + """ numbers = [] for number in NUMBER_REGEX.finditer(text): diff --git a/src/detectors/phone/__init__.py b/src/detectors/phone/__init__.py index acfec77..bc96d6c 100644 --- a/src/detectors/phone/__init__.py +++ b/src/detectors/phone/__init__.py @@ -1 +1,5 @@ from src.detectors.phone.phone import PhoneNumberDetector + +__all__ = [ + "PhoneNumberDetector", +] diff --git a/src/detectors/phone/phone.py b/src/detectors/phone/phone.py index 9ba9520..71dc67d 100644 --- a/src/detectors/phone/phone.py +++ b/src/detectors/phone/phone.py @@ -1,3 +1,5 @@ +"""Module for detecting the phone number.""" + from typing import Any, Dict, List, Tuple import regex as re @@ -6,34 +8,32 @@ from src.detections import PhoneNumberDetection from src.detectors.interface import Detector -class PhoneNumberDetector(Detector): - def __init__(self) -> None: - super().__init__() - - def detect( - self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] - ) -> List[Tuple[int, int, PhoneNumberDetection]]: - return detect_phone_numbers(text) - - PHONE_NUMBER_REGEX = re.compile( r"(?P<country_code>(00[1-9]\d?)|(\(?([+\d]{2,3})\)?)[- ]??)?" r"(?P<number>(\d[- ]??){9,10})" ) -def detect_phone_numbers(text: str) -> List[Tuple[int, int, PhoneNumberDetection]]: - """ - Detects phone numbers in the text. 
- :param text: the text to be searched - :type text: str - :param language: the language of the text - :type language: str - :return: a list of tuples containing (start, end, entity_type) - :rtype: List[Tuple[int, int, PhoneNumberAnnotation]] - """ - matches = PHONE_NUMBER_REGEX.finditer(text) - phone_numbers = [] - for match in matches: - phone_numbers.append((match.start(), match.end(), PhoneNumberDetection())) - return phone_numbers +class PhoneNumberDetector(Detector): + """Detector for phone numbers.""" + + def detect( + self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] + ) -> List[Tuple[int, int, PhoneNumberDetection]]: + """Detects phone numbers in the text. + + Args: + text (str): the text to be searched + annotations (Dict[str, List[Tuple[int, int, Any]]]): Annotations of + the text + + Returns: + List[Tuple[int, int, PhoneNumberDetection]]: a list of tuples + containing (start, end, entity_type) + + """ + matches = PHONE_NUMBER_REGEX.finditer(text) + phone_numbers = [] + for match in matches: + phone_numbers.append((match.start(), match.end(), PhoneNumberDetection())) + return phone_numbers diff --git a/src/detectors/url/__init__.py b/src/detectors/url/__init__.py index 5fa84d2..c553e40 100644 --- a/src/detectors/url/__init__.py +++ b/src/detectors/url/__init__.py @@ -1 +1,5 @@ from src.detectors.url.url import UrlDetector + +__all__ = [ + "UrlDetector", +] diff --git a/src/detectors/url/common.py b/src/detectors/url/common.py index d3e814d..085c701 100644 --- a/src/detectors/url/common.py +++ b/src/detectors/url/common.py @@ -1,9 +1,22 @@ -from typing import List +"""Helper module for detecting URLs.""" + +from typing import List, AnyStr import regex as re +from regex import compile, Pattern + + +def generate_url_regex(exeptions: List[str]) -> Pattern[AnyStr @ compile]: + """Returns a regex for detecting urls. + + Args: + exeptions (List[str]): List of strings that should not be treated as + urls. 
+ Returns: + str: a regex for detecting urls -def generate_url_regex(exeptions: List[str]) -> str: + """ return re.compile( r"\b(?:{})\b(*SKIP)(*FAIL)|".format("|".join(exeptions)) + r"(?:(?P<protocol>(?:(?:https?|ftp):)?\/\/)?" diff --git a/src/detectors/url/pl.py b/src/detectors/url/pl.py index c4c790a..19c17cc 100644 --- a/src/detectors/url/pl.py +++ b/src/detectors/url/pl.py @@ -1,3 +1,5 @@ +"""Helper module for detecting Polish URLs.""" + from .common import generate_url_regex PL_URL_REGEX_EXEPTIONS = ["m.in"] diff --git a/src/detectors/url/url.py b/src/detectors/url/url.py index ac6996c..5fa76dd 100644 --- a/src/detectors/url/url.py +++ b/src/detectors/url/url.py @@ -1,5 +1,6 @@ -from typing import Any, Dict, List, Tuple +"""Detector for urls.""" +from typing import Any, Dict, List, Tuple from src.detections import UrlDetection from src.detectors.interface import Detector @@ -9,33 +10,41 @@ from .pl import URL_REGEX_PL class UrlDetector(Detector): + """Detector for urls.""" + def __init__(self, language: str = "pl") -> None: + """Initializes the detector. + + Args: + language (str, optional): Language of analyzed texts. + Defaults to "pl". + + """ self._language = language def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] ) -> List[Tuple[int, int, UrlDetection]]: - return detect_urls(text, self._language) - - -def detect_urls(text: str, language: str) -> List[Tuple[int, int, UrlDetection]]: - """ - Detects urls in the text. 
- :param text: the text to be searched - :type text: str - :param language: the language of the text - :type language: str - :return: a list of tuples containing (start, end, entity_type) - :rtype: List[Tuple[int, int, UrlAnnotation]] - """ - if language == "pl": - url_regex = URL_REGEX_PL - else: - url_regex = generate_url_regex(language) - - matches = url_regex.finditer(text) - urls = [] - for match in matches: - urls.append((match.start(), match.end(), UrlDetection())) - - return urls + """Detects urls in the text. + + Args: + text (str): the text to be searched + annotations (Dict[str, List[Tuple[int, int, Any]]]): Annotations of + the text + + Returns: + List[Tuple[int, int, UrlDetection]]: a list of tuples containing + (start, end, entity_type) + + """ + if self._language == "pl": + url_regex = URL_REGEX_PL + else: + url_regex = generate_url_regex(self._language) + + matches = url_regex.finditer(text) + urls = [] + for match in matches: + urls.append((match.start(), match.end(), UrlDetection())) + + return urls diff --git a/src/detectors/user/__init__.py b/src/detectors/user/__init__.py index fa43828..2ceb5c4 100644 --- a/src/detectors/user/__init__.py +++ b/src/detectors/user/__init__.py @@ -1 +1,5 @@ from src.detectors.user.user import UserDetector + +__all__ = [ + "UserDetector", +] diff --git a/src/detectors/user/user.py b/src/detectors/user/user.py index c197352..448fda0 100644 --- a/src/detectors/user/user.py +++ b/src/detectors/user/user.py @@ -1,3 +1,5 @@ +"""Module for detecting user mentions in a text.""" + from typing import Any, Dict, List, Tuple import regex as re @@ -6,31 +8,30 @@ from src.detections import UserDetection from src.detectors.interface import Detector +USER_REGEX = re.compile(r"\B(?P<username>\@[\w\-]+)") + + class UserDetector(Detector): - def __init__(self) -> None: - super().__init__() + """Detector for user mentions.""" def detect( self, text: str, annotations: Dict[str, List[Tuple[int, int, Any]]] ) -> List[Tuple[int, int, 
UserDetection]]: - return detect_users(text) + """Detects users in the text. + Args: + text (str): the text to be searched + annotations (Dict[str, List[Tuple[int, int, Any]]]): Annotations of + the text -USER_REGEX = re.compile(r"\B(?P<username>\@[\w\-]+)") + Returns: + List[Tuple[int, int, UserDetection]]: a list of tuples containing + (start, end, entity_type) + """ + matches = USER_REGEX.finditer(text) + users = [] + for match in matches: + users.append((match.start(), match.end(), UserDetection())) -def detect_users(text: str) -> List[Tuple[int, int, UserDetection]]: - """ - Detects users in the text. - :param text: the text to be searched - :type text: str - :param language: the language of the text - :type language: str - :return: a list of tuples containing (start, end, entity_type) - :rtype: List[Tuple[int, int, UserAnnotation]] - """ - matches = USER_REGEX.finditer(text) - users = [] - for match in matches: - users.append((match.start(), match.end(), UserDetection())) - return users + return users diff --git a/src/dictionaries/morphosyntactic/__init__.py b/src/dictionaries/morphosyntactic/__init__.py index 22d3339..418b5c7 100644 --- a/src/dictionaries/morphosyntactic/__init__.py +++ b/src/dictionaries/morphosyntactic/__init__.py @@ -1,2 +1,5 @@ -from src.dictionaries.morphosyntactic.interface import \ - MorphosyntacticDictionary +from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary + +__all__ = [ + "MorphosyntacticDictionary", +] diff --git a/src/dictionaries/morphosyntactic/interface.py b/src/dictionaries/morphosyntactic/interface.py index 8e22432..d89c09e 100644 --- a/src/dictionaries/morphosyntactic/interface.py +++ b/src/dictionaries/morphosyntactic/interface.py @@ -1,3 +1,5 @@ +"""Morpohosyntactic Dictionary Interface definition.""" + from abc import ABC, abstractmethod from typing import List, Optional, Type @@ -5,13 +7,34 @@ from src.detections import Detection class MorphosyntacticDictionary(ABC): + """Interface for 
dictionaries with morphosyntactic tags. + + Dictionary allows to replace a detection with anonimized one + while (trying to) keep the same morphosyntactic form + + Ie.: Janusza -> Marka, Januszowi -> Markowi etc. + """ + @abstractmethod def get_supported_detection_classes(self) -> List[Type[Detection]]: + """Returns a list of supported detection classess. + + Returns: + List[Type[Detection]]: List of detection classes that are supported + """ - Returns a list of supported detection classess - """ + pass def get_random_replacement(self, original_entry: Detection) -> Optional[str]: + """Returns a random replacement of original entry. + + Args: + original_entry (Detection): Detection that should be replaced. Class + should have MorphosyntacticInfoMixin + + + Returns: + Optional[str]: Text that should replace the original entry + """ - Returns a random replacement for the original entry - """ + pass diff --git a/src/dictionaries/morphosyntactic/ner_file.py b/src/dictionaries/morphosyntactic/ner_file.py index e143403..dff4fcb 100644 --- a/src/dictionaries/morphosyntactic/ner_file.py +++ b/src/dictionaries/morphosyntactic/ner_file.py @@ -1,19 +1,43 @@ +"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.""" + import random from collections import defaultdict from typing import List, Optional, Type -from src.detections import (DETECTION_CLASSES_MAP, Detection, - MorphosyntacticInfoMixin) -from src.dictionaries.morphosyntactic.interface import \ - MorphosyntacticDictionary +from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin +from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): + """Morphosyntactic dictionary that uses a tsv file with NER tags as a source. 
+ + Example of a tsv file: + name Aaronom Aaron subst:pl:dat:m1 + name Aaronami Aaron subst:pl:inst:m1 + name Aaronach Aaron subst:pl:loc:m1 + country Apolonie Apolonia subst:pl:voc:f + country Apolonii Apolonia subst:sg:dat:f + country Apolonii Apolonia subst:pl:gen:f + country Apolonii Apolonia subst:sg:loc:f + city Araba Arab subst:sg:gen:m2 + city Arabie Arab subst:sg:voc:m2 + city Arabem Arab subst:sg:inst:m2 + """ + def __init__( self, dictionary_path: Optional[str] = None, always_replace=True, ) -> None: + """Initializes NERFileMorphosyntacticDictionary. + + Args: + dictionary_path (Optional[str], optional): Path to dictionary tsv file. + Defaults to None. + always_replace (bool, optional): Wheter to replace detection even if no + word with matching morpho tag is found. Defaults to True. + + """ super().__init__() self._dictionary = None self._always_replace = always_replace @@ -31,12 +55,26 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): self._dictionary = replacement_dictionary def get_supported_detection_classes(self) -> List[Type[Detection]]: - """ - Returns a list of supported detection classes + """Returns a list of supported detection classess. + + Returns: + List[Type[Detection]]: List of detection classes that are supported + """ return [DETECTION_CLASSES_MAP[name] for name in self._dictionary.keys()] def get_random_replacement(self, original_entry: Detection) -> Optional[str]: + """Returns a random replacement of original entry. + + Args: + original_entry (Detection): Detection that should be replaced. 
Class + should have MorphosyntacticInfoMixin + + + Returns: + Optional[str]: Text that should replace the original entry + + """ original_entry_type = type(original_entry) original_entry_type_name = original_entry_type.TYPE_NAME diff --git a/src/dictionaries/morphosyntactic/ner_file_nkjp.py b/src/dictionaries/morphosyntactic/ner_file_nkjp.py index 80eb16a..e322624 100644 --- a/src/dictionaries/morphosyntactic/ner_file_nkjp.py +++ b/src/dictionaries/morphosyntactic/ner_file_nkjp.py @@ -1,22 +1,83 @@ +"""Module implementing NERFileNKJPMorphosyntacticDictionary class. + +Module responsible for Morphosyntactic dict that uses a tsv file with NER tags +and NKJP-formatted morpho tags. + +NKJP-formatted morpho tags differ from the original morpho tags that they don't have +part of speech information. For example: +Nomral morpheus tag: subst:sg:gen:m2 +NKJP morpho tag: sg:gen:m2 +""" + import random -from typing import Optional +from typing import List, Optional, Type -from src.detections import Detection, MorphosyntacticInfoMixin -from src.dictionaries.morphosyntactic.ner_file import \ - NERFileMorphosyntacticDictionary +from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin +from src.dictionaries.morphosyntactic.ner_file import NERFileMorphosyntacticDictionary class NERFileNKJPMorphosyntacticDictionary(NERFileMorphosyntacticDictionary): + """Morphosyntactic dictionary that uses a tsv file with NER tags as a source. + + NKJP-formatted morpho tags differ from the original morpho tags that they don't have + part of speech information. 
For example: + Nomral morpheus tag: subst:sg:gen:m2 + NKJP morpho tag: sg:gen:m2 + + Example of a tsv file: + name Aaronom Aaron pl:dat:m1 + name Aaronami Aaron pl:inst:m1 + name Aaronach Aaron pl:loc:m1 + country Apolonie Apolonia pl:voc:f + country Apolonii Apolonia sg:dat:f + country Apolonii Apolonia pl:gen:f + country Apolonii Apolonia sg:loc:f + city Araba Arab sg:gen:m2 + city Arabie Arab sg:voc:m2 + city Arabem Arab sg:inst:m2 + """ + def __init__( self, dictionary_path: Optional[str] = None, always_replace=True, remove_first_morpho_subtag=True, ) -> None: + """Initializes NERFileNKJPMorphosyntacticDictionary. + + Args: + dictionary_path (Optional[str], optional): Path to dictionary tsv file. + Defaults to None. + always_replace (bool, optional): Wheter to replace detection even if no + word with matching morpho tag is found. Defaults to True. + remove_first_morpho_subtag (bool, optional): Wheter to remove first morpho. + Defaults to True. + + """ super().__init__(dictionary_path, always_replace) self._remove_first_morpho_subtag = remove_first_morpho_subtag + def get_supported_detection_classes(self) -> List[Type[Detection]]: + """Returns a list of supported detection classess. + + Returns: + List[Type[Detection]]: List of detection classes that are supported + + """ + return [DETECTION_CLASSES_MAP[name] for name in self._dictionary.keys()] + def get_random_replacement(self, original_entry: Detection) -> Optional[str]: + """Returns a random replacement of original entry. + + Args: + original_entry (Detection): Detection that should be replaced. 
Class + should have MorphosyntacticInfoMixin + + + Returns: + Optional[str]: Text that should replace the original entry + + """ original_entry_type = type(original_entry) original_entry_type_name = original_entry_type.TYPE_NAME diff --git a/src/input_parsers/ccl.py b/src/input_parsers/ccl.py index 44b5946..f6a172f 100644 --- a/src/input_parsers/ccl.py +++ b/src/input_parsers/ccl.py @@ -1,28 +1,60 @@ +"""Module for parsing CCL files.""" + from typing import List, Tuple from lxml import etree -from src.annotations import (Annotation, MorphosyntacticAnnotation, - NerAnnotation) +from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation from src.input_parsers.interface import InputParser class CCLInputParser(InputParser): - def __init__( - self, - ) -> None: - super().__init__() + """Parser for CCL files. + + Example CCL file: + <?xml version="1.0" encoding="UTF-8"?> + <!DOCTYPE chunkList SYSTEM "ccl.dtd"> + <chunkList> + <chunk type="p" id="ch1"> + <sentence id="s1"> + <tok> + <orth>Tom</orth> + <lex disamb="1"><base>Tom</base><ctag>subst:sg:nom:m1</ctag></lex> + <ann chan="person_first_nam" head="1">1</ann> + </tok> + <tok> + <orth>is</orth> + <lex disamb="1"><base>be</base><ctag>fin:sg:ter:pres</ctag></lex> + <ann chan="person_first_nam">0</ann> + </tok> + <tok> + <orth>nice</orth> + <lex disamb="1"><base>nice</base><ctag>adj:sg:nom:f:pos</ctag></lex> + <ann chan="person_first_nam">0</ann> + </tok> + <tok> + <orth>!</orth> + <lex disamb="1"><base>!</base><ctag>interp</ctag></lex> + <ann chan="person_first_nam">0</ann> + </tok> + </sentence> + </chunk> + </chunkList> + """ def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]: """Parse CCL string into text and annotations. - Annotations are returned as a dictionary with channel name as a key and list of tuples. + Annotations are returned as a dictionary with channel name as a key and list of + tuples. Args: content (str): Content of ccl file. 
Returns: - Tuple[str, Dict[str, List[Tuple[int, int, Annotation]]]]: Text and annotations. + Tuple[str, Dict[str, List[Tuple[int, int, Annotation]]]]: Text and + annotations. + """ ccl_tree = etree.fromstring(content.strip().encode("utf-8")) diff --git a/src/input_parsers/interface.py b/src/input_parsers/interface.py index ce48fb9..cd9dadf 100644 --- a/src/input_parsers/interface.py +++ b/src/input_parsers/interface.py @@ -1,18 +1,31 @@ +"""Module for input parser interface.""" + from abc import ABC, abstractmethod from typing import Any, List, Tuple class InputParser(ABC): + """Input parser interface. + + Input parser is used to parse input standarized set of into text and annotations. + """ + @abstractmethod def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Any]]]: """Parse input string into text and annotations. - Annotations are returned as a dictionary with channel name as a key and list of tuples. - Eg.: "She has a cat" -> ("She has a cat", {"entities": [(0, 3, "She"), (8, 11, "cat")]}) + Annotations are returned as a dictionary with channel name as a key and list of + tuples. Eg.: + Input + "She has a cat" + Output: + ("She has a cat", {"entities": [(0, 3, "She"), (8, 11, "cat")]}) Args: content (str): Input in raw form. Returns: Tuple[str, Dict[str, List[Tuple[int, int, Any]]]]: Text and annotations. + """ + pass diff --git a/src/input_parsers/wiktor_ner.py b/src/input_parsers/wiktor_ner.py index cb7a75d..673c422 100644 --- a/src/input_parsers/wiktor_ner.py +++ b/src/input_parsers/wiktor_ner.py @@ -1,25 +1,77 @@ +"""Module for parsing WiktorNER files.""" + import json from typing import List, Tuple -from src.annotations import (Annotation, MorphosyntacticAnnotation, - NerAnnotation) +from src.annotations import Annotation, MorphosyntacticAnnotation, NerAnnotation from src.input_parsers.interface import InputParser class WiktorNERInputParser(InputParser): - def __init__(self) -> None: - super().__init__() + """Parser for WiktorNER files. 
+ + Example WiktorNER file: + { + "filename": "greeting-5b1401", + "text": "Hello Tom!", + "tokens": [ + { + "index": 1, + "position": [0,5], + "orth": "Hello", + "lex": [ + { + "lemma": "hello", + "mstag": "interj" + } + ] + }, + { + "index": 2, + "position": [6,9], + "orth": "Tom", + "lex": [ + { + "lemma": "Tom", + "mstag": "noun" + } + ] + }, + { + "index": 3, + "position": [9,10], + "orth": "!", + "lex": [ + { + "lemma": "!", + "mstag": "interp" + } + ] + } + ], + "entities": [ + { + "text": "Tom", + "type": "nam_prs_human", + "tokens": [2], + "positions": [6,9] + } + ] + } + """ def parse(self, content: str) -> Tuple[str, List[Tuple[int, int, Annotation]]]: """Parse wiktorner file into text and annotations. - Annotations are returned as a dictionary with channel name as a key and list of tuples. + Annotations are returned as a dictionary with channel name as a key and list of + tuples. Args: co z (str): Path to file containing CCL. Returns: Tuple[str, List[Tuple[int, int, Annotation]]]: Text and annotations. 
+ """ content_parsed = json.loads(content) @@ -37,7 +89,7 @@ class WiktorNERInputParser(InputParser): token_start, token_end = token["position"] if "lexemes" in token: for lexeme in token["lexemes"]: - if "disamb" in lexeme and lexeme["disamb"] == True: + if "disamb" in lexeme and lexeme["disamb"] is True: if "mstag" in lexeme: annotations.append( ( diff --git a/src/pipeline/default.py b/src/pipeline/default.py index 0185154..3abb924 100644 --- a/src/pipeline/default.py +++ b/src/pipeline/default.py @@ -1,3 +1,5 @@ +"""Module for default pipeline implementation.""" + from typing import Dict from src.detectors.interface import Detector @@ -8,6 +10,8 @@ from src.suppressors.interface import Suppressor class DefaultPipeline(Pipeline): + """Pipeline that runs the whole anonymization process on the single input.""" + def __init__( self, input_parser: InputParser, @@ -15,25 +19,44 @@ class DefaultPipeline(Pipeline): suppressor: Suppressor, replacers: Dict[str, ReplacerInterface], ): + """Initialize pipeline. + + Args: + input_parser (InputParser): Object that parses input into text and + annotations. + detectors (Dict[str, Detector]): List of detectors. + suppressor (Suppressor): List of suppressors. + replacers (Dict[str, ReplacerInterface]): List of replacers. + + """ self._input_parser = input_parser self._detectors = detectors self._suppressor = suppressor self._replacers = replacers - def run(self, input_path) -> str: + def run(self, input_path: str) -> str: + """Run the whole anonymization pipeline. + + Args: + input_path (str): Path to the input supported by input parser. + + Returns: + str: Anonymized text. 
+ + """ with open(input_path, "r") as f: content = f.read() parsed_input = self._input_parser.parse(content) detected_entities = [] - for detector_name, detector in self._detectors.items(): + for _detector_name, detector in self._detectors.items(): detected_entities += detector.detect(parsed_input[0], parsed_input[1]) annotaitons_cleaned = self._suppressor.suppress(detected_entities) replaced_input = parsed_input[0] annotations_left = annotaitons_cleaned - for replacer_name, replacer in self._replacers.items(): + for _replacer_name, replacer in self._replacers.items(): replaced_input, annotations_left = replacer.replace( replaced_input, annotations_left ) diff --git a/src/pipeline/interface.py b/src/pipeline/interface.py index 4e92e24..a001755 100644 --- a/src/pipeline/interface.py +++ b/src/pipeline/interface.py @@ -1,14 +1,22 @@ +"""Pipeline interface definition.""" "" + from abc import ABC, abstractmethod class Pipeline(ABC): + """Pipeline interface definition. + + Pipeline is a class that runs the whole anonymization process on the input file. + """ + @abstractmethod - def run(self, input_path) -> str: - """Run the whole anonymization pipeline on the input file and output the result. + def run(self, input_path: str) -> str: + """Run the whole anonymization pipeline. Args: - input_path (_type_): Path to the input file. + input_path (str): Path to the input. Returns: str: Anonymized text. + """ diff --git a/src/pipeline/sequential_jsonl.py b/src/pipeline/sequential_jsonl.py index bfb464a..dc76e97 100644 --- a/src/pipeline/sequential_jsonl.py +++ b/src/pipeline/sequential_jsonl.py @@ -1,3 +1,4 @@ +"""Sequential pipeline that runs anonymization process on jsonl-splitted input.""" import json from typing import Dict @@ -9,6 +10,12 @@ from src.suppressors.interface import Suppressor class SequentialJSONLPipeline(Pipeline): + """Pipeline that runs the whole anonymization process on jsonl-splitted input. 
+ + This pipeline supports cases where the input is splitted into multiple parts and + each part is processed separately and then concatenated into single text output. + """ + def __init__( self, input_parser: InputParser, @@ -17,13 +24,36 @@ class SequentialJSONLPipeline(Pipeline): replacers: Dict[str, ReplacerInterface], concat_to_txt: bool = False, ): + """Initialize pipeline. + + Args: + input_parser (InputParser): Object that parses input into text and + annotations. + detectors (Dict[str, Detector]): List of detectors. + suppressor (Suppressor): List of suppressors. + replacers (Dict[str, ReplacerInterface]): List of replacers. + concat_to_txt (bool, optional): If true, concatenates output to single txt + file. If false - returns output in jsonl format, splitted in the same + way as the input. Defaults to False. + + """ + # TODO: Maybe input parser should be set by default to JSONL parser? self._input_parser = input_parser self._detectors = detectors self._suppressor = suppressor self._replacers = replacers self._concat_to_txt = concat_to_txt - def run(self, input_path) -> str: + def run(self, input_path: str) -> str: + """Run the whole anonymization pipeline. + + Args: + input_path (str): Path to the input supported by input parser. + + Returns: + str: Anonymized text. 
+ + """ result = [] with open(input_path, "r") as f: for line in f.readlines(): diff --git a/src/replacers/__init__.py b/src/replacers/__init__.py index 508b92d..2245172 100644 --- a/src/replacers/__init__.py +++ b/src/replacers/__init__.py @@ -1,2 +1,4 @@ from src.replacers.interface import ReplacerInterface from src.replacers.tag_replacer import TagReplacer + +__all__ = ["ReplacerInterface", "TagReplacer"] diff --git a/src/replacers/date_replacer.py b/src/replacers/date_replacer.py index 011868f..22bb7f3 100644 --- a/src/replacers/date_replacer.py +++ b/src/replacers/date_replacer.py @@ -1,3 +1,5 @@ +"""Module for replacing dates with anonimized version.""" + import random from typing import List, Tuple @@ -23,12 +25,25 @@ months_map = { class DateReplacer(ReplacerInterface): - def __init__(self): - pass + """Class for replacing dates with anonimized version.""" def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Replace detected dates in text with anonimized version. + + Eg.: I was born on 01.01.2020 -> I was born on 22.11.2069 + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were + not processed by this replacer. 
+ + """ replacements = [] not_processed = [] diff --git a/src/replacers/delete_replacer.py b/src/replacers/delete_replacer.py index d8cd3e8..338afb2 100644 --- a/src/replacers/delete_replacer.py +++ b/src/replacers/delete_replacer.py @@ -1,3 +1,5 @@ +"""Module for DeleteReplacer class.""" + from typing import List, Tuple from src.detections import Detection @@ -6,13 +8,25 @@ from src.string_replacements import replace class DeleteReplacer(ReplacerInterface): - def __init__(self): - pass + """Replacer for deleting detected entities.""" def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Delete detected entities from text. + + Eg.: His name is Bob -> His name is + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were + not processed by this replacer. 
+ """ result = [(start, end, "") for start, end, _ in detections] return replace(text, result), [] diff --git a/src/replacers/email_replacer.py b/src/replacers/email_replacer.py index f4f5f89..8b495d3 100644 --- a/src/replacers/email_replacer.py +++ b/src/replacers/email_replacer.py @@ -1,3 +1,5 @@ +"""Email replacer module.""" + import random import string from typing import List, Tuple @@ -7,21 +9,34 @@ from src.replacers.interface import ReplacerInterface from src.string_replacements import replace_and_update -def random_char(char_num): +def _random_char(char_num): return "".join(random.choice(string.ascii_letters) for _ in range(char_num)) -def random_email(): - return random_char(7) + "@gmail.com" +def _random_email(): + return _random_char(7) + "@email.com" class EmailReplacer(ReplacerInterface): - def __init__(self): - pass + """Replaces any detected emails with random email.""" def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Replace detected emails in text with anonimized version. + + eg.: "My email is bob@shop.com -> My email is 532fasfs@email.com" + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were + not processed by this replacer. 
+ + """ replacements = [] not_processed = [] @@ -32,7 +47,7 @@ class EmailReplacer(ReplacerInterface): if isinstance(detection, EmailDetection): if text[start:end] not in already_replaced: - already_replaced[text[start:end]] = random_email() + already_replaced[text[start:end]] = _random_email() replacements.append((start, end, already_replaced[text[start:end]])) else: diff --git a/src/replacers/interface.py b/src/replacers/interface.py index dbf4396..e303f23 100644 --- a/src/replacers/interface.py +++ b/src/replacers/interface.py @@ -1,3 +1,5 @@ +"""Interface for replacers.""" + from abc import ABC, abstractmethod from typing import List, Tuple @@ -5,6 +7,8 @@ from src.detections import Detection class ReplacerInterface(ABC): + """Interface for replacers.""" + @abstractmethod def replace( self, text: str, detections: List[Tuple[int, int, Detection]] @@ -19,4 +23,5 @@ class ReplacerInterface(ABC): Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities replaced with anonimized version and list of detections that were not processed by this replacer. + """ diff --git a/src/replacers/ner_replacer.py b/src/replacers/ner_replacer.py index 98cf055..219074b 100644 --- a/src/replacers/ner_replacer.py +++ b/src/replacers/ner_replacer.py @@ -1,3 +1,5 @@ +"""Module for replacing NER entities with anonimized version.""" + from typing import List, Tuple from src.detections import Detection @@ -7,12 +9,35 @@ from src.string_replacements import replace_and_update class NERReplacer(ReplacerInterface): + """Replaces any detected NER entities with their morphosyntactic equivalents.""" + def __init__(self, dictionary: MorphosyntacticDictionary): + """Initializes NERReplacer. + + Args: + dictionary (MorphosyntacticDictionary): Dictionary used for + replacing words with their morphosyntactic equivalents. 
+ + """ self._dictionary = dictionary def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Replace detected NER entities in text with anonimized version. + + Eg.: "John Smith is a nice guy" -> "Mark Johnson is a nice guy" + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were + not processed by this replacer. + + """ replacements = [] not_processed = [] diff --git a/src/replacers/number_replacer.py b/src/replacers/number_replacer.py index 6ae331a..8730167 100644 --- a/src/replacers/number_replacer.py +++ b/src/replacers/number_replacer.py @@ -1,3 +1,5 @@ +"""Module responsible for replacing numbers with random digits.""" + import random import string from typing import List, Tuple @@ -7,7 +9,7 @@ from src.replacers.interface import ReplacerInterface from src.string_replacements import replace_and_update -def randomize_digits_in_text(text: str) -> str: +def _randomize_digits_in_text(text: str) -> str: result = "" for c in text: @@ -20,12 +22,25 @@ def randomize_digits_in_text(text: str) -> str: class NumberReplacer(ReplacerInterface): - def __init__(self): - pass + """Replaces any detected numbers with random digits.""" def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Replace detected numbers in text with anonimized version. + + Eg.: "123456789" -> "692154236" + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were + not processed by this replacer. 
+ + """ replacements = [] not_processed = [] @@ -36,7 +51,7 @@ class NumberReplacer(ReplacerInterface): if isinstance(detection, NumberDetection): if text[start:end] not in already_replaced: - already_replaced[text[start:end]] = randomize_digits_in_text( + already_replaced[text[start:end]] = _randomize_digits_in_text( text[start:end] ) diff --git a/src/replacers/tag_replacer.py b/src/replacers/tag_replacer.py index 836cd14..954ce6a 100644 --- a/src/replacers/tag_replacer.py +++ b/src/replacers/tag_replacer.py @@ -1,19 +1,43 @@ +"""Tag Replacer module.""" + from typing import List, Tuple -from src.detections import (CityDetection, CountryDetection, DateDetection, - Detection, EmailDetection, HydronymDetection, - KRSDetection, LocationDetection, NameDetection, - NumberDetection, OrganizationNameDetection, - PhoneNumberDetection, ProperNameDetection, - SerialNumberDetection, StreetNameDetection, - SurnameDetection, TINDetection, TitleDetection, - UrlDetection, UserDetection) +from src.detections import ( + CityDetection, + CountryDetection, + DateDetection, + Detection, + EmailDetection, + HydronymDetection, + KRSDetection, + LocationDetection, + NameDetection, + NumberDetection, + OrganizationNameDetection, + PhoneNumberDetection, + ProperNameDetection, + SerialNumberDetection, + StreetNameDetection, + SurnameDetection, + TINDetection, + TitleDetection, + UrlDetection, + UserDetection, +) from src.replacers.interface import ReplacerInterface from src.string_replacements import replace class TagReplacer(ReplacerInterface): + """Replaces any detected entities with tags. + + Example: Anna has a cat -> [PERSON] has a cat. + + """ + def __init__(self): + """Inits tag replacer.""" + # TODO: Move this to a config file. 
self.tags_map = { NameDetection: "[OSOBA]", SurnameDetection: "[OSOBA]", @@ -39,7 +63,18 @@ class TagReplacer(ReplacerInterface): def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Replace detected entities in text with a tag like [CITY]. + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with tags and list of detections that were + not processed by this replacer. + """ result = [ (start, end, self.tags_map.get(type(entity_type), "[OTHER]")) for start, end, entity_type in detections diff --git a/src/replacers/user_replacer.py b/src/replacers/user_replacer.py index 518e569..b9d8a7d 100644 --- a/src/replacers/user_replacer.py +++ b/src/replacers/user_replacer.py @@ -1,3 +1,5 @@ +"""Replacer for user mentions.""" "" + from typing import List, Tuple from random_username.generate import generate_username @@ -8,12 +10,25 @@ from src.string_replacements import replace_and_update class UserReplacer(ReplacerInterface): - def __init__(self): - pass + """Replaces any detected user mentions with random username.""" def replace( self, text: str, detections: List[Tuple[int, int, Detection]] ) -> Tuple[str, List[Tuple[int, int, Detection]]]: + """Replace detected user mentions in text with anonimized version. + + Eg.: "@bob213" -> "@beautifullion56" + + Args: + text (str): Text to be processed. + detections (List[Tuple[int, int, str]]): List of detections. + + Returns: + Tuple[str, List[Tuple[int, int, str]]]: Text with supported entities + replaced with anonimized version and list of detections that were + not processed by this replacer. 
+ + """ replacements = [] not_processed = [] diff --git a/src/string_replacements.py b/src/string_replacements.py index 06182bd..438ffc9 100644 --- a/src/string_replacements.py +++ b/src/string_replacements.py @@ -1,25 +1,23 @@ +"""Utility functions for replacing substrings in a string.""" + from typing import List, Tuple, TypeVar def replace(original_string: str, replacements: List[Tuple[int, int, str]]) -> str: - """ - Replaces substrings in a string. + """Replaces substrings in a string. - !!! Important: This function assumes that there are no overlapping annotations. + !!! Important: This function assumes that there are no overlapping + annotations. - Parameters - ---------- - original_string : str - The original string. - replacements : List[Tuple[int, int, str]] - A list of tuples containing (start, end, replacement). + Args: + original_string (str): The original string. + replacements (List[Tuple[int, int, str]]): A list of tuples containing + (start, end, replacement). - Returns - ------- - str - The string with replacements applied. - """ + Returns: + str: The string with replacements applied. + """ replacements = sorted(replacements, key=lambda x: x[0]) delta = 0 @@ -42,25 +40,23 @@ def replace_and_update( replacements: List[Tuple[int, int, str]], other_annotations: List[Tuple[int, int, _T]], ) -> Tuple[str, List[Tuple[int, int, _T]]]: - """Replaces substrings in a string and updates other annotations to match new string. - - !!! Important: This function assumes that there are no overlapping annotations. - - Parameters - ---------- - original_string : str - The original string. - replacements : List[Tuple[int, int, str]] - A list of tuples containing (start, end, replacement). - other_annotations : List[Tuple[int, int, Any]] - A list of other annotations. - - Returns - ------- - Tuple[str, List[Tuple[int, int, Any]]] - The string with replacements applied and other annotations with new positions. 
- """ + """Replaces parts of a string and updates annotations to match new string. + !!! Important: This function assumes that there are no overlapping + annotations. + + Args: + original_string (str): The original string. + replacements (List[Tuple[int, int, str]]): A list of tuples containing + (start, end, replacement). + other_annotations (List[Tuple[int, int, _T]]): A list of other + annotations. + + Returns: + Tuple[str, List[Tuple[int, int, _T]]]: The string with replacements + applied and other annotations with new positions. + + """ joined_list = [] for replacement in replacements: joined_list.append((replacement[0], replacement[1], replacement[2], True)) diff --git a/src/suppressors/__init__.py b/src/suppressors/__init__.py index 4d23afe..7a5f31c 100644 --- a/src/suppressors/__init__.py +++ b/src/suppressors/__init__.py @@ -1 +1,3 @@ from src.suppressors.order_based import suppress_order_based + +__all__ = [suppress_order_based] diff --git a/src/suppressors/interface.py b/src/suppressors/interface.py index 960449c..0c5e75e 100644 --- a/src/suppressors/interface.py +++ b/src/suppressors/interface.py @@ -1,13 +1,20 @@ +"""Module for the Suppressor interface.""" + from abc import ABC, abstractmethod from typing import Any, List, Tuple class Suppressor(ABC): + """Suppressor interface. + + Suppressors are used to remove overlapping annotations. + """ + @abstractmethod def suppress( self, annotations: List[Tuple[int, int, Any]] ) -> List[Tuple[int, int, Any]]: - """Suppresses annotations on overlappment. + """Suppresses annotations on overlap. Args: annotations (List[Tuple[int, int, Any]]): List of annotations. 
diff --git a/src/suppressors/order_based.py b/src/suppressors/order_based.py index c466c69..1d4f5c9 100644 --- a/src/suppressors/order_based.py +++ b/src/suppressors/order_based.py @@ -1,3 +1,5 @@ +"""Module for the order-based suppressor.""" + from typing import Any, List, Tuple from bitarray import bitarray @@ -6,41 +8,34 @@ from src.suppressors.interface import Suppressor class OrderBasedSuppressor(Suppressor): - def __init__(self) -> None: - super().__init__() + """Suppressor that removes overlapping annotations based on their order.""" def suppress( self, annotations: List[Tuple[int, int, Any]] ) -> List[Tuple[int, int, Any]]: - return suppress_order_based(annotations) - - -def suppress_order_based( - annotations: List[Tuple[int, int, Any]] -) -> List[Tuple[int, int, Any]]: - """If two annotations overlap, the first one int the list is kept. + """If two annotations overlap, the first one int the list is kept. - Args: - annotations (List[Tuple[int, int, Any]]): List of annotations. + Args: + annotations (List[Tuple[int, int, Any]]): List of annotations. - Returns: - List[Tuple[int, int, Any]]: List of annotations with overlapping - annotations removed. + Returns: + List[Tuple[int, int, Any]]: List of annotations with overlapping + annotations removed. 
- """ - if len(annotations) == 0: - return annotations + """ + if len(annotations) == 0: + return annotations - annotations = annotations - bitarray_size = max([end for _, end, _ in annotations]) - bitarray_ = bitarray(bitarray_size) - bitarray_.setall(False) + annotations = annotations + bitarray_size = max([end for _, end, _ in annotations]) + bitarray_ = bitarray(bitarray_size) + bitarray_.setall(False) - result = [] + result = [] - for start, end, entity_type in annotations: - if not bitarray_[start:end].any(): - bitarray_[start:end] = True - result.append((start, end, entity_type)) + for start, end, entity_type in annotations: + if not bitarray_[start:end].any(): + bitarray_[start:end] = True + result.append((start, end, entity_type)) - return result + return result diff --git a/src/utils/subclasses.py b/src/utils/subclasses.py deleted file mode 100644 index 8f28e81..0000000 --- a/src/utils/subclasses.py +++ /dev/null @@ -1,6 +0,0 @@ -def get_sublcasses(cls): - subclasses = [] - for subclass in cls.__subclasses__(): - subclasses.append(subclass) - subclasses.extend(get_sublcasses(subclass)) - return subclasses diff --git a/src/utils/utils.py b/src/utils/utils.py index 23e1435..7577214 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -31,3 +31,21 @@ def subdict(dictionary, keys, all_must_be_present=True): return {key: dictionary[key] for key in keys} else: return {key: dictionary[key] for key in keys if key in dictionary} + + +def get_sublcasses(cls): + """Returns all classes that inherits from the provided class. 
+ + Returns all classes that inherits from cls, both directly and + indirectly + + Returns: + List[Type[object]]: List of classes that inherits from the + provided class + + """ + subclasses = [] + for subclass in cls.__subclasses__(): + subclasses.append(subclass) + subclasses.extend(get_sublcasses(subclass)) + return subclasses diff --git a/src/worker.py b/src/worker.py index 7c84fd0..5610cf0 100644 --- a/src/worker.py +++ b/src/worker.py @@ -4,9 +4,29 @@ from hydra.utils import instantiate class Worker: + """Worker class compatible with the nlp_worker interface. + + This class is responsible for loading the pipeline and running it on the + given text. + + It's supposed to be used in the nlp_worker but it can be used as a standalone + for easier debugging. + """ + def __init__( self, configuration="ccl", default_language="pl", default_replacer="tag" ) -> None: + """Initializes the worker. + + Args: + configuration (str, optional): Hydra configuration of the pipeline. + Defaults to "ccl". + default_language (str, optional): Default language of the text. + Defaults to "pl". + default_replacer (str, optional): Default method of replacing + tokens. Defaults to "tag". + + """ self._last_config = None self._pipeline = None @@ -37,16 +57,23 @@ class Worker: return self._pipeline - def process(self, input_file, task_options, output_file): + def process(self, input_file, task_options, output_file) -> None: """Anonymizes input text. It is assumed input_file is encoded in UTF-8. - Options: - method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens, - 'tag' replaces selected tokens with arbitrary tags, 'pseudo' - replaces selected tokens with a random token that - language - 'pl' - language of the input text. As of now only Polish is supported. + Args: + input_file (str): path to the input file + task_options (Dict[str, str]): task options. 
Can contain following + keys: + method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected + tokens, 'tag' replaces selected tokens with arbitrary + tags, 'pseudo' replaces selected tokens with a random + token that has the same POS tag. + language - 'pl' - language of the input text. As of now only + Polish is supported. + output_file (str): path to the output file + """ pipeline = self._prepare_pipeline(task_options) diff --git a/tests/integration/ccl_configuration/test_ccl_configuration.py b/tests/integration/ccl_configuration/test_ccl_configuration.py index 463b89d..3a887f3 100644 --- a/tests/integration/ccl_configuration/test_ccl_configuration.py +++ b/tests/integration/ccl_configuration/test_ccl_configuration.py @@ -1,8 +1,11 @@ +"""Test the pipeline with the ccl configuration.""" + from hydra import compose, initialize from hydra.utils import instantiate def test_ccl_configuration(): + """Test the pipeline with the ccl configuration.""" with initialize(config_path="../../../config", version_base="1.1"): config = compose( config_name="config", diff --git a/tests/integration/wiktorner_jsonl_configuration/test_wiktorner_jsonl_configuration.py b/tests/integration/wiktorner_jsonl_configuration/test_wiktorner_jsonl_configuration.py index 48133bf..09b9f9a 100644 --- a/tests/integration/wiktorner_jsonl_configuration/test_wiktorner_jsonl_configuration.py +++ b/tests/integration/wiktorner_jsonl_configuration/test_wiktorner_jsonl_configuration.py @@ -1,8 +1,11 @@ +"""Test the pipeline with the wiktorner_jsonl configuration.""" + from hydra import compose, initialize from hydra.utils import instantiate def test_wiktorner_jsonl_configuration(): + """Check if the pipeline works with the wiktorner_jsonl configuration.""" with initialize(config_path="../../../config", version_base="1.1"): config = compose( config_name="config", @@ -16,5 +19,15 @@ def test_wiktorner_jsonl_configuration(): assert ( result - == '{"text": "ROZDZIAÅ I. 
CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszÄ™ uprzedzić z góry czytelników, aby siÄ™ daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie znajdÄ…. Nie dlatego, jakoby [MIEJSCE] nie istniaÅ‚a w rzeczywistoÅ›ci i byÅ‚a tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkaÅ„cy owego sÅ‚awnego grodu, urosÅ‚szy z czasem w ambicjÄ™, uważali tÄ™ nazwÄ™ jako ubliżajÄ…cÄ… ich powadze i podali do c. k. namiestnictwa pokornÄ… proÅ›bÄ™ o pozwolenie zamienienia jej na innÄ…. Podobne zamiany nazwisk praktykujÄ… siÄ™ dość czÄ™sto w [MIEJSCE], szczególnie u pojedynczych osób, które nie czujÄ…c siÄ™ na siÅ‚ach uszlachetnienia sobÄ…, swymi czynami wÅ‚asnego nazwiska, chcÄ… nazwiskiem uszlachetnić siebie, i tak np."}' + == '{"text": "ROZDZIAÅ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszÄ™' + " uprzedzić z góry czytelników, aby siÄ™ daremnie nie trudzili nad szukaniem" + " wyżej wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie" + " znajdÄ…. Nie dlatego, jakoby [MIEJSCE] nie istniaÅ‚a w rzeczywistoÅ›ci i byÅ‚a" + " tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkaÅ„cy owego" + " sÅ‚awnego grodu, urosÅ‚szy z czasem w ambicjÄ™, uważali tÄ™ nazwÄ™ jako" + " ubliżajÄ…cÄ… ich powadze i podali do c. k. namiestnictwa pokornÄ… proÅ›bÄ™ o" + " pozwolenie zamienienia jej na innÄ…. 
Podobne zamiany nazwisk praktykujÄ… siÄ™" + " dość czÄ™sto w [MIEJSCE], szczególnie u pojedynczych osób, które nie czujÄ…c" + " siÄ™ na siÅ‚ach uszlachetnienia sobÄ…, swymi czynami wÅ‚asnego nazwiska, chcÄ…" + ' nazwiskiem uszlachetnić siebie, i tak np."}' ) diff --git a/tests/integration/wiktorner_jsonl_txt_output_configuration/test_wiktorner_jsonl_txt_output_configuration.py b/tests/integration/wiktorner_jsonl_txt_output_configuration/test_wiktorner_jsonl_txt_output_configuration.py index d8aded5..9eb4dae 100644 --- a/tests/integration/wiktorner_jsonl_txt_output_configuration/test_wiktorner_jsonl_txt_output_configuration.py +++ b/tests/integration/wiktorner_jsonl_txt_output_configuration/test_wiktorner_jsonl_txt_output_configuration.py @@ -1,8 +1,11 @@ +"""Test the pipeline with the wiktorner_jsonl_txt_output configuration.""" + from hydra import compose, initialize from hydra.utils import instantiate def test_wiktorner_jsonl_txt_output_configuration(): + """Test the pipeline with the wiktorner_jsonl_txt_output configuration.""" with initialize(config_path="../../../config", version_base="1.1"): config = compose( config_name="config", @@ -14,10 +17,32 @@ def test_wiktorner_jsonl_txt_output_configuration(): pipeline = instantiate(config.pipeline) result = pipeline.run( - "./tests/integration/wiktorner_jsonl_txt_output_configuration/wiktorner_jsonl.jsonl" + "./tests/integration/" + "wiktorner_jsonl_txt_output_configuration/wiktorner_jsonl.jsonl" ) assert ( result - == "ROZDZIAÅ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszÄ™ uprzedzić z góry czytelników, aby siÄ™ daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie znajdÄ…. Nie dlatego, jakoby [MIEJSCE] nie istniaÅ‚a w rzeczywistoÅ›ci i byÅ‚a tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkaÅ„cy owego sÅ‚awnego grodu, urosÅ‚szy z czasem w ambicjÄ™, uważali tÄ™ nazwÄ™ jako ubliżajÄ…cÄ… ich powadze i podali do c. k. 
namiestnictwa pokornÄ… proÅ›bÄ™ o pozwolenie zamienienia jej na innÄ…. Podobne zamiany nazwisk praktykujÄ… siÄ™ dość czÄ™sto w [MIEJSCE], szczególnie u pojedynczych osób, które nie czujÄ…c siÄ™ na siÅ‚ach uszlachetnienia sobÄ…, swymi czynami wÅ‚asnego nazwiska, chcÄ… nazwiskiem uszlachetnić siebie, i tak np. ROZDZIAÅ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszÄ™ uprzedzić z góry czytelników, aby siÄ™ daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie znajdÄ…. Nie dlatego, jakoby [MIEJSCE] nie istniaÅ‚a w rzeczywistoÅ›ci i byÅ‚a tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkaÅ„cy owego sÅ‚awnego grodu, urosÅ‚szy z czasem w ambicjÄ™, uważali tÄ™ nazwÄ™ jako ubliżajÄ…cÄ… ich powadze i podali do c. k. namiestnictwa pokornÄ… proÅ›bÄ™ o pozwolenie zamienienia jej na innÄ…. Podobne zamiany nazwisk praktykujÄ… siÄ™ dość czÄ™sto w [MIEJSCE], szczególnie u pojedynczych osób, które nie czujÄ…c siÄ™ na siÅ‚ach uszlachetnienia sobÄ…, swymi czynami wÅ‚asnego nazwiska, chcÄ… nazwiskiem uszlachetnić siebie, i tak np." + == "ROZDZIAÅ I. CO NIECO O SAMEJ PIPIDÓWCE Przede wszystkim muszÄ™ uprzedzić z" + " góry czytelników, aby siÄ™ daremnie nie trudzili nad szukaniem wyżej" + " wyrażonego miasteczka na mapach [MIEJSCE] i [MIEJSCE], bo go tam nie" + " znajdÄ…. Nie dlatego, jakoby [MIEJSCE] nie istniaÅ‚a w rzeczywistoÅ›ci i byÅ‚a" + " tylko wytworem fantazji autora, ale po prostu dlatego, że mieszkaÅ„cy owego" + " sÅ‚awnego grodu, urosÅ‚szy z czasem w ambicjÄ™, uważali tÄ™ nazwÄ™ jako" + " ubliżajÄ…cÄ… ich powadze i podali do c. k. namiestnictwa pokornÄ… proÅ›bÄ™ o" + " pozwolenie zamienienia jej na innÄ…. Podobne zamiany nazwisk praktykujÄ… siÄ™" + " dość czÄ™sto w [MIEJSCE], szczególnie u pojedynczych osób, które nie czujÄ…c" + " siÄ™ na siÅ‚ach uszlachetnienia sobÄ…, swymi czynami wÅ‚asnego nazwiska, chcÄ…" + " nazwiskiem uszlachetnić siebie, i tak np. ROZDZIAÅ I. 
CO NIECO O SAMEJ" + " PIPIDÓWCE Przede wszystkim muszÄ™ uprzedzić z góry czytelników, aby siÄ™" + " daremnie nie trudzili nad szukaniem wyżej wyrażonego miasteczka na mapach" + " [MIEJSCE] i [MIEJSCE], bo go tam nie znajdÄ…. Nie dlatego, jakoby [MIEJSCE]" + " nie istniaÅ‚a w rzeczywistoÅ›ci i byÅ‚a tylko wytworem fantazji autora, ale" + " po prostu dlatego, że mieszkaÅ„cy owego sÅ‚awnego grodu, urosÅ‚szy z czasem w" + " ambicjÄ™, uważali tÄ™ nazwÄ™ jako ubliżajÄ…cÄ… ich powadze i podali do c. k." + " namiestnictwa pokornÄ… proÅ›bÄ™ o pozwolenie zamienienia jej na innÄ…. Podobne" + " zamiany nazwisk praktykujÄ… siÄ™ dość czÄ™sto w [MIEJSCE], szczególnie u" + " pojedynczych osób, które nie czujÄ…c siÄ™ na siÅ‚ach uszlachetnienia sobÄ…," + " swymi czynami wÅ‚asnego nazwiska, chcÄ… nazwiskiem uszlachetnić siebie, i" + " tak np." ) diff --git a/tests/unit/detectors/date/test_en.py b/tests/unit/detectors/date/test_en.py index 07d4bbd..de41e30 100644 --- a/tests/unit/detectors/date/test_en.py +++ b/tests/unit/detectors/date/test_en.py @@ -1,8 +1,10 @@ +"""Unit tests for DateDetector for english language.""" from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_en(): + """Test DateDetector for english language.""" detector = DateDetector("en") # Check en-us diff --git a/tests/unit/detectors/date/test_pl.py b/tests/unit/detectors/date/test_pl.py index 6281c80..04b3883 100644 --- a/tests/unit/detectors/date/test_pl.py +++ b/tests/unit/detectors/date/test_pl.py @@ -1,8 +1,11 @@ +"""Unit tests for DateDetector for polish language.""" + from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_pl(): + """Test DateDetector for polish language.""" detector = DateDetector("pl") text = "W dniu 1.01.2022 sprzedaÅ‚em kota. 5 kwietnia 2021 roku kupiÅ‚em psa." 
@@ -37,10 +40,13 @@ def test_detect_dates_pl(): def test_date_with_different_punctuations(): - # There is discussion about this wheter we should even detect such cases - # as a dates... However, for now we do and if we find cases where that is - # problematic, this definitly could be changed. + """Test for date with different punctuations. + Example: 1.01,2022 + There is discussion about this wheter we should even detect such cases + as a dates... However, for now we do and if we find cases where that is + problematic, this definitly could be changed. + """ detector = DateDetector("pl") text = "1.01,2022" @@ -63,6 +69,7 @@ def test_date_with_different_punctuations(): def test_28_czerwca_1847(): + """Test for 28 czerwca 1847 date (user reported it to fail).""" detector = DateDetector("pl") text = "28 czerwca 1847 wraz z ojcem skazany na karÄ™ Å›mierci." @@ -85,6 +92,7 @@ def test_28_czerwca_1847(): def test_year_only(): + """Test for year-only dates.""" detector = DateDetector("pl") text = "W 2020 roku kupiÅ‚em psa." diff --git a/tests/unit/detectors/date/test_ru.py b/tests/unit/detectors/date/test_ru.py index 1ae3d0b..afca842 100644 --- a/tests/unit/detectors/date/test_ru.py +++ b/tests/unit/detectors/date/test_ru.py @@ -1,8 +1,11 @@ +"""Unit tests for DateDetector for polish language.""" + from src.detections import DateDetection from src.detectors.date.date import DateDetector def test_detect_dates_pl(): + """Test DateDetector for polish language.""" detector = DateDetector("ru") text = "1.01.2022 Ñ Ð¿Ñ€Ð¾Ð´Ð°Ð» кошку. 5 Ð°Ð¿Ñ€ÐµÐ»Ñ 2021 Ñ ÐºÑƒÐ¿Ð¸Ð» Ñобаку." 
diff --git a/tests/unit/detectors/email/test_email.py b/tests/unit/detectors/email/test_email.py index 7f61359..61e4518 100644 --- a/tests/unit/detectors/email/test_email.py +++ b/tests/unit/detectors/email/test_email.py @@ -1,8 +1,11 @@ +"""Unit tests for email detector.""" + from src.detections import EmailDetection from src.detectors.email import EmailDetector def test_detect_emails(): + """Test that the email detector detects emails in the text.""" detector = EmailDetector() text = ( diff --git a/tests/unit/detectors/ner/test_ner.py b/tests/unit/detectors/ner/test_ner.py index f59e722..21dcf9b 100644 --- a/tests/unit/detectors/ner/test_ner.py +++ b/tests/unit/detectors/ner/test_ner.py @@ -1,9 +1,11 @@ +"""Unit tests for ner detector.""" from src.annotations import MorphosyntacticAnnotation, NerAnnotation from src.detections import CityDetection, NameDetection, SurnameDetection from src.detectors.ner import NerDetector def test_ner_detector(): + """Test ner detector.""" detector = NerDetector( detection_mapping={ "person_first_nam": "name", diff --git a/tests/unit/detectors/phone/test_phone.py b/tests/unit/detectors/phone/test_phone.py index 2e89a55..5903c28 100644 --- a/tests/unit/detectors/phone/test_phone.py +++ b/tests/unit/detectors/phone/test_phone.py @@ -1,8 +1,11 @@ +"""Unit tests for PhoneNumberDetector.""" + from src.detections import PhoneNumberDetection from src.detectors.phone import PhoneNumberDetector def test_detect_phone_numbers(): + """Test PhoneNumberDetector.""" detector = PhoneNumberDetector() text = "My phone number is +48 123 456 789. My friend's number is 123456789." 
diff --git a/tests/unit/detectors/url/test_url.py b/tests/unit/detectors/url/test_url.py index 110d503..1e6664c 100644 --- a/tests/unit/detectors/url/test_url.py +++ b/tests/unit/detectors/url/test_url.py @@ -1,8 +1,11 @@ +"""Unit tests for the URL detector.""" + from src.detections import UrlDetection from src.detectors.url import UrlDetector def test_detect_urls(): + """Test that the URL detector detects URLs in the text.""" detector = UrlDetector("en") text = ( @@ -14,6 +17,7 @@ def test_detect_urls(): def test_detect_urls_pl(): + """Test if the URL detector won't detect Polish url-like words as URLs.""" detector_en = UrlDetector("en") detector_pl = UrlDetector("pl") @@ -21,6 +25,7 @@ def test_detect_urls_pl(): found_urls_pl = detector_pl.detect(text, dict()) found_urls_en = detector_en.detect(text, dict()) - # m.in is a valid shortcut for miÄ™dzy innymi in Polish. It should not be detected as a URL. + # m.in is a valid shortcut for miÄ™dzy innymi in Polish. It should not be detected + # as a URL. assert found_urls_pl == [(6, 28, UrlDetection())] assert found_urls_en == [(0, 4, UrlDetection()), (6, 28, UrlDetection())] diff --git a/tests/unit/detectors/user/test_user.py b/tests/unit/detectors/user/test_user.py index f56218b..af2186d 100644 --- a/tests/unit/detectors/user/test_user.py +++ b/tests/unit/detectors/user/test_user.py @@ -1,8 +1,11 @@ +"""Unit tests for UserDetector.""" + from src.detections import UserDetection from src.detectors.user import UserDetector def test_detect_users(): + """Test UserDetector.""" detector = UserDetector() text = "My username is @john_smith. My friend's username is @jane_doe." 
diff --git a/tests/unit/dictionaries/morphosyntactic/test_ner_file.py b/tests/unit/dictionaries/morphosyntactic/test_ner_file.py index f912869..00c5d59 100644 --- a/tests/unit/dictionaries/morphosyntactic/test_ner_file.py +++ b/tests/unit/dictionaries/morphosyntactic/test_ner_file.py @@ -1,11 +1,13 @@ +"""Tests for NERFileMorphosyntacticDictionary.""" + from tempfile import NamedTemporaryFile -from src.detections import CityDetection, NameDetection, SurnameDetection -from src.dictionaries.morphosyntactic.ner_file import \ - NERFileMorphosyntacticDictionary +from src.detections import NameDetection, SurnameDetection +from src.dictionaries.morphosyntactic.ner_file import NERFileMorphosyntacticDictionary def test_ner_file_morphosyntactic_dictionary(): + """Test NERFileMorphosyntacticDictionary.""" with NamedTemporaryFile() as file: file.writelines( [ diff --git a/tests/unit/dictionaries/morphosyntactic/test_ner_file_nkjp.py b/tests/unit/dictionaries/morphosyntactic/test_ner_file_nkjp.py index c928e5b..7b68489 100644 --- a/tests/unit/dictionaries/morphosyntactic/test_ner_file_nkjp.py +++ b/tests/unit/dictionaries/morphosyntactic/test_ner_file_nkjp.py @@ -1,11 +1,15 @@ +"""Tests for NERFileNKJPMorphosyntacticDictionary.""" + from tempfile import NamedTemporaryFile -from src.detections import CityDetection, NameDetection, SurnameDetection -from src.dictionaries.morphosyntactic.ner_file_nkjp import \ - NERFileNKJPMorphosyntacticDictionary +from src.detections import NameDetection, SurnameDetection +from src.dictionaries.morphosyntactic.ner_file_nkjp import ( + NERFileNKJPMorphosyntacticDictionary, +) def test_ner_file_nkjp_morphosyntactic_dictionary(): + """Test NERFileNKJPMorphosyntacticDictionary.""" with NamedTemporaryFile() as file: file.writelines( [ diff --git a/tests/unit/input_parsers/test_ccl.py b/tests/unit/input_parsers/test_ccl.py index 9a41c77..e5809fa 100644 --- a/tests/unit/input_parsers/test_ccl.py +++ b/tests/unit/input_parsers/test_ccl.py @@ -1,3 +1,5 @@ 
+"""Tests for CCL input parser.""" + from src.annotations import MorphosyntacticAnnotation, NerAnnotation from src.input_parsers.ccl import CCLInputParser @@ -51,6 +53,7 @@ example_ccl = """<?xml version="1.0" encoding="UTF-8"?> def test_ccl_input_parser(): + """Test CCLInputParser.""" parser = CCLInputParser() text, annotations = parser.parse(example_ccl) diff --git a/tests/unit/input_parsers/test_wiktor_ner.py b/tests/unit/input_parsers/test_wiktor_ner.py index 8b8fa73..27c95f8 100644 --- a/tests/unit/input_parsers/test_wiktor_ner.py +++ b/tests/unit/input_parsers/test_wiktor_ner.py @@ -1,3 +1,4 @@ +"""Tests for WiktorNERInputParser.""" from src.annotations import MorphosyntacticAnnotation, NerAnnotation from src.input_parsers.wiktor_ner import WiktorNERInputParser @@ -96,6 +97,7 @@ example_json = """{ def test_wiktor_ner_input_parser(): + """Test WiktorNERInputParser.""" parser = WiktorNERInputParser() text, annotations = parser.parse(example_json) diff --git a/tests/unit/pipeline/test_default.py b/tests/unit/pipeline/test_default.py index c83189a..f0e026d 100644 --- a/tests/unit/pipeline/test_default.py +++ b/tests/unit/pipeline/test_default.py @@ -1,3 +1,5 @@ +"""Tests for default pipeline.""" + from tempfile import NamedTemporaryFile from src.detections import NameDetection @@ -8,33 +10,34 @@ from src.replacers.interface import ReplacerInterface from src.suppressors.interface import Suppressor -class MockInputParser(InputParser): +class _MockInputParser(InputParser): def parse(self, content): return "ala ma kota", {} -class MockDetector(Detector): +class _MockDetector(Detector): def detect(self, text, annotations): return [(0, 3, NameDetection())] -class MockSuppressor(Suppressor): +class _MockSuppressor(Suppressor): def suppress(self, annotations): return annotations -class MockReplacer(ReplacerInterface): +class _MockReplacer(ReplacerInterface): def replace(self, text, annotations): return "zbigniew ma kota", annotations def test_default_pipeline(): + 
"""Test default pipeline.""" # TODO: Prepare mocks that will better test the pipeline pipeline = DefaultPipeline( - MockInputParser(), - {"mock_detector": MockDetector()}, - MockSuppressor(), - {"mock_replacer": MockReplacer()}, + _MockInputParser(), + {"mock_detector": _MockDetector()}, + _MockSuppressor(), + {"mock_replacer": _MockReplacer()}, ) with NamedTemporaryFile() as f: diff --git a/tests/unit/pipeline/test_sequential_jsonl.py b/tests/unit/pipeline/test_sequential_jsonl.py index aaaa63d..b09fa6f 100644 --- a/tests/unit/pipeline/test_sequential_jsonl.py +++ b/tests/unit/pipeline/test_sequential_jsonl.py @@ -1,3 +1,5 @@ +"""Tests for sequential jsonl pipeline.""" + from tempfile import NamedTemporaryFile from src.detections import NameDetection @@ -8,33 +10,34 @@ from src.replacers.interface import ReplacerInterface from src.suppressors.interface import Suppressor -class MockInputParser(InputParser): +class _MockInputParser(InputParser): def parse(self, content): return "ala ma kota", {} -class MockDetector(Detector): +class _MockDetector(Detector): def detect(self, text, annotations): return [(0, 3, NameDetection())] -class MockSuppressor(Suppressor): +class _MockSuppressor(Suppressor): def suppress(self, annotations): return annotations -class MockReplacer(ReplacerInterface): +class _MockReplacer(ReplacerInterface): def replace(self, text, annotations): return "zbigniew ma kota", annotations def test_sequential_jsonl_pipeline(): + """Test sequential jsonl pipeline.""" # TODO: Prepare mocks that will better test the pipeline pipeline = SequentialJSONLPipeline( - MockInputParser(), - {"mock_detector": MockDetector()}, - MockSuppressor(), - {"mock_replacer": MockReplacer()}, + _MockInputParser(), + {"mock_detector": _MockDetector()}, + _MockSuppressor(), + {"mock_replacer": _MockReplacer()}, ) with NamedTemporaryFile() as f: diff --git a/tests/unit/replacers/test_date_replacer.py b/tests/unit/replacers/test_date_replacer.py index 2995438..e4cca6a 100644 --- 
a/tests/unit/replacers/test_date_replacer.py +++ b/tests/unit/replacers/test_date_replacer.py @@ -1,9 +1,11 @@ -from src.detections import (CityDetection, DateDetection, NameDetection, - SurnameDetection) +"""Tests for date replacer.""" + +from src.detections import CityDetection, DateDetection, NameDetection, SurnameDetection from src.replacers.date_replacer import DateReplacer def test_date_replacer(): + """Test date replacer.""" text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ (0, 3, NameDetection()), @@ -30,7 +32,11 @@ def test_date_replacer(): def test_date_replacer_same_date_same_replacement(): - text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu. 05.05.2005 to jej urodziny. 06.05.2005 to nie jej urodziny." + """Test to make sure that the same dates are replaced in the same way.""" + text = ( + "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu. 05.05.2005 to jej" + " urodziny. 06.05.2005 to nie jej urodziny." + ) detections = [ (28, 38, DateDetection()), (53, 63, DateDetection()), diff --git a/tests/unit/replacers/test_email_replacer.py b/tests/unit/replacers/test_email_replacer.py index fdb25cc..1bda3cc 100644 --- a/tests/unit/replacers/test_email_replacer.py +++ b/tests/unit/replacers/test_email_replacer.py @@ -1,9 +1,11 @@ -from src.detections import (CityDetection, DateDetection, EmailDetection, - UserDetection) +"""Tests for email replacer.""" + +from src.detections import CityDetection, DateDetection, EmailDetection from src.replacers.email_replacer import EmailReplacer def test_email_replacer(): + """Test email replacer.""" text = "zz@z.pl urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ (0, 7, EmailDetection()), @@ -26,6 +28,7 @@ def test_email_replacer(): def test_email_replacer_same_email_same_replacement(): + """Test to make sure that the same emails are replaced in the same way.""" text = "zz@z.pl zz@z.pl aa@a.pl" detections = [ (0, 7, EmailDetection()), diff --git 
a/tests/unit/replacers/test_ner_replacer.py b/tests/unit/replacers/test_ner_replacer.py index 6b262ef..249b00d 100644 --- a/tests/unit/replacers/test_ner_replacer.py +++ b/tests/unit/replacers/test_ner_replacer.py @@ -1,13 +1,14 @@ +"""Tests for NER replacer.""" + from tempfile import NamedTemporaryFile -from src.detections import (CityDetection, DateDetection, NameDetection, - SurnameDetection) -from src.dictionaries.morphosyntactic.ner_file import \ - NERFileMorphosyntacticDictionary +from src.detections import CityDetection, DateDetection, NameDetection, SurnameDetection +from src.dictionaries.morphosyntactic.ner_file import NERFileMorphosyntacticDictionary from src.replacers.ner_replacer import NERReplacer def test_ner_replacer(): + """Test NER replacer.""" with NamedTemporaryFile() as file: file.writelines( [ diff --git a/tests/unit/replacers/test_tag_replacer.py b/tests/unit/replacers/test_tag_replacer.py index 35af41f..f8dd7a2 100644 --- a/tests/unit/replacers/test_tag_replacer.py +++ b/tests/unit/replacers/test_tag_replacer.py @@ -1,9 +1,11 @@ -from src.detections import (CityDetection, DateDetection, NameDetection, - SurnameDetection) +"""Tests for tag replacer.""" + +from src.detections import CityDetection, DateDetection, NameDetection, SurnameDetection from src.replacers.tag_replacer import TagReplacer def test_replace_with_tags(): + """Test replace with tags.""" text = "Ala Brzeszczot urodziÅ‚a sie 05.05.2005 we WrocÅ‚awiu" detections = [ (0, 3, NameDetection()), diff --git a/tests/unit/replacers/test_user_replacer.py b/tests/unit/replacers/test_user_replacer.py index 3a49f2e..ded256a 100644 --- a/tests/unit/replacers/test_user_replacer.py +++ b/tests/unit/replacers/test_user_replacer.py @@ -1,8 +1,11 @@ +"""Tests for user replacer.""" + from src.detections import CityDetection, DateDetection, UserDetection from src.replacers.user_replacer import UserReplacer def test_user_replacer(): + """Test user replacer.""" text = "@zzzz32 urodziÅ‚a sie 05.05.2005 
we Wrocławiu" detections = [ (0, 7, UserDetection()), @@ -25,6 +28,7 @@ def test_user_replacer(): def test_user_replacer_same_user_same_replacement(): + """Test to make sure that the same users are replaced in the same way.""" text = "@zzzz32 @zzzz32 @aaaaa" detections = [ (0, 7, UserDetection()), diff --git a/tests/unit/suppressors/test_order_based.py b/tests/unit/suppressors/test_order_based.py index f6d854d..8374dc7 100644 --- a/tests/unit/suppressors/test_order_based.py +++ b/tests/unit/suppressors/test_order_based.py @@ -1,7 +1,10 @@ +"""Tests for order_based suppressor.""" + from src.suppressors.order_based import suppress_order_based def test_supress_order_based(): + """Test test_supress_order_based.""" annotations = [ (10, 16, "Marian"), (10, 18, "Marianna"), diff --git a/tests/unit/test_string_replacements.py b/tests/unit/test_string_replacements.py index 384d0d6..c7bd560 100644 --- a/tests/unit/test_string_replacements.py +++ b/tests/unit/test_string_replacements.py @@ -1,7 +1,10 @@ +"""Unit tests for string_replacements module.""" + from src.string_replacements import replace, replace_and_update def test_replace(): + """Test replace function.""" text = "Ala ma kota" replacements = [(0, 3, "Andrzej"), (7, 11, "psa")] @@ -13,6 +16,7 @@ def test_replace(): def test_replace_out_of_order(): + """Test replace function with replacements out of order.""" text = "Ala ma kota" replacements = [(7, 11, "psa"), (0, 3, "Andrzej")] @@ -23,6 +27,7 @@ def test_replace_out_of_order(): def test_replace_and_update(): + """Test if the replace_and_update function works correctly.""" text = "Ala ma kota kropka" replacements = [(0, 3, "Andrzej"), (7, 11, "psa")] other_annotations = [(4, 6, "ma"), (12, 18, "kropka")] diff --git a/tox.ini b/tox.ini index 8d8d0fb..00b9e8d 100644 --- a/tox.ini +++ b/tox.ini @@ -5,24 +5,24 @@ skipsdist = True [testenv:pep8] deps = flake8 -basepython = python3.6 +basepython = python3.8 commands = flake8 {posargs} [testenv:docstyle] deps = pydocstyle
-basepython = python3.6 +basepython = python3.8 commands = pydocstyle --verbose {posargs} [flake8] # W504 skipped because it is overeager and unnecessary -ignore = W504 +ignore = W504,E203,W503 show-source = True exclude = .git,.venv,.tox,dist,doc,*egg,build,venv import-order-style = pep8 -max-line-length = 80 +max-line-length = 88 [pydocstyle] -- GitLab