diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..f5e96dbfaec8bd23554e839a582259cf17837f26 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +venv \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0120219cc1564bc36a81649c5652846619d3236f..4eb96d573557a729b32c2d8bf3961aa1525427b2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,23 +3,23 @@ cache: paths: - .tox stages: - - check_style + # - check_style - build before_script: - pip install tox==2.9.1 -pep8: - stage: check_style - script: - - tox -v -e pep8 -docstyle: - stage: check_style - script: - - tox -v -e docstyle +# pep8: +# stage: check_style +# script: +# - tox -v -e pep8 +# docstyle: +# stage: check_style +# script: +# - tox -v -e docstyle build_image: stage: build image: 'docker:18.09.7' only: - - master + - develop services: - 'docker:18.09.7-dind' variables: @@ -31,9 +31,9 @@ build_image: - echo $DOCKER_PASSWORD > pass.txt - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin - rm pass.txt - - docker push $DOCKERHUB_NAME + # - docker push $DOCKERHUB_NAME - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG - - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:latest + # - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:develop - docker push $CI_REGISTRY_IMAGE diff --git a/Dockerfile b/Dockerfile index 62a552baa4b4632be3d5ec5bcdd05c25aa2c2f44..f2c49188535bbd04539f76038816f7b6eb30a434 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,16 @@ -FROM clarinpl/python:3.6 +FROM python:3.8.9 WORKDIR /home/worker -COPY ./src ./src -COPY ./main.py . -COPY ./requirements.txt . -COPY ./dictionaries . 
+COPY requirements.txt requirements.txt +RUN python3.8 -m pip install -r requirements.txt + +COPY ./src ./src +COPY ./config ./config +COPY ./dictionaries ./dictionaries +COPY ./cli.py ./cli.py +COPY ./main.py ./main.py +COPY ./config.ini ./config.ini -RUN python3.6 -m pip install -r requirements.txt -CMD ["python3.6", "main.py", "service"] +CMD ["python3.8", "main.py"] diff --git a/README.md b/README.md index 919e63215bfc0f879d81fd281b7220aa848152cd..6edd4c6905773cc6eb60228b9272bddddbcaa461 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,20 @@ Anonymizer works in 3 modes, when sensitive data is detected, it can perform ope - tag - sensitive data is replaced by the category tag it belongs to - pseudo (pseudonymization) - sensitive data is replaced by another object in the same category -### Examples: +## How does it work? +Anonymizer is a pipeline of modules. The overall pipeline is as follows: + +1. Text is loaded from a file using the input_parser module. The role of this module is to read the data from the file and output the text and its annotations in a standardized format. +2. A series of detector modules is run against the text and annotations from the previous step. Each detector module is responsible for detecting a specific type of sensitive data. The output of a detector is a list of parsed detections. At the end, the detections from all detectors are merged into one list. +3. Multiple detector modules can detect sensitive data in the same or overlapping spans (e.g. 523-612-298 will be detected as a phone number, but also as multiple numbers). The role of a suppressor is to select which annotations should be kept and which should be removed. The simplest suppressor is order-based: on overlap, it selects the detection that was first in the list (i.e. the detection created by the detector module that was higher on the list of detectors). +4. A series of replacer modules is run against the text and detections from the previous step. 
Each replacer module is responsible for replacing a specific type of sensitive data. The output of the replacer is a list of parsed replacements (the entries that were handled by a specific replacer) and a list of unhandled detections (the detections that were not handled by a specific replacer). All unhandled detections are passed to the next replacer module. It's usually a good idea to put the most general replacer at the end of the list of replacers (i.e. the one that will be able to provide a generic replacement for every possible detection). + +All of those steps are managed by the pipeline module. + +## Configuration +The project uses Hydra for configuration. You can find the configuration files in `config`. The project is structured in such a way that different configurations of the software are placed in `config/configuration`. For example, there you can find the `ccl.yaml` configuration, which configures the anonymizer so that it works on single CCL files with n5 NER. + +## Examples: - Delete - Spotkałem się dzisiaj z Janem Kowalskim. - Spotkałem się dzisiaj z . 
diff --git a/cli.py b/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..2b2dd65b3dbe84225514d9a988f88ff8b80ee1d1 --- /dev/null +++ b/cli.py @@ -0,0 +1,37 @@ +from src.worker import Worker +import argparse + +if __name__ == "__main__": # command-line entry point: parse the arguments, then make a single Worker.process call + parser = argparse.ArgumentParser(description="anonymizer") + parser.add_argument("input_path", type=str, help="Path to the input file") + parser.add_argument("output_path", type=str, help="Path to the output file") + parser.add_argument( + "--replace-method", + type=str, + default="tag", + choices=["delete", "tag", "pseudo"], + help="Method of replacing tokens", + ) + parser.add_argument( + "--language", + type=str, + default="pl", + choices=["pl"], + help="Language of the input text", + ) + parser.add_argument( + "--configuration", + type=str, + default="ccl", + choices=["ccl", "wiktorner_jsonl"], + help="Configuration of the anonymizer", + ) + args = parser.parse_args() + + worker = Worker(configuration=args.configuration) + worker.process( + args.input_path, + {"method": args.replace_method, "language": args.language}, # task options built from --replace-method and --language + args.output_path, + ) + print("Done") diff --git a/config.ini b/config.ini index 3cd144a7baf256a4fc0c40488b8bd9813b9289ce..ff3ffacb0ef8b05ac0f5c4284450a7b451f39b7f 100644 --- a/config.ini +++ b/config.ini @@ -9,6 +9,9 @@ queue_prefix = nlp_ [tool] workers_number = 1 +configuration = "wiktorner_jsonl" +default_language = "pl" +default_replacer = "tag" [logging] port = 9998 diff --git a/main.py b/main.py index 0a3fee6bff7b6dc6ca04ec6da4e7a9f76f0b07c6..d92cf402548a5c161c0909260059c3cfa8b276f6 100644 --- a/main.py +++ b/main.py @@ -1,32 +1,39 @@ """Implementation of anonymizer service.""" -import argparse import nlp_ws from src.worker_old import Worker +import logging +import nlp_ws +_log = logging.getLogger(__name__) -def get_args(): - """Gets command line arguments.""" - parser = argparse.ArgumentParser(description="anonymizer") - - subparsers = parser.add_subparsers(dest="mode") - 
subparsers.required = True - - subparsers.add_parser("service", help="Run as a service") - return parser.parse_args() +class AnonymizerWorker(nlp_ws.NLPWorker): + """NLPWorker subclass for the anonymizer service; delegates processing to a wrapped Worker.""" + @classmethod + def static_init(cls, config): + """Read configuration, default language and default replacer from the "tool" config section and log them.""" + cls._configuration = config.get("tool").get("configuration", "ccl") + cls._default_language = config.get("tool").get("default_language", "pl") + cls._default_replacer = config.get("tool").get("default_replacer", "tag") -def main(): - """Runs the program.""" - args = get_args() + _log.info( + "AnonymizerWorker initialized with configuration: %s, default language: %s, default replacer: %s", + cls._configuration, + cls._default_language, + cls._default_replacer, + ) - generators = { - "service": lambda: nlp_ws.NLPService.main(Worker), - } + def __init__(self): + self._worker = Worker( + configuration=self._configuration, + default_language=self._default_language, + default_replacer=self._default_replacer, + ) - gen_fn = generators.get(args.mode, lambda: None) - gen_fn() + def process(self, input_file, task_options, output_file): + self._worker.process(input_file, task_options, output_file) if __name__ == "__main__": - main() + nlp_ws.NLPService.main(AnonymizerWorker, pause_at_exit=False) diff --git a/requirements.txt b/requirements.txt index abceaaab74a0e30a67493b54be3855d7cea76a65..fd294b06655c0e74737c266b35c0cdc6f12602b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ Babel==2.8.0 bitarray==2.6.1 random-username==1.0.2 randominfo==2.0.2 -hydra-core==1.3.1 \ No newline at end of file +hydra-core==1.3.1 +lxml==4.9.2 \ No newline at end of file diff --git a/src/worker.py b/src/worker.py index aedf29ca64c3b8aa5c1578d77460e83500de32ab..ecbb0e843c3d02a6c05b8f8145e4bed1cbcacdb9 100644 --- a/src/worker.py +++ b/src/worker.py @@ -1,33 +1,37 @@ """Implementation of nlp_worker.""" -import logging - -import nlp_ws from hydra import initialize, compose from hydra.utils 
import instantiate -_log = logging.getLogger(__name__) - - -class Worker(nlp_ws.NLPWorker): - """Implements nlp_worker for anonymizer service.""" - def __init__(self) -> None: +class Worker: # plain class now (no nlp_ws.NLPWorker base); composes a Hydra config and instantiates cfg["pipeline"] + def __init__( + self, configuration="ccl", default_language="pl", default_replacer="tag" + ) -> None: self._last_config = None self._pipeline = None + + self._configuration = configuration + self._default_language = default_language + self._default_replacer = default_replacer + super().__init__() def _prepare_pipeline(self, task_options): - language = task_options.get("language", "pl") - replace_method = task_options.get("method", "tag") + language = task_options.get("language", self._default_language) + replace_method = task_options.get("method", self._default_replacer) overrides = [ "language=" + language, "replacers=" + replace_method, + "configuration=" + self._configuration, ] + assert language in ["pl"] # NOTE(review): assert is stripped under python -O; raise ValueError if this must validate caller-supplied options + assert replace_method in ["delete", "tag", "pseudo"] # NOTE(review): same concern; also runs only after the override strings were already built + config_hash = hash(tuple(overrides)) if self._last_config != config_hash: - with initialize(config_path="./config"): + with initialize(config_path="../config", version_base="1.1"): # Hydra resolves config_path relative to this module's directory (src/), so ../config is the repo-level config dir cfg = compose(config_name="config", overrides=overrides) self._pipeline = instantiate(cfg["pipeline"])