From 046c75efa423f0558d149ff393c7b99da3971a59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pogoda?= <mipo57@e-science.pl>
Date: Fri, 10 Feb 2023 12:14:04 +0100
Subject: [PATCH] Working dockerized version

---
 .dockerignore    |  1 +
 .gitlab-ci.yml   | 24 ++++++++++++------------
 Dockerfile       | 18 +++++++++++-------
 README.md        | 15 ++++++++++++++-
 cli.py           | 37 +++++++++++++++++++++++++++++++++++++
 config.ini       |  3 +++
 main.py          | 45 ++++++++++++++++++++++++++-------------------
 requirements.txt |  3 ++-
 src/worker.py    | 28 ++++++++++++++++------------
 9 files changed, 122 insertions(+), 52 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 cli.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f5e96db
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+venv
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 0120219..4eb96d5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -3,23 +3,23 @@ cache:
   paths:
     - .tox
 stages:
-  - check_style
+  # - check_style
   - build
 before_script:
   - pip install tox==2.9.1
-pep8:
-  stage: check_style
-  script:
-    - tox -v -e pep8
-docstyle:
-  stage: check_style
-  script:
-    - tox -v -e docstyle
+# pep8:
+#   stage: check_style
+#   script:
+#     - tox -v -e pep8
+# docstyle:
+#   stage: check_style
+#   script:
+#     - tox -v -e docstyle
 build_image:
   stage: build
   image: 'docker:18.09.7'
   only:
-    - master
+    - develop
   services:
     - 'docker:18.09.7-dind'
   variables:
@@ -31,9 +31,9 @@ build_image:
     - echo $DOCKER_PASSWORD > pass.txt
     - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
     - rm pass.txt
-    - docker push $DOCKERHUB_NAME
+    # - docker push $DOCKERHUB_NAME
     - docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
     - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
-    - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:latest
+    # - docker image tag $DOCKERHUB_NAME $CI_REGISTRY_IMAGE:develop
     - docker push $CI_REGISTRY_IMAGE
 
diff --git a/Dockerfile b/Dockerfile
index 62a552b..f2c4918 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,16 @@
-FROM clarinpl/python:3.6
+FROM python:3.8.9
 
 WORKDIR /home/worker
-COPY ./src ./src
-COPY ./main.py .
-COPY ./requirements.txt .
-COPY ./dictionaries .
 
+COPY requirements.txt requirements.txt
+RUN python3.8 -m pip install -r requirements.txt
+
+COPY ./src ./src
+COPY ./config ./config
+COPY ./dictionaries ./dictionaries
+COPY ./cli.py ./cli.py
+COPY ./main.py ./main.py
+COPY ./config.ini ./config.ini
 
-RUN python3.6 -m pip install -r requirements.txt
 
-CMD ["python3.6", "main.py", "service"]
+CMD ["python3.8", "main.py"]
diff --git a/README.md b/README.md
index 919e632..6edd4c6 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,20 @@ Anonymizer works in 3 modes, when sensitive data is detected, it can perform ope
 - tag - sensitive data is replaced by the category tag it belongs to
 - pseudo (pseudonymization) - sensitive data is replaced by another object in the same category
 
-### Examples:
+## How it works?
+Anonymizer is a pipeline of modules. The overall pipeline is as follows:
+
+1. Text is loaded from a file by using input_parser module. The role of this module is to read the data from the file and output text and it's annotations into standardized format.
+2. A series of detector modules are run agains the text and annotations from the previous step. Each detector module is responsible for detecting a specific type of sensitive data. The output of the detector is a list of parsed detections. At the and detections from all detectors are merged into one list.
+3. Multiple detector modules can detect sensitive data in the same or overlapping spans (eg. 523-612-298 will be detected as a phone number, but also as multiple numbers). The role of a suppresor is to select which annotations should be kept and which should be removed. The simplest suppresor is the order based, that - on overlap - selects the detections that was first in the list (so the detection that was created by detector module that was higher on the list of detectors).
+4. A series of replacer modules are run against the text and detections from the previous step. Each replacer module is responsible for replacing a specific type of sensitive data. The output of the replacer is a list of parsed replacements (the entires that were handled by a specific replacer) and list of unhandled detections (the detections that were not handled by a specific replacer). All of not handled detections are passed to the next replacer module. It's usually a good idea to put the most general replacer at the end of the list of replacers (ie the one that will be able to put some generic replacement for every possible detection).
+
+All of those steps are managed by pipeline module.
+
+## Configuration
+The project uses hydra for configuration. You can find the configuration files in `config`. The project is structured in such a way, that different configurations of the software are placed in `config/configuration`. For example, there you can find `ccl.yaml` configuration, which configures anonimizer so that it works on single CCL files with n5 ner. 
+
+## Examples:
 - Delete
     - Spotkałem się dzisiaj z Janem Kowalskim. 
     - Spotkałem się dzisiaj z  .
diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..2b2dd65
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,37 @@
+from src.worker import Worker
+import argparse
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="anonymizer")
+    parser.add_argument("input_path", type=str, help="Path to the input file")
+    parser.add_argument("output_path", type=str, help="Path to the output file")
+    parser.add_argument(
+        "--replace-method",
+        type=str,
+        default="tag",
+        choices=["delete", "tag", "pseudo"],
+        help="Method of replacing tokens",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        default="pl",
+        choices=["pl"],
+        help="Language of the input text",
+    )
+    parser.add_argument(
+        "--configuration",
+        type=str,
+        default="ccl",
+        choices=["ccl", "wiktorner_jsonl"],
+        help="Configuration of the anonymizer",
+    )
+    args = parser.parse_args()
+
+    worker = Worker(configuration=args.configuration)
+    worker.process(
+        args.input_path,
+        {"method": args.replace_method, "language": args.language},
+        args.output_path,
+    )
+    print("Done")
diff --git a/config.ini b/config.ini
index 3cd144a..ff3ffac 100644
--- a/config.ini
+++ b/config.ini
@@ -9,6 +9,9 @@ queue_prefix = nlp_
 
 [tool]
 workers_number = 1
+configuration = "wiktorner_jsonl"
+default_language = "pl"
+default_replacer = "tag"
 
 [logging]
 port = 9998
diff --git a/main.py b/main.py
index 0a3fee6..d92cf40 100644
--- a/main.py
+++ b/main.py
@@ -1,32 +1,39 @@
 """Implementation of anonymizer service."""
-import argparse
 import nlp_ws
 from src.worker_old import Worker
+import logging
+import nlp_ws
 
+_log = logging.getLogger(__name__)
 
-def get_args():
-    """Gets command line arguments."""
-    parser = argparse.ArgumentParser(description="anonymizer")
-
-    subparsers = parser.add_subparsers(dest="mode")
-    subparsers.required = True
-
-    subparsers.add_parser("service", help="Run as a service")
 
-    return parser.parse_args()
+class AnonymizerWorker(nlp_ws.NLPWorker):
+    """Class implementing TextFixerWorker worker."""
 
+    @classmethod
+    def static_init(cls, config):
+        """Initialize process."""
+        cls._configuration = config.get("tool").get("configuration", "ccl")
+        cls._default_language = config.get("tool").get("default_language", "pl")
+        cls._default_replacer = config.get("tool").get("default_replacer", "tag")
 
-def main():
-    """Runs the program."""
-    args = get_args()
+        _log.info(
+            "AnonymizerWorker initialized with configuration: %s, default language: %s, default replacer: %s",
+            cls._configuration,
+            cls._default_language,
+            cls._default_replacer,
+        )
 
-    generators = {
-        "service": lambda: nlp_ws.NLPService.main(Worker),
-    }
+    def __init__(self):
+        self._worker = Worker(
+            configuration=self._configuration,
+            default_language=self._default_language,
+            default_replacer=self._default_replacer,
+        )
 
-    gen_fn = generators.get(args.mode, lambda: None)
-    gen_fn()
+    def process(self, input_file, task_options, output_file):
+        self._worker.process(input_file, task_options, output_file)
 
 
 if __name__ == "__main__":
-    main()
+    nlp_ws.NLPService.main(AnonymizerWorker, pause_at_exit=False)
diff --git a/requirements.txt b/requirements.txt
index abceaaa..fd294b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ Babel==2.8.0
 bitarray==2.6.1
 random-username==1.0.2
 randominfo==2.0.2
-hydra-core==1.3.1
\ No newline at end of file
+hydra-core==1.3.1
+lxml==4.9.2
\ No newline at end of file
diff --git a/src/worker.py b/src/worker.py
index aedf29c..ecbb0e8 100644
--- a/src/worker.py
+++ b/src/worker.py
@@ -1,33 +1,37 @@
 """Implementation of nlp_worker."""
-import logging
-
-import nlp_ws
 from hydra import initialize, compose
 from hydra.utils import instantiate
 
-_log = logging.getLogger(__name__)
-
-
-class Worker(nlp_ws.NLPWorker):
-    """Implements nlp_worker for anonymizer service."""
 
-    def __init__(self) -> None:
+class Worker:
+    def __init__(
+        self, configuration="ccl", default_language="pl", default_replacer="tag"
+    ) -> None:
         self._last_config = None
         self._pipeline = None
+
+        self._configuration = configuration
+        self._default_language = default_language
+        self._default_replacer = default_replacer
+
         super().__init__()
 
     def _prepare_pipeline(self, task_options):
-        language = task_options.get("language", "pl")
-        replace_method = task_options.get("method", "tag")
+        language = task_options.get("language", self._default_language)
+        replace_method = task_options.get("method", self._default_replacer)
 
         overrides = [
             "language=" + language,
             "replacers=" + replace_method,
+            "configuration=" + self._configuration,
         ]
 
+        assert language in ["pl"]
+        assert replace_method in ["delete", "tag", "pseudo"]
+
         config_hash = hash(tuple(overrides))
         if self._last_config != config_hash:
-            with initialize(config_path="./config"):
+            with initialize(config_path="../config", version_base="1.1"):
                 cfg = compose(config_name="config", overrides=overrides)
                 self._pipeline = instantiate(cfg["pipeline"])
 
-- 
GitLab