First version of the anonymizer. Needs an update for the wiki with replacements.

eb1e9ee7 · Bartlomiej Koptyra · f78b4599 · eb1e9ee7 · eb1e9ee7 · eb1e9ee7
Commit eb1e9ee7 authored 4 years ago by Bartlomiej Koptyra
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+image: 'clarinpl/python:3.6'
+cache:
+  paths:
+    - .tox
+stages:
+  - check_style
+  - build
+before_script:
+  - pip install tox==2.9.1
+pep8:
+  stage: check_style
+  script:
+    - tox -v -e pep8
+docstyle:
+  stage: check_style
+  script:
+    - tox -v -e docstyle
+# Builds the anonymizer image and pushes it to the registry (master only).
+build_image:
+  stage: build
+  image: 'docker:18.09.7'
+  only:
+    - master
+  services:
+    - 'docker:18.09.7-dind'
+  # Overrides the global before_script (pip install tox), which is not
+  # needed in the docker image.
+  before_script:
+    - ''
+  script:
+    - docker build -t clarinpl/anonymizer .
+    # Pipe the password straight into docker login so it is never written to
+    # the workspace; a temporary file would be left behind if login failed.
+    - echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USERNAME" --password-stdin
+    - docker push clarinpl/anonymizer
--- a/Dockerfile
+++ b/Dockerfile
-FROM clarinpl/python:3.6
-
-WORKDIR /home/worker
-COPY ./src ./src
-COPY ./main.py .
-COPY ./requirements.txt .
-
-RUN python3.6 -m pip install -r requirements.txt
-
-CMD ["python3.6", "main.py", "service"]
\ No newline at end of file
+FROM clarinpl/python:3.6
+
+WORKDIR /home/worker
+COPY ./src ./src
+COPY ./main.py .
+COPY ./requirements.txt .
+
+RUN python3.6 -m pip install -r requirements.txt
+
+CMD ["python3.6", "main.py", "service"]
--- a/README.md
+++ b/README.md
-# Anonymizer
-
-Input tagger should be morphoDita and liner2 should use model 5nam.
-tekst->any2txt->morphodita->liner2->anonimizer
+# Anonymizer
+
+The input tagger should be MorphoDiTa, and liner2 should use the 5nam model.
+Pipeline: text -> any2txt -> morphodita -> liner2 -> anonymizer
--- a/config.ini
+++ b/config.ini
-[service]
-tool = anonymizer
-
-root = /samba/requests/
-rabbit_host = rabbitmq
-rabbit_user = test
-rabbit_password = test
-queue_prefix = nlp_
-
-[tool]
-workers_number = 1
-
-[logging]
-port = 9998
-local_log_level = INFO
-
-[logging_levels]
-__main__ = INFO
-
+[service]
+tool = anonymizer
+
+root = /samba/requests/
+rabbit_host = rabbitmq
+rabbit_user = test
+rabbit_password = test
+queue_prefix = nlp_
+
+[tool]
+workers_number = 1
+
+[logging]
+port = 9998
+local_log_level = INFO
+
+[logging_levels]
+__main__ = INFO
+
--- a/docker-compose.yml
+++ b/docker-compose.yml
-version: '3'
-services:
-  tokenizer:
-    container_name: clarin_anonymizer
-    build: ./
-    working_dir: /home/worker
-    entrypoint:
-      - python3.6
-      - main.py
-      - service
-    environment:
-      - PYTHONUNBUFFERED=0
-    volumes:
-      - '/samba:/samba'
-      - './config.ini:/home/worker/config.ini'
-      - './src:/home/worker/src'
-      - './main.py:/home/worker/main.py'
+version: '3'
+services:
+  tokenizer:
+    container_name: clarin_anonymizer
+    build: ./
+    working_dir: /home/worker
+    entrypoint:
+      - python3.6
+      - main.py
+      - service
+    environment:
+      - PYTHONUNBUFFERED=0
+    volumes:
+      - '/samba:/samba'
+      - './config.ini:/home/worker/config.ini'
+      - './src:/home/worker/src'
+      - './main.py:/home/worker/main.py'
+      - './wiktionary-forms-with-bases-and-tags.txt:/home/worker/wiktionary-forms-with-bases-and-tags.txt'
--- a/main.py
+++ b/main.py
-"""Implementation of tokenizer service."""
-import argparse
-import nlp_ws
-from src.worker import Worker
-
-
-def get_args():
-    """Gets command line arguments."""
-    parser = argparse.ArgumentParser(description="tokenizer")
-
-    subparsers = parser.add_subparsers(dest="mode")
-    subparsers.required = True
-
-    subparsers.add_parser(
-        "service",
-        help="Run as a service")
-
-    return parser.parse_args()
-
-
-def main():
-    """Runs the program."""
-    args = get_args()
-
-    generators = {
-        "service": lambda: nlp_ws.NLPService.main(Worker),
-    }
-
-    gen_fn = generators.get(args.mode, lambda: None)
-    gen_fn()
-
-
-if __name__ == "__main__":
-    main()
+"""Implementation of anonymizer service."""
+import argparse
+import nlp_ws
+from src.worker import Worker
+
+
+def get_args():
+    """Parse the command line arguments of the anonymizer service.
+
+    Exactly one sub-command has to be given; "service" is the only one
+    registered here.
+
+    Returns the ``argparse.Namespace`` produced by the parser, with the
+    chosen sub-command stored in its ``mode`` attribute.
+    """
+    parser = argparse.ArgumentParser(description="anonymizer")
+
+    # ``required`` is set as an attribute: add_subparsers() only accepts a
+    # ``required=`` keyword from Python 3.7 on, and the image runs 3.6.
+    subparsers = parser.add_subparsers(dest="mode")
+    subparsers.required = True
+
+    subparsers.add_parser(
+        "service",
+        help="Run as a service")
+
+    return parser.parse_args()
+
+
+def main():
+    """Run the mode selected on the command line."""
+    mode = get_args().mode
+
+    # Maps every supported mode to the callable that starts it.
+    handlers = {"service": lambda: nlp_ws.NLPService.main(Worker)}
+
+    # A mode without a registered handler falls back to a no-op.
+    run = handlers.get(mode, lambda: None)
+    run()
+
+
+if __name__ == "__main__":
+    main()
--- a/src/anonymizer.py
+++ b/src/anonymizer.py
--- a/src/ccl_handler.py
+++ b/src/ccl_handler.py
-"""Implementation of ccl reading functionality."""
-from xml.etree.ElementTree import iterparse
-
-class Ccl_handler:
-    """Implements reading ccl for anonymizer service."""
-
-    def __init__(self, ccl_file_name):
-        self._file_name = ccl_file_name
-
-    def process(self, output_file, unmarshallers):
-        with open(output_file, 'wt') as out:
-            with open(self._file_name, 'r') as f:
-                for event, elem in iterparse(f):
-                    unmarshal = unmarshallers.get(elem.tag, None)
-                    if unmarshal:
-                        out.write(unmarshal(elem))
-                        elem.clear()
-
-
+"""Implementation of ccl reading functionality."""
+from xml.etree.ElementTree import iterparse
+
+
+class Ccl_handler:
+    """Stream a ccl (XML) file through per-tag unmarshallers."""
+
+    def __init__(self, ccl_file_name):
+        """Remember the path of the ccl file to read."""
+        self._file_name = ccl_file_name
+
+    def process(self, output_file, unmarshallers):
+        """Write the unmarshalled form of every handled element.
+
+        unmarshallers maps an element tag to a callable that takes the
+        parsed element and returns the text to write. Elements whose tag
+        has no (truthy) entry are skipped. Both files are opened as UTF-8
+        text; output_file is overwritten.
+        """
+        with open(output_file, 'wt', encoding='utf-8') as sink, \
+                open(self._file_name, 'r', encoding='utf-8') as source:
+            # With its default events iterparse reports an element once
+            # its end tag has been read.
+            for _, node in iterparse(source):
+                handler = unmarshallers.get(node.tag, None)
+                if not handler:
+                    continue
+                sink.write(handler(node))
+                # Drop the content of the element that was just written.
+                node.clear()
--- a/src/worker.py
+++ b/src/worker.py
-"""Implementation of nlp_worker."""
-import logging
-
-import nlp_ws
-
-
-from src.anonymizer import Anonymizer
-
-_log = logging.getLogger(__name__)
-
-
-class Worker(nlp_ws.NLPWorker):
-    """Implements nlp_worker for anonymizer service."""
-
-    @classmethod
-    def static_init(cls, config):
-        """One time static initialisation."""
-        print("siema")
-
-    def process(self, input_file, task_options, output_file):
-        """Anonymizes input text.
-
-        It is assumed input_file is encoded in UTF-8.
-
-        Options:
-        method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens,
-                'tag' replaces selected tokens with arbitrary tags, 'pseudo'
-                replaces selected tokens with a random token that
-        """
-        anon = Anonymizer(task_options)
-        with open(input_file, 'rt', encoding='utf-8') as input_file:
-            with open(output_file, 'wt', encoding='utf-8') as output_file:
-                print("elo")
-
+"""Implementation of nlp_worker."""
+import logging
+
+import nlp_ws
+
+
+from src.anonymizer import Anonymizer
+from src.ccl_handler import Ccl_handler
+
+_log = logging.getLogger(__name__)
+
+
+class Worker(nlp_ws.NLPWorker):
+    """NLP worker that runs the anonymizer over a ccl file."""
+
+    def process(self, input_file, task_options, output_file):
+        """Anonymize input_file and store the result in output_file.
+
+        input_file is read as a UTF-8 encoded ccl file and output_file is
+        written as UTF-8.
+
+        task_options is handed to Anonymizer unchanged. Options:
+        method - 'delete'/'tag'/'pseudo':
+                'delete' deletes selected tokens,
+                'tag' replaces selected tokens with arbitrary tags,
+                'pseudo' replaces selected tokens with a random token.
+
+        NOTE(review): the previous description of 'pseudo' was cut off
+        mid-sentence ("a random token that"); confirm the exact
+        substitution rule against Anonymizer.
+        """
+        anonymizer = Anonymizer(task_options)
+        # The anonymizer supplies the per-tag callables; the handler
+        # streams the file through them.
+        Ccl_handler(input_file).process(
+            output_file, anonymizer.unmarshallers)
--- a/tox.ini
+++ b/tox.ini
-[tox]
-envlist = pep8,docstyle
-skipsdist = True
-
-[testenv:pep8]
-deps =
-    flake8
-basepython = python3
-commands =
-    flake8 {posargs}
-
-[testenv:docstyle]
-deps =
-    pydocstyle
-basepython = python3
-commands =
-    pydocstyle --verbose {posargs}
-
-[flake8]
-# W504 skipped because it is overeager and unnecessary
-ignore = W504
-show-source = True
-exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
-import-order-style = pep8
-max-line-length = 80
-
-
-[pydocstyle]
-# D104 Missing docstring in public package
-# D203 1 blank line required before class docstring
-# D213 Multi-line docstring summary should start at the second line
-# D214 Section is over-indented
-# D215 Section underline is over-indented
-# D401 First line should be in imperative mood; try rephrasing
-# D405 Section name should be properly capitalized
-# D406 Section name should end with a newline
-# D407 Missing dashed underline after section
-# D408 Section underline should be in the line following the section’s name
-# D409 Section underline should match the length of its name
-# D410 Missing blank line after section
-# D411 Missing blank line before section
-ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
-match-dir = ^(?!\.tox|venv).*
+[tox]
+envlist = pep8,docstyle
+skipsdist = True
+
+[testenv:pep8]
+deps =
+    flake8
+basepython = python3
+commands =
+    flake8 {posargs}
+
+[testenv:docstyle]
+deps =
+    pydocstyle
+basepython = python3
+commands =
+    pydocstyle --verbose {posargs}
+
+[flake8]
+# W504 skipped because it is overeager and unnecessary
+ignore = W504
+show-source = True
+exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
+import-order-style = pep8
+max-line-length = 80
+
+
+[pydocstyle]
+# D104 Missing docstring in public package
+# D203 1 blank line required before class docstring
+# D213 Multi-line docstring summary should start at the second line
+# D214 Section is over-indented
+# D215 Section underline is over-indented
+# D401 First line should be in imperative mood; try rephrasing
+# D405 Section name should be properly capitalized
+# D406 Section name should end with a newline
+# D407 Missing dashed underline after section
+# D408 Section underline should be in the line following the section’s name
+# D409 Section underline should match the length of its name
+# D410 Missing blank line after section
+# D411 Missing blank line before section
+ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
+match-dir = ^(?!\.tox|venv).*
 match = ^(?!setup).*\.py
\ No newline at end of file
--- a/wiktionary-forms-with-bases-and-tags.txt
+++ b/wiktionary-forms-with-bases-and-tags.txt