diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c6ec813966514d0d24c181f8fd8cbcce559455ba
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM clarinpl/python:3.6
+
+WORKDIR /home/worker
+COPY ./src ./src
+COPY ./main.py .
+COPY ./requirements.txt .
+
+RUN python3.6 -m pip install -r requirements.txt
+
+CMD ["python3.6", "main.py", "service"]
\ No newline at end of file
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000000000000000000000000000000000000..8adf851c849abf08557fab0663e69e2af7778688
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,19 @@
+[service]
+tool = anonymizer
+
+root = /samba/requests/
+rabbit_host = rabbitmq
+rabbit_user = test
+rabbit_password = test
+queue_prefix = nlp_
+
+[tool]
+workers_number = 1
+
+[logging]
+port = 9998
+local_log_level = INFO
+
+[logging_levels]
+__main__ = INFO
+
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7ab0db258b63cfd53cb3fd5dbdd57af035b77dc9
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,17 @@
+version: '3'
+services:
+  anonymizer:
+    container_name: clarin_anonymizer
+    build: ./
+    working_dir: /home/worker
+    entrypoint:
+      - python3.6
+      - main.py
+      - service
+    environment:
+      - PYTHONUNBUFFERED=0
+    volumes:
+      - '/samba:/samba'
+      - './config.ini:/home/worker/config.ini'
+      - './src:/home/worker/src'
+      - './main.py:/home/worker/main.py'
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..42eeb5e0425a181592b65b503e090d4a00df77b7
--- /dev/null
+++ b/main.py
@@ -0,0 +1,34 @@
+"""Implementation of anonymizer service."""
+import argparse
+import nlp_ws
+from src.worker import Worker
+
+
+def get_args():
+    """Gets command line arguments."""
+    parser = argparse.ArgumentParser(description="anonymizer")
+
+    subparsers = parser.add_subparsers(dest="mode")
+    subparsers.required = True
+
+    subparsers.add_parser(
+        "service",
+        help="Run as a service")
+
+    return parser.parse_args()
+
+
+def main():
+    """Runs the program."""
+    args = get_args()
+
+    generators = {
+        "service": lambda: nlp_ws.NLPService.main(Worker),
+    }
+
+    gen_fn = generators.get(args.mode, lambda: None)
+    gen_fn()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dae0fc562943ab4df118a6eb757e6c333761c3fb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+nlp-ws
\ No newline at end of file
diff --git a/src/anonymizer.py b/src/anonymizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..577638f3ab756cc059ea96733bb41d2bbab9f0f8
--- /dev/null
+++ b/src/anonymizer.py
@@ -0,0 +1,18 @@
+"""Implementation of anonymizer functionality."""
+import re
+
+
+class Anonymizer:
+    """Class used to edit sentences based on options."""
+
+    def __init__(self, task_options):
+        """Initializes anonymizer with task options."""
+        self.method = task_options.get('method', 'delete')
+
+    def process(self, token, ctag):
+        """Processes a single token (work in progress)."""
+        # Provisional signature: token and ctag are assumed per-token inputs.
+        if ctag == 'ign':
+            # TODO: check whether this is a nickname, then an e-mail address
+            # TODO: check whether this is a proper name like mBank? not sure
+            pass
diff --git a/src/worker.py b/src/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2a225d3c7f54ae222210af27e7b0d654090a600
--- /dev/null
+++ b/src/worker.py
@@ -0,0 +1,35 @@
+"""Implementation of nlp_worker."""
+import logging
+
+import nlp_ws
+
+
+from src.anonymizer import Anonymizer
+
+_log = logging.getLogger(__name__)
+
+
+class Worker(nlp_ws.NLPWorker):
+    """Implements nlp_worker for anonymizer service."""
+
+    @classmethod
+    def static_init(cls, config):
+        """One-time static initialisation."""
+        _log.debug('Worker static_init called with config: %s', config)
+
+    def process(self, input_file, task_options, output_file):
+        """Anonymizes input text.
+
+        It is assumed input_file is encoded in UTF-8.
+
+        Options:
+        method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens,
+            'tag' replaces selected tokens with arbitrary tags, 'pseudo'
+            replaces selected tokens with a random token of the same category.
+        """
+        anon = Anonymizer(task_options)
+        with open(input_file, 'rt', encoding='utf-8') as fin:
+            with open(output_file, 'wt', encoding='utf-8') as fout:
+                # TODO: run `anon` over fin and write the result to fout.
+                pass
+
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000000000000000000000000000000000000..819e612125efd269e8c0ef8d996d668a30c8dad7
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,44 @@
+[tox]
+envlist = pep8,docstyle
+skipsdist = True
+
+[testenv:pep8]
+deps =
+    flake8
+basepython = python3
+commands =
+    flake8 {posargs}
+
+[testenv:docstyle]
+deps =
+    pydocstyle
+basepython = python3
+commands =
+    pydocstyle --verbose {posargs}
+
+[flake8]
+# W504 skipped because it is overeager and unnecessary
+ignore = W504
+show-source = True
+exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
+import-order-style = pep8
+max-line-length = 80
+
+
+[pydocstyle]
+# D104 Missing docstring in public package
+# D203 1 blank line required before class docstring
+# D213 Multi-line docstring summary should start at the second line
+# D214 Section is over-indented
+# D215 Section underline is over-indented
+# D401 First line should be in imperative mood; try rephrasing
+# D405 Section name should be properly capitalized
+# D406 Section name should end with a newline
+# D407 Missing dashed underline after section
+# D408 Section underline should be in the line following the section's name
+# D409 Section underline should match the length of its name
+# D410 Missing blank line after section
+# D411 Missing blank line before section
+ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
+match-dir = ^(?!\.tox|venv).*
+match = ^(?!setup).*\.py
\ No newline at end of file
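
The compose file mounts /samba together with the local config.ini, src/ and main.py into the container and starts main.py in service mode, so code edits are picked up without rebuilding the image. Assuming a RabbitMQ broker reachable under the hostname rabbitmq with the test/test credentials and the /samba share that config.ini expects (none of which is provisioned by this change), the worker can be started locally with "docker-compose up --build".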
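
Anonymizer.process and Worker.process are still stubs, so the delete/tag/pseudo options described in the Worker.process docstring are not exercised anywhere yet. Below is a minimal, purely illustrative sketch of what the three methods could mean on plain text; it is not part of the diff, it covers only one hypothetical category (e-mail addresses, following the TODO in src/anonymizer.py), and the names EMAIL_RE, PSEUDO_POOL and the [MAIL] placeholder are invented for the example.

"""Illustrative sketch only: one possible shape of delete/tag/pseudo."""
import random
import re

# Hypothetical pattern for a single category hinted at in the TODOs.
EMAIL_RE = re.compile(r'\b[\w.+-]+@[\w-]+\.[\w.-]+\b')

# Hypothetical pool of substitutes used by the 'pseudo' method.
PSEUDO_POOL = ['jan.kowalski@example.com', 'anna.nowak@example.com']


def anonymize(text, method='delete'):
    """Rewrites every e-mail address in `text` according to `method`."""
    if method == 'delete':
        # 'delete' removes the matched token entirely.
        return EMAIL_RE.sub('', text)
    if method == 'tag':
        # 'tag' replaces the matched token with an arbitrary placeholder.
        return EMAIL_RE.sub('[MAIL]', text)
    if method == 'pseudo':
        # 'pseudo' replaces the matched token with a random substitute.
        return EMAIL_RE.sub(lambda m: random.choice(PSEUDO_POOL), text)
    raise ValueError('unknown method: %s' % method)


if __name__ == '__main__':
    print(anonymize('Contact me at jan@firma.pl.', method='tag'))
    # -> Contact me at [MAIL].

The production worker would more likely operate per token on tagger output (hence the ctag check in src/anonymizer.py), with Worker.process reading the input file, feeding tokens through Anonymizer, and writing the result; that wiring is left open in this change.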