Skip to content
Snippets Groups Projects
Commit eb1e9ee7 authored by Bartlomiej Koptyra's avatar Bartlomiej Koptyra
Browse files

First version of anonymizer. Needs an update for the wiki with replacements.

parent f78b4599
Branches
2 merge requests!2Develop,!1Develop
Pipeline #1566 passed with stage
in 32 seconds
image: 'clarinpl/python:3.6'
cache:
paths:
- .tox
stages:
- check_style
- build
before_script:
- pip install tox==2.9.1
pep8:
stage: check_style
script:
- tox -v -e pep8
docstyle:
stage: check_style
script:
- tox -v -e docstyle
build_image:
stage: build
image: 'docker:18.09.7'
only:
- master
services:
- 'docker:18.09.7-dind'
before_script:
- ''
script:
- docker build -t clarinpl/anonymizer .
- echo $DOCKER_PASSWORD > pass.txt
- cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
- rm pass.txt
- docker push clarinpl/anonymizer
FROM clarinpl/python:3.6
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
CMD ["python3.6", "main.py", "service"]
\ No newline at end of file
FROM clarinpl/python:3.6
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
CMD ["python3.6", "main.py", "service"]
# Anonymizer
The input tagger should be MorphoDiTa, and liner2 should use the 5nam model.
text -> any2txt -> morphodita -> liner2 -> anonymizer
# Anonymizer
The input tagger should be MorphoDiTa, and liner2 should use the 5nam model.
text -> any2txt -> morphodita -> liner2 -> anonymizer
[service]
tool = anonymizer
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix = nlp_
[tool]
workers_number = 1
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
[service]
tool = anonymizer
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix = nlp_
[tool]
workers_number = 1
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
version: '3'
services:
tokenizer:
container_name: clarin_anonymizer
build: ./
working_dir: /home/worker
entrypoint:
- python3.6
- main.py
- service
environment:
- PYTHONUNBUFFERED=0
volumes:
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './main.py:/home/worker/main.py'
version: '3'
services:
tokenizer:
container_name: clarin_anonymizer
build: ./
working_dir: /home/worker
entrypoint:
- python3.6
- main.py
- service
environment:
- PYTHONUNBUFFERED=0
volumes:
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './main.py:/home/worker/main.py'
- './wiktionary-forms-with-bases-and-tags.txt:/home/worker/wiktionary-forms-with-bases-and-tags.txt'
"""Implementation of anonymizer service."""
import argparse
import nlp_ws
from src.worker import Worker
def get_args():
    """Parse the command line arguments of the anonymizer service.

    Exactly one sub-command, ``service``, is defined and a sub-command is
    mandatory. The chosen sub-command name is stored in the ``mode``
    attribute of the result.

    Returns:
        argparse.Namespace: the arguments parsed from ``sys.argv``.
    """
    # The description is shown in ``--help``; it names this tool.
    parser = argparse.ArgumentParser(description="anonymizer")
    subparsers = parser.add_subparsers(dest="mode")
    # Set as an attribute instead of add_subparsers(required=True): that
    # keyword is only accepted from Python 3.7 onwards.
    subparsers.required = True
    subparsers.add_parser("service", help="Run as a service")
    return parser.parse_args()
def main():
    """Run the action selected by the ``mode`` command line argument.

    ``service`` starts ``nlp_ws.NLPService`` with ``Worker``; any other
    mode does nothing.
    """
    mode = get_args().mode
    if mode == "service":
        nlp_ws.NLPService.main(Worker)


if __name__ == "__main__":
    main()
"""Implementation of anonymizer service."""
import argparse
import nlp_ws
from src.worker import Worker
def get_args():
    """Parse the command line arguments of the anonymizer service.

    Exactly one sub-command, ``service``, is defined and a sub-command is
    mandatory. The chosen sub-command name is stored in the ``mode``
    attribute of the result.

    Returns:
        argparse.Namespace: the arguments parsed from ``sys.argv``.
    """
    # The description is shown in ``--help``; it names this tool.
    parser = argparse.ArgumentParser(description="anonymizer")
    subparsers = parser.add_subparsers(dest="mode")
    # Set as an attribute instead of add_subparsers(required=True): that
    # keyword is only accepted from Python 3.7 onwards.
    subparsers.required = True
    subparsers.add_parser("service", help="Run as a service")
    return parser.parse_args()
def main():
    """Run the action selected by the ``mode`` command line argument.

    ``service`` starts ``nlp_ws.NLPService`` with ``Worker``; any other
    mode does nothing.
    """
    mode = get_args().mode
    if mode == "service":
        nlp_ws.NLPService.main(Worker)


if __name__ == "__main__":
    main()
This diff is collapsed.
"""Implementation of ccl reading functionality."""
from xml.etree.ElementTree import iterparse
class Ccl_handler:
    """Implements reading ccl for anonymizer service."""

    def __init__(self, ccl_file_name):
        """Initialize the handler with the path of the ccl file to read."""
        self._file_name = ccl_file_name

    def process(self, output_file, unmarshallers):
        """Process xml tags using unmarshallers and save in output_file.

        Args:
            output_file: path of the text file to write; it is created or
                truncated before the input file is opened.
            unmarshallers: mapping from an XML tag name to a callable that
                takes the parsed element and returns the string to write.
                Elements whose tag has no (truthy) entry are skipped.
        """
        # Both files are opened explicitly as UTF-8 so the result does not
        # depend on the locale's default encoding.
        with open(output_file, 'wt', encoding='utf-8') as out:
            with open(self._file_name, 'r', encoding='utf-8') as f:
                for event, elem in iterparse(f):
                    unmarshal = unmarshallers.get(elem.tag, None)
                    if unmarshal:
                        out.write(unmarshal(elem))
                        # Only handled elements are cleared, so children
                        # stay intact for an enclosing element's
                        # unmarshaller.
                        elem.clear()
"""Implementation of ccl reading functionality."""
from xml.etree.ElementTree import iterparse
class Ccl_handler:
    """Reader of a ccl (XML) file for the anonymizer service."""

    def __init__(self, ccl_file_name):
        """Remember the path of the ccl file to be read."""
        self._file_name = ccl_file_name

    def process(self, output_file, unmarshallers):
        """Write the unmarshalled form of matching XML elements.

        Args:
            output_file: path of the UTF-8 text file to write; it is opened
                (created or truncated) before the input file.
            unmarshallers: mapping from an XML tag name to a callable that
                takes the parsed element and returns the string to write.
                Elements whose tag has no (truthy) entry are skipped.
        """
        with open(output_file, 'wt', encoding='utf-8') as sink, \
                open(self._file_name, 'r', encoding='utf-8') as source:
            for _, node in iterparse(source):
                convert = unmarshallers.get(node.tag)
                if not convert:
                    continue
                sink.write(convert(node))
                # Only handled elements are cleared, so children stay
                # intact for an enclosing element's unmarshaller.
                node.clear()
"""Implementation of nlp_worker."""
import logging
import nlp_ws
from src.anonymizer import Anonymizer
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
    """Worker of the anonymizer service, built on ``nlp_ws.NLPWorker``."""

    @classmethod
    def static_init(cls, config):
        """Perform the one-time static initialisation.

        Currently this only prints a marker line; ``config`` is not used.
        """
        print("siema")

    def process(self, input_file, task_options, output_file):
        """Anonymize the input text.

        The input file is assumed to be encoded in UTF-8.

        Options:
        method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens,
            'tag' replaces selected tokens with arbitrary tags, 'pseudo'
            replaces selected tokens with a random token (the original
            description is cut off here - TODO confirm the constraint).

        NOTE(review): as written, this version builds an ``Anonymizer``
        and opens both files but only prints a marker line; nothing is
        read or written.
        """
        _anonymizer = Anonymizer(task_options)
        with open(input_file, 'rt', encoding='utf-8') as _source, \
                open(output_file, 'wt', encoding='utf-8') as _sink:
            print("elo")
"""Implementation of nlp_worker."""
import logging
import nlp_ws
from src.anonymizer import Anonymizer
from src.ccl_handler import Ccl_handler
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
    """Worker of the anonymizer service, built on ``nlp_ws.NLPWorker``."""

    def process(self, input_file, task_options, output_file):
        """Anonymize the input text.

        The input file is assumed to be encoded in UTF-8. It is read by a
        ``Ccl_handler``, which writes to ``output_file`` using the
        unmarshallers of an ``Anonymizer`` built from ``task_options``.

        Options:
        method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens,
            'tag' replaces selected tokens with arbitrary tags, 'pseudo'
            replaces selected tokens with a random token (the original
            description is cut off here - TODO confirm the constraint).
        """
        anonymizer = Anonymizer(task_options)
        Ccl_handler(input_file).process(output_file, anonymizer.unmarshallers)
[tox]
envlist = pep8,docstyle
skipsdist = True
[testenv:pep8]
deps =
flake8
basepython = python3
commands =
flake8 {posargs}
[testenv:docstyle]
deps =
pydocstyle
basepython = python3
commands =
pydocstyle --verbose {posargs}
[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80
[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section’s name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
[tox]
envlist = pep8,docstyle
skipsdist = True
[testenv:pep8]
deps =
flake8
basepython = python3
commands =
flake8 {posargs}
[testenv:docstyle]
deps =
pydocstyle
basepython = python3
commands =
pydocstyle --verbose {posargs}
[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80
[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section’s name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py
\ No newline at end of file
This diff is collapsed.
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment