Commit b60208e1 authored by Mateusz Gniewkowski's avatar Mateusz Gniewkowski

Merge branch 'develop' into 'master'

Develop

See merge request !1
parents dd9d97f6 af901edf
Pipeline #1732 passed with stages
in 1 minute and 27 seconds
image: 'clarinpl/python:3.6'
cache:
paths:
- .tox
stages:
- check_style
- build
before_script:
- pip install tox==2.9.1
pep8:
stage: check_style
script:
- tox -v -e pep8
docstyle:
stage: check_style
script:
- tox -v -e docstyle
build_image:
stage: build
image: 'docker:18.09.7'
only:
- master
services:
- 'docker:18.09.7-dind'
before_script:
- ''
script:
- docker build -t clarinpl/anonymizer .
- echo $DOCKER_PASSWORD > pass.txt
- cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
- rm pass.txt
- docker push clarinpl/anonymizer
FROM clarinpl/python:3.6
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
CMD ["python3.6", "main.py", "service"]
# anonymizer
# Anonymizer
Liner2 should use model 5nam.
Processing pipeline: text -> any2txt -> morphodita -> liner2 -> anonymizer
[service]
tool = anonymizer
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
queue_prefix = nlp_
[tool]
workers_number = 1
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
version: '3'
services:
tokenizer:
container_name: clarin_anonymizer
build: ./
working_dir: /home/worker
entrypoint:
- python3.6
- main.py
- service
environment:
- PYTHONUNBUFFERED=0
volumes:
- '/samba:/samba'
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './main.py:/home/worker/main.py'
- './wiki.txt:/home/worker/wiki.txt'
"""Implementation of anonymizer service."""
import argparse
import nlp_ws
from src.worker import Worker
def get_args():
    """Parse and return the command line arguments.

    Exactly one positional mode is required; currently only the
    ``service`` mode is available.
    """
    arg_parser = argparse.ArgumentParser(description="anonymizer")
    mode_parsers = arg_parser.add_subparsers(dest="mode")
    mode_parsers.required = True
    mode_parsers.add_parser("service", help="Run as a service")
    return arg_parser.parse_args()
def main():
    """Run the program in the mode selected on the command line."""
    args = get_args()
    # Dispatch table: one launcher per supported mode; unknown modes
    # fall back to a no-op.
    launchers = {
        "service": lambda: nlp_ws.NLPService.main(Worker),
    }
    launchers.get(args.mode, lambda: None)()
if __name__ == "__main__":
main()
nlp-ws
\ No newline at end of file
This diff is collapsed.
"""Implementation of ccl reading functionality."""
from xml.etree.ElementTree import iterparse
class Ccl_handler:
    """Implements reading ccl for anonymizer service."""

    def __init__(self, ccl_file_name):
        """Initialize ccl_handler with a filename.

        :param ccl_file_name: path to the CCL (XML) file to read.
        """
        self._file_name = ccl_file_name

    def process(self, output_file, unmarshallers):
        """Process xml tags using unmarshallers and save in output_file.

        Streams the input XML with ``iterparse``; for every element whose
        tag has an unmarshaller, writes the unmarshalled text to the
        output and clears the element to keep memory use bounded.
        """
        with open(output_file, 'w', encoding='utf-8') as sink:
            with open(self._file_name, 'r', encoding='utf-8') as source:
                for _, element in iterparse(source):
                    handler = unmarshallers.get(element.tag, None)
                    if handler:
                        sink.write(handler(element))
                        element.clear()
"""Implementation of nlp_worker."""
import logging
import nlp_ws
from src.anonymizer import Anonymizer
from src.ccl_handler import Ccl_handler
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
    """Implements nlp_worker for anonymizer service."""

    def process(self, input_file, task_options, output_file):
        """Anonymize the text in input_file and write it to output_file.

        It is assumed input_file is encoded in UTF-8.

        Options:
        method - 'delete'/'tag'/'pseudo' - 'delete' deletes selected tokens,
        'tag' replaces selected tokens with arbitrary tags, 'pseudo'
        replaces selected tokens with a randomly chosen substitute token.
        """
        anonymizer = Anonymizer(task_options)
        reader = Ccl_handler(input_file)
        reader.process(output_file, anonymizer.unmarshallers)
[tox]
envlist = pep8,docstyle
skipsdist = True
[testenv:pep8]
deps =
flake8
basepython = python3
commands =
flake8 {posargs}
[testenv:docstyle]
deps =
pydocstyle
basepython = python3
commands =
pydocstyle --verbose {posargs}
[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80
[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section’s name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py
\ No newline at end of file
"""Convert NELexicon into wiki used by anonymizer.
Requires morfeusz2 to be installed.
"""
import morfeusz2
# Shared Morfeusz2 morphological analyzer/generator used by the helpers below.
morf = morfeusz2.Morfeusz(expand_tags=True)

# NELexicon category names of interest, mapped to Liner2 annotation names.
# NOTE(review): only the keys are used in this file (to filter input rows);
# the values are presumably consumed by the anonymizer — confirm.
_file_to_liner_dispatch = {
    'nam_liv_person': 'person_first_nam',
    'nam_liv_person_last': 'person_last_nam',
    'nam_fac_road': 'road_nam',
    'nam_loc_gpe_city': 'city_nam',
    'nam_org_group_team': 'country_nam'
}

# Grammatical genders (feminine, masculine m1-m3, neuter).
# NOTE(review): not referenced anywhere in this file's visible code.
_allowed_genders = ['f', 'm1', 'm2', 'm3', 'n']
def _create_wiki():
    """Build wiki.txt from gendered wiktionary forms plus last names."""
    with open('wiki.txt', 'wt+', encoding='utf-8') as out:
        _add_gender(out)
        _last_names(out)
def _add_gender(
    output,
    file_name='nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt'
):
    """Write gender-augmented lexicon rows for the selected categories.

    Reads whitespace-separated rows of the form
    ``category inflected-form... base-form... tag``, keeps only categories
    listed in ``_file_to_liner_dispatch``, looks up the grammatical gender
    of the base form with Morfeusz, and writes
    ``category<TAB>inflected<TAB>base<TAB>tag:gender`` lines to *output*.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        _form_dict = dict()
        for line in f:
            l_list = line.split()
            cat = l_list[0]
            if cat in _file_to_liner_dispatch:
                cat_name = cat
                # Row layout: 1 category + N inflected words + N base words
                # + 1 tag, hence N = (len - 2) / 2.
                length = int((len(l_list) - 2) / 2)
                gen_name = ' '.join(l_list[(1 + length):(1 + 2 * length)])
                flx_name = ' '.join(l_list[1:(1 + length)])
                flex = l_list[-1]
                # Nested dict keyed category -> word count -> base form
                # -> tag; records only the first inflected form seen.
                # NOTE(review): _form_dict is built but never read in this
                # file — looks like leftover/incomplete code; confirm.
                if cat_name not in _form_dict:
                    _form_dict[cat_name] = dict()
                if length not in _form_dict[cat_name]:
                    _form_dict[cat_name][length] = dict()
                if gen_name not in _form_dict[cat_name][length]:
                    _form_dict[cat_name][length][gen_name] = dict()
                if flex not in _form_dict[cat_name][length][gen_name]:
                    _form_dict[cat_name][length][gen_name][flex] = flx_name
                # Gender is taken from the first word of the base form.
                name = gen_name.split(' ')[0]
                generate = morf.generate(name)
                # NOTE(review): raises IndexError when morf.generate()
                # returns an empty list for an unknown word — confirm
                # whether input is guaranteed to be known to Morfeusz.
                flex_split = generate[0][2].split(':')
                if len(flex_split) > 3:
                    # Position 3 of the Morfeusz tag is assumed to be the
                    # gender value — TODO confirm against the tagset.
                    gender = flex_split[3]
                    new_flex = flex + ':' + gender
                    output.write(cat + '\t' + flx_name + '\t' +
                                 gen_name + '\t' + new_flex + '\n')
def _last_names(output):
    """Write last-name lexicon rows generated with Morfeusz to *output*.

    Reads tab-separated ``category<TAB>name`` rows from the Liner2
    wikipedia lexicon and, for rows in category ``nam_liv_person_last``,
    emits one line per generated inflected form.
    """
    dict_list = list()
    with open('nelexicon2/extra/wikipedia-liner2.txt',
              'rt',
              encoding='utf-8'
              ) as f:
        for line in f:
            line = line.strip()
            line_l = line.split('\t')
            if line_l[0] == 'nam_liv_person_last':
                line_l = line_l[1]
                # NOTE(review): BUG — the result of split() is discarded,
                # so line_l stays a string; presumably this was meant to be
                # ``line_l = line_l.split(' ')``.
                line_l.split(' ')
                line_len = len(line_l)
                # NOTE(review): BUG — ``type(line_l) == list()`` compares a
                # type object to an empty list and is always False, so this
                # multi-word branch is dead code and the else branch always
                # runs. A naive fix (isinstance) would also require fixing
                # the writer below, which expects Morfeusz tuples, not the
                # strings this branch collects — needs a design decision.
                if type(line_l) == list() and line_len > 1:
                    dictionary = dict()
                    for word in line_l:
                        gen = morf.generate(word)
                        for w in gen:
                            tag_list = w[2].split(':')
                            if len(tag_list) > 3:
                                # Group generated forms by number:case
                                # (positions 1 and 2 of the tag).
                                tag = tag_list[1] + ':' + tag_list[2]
                                if tag not in dictionary:
                                    dictionary[tag] = w[0]
                                else:
                                    dictionary[tag] += ' ' + w[0]
                    # Keep only tags for which every word of the name
                    # produced a form.
                    for key in dictionary:
                        if len(dictionary[key].split(' ')) == line_len:
                            d = dictionary[key]
                            dict_list.append(d)
                else:
                    # Always-taken branch (see NOTE above): line_l is a
                    # string here, so word == line_l.
                    word = line_l[0] if type(line_l) == list() else line_l
                    generate = morf.generate(word)
                    for g in generate:
                        # Keep only forms whose qualifiers mark a surname
                        # ('nazwisko' is Polish for 'last name').
                        if len(g) > 4 and 'nazwisko' in g[3]:
                            dict_list.append(g)
    for word in dict_list:
        d = word
        # d is a Morfeusz tuple: (form, lemma, tag, ...); strip the
        # homonym suffix after ':' from form and lemma.
        line = 'nam_liv_person_last' + '\t' + d[0].split(':')[0] +\
            '\t' + d[1].split(':')[0] + '\t' + ':'.join(d[2].split(':')[1:])
        output.write(line + '\n')
_create_wiki()
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment