Commit 4a8455a8 authored by Bartlomiej Koptyra's avatar Bartlomiej Koptyra

New wiki for anonymizer.

parent cc34c978
Pipeline #1651 passed with stage
in 30 seconds
......@@ -15,4 +15,4 @@ services:
- './config.ini:/home/worker/config.ini'
- './src:/home/worker/src'
- './main.py:/home/worker/main.py'
- './wiktionary-forms-with-bases-and-tags.txt:/home/worker/wiktionary-forms-with-bases-and-tags.txt'
- './wiki.txt:/home/worker/wiki.txt'
"""Implementation of tokenizer service."""
"""Implementation of anonymizer service."""
import argparse
import nlp_ws
from src.worker import Worker
......@@ -6,7 +6,7 @@ from src.worker import Worker
def get_args():
"""Gets command line arguments."""
parser = argparse.ArgumentParser(description="tokenizer")
parser = argparse.ArgumentParser(description="anonymizer")
subparsers = parser.add_subparsers(dest="mode")
subparsers.required = True
......
......@@ -9,10 +9,10 @@ class Anonymizer:
_file_to_liner_dispatch = {
'nam_liv_person': 'person_first_nam',
'xDDDDDDDD': 'person_last_nam',
'nam_liv_person_last': 'person_last_nam',
'nam_fac_road': 'road_nam',
'nam_loc_gpe_city': 'city_nam',
'xDDDDDDDd': 'country_nam'
'nam_org_group_team': 'country_nam'
}
_liner_to_tag_dispatch = {
......@@ -26,7 +26,7 @@ class Anonymizer:
def __init__(self, task_options):
"""Initialize anonymizer with task_options."""
self.unmarshallers = {
'chunk': lambda *args: '\n\n',
'chunk': lambda *args: '\n',
'sentence': lambda *args: self._process_sent_tree(*args),
}
self._method = task_options.get('method', 'delete')
......@@ -38,7 +38,7 @@ class Anonymizer:
self._pseudo_ann_list = list()
self._load_file()
def _load_file(self, file_name='wiktionary-forms-with-bases-and-tags.txt'):
def _load_file(self, file_name='wiki.txt'):
with open(file_name, 'r', encoding='utf-8') as f:
for line in f.readlines():
l_list = line.split()
......@@ -114,7 +114,8 @@ class Anonymizer:
current_id = id
length = 1
for id, text, tag, ann in it:
if current_ann == ann:
if current_ann == ann and (ann != 'person_first_nam' and
ann != 'person_last_nam'):
if id == current_id + 2:
length += 1
current_tag = tag
......@@ -150,7 +151,7 @@ class Anonymizer:
length -= 1
if length == 0:
return ''
new_tag = ':'.join(tag.split(':')[1:3])
new_tag = ':'.join(tag.split(':')[1:4])
for i in range(0, 10):
random_entry = random.choice(self._form_dict[ann][length])
if new_tag in random_entry[1]:
......@@ -286,12 +287,15 @@ class Anonymizer:
@staticmethod
def _generate_pseudo_phone_number(number):
new_number = []
length = len(number)
it = iter(number)
if number[0] == '+':
for j in range(0, 3):
how_many = length - 9
for j in range(0, how_many):
new_number.append(next(it))
elif number[0] == '0' and number[1] == '0' and number[4] == ' ':
for j in range(0, 4):
elif number[0] == '0' and number[1] == '0' \
and number[length - 10] == ' ':
for j in range(0, length - 10):
new_number.append(next(it))
elif number[0] == '(' and number[1] == '0' and number[4] == ')':
for j in range(0, 2):
......@@ -373,9 +377,10 @@ class Anonymizer:
def _anonoymize_phone_number(self, sentence):
"""Handles removal/change of links."""
phone_number_regex = r'(((\+\d{2}|00\d{2}) ?)?(\d{9}))|((\+\d{2} ' \
r'|00\d{2} )?(\d{3} \d{3} \d{3}))|(\(0\d{2}\) ' \
r'\d{2} \d{2} \d{3})|(\(\d{2}\) \d{2} \d{3} \d{2})'
phone_number_regex = r'(((\+[1-9]\d{0,2}|00[1-9]\d{0,2}) ?)?(\d{9}))' \
r'|((\+[1-9]\d{0,2} |00[1-9]\d{0,2} )?' \
r'(\d{3} \d{3} \d{3}))|(\(0\d{2}\) \d{2} \d{2} ' \
r'\d{3})|(\(\d{2}\) \d{2} \d{3} \d{2})'
if self._method == 'delete':
sentence = re.sub(phone_number_regex, '', sentence)
elif self._method == 'tag':
......
......@@ -11,7 +11,7 @@ class Ccl_handler:
def process(self, output_file, unmarshallers):
"""Process xml tags using unmarshallers and save in output_file."""
with open(output_file, 'wt', encoding='utf-8') as out:
with open(output_file, 'w', encoding='utf-8') as out:
with open(self._file_name, 'r', encoding='utf-8') as f:
for event, elem in iterparse(f):
unmarshal = unmarshallers.get(elem.tag, None)
......
"""Convert NELexicon into wiki used by anonymizer.
Requires morfeusz2 to be installed.
"""
import morfeusz2

# Morphological analyser/generator used to inflect names; expand_tags=True
# makes morfeusz2 return one fully-expanded tag per generated form.
morf = morfeusz2.Morfeusz(expand_tags=True)

# Maps NELexicon category names to the LINER2 annotation channels the
# anonymizer consumes.  NOTE(review): this dict is duplicated in the
# Anonymizer class — keep the two copies in sync.
_file_to_liner_dispatch = {
    'nam_liv_person': 'person_first_nam',
    'nam_liv_person_last': 'person_last_nam',
    'nam_fac_road': 'road_nam',
    'nam_loc_gpe_city': 'city_nam',
    'nam_org_group_team': 'country_nam'
}

# Grammatical gender markers (feminine, three masculine classes, neuter).
# NOTE(review): defined but never referenced in this script — confirm it is
# intentional before removing.
_allowed_genders = ['f', 'm1', 'm2', 'm3', 'n']
def _create_wiki():
    """Build wiki.txt by writing both NELexicon-derived sections into it."""
    # Section writers append their entries to the shared output handle, in
    # this order: gendered wiktionary forms first, then surnames.
    section_writers = (_add_gender, _last_names)
    with open('wiki.txt', 'wt+', encoding='utf-8') as out:
        for write_section in section_writers:
            write_section(out)
def _add_gender(
        output,
        file_name='nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt'
):
    """Copy recognized wiktionary entries to *output* with a gender marker.

    Each input line is whitespace-separated: a category, a first group of N
    words, a second group of N words (presumably forms and their base —
    confirm against the source data), and a flexion tag.  For categories the
    anonymizer knows about, the gender reported by morfeusz2 for the second
    group's head word is appended to the flexion tag and the entry is written
    tab-separated to *output*.
    """
    # Bookkeeping of already-seen (category, length, base, flexion) combos.
    # NOTE(review): populated but never read back — kept for behavioral
    # parity with the original implementation.
    seen = dict()
    with open(file_name, 'r', encoding='utf-8') as source:
        for raw_line in source:
            fields = raw_line.split()
            category = fields[0]
            if category not in _file_to_liner_dispatch:
                continue
            # Layout: category + word_count words + word_count words + tag.
            # int(x / 2) is kept (rather than //) to truncate toward zero
            # exactly as the original did for malformed short lines.
            word_count = int((len(fields) - 2) / 2)
            second_group = ' '.join(fields[(1 + word_count):(1 + 2 * word_count)])
            first_group = ' '.join(fields[1:(1 + word_count)])
            flexion = fields[-1]
            per_category = seen.setdefault(category, dict())
            per_length = per_category.setdefault(word_count, dict())
            per_base = per_length.setdefault(second_group, dict())
            if flexion not in per_base:
                per_base[flexion] = first_group
            # Gender is the 4th colon-separated tag field of the first
            # analysis morfeusz2 generates for the head word.
            head_word = second_group.split(' ')[0]
            analyses = morf.generate(head_word)
            tag_fields = analyses[0][2].split(':')
            if len(tag_fields) > 3:
                gendered_flexion = flexion + ':' + tag_fields[3]
                output.write(category + '\t' + first_group + '\t' +
                             second_group + '\t' + gendered_flexion + '\n')
def _last_names(output):
    """Append surname entries from wikipedia-liner2.txt to *output*.

    Only lines whose first tab-separated field is 'nam_liv_person_last' are
    used; morfeusz2 generates inflected forms for them and the results are
    written tab-separated to *output*.
    """
    dict_list = list()
    with open('nelexicon2/extra/wikipedia-liner2.txt',
              'rt',
              encoding='utf-8'
              ) as f:
        for line in f:
            line = line.strip()
            line_l = line.split('\t')
            if line_l[0] == 'nam_liv_person_last':
                line_l = line_l[1]
                # NOTE(review): the result of this split is discarded, so
                # line_l stays a str — probably meant to be
                # `line_l = line_l.split(' ')`.  Confirm intent before fixing.
                line_l.split(' ')
                # With line_l a str, this is the character count, not the
                # word count.
                line_len = len(line_l)
                # NOTE(review): `type(line_l) == list()` compares a type with
                # an *empty list instance* and is always False, so the
                # multi-word branch below is dead code.  Enabling it is not a
                # trivial fix: it appends plain strings to dict_list, which
                # the tuple-indexing write loop at the bottom cannot handle —
                # the intended output format must be decided first.
                if type(line_l) == list() and line_len > 1:
                    dictionary = dict()
                    for word in line_l:
                        gen = morf.generate(word)
                        for w in gen:
                            tag_list = w[2].split(':')
                            if len(tag_list) > 3:
                                # Group generated forms by case:number tag.
                                tag = tag_list[1] + ':' + tag_list[2]
                                if tag not in dictionary:
                                    dictionary[tag] = w[0]
                                else:
                                    dictionary[tag] += ' ' + w[0]
                    for key in dictionary:
                        # Keep only tags for which every word produced a form.
                        if len(dictionary[key].split(' ')) == line_len:
                            d = dictionary[key]
                            dict_list.append(d)
                else:
                    # Always taken (see NOTE above): word is the raw surname
                    # string from the lexicon line.
                    word = line_l[0] if type(line_l) == list() else line_l
                    generate = morf.generate(word)
                    for g in generate:
                        # Keep only forms whose qualifier field marks them as
                        # a surname ('nazwisko').
                        if len(g) > 4 and 'nazwisko' in g[3]:
                            dict_list.append(g)
    for word in dict_list:
        d = word
        # Assumes morfeusz2 generate() tuples are (form, base, tag, ...) —
        # TODO confirm against the morfeusz2 API docs.  Strips ':' suffixes
        # from form/base and drops the leading field of the tag.
        line = 'nam_liv_person_last' + '\t' + d[0].split(':')[0] +\
            '\t' + d[1].split(':')[0] + '\t' + ':'.join(d[2].split(':')[1:])
        output.write(line + '\n')
# Script entry point: regenerate wiki.txt from the NELexicon sources on run.
_create_wiki()
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment