New wiki for anonymizer.

4a8455a8 · Bartlomiej Koptyra · cc34c978 · 4a8455a8 · 4a8455a8 · 4a8455a8
Commit 4a8455a8 authored 4 years ago by Bartlomiej Koptyra
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -15,4 +15,4 @@ services:
      - './config.ini:/home/worker/config.ini'
      - './src:/home/worker/src'
      - './main.py:/home/worker/main.py'
-      - './wiktionary-forms-with-bases-and-tags.txt:/home/worker/wiktionary-forms-with-bases-and-tags.txt'
+      - './wiki.txt:/home/worker/wiki.txt'
--- a/main.py
+++ b/main.py
-"""Implementation of tokenizer service."""
+"""Implementation of anonymizer service."""
 import argparse
 import nlp_ws
 from src.worker import Worker
@@ -6,7 +6,7 @@ from src.worker import Worker

 def get_args():
    """Gets command line arguments."""
-    parser = argparse.ArgumentParser(description="tokenizer")
+    parser = argparse.ArgumentParser(description="anonymizer")

    subparsers = parser.add_subparsers(dest="mode")
    subparsers.required = True

--- a/src/anonymizer.py
+++ b/src/anonymizer.py
@@ -9,10 +9,10 @@ class Anonymizer:

    _file_to_liner_dispatch = {
        'nam_liv_person': 'person_first_nam',
-        'xDDDDDDDD': 'person_last_nam',
+        'nam_liv_person_last': 'person_last_nam',
        'nam_fac_road': 'road_nam',
        'nam_loc_gpe_city': 'city_nam',
-        'xDDDDDDDd': 'country_nam'
+        'nam_org_group_team': 'country_nam'
    }

    _liner_to_tag_dispatch = {
@@ -26,7 +26,7 @@ class Anonymizer:
    def __init__(self, task_options):
        """Initialize anonymizer with task_options."""
        self.unmarshallers = {
-            'chunk': lambda *args: '\n\n',
+            'chunk': lambda *args: '\n',
            'sentence': lambda *args: self._process_sent_tree(*args),
        }
        self._method = task_options.get('method', 'delete')
@@ -38,7 +38,7 @@ class Anonymizer:
        self._pseudo_ann_list = list()
        self._load_file()

-    def _load_file(self, file_name='wiktionary-forms-with-bases-and-tags.txt'):
+    def _load_file(self, file_name='wiki.txt'):
        with open(file_name, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                l_list = line.split()
@@ -114,7 +114,8 @@ class Anonymizer:
            current_id = id
            length = 1
            for id, text, tag, ann in it:
-                if current_ann == ann:
+                if current_ann == ann and (ann != 'person_first_nam' and
+                                           ann != 'person_last_nam'):
                    if id == current_id + 2:
                        length += 1
                        current_tag = tag
@@ -150,7 +151,7 @@ class Anonymizer:
            length -= 1
        if length == 0:
            return ''
-        new_tag = ':'.join(tag.split(':')[1:3])
+        new_tag = ':'.join(tag.split(':')[1:4])
        for i in range(0, 10):
            random_entry = random.choice(self._form_dict[ann][length])
            if new_tag in random_entry[1]:
@@ -286,12 +287,15 @@ class Anonymizer:
    @staticmethod
    def _generate_pseudo_phone_number(number):
        new_number = []
+        length = len(number)
        it = iter(number)
        if number[0] == '+':
-            for j in range(0, 3):
+            how_many = length - 9
+            for j in range(0, how_many):
                new_number.append(next(it))
-        elif number[0] == '0' and number[1] == '0' and number[4] == ' ':
-            for j in range(0, 4):
+        elif number[0] == '0' and number[1] == '0' \
+                and number[length - 10] == ' ':
+            for j in range(0, length - 10):
                new_number.append(next(it))
        elif number[0] == '(' and number[1] == '0' and number[4] == ')':
            for j in range(0, 2):
@@ -373,9 +377,10 @@ class Anonymizer:

    def _anonoymize_phone_number(self, sentence):
        """Handles removal/change of links."""
-        phone_number_regex = r'(((\+\d{2}|00\d{2}) ?)?(\d{9}))|((\+\d{2} ' \
-                             r'|00\d{2} )?(\d{3} \d{3} \d{3}))|(\(0\d{2}\) ' \
-                             r'\d{2} \d{2} \d{3})|(\(\d{2}\) \d{2} \d{3} \d{2})'
+        phone_number_regex = r'(((\+[1-9]\d{0,2}|00[1-9]\d{0,2}) ?)?(\d{9}))' \
+                             r'|((\+[1-9]\d{0,2} |00[1-9]\d{0,2} )?' \
+                             r'(\d{3} \d{3} \d{3}))|(\(0\d{2}\) \d{2} \d{2} ' \
+                             r'\d{3})|(\(\d{2}\) \d{2} \d{3} \d{2})'
        if self._method == 'delete':
            sentence = re.sub(phone_number_regex, '', sentence)
        elif self._method == 'tag':

--- a/src/ccl_handler.py
+++ b/src/ccl_handler.py
@@ -11,7 +11,7 @@ class Ccl_handler:

    def process(self, output_file, unmarshallers):
        """Process xml tags using unmarshallers and save in output_file."""
-        with open(output_file, 'wt', encoding='utf-8') as out:
+        with open(output_file, 'w', encoding='utf-8') as out:
            with open(self._file_name, 'r', encoding='utf-8') as f:
                for event, elem in iterparse(f):
                    unmarshal = unmarshallers.get(elem.tag, None)

--- a/utility/NELex2_to_wiki.py
+++ b/utility/NELex2_to_wiki.py
+"""Convert NELexicon into wiki used by anonymizer.
+
+Requires morfeusz2 to be installed.
+"""
+
+import morfeusz2
+# Shared Morfeusz instance; the functions below call ``morf.generate``.
+# NOTE(review): expand_tags=True presumably makes Morfeusz return fully
+# expanded tags (one value per tag position) - confirm in the Morfeusz docs.
+morf = morfeusz2.Morfeusz(expand_tags=True)
+
+# Lexicon category name -> annotation name. In this script only the keys are
+# used (membership test on the first field of each lexicon line).
+# NOTE(review): 'nam_org_group_team' is paired with 'country_nam' - confirm
+# that this pairing is intended.
+_file_to_liner_dispatch = {
+    'nam_liv_person': 'person_first_nam',
+    'nam_liv_person_last': 'person_last_nam',
+    'nam_fac_road': 'road_nam',
+    'nam_loc_gpe_city': 'city_nam',
+    'nam_org_group_team': 'country_nam'
+}
+
+# Not referenced by any function in this script as shown.
+_allowed_genders = ['f', 'm1', 'm2', 'm3', 'n']
+
+
+def _create_wiki():
+    """Build ``wiki.txt`` in the current working directory.
+
+    The file is opened once, truncating any previous content, and is then
+    filled by ``_add_gender`` first and ``_last_names`` second.
+    """
+    with open('wiki.txt', 'wt+', encoding='utf-8') as wiki_file:
+        for write_section in (_add_gender, _last_names):
+            write_section(wiki_file)
+
+
+def _add_gender(
+        output,
+        file_name='nelexicon2/extra/wiktionary-forms-with-bases-and-tags.txt'
+):
+    """Copy dispatched lexicon entries to ``output`` with a gender suffix.
+
+    Every line of ``file_name`` is split on whitespace and read as
+    ``category form-words... base-words... tag``, where the inflected form
+    and the base form are taken to have the same number of words. Only
+    categories listed in ``_file_to_liner_dispatch`` are processed, and each
+    ``(category, word count, base form, tag)`` combination is handled once
+    (the first inflected form seen wins).
+
+    For each new combination the first word of the base form is passed to
+    ``morf.generate``. When the tag of the first generated entry has more
+    than three ``:``-separated parts, the fourth part is appended to the
+    lexicon tag and one line is written:
+    ``category<TAB>form<TAB>base<TAB>tag:part``.
+    Entries for which no such part can be obtained are not written.
+
+    :param output: writable text stream receiving the result lines.
+    :param file_name: path of the whitespace-separated source lexicon.
+    """
+    seen = set()
+    with open(file_name, 'r', encoding='utf-8') as f:
+        for line in f:
+            l_list = line.split()
+            # A usable entry needs a category, at least one form word, one
+            # base word and a tag; this also skips blank lines.
+            if len(l_list) < 4 or l_list[0] not in _file_to_liner_dispatch:
+                continue
+            cat = l_list[0]
+            length = (len(l_list) - 2) // 2
+            flx_name = ' '.join(l_list[1:1 + length])
+            gen_name = ' '.join(l_list[1 + length:1 + 2 * length])
+            flex = l_list[-1]
+            key = (cat, length, gen_name, flex)
+            if key in seen:
+                continue
+            seen.add(key)
+            generated = morf.generate(gen_name.split(' ')[0])
+            # Nothing generated for this word: there is no tag to read the
+            # extra part from, so the entry is skipped instead of crashing.
+            if not generated:
+                continue
+            flex_split = generated[0][2].split(':')
+            if len(flex_split) > 3:
+                new_flex = flex + ':' + flex_split[3]
+                output.write(cat + '\t' + flx_name + '\t' +
+                             gen_name + '\t' + new_flex + '\n')
+
+
+def _last_names(output):
+    """Write generated forms of lexicon last names to ``output``.
+
+    Reads ``nelexicon2/extra/wikipedia-liner2.txt`` as tab-separated lines.
+    For every line whose first field is ``nam_liv_person_last`` the second
+    field is passed, unsplit, to ``morf.generate``. Each generated entry
+    that has more than four elements and whose fourth element contains
+    ``'nazwisko'`` is kept. After the whole file has been read, one line is
+    written per kept entry:
+    ``nam_liv_person_last<TAB>form<TAB>base<TAB>tag``, where form and base
+    are cut at their first ``:`` and the tag loses its first
+    ``:``-separated part.
+
+    NOTE(review): names containing spaces get no special treatment. An
+    earlier multi-word branch sat behind ``type(line_l) == list()``, which
+    compares a type with an empty list and is never true, so it never ran;
+    it also collected plain strings that the output loop cannot format.
+    It has been dropped; multi-word support still needs a proper design.
+
+    :param output: writable text stream receiving the result lines.
+    """
+    entries = []
+    with open('nelexicon2/extra/wikipedia-liner2.txt',
+              'rt',
+              encoding='utf-8'
+              ) as f:
+        for line in f:
+            fields = line.strip().split('\t')
+            # Skip other categories and lines that lack the name field.
+            if len(fields) < 2 or fields[0] != 'nam_liv_person_last':
+                continue
+            for g in morf.generate(fields[1]):
+                if len(g) > 4 and 'nazwisko' in g[3]:
+                    entries.append(g)
+    # Written only after the input was read completely, as before.
+    for form, base, tag in (entry[:3] for entry in entries):
+        output.write('\t'.join((
+            'nam_liv_person_last',
+            form.split(':')[0],
+            base.split(':')[0],
+            ':'.join(tag.split(':')[1:]),
+        )) + '\n')
+
+
+# Executed at module level: the wiki is rebuilt on import as well as on
+# direct execution, because there is no ``if __name__ == '__main__'`` guard.
+_create_wiki()
--- a/wiki.txt
+++ b/wiki.txt
--- a/wiktionary-forms-with-bases-and-tags.txt
+++ b/wiktionary-forms-with-bases-and-tags.txt