Merge branch 'fix1' into 'master'

Fix anonymizer errors. See merge request !13.

Merge branch 'fix1' into 'master'
Fix anonymizer errors. See merge request !13.
f879f239 · Paweł Walkowiak · c96cc8e3 · ba2b1885 · f879f239 · f879f239
Commit f879f239 authored 1 year ago by Paweł Walkowiak
--- a/src/dictionaries/morphosyntactic/ner_file.py
+++ b/src/dictionaries/morphosyntactic/ner_file.py
 """Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""
 import random
+import string
 from collections import defaultdict
 from typing import List, Optional, Type, Dict
 from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
 from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
+import logging
+_log = logging.getLogger(__name__)
 class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
                ner_tag, word, lemma, morpho_tag = line.split("\t")
                replacement_dictionary[ner_tag][morpho_tag][lemma] = word
+        replacement_dictionary = {k: dict(v) for k, v
+                                  in replacement_dictionary.items()
+                                  if v}  # freeze dict
        return replacement_dictionary
    def get_supported_detection_classes(self) -> List[Type[Detection]]:
@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
            if original_entry_type_name in self._dictionary:
                entry_type = original_entry_type_name
+                try:
-                if morpho_tag in self._dictionary[entry_type]:
+                    if entry_type in self._dictionary \
-                    lemma = random.choice(
+                            and morpho_tag in self._dictionary[entry_type] \
-                        list(self._dictionary[entry_type][morpho_tag].keys())
+                            and len(list(
-                    )
+                                    self._dictionary[
+                                        entry_type][morpho_tag].keys())
-                    word = self._dictionary[entry_type][morpho_tag][lemma]
+                                    ) > 0:
-                else:
+                        lemma = random.choice(
-                    morpho_tag = random.choice(
+                            list(self._dictionary[entry_type][morpho_tag].keys())
-                        list(self._dictionary[entry_type].keys())
+                        )
-                    )
-                    lemma = random.choice(
+                        word = self._dictionary[entry_type][morpho_tag][lemma]
-                        list(self._dictionary[entry_type][morpho_tag].keys())
+                    elif morpho_tag == "ign":  # unknown form
-                    )
+                        letters = string.ascii_lowercase
-                    word = lemma
+                        size = random.randint(3, 5)
+                        lemma = "".join(random.sample(
+                            list(letters), size)).upper()
+                        word = lemma
+                    else:
+                        morpho_tag = random.choice(
+                            list(self._dictionary[entry_type].keys())
+                        )
+                        lemma = random.choice(
+                            list(
+                                self._dictionary[entry_type][morpho_tag].keys()
+                            )
+                        )
+                        word = lemma
+                except IndexError as exp:
+                    _log.error(f"IndexError entry_type "
+                               f"{entry_type} morpho_tag {morpho_tag}")
+                    _log.error(exp)
+                    _log.error(f"Dictionary {self._dictionary[entry_type][morpho_tag]}")
        if word is None and self._always_replace:
            entry_type = random.choice(list(self._dictionary.keys()))
@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
                original_entries
            )
-        possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys())
+        possible_lemmas = set(
+            self._dictionary[detection_type][required_tags[0]].keys()
+        ) \
+            if detection_type in self._dictionary \
+            and required_tags[0] in self._dictionary[detection_type] \
+            else set()
        for tag in required_tags[1:]:
-            possible_lemmas.intersection_update(
+            keys = self._dictionary[detection_type][tag].keys() \
-                self._dictionary[detection_type][tag].keys()
+                if detection_type in self._dictionary \
-            )
+                and tag in self._dictionary[detection_type] \
+                else set()
+            if keys:
+                possible_lemmas.intersection_update(keys)
        if len(possible_lemmas) == 0:
            return [self.get_random_replacement(original_entries[0])] * len(
@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
        for entry in original_entries:
            if isinstance(entry, MorphosyntacticInfoMixin):
                morpho_tag = entry.morpho_tag
-                word = self._dictionary[detection_type][morpho_tag][lemma]
+                if detection_type in self._dictionary \
+                        and morpho_tag in self._dictionary[detection_type] \
+                        and lemma in \
+                        self._dictionary[detection_type][morpho_tag]:
+                    word = self._dictionary[detection_type][morpho_tag][lemma]
+                else:
+                    word = lemma
            else:
                word = lemma

--- a/src/replacers/date_replacer.py
+++ b/src/replacers/date_replacer.py
@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
                            month_name = months_map[random_month]
                            replacement.append(month_name)
                        elif entry[0] == DateDetection.AnnotationPart.OTHER:
-                            replacement.append(entry[1])
+                            if entry[1] is not None:
+                                replacement.append(entry[1])
+                        else:
+                            raise ValueError(f"Unknown format entry: {entry}")
                    replacement = "".join(replacement)
                    already_replaced[text[start:end]] = replacement

--- a/src/replacers/ner_replacer.py
+++ b/src/replacers/ner_replacer.py
@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
                )
                morpho_detections[key].append(detection)
            else:
-                key = (text[start:end], detection.TYPE_NAME)
+                key = (text[start:end], detection_entry.TYPE_NAME)
                non_morpho_detections[key].append(detection)
        # Replace morphosyntactic detections