Skip to content
Snippets Groups Projects
Commit f879f239 authored by Paweł Walkowiak
Browse files

Merge branch 'fix1' into 'master'

Fix anonymizer errors

See merge request !13
parents c96cc8e3 ba2b1885
1 merge request: !13 Fix anonymizer errors
Pipeline #15494 passed with stages in 1 minute and 37 seconds
"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.""" """Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""
import random import random
import string
from collections import defaultdict from collections import defaultdict
from typing import List, Optional, Type, Dict from typing import List, Optional, Type, Dict
from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
import logging
_log = logging.getLogger(__name__)
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
"""Morphosyntactic dictionary that uses a tsv file with NER tags as a source. """Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
...@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
ner_tag, word, lemma, morpho_tag = line.split("\t") ner_tag, word, lemma, morpho_tag = line.split("\t")
replacement_dictionary[ner_tag][morpho_tag][lemma] = word replacement_dictionary[ner_tag][morpho_tag][lemma] = word
replacement_dictionary = {k: dict(v) for k, v
in replacement_dictionary.items()
if v} # freeze dict
return replacement_dictionary return replacement_dictionary
def get_supported_detection_classes(self) -> List[Type[Detection]]: def get_supported_detection_classes(self) -> List[Type[Detection]]:
...@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
if original_entry_type_name in self._dictionary: if original_entry_type_name in self._dictionary:
entry_type = original_entry_type_name entry_type = original_entry_type_name
try:
if morpho_tag in self._dictionary[entry_type]: if entry_type in self._dictionary \
lemma = random.choice( and morpho_tag in self._dictionary[entry_type] \
list(self._dictionary[entry_type][morpho_tag].keys()) and len(list(
) self._dictionary[
entry_type][morpho_tag].keys())
word = self._dictionary[entry_type][morpho_tag][lemma] ) > 0:
else: lemma = random.choice(
morpho_tag = random.choice( list(self._dictionary[entry_type][morpho_tag].keys())
list(self._dictionary[entry_type].keys()) )
)
lemma = random.choice( word = self._dictionary[entry_type][morpho_tag][lemma]
list(self._dictionary[entry_type][morpho_tag].keys()) elif morpho_tag == "ign": # unknown form
) letters = string.ascii_lowercase
word = lemma size = random.randint(3, 5)
lemma = "".join(random.sample(
list(letters), size)).upper()
word = lemma
else:
morpho_tag = random.choice(
list(self._dictionary[entry_type].keys())
)
lemma = random.choice(
list(
self._dictionary[entry_type][morpho_tag].keys()
)
)
word = lemma
except IndexError as exp:
_log.error(f"IndexError entry_type "
f"{entry_type} morpho_tag {morpho_tag}")
_log.error(exp)
_log.error(f"Dictionary {self._dictionary[entry_type][morpho_tag]}")
if word is None and self._always_replace: if word is None and self._always_replace:
entry_type = random.choice(list(self._dictionary.keys())) entry_type = random.choice(list(self._dictionary.keys()))
...@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
original_entries original_entries
) )
possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys()) possible_lemmas = set(
self._dictionary[detection_type][required_tags[0]].keys()
) \
if detection_type in self._dictionary \
and required_tags[0] in self._dictionary[detection_type] \
else set()
for tag in required_tags[1:]: for tag in required_tags[1:]:
possible_lemmas.intersection_update( keys = self._dictionary[detection_type][tag].keys() \
self._dictionary[detection_type][tag].keys() if detection_type in self._dictionary \
) and tag in self._dictionary[detection_type] \
else set()
if keys:
possible_lemmas.intersection_update(keys)
if len(possible_lemmas) == 0: if len(possible_lemmas) == 0:
return [self.get_random_replacement(original_entries[0])] * len( return [self.get_random_replacement(original_entries[0])] * len(
...@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
for entry in original_entries: for entry in original_entries:
if isinstance(entry, MorphosyntacticInfoMixin): if isinstance(entry, MorphosyntacticInfoMixin):
morpho_tag = entry.morpho_tag morpho_tag = entry.morpho_tag
word = self._dictionary[detection_type][morpho_tag][lemma] if detection_type in self._dictionary \
and morpho_tag in self._dictionary[detection_type] \
and lemma in \
self._dictionary[detection_type][morpho_tag]:
word = self._dictionary[detection_type][morpho_tag][lemma]
else:
word = lemma
else: else:
word = lemma word = lemma
......
...@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface): ...@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
month_name = months_map[random_month] month_name = months_map[random_month]
replacement.append(month_name) replacement.append(month_name)
elif entry[0] == DateDetection.AnnotationPart.OTHER: elif entry[0] == DateDetection.AnnotationPart.OTHER:
replacement.append(entry[1]) if entry[1] is not None:
replacement.append(entry[1])
else:
raise ValueError(f"Unknown format entry: {entry}")
replacement = "".join(replacement) replacement = "".join(replacement)
already_replaced[text[start:end]] = replacement already_replaced[text[start:end]] = replacement
......
...@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface): ...@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
) )
morpho_detections[key].append(detection) morpho_detections[key].append(detection)
else: else:
key = (text[start:end], detection.TYPE_NAME) key = (text[start:end], detection_entry.TYPE_NAME)
non_morpho_detections[key].append(detection) non_morpho_detections[key].append(detection)
# Replace morphosyntactic detections # Replace morphosyntactic detections
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment