Skip to content
Snippets Groups Projects
Commit f879f239 authored by Paweł Walkowiak
Browse files

Merge branch 'fix1' into 'master'

Fix anonymizer errors

See merge request !13
parents c96cc8e3 ba2b1885
Branches
1 merge request: !13 Fix anonymizer errors
Pipeline #15494 passed with stages in 1 minute and 37 seconds
"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""
import random
import string
from collections import defaultdict
from typing import List, Optional, Type, Dict
from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
import logging
_log = logging.getLogger(__name__)
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
"""Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
......@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
ner_tag, word, lemma, morpho_tag = line.split("\t")
replacement_dictionary[ner_tag][morpho_tag][lemma] = word
replacement_dictionary = {k: dict(v) for k, v
in replacement_dictionary.items()
if v} # freeze dict
return replacement_dictionary
def get_supported_detection_classes(self) -> List[Type[Detection]]:
......@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
if original_entry_type_name in self._dictionary:
entry_type = original_entry_type_name
if morpho_tag in self._dictionary[entry_type]:
lemma = random.choice(
list(self._dictionary[entry_type][morpho_tag].keys())
)
word = self._dictionary[entry_type][morpho_tag][lemma]
else:
morpho_tag = random.choice(
list(self._dictionary[entry_type].keys())
)
lemma = random.choice(
list(self._dictionary[entry_type][morpho_tag].keys())
)
word = lemma
try:
if entry_type in self._dictionary \
and morpho_tag in self._dictionary[entry_type] \
and len(list(
self._dictionary[
entry_type][morpho_tag].keys())
) > 0:
lemma = random.choice(
list(self._dictionary[entry_type][morpho_tag].keys())
)
word = self._dictionary[entry_type][morpho_tag][lemma]
elif morpho_tag == "ign": # unknown form
letters = string.ascii_lowercase
size = random.randint(3, 5)
lemma = "".join(random.sample(
list(letters), size)).upper()
word = lemma
else:
morpho_tag = random.choice(
list(self._dictionary[entry_type].keys())
)
lemma = random.choice(
list(
self._dictionary[entry_type][morpho_tag].keys()
)
)
word = lemma
except IndexError as exp:
_log.error(f"IndexError entry_type "
f"{entry_type} morpho_tag {morpho_tag}")
_log.error(exp)
_log.error(f"Dictionary {self._dictionary[entry_type][morpho_tag]}")
if word is None and self._always_replace:
entry_type = random.choice(list(self._dictionary.keys()))
......@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
original_entries
)
possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys())
possible_lemmas = set(
self._dictionary[detection_type][required_tags[0]].keys()
) \
if detection_type in self._dictionary \
and required_tags[0] in self._dictionary[detection_type] \
else set()
for tag in required_tags[1:]:
possible_lemmas.intersection_update(
self._dictionary[detection_type][tag].keys()
)
keys = self._dictionary[detection_type][tag].keys() \
if detection_type in self._dictionary \
and tag in self._dictionary[detection_type] \
else set()
if keys:
possible_lemmas.intersection_update(keys)
if len(possible_lemmas) == 0:
return [self.get_random_replacement(original_entries[0])] * len(
......@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
for entry in original_entries:
if isinstance(entry, MorphosyntacticInfoMixin):
morpho_tag = entry.morpho_tag
word = self._dictionary[detection_type][morpho_tag][lemma]
if detection_type in self._dictionary \
and morpho_tag in self._dictionary[detection_type] \
and lemma in \
self._dictionary[detection_type][morpho_tag]:
word = self._dictionary[detection_type][morpho_tag][lemma]
else:
word = lemma
else:
word = lemma
......
......@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
month_name = months_map[random_month]
replacement.append(month_name)
elif entry[0] == DateDetection.AnnotationPart.OTHER:
replacement.append(entry[1])
if entry[1] is not None:
replacement.append(entry[1])
else:
raise ValueError(f"Unknown format entry: {entry}")
replacement = "".join(replacement)
already_replaced[text[start:end]] = replacement
......
......@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
)
morpho_detections[key].append(detection)
else:
key = (text[start:end], detection.TYPE_NAME)
key = (text[start:end], detection_entry.TYPE_NAME)
non_morpho_detections[key].append(detection)
# Replace morphosyntactic detections
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment