Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • nlpworkers/anonymizer
1 result
Show changes
Commits on Source (2)
"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""
import random
import string
from collections import defaultdict
from typing import List, Optional, Type, Dict
from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
import logging
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
"""Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
......@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
ner_tag, word, lemma, morpho_tag = line.split("\t")
replacement_dictionary[ner_tag][morpho_tag][lemma] = word
replacement_dictionary = {k: dict(v) for k, v
in replacement_dictionary.items()
if v} # freeze dict
return replacement_dictionary
def get_supported_detection_classes(self) -> List[Type[Detection]]:
......@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
if original_entry_type_name in self._dictionary:
entry_type = original_entry_type_name
if morpho_tag in self._dictionary[entry_type]:
lemma = random.choice(
list(self._dictionary[entry_type][morpho_tag].keys())
)
word = self._dictionary[entry_type][morpho_tag][lemma]
else:
morpho_tag = random.choice(
list(self._dictionary[entry_type].keys())
)
lemma = random.choice(
list(self._dictionary[entry_type][morpho_tag].keys())
)
word = lemma
try:
if entry_type in self._dictionary \
and morpho_tag in self._dictionary[entry_type] \
and len(list(
self._dictionary[
entry_type][morpho_tag].keys())
) > 0:
lemma = random.choice(
list(self._dictionary[entry_type][morpho_tag].keys())
)
word = self._dictionary[entry_type][morpho_tag][lemma]
elif morpho_tag == "ign": # unknown form
letters = string.ascii_lowercase
size = random.randint(3, 5)
lemma = "".join(random.sample(
list(letters), size)).upper()
word = lemma
else:
morpho_tag = random.choice(
list(self._dictionary[entry_type].keys())
)
lemma = random.choice(
list(
self._dictionary[entry_type][morpho_tag].keys()
)
)
word = lemma
except IndexError as exp:
_log.error(f"IndexError entry_type "
f"{entry_type} morpho_tag {morpho_tag}")
_log.error(exp)
_log.error(f"Dictionary {self._dictionary[entry_type][morpho_tag]}")
if word is None and self._always_replace:
entry_type = random.choice(list(self._dictionary.keys()))
......@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
original_entries
)
possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys())
possible_lemmas = set(
self._dictionary[detection_type][required_tags[0]].keys()
) \
if detection_type in self._dictionary \
and required_tags[0] in self._dictionary[detection_type] \
else set()
for tag in required_tags[1:]:
possible_lemmas.intersection_update(
self._dictionary[detection_type][tag].keys()
)
keys = self._dictionary[detection_type][tag].keys() \
if detection_type in self._dictionary \
and tag in self._dictionary[detection_type] \
else set()
if keys:
possible_lemmas.intersection_update(keys)
if len(possible_lemmas) == 0:
return [self.get_random_replacement(original_entries[0])] * len(
......@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
for entry in original_entries:
if isinstance(entry, MorphosyntacticInfoMixin):
morpho_tag = entry.morpho_tag
word = self._dictionary[detection_type][morpho_tag][lemma]
if detection_type in self._dictionary \
and morpho_tag in self._dictionary[detection_type] \
and lemma in \
self._dictionary[detection_type][morpho_tag]:
word = self._dictionary[detection_type][morpho_tag][lemma]
else:
word = lemma
else:
word = lemma
......
......@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
month_name = months_map[random_month]
replacement.append(month_name)
elif entry[0] == DateDetection.AnnotationPart.OTHER:
replacement.append(entry[1])
if entry[1] is not None:
replacement.append(entry[1])
else:
raise ValueError(f"Unknown format entry: {entry}")
replacement = "".join(replacement)
already_replaced[text[start:end]] = replacement
......
......@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
)
morpho_detections[key].append(detection)
else:
key = (text[start:end], detection.TYPE_NAME)
key = (text[start:end], detection_entry.TYPE_NAME)
non_morpho_detections[key].append(detection)
# Replace morphosyntactic detections
......