Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • nlpworkers/anonymizer
1 result
Show changes
Commits on Source (2)
"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.""" """Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""
import random import random
import string
from collections import defaultdict from collections import defaultdict
from typing import List, Optional, Type, Dict from typing import List, Optional, Type, Dict
from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
import logging
_log = logging.getLogger(__name__)
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
"""Morphosyntactic dictionary that uses a tsv file with NER tags as a source. """Morphosyntactic dictionary that uses a tsv file with NER tags as a source.
...@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
ner_tag, word, lemma, morpho_tag = line.split("\t") ner_tag, word, lemma, morpho_tag = line.split("\t")
replacement_dictionary[ner_tag][morpho_tag][lemma] = word replacement_dictionary[ner_tag][morpho_tag][lemma] = word
replacement_dictionary = {k: dict(v) for k, v
in replacement_dictionary.items()
if v} # freeze dict
return replacement_dictionary return replacement_dictionary
def get_supported_detection_classes(self) -> List[Type[Detection]]: def get_supported_detection_classes(self) -> List[Type[Detection]]:
...@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
if original_entry_type_name in self._dictionary: if original_entry_type_name in self._dictionary:
entry_type = original_entry_type_name entry_type = original_entry_type_name
try:
if morpho_tag in self._dictionary[entry_type]: if entry_type in self._dictionary \
lemma = random.choice( and morpho_tag in self._dictionary[entry_type] \
list(self._dictionary[entry_type][morpho_tag].keys()) and len(list(
) self._dictionary[
entry_type][morpho_tag].keys())
word = self._dictionary[entry_type][morpho_tag][lemma] ) > 0:
else: lemma = random.choice(
morpho_tag = random.choice( list(self._dictionary[entry_type][morpho_tag].keys())
list(self._dictionary[entry_type].keys()) )
)
lemma = random.choice( word = self._dictionary[entry_type][morpho_tag][lemma]
list(self._dictionary[entry_type][morpho_tag].keys()) elif morpho_tag == "ign": # unknown form
) letters = string.ascii_lowercase
word = lemma size = random.randint(3, 5)
lemma = "".join(random.sample(
list(letters), size)).upper()
word = lemma
else:
morpho_tag = random.choice(
list(self._dictionary[entry_type].keys())
)
lemma = random.choice(
list(
self._dictionary[entry_type][morpho_tag].keys()
)
)
word = lemma
except IndexError as exp:
_log.error(f"IndexError entry_type "
f"{entry_type} morpho_tag {morpho_tag}")
_log.error(exp)
_log.error(f"Dictionary {self._dictionary[entry_type][morpho_tag]}")
if word is None and self._always_replace: if word is None and self._always_replace:
entry_type = random.choice(list(self._dictionary.keys())) entry_type = random.choice(list(self._dictionary.keys()))
...@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
original_entries original_entries
) )
possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys()) possible_lemmas = set(
self._dictionary[detection_type][required_tags[0]].keys()
) \
if detection_type in self._dictionary \
and required_tags[0] in self._dictionary[detection_type] \
else set()
for tag in required_tags[1:]: for tag in required_tags[1:]:
possible_lemmas.intersection_update( keys = self._dictionary[detection_type][tag].keys() \
self._dictionary[detection_type][tag].keys() if detection_type in self._dictionary \
) and tag in self._dictionary[detection_type] \
else set()
if keys:
possible_lemmas.intersection_update(keys)
if len(possible_lemmas) == 0: if len(possible_lemmas) == 0:
return [self.get_random_replacement(original_entries[0])] * len( return [self.get_random_replacement(original_entries[0])] * len(
...@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ...@@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
for entry in original_entries: for entry in original_entries:
if isinstance(entry, MorphosyntacticInfoMixin): if isinstance(entry, MorphosyntacticInfoMixin):
morpho_tag = entry.morpho_tag morpho_tag = entry.morpho_tag
word = self._dictionary[detection_type][morpho_tag][lemma] if detection_type in self._dictionary \
and morpho_tag in self._dictionary[detection_type] \
and lemma in \
self._dictionary[detection_type][morpho_tag]:
word = self._dictionary[detection_type][morpho_tag][lemma]
else:
word = lemma
else: else:
word = lemma word = lemma
......
...@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface): ...@@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface):
month_name = months_map[random_month] month_name = months_map[random_month]
replacement.append(month_name) replacement.append(month_name)
elif entry[0] == DateDetection.AnnotationPart.OTHER: elif entry[0] == DateDetection.AnnotationPart.OTHER:
replacement.append(entry[1]) if entry[1] is not None:
replacement.append(entry[1])
else:
raise ValueError(f"Unknown format entry: {entry}")
replacement = "".join(replacement) replacement = "".join(replacement)
already_replaced[text[start:end]] = replacement already_replaced[text[start:end]] = replacement
......
...@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface): ...@@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface):
) )
morpho_detections[key].append(detection) morpho_detections[key].append(detection)
else: else:
key = (text[start:end], detection.TYPE_NAME) key = (text[start:end], detection_entry.TYPE_NAME)
non_morpho_detections[key].append(detection) non_morpho_detections[key].append(detection)
# Replace morphosyntactic detections # Replace morphosyntactic detections
......