import random
from collections import defaultdict
from typing import Dict, List, Optional, Type

from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary


class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """Replacement dictionary backed by a tab-separated file.

    Every non-blank line of the file holds four tab-separated fields in the
    order ``ner_tag, word, lemma, morpho_tag``. Entries are stored as
    ``_dictionary[ner_tag][morpho_tag][lemma] = word``, so a later line with
    the same (ner_tag, morpho_tag, lemma) triple overwrites an earlier one.
    """

    def __init__(
        self,
        dictionary_path: Optional[str] = None,
        always_replace: bool = True,
    ) -> None:
        """Load the dictionary file.

        Args:
            dictionary_path: Path to the tab-separated dictionary file. The
                value is handed to ``open()`` as-is, so despite the ``None``
                default a real path has to be supplied.
            always_replace: When true, ``get_random_replacement`` falls back
                to an entry of a random type and tag if no type-specific
                replacement was found.
        """
        super().__init__()
        self._dictionary = None
        self._always_replace = always_replace
        self._from_file(dictionary_path)

    def _from_file(self, path_to_dictionary: str) -> None:
        """Parse the file into ``self._dictionary``.

        Blank (or whitespace-only) lines are skipped.

        Raises:
            ValueError: If a non-blank line does not consist of exactly four
                tab-separated fields.
        """
        replacement_dictionary = defaultdict(lambda: defaultdict(dict))

        with open(path_to_dictionary, "r", encoding="utf-8") as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line:
                    # A stray empty line should not abort the whole load.
                    continue

                fields = line.split("\t")
                if len(fields) != 4:
                    raise ValueError(
                        f"{path_to_dictionary}:{line_number}: expected 4 "
                        f"tab-separated fields, got {len(fields)}"
                    )

                ner_tag, word, lemma, morpho_tag = fields
                replacement_dictionary[ner_tag][morpho_tag][lemma] = word

        self._dictionary = replacement_dictionary

    def get_supported_detection_classes(self) -> List[Type[Detection]]:
        """
        Returns a list of supported detection classes
        """
        # Each NER tag found in the file is looked up in DETECTION_CLASSES_MAP;
        # a tag missing from that map raises KeyError.
        return [DETECTION_CLASSES_MAP[name] for name in self._dictionary]

    def get_random_replacement(self, original_entry: Detection) -> Optional[str]:
        """Pick a random replacement string for ``original_entry``.

        Lookup order:

        1. If the entry carries morphosyntactic info and its ``TYPE_NAME`` is
           in the dictionary, a random word stored under the entry's
           ``morpho_tag`` is returned.
        2. If that tag is unknown for the type, a random tag of the same type
           is chosen and one of its lemmas (dictionary keys) is returned.
           NOTE(review): this branch yields a lemma while the other two yield
           a word -- kept as in the original; confirm it is intentional.
        3. If nothing was found and ``always_replace`` is set, a word of a
           random type and random tag is returned.

        Returns:
            The replacement, or ``None`` when no replacement was selected
            (including the case of an empty dictionary).
        """
        type_name = type(original_entry).TYPE_NAME
        result = None

        if isinstance(original_entry, MorphosyntacticInfoMixin):
            morpho_tag = original_entry.morpho_tag

            if type_name in self._dictionary:
                entries_by_tag = self._dictionary[type_name]

                if morpho_tag in entries_by_tag:
                    result = random.choice(list(entries_by_tag[morpho_tag].values()))
                else:
                    fallback_tag = random.choice(list(entries_by_tag))
                    result = random.choice(list(entries_by_tag[fallback_tag]))

        # The emptiness guard keeps random.choice from raising IndexError when
        # the file contained no entries at all.
        if result is None and self._always_replace and self._dictionary:
            random_type = random.choice(list(self._dictionary))
            random_tag = random.choice(list(self._dictionary[random_type]))
            result = random.choice(
                list(self._dictionary[random_type][random_tag].values())
            )

        return result