"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.""" import random from collections import defaultdict from typing import List, Optional, Type from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): """Morphosyntactic dictionary that uses a tsv file with NER tags as a source. Example of a tsv file: name Aaronom Aaron subst:pl:dat:m1 name Aaronami Aaron subst:pl:inst:m1 name Aaronach Aaron subst:pl:loc:m1 country Apolonie Apolonia subst:pl:voc:f country Apolonii Apolonia subst:sg:dat:f country Apolonii Apolonia subst:pl:gen:f country Apolonii Apolonia subst:sg:loc:f city Araba Arab subst:sg:gen:m2 city Arabie Arab subst:sg:voc:m2 city Arabem Arab subst:sg:inst:m2 """ def __init__( self, dictionary_path: Optional[str] = None, always_replace=True, ) -> None: """Initializes NERFileMorphosyntacticDictionary. Args: dictionary_path (Optional[str], optional): Path to dictionary tsv file. Defaults to None. always_replace (bool, optional): Wheter to replace detection even if no word with matching morpho tag is found. Defaults to True. """ super().__init__() self._dictionary = None self._always_replace = always_replace self._from_file(dictionary_path) def _from_file(self, path_to_dictionary: str) -> None: replacement_dictionary = defaultdict(lambda: defaultdict(dict)) with open(path_to_dictionary, "r", encoding="utf-8") as file: for line in file: line = line.strip() ner_tag, word, lemma, morpho_tag = line.split("\t") replacement_dictionary[ner_tag][morpho_tag][lemma] = word self._dictionary = replacement_dictionary def get_supported_detection_classes(self) -> List[Type[Detection]]: """Returns a list of supported detection classess. Returns: List[Type[Detection]]: List of detection classes that are supported """ return [DETECTION_CLASSES_MAP[name] for name in self._dictionary.keys()] def get_random_replacement(self, original_entry: Detection) -> Optional[str]: """Returns a random replacement of original entry. Args: original_entry (Detection): Detection that should be replaced. Class should have MorphosyntacticInfoMixin Returns: Optional[str]: Text that should replace the original entry """ original_entry_type = type(original_entry) original_entry_type_name = original_entry_type.TYPE_NAME result = None if issubclass(original_entry_type, MorphosyntacticInfoMixin): morpho_tag = original_entry.morpho_tag if original_entry_type_name in self._dictionary: if morpho_tag in self._dictionary[original_entry_type_name]: result = random.choice( list( self._dictionary[original_entry_type_name][ morpho_tag ].values() ) ) else: morpho_tag = result = random.choice( list(self._dictionary[original_entry_type_name].keys()) ) result = random.choice( list( self._dictionary[original_entry_type_name][ morpho_tag ].keys() ) ) if result is None and self._always_replace: random_type = random.choice(list(self._dictionary.keys())) random_tag = random.choice(list(self._dictionary[random_type].keys())) result = random.choice( list(self._dictionary[random_type][random_tag].values()) ) return result