"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.""" import random from collections import defaultdict from typing import List, Optional, Type, Dict from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): """Morphosyntactic dictionary that uses a tsv file with NER tags as a source. Example of a tsv file: name Aaronom Aaron subst:pl:dat:m1 name Aaronami Aaron subst:pl:inst:m1 name Aaronach Aaron subst:pl:loc:m1 country Apolonie Apolonia subst:pl:voc:f country Apolonii Apolonia subst:sg:dat:f country Apolonii Apolonia subst:pl:gen:f country Apolonii Apolonia subst:sg:loc:f city Araba Arab subst:sg:gen:m2 city Arabie Arab subst:sg:voc:m2 city Arabem Arab subst:sg:inst:m2 """ def __init__( self, dictionary_path: Optional[str] = None, always_replace=True, ) -> None: """Initializes NERFileMorphosyntacticDictionary. Args: dictionary_path (Optional[str], optional): Path to dictionary tsv file. Defaults to None. always_replace (bool, optional): Wheter to replace detection even if no word with matching morpho tag is found. Defaults to True. """ super().__init__() self._dictionary = None self._always_replace = always_replace self._dictionary = self._from_file(dictionary_path) def _from_file(self, path_to_dictionary: str) -> None: replacement_dictionary = defaultdict(lambda: defaultdict(dict)) with open(path_to_dictionary, "r", encoding="utf-8") as file: for line in file: line = line.strip() ner_tag, word, lemma, morpho_tag = line.split("\t") replacement_dictionary[ner_tag][morpho_tag][lemma] = word return replacement_dictionary def get_supported_detection_classes(self) -> List[Type[Detection]]: """Returns a list of supported detection classess. Returns: List[Type[Detection]]: List of detection classes that are supported """ return [DETECTION_CLASSES_MAP[name] for name in self._dictionary.keys()] def get_random_replacement( self, original_entry: Detection ) -> Optional[Dict[str, str]]: """Returns a random replacement of original entry. Args: original_entry (Detection): Detection that should be replaced. Class should have MorphosyntacticInfoMixin Returns: Optional[Dict[str, str]]: Dictionary with replacement and morphosyntactic tag. The dictionary should have following keys: word[str] -> replacement word lemma[str] -> lemma of the replacement word tag[str] -> morphosyntactic tag of the replacement word type[str] -> type of the replacement word """ original_entry_type = type(original_entry) original_entry_type_name = original_entry.TYPE_NAME word = None if issubclass(original_entry_type, MorphosyntacticInfoMixin): morpho_tag = original_entry.morpho_tag if original_entry_type_name in self._dictionary: entry_type = original_entry_type_name if morpho_tag in self._dictionary[entry_type]: lemma = random.choice( list(self._dictionary[entry_type][morpho_tag].keys()) ) word = self._dictionary[entry_type][morpho_tag][lemma] else: morpho_tag = random.choice( list(self._dictionary[entry_type].keys()) ) lemma = random.choice( list(self._dictionary[entry_type][morpho_tag].keys()) ) word = lemma if word is None and self._always_replace: entry_type = random.choice(list(self._dictionary.keys())) morpho_tag = random.choice(list(self._dictionary[entry_type].keys())) lemma = random.choice(list(self._dictionary[entry_type][morpho_tag].keys())) word = self._dictionary[entry_type][morpho_tag][lemma] return { "word": word, "lemma": lemma, "tag": morpho_tag, "type": entry_type, } def get_random_replacements( self, original_entries: List[Detection] ) -> List[Optional[Dict[str, str]]]: """Returns a list of random replacements of original entries. !!!Important!!! It's assumed that all of the original entries have the same lemma. Args: original_entries (List[Detection]): List of detections that should be replaced. All detections should have the same lemma and detection type Returns: Optional[Dict[str, str]]: List of dictionaries with replacement and morphosyntactic tag. The dictionary should have following keys: word[str] -> replacement word lemma[str] -> lemma of the replacement word tag[str] -> morphosyntactic tag of the replacement word type[str] -> type of the replacement word """ assert len(original_entries) > 0 detection_type = original_entries[0].TYPE_NAME required_tags = set() for entry in original_entries: if isinstance(entry, MorphosyntacticInfoMixin): required_tags.add(entry.morpho_tag) required_tags = list(required_tags) if len(required_tags) == 0: return [self.get_random_replacement(original_entries[0])] * len( original_entries ) possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys()) for tag in required_tags[1:]: possible_lemmas.intersection_update( self._dictionary[detection_type][tag].keys() ) if len(possible_lemmas) == 0: return [self.get_random_replacement(original_entries[0])] * len( original_entries ) lemma = random.choice(list(possible_lemmas)) replacements = [] for entry in original_entries: if isinstance(entry, MorphosyntacticInfoMixin): morpho_tag = entry.morpho_tag word = self._dictionary[detection_type][morpho_tag][lemma] else: word = lemma replacements.append( { "word": word, "lemma": lemma, "tag": morpho_tag, "type": detection_type, } ) return replacements