"""Module responsible for a morphosyntactic dictionary that uses a tsv file with NER tags."""
import logging
import random
import string
from collections import defaultdict
from typing import List, Optional, Type, Dict

from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary

_log = logging.getLogger(__name__)
class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """Morphosyntactic dictionary that uses a tsv file with NER tags as a source.

    Every line of the file holds four tab-separated fields, in this order:
    NER tag, inflected word, lemma, morphosyntactic tag. In memory the data is
    kept as a nested mapping ``ner_tag -> morpho_tag -> lemma -> word``.

    Example of a tsv file (columns are separated by tabs):
        name     Aaronom   Aaron     subst:pl:dat:m1
        name     Aaronami  Aaron     subst:pl:inst:m1
        name     Aaronach  Aaron     subst:pl:loc:m1
        country  Apolonie  Apolonia  subst:pl:voc:f
        country  Apolonii  Apolonia  subst:sg:dat:f
        country  Apolonii  Apolonia  subst:pl:gen:f
        country  Apolonii  Apolonia  subst:sg:loc:f
        city     Araba     Arab      subst:sg:gen:m2
        city     Arabie    Arab      subst:sg:voc:m2
        city     Arabem    Arab      subst:sg:inst:m2
    """

    def __init__(
        self,
        dictionary_path: Optional[str] = None,
        always_replace: bool = True,
    ) -> None:
        """Initializes NERFileMorphosyntacticDictionary.

        Args:
            dictionary_path (Optional[str], optional): Path to dictionary tsv file.
                Defaults to None, but the file is opened unconditionally, so
                a real path has to be supplied.
            always_replace (bool, optional): Whether to replace detection even if no
                word with matching morpho tag is found. Defaults to True.
        """
        super().__init__()
        self._always_replace = always_replace
        self._dictionary = self._from_file(dictionary_path)

    def _from_file(
        self, path_to_dictionary: str
    ) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Loads the replacement dictionary from a tsv file.

        Args:
            path_to_dictionary (str): Path to a UTF-8 encoded file with four
                tab-separated fields per line: NER tag, word, lemma and
                morphosyntactic tag.

        Returns:
            Dict[str, Dict[str, Dict[str, str]]]: Nested mapping
                ner_tag -> morpho_tag -> lemma -> word.

        Raises:
            ValueError: If a non-empty line does not have exactly four fields.
        """
        replacement_dictionary = defaultdict(lambda: defaultdict(dict))
        with open(path_to_dictionary, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                fields = line.split("\t")
                if len(fields) != 4:
                    raise ValueError(
                        f"{path_to_dictionary}:{line_number}: expected 4 "
                        f"tab-separated fields, got {len(fields)}"
                    )
                ner_tag, word, lemma, morpho_tag = fields
                replacement_dictionary[ner_tag][morpho_tag][lemma] = word
        return replacement_dictionary

    @staticmethod
    def _random_tag_and_lemma(forms_by_tag):
        """Picks a random (morpho_tag, lemma) pair, skipping tags without lemmas.

        Returns None when no tag has any lemma.
        """
        usable_tags = [tag for tag, forms in forms_by_tag.items() if forms]
        if not usable_tags:
            return None
        morpho_tag = random.choice(usable_tags)
        lemma = random.choice(list(forms_by_tag[morpho_tag]))
        return morpho_tag, lemma

    def get_supported_detection_classes(self) -> List[Type[Detection]]:
        """Returns a list of supported detection classes.

        Returns:
            List[Type[Detection]]: Classes looked up in DETECTION_CLASSES_MAP
                for every NER tag present in the loaded dictionary.
        """
        return [DETECTION_CLASSES_MAP[name] for name in self._dictionary]

    def get_random_replacement(
        self, original_entry: Detection
    ) -> Optional[Dict[str, str]]:
        """Returns a random replacement of original entry.

        Args:
            original_entry (Detection): Detection that should be replaced. Class
                should have MorphosyntacticInfoMixin

        Returns:
            Optional[Dict[str, str]]: Dictionary with replacement and morphosyntactic
                tag, or None when no replacement could be chosen. The dictionary
                has following keys:
                    word[str] -> replacement word
                    lemma[str] -> lemma of the replacement word
                    tag[str] -> morphosyntactic tag of the replacement word
                    type[str] -> type of the replacement word
        """
        entry_type = original_entry.TYPE_NAME
        morpho_tag = None
        lemma = None
        word = None

        # All reads go through .get(): the dictionary is a nested defaultdict,
        # so indexing a missing key would silently insert an empty entry.
        forms_by_tag = self._dictionary.get(entry_type)
        if forms_by_tag and isinstance(original_entry, MorphosyntacticInfoMixin):
            morpho_tag = original_entry.morpho_tag
            _log.info("entry_type %s morpho_tag %s", entry_type, morpho_tag)
            candidates = forms_by_tag.get(morpho_tag)
            _log.info("Dictionary %s", candidates)
            if candidates:
                lemma = random.choice(list(candidates))
                word = candidates[lemma]
                # NOTE(review): the lemma picked above is replaced by a random
                # 3-5 letter uppercase string, exactly as the original
                # statement sequence did; only `word` comes from the
                # dictionary. Confirm this is intended.
                letters = string.ascii_lowercase
                size = random.randint(3, 5)
                lemma = "".join(random.sample(list(letters), size)).upper()
            else:
                # No form with the requested tag: fall back to the base form
                # of a random lemma of the same type, under a random tag.
                picked = self._random_tag_and_lemma(forms_by_tag)
                if picked is not None:
                    morpho_tag, lemma = picked
                    word = lemma

        if word is None and self._always_replace:
            # Last resort: any word from any type that has at least one entry.
            usable_types = [
                name
                for name, tag_map in self._dictionary.items()
                if any(tag_map.values())
            ]
            if usable_types:
                entry_type = random.choice(usable_types)
                morpho_tag, lemma = self._random_tag_and_lemma(
                    self._dictionary[entry_type]
                )
                word = self._dictionary[entry_type][morpho_tag][lemma]

        if word is None:
            return None
        return {
            "word": word,
            "lemma": lemma,
            "tag": morpho_tag,
            "type": entry_type,
        }

    def get_random_replacements(
        self, original_entries: List[Detection]
    ) -> List[Optional[Dict[str, str]]]:
        """Returns a list of random replacements of original entries.

        !!!Important!!!
        It's assumed that all of the original entries have the same lemma.

        A single lemma that has a form for every morphosyntactic tag found in
        the entries is drawn and inflected per entry. When no such lemma
        exists (or no entry carries a tag), the replacement chosen for the
        first entry is used for every entry.

        Args:
            original_entries (List[Detection]): List of detections that should be
                replaced. All detections should have the same lemma and detection type

        Returns:
            List[Optional[Dict[str, str]]]: One item per original entry, in the
                same order; an empty list for empty input. Each dictionary has
                following keys:
                    word[str] -> replacement word
                    lemma[str] -> lemma of the replacement word
                    tag[str] -> morphosyntactic tag of the replacement word
                        (None for entries without morphosyntactic info)
                    type[str] -> type of the replacement word
        """
        if not original_entries:
            return []

        detection_type = original_entries[0].TYPE_NAME
        required_tags = {
            entry.morpho_tag
            for entry in original_entries
            if isinstance(entry, MorphosyntacticInfoMixin)
        }

        # .get() keeps the nested defaultdict free of accidental empty entries.
        forms_by_tag = self._dictionary.get(detection_type, {})
        possible_lemmas = set()
        if required_tags:
            possible_lemmas = set.intersection(
                *(set(forms_by_tag.get(tag, ())) for tag in required_tags)
            )

        if not possible_lemmas:
            shared = self.get_random_replacement(original_entries[0])
            # Give every position its own dict so that mutating one result
            # does not change the others.
            return [
                None if shared is None else dict(shared)
                for _ in original_entries
            ]

        # Sorted so that the draw depends only on the random state, not on
        # set iteration order.
        lemma = random.choice(sorted(possible_lemmas))
        replacements = []
        for entry in original_entries:
            if isinstance(entry, MorphosyntacticInfoMixin):
                morpho_tag = entry.morpho_tag
                word = forms_by_tag[morpho_tag][lemma]
            else:
                # No tag is known for this entry; use the base form.
                morpho_tag = None
                word = lemma
            replacements.append(
                {
                    "word": word,
                    "lemma": lemma,
                    "tag": morpho_tag,
                    "type": detection_type,
                }
            )
        return replacements