diff --git a/src/dictionaries/morphosyntactic/ner_file.py b/src/dictionaries/morphosyntactic/ner_file.py index 8fe2a78ce94b78f5343f28bd4e410ade3f2ff4e7..c7a8cdf3b7380c9efa8132932ef61b2847cae24e 100644 --- a/src/dictionaries/morphosyntactic/ner_file.py +++ b/src/dictionaries/morphosyntactic/ner_file.py @@ -1,12 +1,17 @@ """Module responsible for Morphosyntactic dict that uses a tsv file with NER tags.""" import random +import string from collections import defaultdict from typing import List, Optional, Type, Dict from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary +import logging + +_log = logging.getLogger(__name__) + class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): """Morphosyntactic dictionary that uses a tsv file with NER tags as a source. @@ -52,6 +57,9 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): ner_tag, word, lemma, morpho_tag = line.split("\t") replacement_dictionary[ner_tag][morpho_tag][lemma] = word + replacement_dictionary = {k: dict(v) for k, v + in replacement_dictionary.items() + if v} # freeze dict return replacement_dictionary def get_supported_detection_classes(self) -> List[Type[Detection]]: @@ -92,21 +100,39 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): if original_entry_type_name in self._dictionary: entry_type = original_entry_type_name - - if morpho_tag in self._dictionary[entry_type]: - lemma = random.choice( - list(self._dictionary[entry_type][morpho_tag].keys()) - ) - - word = self._dictionary[entry_type][morpho_tag][lemma] - else: - morpho_tag = random.choice( - list(self._dictionary[entry_type].keys()) - ) - lemma = random.choice( - list(self._dictionary[entry_type][morpho_tag].keys()) - ) - word = lemma + try: + if entry_type in self._dictionary \ + and morpho_tag in self._dictionary[entry_type] \ + and len(list( + self._dictionary[ + entry_type][morpho_tag].keys()) + ) > 0: + lemma = random.choice( + list(self._dictionary[entry_type][morpho_tag].keys()) + ) + + word = self._dictionary[entry_type][morpho_tag][lemma] + elif morpho_tag == "ign": # unknown form + letters = string.ascii_lowercase + size = random.randint(3, 5) + lemma = "".join(random.sample( + list(letters), size)).upper() + word = lemma + else: + morpho_tag = random.choice( + list(self._dictionary[entry_type].keys()) + ) + lemma = random.choice( + list( + self._dictionary[entry_type][morpho_tag].keys() + ) + ) + word = lemma + except IndexError as exp: + _log.error(f"IndexError entry_type " + f"{entry_type} morpho_tag {morpho_tag}") + _log.error(exp) + _log.error(f"Dictionary {self._dictionary[entry_type][morpho_tag]}") if word is None and self._always_replace: entry_type = random.choice(list(self._dictionary.keys())) @@ -157,11 +183,19 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): original_entries ) - possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys()) + possible_lemmas = set( + self._dictionary[detection_type][required_tags[0]].keys() + ) \ + if detection_type in self._dictionary \ + and required_tags[0] in self._dictionary[detection_type] \ + else set() for tag in required_tags[1:]: - possible_lemmas.intersection_update( - self._dictionary[detection_type][tag].keys() - ) + keys = self._dictionary[detection_type][tag].keys() \ + if detection_type in self._dictionary \ + and tag in self._dictionary[detection_type] \ + else set() + if keys: + possible_lemmas.intersection_update(keys) if len(possible_lemmas) == 0: return [self.get_random_replacement(original_entries[0])] * len( @@ -174,7 +208,13 @@ class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary): for entry in original_entries: if isinstance(entry, MorphosyntacticInfoMixin): morpho_tag = entry.morpho_tag - word = self._dictionary[detection_type][morpho_tag][lemma] + if detection_type in self._dictionary \ + and morpho_tag in self._dictionary[detection_type] \ + and lemma in \ + self._dictionary[detection_type][morpho_tag]: + word = self._dictionary[detection_type][morpho_tag][lemma] + else: + word = lemma else: word = lemma diff --git a/src/replacers/date_replacer.py b/src/replacers/date_replacer.py index 22bb7f3b53e4be83918b2f50b560786c1d85435a..06ea06e945c7ab58280f2cd89ab17100eaee502f 100644 --- a/src/replacers/date_replacer.py +++ b/src/replacers/date_replacer.py @@ -92,8 +92,10 @@ class DateReplacer(ReplacerInterface): month_name = months_map[random_month] replacement.append(month_name) elif entry[0] == DateDetection.AnnotationPart.OTHER: - replacement.append(entry[1]) - + if entry[1] is not None: + replacement.append(entry[1]) + else: + raise ValueError(f"Unknown format entry: {entry}") replacement = "".join(replacement) already_replaced[text[start:end]] = replacement diff --git a/src/replacers/ner_replacer.py b/src/replacers/ner_replacer.py index fb212b3ef60ff6c9a5ba917ad0cee2a5ba144d3b..b2d5564469c4613be4211ba40af69a866a15be1a 100644 --- a/src/replacers/ner_replacer.py +++ b/src/replacers/ner_replacer.py @@ -61,7 +61,7 @@ class NERReplacer(ReplacerInterface): ) morpho_detections[key].append(detection) else: - key = (text[start:end], detection.TYPE_NAME) + key = (text[start:end], detection_entry.TYPE_NAME) non_morpho_detections[key].append(detection) # Replace morphosyntactic detections