# ner_file.py

"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""

import random
import string
from collections import defaultdict
from typing import List, Optional, Type, Dict

from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary

import logging

_log = logging.getLogger(__name__)


class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """Morphosyntactic dictionary that uses a tsv file with NER tags as a source.

    Every non-empty line of the file holds four tab-separated columns, in this
    order: NER tag, inflected word, lemma, morphosyntactic tag.

    Example of a tsv file:
    name	Aaronom	Aaron	subst:pl:dat:m1
    name	Aaronami	Aaron	subst:pl:inst:m1
    name	Aaronach	Aaron	subst:pl:loc:m1
    country	Apolonie	Apolonia	subst:pl:voc:f
    country	Apolonii	Apolonia	subst:sg:dat:f
    country	Apolonii	Apolonia	subst:pl:gen:f
    country	Apolonii	Apolonia	subst:sg:loc:f
    city	Araba	Arab	subst:sg:gen:m2
    city	Arabie	Arab	subst:sg:voc:m2
    city	Arabem	Arab	subst:sg:inst:m2

    The file is loaded into a nested mapping
    ``{ner_tag: {morpho_tag: {lemma: word}}}``. When the same
    (ner_tag, morpho_tag, lemma) combination occurs more than once, the last
    line wins.
    """

    def __init__(
        self,
        dictionary_path: Optional[str] = None,
        always_replace=True,
    ) -> None:
        """Initializes NERFileMorphosyntacticDictionary.

        Args:
            dictionary_path (Optional[str], optional): Path to dictionary tsv file.
                    Defaults to None. The value is passed straight to ``open()``,
                    so leaving it as None fails with the error ``open()`` raises.
            always_replace (bool, optional): Whether to fall back to a random
                    dictionary entry when no replacement could be derived from
                    the detection itself (it carries no morphosyntactic info or
                    its type is not in the dictionary). Defaults to True.

        """
        super().__init__()
        self._always_replace = always_replace
        self._dictionary = self._from_file(dictionary_path)

    def _from_file(
        self, path_to_dictionary: str
    ) -> Dict[str, Dict[str, Dict[str, str]]]:
        """Loads the replacement dictionary from a tsv file.

        Blank lines are skipped. Plain dicts are used on purpose: looking up an
        unknown key must never insert an empty entry (as a defaultdict would),
        because empty entries break the random choices made later.

        Args:
            path_to_dictionary (str): Path to the tsv file (read as UTF-8).

        Returns:
            Dict[str, Dict[str, Dict[str, str]]]: Mapping
                ner_tag -> morpho_tag -> lemma -> word.

        Raises:
            ValueError: If a non-empty line does not consist of exactly four
                tab-separated columns.

        """
        replacement_dictionary: Dict[str, Dict[str, Dict[str, str]]] = {}
        with open(path_to_dictionary, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue

                columns = line.split("\t")
                if len(columns) != 4:
                    raise ValueError(
                        f"{path_to_dictionary}:{line_number}: expected 4 "
                        f"tab-separated columns, got {len(columns)}"
                    )

                ner_tag, word, lemma, morpho_tag = columns
                forms_by_tag = replacement_dictionary.setdefault(ner_tag, {})
                forms_by_tag.setdefault(morpho_tag, {})[lemma] = word

        return replacement_dictionary

    def get_supported_detection_classes(self) -> List[Type[Detection]]:
        """Returns a list of supported detection classes.

        Returns:
            List[Type[Detection]]: Detection classes looked up in
                DETECTION_CLASSES_MAP for every NER tag found in the file.

        """
        return [DETECTION_CLASSES_MAP[name] for name in self._dictionary]

    def _replacement_for_tag(self, entry_type: str, morpho_tag: str) -> Dict[str, str]:
        """Picks a replacement of a known type, preferring the requested tag.

        ``entry_type`` must be a key of the dictionary. Entries built by
        ``_from_file`` always hold at least one tag per type and one lemma per
        tag, so the random choices below never see an empty sequence.
        """
        forms_by_tag = self._dictionary[entry_type]
        # .get() keeps this a pure read even if the mapping is a defaultdict.
        forms = forms_by_tag.get(morpho_tag)

        _log.info("entry_type %s morpho_tag %s", entry_type, morpho_tag)
        _log.info("Dictionary %s", forms if forms is not None else {})

        if forms:
            # An inflected form with exactly the requested tag exists.
            lemma = random.choice(list(forms))
            word = forms[lemma]
        elif morpho_tag == "ign":  # unknown form
            # Make up a random upper-case token of 3-5 distinct letters.
            size = random.randint(3, 5)
            lemma = "".join(random.sample(string.ascii_uppercase, size))
            word = lemma
        else:
            # No form with the requested tag: use the base form (lemma) of a
            # random entry of the same type and report that entry's tag.
            morpho_tag = random.choice(list(forms_by_tag))
            lemma = random.choice(list(forms_by_tag[morpho_tag]))
            word = lemma

        return {
            "word": word,
            "lemma": lemma,
            "tag": morpho_tag,
            "type": entry_type,
        }

    def _replacement_from_anywhere(self) -> Optional[Dict[str, str]]:
        """Picks a random entry from the whole dictionary (None if it is empty)."""
        if not self._dictionary:
            return None

        entry_type = random.choice(list(self._dictionary))
        forms_by_tag = self._dictionary[entry_type]
        morpho_tag = random.choice(list(forms_by_tag))
        forms = forms_by_tag[morpho_tag]
        lemma = random.choice(list(forms))

        return {
            "word": forms[lemma],
            "lemma": lemma,
            "tag": morpho_tag,
            "type": entry_type,
        }

    def get_random_replacement(
        self, original_entry: Detection
    ) -> Optional[Dict[str, str]]:
        """Returns a random replacement of original entry.

        If the entry has MorphosyntacticInfoMixin and its TYPE_NAME is present
        in the dictionary, the replacement is drawn from entries of that type.
        Otherwise a random entry of the whole dictionary is used when
        ``always_replace`` is set.

        Args:
            original_entry (Detection): Detection that should be replaced. Class
                should have MorphosyntacticInfoMixin

        Returns:
            Optional[Dict[str, str]]: Dictionary with replacement and morphosyntactic
                tag, or None when no replacement could be produced. The
                dictionary has following keys:
                word[str] -> replacement word
                lemma[str] -> lemma of the replacement word
                tag[str] -> morphosyntactic tag of the replacement word
                type[str] -> type of the replacement word

        """
        entry_type = original_entry.TYPE_NAME

        replacement = None
        if (
            isinstance(original_entry, MorphosyntacticInfoMixin)
            and entry_type in self._dictionary
        ):
            replacement = self._replacement_for_tag(
                entry_type, original_entry.morpho_tag
            )

        if replacement is None and self._always_replace:
            replacement = self._replacement_from_anywhere()

        return replacement

    def get_random_replacements(
        self, original_entries: List[Detection]
    ) -> List[Optional[Dict[str, str]]]:
        """Returns a list of random replacements of original entries.

        !!!Important!!!
        It's assumed that all of the original entries have the same lemma.

        One lemma that has a form for every morphosyntactic tag occurring in
        ``original_entries`` is drawn and inflected per entry. When no such
        lemma exists (or no entry carries a tag), a single replacement obtained
        from ``get_random_replacement`` for the first entry is used for every
        entry.

        Args:
            original_entries (List[Detection]): Non-empty list of detections that
            should be replaced. All detections should have the same lemma and
            detection type

        Returns:
            List[Optional[Dict[str, str]]]: One item per original entry, in the
                same order. Each item is None (no replacement available) or a
                dictionary with following keys:
                word[str] -> replacement word
                lemma[str] -> lemma of the replacement word
                tag[str] -> morphosyntactic tag of the replacement word (None
                    for an entry without MorphosyntacticInfoMixin)
                type[str] -> type of the replacement word

        """
        assert len(original_entries) > 0

        detection_type = original_entries[0].TYPE_NAME
        # Read-only lookup: an unknown type simply has no forms.
        forms_by_tag = self._dictionary.get(detection_type, {})

        required_tags = {
            entry.morpho_tag
            for entry in original_entries
            if isinstance(entry, MorphosyntacticInfoMixin)
        }

        # Lemmas that can be inflected into every required tag.
        possible_lemmas = set()
        if required_tags:
            possible_lemmas = set.intersection(
                *(set(forms_by_tag.get(tag, ())) for tag in required_tags)
            )

        if not possible_lemmas:
            replacement = self.get_random_replacement(original_entries[0])
            # Independent copies, so changing one result cannot affect the others.
            return [
                None if replacement is None else dict(replacement)
                for _ in original_entries
            ]

        # Sorted so the draw does not depend on set iteration order.
        lemma = random.choice(sorted(possible_lemmas))

        replacements = []
        for entry in original_entries:
            if isinstance(entry, MorphosyntacticInfoMixin):
                morpho_tag = entry.morpho_tag
                word = forms_by_tag[morpho_tag][lemma]
            else:
                # No morphosyntactic info: use the base form and report no tag
                # instead of reusing the tag of a previous entry.
                morpho_tag = None
                word = lemma

            replacements.append(
                {
                    "word": word,
                    "lemma": lemma,
                    "tag": morpho_tag,
                    "type": detection_type,
                }
            )

        return replacements