"""Module responsible for Morphosyntactic dict that uses a tsv file with NER tags."""

import random
Paweł Walkowiak's avatar
Paweł Walkowiak committed
import string
from collections import defaultdict
from typing import List, Optional, Type, Dict
Michał Pogoda's avatar
Michał Pogoda committed
from src.detections import DETECTION_CLASSES_MAP, Detection, MorphosyntacticInfoMixin
from src.dictionaries.morphosyntactic.interface import MorphosyntacticDictionary
Paweł Walkowiak's avatar
Paweł Walkowiak committed
import logging

_log = logging.getLogger(__name__)

class NERFileMorphosyntacticDictionary(MorphosyntacticDictionary):
    """Morphosyntactic dictionary that uses a tsv file with NER tags as a source.

    Every line of the file holds four tab-separated columns: NER tag,
    inflected word, lemma and morphosyntactic tag.

    Example of a tsv file:
    name	Aaronom	Aaron	subst:pl:dat:m1
    name	Aaronami	Aaron	subst:pl:inst:m1
    name	Aaronach	Aaron	subst:pl:loc:m1
    country	Apolonie	Apolonia	subst:pl:voc:f
    country	Apolonii	Apolonia	subst:sg:dat:f
    country	Apolonii	Apolonia	subst:pl:gen:f
    country	Apolonii	Apolonia	subst:sg:loc:f
    city	Araba	Arab	subst:sg:gen:m2
    city	Arabie	Arab	subst:sg:voc:m2
    city	Arabem	Arab	subst:sg:inst:m2
    """

    def __init__(
        self,
        dictionary_path: Optional[str] = None,
        always_replace=True,
    ) -> None:
Michał Pogoda's avatar
Michał Pogoda committed
        """Initializes NERFileMorphosyntacticDictionary.

        Args:
            dictionary_path (Optional[str], optional): Path to dictionary tsv file.
                    Defaults to None.
            always_replace (bool, optional): Wheter to replace detection even if no
                    word with matching morpho tag is found. Defaults to True.

        """
        super().__init__()
        self._dictionary = None
        self._always_replace = always_replace
        self._dictionary = self._from_file(dictionary_path)
Michał Pogoda's avatar
Michał Pogoda committed

    def _from_file(self, path_to_dictionary: str) -> None:
        replacement_dictionary = defaultdict(lambda: defaultdict(dict))
        with open(path_to_dictionary, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                ner_tag, word, lemma, morpho_tag = line.split("\t")
                replacement_dictionary[ner_tag][morpho_tag][lemma] = word

        return replacement_dictionary

    def get_supported_detection_classes(self) -> List[Type[Detection]]:
Michał Pogoda's avatar
Michał Pogoda committed
        """Returns a list of supported detection classess.

        Returns:
            List[Type[Detection]]: List of detection classes that are supported

Michał Pogoda's avatar
Michał Pogoda committed
        return [DETECTION_CLASSES_MAP[name] for name in self._dictionary.keys()]
    def get_random_replacement(
        self, original_entry: Detection
    ) -> Optional[Dict[str, str]]:
Michał Pogoda's avatar
Michał Pogoda committed
        """Returns a random replacement of original entry.

        Args:
            original_entry (Detection): Detection that should be replaced. Class
                should have MorphosyntacticInfoMixin


        Returns:
            Optional[Dict[str, str]]: Dictionary with replacement and morphosyntactic
                tag. The dictionary should have following keys:
                word[str] -> replacement word
                lemma[str] -> lemma of the replacement word
                tag[str] -> morphosyntactic tag of the replacement word
                type[str] -> type of the replacement word
Michał Pogoda's avatar
Michał Pogoda committed

        """
        original_entry_type = type(original_entry)
        original_entry_type_name = original_entry.TYPE_NAME

        if issubclass(original_entry_type, MorphosyntacticInfoMixin):
            morpho_tag = original_entry.morpho_tag

            if original_entry_type_name in self._dictionary:
                entry_type = original_entry_type_name
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                _log.info(f"entry_type {entry_type} morpho_tag {morpho_tag}")
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                if entry_type in self._dictionary:
                    _log.info(
                        f"Dictionary\
                         {self._dictionary[entry_type][morpho_tag]}"
                    )
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                try:
                    if entry_type in self._dictionary \
                            and morpho_tag in self._dictionary[entry_type] \
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                            and len(list(
                                    self._dictionary[
                                        entry_type][morpho_tag].keys())
                                    ) > 0:
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                        lemma = random.choice(
                            list(self._dictionary[entry_type][morpho_tag].keys())
                        )

                        word = self._dictionary[entry_type][morpho_tag][lemma]
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                    elif morpho_tag == "ign":  # unknown form
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                        letters = string.ascii_lowercase
                        size = random.randint(3, 5)
                        lemma = "".join(random.sample(
                            list(letters), size)).upper()
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                        word = lemma
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                    else:
                        morpho_tag = random.choice(
                            list(self._dictionary[entry_type].keys())
                        )
                        lemma = random.choice(
                            list(self._dictionary[entry_type][morpho_tag].keys())
                        )
                        word = lemma
                except IndexError as exp:
Paweł Walkowiak's avatar
Paweł Walkowiak committed
                    _log.info(f"IndexError entry_type "
                              f"{entry_type} morpho_tag {morpho_tag}")
                    _log.info(exp)
                    _log.info(f"Dictionary {self._dictionary[entry_type][morpho_tag]}")

        if word is None and self._always_replace:
            entry_type = random.choice(list(self._dictionary.keys()))
            morpho_tag = random.choice(list(self._dictionary[entry_type].keys()))
            lemma = random.choice(list(self._dictionary[entry_type][morpho_tag].keys()))
            word = self._dictionary[entry_type][morpho_tag][lemma]

        return {
            "word": word,
            "lemma": lemma,
            "tag": morpho_tag,
            "type": entry_type,
        }

    def get_random_replacements(
        self, original_entries: List[Detection]
    ) -> List[Optional[Dict[str, str]]]:
        """Returns a list of random replacements of original entries.

        !!!Important!!!
        It's assumed that all of the original entries have the same lemma.

        Args:
            original_entries (List[Detection]): List of detections that should be
            replaced. All detections should have the same lemma and detection type

        Returns:
            Optional[Dict[str, str]]: List of dictionaries with replacement and
                morphosyntactic tag. The dictionary should have following keys:
                word[str] -> replacement word
                lemma[str] -> lemma of the replacement word
                tag[str] -> morphosyntactic tag of the replacement word
                type[str] -> type of the replacement word
        """
        assert len(original_entries) > 0

        detection_type = original_entries[0].TYPE_NAME

        required_tags = set()
        for entry in original_entries:
            if isinstance(entry, MorphosyntacticInfoMixin):
                required_tags.add(entry.morpho_tag)
        required_tags = list(required_tags)

        if len(required_tags) == 0:
            return [self.get_random_replacement(original_entries[0])] * len(
                original_entries
            )

        possible_lemmas = set(self._dictionary[detection_type][required_tags[0]].keys())
        for tag in required_tags[1:]:
            possible_lemmas.intersection_update(
                self._dictionary[detection_type][tag].keys()
            )

        if len(possible_lemmas) == 0:
            return [self.get_random_replacement(original_entries[0])] * len(
                original_entries
            )
        lemma = random.choice(list(possible_lemmas))

        replacements = []
        for entry in original_entries:
            if isinstance(entry, MorphosyntacticInfoMixin):
                morpho_tag = entry.morpho_tag
                word = self._dictionary[detection_type][morpho_tag][lemma]
            else:
                word = lemma

            replacements.append(
                {
                    "word": word,
                    "lemma": lemma,
                    "tag": morpho_tag,
                    "type": detection_type,
                }