# speller2_worker.py

"""Implementation of nlp_worker."""
import logging

import nlp_ws
from autocorrect import Speller, Word

_log = logging.getLogger(__name__)


class SpellerFixed(Speller):
    """Speller variant that skips the double-typo search for long words."""

    def __init__(self, lang="en"):
        """Initialise the parent speller for the language ``lang``."""
        super().__init__(lang)

    def get_candidates(self, word):
        """Return ``(nlp_data value, candidate)`` pairs for ``word``.

        Candidates are looked up in order: the word itself, its single
        typos, then its double typos.  The double-typo step is skipped when
        ``self.fast`` is set or the word is longer than 15 characters.  If
        no lookup yields a known word, ``word`` itself is the sole candidate
        and is paired with 0 unless ``nlp_data`` holds a value for it.
        """
        probe = Word(word, self.lang, self.only_replacements)
        skip_double_typos = self.fast or len(word) > 15

        found = self.existing([word])
        if not found:
            found = self.existing(probe.typos())
        if not found and not skip_double_typos:
            found = self.existing(probe.double_typos())
        if not found:
            # Nothing known matched: fall back to the input unchanged.
            found = [word]

        return [(self.nlp_data.get(item, 0), item) for item in found]


class Speller2Worker(nlp_ws.NLPWorker):
    """Implements nlp_worker for text error correction service.

    One ``SpellerFixed`` model per supported language is created in
    ``static_init`` and stored on the class as ``_model``.
    """

    @classmethod
    def static_init(cls, config):
        """One time static initialisation: build the per-language models.

        :param config: worker configuration; not used by this method.
        """
        _log.info("Worker started loading static models ")
        cls._model = {lang: SpellerFixed(lang)
                      for lang in ('pl', 'ru', 'en', 'uk')}
        _log.info("Worker finished loading static models ")

    def process(self, input_file, task_options, output_file):
        """Spell-correct ``input_file`` line by line into ``output_file``.

        :param input_file: path of the UTF-8 text file to correct.
        :param task_options: mapping of task options; ``'lang'`` selects the
            model and defaults to ``'pl'``.
        :param output_file: path the corrected UTF-8 text is written to.
        :raises ValueError: if no model exists for the requested language.
        """
        language = task_options.get('lang', 'pl')
        model = self._model.get(language)
        if model is None:
            # Fail before the output file is created or truncated, with a
            # message naming the problem instead of a "'NoneType' object is
            # not callable" error from deep inside the loop.
            raise ValueError(
                "Unsupported language %r; expected one of: %s"
                % (language, ", ".join(sorted(self._model))))

        with open(input_file, 'r', encoding='utf-8') as f, \
                open(output_file, 'w', encoding='utf-8') as f_out:
            # Iterate the file object so the input is streamed rather than
            # loaded into memory at once.
            for line in f:
                # Drop the line's own terminator so that every output line
                # ends with exactly one newline instead of two.
                f_out.write(model(line.rstrip('\n')))
                f_out.write("\n")

    @classmethod
    def _read_file(cls, input_path):
        """Return the full text of the UTF-8 file at ``input_path``."""
        with open(input_path, 'r', encoding='utf-8') as f:
            return f.read()