Select Git revision
speller2_worker.py
Tomasz Walkowiak authored
speller2_worker.py 2.18 KiB
"""Implementation of nlp_worker."""
import logging
import nlp_ws
from autocorrect import Speller, Word
_log = logging.getLogger(__name__)
class SpellerFixed(Speller):
    """Fixes the original speller in case of long words.

    For words longer than 15 characters (or when ``self.fast`` is set) the
    double-typo candidate search is skipped.
    NOTE(review): presumably because generating double typos for long words
    is too expensive -- confirm against the base ``Speller``.
    """

    def __init__(self, lang="en"):
        """Initialise the base speller for language ``lang``."""
        super().__init__(lang)

    def get_candidates(self, word):
        """Return ``(score, candidate)`` pairs for ``word``.

        Candidates are searched in order: the word itself, its single
        typos, then (only for short words in non-fast mode) its double
        typos. The first non-empty result of ``self.existing`` wins; if
        nothing is found the word itself is the sole candidate. The score
        of each candidate is its value in ``self.nlp_data`` (0 if absent).
        """
        variants = Word(word, self.lang, self.only_replacements)
        skip_double_typos = self.fast or len(word) > 15

        found = self.existing([word])
        if not found:
            found = self.existing(variants.typos())
        if not found and not skip_double_typos:
            found = self.existing(variants.double_typos())
        if not found:
            # Nothing known matches: keep the input word unchanged.
            found = [word]

        scored = []
        for candidate in found:
            scored.append((self.nlp_data.get(candidate, 0), candidate))
        return scored
class Speller2Worker(nlp_ws.NLPWorker):
    """Implements nlp_worker for text error correction service."""

    @classmethod
    def static_init(cls, config):
        """Load one speller per supported language into ``cls._model``.

        :param config: worker configuration; not used by this method.
        """
        _log.info("Worker started loading static models ")
        cls._model = {
            'pl': SpellerFixed('pl'),
            'ru': SpellerFixed('ru'),
            'en': SpellerFixed('en'),
            'uk': SpellerFixed('uk'),
        }
        _log.info("Worker finished loading static models ")

    def process(self, input_file, task_options, output_file):
        """Spell-correct ``input_file`` line by line into ``output_file``.

        :param input_file: path of the UTF-8 text file to correct.
        :param task_options: mapping of task options; ``'lang'`` selects
            the speller and defaults to ``'pl'``. ``None`` is treated as an
            empty mapping.
        :param output_file: path the corrected UTF-8 text is written to,
            one output line per input line.
        :raises ValueError: if no speller is loaded for the requested
            language.
        """
        language = (task_options or {}).get('lang', 'pl')
        model = self._model.get(language)
        if model is None:
            # Fail early with a readable message, before the output file is
            # created or truncated, instead of a later "'NoneType' object
            # is not callable".
            raise ValueError(
                "Unsupported language %r; expected one of: %s"
                % (language, ", ".join(sorted(self._model))))

        with open(input_file, 'r', encoding='utf-8') as f, \
                open(output_file, 'w', encoding='utf-8') as f_out:
            # Iterate lazily so large inputs are not loaded into memory.
            for line in f:
                # Each line read from the file still carries its trailing
                # newline. Strip it before correcting so that exactly one
                # line break is written per input line (previously the
                # extra "\n" presumably doubled every line break).
                f_out.write(model(line.rstrip("\n")))
                f_out.write("\n")

    @classmethod
    def _read_file(cls, input_path):
        """Return the whole content of the UTF-8 file at ``input_path``."""
        with open(input_path, 'r', encoding='utf-8') as f:
            return f.read()