Skip to content
Snippets Groups Projects
Commit 7035bd0a authored by Tomasz Walkowiak
Browse files

fixing speller (infinite processing for long words)

parent b9d9540a
Branches
No related tags found
No related merge requests found
Pipeline #9207 failed
.gitignore 100755 → 100644
.idea/ .coverage
/tests/.pytest_cache
# temp files .pytest_cache
example_usage.py .idea
example_text.txt *__pycache__
\ No newline at end of file htmlcov
config-test.ini
# Compose definition that builds and runs the speller worker container.
version: '3'
services:
  # NOTE(review): the service key is "polem4json" while the container is named
  # "clarin_speller" - confirm the key is intentional.
  polem4json:
    container_name: clarin_speller
    build: ./
    working_dir: /home/worker/
    # Runs `python main.py service` inside the working directory.
    entrypoint:
      - python
      - main.py
      - service
    # Bind-mount the sources, the entry script and the test configuration so
    # that local edits are visible in the container without a rebuild.
    volumes:
      - ./src:/home/worker/src
      - ./main.py:/home/worker/main.py
      - ./config-test.ini:/home/worker/config.ini
      - /samba:/samba
    environment:
      # Keep Python's stdout/stderr unbuffered so log lines appear immediately.
      - PYTHONUNBUFFERED=1
...@@ -2,10 +2,27 @@ ...@@ -2,10 +2,27 @@
import logging import logging
import nlp_ws import nlp_ws
from autocorrect import Speller from autocorrect import Speller, Word
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class SpellerFixed(Speller):
    """Speller variant that stays responsive on very long words.

    The candidate search is the same as in the base class, except that the
    two-edit ("double typo") expansion is skipped for words longer than
    ``MAX_DOUBLE_TYPO_LEN`` characters. That expansion is what made
    processing of long words effectively never finish.
    """

    # Words longer than this many characters never get the double-typo
    # expansion; they are handled as if the speller were in fast mode.
    MAX_DOUBLE_TYPO_LEN = 15

    def __init__(self, lang="en", **kwargs):
        """Create the speller for ``lang``.

        Any extra keyword arguments are forwarded unchanged to the base
        ``Speller`` constructor, so its options remain usable.
        """
        super().__init__(lang, **kwargs)

    def get_candidates(self, word):
        """Return scored correction candidates for ``word``.

        Candidates are looked up in order of increasing cost and the first
        non-empty group wins: the word itself, its single-edit typos, and,
        only when the speller is not in fast mode and the word is at most
        ``MAX_DOUBLE_TYPO_LEN`` characters long, its double-edit typos.
        If nothing is known, the word itself is the only candidate.

        Returns:
            A list of ``(score, candidate)`` tuples, where ``score`` is the
            value stored for the candidate in ``self.nlp_data`` (0 when the
            candidate is absent).
        """
        w = Word(word, self.lang, self.only_replacements)
        candidates = self.existing([word]) or self.existing(w.typos())

        # The double-typo expansion is the expensive step; run it only when
        # the cheaper lookups found nothing and the word is short enough.
        allow_double = not self.fast and len(word) <= self.MAX_DOUBLE_TYPO_LEN
        if not candidates and allow_double:
            candidates = self.existing(w.double_typos())

        candidates = candidates or [word]
        return [(self.nlp_data.get(c, 0), c) for c in candidates]
class Speller2Worker(nlp_ws.NLPWorker): class Speller2Worker(nlp_ws.NLPWorker):
"""Implements nlp_worker for text error correction service.""" """Implements nlp_worker for text error correction service."""
...@@ -14,24 +31,23 @@ class Speller2Worker(nlp_ws.NLPWorker): ...@@ -14,24 +31,23 @@ class Speller2Worker(nlp_ws.NLPWorker):
def static_init(cls, config): def static_init(cls, config):
"""One time static initialisation.""" """One time static initialisation."""
_log.log(logging.INFO, "Worker started loading static models ") _log.log(logging.INFO, "Worker started loading static models ")
cls._model = {'pl': Speller('pl'), cls._model = {'pl': SpellerFixed('pl'),
'ru': Speller('ru'), 'ru': SpellerFixed('ru'),
'en': Speller('en'), 'en': SpellerFixed('en'),
'uk': Speller('uk')} 'uk': SpellerFixed('uk')}
_log.log(logging.INFO, "Worker finished loading static models ") _log.log(logging.INFO, "Worker finished loading static models ")
def process(self, input_file, task_options, output_file): def process(self, input_file, task_options, output_file):
"""Starting nlp process.""" """Starting nlp process."""
_log.info("Processing")
language = task_options.get('lang', 'pl') language = task_options.get('lang', 'pl')
model = self._model.get(language) model = self._model.get(language)
data = self._read_file(input_file) with open(input_file, 'r', encoding='utf-8') as f:
with open(output_file, 'w', encoding='utf-8') as f_out:
corrected_data = [model(line) for line in data.split('\n')] for line in f.readlines():
corrected_data = model(line)
with open(output_file, 'w', encoding='utf-8') as f: f_out.write(corrected_data)
f.write('\n'.join(corrected_data)) f_out.write("\n")
@classmethod @classmethod
def _read_file(cls, input_path): def _read_file(cls, input_path):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment