Skip to content
Snippets Groups Projects
Commit 7035bd0a authored by Tomasz Walkowiak
Browse files

fixing speller (infinite processing for long words)

parent b9d9540a
Branches
No related merge requests found
Pipeline #9207 failed with stages
in 16 seconds
.gitignore 100755 → 100644
.idea/
# temp files
example_usage.py
example_text.txt
\ No newline at end of file
.coverage
/tests/.pytest_cache
.pytest_cache
.idea
*__pycache__
htmlcov
config-test.ini
# Compose definition that runs the speller worker against the test configuration.
version: '3'
services:
  polem4json:
    container_name: clarin_speller
    build: ./
    working_dir: /home/worker/
    # Equivalent to running `python main.py service` in the working directory.
    entrypoint:
      - python
      - main.py
      - service
    volumes:
      # Source tree and entry point are bind-mounted from the checkout.
      - ./src:/home/worker/src
      - ./main.py:/home/worker/main.py
      # The test configuration is mounted as the worker's config.ini.
      - ./config-test.ini:/home/worker/config.ini
      - /samba:/samba
    environment:
      # Disable Python output buffering.
      - PYTHONUNBUFFERED=1
......@@ -2,10 +2,27 @@
import logging
import nlp_ws
from autocorrect import Speller
from autocorrect import Speller, Word
_log = logging.getLogger(__name__)
class SpellerFixed(Speller):
    """Speller that skips the double-typo search for long words.

    Fixes the original speller in case of long words: for words longer than
    ``max_double_typo_len`` characters only the word itself and its single
    typos are considered, exactly as in the speller's ``fast`` mode.
    """

    def __init__(self, lang="en", *args, max_double_typo_len=15, **kwargs):
        """Create the speller.

        :param lang: language code passed on to ``Speller``.
        :param max_double_typo_len: longest word (in characters) for which
            double typos are still generated.
        :param args: further positional arguments forwarded to ``Speller``.
        :param kwargs: further keyword arguments forwarded to ``Speller``.
        """
        super().__init__(lang, *args, **kwargs)
        self.max_double_typo_len = max_double_typo_len

    def get_candidates(self, word):
        """Return the correction candidates for ``word``.

        Candidates are taken from the first non-empty source, in this order:
        the word itself, its single typos, its double typos (skipped in fast
        mode and for long words), and finally the unchanged word as fallback.

        :param word: the word to find candidates for.
        :return: list of ``(score, candidate)`` tuples, where the score is the
            candidate's value in ``self.nlp_data`` (``0`` when absent).
        """
        w = Word(word, self.lang, self.only_replacements)
        candidates = self.existing([word]) or self.existing(w.typos())
        # The double-typo search is the expensive step, so it is only run
        # when nothing closer matched and the word is short enough.
        if (
            not candidates
            and not self.fast
            and len(word) <= self.max_double_typo_len
        ):
            candidates = self.existing(w.double_typos())
        return [(self.nlp_data.get(c, 0), c) for c in candidates or [word]]
class Speller2Worker(nlp_ws.NLPWorker):
"""Implements nlp_worker for text error correction service."""
......@@ -14,24 +31,23 @@ class Speller2Worker(nlp_ws.NLPWorker):
def static_init(cls, config):
    """One time static initialisation.

    Builds one ``SpellerFixed`` model per supported language code and stores
    the mapping on the class as ``cls._model``.

    :param config: worker configuration; not used by this method.
    """
    _log.info("Worker started loading static models ")
    # Build each model exactly once, directly as SpellerFixed, instead of
    # creating plain Speller instances that are immediately replaced.
    cls._model = {
        lang: SpellerFixed(lang) for lang in ('pl', 'ru', 'en', 'uk')
    }
    _log.info("Worker finished loading static models ")
def process(self, input_file, task_options, output_file):
    """Starting nlp process.

    Reads ``input_file`` line by line, runs every line through the speller
    model of the requested language and writes the corrected lines to
    ``output_file``, one output line per input line.

    :param input_file: path of the UTF-8 text file to correct.
    :param task_options: mapping of task options; ``'lang'`` selects the
        model and defaults to ``'pl'``.
    :param output_file: path the corrected UTF-8 text is written to.
    :raises ValueError: if no model is loaded for the requested language.
    """
    _log.info("Processing")
    language = task_options.get('lang', 'pl')
    model = self._model.get(language)
    if model is None:
        # Fail before the output file is opened (and thereby truncated).
        raise ValueError("Unsupported language: {!r}".format(language))
    with open(input_file, 'r', encoding='utf-8') as f, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        # Iterate lazily so the whole input is never held in memory.
        for line in f:
            # Drop the line terminator before correction so that it is not
            # written a second time together with the explicit "\n" below.
            f_out.write(model(line.rstrip('\n')))
            f_out.write("\n")
@classmethod
def _read_file(cls, input_path):
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment