fixing speller (infinite processing for long words)

7035bd0a · Tomasz Walkowiak · b9d9540a · 7035bd0a · 7035bd0a · 7035bd0a
Commit 7035bd0a authored 2 years ago by Tomasz Walkowiak
--- a/.gitignore
+++ b/.gitignore
-.idea/
-
-# temp files
-example_usage.py
-example_text.txt
\ No newline at end of file
+.coverage
+/tests/.pytest_cache
+.pytest_cache
+.idea
+*__pycache__
+htmlcov
+config-test.ini
--- a/docker-compose.yml
+++ b/docker-compose.yml
+version: '3'
+services:
+
+  polem4json:
+    container_name: clarin_speller
+    build: ./
+    working_dir: /home/worker/
+    entrypoint:
+        - python
+        - main.py
+        - service
+    volumes:
+        - ./src:/home/worker/src
+        - ./main.py:/home/worker/main.py
+        - ./config-test.ini:/home/worker/config.ini
+        - /samba:/samba
+    environment:
+      - PYTHONUNBUFFERED=1
--- a/src/speller2_worker.py
+++ b/src/speller2_worker.py
@@ -2,10 +2,27 @@
 import logging

 import nlp_ws
-from autocorrect import Speller
+from autocorrect import Speller, Word

 _log = logging.getLogger(__name__)

+class SpellerFixed(Speller):
+    """Speller that skips the double-typo search for long words."""
+    MAX_DOUBLE_TYPO_LEN = 15  # longer words skip the double_typos() lookup
+
+    def get_candidates(self, word):
+        """Return (nlp_data value or 0, candidate) pairs for word."""
+        w = Word(word, self.lang, self.only_replacements)
+        if self.fast or len(word) > self.MAX_DOUBLE_TYPO_LEN:
+            candidates = self.existing([word]) or self.existing(w.typos()) or [word]
+        else:
+            candidates = (
+                self.existing([word])
+                or self.existing(w.typos())
+                or self.existing(w.double_typos())
+                or [word]
+            )
+        return [(self.nlp_data.get(c, 0), c) for c in candidates]

 class Speller2Worker(nlp_ws.NLPWorker):
    """Implements nlp_worker for text error correction service."""
@@ -14,24 +31,23 @@ class Speller2Worker(nlp_ws.NLPWorker):
    def static_init(cls, config):
        """One time static initialisation."""
        _log.log(logging.INFO, "Worker started loading static models ")
-        cls._model = {'pl': Speller('pl'),
-                      'ru': Speller('ru'),
-                      'en': Speller('en'),
-                      'uk': Speller('uk')}
+        cls._model = {'pl': SpellerFixed('pl'),  # keyed by the 'lang' task option
+                      'ru': SpellerFixed('ru'),
+                      'en': SpellerFixed('en'),
+                      'uk': SpellerFixed('uk')}
        _log.log(logging.INFO, "Worker finished loading static models ")

    def process(self, input_file, task_options, output_file):
        """Starting nlp process."""
-        _log.info("Processing")
        language = task_options.get('lang', 'pl')
        model = self._model.get(language)

-        data = self._read_file(input_file)
-
-        corrected_data = [model(line) for line in data.split('\n')]
-
-        with open(output_file, 'w', encoding='utf-8') as f:
-            f.write('\n'.join(corrected_data))
+        with open(input_file, 'r', encoding='utf-8') as f:
+            with open(output_file, 'w', encoding='utf-8') as f_out:
+                for line in f:  # stream lines instead of loading the whole file
+                    # Strip the kept line ending so the "\n" below is not doubled.
+                    corrected_data = model(line.rstrip('\n'))
+                    f_out.write(corrected_data + "\n")

    @classmethod
    def _read_file(cls, input_path):