From cd7a5927944804470cae72d0c814e8852a75e167 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 11:26:23 +0200
Subject: [PATCH 01/35] Basic version of working service.

---
 Dockerfile                                    | 23 ++++++
 config.ini                                    | 11 +++
 docker-compose.yml                            | 14 ++++
 module/pom.xml                                | 70 +++++++++++++++++++
 .../main/java/pl/clarin/speller/Speller.java  | 67 ++++++++++++++++++
 .../main/java/pl/clarin/speller/TextEdit.java | 37 ++++++++++
 6 files changed, 222 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 config.ini
 create mode 100644 docker-compose.yml
 create mode 100644 module/pom.xml
 create mode 100644 module/src/main/java/pl/clarin/speller/Speller.java
 create mode 100644 module/src/main/java/pl/clarin/speller/TextEdit.java

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..348cb06
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+FROM clarinpl/openjdk:8 as builder
+
+LABEL application="Speller"
+LABEL description="Client - Workers - correcting mistakes in sentances in txt files"
+LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology"
+LABEL maintainer="bartlomiej.koptyra@pwr.edu.pl"
+
+WORKDIR /home/install
+RUN cd nlp.worker && \
+    mvn clean && \
+    mvn install
+
+WORKDIR /home/install
+COPY ./module ./module
+RUN cd module && \
+    mvn clean && \
+    mvn install
+
+FROM clarinpl/openjdk-jre:8
+
+WORKDIR /home/worker
+COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar .
+CMD ["java", "-jar", "nlp.worker.speller-1.0-SNAPSHOT.jar"]
\ No newline at end of file
diff --git a/config.ini b/config.ini
new file mode 100644
index 0000000..856bea7
--- /dev/null
+++ b/config.ini
@@ -0,0 +1,11 @@
+[service]
+tool = speller
+
+root = /samba/requests/
+rabbit_host = rabbitmq
+rabbit_user = test
+rabbit_password = test
+queue_prefix = nlp_
+
+[tool]
+workers_number = 1
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..f0a01ae
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,14 @@
+version: '3'
+services:
+  speller:
+    container_name: clarin_speller
+    build: ./
+    volumes:
+      - '/samba:/samba'
+      - './config.ini:/home/worker/config.ini'
+    working_dir: /home/worker/
+    entrypoint:
+      - java
+      - '-jar'
+      - nlp.worker.speller-1.0-SNAPSHOT.jar
+    restart: always
diff --git a/module/pom.xml b/module/pom.xml
new file mode 100644
index 0000000..62d8f94
--- /dev/null
+++ b/module/pom.xml
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>pl.clarin</groupId>
+    <artifactId>nlp.worker.speller</artifactId>
+    <version>1.0-SNAPSHOT</version>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven.compiler.source>1.8</maven.compiler.source>
+        <maven.compiler.target>1.8</maven.compiler.target>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>nlp.worker</artifactId>
+            <version>1.0-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>org.languagetool</groupId>
+            <artifactId>language-pl</artifactId>
+            <version>5.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.json</groupId>
+            <artifactId>json</artifactId>
+            <version>20141113</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>2.3</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <transformers>
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>pl.clarin.speller.Speller</mainClass>
+                                </transformer>
+                            </transformers>
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
\ No newline at end of file
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
new file mode 100644
index 0000000..f924042
--- /dev/null
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -0,0 +1,67 @@
+package pl.clarin.speller;
+
+import java.io.*;
+
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.languagetool.JLanguageTool;
+
+import org.json.JSONObject;
+import org.languagetool.language.Polish;
+import pl.clarin.ws.worker.IniFile;
+import pl.clarin.ws.worker.Service;
+import pl.clarin.ws.worker.Worker;
+
+public class Speller extends Worker {
+  // init object for each thread
+  static TextEdit textEdit = null;
+  static JLanguageTool langTool = null;
+
+  public static void main(String[] args) {
+    new Service<>(Speller.class);
+  }
+
+  @Override
+  public void init() throws Exception {}
+
+  // init objects shared by threads
+  @Override
+  public void static_init(IniFile init) throws Exception {
+    textEdit = new TextEdit();
+    langTool = new JLanguageTool(new Polish());
+  }
+
+  @Override
+  public void process(String fileIn, String fileOut, JSONObject param) {
+    File file = new File(fileIn);
+    try {
+      FileWriter fileWriter = new FileWriter(fileOut);
+
+      StringBuilder sb = new StringBuilder();
+
+      try (BufferedReader br = new BufferedReader(new FileReader(file))) {
+        String line = null;
+        while ((line = br.readLine()) != null) {
+          try {
+            String corrected_line = TextEdit.edit(line, langTool);
+            sb.append(corrected_line).append('\n');
+          } catch (Exception exception) {
+            Logger.getLogger(Speller.class.getName())
+                .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception);
+          }
+        }
+      }
+
+      try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
+        bufferedWriter.write(sb.toString());
+      }
+
+      fileWriter.close();
+
+    } catch (IOException exception) {
+      Logger.getLogger(Speller.class.getName())
+          .log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
+    }
+  }
+}
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
new file mode 100644
index 0000000..1e767f2
--- /dev/null
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -0,0 +1,37 @@
+package pl.clarin.speller;
+
+import org.languagetool.JLanguageTool;
+import org.languagetool.rules.RuleMatch;
+
+import java.util.List;
+
+public class TextEdit {
+  public static String edit(String wrong_input, JLanguageTool langTool) throws Exception {
+    char[] buffer = wrong_input.toCharArray();
+    StringBuilder sb = new StringBuilder();
+    List<RuleMatch> matches = langTool.check(wrong_input);
+    int string_index = 0;
+    for (RuleMatch match : matches) {
+      int match_from = match.getFromPos();
+      int match_to = match.getToPos();
+
+      if (match_from > string_index) {
+        sb.append(buffer, string_index, (match_from - string_index));
+      } else if (match_from < string_index) {
+        throw new Exception("RuleMatches are not sorted for some reason.");
+      }
+      String matching_word;
+      if (match.getSuggestedReplacements().isEmpty()) {
+        // divide into words and fix
+        matching_word = wrong_input.substring(match_from, match_to);
+      } else {
+        matching_word = match.getSuggestedReplacements().get(0);
+      }
+
+      sb.append(matching_word.toCharArray(), 0, (matching_word.length()));
+      string_index = match_to;
+    }
+    sb.append(buffer, string_index, (buffer.length - string_index));
+    return sb.toString();
+  }
+}
-- 
GitLab


From 3a4c34e63ca6e6b3b95fdaaec34ae85334de7fc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 12:29:06 +0200
Subject: [PATCH 02/35] Added gitlab-ci, refactored code.

---
 .gitlab-ci.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 .gitlab-ci.yml

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..a70340f
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,15 @@
+build_image:
+  stage: build
+  image: 'docker:18.09.7'
+  only:
+    - master
+  services:
+    - 'docker:18.09.7-dind'
+  before_script:
+    - ''
+  script:
+    - docker build -t clarinpl/liner2 .
+    - echo $DOCKER_PASSWORD > pass.txt
+    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
+    - rm pass.txt
+    - docker push clarinpl/liner2
-- 
GitLab


From da9ab975cb6c667a84c533579aa032ae03c95f82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 12:43:48 +0200
Subject: [PATCH 03/35] Deleted README, added stages to gitlab-ci

---
 .gitlab-ci.yml                                     |  3 +++
 README.md                                          |  2 --
 .../src/main/java/pl/clarin/speller/Speller.java   | 14 +++++---------
 .../src/main/java/pl/clarin/speller/TextEdit.java  |  1 -
 4 files changed, 8 insertions(+), 12 deletions(-)
 delete mode 100644 README.md

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a70340f..535ac7d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,3 +1,6 @@
+stages:
+  - build
+
 build_image:
   stage: build
   image: 'docker:18.09.7'
diff --git a/README.md b/README.md
deleted file mode 100644
index e2e02ef..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# speller
-
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index f924042..a4bab37 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -1,21 +1,18 @@
 package pl.clarin.speller;
 
-import java.io.*;
-
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import org.languagetool.JLanguageTool;
-
 import org.json.JSONObject;
+import org.languagetool.JLanguageTool;
 import org.languagetool.language.Polish;
 import pl.clarin.ws.worker.IniFile;
 import pl.clarin.ws.worker.Service;
 import pl.clarin.ws.worker.Worker;
 
+import java.io.*;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
 public class Speller extends Worker {
   // init object for each thread
-  static TextEdit textEdit = null;
   static JLanguageTool langTool = null;
 
   public static void main(String[] args) {
@@ -28,7 +25,6 @@ public class Speller extends Worker {
   // init objects shared by threads
   @Override
   public void static_init(IniFile init) throws Exception {
-    textEdit = new TextEdit();
     langTool = new JLanguageTool(new Polish());
   }
 
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 1e767f2..d2c8fc6 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -22,7 +22,6 @@ public class TextEdit {
       }
       String matching_word;
       if (match.getSuggestedReplacements().isEmpty()) {
-        // divide into words and fix
         matching_word = wrong_input.substring(match_from, match_to);
       } else {
         matching_word = match.getSuggestedReplacements().get(0);
-- 
GitLab


From ef7478debc911af07a8a7558aa7edf4ea4289e83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:19:26 +0200
Subject: [PATCH 04/35] Test for pipeline.

---
 .gitlab-ci.yml                                |  9 ++++++
 .../main/java/pl/clarin/speller/Speller.java  | 16 ++++++----
 .../main/java/pl/clarin/speller/TextEdit.java | 30 +++++++++----------
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 535ac7d..e3f14cb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,6 +1,15 @@
 stages:
+  - check_style
   - build
 
+checkstyle:
+  image: clarinpl/openjdk:8
+  stage: check_style
+  script:
+    - mvn checkstyle:checkstyle
+    - cat checkstyle-result.xml
+  allow_failure: false
+  
 build_image:
   stage: build
   image: 'docker:18.09.7'
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index a4bab37..5c2c474 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -1,5 +1,13 @@
 package pl.clarin.speller;
 
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
 import org.json.JSONObject;
 import org.languagetool.JLanguageTool;
 import org.languagetool.language.Polish;
@@ -7,9 +15,7 @@ import pl.clarin.ws.worker.IniFile;
 import pl.clarin.ws.worker.Service;
 import pl.clarin.ws.worker.Worker;
 
-import java.io.*;
-import java.util.logging.Level;
-import java.util.logging.Logger;
+
 
 public class Speller extends Worker {
   // init object for each thread
@@ -40,8 +46,8 @@ public class Speller extends Worker {
         String line = null;
         while ((line = br.readLine()) != null) {
           try {
-            String corrected_line = TextEdit.edit(line, langTool);
-            sb.append(corrected_line).append('\n');
+            String correctedLine = TextEdit.edit(line, langTool);
+            sb.append(correctedLine).append('\n');
           } catch (Exception exception) {
             Logger.getLogger(Speller.class.getName())
                 .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception);
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index d2c8fc6..102dafd 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -1,34 +1,34 @@
 package pl.clarin.speller;
 
+import java.util.List;
 import org.languagetool.JLanguageTool;
 import org.languagetool.rules.RuleMatch;
 
-import java.util.List;
-
 public class TextEdit {
-  public static String edit(String wrong_input, JLanguageTool langTool) throws Exception {
-    char[] buffer = wrong_input.toCharArray();
+  // class that corrects input text
+  public static String edit(String inputString, JLanguageTool langTool) throws Exception {
+    char[] buffer = inputString.toCharArray();
     StringBuilder sb = new StringBuilder();
-    List<RuleMatch> matches = langTool.check(wrong_input);
+    List<RuleMatch> matches = langTool.check(inputString);
     int string_index = 0;
     for (RuleMatch match : matches) {
-      int match_from = match.getFromPos();
-      int match_to = match.getToPos();
+      int matchFrom = match.getFromPos();
+      int matchTo = match.getToPos();
 
-      if (match_from > string_index) {
-        sb.append(buffer, string_index, (match_from - string_index));
-      } else if (match_from < string_index) {
+      if (matchFrom > string_index) {
+        sb.append(buffer, string_index, (matchFrom - string_index));
+      } else if (matchFrom < string_index) {
         throw new Exception("RuleMatches are not sorted for some reason.");
       }
-      String matching_word;
+      String matchingWord;
       if (match.getSuggestedReplacements().isEmpty()) {
-        matching_word = wrong_input.substring(match_from, match_to);
+        matchingWord = inputString.substring(matchFrom, matchTo);
       } else {
-        matching_word = match.getSuggestedReplacements().get(0);
+        matchingWord = match.getSuggestedReplacements().get(0);
       }
 
-      sb.append(matching_word.toCharArray(), 0, (matching_word.length()));
-      string_index = match_to;
+      sb.append(matchingWord.toCharArray(), 0, (matchingWord.length()));
+      string_index = matchTo;
     }
     sb.append(buffer, string_index, (buffer.length - string_index));
     return sb.toString();
-- 
GitLab


From e9f48367e2ba4e27b68fcb4292733a76fbfd6657 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:22:04 +0200
Subject: [PATCH 05/35] Pipeline test 2.

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e3f14cb..2301573 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -6,6 +6,7 @@ checkstyle:
   image: clarinpl/openjdk:8
   stage: check_style
   script:
+    - cd module
     - mvn checkstyle:checkstyle
     - cat checkstyle-result.xml
   allow_failure: false
-- 
GitLab


From 5b03055c034a13c891a3c7e0bc96f4b7c1d3c74d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:25:51 +0200
Subject: [PATCH 06/35] Test 3

---
 .gitlab-ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2301573..39ea5a8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,13 +2,13 @@ stages:
   - check_style
   - build
 
-checkstyle:
-  image: clarinpl/openjdk:8
+ checkstyle-lint:
+   image: clarinpl/openjdk:8
   stage: check_style
+  before_script:
+    - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
   script:
-    - cd module
-    - mvn checkstyle:checkstyle
-    - cat checkstyle-result.xml
+    - java -jar checkstyle-8.26-all.jar -c google_checks.xml src
   allow_failure: false
   
 build_image:
-- 
GitLab


From def18b0a25501991c623b310cbb221ea936dcf1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:28:48 +0200
Subject: [PATCH 07/35] Test 4.

---
 .gitlab-ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 39ea5a8..a01bdc9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,16 +1,16 @@
 stages:
   - check_style
   - build
-
- checkstyle-lint:
-   image: clarinpl/openjdk:8
+checkstyle-lint:
+  image: 'clarinpl/openjdk:8'
   stage: check_style
   before_script:
-    - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
+    - >-
+      curl -OL
+      https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
   script:
     - java -jar checkstyle-8.26-all.jar -c google_checks.xml src
   allow_failure: false
-  
 build_image:
   stage: build
   image: 'docker:18.09.7'
-- 
GitLab


From ab889f6145f4b53a25e79bf6bb58f9d5a4606b50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:31:09 +0200
Subject: [PATCH 08/35] Test 5

---
 .gitlab-ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a01bdc9..764fc6f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,16 +1,16 @@
 stages:
   - check_style
   - build
-checkstyle-lint:
-  image: 'clarinpl/openjdk:8'
+
+ checkstyle-lint:
+  image: clarinpl/openjdk:8
   stage: check_style
   before_script:
-    - >-
-      curl -OL
-      https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
+    - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
   script:
     - java -jar checkstyle-8.26-all.jar -c google_checks.xml src
   allow_failure: false
+  
 build_image:
   stage: build
   image: 'docker:18.09.7'
-- 
GitLab


From 4ac33e10e6e34e71e64a2d5894f452da29d45a1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:34:33 +0200
Subject: [PATCH 09/35] Test 6

---
 .gitlab-ci.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 764fc6f..bf13a1c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,13 +2,14 @@ stages:
   - check_style
   - build
 
- checkstyle-lint:
+checkstyle-lint:
   image: clarinpl/openjdk:8
   stage: check_style
   before_script:
+    - sudo apt install curl
     - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
   script:
-    - java -jar checkstyle-8.26-all.jar -c google_checks.xml src
+    - java -jar checkstyle-8.26-all.jar -c google_checks.xml module/src
   allow_failure: false
   
 build_image:
-- 
GitLab


From cda995d481eefddb3213080788aabe612980474a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:36:46 +0200
Subject: [PATCH 10/35] Test 7

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index bf13a1c..0431e52 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,12 +1,12 @@
+image: clarinpl/openjdk:8
+
 stages:
   - check_style
   - build
 
 checkstyle-lint:
-  image: clarinpl/openjdk:8
   stage: check_style
   before_script:
-    - sudo apt install curl
     - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
   script:
     - java -jar checkstyle-8.26-all.jar -c google_checks.xml module/src
-- 
GitLab


From a6500b2071a08d4f287627d1f8216f46c4524693 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:37:39 +0200
Subject: [PATCH 11/35] Test 8

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 0431e52..e207e77 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: clarinpl/openjdk:8
+image: openjdk:latest
 
 stages:
   - check_style
-- 
GitLab


From fc9b7ec05635005330c15f0f0a5259f77985a059 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:46:46 +0200
Subject: [PATCH 12/35] Test 9 .

---
 .gitlab-ci.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e207e77..a11d1d6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: openjdk:latest
+image: clarinpl/openjdk:8
 
 stages:
   - check_style
@@ -6,10 +6,9 @@ stages:
 
 checkstyle-lint:
   stage: check_style
-  before_script:
-    - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar
   script:
-    - java -jar checkstyle-8.26-all.jar -c google_checks.xml module/src
+    - cd module
+    - mvn checkstyle:checkstyle
   allow_failure: false
   
 build_image:
-- 
GitLab


From c4190b3036887e95bc3a9a2811bcf356e4b6605a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 13:50:07 +0200
Subject: [PATCH 13/35] Test 10

---
 .gitlab-ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a11d1d6..09f1d2a 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,6 +9,7 @@ checkstyle-lint:
   script:
     - cd module
     - mvn checkstyle:checkstyle
+    - cat target/checkstyle-result.xml
   allow_failure: false
   
 build_image:
-- 
GitLab


From c35104d62e4e641ccd0c44b16074ebccd245e47a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 14:08:29 +0200
Subject: [PATCH 14/35] Test 11

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 09f1d2a..940008b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,7 +8,7 @@ checkstyle-lint:
   stage: check_style
   script:
     - cd module
-    - mvn checkstyle:checkstyle
+    - mvn checkstyle:check
     - cat target/checkstyle-result.xml
   allow_failure: false
   
-- 
GitLab


From 0c6d507f4a5a40cf41268f29d5ed9191e234711b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 14:20:09 +0200
Subject: [PATCH 15/35] Working spellchecker?

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 940008b..20ea932 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,7 +8,7 @@ checkstyle-lint:
   stage: check_style
   script:
     - cd module
-    - mvn checkstyle:check
+    - mvn checkstyle:check -DconfigLocation=google_checks.xml
     - cat target/checkstyle-result.xml
   allow_failure: false
   
-- 
GitLab


From ab73641a2fdc83d2036a741f063448f0ed093cbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 14:29:48 +0200
Subject: [PATCH 16/35] Changed ruleset to be google java style.

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 20ea932..ca787b9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,7 +8,7 @@ checkstyle-lint:
   stage: check_style
   script:
     - cd module
-    - mvn checkstyle:check -DconfigLocation=google_checks.xml
+    - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml
     - cat target/checkstyle-result.xml
   allow_failure: false
   
-- 
GitLab


From b6fc7143fbc870d9231c7ad65cb838d254d52f41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 14:41:04 +0200
Subject: [PATCH 17/35] Changed job name.

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ca787b9..9336cf5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,7 +4,7 @@ stages:
   - check_style
   - build
 
-checkstyle-lint:
+google_checks:
   stage: check_style
   script:
     - cd module
-- 
GitLab


From 20f9085c5295f7f5acf96fd2be9ac8c70e415952 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 14:52:07 +0200
Subject: [PATCH 18/35] Removed redundant line from -ci

---
 .gitlab-ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9336cf5..3676e88 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -10,7 +10,6 @@ google_checks:
     - cd module
     - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml
     - cat target/checkstyle-result.xml
-  allow_failure: false
   
 build_image:
   stage: build
-- 
GitLab


From 34fac3ba592283fcaf57b389c67946be5a0284ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 16:39:22 +0200
Subject: [PATCH 19/35] Changed violation severity to warning.

---
 .gitlab-ci.yml                                       |  3 +--
 module/src/main/java/pl/clarin/speller/TextEdit.java | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3676e88..3cb8b5b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,8 +8,7 @@ google_checks:
   stage: check_style
   script:
     - cd module
-    - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml
-    - cat target/checkstyle-result.xml
+    - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true
   
 build_image:
   stage: build
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 102dafd..33cbf0e 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -10,14 +10,14 @@ public class TextEdit {
     char[] buffer = inputString.toCharArray();
     StringBuilder sb = new StringBuilder();
     List<RuleMatch> matches = langTool.check(inputString);
-    int string_index = 0;
+    int stringIndex = 0;
     for (RuleMatch match : matches) {
       int matchFrom = match.getFromPos();
       int matchTo = match.getToPos();
 
-      if (matchFrom > string_index) {
-        sb.append(buffer, string_index, (matchFrom - string_index));
-      } else if (matchFrom < string_index) {
+      if (matchFrom > stringIndex) {
+        sb.append(buffer, stringIndex, (matchFrom - stringIndex));
+      } else if (matchFrom < stringIndex) {
         throw new Exception("RuleMatches are not sorted for some reason.");
       }
       String matchingWord;
@@ -28,9 +28,9 @@ public class TextEdit {
       }
 
       sb.append(matchingWord.toCharArray(), 0, (matchingWord.length()));
-      string_index = matchTo;
+      stringIndex = matchTo;
     }
-    sb.append(buffer, string_index, (buffer.length - string_index));
+    sb.append(buffer, stringIndex, (buffer.length - stringIndex));
     return sb.toString();
   }
 }
-- 
GitLab


From 634616f4a54defba76df9bd73ed48cfc76b5f2f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 16:44:49 +0200
Subject: [PATCH 20/35] Fixed checkstyle violations.

---
 module/src/main/java/pl/clarin/speller/Speller.java  | 4 +++-
 module/src/main/java/pl/clarin/speller/TextEdit.java | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 5c2c474..701a5d1 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -26,7 +26,9 @@ public class Speller extends Worker {
   }
 
   @Override
-  public void init() throws Exception {}
+  public void init() throws Exception {
+
+  }
 
   // init objects shared by threads
   @Override
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 33cbf0e..71bb6fb 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -5,7 +5,7 @@ import org.languagetool.JLanguageTool;
 import org.languagetool.rules.RuleMatch;
 
 public class TextEdit {
-  // class that corrects input text
+  /**Class that corrects input text.*/
   public static String edit(String inputString, JLanguageTool langTool) throws Exception {
     char[] buffer = inputString.toCharArray();
     StringBuilder sb = new StringBuilder();
-- 
GitLab


From a66747642bcf44f7e6a34ac6c2afd2476f3dfc0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 16:53:29 +0200
Subject: [PATCH 21/35] Example commit with violation.

---
 module/src/main/java/pl/clarin/speller/Speller.java | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 701a5d1..ba05c94 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -26,9 +26,7 @@ public class Speller extends Worker {
   }
 
   @Override
-  public void init() throws Exception {
-
-  }
+  public void init() throws Exception { }
 
   // init objects shared by threads
   @Override
-- 
GitLab


From f48aee568bd492a1d2faa16f6b77e586075d565e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 20 Jul 2020 16:54:00 +0200
Subject: [PATCH 22/35] Corrected example violation.

---
 module/src/main/java/pl/clarin/speller/Speller.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index ba05c94..0010e66 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -26,7 +26,9 @@ public class Speller extends Worker {
   }
 
   @Override
-  public void init() throws Exception { }
+  public void init() throws Exception {
+    
+  }
 
   // init objects shared by threads
   @Override
-- 
GitLab


From 047c02f0579115a31d4ca1ab6a496f386002f7dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Tue, 21 Jul 2020 15:40:05 +0200
Subject: [PATCH 23/35] Improved accuracy of speller by adding rules when not
 to change a word.

---
 .../main/java/pl/clarin/speller/Speller.java  |   2 +-
 .../main/java/pl/clarin/speller/TextEdit.java | 168 +++++++++++++++++-
 2 files changed, 167 insertions(+), 3 deletions(-)

diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 0010e66..701a5d1 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -27,7 +27,7 @@ public class Speller extends Worker {
 
   @Override
   public void init() throws Exception {
-    
+
   }
 
   // init objects shared by threads
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 71bb6fb..654a49b 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -1,6 +1,8 @@
 package pl.clarin.speller;
 
+import java.nio.charset.Charset;
 import java.util.List;
+
 import org.languagetool.JLanguageTool;
 import org.languagetool.rules.RuleMatch;
 
@@ -24,13 +26,175 @@ public class TextEdit {
       if (match.getSuggestedReplacements().isEmpty()) {
         matchingWord = inputString.substring(matchFrom, matchTo);
       } else {
-        matchingWord = match.getSuggestedReplacements().get(0);
+        boolean change = toChange(inputString,matchFrom,matchTo);
+        if (change){
+          matchingWord = match.getSuggestedReplacements().get(0);
+        }else{
+          matchingWord = inputString.substring(matchFrom, matchTo);
+        }
       }
-
       sb.append(matchingWord.toCharArray(), 0, (matchingWord.length()));
       stringIndex = matchTo;
     }
     sb.append(buffer, stringIndex, (buffer.length - stringIndex));
     return sb.toString();
   }
+
+  private static boolean toChange(String inputString, int matchFrom, int matchTo){
+    if(isProperNoun(inputString,matchFrom) || isAcronym(inputString,matchFrom,matchTo)
+            || isFileOrExtension(inputString,matchFrom,matchTo) || checkFirstLetter(inputString,matchFrom)
+            || isNotPolish(inputString, matchFrom, matchTo) || isSurname(inputString, matchFrom, matchTo)){
+      return false;
+    }else{
+      return true;
+    }
+  }
+
+  private static boolean isSurname(String inputString, int matchFrom, int matchTo){
+    boolean isSurname = false;
+    int i = matchFrom;
+    if((i-4) > 0 && Character.isUpperCase(inputString.charAt(i))
+            && Character.isWhitespace(inputString.charAt(i-1))
+            && (inputString.charAt(i-2)) == '.'
+            && Character.isUpperCase(inputString.charAt(i-3))){
+      i = i - 3;
+      while(i > 0 && Character.isUpperCase(inputString.charAt(i))){
+        --i;
+      }
+      if(i == 0 || Character.isWhitespace(inputString.charAt(i))){
+        i = matchFrom + 1;
+        isSurname = true;
+        while(i < matchTo){
+          if(!Character.isLowerCase(inputString.charAt(i)) && !(i == matchTo-1 && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)){
+            isSurname = false;
+          }
+          ++i;
+        }
+      }
+    }
+    return isSurname;
+  }
+
+  private static boolean isProperNoun(String inputString, int matchFrom){
+    boolean isProperNoun = false;
+    if (Character.isUpperCase(inputString.charAt(matchFrom))) {
+      isProperNoun = false;
+      int i = matchFrom - 1;
+      while (i > 0) {
+        if (Character.isLetterOrDigit(inputString.charAt(i))) {
+          isProperNoun = true;
+          break;
+        } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
+          break;
+        }
+        --i;
+      }
+    }
+    return isProperNoun;
+  }
+
+  private static boolean isAcronym(String inputString, int matchFrom, int matchTo){
+    double breakPoint = 0.6;
+    boolean isAcronym = false;
+    int lowerCaseLetters = 0;
+    if (Character.isUpperCase(inputString.charAt(matchFrom)) || Character.isDigit(inputString.charAt(matchFrom))) {
+      int i = matchFrom + 1;
+      while(i < matchTo){
+        if(Character.isLowerCase(inputString.charAt(i))){
+          ++lowerCaseLetters;
+        }
+        ++i;
+      }
+      if(((matchTo-matchFrom) * breakPoint) >= lowerCaseLetters){
+          isAcronym = true;
+        }
+    }
+    return isAcronym;
+  }
+
+  private static boolean isNotPolish(String inputString, int matchFrom, int matchTo){
+    String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ";
+    boolean isNotPolish = false;
+    for (int i = matchFrom; i < matchTo; ++i){
+      if(!isAscii(inputString.charAt(i))){
+        if(polishLetterSet.indexOf(inputString.charAt(i)) == -1){
+          isNotPolish = true;
+          break;
+        }
+      }
+    }
+    return isNotPolish;
+  }
+
+  public static boolean isAscii(Character v) {
+    return Charset.forName("US-ASCII").newEncoder().canEncode(v);
+  }
+
+  private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo){
+    String character_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()";
+    int extension_length = 4;
+    boolean isFileOrExtension = false;
+    boolean dot = false;
+    int lastIndex = lastIndex(inputString,matchFrom,matchTo);
+    int firstIndex = firstIndex(inputString,matchFrom);
+    int i = firstIndex + 1;
+    while (i < lastIndex) {
+      char current_char = inputString.charAt(i);
+      if(character_list.indexOf(current_char) == -1) {
+        if(current_char == '.'){
+          dot = true;
+          ++i;
+          int j = extension_length;
+          while(i < lastIndex){
+            current_char = inputString.charAt(i);
+            if(character_list.indexOf(current_char) == -1){
+              if(current_char == '.'){
+                j = extension_length;
+              } else if(!(i == lastIndex-1 && ".?!,".indexOf(current_char) != -1)){
+                break;
+              }
+            }
+            --j;
+            if (j < 0){
+              break;
+            }
+            ++i;
+          }
+        }
+        break;
+      }
+      ++i;
+    }
+    if (dot && i == lastIndex){
+      isFileOrExtension = true;
+    }
+    return isFileOrExtension;
+  }
+
+  private static int lastIndex(String inputString, int matchFrom, int matchTo){
+    int i = matchFrom;
+    while(i < inputString.length() && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)){
+      ++i;
+    }
+    return i;
+  }
+
+  private static int firstIndex(String inputString, int matchFrom){
+    int i = matchFrom-1;
+    if(i >= 0 && inputString.charAt(i) == '.'){
+      while(i >= 0 && !Character.isWhitespace(inputString.charAt(i))){
+        --i;
+      }
+    }
+    return i;
+  }
+
+  private static boolean checkFirstLetter(String inputString, int at){
+    String leaveCharacters = "§";
+    if(leaveCharacters.indexOf(inputString.charAt(at)) != -1){
+      return true;
+    }
+    return false;
+  }
+
 }
-- 
GitLab


From 509e9f282774ed45da015560cffdddd603c75a81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Tue, 21 Jul 2020 16:18:36 +0200
Subject: [PATCH 24/35] Fixed checkstyle violations.

---
 .../main/java/pl/clarin/speller/TextEdit.java | 110 +++++++++---------
 1 file changed, 58 insertions(+), 52 deletions(-)

diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 654a49b..f0a7f49 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -2,7 +2,6 @@ package pl.clarin.speller;
 
 import java.nio.charset.Charset;
 import java.util.List;
-
 import org.languagetool.JLanguageTool;
 import org.languagetool.rules.RuleMatch;
 
@@ -27,9 +26,9 @@ public class TextEdit {
         matchingWord = inputString.substring(matchFrom, matchTo);
       } else {
         boolean change = toChange(inputString,matchFrom,matchTo);
-        if (change){
+        if (change) {
           matchingWord = match.getSuggestedReplacements().get(0);
-        }else{
+        } else {
           matchingWord = inputString.substring(matchFrom, matchTo);
         }
       }
@@ -40,32 +39,36 @@ public class TextEdit {
     return sb.toString();
   }
 
-  private static boolean toChange(String inputString, int matchFrom, int matchTo){
-    if(isProperNoun(inputString,matchFrom) || isAcronym(inputString,matchFrom,matchTo)
-            || isFileOrExtension(inputString,matchFrom,matchTo) || checkFirstLetter(inputString,matchFrom)
-            || isNotPolish(inputString, matchFrom, matchTo) || isSurname(inputString, matchFrom, matchTo)){
+  private static boolean toChange(String inputString, int matchFrom, int matchTo) {
+    if (isProperNoun(inputString,matchFrom)
+            || isAcronym(inputString,matchFrom,matchTo)
+            || isFileOrExtension(inputString,matchFrom,matchTo)
+            || checkFirstLetter(inputString,matchFrom)
+            || isNotPolish(inputString, matchFrom, matchTo)
+            || isSurname(inputString, matchFrom, matchTo)) {
       return false;
-    }else{
+    } else {
       return true;
     }
   }
 
-  private static boolean isSurname(String inputString, int matchFrom, int matchTo){
+  private static boolean isSurname(String inputString, int matchFrom, int matchTo) {
     boolean isSurname = false;
     int i = matchFrom;
-    if((i-4) > 0 && Character.isUpperCase(inputString.charAt(i))
-            && Character.isWhitespace(inputString.charAt(i-1))
-            && (inputString.charAt(i-2)) == '.'
-            && Character.isUpperCase(inputString.charAt(i-3))){
+    if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i))
+            && Character.isWhitespace(inputString.charAt(i - 1))
+            && (inputString.charAt(i - 2)) == '.'
+            && Character.isUpperCase(inputString.charAt(i - 3))) {
       i = i - 3;
-      while(i > 0 && Character.isUpperCase(inputString.charAt(i))){
+      while (i > 0 && Character.isUpperCase(inputString.charAt(i))) {
         --i;
       }
-      if(i == 0 || Character.isWhitespace(inputString.charAt(i))){
+      if (i == 0 || Character.isWhitespace(inputString.charAt(i))) {
         i = matchFrom + 1;
         isSurname = true;
-        while(i < matchTo){
-          if(!Character.isLowerCase(inputString.charAt(i)) && !(i == matchTo-1 && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)){
+        while (i < matchTo) {
+          if (!Character.isLowerCase(inputString.charAt(i))
+                  && !(i == (matchTo - 1) && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)) {
             isSurname = false;
           }
           ++i;
@@ -75,7 +78,7 @@ public class TextEdit {
     return isSurname;
   }
 
-  private static boolean isProperNoun(String inputString, int matchFrom){
+  private static boolean isProperNoun(String inputString, int matchFrom) {
     boolean isProperNoun = false;
     if (Character.isUpperCase(inputString.charAt(matchFrom))) {
       isProperNoun = false;
@@ -93,31 +96,32 @@ public class TextEdit {
     return isProperNoun;
   }
 
-  private static boolean isAcronym(String inputString, int matchFrom, int matchTo){
+  private static boolean isAcronym(String inputString, int matchFrom, int matchTo) {
     double breakPoint = 0.6;
     boolean isAcronym = false;
     int lowerCaseLetters = 0;
-    if (Character.isUpperCase(inputString.charAt(matchFrom)) || Character.isDigit(inputString.charAt(matchFrom))) {
+    if (Character.isUpperCase(inputString.charAt(matchFrom))
+            || Character.isDigit(inputString.charAt(matchFrom))) {
       int i = matchFrom + 1;
-      while(i < matchTo){
-        if(Character.isLowerCase(inputString.charAt(i))){
+      while (i < matchTo) {
+        if (Character.isLowerCase(inputString.charAt(i))) {
           ++lowerCaseLetters;
         }
         ++i;
       }
-      if(((matchTo-matchFrom) * breakPoint) >= lowerCaseLetters){
-          isAcronym = true;
-        }
+      if (((matchTo - matchFrom) * breakPoint) >= lowerCaseLetters) {
+        isAcronym = true;
+      }
     }
     return isAcronym;
   }
 
-  private static boolean isNotPolish(String inputString, int matchFrom, int matchTo){
+  private static boolean isNotPolish(String inputString, int matchFrom, int matchTo) {
     String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ";
     boolean isNotPolish = false;
-    for (int i = matchFrom; i < matchTo; ++i){
-      if(!isAscii(inputString.charAt(i))){
-        if(polishLetterSet.indexOf(inputString.charAt(i)) == -1){
+    for (int i = matchFrom; i < matchTo; ++i) {
+      if (!isAscii(inputString.charAt(i))) {
+        if (polishLetterSet.indexOf(inputString.charAt(i)) == -1) {
           isNotPolish = true;
           break;
         }
@@ -130,32 +134,33 @@ public class TextEdit {
     return Charset.forName("US-ASCII").newEncoder().canEncode(v);
   }
 
-  private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo){
-    String character_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()";
-    int extension_length = 4;
+  private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) {
+    String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG"
+            + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()";
+    int extensionLength = 4;
     boolean isFileOrExtension = false;
     boolean dot = false;
     int lastIndex = lastIndex(inputString,matchFrom,matchTo);
     int firstIndex = firstIndex(inputString,matchFrom);
     int i = firstIndex + 1;
     while (i < lastIndex) {
-      char current_char = inputString.charAt(i);
-      if(character_list.indexOf(current_char) == -1) {
-        if(current_char == '.'){
+      char currentChar = inputString.charAt(i);
+      if (characterList.indexOf(currentChar) == -1) {
+        if (currentChar == '.') {
           dot = true;
           ++i;
-          int j = extension_length;
-          while(i < lastIndex){
-            current_char = inputString.charAt(i);
-            if(character_list.indexOf(current_char) == -1){
-              if(current_char == '.'){
-                j = extension_length;
-              } else if(!(i == lastIndex-1 && ".?!,".indexOf(current_char) != -1)){
+          int j = extensionLength;
+          while (i < lastIndex) {
+            currentChar = inputString.charAt(i);
+            if (characterList.indexOf(currentChar) == -1) {
+              if (currentChar == '.') {
+                j = extensionLength;
+              } else if (!(i == lastIndex - 1 && ".?!,".indexOf(currentChar) != -1)) {
                 break;
               }
             }
             --j;
-            if (j < 0){
+            if (j < 0) {
               break;
             }
             ++i;
@@ -165,33 +170,34 @@ public class TextEdit {
       }
       ++i;
     }
-    if (dot && i == lastIndex){
+    if (dot && i == lastIndex) {
       isFileOrExtension = true;
     }
     return isFileOrExtension;
   }
 
-  private static int lastIndex(String inputString, int matchFrom, int matchTo){
+  private static int lastIndex(String inputString, int matchFrom, int matchTo) {
     int i = matchFrom;
-    while(i < inputString.length() && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)){
+    while (i < inputString.length()
+            && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) {
       ++i;
     }
     return i;
   }
 
-  private static int firstIndex(String inputString, int matchFrom){
-    int i = matchFrom-1;
-    if(i >= 0 && inputString.charAt(i) == '.'){
-      while(i >= 0 && !Character.isWhitespace(inputString.charAt(i))){
+  private static int firstIndex(String inputString, int matchFrom) {
+    int i = matchFrom - 1;
+    if (i >= 0 && inputString.charAt(i) == '.') {
+      while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) {
         --i;
       }
     }
     return i;
   }
 
-  private static boolean checkFirstLetter(String inputString, int at){
+  private static boolean checkFirstLetter(String inputString, int at) {
     String leaveCharacters = "§";
-    if(leaveCharacters.indexOf(inputString.charAt(at)) != -1){
+    if (leaveCharacters.indexOf(inputString.charAt(at)) != -1) {
       return true;
     }
     return false;
-- 
GitLab


From 14968dc33dc0a2c1900edcd2df6848af597c1447 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Fri, 24 Jul 2020 16:15:05 +0200
Subject: [PATCH 25/35] Moved JLanguageTool to init, left lang initialization
 in static init.

---
 module/src/main/java/pl/clarin/speller/Speller.java | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 701a5d1..403dfe9 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -15,25 +15,23 @@ import pl.clarin.ws.worker.IniFile;
 import pl.clarin.ws.worker.Service;
 import pl.clarin.ws.worker.Worker;
 
-
-
 public class Speller extends Worker {
-  // init object for each thread
-  static JLanguageTool langTool = null;
+  static Polish lang = null;
 
   public static void main(String[] args) {
     new Service<>(Speller.class);
   }
 
+  // init object for each thread
   @Override
   public void init() throws Exception {
-
+    JLanguageTool langTool = new JLanguageTool(lang);
   }
 
   // init objects shared by threads
   @Override
   public void static_init(IniFile init) throws Exception {
-    langTool = new JLanguageTool(new Polish());
+    lang = new Polish();
   }
 
   @Override
-- 
GitLab


From 360eeeb40422e82e8820a245a7b79bbdbad20114 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 27 Jul 2020 08:49:53 +0200
Subject: [PATCH 26/35] Moved langTool variable declaration.

---
 module/src/main/java/pl/clarin/speller/Speller.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 403dfe9..d57ece2 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -17,6 +17,7 @@ import pl.clarin.ws.worker.Worker;
 
 public class Speller extends Worker {
   static Polish lang = null;
+  JLanguageTool langTool = null;
 
   public static void main(String[] args) {
     new Service<>(Speller.class);
@@ -25,7 +26,7 @@ public class Speller extends Worker {
   // init object for each thread
   @Override
   public void init() throws Exception {
-    JLanguageTool langTool = new JLanguageTool(lang);
+    langTool = new JLanguageTool(lang);
   }
 
   // init objects shared by threads
-- 
GitLab


From a27bf30251d9c8071aa81132a07ae5a2473c41ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 27 Jul 2020 12:29:29 +0200
Subject: [PATCH 27/35] Fixed build on master.

---
 .gitlab-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3cb8b5b..af39c1c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -20,8 +20,8 @@ build_image:
   before_script:
     - ''
   script:
-    - docker build -t clarinpl/liner2 .
+    - docker build -t clarinpl/speller .
     - echo $DOCKER_PASSWORD > pass.txt
     - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
     - rm pass.txt
-    - docker push clarinpl/liner2
+    - docker push clarinpl/speller
-- 
GitLab


From c472f4a80b4743c9fa20dcb3d95342763b838b63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Fri, 31 Jul 2020 10:13:39 +0200
Subject: [PATCH 28/35] Added language detection using spaCy.

---
 Dockerfile                                    |   9 +-
 SpacyDocConvert.py                            |   8 ++
 module/pom.xml                                |   5 +
 .../main/java/pl/clarin/speller/SpaCy.java    | 119 ++++++++++++++++++
 .../main/java/pl/clarin/speller/Speller.java  |   4 +-
 .../main/java/pl/clarin/speller/TextEdit.java |  52 ++++----
 6 files changed, 168 insertions(+), 29 deletions(-)
 create mode 100644 SpacyDocConvert.py
 create mode 100644 module/src/main/java/pl/clarin/speller/SpaCy.java

diff --git a/Dockerfile b/Dockerfile
index 348cb06..ab231c7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,6 @@ FROM clarinpl/openjdk:8 as builder
 LABEL application="Speller"
 LABEL description="Client - Workers - correcting mistakes in sentances in txt files"
 LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology"
-LABEL maintainer="bartlomiej.koptyra@pwr.edu.pl"
 
 WORKDIR /home/install
 RUN cd nlp.worker && \
@@ -16,6 +15,14 @@ RUN cd module && \
     mvn clean && \
     mvn install
 
+FROM clarinpl/python:3.6
+
+WORKDIR /home/worker
+
+RUN python3.6 -m pip install jep
+RUN python3.6 -m pip install spacy
+RUN python3.6 -m spacy download pl_core_news_lg
+	
 FROM clarinpl/openjdk-jre:8
 
 WORKDIR /home/worker
diff --git a/SpacyDocConvert.py b/SpacyDocConvert.py
new file mode 100644
index 0000000..b33f1c8
--- /dev/null
+++ b/SpacyDocConvert.py
@@ -0,0 +1,8 @@
+def convert(spacyDoc, sentence):
+    idx = 0
+    proper_nouns_list = []
+    for tok in spacyDoc:
+        idx = sentence.find(tok.text, idx)
+        if tok.ent_type_ != '':
+            proper_nouns_list.append((idx, idx+len(tok.text)))
+    return proper_nouns_list
diff --git a/module/pom.xml b/module/pom.xml
index 62d8f94..2522945 100644
--- a/module/pom.xml
+++ b/module/pom.xml
@@ -30,6 +30,11 @@
             <artifactId>json</artifactId>
             <version>20141113</version>
         </dependency>
+        <dependency>
+            <groupId>black.ninia</groupId>
+            <artifactId>jep</artifactId>
+            <version>3.9.0</version>
+        </dependency>
     </dependencies>
 
     <build>
diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java
new file mode 100644
index 0000000..dccf91b
--- /dev/null
+++ b/module/src/main/java/pl/clarin/speller/SpaCy.java
@@ -0,0 +1,119 @@
+package pl.clarin.speller;
+
+import jep.SharedInterpreter;
+import jep.JepException;
+import org.languagetool.JLanguageTool;
+
+import java.lang.Integer;
+import java.util.ArrayList;
+import java.util.List;
+
+public class SpaCy {
+    boolean loaded = false;
+    boolean processed = false;
+    SharedInterpreter interp = null;
+    ArrayList<List<Long>> properNounArray = null;
+    ArrayList<List<Long>> foreignSentenceArray = null;
+    {
+        try{
+            interp = new SharedInterpreter();
+            interp.exec("import sys");
+            interp.exec("sys.path.append(r'/home/worker')");
+            interp.exec("import SpacyDocConvert");
+            interp.exec("sys.argv=[]");
+            interp.exec("import spacy");
+            interp.exec("from spacy_langdetect import LanguageDetector");
+            interp.exec("model = pl_core_news_lg");
+            interp.exec("nlp = spacy.load(model)");
+            interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)");
+            loaded = true;
+        }
+        catch (JepException e) {
+            System.out.println("An error occurred: " + e.getMessage());
+        }
+    }
+
+    public boolean isLoaded() {
+        return loaded;
+    }
+
+    public boolean isProcessed() {
+        return processed;
+    }
+
+    public void process(String inputString) throws Exception {
+        processed = false;
+        if(loaded){
+            try {
+                interp.exec("sentence = " + inputString);
+                interp.exec("spacyDoc = nlp(sentence)");
+                Object properNounList = interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)");
+                properNounArray = ((ArrayList<List<Long>>) properNounList);
+                Object foreignSentList = interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)");
+                foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList);
+                processed = true;
+            }
+            catch (JepException e) {
+                System.out.println("An error occurred: " + e.getMessage());
+            }
+        }
+    }
+
+    public boolean isForeignSentence(String inputString, int matchFrom) {
+        boolean isForeginSent = false;
+        if(processed) {
+            for (List<Long> tuple : foreignSentenceArray){
+                if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){
+                    if(Character.isUpperCase(inputString.charAt(matchFrom))){
+                        return true;
+                    }else{
+                        return false;
+                    }
+                }
+                if(matchFrom > tuple.get(1).intValue()){
+                    return false;
+                }
+            }
+        }
+        return isForeginSent;
+    }
+
+    public boolean isProperNoun(String inputString, int matchFrom) {
+        boolean isProperNoun = false;
+        if(!processed) {
+            if (Character.isUpperCase(inputString.charAt(matchFrom))) {
+                isProperNoun = false;
+                int i = matchFrom - 1;
+                while (i > 0) {
+                    if (Character.isLetterOrDigit(inputString.charAt(i))) {
+                        isProperNoun = true;
+                        break;
+                    } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
+                        break;
+                    }
+                    --i;
+                }
+            }
+            return isProperNoun;
+        }else{
+            for (List<Long> tuple : properNounArray){
+                if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){
+                    if(Character.isUpperCase(inputString.charAt(matchFrom))){
+                        return true;
+                    }else{
+                        return false;
+                    }
+                }
+                if(matchFrom > tuple.get(1).intValue()){
+                    return false;
+                }
+            }
+            return isProperNoun;
+        }
+    }
+}
+
+
+
+
+
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index d57ece2..1296761 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -18,6 +18,7 @@ import pl.clarin.ws.worker.Worker;
 public class Speller extends Worker {
   static Polish lang = null;
   JLanguageTool langTool = null;
+  TextEdit textEditor = null;
 
   public static void main(String[] args) {
     new Service<>(Speller.class);
@@ -27,6 +28,7 @@ public class Speller extends Worker {
   @Override
   public void init() throws Exception {
     langTool = new JLanguageTool(lang);
+    textEditor = new TextEdit();
   }
 
   // init objects shared by threads
@@ -47,7 +49,7 @@ public class Speller extends Worker {
         String line = null;
         while ((line = br.readLine()) != null) {
           try {
-            String correctedLine = TextEdit.edit(line, langTool);
+            String correctedLine = textEditor.edit(line, langTool);
             sb.append(correctedLine).append('\n');
           } catch (Exception exception) {
             Logger.getLogger(Speller.class.getName())
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index f0a7f49..a42c18b 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -6,8 +6,15 @@ import org.languagetool.JLanguageTool;
 import org.languagetool.rules.RuleMatch;
 
 public class TextEdit {
+  SpaCy spacy = new SpaCy();
   /**Class that corrects input text.*/
-  public static String edit(String inputString, JLanguageTool langTool) throws Exception {
+  public String edit(String inputString, JLanguageTool langTool) throws Exception {
+    try {
+      spacy.process(inputString);
+    }
+    catch(Exception e){
+      System.out.println("Spacy did not process the input correctly! : " + e.getMessage());
+    }
     char[] buffer = inputString.toCharArray();
     StringBuilder sb = new StringBuilder();
     List<RuleMatch> matches = langTool.check(inputString);
@@ -39,8 +46,9 @@ public class TextEdit {
     return sb.toString();
   }
 
-  private static boolean toChange(String inputString, int matchFrom, int matchTo) {
-    if (isProperNoun(inputString,matchFrom)
+  private boolean toChange(String inputString, int matchFrom, int matchTo) {
+    if (isForeignSentence(inputString,matchFrom)
+            || isProperNoun(inputString,matchFrom)
             || isAcronym(inputString,matchFrom,matchTo)
             || isFileOrExtension(inputString,matchFrom,matchTo)
             || checkFirstLetter(inputString,matchFrom)
@@ -52,7 +60,7 @@ public class TextEdit {
     }
   }
 
-  private static boolean isSurname(String inputString, int matchFrom, int matchTo) {
+  private boolean isSurname(String inputString, int matchFrom, int matchTo) {
     boolean isSurname = false;
     int i = matchFrom;
     if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i))
@@ -78,25 +86,15 @@ public class TextEdit {
     return isSurname;
   }
 
-  private static boolean isProperNoun(String inputString, int matchFrom) {
-    boolean isProperNoun = false;
-    if (Character.isUpperCase(inputString.charAt(matchFrom))) {
-      isProperNoun = false;
-      int i = matchFrom - 1;
-      while (i > 0) {
-        if (Character.isLetterOrDigit(inputString.charAt(i))) {
-          isProperNoun = true;
-          break;
-        } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
-          break;
-        }
-        --i;
-      }
-    }
-    return isProperNoun;
+  private boolean isForeignSentence(String inputString, int matchFrom) {
+    return spacy.isForeignSentence(inputString, matchFrom);
+  }
+
+  private boolean isProperNoun(String inputString, int matchFrom) {
+    return spacy.isProperNoun(inputString, matchFrom);
   }
 
-  private static boolean isAcronym(String inputString, int matchFrom, int matchTo) {
+  private boolean isAcronym(String inputString, int matchFrom, int matchTo) {
     double breakPoint = 0.6;
     boolean isAcronym = false;
     int lowerCaseLetters = 0;
@@ -116,7 +114,7 @@ public class TextEdit {
     return isAcronym;
   }
 
-  private static boolean isNotPolish(String inputString, int matchFrom, int matchTo) {
+  private boolean isNotPolish(String inputString, int matchFrom, int matchTo) {
     String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ";
     boolean isNotPolish = false;
     for (int i = matchFrom; i < matchTo; ++i) {
@@ -130,11 +128,11 @@ public class TextEdit {
     return isNotPolish;
   }
 
-  public static boolean isAscii(Character v) {
+  public boolean isAscii(Character v) {
     return Charset.forName("US-ASCII").newEncoder().canEncode(v);
   }
 
-  private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) {
+  private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) {
     String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG"
             + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()";
     int extensionLength = 4;
@@ -176,7 +174,7 @@ public class TextEdit {
     return isFileOrExtension;
   }
 
-  private static int lastIndex(String inputString, int matchFrom, int matchTo) {
+  private int lastIndex(String inputString, int matchFrom, int matchTo) {
     int i = matchFrom;
     while (i < inputString.length()
             && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) {
@@ -185,7 +183,7 @@ public class TextEdit {
     return i;
   }
 
-  private static int firstIndex(String inputString, int matchFrom) {
+  private int firstIndex(String inputString, int matchFrom) {
     int i = matchFrom - 1;
     if (i >= 0 && inputString.charAt(i) == '.') {
       while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) {
@@ -195,7 +193,7 @@ public class TextEdit {
     return i;
   }
 
-  private static boolean checkFirstLetter(String inputString, int at) {
+  private boolean checkFirstLetter(String inputString, int at) {
     String leaveCharacters = "§";
     if (leaveCharacters.indexOf(inputString.charAt(at)) != -1) {
       return true;
-- 
GitLab


From 8c8604c7d212da9d577a84523dc9b674304de61f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Fri, 31 Jul 2020 13:08:58 +0200
Subject: [PATCH 29/35] Added tox. Might not work yet.

---
 .gitlab-ci.yml     | 18 +++++++++++++++++-
 SpacyDocConvert.py | 29 +++++++++++++++++++++++++++--
 tox.ini            | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 tox.ini

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index af39c1c..9b653ad 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,5 +1,21 @@
-image: clarinpl/openjdk:8
+image: 'clarinpl/python:3.6'
+stages:
+  - check_style
+cache:
+  paths:
+    - .tox
+before_script:
+  - pip install tox==2.9.1
+pep8:
+  stage: check_style
+  script:
+    - tox -v -e pep8
+docstyle:
+  stage: check_style
+  script:
+    - tox -v -e docstyle
 
+image: clarinpl/openjdk:8
 stages:
   - check_style
   - build
diff --git a/SpacyDocConvert.py b/SpacyDocConvert.py
index b33f1c8..5987e4c 100644
--- a/SpacyDocConvert.py
+++ b/SpacyDocConvert.py
@@ -1,8 +1,33 @@
-def convert(spacyDoc, sentence):
+"""Utility file for using spacyDoc."""
+
+
+def find_proper_nouns(spacyDoc, sentence):
+    """Function returns indices of words that are proper nouns.
+
+    :param spacyDoc: SpacyDoc with ner tags.
+    :param sentence: Text spacyDoc is made from.
+    :return:list with proper nouns indices
+    """
     idx = 0
     proper_nouns_list = []
     for tok in spacyDoc:
         idx = sentence.find(tok.text, idx)
         if tok.ent_type_ != '':
-            proper_nouns_list.append((idx, idx+len(tok.text)))
+            proper_nouns_list.append((idx, idx + len(tok.text)))
     return proper_nouns_list
+
+
+def find_foreign_sentences(spacyDoc, text):
+    """Function returns indices of sentences that are not in Polish.
+
+    :param spacyDoc: SpacyDoc with ner tags.
+    :param text: Text spacyDoc is made from.
+    :return: list with foreign sentences indices
+    """
+    idx = 0
+    foreign_sentences_list = []
+    for sent in spacyDoc.sents:
+        if sent._.language.get('language', 'no') != 'pl':
+            idx = text.find(sent.text, idx)
+            foreign_sentences_list.append((idx, idx + len(sent.text)))
+    return foreign_sentences_list
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..819e612
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,44 @@
+[tox]
+envlist = pep8,docstyle
+skipsdist = True
+
+[testenv:pep8]
+deps =
+    flake8
+basepython = python3
+commands =
+    flake8 {posargs}
+
+[testenv:docstyle]
+deps =
+    pydocstyle
+basepython = python3
+commands =
+    pydocstyle --verbose {posargs}
+
+[flake8]
+# W504 skipped because it is overeager and unnecessary
+ignore = W504
+show-source = True
+exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
+import-order-style = pep8
+max-line-length = 80
+
+
+[pydocstyle]
+# D104 Missing docstring in public package
+# D203 1 blank line required before class docstring
+# D213 Multi-line docstring summary should start at the second line
+# D214 Section is over-indented
+# D215 Section underline is over-indented
+# D401 First line should be in imperative mood; try rephrasing
+# D405 Section name should be properly capitalized
+# D406 Section name should end with a newline
+# D407 Missing dashed underline after section
+# D408 Section underline should be in the line following the section’s name
+# D409 Section underline should match the length of its name
+# D410 Missing blank line after section
+# D411 Missing blank line before section
+ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
+match-dir = ^(?!\.tox|venv).*
+match = ^(?!setup).*\.py
\ No newline at end of file
-- 
GitLab


From da993cedfe4e8e25d4ae34a5cca92f8c1f484a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Fri, 31 Jul 2020 16:49:35 +0200
Subject: [PATCH 30/35] Successfully added spaCy to speller. First version that
 works on tester.

---
 Dockerfile                                    |  28 ++-
 docker-compose.yml                            |   6 +-
 entrypoint.sh                                 |   5 +
 .../main/java/pl/clarin/speller/SpaCy.java    | 191 +++++++++---------
 .../main/java/pl/clarin/speller/Speller.java  |  26 ++-
 .../main/java/pl/clarin/speller/TextEdit.java |  30 ++-
 requirements.txt                              |   3 +
 7 files changed, 147 insertions(+), 142 deletions(-)
 create mode 100644 entrypoint.sh
 create mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
index ab231c7..b2aaee2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM clarinpl/openjdk:8 as builder
+FROM clarinpl/openjdk:8
 
 LABEL application="Speller"
 LABEL description="Client - Workers - correcting mistakes in sentances in txt files"
@@ -15,16 +15,24 @@ RUN cd module && \
     mvn clean && \
     mvn install
 
-FROM clarinpl/python:3.6
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y python3.5 && \
+    apt-get install -y python3-pip
+
+ENV LD_LIBRARY_PATH=/usr/local/lib/python3.5/dist-packages/jep/
 
 WORKDIR /home/worker
+COPY ./SpacyDocConvert.py .
+COPY ./requirements.txt .
+COPY ./entrypoint.sh ./entrypoint.sh
 
-RUN python3.6 -m pip install jep
-RUN python3.6 -m pip install spacy
-RUN python3.6 -m spacy download pl_core_news_lg
-	
-FROM clarinpl/openjdk-jre:8
+RUN python3 -m pip install --upgrade pip && \
+	python3 -m pip install -r requirements.txt
 
-WORKDIR /home/worker
-COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar .
-CMD ["java", "-jar", "nlp.worker.speller-1.0-SNAPSHOT.jar"]
\ No newline at end of file
+RUN ["chmod", "+x", "./entrypoint.sh"]
+
+RUN cp /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar ./nlp.worker.speller-1.0-SNAPSHOT.jar
+CMD ["./entrypoint.sh"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index f0a01ae..06c895f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,7 +8,7 @@ services:
       - './config.ini:/home/worker/config.ini'
     working_dir: /home/worker/
     entrypoint:
-      - java
-      - '-jar'
-      - nlp.worker.speller-1.0-SNAPSHOT.jar
+      - ./entrypoint.sh
+    environment:
+      - PYTHONUNBUFFERED=0
     restart: always
diff --git a/entrypoint.sh b/entrypoint.sh
new file mode 100644
index 0000000..d864430
--- /dev/null
+++ b/entrypoint.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+cd /home/worker
+echo "Downloading model"
+python3 -m spacy download "pl_core_news_lg"
+java -jar nlp.worker.speller-1.0-SNAPSHOT.jar
\ No newline at end of file
diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java
index dccf91b..b14de22 100644
--- a/module/src/main/java/pl/clarin/speller/SpaCy.java
+++ b/module/src/main/java/pl/clarin/speller/SpaCy.java
@@ -1,119 +1,108 @@
 package pl.clarin.speller;
 
-import jep.SharedInterpreter;
-import jep.JepException;
-import org.languagetool.JLanguageTool;
-
-import java.lang.Integer;
 import java.util.ArrayList;
 import java.util.List;
+import jep.JepException;
+import jep.SharedInterpreter;
 
 public class SpaCy {
-    boolean loaded = false;
-    boolean processed = false;
-    SharedInterpreter interp = null;
-    ArrayList<List<Long>> properNounArray = null;
-    ArrayList<List<Long>> foreignSentenceArray = null;
-    {
-        try{
-            interp = new SharedInterpreter();
-            interp.exec("import sys");
-            interp.exec("sys.path.append(r'/home/worker')");
-            interp.exec("import SpacyDocConvert");
-            interp.exec("sys.argv=[]");
-            interp.exec("import spacy");
-            interp.exec("from spacy_langdetect import LanguageDetector");
-            interp.exec("model = pl_core_news_lg");
-            interp.exec("nlp = spacy.load(model)");
-            interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)");
-            loaded = true;
-        }
-        catch (JepException e) {
-            System.out.println("An error occurred: " + e.getMessage());
-        }
-    }
+  /**Class that handles spacy processing.*/
+  boolean loaded = false;
+  boolean processed = false;
+  SharedInterpreter interp = null;
+  ArrayList<List<Long>> properNounArray = null;
+  ArrayList<List<Long>> foreignSentenceArray = null;
 
-    public boolean isLoaded() {
-        return loaded;
+  {
+    try {
+      interp = new SharedInterpreter();
+      interp.exec("import sys");
+      interp.exec("sys.path.append(r'/home/worker')");
+      interp.exec("import SpacyDocConvert");
+      interp.exec("sys.argv=[]");
+      interp.exec("import spacy");
+      interp.exec("from spacy_langdetect import LanguageDetector");
+      interp.exec("model = 'pl_core_news_lg'");
+      interp.exec("nlp = spacy.load(model)");
+      interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)");
+      loaded = true;
+    } catch (JepException e) {
+      System.out.println("An error occurred: " + e.getMessage());
     }
+  }
 
-    public boolean isProcessed() {
-        return processed;
-    }
+  public boolean isLoaded() {
+    return loaded;
+  }
 
-    public void process(String inputString) throws Exception {
-        processed = false;
-        if(loaded){
-            try {
-                interp.exec("sentence = " + inputString);
-                interp.exec("spacyDoc = nlp(sentence)");
-                Object properNounList = interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)");
-                properNounArray = ((ArrayList<List<Long>>) properNounList);
-                Object foreignSentList = interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)");
-                foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList);
-                processed = true;
-            }
-            catch (JepException e) {
-                System.out.println("An error occurred: " + e.getMessage());
-            }
-        }
+  public boolean isProcessed() {
+    return processed;
+  }
+
+  /**Javadoc.*/
+  public void process(String inputString) throws Exception {
+    processed = false;
+    if (loaded) {
+      try {
+        interp.exec("sentence = r'" + inputString.replace("'", "\\'") + "'");
+        interp.exec("spacyDoc = nlp(sentence)");
+        Object properNounList =
+            interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)");
+        properNounArray = ((ArrayList<List<Long>>) properNounList);
+        Object foreignSentList =
+            interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)");
+        foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList);
+        processed = true;
+      } catch (JepException e) {
+        System.out.println("An error occurred: " + e.getMessage());
+      }
     }
+  }
 
-    public boolean isForeignSentence(String inputString, int matchFrom) {
-        boolean isForeginSent = false;
-        if(processed) {
-            for (List<Long> tuple : foreignSentenceArray){
-                if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){
-                    if(Character.isUpperCase(inputString.charAt(matchFrom))){
-                        return true;
-                    }else{
-                        return false;
-                    }
-                }
-                if(matchFrom > tuple.get(1).intValue()){
-                    return false;
-                }
-            }
+  /**Javadoc.*/
+  public boolean isForeignSentence(String inputString, int matchFrom) {
+    boolean isForeginSent = false;
+    if (processed) {
+      for (List<Long> tuple : foreignSentenceArray) {
+        if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) {
+          return Character.isUpperCase(inputString.charAt(matchFrom));
+        }
+        if (matchFrom > tuple.get(1).intValue()) {
+          return false;
         }
-        return isForeginSent;
+      }
     }
+    return isForeginSent;
+  }
 
-    public boolean isProperNoun(String inputString, int matchFrom) {
-        boolean isProperNoun = false;
-        if(!processed) {
-            if (Character.isUpperCase(inputString.charAt(matchFrom))) {
-                isProperNoun = false;
-                int i = matchFrom - 1;
-                while (i > 0) {
-                    if (Character.isLetterOrDigit(inputString.charAt(i))) {
-                        isProperNoun = true;
-                        break;
-                    } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
-                        break;
-                    }
-                    --i;
-                }
-            }
-            return isProperNoun;
-        }else{
-            for (List<Long> tuple : properNounArray){
-                if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){
-                    if(Character.isUpperCase(inputString.charAt(matchFrom))){
-                        return true;
-                    }else{
-                        return false;
-                    }
-                }
-                if(matchFrom > tuple.get(1).intValue()){
-                    return false;
-                }
-            }
-            return isProperNoun;
+  /**Javadoc.*/
+  public boolean isProperNoun(String inputString, int matchFrom) {
+    boolean isProperNoun = false;
+    if (!processed) {
+      if (Character.isUpperCase(inputString.charAt(matchFrom))) {
+        isProperNoun = false;
+        int i = matchFrom - 1;
+        while (i > 0) {
+          if (Character.isLetterOrDigit(inputString.charAt(i))) {
+            isProperNoun = true;
+            break;
+          } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
+            break;
+          }
+          --i;
+        }
+      }
+      return isProperNoun;
+    } else {
+      for (List<Long> tuple : properNounArray) {
+        if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) {
+          return Character.isUpperCase(inputString.charAt(matchFrom));
         }
+        if (matchFrom > tuple.get(1).intValue()) {
+          return false;
+        }
+      }
+      return isProperNoun;
     }
+  }
 }
-
-
-
-
-
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 1296761..5b396a7 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -2,10 +2,13 @@ package pl.clarin.speller;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 import org.json.JSONObject;
@@ -39,13 +42,16 @@ public class Speller extends Worker {
 
   @Override
   public void process(String fileIn, String fileOut, JSONObject param) {
-    File file = new File(fileIn);
     try {
-      FileWriter fileWriter = new FileWriter(fileOut);
+      FileInputStream fstream = new FileInputStream(fileIn);
+
+      Writer out = new BufferedWriter(new OutputStreamWriter(
+              new FileOutputStream(fileOut), StandardCharsets.UTF_8));
 
       StringBuilder sb = new StringBuilder();
 
-      try (BufferedReader br = new BufferedReader(new FileReader(file))) {
+      try (BufferedReader br = new BufferedReader(
+              new InputStreamReader(fstream, StandardCharsets.UTF_8))) {
         String line = null;
         while ((line = br.readLine()) != null) {
           try {
@@ -58,12 +64,12 @@ public class Speller extends Worker {
         }
       }
 
-      try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
-        bufferedWriter.write(sb.toString());
+      try {
+        out.write(sb.toString());
+      } finally {
+        out.close();
       }
 
-      fileWriter.close();
-
     } catch (IOException exception) {
       Logger.getLogger(Speller.class.getName())
           .log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index a42c18b..9239c73 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -1,6 +1,7 @@
 package pl.clarin.speller;
 
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import org.languagetool.JLanguageTool;
 import org.languagetool.rules.RuleMatch;
@@ -8,11 +9,11 @@ import org.languagetool.rules.RuleMatch;
 public class TextEdit {
   SpaCy spacy = new SpaCy();
   /**Class that corrects input text.*/
+
   public String edit(String inputString, JLanguageTool langTool) throws Exception {
     try {
       spacy.process(inputString);
-    }
-    catch(Exception e){
+    } catch (Exception e) {
       System.out.println("Spacy did not process the input correctly! : " + e.getMessage());
     }
     char[] buffer = inputString.toCharArray();
@@ -47,17 +48,13 @@ public class TextEdit {
   }
 
   private boolean toChange(String inputString, int matchFrom, int matchTo) {
-    if (isForeignSentence(inputString,matchFrom)
-            || isProperNoun(inputString,matchFrom)
-            || isAcronym(inputString,matchFrom,matchTo)
-            || isFileOrExtension(inputString,matchFrom,matchTo)
-            || checkFirstLetter(inputString,matchFrom)
-            || isNotPolish(inputString, matchFrom, matchTo)
-            || isSurname(inputString, matchFrom, matchTo)) {
-      return false;
-    } else {
-      return true;
-    }
+    return !isForeignSentence(inputString, matchFrom)
+            && !isProperNoun(inputString, matchFrom)
+            && !isAcronym(inputString, matchFrom, matchTo)
+            && !isFileOrExtension(inputString, matchFrom, matchTo)
+            && !checkFirstLetter(inputString, matchFrom)
+            && !isNotPolish(inputString, matchFrom, matchTo)
+            && !isSurname(inputString, matchFrom, matchTo);
   }
 
   private boolean isSurname(String inputString, int matchFrom, int matchTo) {
@@ -129,7 +126,7 @@ public class TextEdit {
   }
 
   public boolean isAscii(Character v) {
-    return Charset.forName("US-ASCII").newEncoder().canEncode(v);
+    return StandardCharsets.US_ASCII.newEncoder().canEncode(v);
   }
 
   private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) {
@@ -195,10 +192,7 @@ public class TextEdit {
 
   private boolean checkFirstLetter(String inputString, int at) {
     String leaveCharacters = "§";
-    if (leaveCharacters.indexOf(inputString.charAt(at)) != -1) {
-      return true;
-    }
-    return false;
+    return leaveCharacters.indexOf(inputString.charAt(at)) != -1;
   }
 
 }
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1c9e461
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+jep
+spacy
+spacy-langdetect
\ No newline at end of file
-- 
GitLab


From ea02135d2171a95a8c697f298338cd34ef0648bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Mon, 3 Aug 2020 08:38:25 +0200
Subject: [PATCH 31/35] Changed .gitlab-ci.yml

---
 .gitlab-ci.yml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9b653ad..f117f7f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,32 +1,33 @@
-image: 'clarinpl/python:3.6'
 stages:
   - check_style
+  - build
 cache:
   paths:
     - .tox
-before_script:
-  - pip install tox==2.9.1
 pep8:
+  image: 'clarinpl/python:3.6'
+  before_script:
+    - pip install tox==2.9.1
   stage: check_style
   script:
     - tox -v -e pep8
 docstyle:
+  image: 'clarinpl/python:3.6'
+  before_script:
+    - pip install tox==2.9.1
   stage: check_style
   script:
     - tox -v -e docstyle
 
-image: clarinpl/openjdk:8
-stages:
-  - check_style
-  - build
-
 google_checks:
+  image: clarinpl/openjdk:8
   stage: check_style
   script:
     - cd module
     - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true
   
 build_image:
+  image: clarinpl/openjdk:8
   stage: build
   image: 'docker:18.09.7'
   only:
-- 
GitLab


From 484901e67d73e8307928baf94bb46ae0f96a4cbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Tue, 4 Aug 2020 15:04:29 +0200
Subject: [PATCH 32/35] Changed speller to take input piped from spacy service
 instead of running its own spaCy.

---
 .gitlab-ci.yml                                |  22 +---
 Dockerfile                                    |  23 +---
 SpacyDocConvert.py                            |  33 ------
 docker-compose.yml                            |   2 -
 entrypoint.sh                                 |   2 -
 module/pom.xml                                |   5 -
 .../main/java/pl/clarin/speller/SpaCy.java    | 112 +++++++++---------
 .../main/java/pl/clarin/speller/Speller.java  |  32 ++++-
 .../main/java/pl/clarin/speller/TextEdit.java |   6 -
 tox.ini                                       |  44 -------
 10 files changed, 90 insertions(+), 191 deletions(-)
 delete mode 100644 SpacyDocConvert.py
 delete mode 100644 tox.ini

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f117f7f..71b6d71 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,33 +1,13 @@
+image: clarinpl/openjdk:8
 stages:
   - check_style
   - build
-cache:
-  paths:
-    - .tox
-pep8:
-  image: 'clarinpl/python:3.6'
-  before_script:
-    - pip install tox==2.9.1
-  stage: check_style
-  script:
-    - tox -v -e pep8
-docstyle:
-  image: 'clarinpl/python:3.6'
-  before_script:
-    - pip install tox==2.9.1
-  stage: check_style
-  script:
-    - tox -v -e docstyle
-
 google_checks:
-  image: clarinpl/openjdk:8
   stage: check_style
   script:
     - cd module
     - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true
-  
 build_image:
-  image: clarinpl/openjdk:8
   stage: build
   image: 'docker:18.09.7'
   only:
diff --git a/Dockerfile b/Dockerfile
index b2aaee2..ed8e888 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM clarinpl/openjdk:8
+FROM clarinpl/openjdk:8 as builder
 
 LABEL application="Speller"
 LABEL description="Client - Workers - correcting mistakes in sentances in txt files"
@@ -15,24 +15,9 @@ RUN cd module && \
     mvn clean && \
     mvn install
 
-RUN apt-get update && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get update && \
-    apt-get install -y python3.5 && \
-    apt-get install -y python3-pip
-
-ENV LD_LIBRARY_PATH=/usr/local/lib/python3.5/dist-packages/jep/
-
+FROM clarinpl/openjdk-jre:8
 WORKDIR /home/worker
-COPY ./SpacyDocConvert.py .
-COPY ./requirements.txt .
-COPY ./entrypoint.sh ./entrypoint.sh
-
-RUN python3 -m pip install --upgrade pip && \
-	python3 -m pip install -r requirements.txt
-
+COPY ./entrypoint.sh  ./entrypoint.sh
 RUN ["chmod", "+x", "./entrypoint.sh"]
-
-RUN cp /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar ./nlp.worker.speller-1.0-SNAPSHOT.jar
+COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar .
 CMD ["./entrypoint.sh"]
\ No newline at end of file
diff --git a/SpacyDocConvert.py b/SpacyDocConvert.py
deleted file mode 100644
index 5987e4c..0000000
--- a/SpacyDocConvert.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""Utility file for using spacyDoc."""
-
-
-def find_proper_nouns(spacyDoc, sentence):
-    """Function returns indices of words that are proper nouns.
-
-    :param spacyDoc: SpacyDoc with ner tags.
-    :param sentence: Text spacyDoc is made from.
-    :return:list with proper nouns indices
-    """
-    idx = 0
-    proper_nouns_list = []
-    for tok in spacyDoc:
-        idx = sentence.find(tok.text, idx)
-        if tok.ent_type_ != '':
-            proper_nouns_list.append((idx, idx + len(tok.text)))
-    return proper_nouns_list
-
-
-def find_foreign_sentences(spacyDoc, text):
-    """Function returns indices of sentences that are not in Polish.
-
-    :param spacyDoc: SpacyDoc with ner tags.
-    :param text: Text spacyDoc is made from.
-    :return: list with foreign sentences indices
-    """
-    idx = 0
-    foreign_sentences_list = []
-    for sent in spacyDoc.sents:
-        if sent._.language.get('language', 'no') != 'pl':
-            idx = text.find(sent.text, idx)
-            foreign_sentences_list.append((idx, idx + len(sent.text)))
-    return foreign_sentences_list
diff --git a/docker-compose.yml b/docker-compose.yml
index 06c895f..c28c48b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,6 +9,4 @@ services:
     working_dir: /home/worker/
     entrypoint:
       - ./entrypoint.sh
-    environment:
-      - PYTHONUNBUFFERED=0
     restart: always
diff --git a/entrypoint.sh b/entrypoint.sh
index d864430..cefa313 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,5 +1,3 @@
 #!/bin/sh
 cd /home/worker
-echo "Downloading model"
-python3 -m spacy download "pl_core_news_lg"
 java -jar nlp.worker.speller-1.0-SNAPSHOT.jar
\ No newline at end of file
diff --git a/module/pom.xml b/module/pom.xml
index 2522945..62d8f94 100644
--- a/module/pom.xml
+++ b/module/pom.xml
@@ -30,11 +30,6 @@
             <artifactId>json</artifactId>
             <version>20141113</version>
         </dependency>
-        <dependency>
-            <groupId>black.ninia</groupId>
-            <artifactId>jep</artifactId>
-            <version>3.9.0</version>
-        </dependency>
     </dependencies>
 
     <build>
diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java
index b14de22..0fdc61a 100644
--- a/module/src/main/java/pl/clarin/speller/SpaCy.java
+++ b/module/src/main/java/pl/clarin/speller/SpaCy.java
@@ -1,73 +1,79 @@
 package pl.clarin.speller;
 
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.util.ArrayList;
+import java.util.Enumeration;
 import java.util.List;
-import jep.JepException;
-import jep.SharedInterpreter;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
 
 public class SpaCy {
-  /**Class that handles spacy processing.*/
+  /**Class that handles spacy input.*/
   boolean loaded = false;
-  boolean processed = false;
-  SharedInterpreter interp = null;
-  ArrayList<List<Long>> properNounArray = null;
-  ArrayList<List<Long>> foreignSentenceArray = null;
-
-  {
-    try {
-      interp = new SharedInterpreter();
-      interp.exec("import sys");
-      interp.exec("sys.path.append(r'/home/worker')");
-      interp.exec("import SpacyDocConvert");
-      interp.exec("sys.argv=[]");
-      interp.exec("import spacy");
-      interp.exec("from spacy_langdetect import LanguageDetector");
-      interp.exec("model = 'pl_core_news_lg'");
-      interp.exec("nlp = spacy.load(model)");
-      interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)");
-      loaded = true;
-    } catch (JepException e) {
-      System.out.println("An error occurred: " + e.getMessage());
-    }
-  }
+  ArrayList<ArrayList<Integer>> properNounArray = null;
+  ArrayList<ArrayList<Integer>> foreignSentenceArray = null;
 
   public boolean isLoaded() {
     return loaded;
   }
 
-  public boolean isProcessed() {
-    return processed;
+  /**Loading spaCy input files.*/
+  public InputStream load(ZipFile zipFile) throws IOException {
+    InputStream inputText = null;
+    for (Enumeration<? extends ZipEntry> entries = zipFile.entries(); entries.hasMoreElements();) {
+      ZipEntry entry = entries.nextElement();
+      if (!entry.isDirectory()) {
+        if (entry.getName().equals("text.txt")) {
+          inputText = zipFile.getInputStream(entry);
+        } else if (entry.getName().equals("proper_nouns.txt")) {
+          InputStream in = zipFile.getInputStream(entry);
+          properNounArray = processSpacyFiles(in);
+          in.close();
+        } else if (entry.getName().equals("foreign_sentences.txt")) {
+          InputStream in = zipFile.getInputStream(entry);
+          foreignSentenceArray = processSpacyFiles(in);
+          in.close();
+        } else {
+          System.out.println("Zip from spaCy contains unexpected files!");
+        }
+      } else {
+        System.out.println("Zip from spaCy contains unexpected directories!");
+      }
+    }
+    loaded = true;
+    return inputText;
   }
 
-  /**Javadoc.*/
-  public void process(String inputString) throws Exception {
-    processed = false;
-    if (loaded) {
-      try {
-        interp.exec("sentence = r'" + inputString.replace("'", "\\'") + "'");
-        interp.exec("spacyDoc = nlp(sentence)");
-        Object properNounList =
-            interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)");
-        properNounArray = ((ArrayList<List<Long>>) properNounList);
-        Object foreignSentList =
-            interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)");
-        foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList);
-        processed = true;
-      } catch (JepException e) {
-        System.out.println("An error occurred: " + e.getMessage());
+  private static ArrayList<ArrayList<Integer>> processSpacyFiles(InputStream in)  {
+    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+    String line;
+    ArrayList<ArrayList<Integer>> array = new ArrayList<>();
+    try {
+      while ((line = reader.readLine()) != null) {
+        String[] str = line.split(" ");
+        ArrayList<Integer> list = new ArrayList<>();
+        list.add(Integer.parseInt(str[0]));
+        list.add(Integer.parseInt(str[1]));
+        array.add(list);
       }
+    } catch (IOException e) {
+      System.out.println("The text file contains incorrect data." + e.getMessage());
     }
+    return array;
   }
 
   /**Javadoc.*/
   public boolean isForeignSentence(String inputString, int matchFrom) {
     boolean isForeginSent = false;
-    if (processed) {
-      for (List<Long> tuple : foreignSentenceArray) {
-        if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) {
+    if (loaded) {
+      for (List<Integer> tuple : foreignSentenceArray) {
+        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
           return Character.isUpperCase(inputString.charAt(matchFrom));
         }
-        if (matchFrom > tuple.get(1).intValue()) {
+        if (matchFrom > tuple.get(1)) {
           return false;
         }
       }
@@ -78,9 +84,8 @@ public class SpaCy {
   /**Javadoc.*/
   public boolean isProperNoun(String inputString, int matchFrom) {
     boolean isProperNoun = false;
-    if (!processed) {
+    if (!loaded) {
       if (Character.isUpperCase(inputString.charAt(matchFrom))) {
-        isProperNoun = false;
         int i = matchFrom - 1;
         while (i > 0) {
           if (Character.isLetterOrDigit(inputString.charAt(i))) {
@@ -92,17 +97,16 @@ public class SpaCy {
           --i;
         }
       }
-      return isProperNoun;
     } else {
-      for (List<Long> tuple : properNounArray) {
-        if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) {
+      for (List<Integer> tuple : properNounArray) {
+        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
           return Character.isUpperCase(inputString.charAt(matchFrom));
         }
-        if (matchFrom > tuple.get(1).intValue()) {
+        if (matchFrom > tuple.get(1)) {
           return false;
         }
       }
-      return isProperNoun;
     }
+    return isProperNoun;
   }
 }
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 5b396a7..9a7b2d8 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -11,6 +11,7 @@ import java.io.Writer;
 import java.nio.charset.StandardCharsets;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.zip.ZipFile;
 import org.json.JSONObject;
 import org.languagetool.JLanguageTool;
 import org.languagetool.language.Polish;
@@ -41,17 +42,34 @@ public class Speller extends Worker {
   }
 
   @Override
-  public void process(String fileIn, String fileOut, JSONObject param) {
+  public void process(String fileIn, String fileOut, JSONObject options) {
+    String inputformat = "text";
     try {
-      FileInputStream fstream = new FileInputStream(fileIn);
+      if (options.has("format")) {
+        inputformat = options.getString("format");
+      }
+      InputStreamReader reader = null;
+      ZipFile zipFile = null;
+      if (inputformat.equals("spacy")) {
+        try {
+          zipFile = new ZipFile(fileIn);
+          reader = new InputStreamReader(textEditor.spacy.load(zipFile), StandardCharsets.UTF_8);
+        } catch (IOException e) {
+          System.out.println("Problems reading zip file!" + e.getStackTrace());
+          throw e;
+        }
+      } else {
+        FileInputStream fstream = new FileInputStream(fileIn);
+        reader = new InputStreamReader(fstream, StandardCharsets.UTF_8);
+      }
+
 
       Writer out = new BufferedWriter(new OutputStreamWriter(
               new FileOutputStream(fileOut), StandardCharsets.UTF_8));
 
       StringBuilder sb = new StringBuilder();
 
-      try (BufferedReader br = new BufferedReader(
-              new InputStreamReader(fstream, StandardCharsets.UTF_8))) {
+      try (BufferedReader br = new BufferedReader(reader)) {
         String line = null;
         while ((line = br.readLine()) != null) {
           try {
@@ -62,6 +80,10 @@ public class Speller extends Worker {
                 .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception);
           }
         }
+        reader.close();
+        if (inputformat.equals("spacy")) {
+          zipFile.close();
+        }
       }
 
       try {
@@ -72,7 +94,7 @@ public class Speller extends Worker {
 
     } catch (IOException exception) {
       Logger.getLogger(Speller.class.getName())
-          .log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
+          .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception);
     }
   }
 }
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 9239c73..66de742 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -1,6 +1,5 @@
 package pl.clarin.speller;
 
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
 import org.languagetool.JLanguageTool;
@@ -11,11 +10,6 @@ public class TextEdit {
   /**Class that corrects input text.*/
 
   public String edit(String inputString, JLanguageTool langTool) throws Exception {
-    try {
-      spacy.process(inputString);
-    } catch (Exception e) {
-      System.out.println("Spacy did not process the input correctly! : " + e.getMessage());
-    }
     char[] buffer = inputString.toCharArray();
     StringBuilder sb = new StringBuilder();
     List<RuleMatch> matches = langTool.check(inputString);
diff --git a/tox.ini b/tox.ini
deleted file mode 100644
index 819e612..0000000
--- a/tox.ini
+++ /dev/null
@@ -1,44 +0,0 @@
-[tox]
-envlist = pep8,docstyle
-skipsdist = True
-
-[testenv:pep8]
-deps =
-    flake8
-basepython = python3
-commands =
-    flake8 {posargs}
-
-[testenv:docstyle]
-deps =
-    pydocstyle
-basepython = python3
-commands =
-    pydocstyle --verbose {posargs}
-
-[flake8]
-# W504 skipped because it is overeager and unnecessary
-ignore = W504
-show-source = True
-exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
-import-order-style = pep8
-max-line-length = 80
-
-
-[pydocstyle]
-# D104 Missing docstring in public package
-# D203 1 blank line required before class docstring
-# D213 Multi-line docstring summary should start at the second line
-# D214 Section is over-indented
-# D215 Section underline is over-indented
-# D401 First line should be in imperative mood; try rephrasing
-# D405 Section name should be properly capitalized
-# D406 Section name should end with a newline
-# D407 Missing dashed underline after section
-# D408 Section underline should be in the line following the section’s name
-# D409 Section underline should match the length of its name
-# D410 Missing blank line after section
-# D411 Missing blank line before section
-ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
-match-dir = ^(?!\.tox|venv).*
-match = ^(?!setup).*\.py
\ No newline at end of file
-- 
GitLab


From 9bbe45f9f8619e584140063e192349c30b50425c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl>
Date: Wed, 5 Aug 2020 11:47:14 +0200
Subject: [PATCH 33/35] Added unloading spacy. Changed temporary JavaDocs.

---
 module/src/main/java/pl/clarin/speller/SpaCy.java   | 12 ++++++++++--
 module/src/main/java/pl/clarin/speller/Speller.java |  4 ++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java
index 0fdc61a..b4b92cd 100644
--- a/module/src/main/java/pl/clarin/speller/SpaCy.java
+++ b/module/src/main/java/pl/clarin/speller/SpaCy.java
@@ -16,10 +16,18 @@ public class SpaCy {
   ArrayList<ArrayList<Integer>> properNounArray = null;
   ArrayList<ArrayList<Integer>> foreignSentenceArray = null;
 
+  /**Checks if spaCy correctly loaded input.*/
   public boolean isLoaded() {
     return loaded;
   }
 
+  /**Unloading input from file.*/
+  public void unload() {
+    properNounArray = null;
+    foreignSentenceArray = null;
+    loaded = false;
+  }
+
   /**Loading spaCy input files.*/
   public InputStream load(ZipFile zipFile) throws IOException {
     InputStream inputText = null;
@@ -65,7 +73,7 @@ public class SpaCy {
     return array;
   }
 
-  /**Javadoc.*/
+  /**Checks if input sentence is from a different language.*/
   public boolean isForeignSentence(String inputString, int matchFrom) {
     boolean isForeginSent = false;
     if (loaded) {
@@ -81,7 +89,7 @@ public class SpaCy {
     return isForeginSent;
   }
 
-  /**Javadoc.*/
+  /**Checks if input sentence is from a proper noun.*/
   public boolean isProperNoun(String inputString, int matchFrom) {
     boolean isProperNoun = false;
     if (!loaded) {
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 9a7b2d8..39d0e54 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -83,6 +83,7 @@ public class Speller extends Worker {
         reader.close();
         if (inputformat.equals("spacy")) {
           zipFile.close();
+          textEditor.spacy.unload();
         }
       }
 
@@ -96,5 +97,8 @@ public class Speller extends Worker {
       Logger.getLogger(Speller.class.getName())
           .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception);
     }
+    if (inputformat.equals("spacy")) {
+      textEditor.spacy.unload();
+    }
   }
 }
-- 
GitLab


From d365a5a7148fe38c575bf780e2cd9da778a7304c Mon Sep 17 00:00:00 2001
From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com>
Date: Wed, 26 Aug 2020 13:35:25 +0200
Subject: [PATCH 34/35] Added access modifiers to some variables; maybe it will
 fix the internal error (no idea).

---
 .gitlab-ci.yml                                |  48 +--
 Dockerfile                                    |  44 +-
 config.ini                                    |  22 +-
 docker-compose.yml                            |  24 +-
 .../main/java/pl/clarin/speller/SpaCy.java    | 240 +++++------
 .../main/java/pl/clarin/speller/Speller.java  | 204 +++++-----
 .../main/java/pl/clarin/speller/TextEdit.java | 384 +++++++++---------
 requirements.txt                              |   4 +-
 8 files changed, 483 insertions(+), 487 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 71b6d71..1b993af 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,24 +1,24 @@
-image: clarinpl/openjdk:8
-stages:
-  - check_style
-  - build
-google_checks:
-  stage: check_style
-  script:
-    - cd module
-    - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true
-build_image:
-  stage: build
-  image: 'docker:18.09.7'
-  only:
-    - master
-  services:
-    - 'docker:18.09.7-dind'
-  before_script:
-    - ''
-  script:
-    - docker build -t clarinpl/speller .
-    - echo $DOCKER_PASSWORD > pass.txt
-    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
-    - rm pass.txt
-    - docker push clarinpl/speller
+image: clarinpl/openjdk:8
+stages:
+  - check_style
+  - build
+google_checks:
+  stage: check_style
+  script:
+    - cd module
+    - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true
+build_image:
+  stage: build
+  image: 'docker:18.09.7'
+  only:
+    - master
+  services:
+    - 'docker:18.09.7-dind'
+  before_script:
+    - ''
+  script:
+    - docker build -t clarinpl/speller .
+    - echo $DOCKER_PASSWORD > pass.txt
+    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
+    - rm pass.txt
+    - docker push clarinpl/speller
diff --git a/Dockerfile b/Dockerfile
index ed8e888..ee7dcfe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,23 +1,23 @@
-FROM clarinpl/openjdk:8 as builder
-
-LABEL application="Speller"
-LABEL description="Client - Workers - correcting mistakes in sentances in txt files"
-LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology"
-
-WORKDIR /home/install
-RUN cd nlp.worker && \
-    mvn clean && \
-    mvn install
-
-WORKDIR /home/install
-COPY ./module ./module
-RUN cd module && \
-    mvn clean && \
-    mvn install
-
-FROM clarinpl/openjdk-jre:8
-WORKDIR /home/worker
-COPY ./entrypoint.sh  ./entrypoint.sh
-RUN ["chmod", "+x", "./entrypoint.sh"]
-COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar .
+FROM clarinpl/openjdk:8 as builder
+
+LABEL application="Speller"
+LABEL description="Client - Workers - correcting mistakes in sentances in txt files"
+LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology"
+
+WORKDIR /home/install
+RUN cd nlp.worker && \
+    mvn clean && \
+    mvn install
+
+WORKDIR /home/install
+COPY ./module ./module
+RUN cd module && \
+    mvn clean && \
+    mvn install
+
+FROM clarinpl/openjdk-jre:8
+WORKDIR /home/worker
+COPY ./entrypoint.sh  ./entrypoint.sh
+RUN ["chmod", "+x", "./entrypoint.sh"]
+COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar .
 CMD ["./entrypoint.sh"]
\ No newline at end of file
diff --git a/config.ini b/config.ini
index 856bea7..45ad6bc 100644
--- a/config.ini
+++ b/config.ini
@@ -1,11 +1,11 @@
-[service]
-tool = speller
-
-root = /samba/requests/
-rabbit_host = rabbitmq
-rabbit_user = test
-rabbit_password = test
-queue_prefix = nlp_
-
-[tool]
-workers_number = 1
+[service]
+tool = speller
+
+root = /samba/requests/
+rabbit_host = rabbitmq
+rabbit_user = test
+rabbit_password = test
+queue_prefix = nlp_
+
+[tool]
+workers_number = 1
diff --git a/docker-compose.yml b/docker-compose.yml
index c28c48b..7a93952 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,12 +1,12 @@
-version: '3'
-services:
-  speller:
-    container_name: clarin_speller
-    build: ./
-    volumes:
-      - '/samba:/samba'
-      - './config.ini:/home/worker/config.ini'
-    working_dir: /home/worker/
-    entrypoint:
-      - ./entrypoint.sh
-    restart: always
+version: '3'
+services:
+  speller:
+    container_name: clarin_speller
+    build: ./
+    volumes:
+      - '/samba:/samba'
+      - './config.ini:/home/worker/config.ini'
+    working_dir: /home/worker/
+    entrypoint:
+      - ./entrypoint.sh
+    restart: always
diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java
index b4b92cd..82fa6c6 100644
--- a/module/src/main/java/pl/clarin/speller/SpaCy.java
+++ b/module/src/main/java/pl/clarin/speller/SpaCy.java
@@ -1,120 +1,120 @@
-package pl.clarin.speller;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.Enumeration;
-import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-
-public class SpaCy {
-  /**Class that handles spacy input.*/
-  boolean loaded = false;
-  ArrayList<ArrayList<Integer>> properNounArray = null;
-  ArrayList<ArrayList<Integer>> foreignSentenceArray = null;
-
-  /**Checks if spaCy correctly loaded input.*/
-  public boolean isLoaded() {
-    return loaded;
-  }
-
-  /**Unloading input from file.*/
-  public void unload() {
-    properNounArray = null;
-    foreignSentenceArray = null;
-    loaded = false;
-  }
-
-  /**Loading spaCy input files.*/
-  public InputStream load(ZipFile zipFile) throws IOException {
-    InputStream inputText = null;
-    for (Enumeration<? extends ZipEntry> entries = zipFile.entries(); entries.hasMoreElements();) {
-      ZipEntry entry = entries.nextElement();
-      if (!entry.isDirectory()) {
-        if (entry.getName().equals("text.txt")) {
-          inputText = zipFile.getInputStream(entry);
-        } else if (entry.getName().equals("proper_nouns.txt")) {
-          InputStream in = zipFile.getInputStream(entry);
-          properNounArray = processSpacyFiles(in);
-          in.close();
-        } else if (entry.getName().equals("foreign_sentences.txt")) {
-          InputStream in = zipFile.getInputStream(entry);
-          foreignSentenceArray = processSpacyFiles(in);
-          in.close();
-        } else {
-          System.out.println("Zip from spaCy contains unexpected files!");
-        }
-      } else {
-        System.out.println("Zip from spaCy contains unexpected directories!");
-      }
-    }
-    loaded = true;
-    return inputText;
-  }
-
-  private static ArrayList<ArrayList<Integer>> processSpacyFiles(InputStream in)  {
-    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
-    String line;
-    ArrayList<ArrayList<Integer>> array = new ArrayList<>();
-    try {
-      while ((line = reader.readLine()) != null) {
-        String[] str = line.split(" ");
-        ArrayList<Integer> list = new ArrayList<>();
-        list.add(Integer.parseInt(str[0]));
-        list.add(Integer.parseInt(str[1]));
-        array.add(list);
-      }
-    } catch (IOException e) {
-      System.out.println("The text file contains incorrect data." + e.getMessage());
-    }
-    return array;
-  }
-
-  /**Checks if input sentence is from a different language.*/
-  public boolean isForeignSentence(String inputString, int matchFrom) {
-    boolean isForeginSent = false;
-    if (loaded) {
-      for (List<Integer> tuple : foreignSentenceArray) {
-        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
-          return Character.isUpperCase(inputString.charAt(matchFrom));
-        }
-        if (matchFrom > tuple.get(1)) {
-          return false;
-        }
-      }
-    }
-    return isForeginSent;
-  }
-
-  /**Checks if input sentence is from a proper noun.*/
-  public boolean isProperNoun(String inputString, int matchFrom) {
-    boolean isProperNoun = false;
-    if (!loaded) {
-      if (Character.isUpperCase(inputString.charAt(matchFrom))) {
-        int i = matchFrom - 1;
-        while (i > 0) {
-          if (Character.isLetterOrDigit(inputString.charAt(i))) {
-            isProperNoun = true;
-            break;
-          } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
-            break;
-          }
-          --i;
-        }
-      }
-    } else {
-      for (List<Integer> tuple : properNounArray) {
-        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
-          return Character.isUpperCase(inputString.charAt(matchFrom));
-        }
-        if (matchFrom > tuple.get(1)) {
-          return false;
-        }
-      }
-    }
-    return isProperNoun;
-  }
-}
+package pl.clarin.speller;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+public class SpaCy {
+  /**Class that handles spacy input.*/
+  private boolean loaded = false;
+  private ArrayList<ArrayList<Integer>> properNounArray = null;
+  private ArrayList<ArrayList<Integer>> foreignSentenceArray = null;
+
+  /**Checks if spaCy correctly loaded input.*/
+  public boolean isLoaded() {
+    return loaded;
+  }
+
+  /**Unloading input from file.*/
+  public void unload() {
+    properNounArray = null;
+    foreignSentenceArray = null;
+    loaded = false;
+  }
+
+  /**Loading spaCy input files.*/
+  public InputStream load(ZipFile zipFile) throws IOException {
+    InputStream inputText = null;
+    for (Enumeration<? extends ZipEntry> entries = zipFile.entries(); entries.hasMoreElements();) {
+      ZipEntry entry = entries.nextElement();
+      if (!entry.isDirectory()) {
+        if (entry.getName().equals("text.txt")) {
+          inputText = zipFile.getInputStream(entry);
+        } else if (entry.getName().equals("proper_nouns.txt")) {
+          InputStream in = zipFile.getInputStream(entry);
+          properNounArray = processSpacyFiles(in);
+          in.close();
+        } else if (entry.getName().equals("foreign_sentences.txt")) {
+          InputStream in = zipFile.getInputStream(entry);
+          foreignSentenceArray = processSpacyFiles(in);
+          in.close();
+        } else {
+          System.out.println("Zip from spaCy contains unexpected files!");
+        }
+      } else {
+        System.out.println("Zip from spaCy contains unexpected directories!");
+      }
+    }
+    loaded = true;
+    return inputText;
+  }
+
+  private ArrayList<ArrayList<Integer>> processSpacyFiles(InputStream in)  {
+    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+    String line;
+    ArrayList<ArrayList<Integer>> array = new ArrayList<>();
+    try {
+      while ((line = reader.readLine()) != null) {
+        String[] str = line.split(" ");
+        ArrayList<Integer> list = new ArrayList<>();
+        list.add(Integer.parseInt(str[0]));
+        list.add(Integer.parseInt(str[1]));
+        array.add(list);
+      }
+    } catch (IOException e) {
+      System.out.println("The text file contains incorrect data." + e.getMessage());
+    }
+    return array;
+  }
+
+  /**Checks if input sentence is from a different language.*/
+  public boolean isForeignSentence(String inputString, int matchFrom) {
+    boolean isForeginSent = false;
+    if (loaded) {
+      for (List<Integer> tuple : foreignSentenceArray) {
+        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
+          return Character.isUpperCase(inputString.charAt(matchFrom));
+        }
+        if (matchFrom > tuple.get(1)) {
+          return false;
+        }
+      }
+    }
+    return isForeginSent;
+  }
+
+  /**Checks if input sentence is from a proper noun.*/
+  public boolean isProperNoun(String inputString, int matchFrom) {
+    boolean isProperNoun = false;
+    if (!loaded) {
+      if (Character.isUpperCase(inputString.charAt(matchFrom))) {
+        int i = matchFrom - 1;
+        while (i > 0) {
+          if (Character.isLetterOrDigit(inputString.charAt(i))) {
+            isProperNoun = true;
+            break;
+          } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) {
+            break;
+          }
+          --i;
+        }
+      }
+    } else {
+      for (List<Integer> tuple : properNounArray) {
+        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
+          return Character.isUpperCase(inputString.charAt(matchFrom));
+        }
+        if (matchFrom > tuple.get(1)) {
+          return false;
+        }
+      }
+    }
+    return isProperNoun;
+  }
+}
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 39d0e54..2e01af8 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -1,104 +1,100 @@
-package pl.clarin.speller;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import java.util.zip.ZipFile;
-import org.json.JSONObject;
-import org.languagetool.JLanguageTool;
-import org.languagetool.language.Polish;
-import pl.clarin.ws.worker.IniFile;
-import pl.clarin.ws.worker.Service;
-import pl.clarin.ws.worker.Worker;
-
-public class Speller extends Worker {
-  static Polish lang = null;
-  JLanguageTool langTool = null;
-  TextEdit textEditor = null;
-
-  public static void main(String[] args) {
-    new Service<>(Speller.class);
-  }
-
-  // init object for each thread
-  @Override
-  public void init() throws Exception {
-    langTool = new JLanguageTool(lang);
-    textEditor = new TextEdit();
-  }
-
-  // init objects shared by threads
-  @Override
-  public void static_init(IniFile init) throws Exception {
-    lang = new Polish();
-  }
-
-  @Override
-  public void process(String fileIn, String fileOut, JSONObject options) {
-    String inputformat = "text";
-    try {
-      if (options.has("format")) {
-        inputformat = options.getString("format");
-      }
-      InputStreamReader reader = null;
-      ZipFile zipFile = null;
-      if (inputformat.equals("spacy")) {
-        try {
-          zipFile = new ZipFile(fileIn);
-          reader = new InputStreamReader(textEditor.spacy.load(zipFile), StandardCharsets.UTF_8);
-        } catch (IOException e) {
-          System.out.println("Problems reading zip file!" + e.getStackTrace());
-          throw e;
-        }
-      } else {
-        FileInputStream fstream = new FileInputStream(fileIn);
-        reader = new InputStreamReader(fstream, StandardCharsets.UTF_8);
-      }
-
-
-      Writer out = new BufferedWriter(new OutputStreamWriter(
-              new FileOutputStream(fileOut), StandardCharsets.UTF_8));
-
-      StringBuilder sb = new StringBuilder();
-
-      try (BufferedReader br = new BufferedReader(reader)) {
-        String line = null;
-        while ((line = br.readLine()) != null) {
-          try {
-            String correctedLine = textEditor.edit(line, langTool);
-            sb.append(correctedLine).append('\n');
-          } catch (Exception exception) {
-            Logger.getLogger(Speller.class.getName())
-                .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception);
-          }
-        }
-        reader.close();
-        if (inputformat.equals("spacy")) {
-          zipFile.close();
-          textEditor.spacy.unload();
-        }
-      }
-
-      try {
-        out.write(sb.toString());
-      } finally {
-        out.close();
-      }
-
-    } catch (IOException exception) {
-      Logger.getLogger(Speller.class.getName())
-          .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception);
-    }
-    if (inputformat.equals("spacy")) {
-      textEditor.spacy.unload();
-    }
-  }
-}
+package pl.clarin.speller;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.zip.ZipFile;
+import org.json.JSONObject;
+import org.languagetool.JLanguageTool;
+import org.languagetool.language.Polish;
+import pl.clarin.ws.worker.IniFile;
+import pl.clarin.ws.worker.Service;
+import pl.clarin.ws.worker.Worker;
+
+public class Speller extends Worker {
+  static Polish lang = null;
+  private JLanguageTool langTool = null;
+  private TextEdit textEditor = null;
+
+  public static void main(String[] args) {
+    new Service<>(Speller.class);
+  }
+
+  // init object for each thread
+  @Override
+  public void init() throws Exception {
+    langTool = new JLanguageTool(lang);
+    textEditor = new TextEdit();
+  }
+
+  // init objects shared by threads
+  @Override
+  public void static_init(IniFile init) throws Exception {
+    lang = new Polish();
+  }
+
+  @Override
+  public void process(String fileIn, String fileOut, JSONObject options) {
+    String inputformat = "text";
+    try {
+      if (options.has("format")) {
+        inputformat = options.getString("format");
+      }
+      InputStreamReader reader = null;
+      ZipFile zipFile = null;
+      if (inputformat.equals("spacy")) {
+        try {
+          zipFile = new ZipFile(fileIn);
+          reader = new InputStreamReader(textEditor.spacy.load(zipFile), StandardCharsets.UTF_8);
+        } catch (IOException e) {
+          System.out.println("Problems reading zip file!" + e.getStackTrace());
+          throw e;
+        }
+      } else {
+        FileInputStream fstream = new FileInputStream(fileIn);
+        reader = new InputStreamReader(fstream, StandardCharsets.UTF_8);
+      }
+
+
+      Writer out = new BufferedWriter(new OutputStreamWriter(
+              new FileOutputStream(fileOut), StandardCharsets.UTF_8));
+
+      StringBuilder sb = new StringBuilder();
+
+      try (BufferedReader br = new BufferedReader(reader)) {
+        String line = null;
+        while ((line = br.readLine()) != null) {
+          try {
+            String correctedLine = textEditor.edit(line, langTool);
+            sb.append(correctedLine).append('\n');
+          } catch (Exception exception) {
+            Logger.getLogger(Speller.class.getName())
+                .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception);
+          }
+        }
+        reader.close();
+      }
+
+      try {
+        out.write(sb.toString());
+      } finally {
+        out.close();
+      }
+
+    } catch (IOException exception) {
+      Logger.getLogger(Speller.class.getName())
+          .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception);
+    }
+    if (inputformat.equals("spacy")) {
+      textEditor.spacy.unload();
+    }
+  }
+}
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 66de742..45847f0 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -1,192 +1,192 @@
-package pl.clarin.speller;
-
-import java.nio.charset.StandardCharsets;
-import java.util.List;
-import org.languagetool.JLanguageTool;
-import org.languagetool.rules.RuleMatch;
-
-public class TextEdit {
-  SpaCy spacy = new SpaCy();
-  /**Class that corrects input text.*/
-
-  public String edit(String inputString, JLanguageTool langTool) throws Exception {
-    char[] buffer = inputString.toCharArray();
-    StringBuilder sb = new StringBuilder();
-    List<RuleMatch> matches = langTool.check(inputString);
-    int stringIndex = 0;
-    for (RuleMatch match : matches) {
-      int matchFrom = match.getFromPos();
-      int matchTo = match.getToPos();
-
-      if (matchFrom > stringIndex) {
-        sb.append(buffer, stringIndex, (matchFrom - stringIndex));
-      } else if (matchFrom < stringIndex) {
-        throw new Exception("RuleMatches are not sorted for some reason.");
-      }
-      String matchingWord;
-      if (match.getSuggestedReplacements().isEmpty()) {
-        matchingWord = inputString.substring(matchFrom, matchTo);
-      } else {
-        boolean change = toChange(inputString,matchFrom,matchTo);
-        if (change) {
-          matchingWord = match.getSuggestedReplacements().get(0);
-        } else {
-          matchingWord = inputString.substring(matchFrom, matchTo);
-        }
-      }
-      sb.append(matchingWord.toCharArray(), 0, (matchingWord.length()));
-      stringIndex = matchTo;
-    }
-    sb.append(buffer, stringIndex, (buffer.length - stringIndex));
-    return sb.toString();
-  }
-
-  private boolean toChange(String inputString, int matchFrom, int matchTo) {
-    return !isForeignSentence(inputString, matchFrom)
-            && !isProperNoun(inputString, matchFrom)
-            && !isAcronym(inputString, matchFrom, matchTo)
-            && !isFileOrExtension(inputString, matchFrom, matchTo)
-            && !checkFirstLetter(inputString, matchFrom)
-            && !isNotPolish(inputString, matchFrom, matchTo)
-            && !isSurname(inputString, matchFrom, matchTo);
-  }
-
-  private boolean isSurname(String inputString, int matchFrom, int matchTo) {
-    boolean isSurname = false;
-    int i = matchFrom;
-    if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i))
-            && Character.isWhitespace(inputString.charAt(i - 1))
-            && (inputString.charAt(i - 2)) == '.'
-            && Character.isUpperCase(inputString.charAt(i - 3))) {
-      i = i - 3;
-      while (i > 0 && Character.isUpperCase(inputString.charAt(i))) {
-        --i;
-      }
-      if (i == 0 || Character.isWhitespace(inputString.charAt(i))) {
-        i = matchFrom + 1;
-        isSurname = true;
-        while (i < matchTo) {
-          if (!Character.isLowerCase(inputString.charAt(i))
-                  && !(i == (matchTo - 1) && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)) {
-            isSurname = false;
-          }
-          ++i;
-        }
-      }
-    }
-    return isSurname;
-  }
-
-  private boolean isForeignSentence(String inputString, int matchFrom) {
-    return spacy.isForeignSentence(inputString, matchFrom);
-  }
-
-  private boolean isProperNoun(String inputString, int matchFrom) {
-    return spacy.isProperNoun(inputString, matchFrom);
-  }
-
-  private boolean isAcronym(String inputString, int matchFrom, int matchTo) {
-    double breakPoint = 0.6;
-    boolean isAcronym = false;
-    int lowerCaseLetters = 0;
-    if (Character.isUpperCase(inputString.charAt(matchFrom))
-            || Character.isDigit(inputString.charAt(matchFrom))) {
-      int i = matchFrom + 1;
-      while (i < matchTo) {
-        if (Character.isLowerCase(inputString.charAt(i))) {
-          ++lowerCaseLetters;
-        }
-        ++i;
-      }
-      if (((matchTo - matchFrom) * breakPoint) >= lowerCaseLetters) {
-        isAcronym = true;
-      }
-    }
-    return isAcronym;
-  }
-
-  private boolean isNotPolish(String inputString, int matchFrom, int matchTo) {
-    String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ";
-    boolean isNotPolish = false;
-    for (int i = matchFrom; i < matchTo; ++i) {
-      if (!isAscii(inputString.charAt(i))) {
-        if (polishLetterSet.indexOf(inputString.charAt(i)) == -1) {
-          isNotPolish = true;
-          break;
-        }
-      }
-    }
-    return isNotPolish;
-  }
-
-  public boolean isAscii(Character v) {
-    return StandardCharsets.US_ASCII.newEncoder().canEncode(v);
-  }
-
-  private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) {
-    String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG"
-            + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()";
-    int extensionLength = 4;
-    boolean isFileOrExtension = false;
-    boolean dot = false;
-    int lastIndex = lastIndex(inputString,matchFrom,matchTo);
-    int firstIndex = firstIndex(inputString,matchFrom);
-    int i = firstIndex + 1;
-    while (i < lastIndex) {
-      char currentChar = inputString.charAt(i);
-      if (characterList.indexOf(currentChar) == -1) {
-        if (currentChar == '.') {
-          dot = true;
-          ++i;
-          int j = extensionLength;
-          while (i < lastIndex) {
-            currentChar = inputString.charAt(i);
-            if (characterList.indexOf(currentChar) == -1) {
-              if (currentChar == '.') {
-                j = extensionLength;
-              } else if (!(i == lastIndex - 1 && ".?!,".indexOf(currentChar) != -1)) {
-                break;
-              }
-            }
-            --j;
-            if (j < 0) {
-              break;
-            }
-            ++i;
-          }
-        }
-        break;
-      }
-      ++i;
-    }
-    if (dot && i == lastIndex) {
-      isFileOrExtension = true;
-    }
-    return isFileOrExtension;
-  }
-
-  private int lastIndex(String inputString, int matchFrom, int matchTo) {
-    int i = matchFrom;
-    while (i < inputString.length()
-            && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) {
-      ++i;
-    }
-    return i;
-  }
-
-  private int firstIndex(String inputString, int matchFrom) {
-    int i = matchFrom - 1;
-    if (i >= 0 && inputString.charAt(i) == '.') {
-      while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) {
-        --i;
-      }
-    }
-    return i;
-  }
-
-  private boolean checkFirstLetter(String inputString, int at) {
-    String leaveCharacters = "§";
-    return leaveCharacters.indexOf(inputString.charAt(at)) != -1;
-  }
-
-}
+package pl.clarin.speller;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import org.languagetool.JLanguageTool;
+import org.languagetool.rules.RuleMatch;
+
+public class TextEdit {
+  public SpaCy spacy = new SpaCy();
+  /**Class that corrects input text.*/
+
+  public String edit(String inputString, JLanguageTool langTool) throws Exception {
+    char[] buffer = inputString.toCharArray();
+    StringBuilder sb = new StringBuilder();
+    List<RuleMatch> matches = langTool.check(inputString);
+    int stringIndex = 0;
+    for (RuleMatch match : matches) {
+      int matchFrom = match.getFromPos();
+      int matchTo = match.getToPos();
+
+      if (matchFrom > stringIndex) {
+        sb.append(buffer, stringIndex, (matchFrom - stringIndex));
+      } else if (matchFrom < stringIndex) {
+        throw new Exception("RuleMatches are not sorted for some reason.");
+      }
+      String matchingWord;
+      if (match.getSuggestedReplacements().isEmpty()) {
+        matchingWord = inputString.substring(matchFrom, matchTo);
+      } else {
+        boolean change = toChange(inputString,matchFrom,matchTo);
+        if (change) {
+          matchingWord = match.getSuggestedReplacements().get(0);
+        } else {
+          matchingWord = inputString.substring(matchFrom, matchTo);
+        }
+      }
+      sb.append(matchingWord.toCharArray(), 0, (matchingWord.length()));
+      stringIndex = matchTo;
+    }
+    sb.append(buffer, stringIndex, (buffer.length - stringIndex));
+    return sb.toString();
+  }
+
+  private boolean toChange(String inputString, int matchFrom, int matchTo) {
+    return !isForeignSentence(inputString, matchFrom)
+            && !isProperNoun(inputString, matchFrom)
+            && !isAcronym(inputString, matchFrom, matchTo)
+            && !isFileOrExtension(inputString, matchFrom, matchTo)
+            && !checkFirstLetter(inputString, matchFrom)
+            && !isNotPolish(inputString, matchFrom, matchTo)
+            && !isSurname(inputString, matchFrom, matchTo);
+  }
+
+  private boolean isSurname(String inputString, int matchFrom, int matchTo) {
+    boolean isSurname = false;
+    int i = matchFrom;
+    if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i))
+            && Character.isWhitespace(inputString.charAt(i - 1))
+            && (inputString.charAt(i - 2)) == '.'
+            && Character.isUpperCase(inputString.charAt(i - 3))) {
+      i = i - 3;
+      while (i > 0 && Character.isUpperCase(inputString.charAt(i))) {
+        --i;
+      }
+      if (i == 0 || Character.isWhitespace(inputString.charAt(i))) {
+        i = matchFrom + 1;
+        isSurname = true;
+        while (i < matchTo) {
+          if (!Character.isLowerCase(inputString.charAt(i))
+                  && !(i == (matchTo - 1) && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)) {
+            isSurname = false;
+          }
+          ++i;
+        }
+      }
+    }
+    return isSurname;
+  }
+
+  private boolean isForeignSentence(String inputString, int matchFrom) {
+    return spacy.isForeignSentence(inputString, matchFrom);
+  }
+
+  private boolean isProperNoun(String inputString, int matchFrom) {
+    return spacy.isProperNoun(inputString, matchFrom);
+  }
+
+  private boolean isAcronym(String inputString, int matchFrom, int matchTo) {
+    double breakPoint = 0.6;
+    boolean isAcronym = false;
+    int lowerCaseLetters = 0;
+    if (Character.isUpperCase(inputString.charAt(matchFrom))
+            || Character.isDigit(inputString.charAt(matchFrom))) {
+      int i = matchFrom + 1;
+      while (i < matchTo) {
+        if (Character.isLowerCase(inputString.charAt(i))) {
+          ++lowerCaseLetters;
+        }
+        ++i;
+      }
+      if (((matchTo - matchFrom) * breakPoint) >= lowerCaseLetters) {
+        isAcronym = true;
+      }
+    }
+    return isAcronym;
+  }
+
+  private boolean isNotPolish(String inputString, int matchFrom, int matchTo) {
+    String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ";
+    boolean isNotPolish = false;
+    for (int i = matchFrom; i < matchTo; ++i) {
+      if (!isAscii(inputString.charAt(i))) {
+        if (polishLetterSet.indexOf(inputString.charAt(i)) == -1) {
+          isNotPolish = true;
+          break;
+        }
+      }
+    }
+    return isNotPolish;
+  }
+
+  public boolean isAscii(Character v) {
+    return StandardCharsets.US_ASCII.newEncoder().canEncode(v);
+  }
+
+  private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) {
+    String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG"
+            + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()";
+    int extensionLength = 4;
+    boolean isFileOrExtension = false;
+    boolean dot = false;
+    int lastIndex = lastIndex(inputString,matchFrom,matchTo);
+    int firstIndex = firstIndex(inputString,matchFrom);
+    int i = firstIndex + 1;
+    while (i < lastIndex) {
+      char currentChar = inputString.charAt(i);
+      if (characterList.indexOf(currentChar) == -1) {
+        if (currentChar == '.') {
+          dot = true;
+          ++i;
+          int j = extensionLength;
+          while (i < lastIndex) {
+            currentChar = inputString.charAt(i);
+            if (characterList.indexOf(currentChar) == -1) {
+              if (currentChar == '.') {
+                j = extensionLength;
+              } else if (!(i == lastIndex - 1 && ".?!,".indexOf(currentChar) != -1)) {
+                break;
+              }
+            }
+            --j;
+            if (j < 0) {
+              break;
+            }
+            ++i;
+          }
+        }
+        break;
+      }
+      ++i;
+    }
+    if (dot && i == lastIndex) {
+      isFileOrExtension = true;
+    }
+    return isFileOrExtension;
+  }
+
+  private int lastIndex(String inputString, int matchFrom, int matchTo) {
+    int i = matchFrom;
+    while (i < inputString.length()
+            && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) {
+      ++i;
+    }
+    return i;
+  }
+
+  private int firstIndex(String inputString, int matchFrom) {
+    int i = matchFrom - 1;
+    if (i >= 0 && inputString.charAt(i) == '.') {
+      while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) {
+        --i;
+      }
+    }
+    return i;
+  }
+
+  private boolean checkFirstLetter(String inputString, int at) {
+    String leaveCharacters = "§";
+    return leaveCharacters.indexOf(inputString.charAt(at)) != -1;
+  }
+
+}
diff --git a/requirements.txt b/requirements.txt
index 1c9e461..2136c37 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-jep
-spacy
+jep
+spacy
 spacy-langdetect
\ No newline at end of file
-- 
GitLab


From 1cf03999de57021e978e08ef85b32f3f30e202d6 Mon Sep 17 00:00:00 2001
From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com>
Date: Mon, 31 Aug 2020 13:36:48 +0200
Subject: [PATCH 35/35] Spacy method is now handled correctly.

---
 config.ini                                    |  2 +-
 entrypoint.sh                                 |  2 +-
 .../main/java/pl/clarin/speller/SpaCy.java    | 26 ++++++++++---------
 .../main/java/pl/clarin/speller/Speller.java  |  5 ++--
 .../main/java/pl/clarin/speller/TextEdit.java | 18 ++++++-------
 5 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/config.ini b/config.ini
index 45ad6bc..b649cb3 100644
--- a/config.ini
+++ b/config.ini
@@ -8,4 +8,4 @@ rabbit_password = test
 queue_prefix = nlp_
 
 [tool]
-workers_number = 1
+workers_number = 12
diff --git a/entrypoint.sh b/entrypoint.sh
index cefa313..a25740c 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 cd /home/worker
-java -jar nlp.worker.speller-1.0-SNAPSHOT.jar
\ No newline at end of file
+java -jar nlp.worker.speller-1.0-SNAPSHOT.jar
diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java
index 82fa6c6..4dfbc50 100644
--- a/module/src/main/java/pl/clarin/speller/SpaCy.java
+++ b/module/src/main/java/pl/clarin/speller/SpaCy.java
@@ -61,11 +61,13 @@ public class SpaCy {
     ArrayList<ArrayList<Integer>> array = new ArrayList<>();
     try {
       while ((line = reader.readLine()) != null) {
-        String[] str = line.split(" ");
-        ArrayList<Integer> list = new ArrayList<>();
-        list.add(Integer.parseInt(str[0]));
-        list.add(Integer.parseInt(str[1]));
-        array.add(list);
+        if (!line.equals("")) {
+          String[] str = line.split(" ");
+          ArrayList<Integer> list = new ArrayList<>();
+          list.add(Integer.parseInt(str[0]));
+          list.add(Integer.parseInt(str[1]));
+          array.add(list);
+        }
       }
     } catch (IOException e) {
       System.out.println("The text file contains incorrect data." + e.getMessage());
@@ -74,14 +76,14 @@ public class SpaCy {
   }
 
   /**Checks if input sentence is from a different language.*/
-  public boolean isForeignSentence(String inputString, int matchFrom) {
+  public boolean isForeignSentence(String inputString, int idx, int matchFrom) {
     boolean isForeginSent = false;
     if (loaded) {
       for (List<Integer> tuple : foreignSentenceArray) {
-        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
-          return Character.isUpperCase(inputString.charAt(matchFrom));
+        if (matchFrom >= (tuple.get(0) - idx) && matchFrom < (tuple.get(1) - idx)) {
+          return true;
         }
-        if (matchFrom > tuple.get(1)) {
+        if ((tuple.get(1) - idx) > matchFrom) {
           return false;
         }
       }
@@ -90,7 +92,7 @@ public class SpaCy {
   }
 
   /**Checks if input sentence is from a proper noun.*/
-  public boolean isProperNoun(String inputString, int matchFrom) {
+  public boolean isProperNoun(String inputString, int idx, int matchFrom) {
     boolean isProperNoun = false;
     if (!loaded) {
       if (Character.isUpperCase(inputString.charAt(matchFrom))) {
@@ -107,10 +109,10 @@ public class SpaCy {
       }
     } else {
       for (List<Integer> tuple : properNounArray) {
-        if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) {
+        if (matchFrom >= (tuple.get(0) - idx) && matchFrom < (tuple.get(1) - idx)) {
           return Character.isUpperCase(inputString.charAt(matchFrom));
         }
-        if (matchFrom > tuple.get(1)) {
+        if ((tuple.get(1) - idx) > matchFrom) {
           return false;
         }
       }
diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java
index 2e01af8..c26afdb 100644
--- a/module/src/main/java/pl/clarin/speller/Speller.java
+++ b/module/src/main/java/pl/clarin/speller/Speller.java
@@ -63,7 +63,6 @@ public class Speller extends Worker {
         reader = new InputStreamReader(fstream, StandardCharsets.UTF_8);
       }
 
-
       Writer out = new BufferedWriter(new OutputStreamWriter(
               new FileOutputStream(fileOut), StandardCharsets.UTF_8));
 
@@ -71,9 +70,11 @@ public class Speller extends Worker {
 
       try (BufferedReader br = new BufferedReader(reader)) {
         String line = null;
+        int idx = 0;
         while ((line = br.readLine()) != null) {
           try {
-            String correctedLine = textEditor.edit(line, langTool);
+            String correctedLine = textEditor.edit(line, idx, langTool);
+            idx = idx + line.length() + "\n".length();
             sb.append(correctedLine).append('\n');
           } catch (Exception exception) {
             Logger.getLogger(Speller.class.getName())
diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java
index 45847f0..fb39132 100644
--- a/module/src/main/java/pl/clarin/speller/TextEdit.java
+++ b/module/src/main/java/pl/clarin/speller/TextEdit.java
@@ -9,7 +9,7 @@ public class TextEdit {
   public SpaCy spacy = new SpaCy();
   /**Class that corrects input text.*/
 
-  public String edit(String inputString, JLanguageTool langTool) throws Exception {
+  public String edit(String inputString, int idx, JLanguageTool langTool) throws Exception {
     char[] buffer = inputString.toCharArray();
     StringBuilder sb = new StringBuilder();
     List<RuleMatch> matches = langTool.check(inputString);
@@ -27,7 +27,7 @@ public class TextEdit {
       if (match.getSuggestedReplacements().isEmpty()) {
         matchingWord = inputString.substring(matchFrom, matchTo);
       } else {
-        boolean change = toChange(inputString,matchFrom,matchTo);
+        boolean change = toChange(inputString, idx, matchFrom, matchTo);
         if (change) {
           matchingWord = match.getSuggestedReplacements().get(0);
         } else {
@@ -41,9 +41,9 @@ public class TextEdit {
     return sb.toString();
   }
 
-  private boolean toChange(String inputString, int matchFrom, int matchTo) {
-    return !isForeignSentence(inputString, matchFrom)
-            && !isProperNoun(inputString, matchFrom)
+  private boolean toChange(String inputString, int idx, int matchFrom, int matchTo) {
+    return !isForeignSentence(inputString, idx, matchFrom)
+            && !isProperNoun(inputString, idx, matchFrom)
             && !isAcronym(inputString, matchFrom, matchTo)
             && !isFileOrExtension(inputString, matchFrom, matchTo)
             && !checkFirstLetter(inputString, matchFrom)
@@ -77,12 +77,12 @@ public class TextEdit {
     return isSurname;
   }
 
-  private boolean isForeignSentence(String inputString, int matchFrom) {
-    return spacy.isForeignSentence(inputString, matchFrom);
+  private boolean isForeignSentence(String inputString, int idx, int matchFrom) {
+    return spacy.isForeignSentence(inputString, idx, matchFrom);
   }
 
-  private boolean isProperNoun(String inputString, int matchFrom) {
-    return spacy.isProperNoun(inputString, matchFrom);
+  private boolean isProperNoun(String inputString, int idx, int matchFrom) {
+    return spacy.isProperNoun(inputString, idx, matchFrom);
   }
 
   private boolean isAcronym(String inputString, int matchFrom, int matchTo) {
-- 
GitLab