From cd7a5927944804470cae72d0c814e8852a75e167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 11:26:23 +0200 Subject: [PATCH 01/35] Basic version of working service. --- Dockerfile | 23 ++++++ config.ini | 11 +++ docker-compose.yml | 14 ++++ module/pom.xml | 70 +++++++++++++++++++ .../main/java/pl/clarin/speller/Speller.java | 67 ++++++++++++++++++ .../main/java/pl/clarin/speller/TextEdit.java | 37 ++++++++++ 6 files changed, 222 insertions(+) create mode 100644 Dockerfile create mode 100644 config.ini create mode 100644 docker-compose.yml create mode 100644 module/pom.xml create mode 100644 module/src/main/java/pl/clarin/speller/Speller.java create mode 100644 module/src/main/java/pl/clarin/speller/TextEdit.java diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..348cb06 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM clarinpl/openjdk:8 as builder + +LABEL application="Speller" +LABEL description="Client - Workers - correcting mistakes in sentances in txt files" +LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology" +LABEL maintainer="bartlomiej.koptyra@pwr.edu.pl" + +WORKDIR /home/install +RUN cd nlp.worker && \ + mvn clean && \ + mvn install + +WORKDIR /home/install +COPY ./module ./module +RUN cd module && \ + mvn clean && \ + mvn install + +FROM clarinpl/openjdk-jre:8 + +WORKDIR /home/worker +COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar . +CMD ["java", "-jar", "nlp.worker.speller-1.0-SNAPSHOT.jar"] \ No newline at end of file diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..856bea7 --- /dev/null +++ b/config.ini @@ -0,0 +1,11 @@ +[service] +tool = speller + +root = /samba/requests/ +rabbit_host = rabbitmq +rabbit_user = test +rabbit_password = test +queue_prefix = nlp_ + +[tool] +workers_number = 1 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..f0a01ae --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '3' +services: + speller: + container_name: clarin_speller + build: ./ + volumes: + - '/samba:/samba' + - './config.ini:/home/worker/config.ini' + working_dir: /home/worker/ + entrypoint: + - java + - '-jar' + - nlp.worker.speller-1.0-SNAPSHOT.jar + restart: always diff --git a/module/pom.xml b/module/pom.xml new file mode 100644 index 0000000..62d8f94 --- /dev/null +++ b/module/pom.xml @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>pl.clarin</groupId> + <artifactId>nlp.worker.speller</artifactId> + <version>1.0-SNAPSHOT</version> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <maven.compiler.source>1.8</maven.compiler.source> + <maven.compiler.target>1.8</maven.compiler.target> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>nlp.worker</artifactId> + <version>1.0-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.languagetool</groupId> + <artifactId>language-pl</artifactId> + <version>5.0</version> + </dependency> + <dependency> + <groupId>org.json</groupId> + <artifactId>json</artifactId> + <version>20141113</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.3</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>pl.clarin.speller.Speller</mainClass> + </transformer> + </transformers> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + +</project> \ No newline at end of file diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java new file mode 100644 index 0000000..f924042 --- /dev/null +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -0,0 +1,67 @@ +package pl.clarin.speller; + +import java.io.*; + +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.languagetool.JLanguageTool; + +import org.json.JSONObject; +import org.languagetool.language.Polish; +import pl.clarin.ws.worker.IniFile; +import pl.clarin.ws.worker.Service; +import pl.clarin.ws.worker.Worker; + +public class Speller extends Worker { + // init object for each thread + static TextEdit textEdit = null; + static JLanguageTool langTool = null; + + public static void main(String[] args) { + new Service<>(Speller.class); + } + + @Override + public void init() throws Exception {} + + // init objects shared by threads + @Override + public void static_init(IniFile init) throws Exception { + textEdit = new TextEdit(); + langTool = new JLanguageTool(new Polish()); + } + + @Override + public void process(String fileIn, String fileOut, JSONObject param) { + File file = new File(fileIn); + try { + FileWriter fileWriter = new FileWriter(fileOut); + + StringBuilder sb = new StringBuilder(); + + try (BufferedReader br = new BufferedReader(new FileReader(file))) { + String line = null; + while ((line = br.readLine()) != null) { + try { + String corrected_line = TextEdit.edit(line, langTool); + sb.append(corrected_line).append('\n'); + } catch (Exception exception) { + Logger.getLogger(Speller.class.getName()) + .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception); + } + } + } + + try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) { + bufferedWriter.write(sb.toString()); + } + + fileWriter.close(); + + } catch (IOException exception) { + Logger.getLogger(Speller.class.getName()) + .log(Level.SEVERE, "Problems with writing: " + fileOut, exception); + } + } +} diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java new file mode 100644 index 0000000..1e767f2 --- /dev/null +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -0,0 +1,37 @@ +package pl.clarin.speller; + +import org.languagetool.JLanguageTool; +import org.languagetool.rules.RuleMatch; + +import java.util.List; + +public class TextEdit { + public static String edit(String wrong_input, JLanguageTool langTool) throws Exception { + char[] buffer = wrong_input.toCharArray(); + StringBuilder sb = new StringBuilder(); + List<RuleMatch> matches = langTool.check(wrong_input); + int string_index = 0; + for (RuleMatch match : matches) { + int match_from = match.getFromPos(); + int match_to = match.getToPos(); + + if (match_from > string_index) { + sb.append(buffer, string_index, (match_from - string_index)); + } else if (match_from < string_index) { + throw new Exception("RuleMatches are not sorted for some reason."); + } + String matching_word; + if (match.getSuggestedReplacements().isEmpty()) { + // divide into words and fix + matching_word = wrong_input.substring(match_from, match_to); + } else { + matching_word = match.getSuggestedReplacements().get(0); + } + + sb.append(matching_word.toCharArray(), 0, (matching_word.length())); + string_index = match_to; + } + sb.append(buffer, string_index, (buffer.length - string_index)); + return sb.toString(); + } +} -- GitLab From 3a4c34e63ca6e6b3b95fdaaec34ae85334de7fc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 12:29:06 +0200 Subject: [PATCH 02/35] Added gitlab-ci, refactored code. --- .gitlab-ci.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a70340f --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,15 @@ +build_image: + stage: build + image: 'docker:18.09.7' + only: + - master + services: + - 'docker:18.09.7-dind' + before_script: + - '' + script: + - docker build -t clarinpl/liner2 . + - echo $DOCKER_PASSWORD > pass.txt + - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin + - rm pass.txt + - docker push clarinpl/liner2 -- GitLab From da9ab975cb6c667a84c533579aa032ae03c95f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 12:43:48 +0200 Subject: [PATCH 03/35] Delted README, added stages to gitlab-ci --- .gitlab-ci.yml | 3 +++ README.md | 2 -- .../src/main/java/pl/clarin/speller/Speller.java | 14 +++++--------- .../src/main/java/pl/clarin/speller/TextEdit.java | 1 - 4 files changed, 8 insertions(+), 12 deletions(-) delete mode 100644 README.md diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a70340f..535ac7d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,6 @@ +stages: + - build + build_image: stage: build image: 'docker:18.09.7' diff --git a/README.md b/README.md deleted file mode 100644 index e2e02ef..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# speller - diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index f924042..a4bab37 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -1,21 +1,18 @@ package pl.clarin.speller; -import java.io.*; - -import java.util.logging.Level; -import java.util.logging.Logger; - -import org.languagetool.JLanguageTool; - import org.json.JSONObject; +import org.languagetool.JLanguageTool; import org.languagetool.language.Polish; import pl.clarin.ws.worker.IniFile; import pl.clarin.ws.worker.Service; import pl.clarin.ws.worker.Worker; +import java.io.*; +import java.util.logging.Level; +import java.util.logging.Logger; + public class Speller extends Worker { // init object for each thread - static TextEdit textEdit = null; static JLanguageTool langTool = null; public static void main(String[] args) { @@ -28,7 +25,6 @@ public class Speller extends Worker { // init objects shared by threads @Override public void static_init(IniFile init) throws Exception { - textEdit = new TextEdit(); langTool = new JLanguageTool(new Polish()); } diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 1e767f2..d2c8fc6 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -22,7 +22,6 @@ public class TextEdit { } String matching_word; if (match.getSuggestedReplacements().isEmpty()) { - // divide into words and fix matching_word = wrong_input.substring(match_from, match_to); } else { matching_word = match.getSuggestedReplacements().get(0); -- GitLab From ef7478debc911af07a8a7558aa7edf4ea4289e83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:19:26 +0200 Subject: [PATCH 04/35] Test for pipeline. --- .gitlab-ci.yml | 9 ++++++ .../main/java/pl/clarin/speller/Speller.java | 16 ++++++---- .../main/java/pl/clarin/speller/TextEdit.java | 30 +++++++++---------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 535ac7d..e3f14cb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,15 @@ stages: + - check_style - build +checkstyle: + image: clarinpl/openjdk:8 + stage: check_style + script: + - mvn checkstyle:checkstyle + - cat checkstyle-result.xml + allow_failure: false + build_image: stage: build image: 'docker:18.09.7' diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index a4bab37..5c2c474 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -1,5 +1,13 @@ package pl.clarin.speller; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.logging.Level; +import java.util.logging.Logger; import org.json.JSONObject; import org.languagetool.JLanguageTool; import org.languagetool.language.Polish; @@ -7,9 +15,7 @@ import pl.clarin.ws.worker.IniFile; import pl.clarin.ws.worker.Service; import pl.clarin.ws.worker.Worker; -import java.io.*; -import java.util.logging.Level; -import java.util.logging.Logger; + public class Speller extends Worker { // init object for each thread @@ -40,8 +46,8 @@ public class Speller extends Worker { String line = null; while ((line = br.readLine()) != null) { try { - String corrected_line = TextEdit.edit(line, langTool); - sb.append(corrected_line).append('\n'); + String correctedLine = TextEdit.edit(line, langTool); + sb.append(correctedLine).append('\n'); } catch (Exception exception) { Logger.getLogger(Speller.class.getName()) .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception); diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index d2c8fc6..102dafd 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -1,34 +1,34 @@ package pl.clarin.speller; +import java.util.List; import org.languagetool.JLanguageTool; import org.languagetool.rules.RuleMatch; -import java.util.List; - public class TextEdit { - public static String edit(String wrong_input, JLanguageTool langTool) throws Exception { - char[] buffer = wrong_input.toCharArray(); + // class that corrects input text + public static String edit(String inputString, JLanguageTool langTool) throws Exception { + char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); - List<RuleMatch> matches = langTool.check(wrong_input); + List<RuleMatch> matches = langTool.check(inputString); int string_index = 0; for (RuleMatch match : matches) { - int match_from = match.getFromPos(); - int match_to = match.getToPos(); + int matchFrom = match.getFromPos(); + int matchTo = match.getToPos(); - if (match_from > string_index) { - sb.append(buffer, string_index, (match_from - string_index)); - } else if (match_from < string_index) { + if (matchFrom > string_index) { + sb.append(buffer, string_index, (matchFrom - string_index)); + } else if (matchFrom < string_index) { throw new Exception("RuleMatches are not sorted for some reason."); } - String matching_word; + String matchingWord; if (match.getSuggestedReplacements().isEmpty()) { - matching_word = wrong_input.substring(match_from, match_to); + matchingWord = inputString.substring(matchFrom, matchTo); } else { - matching_word = match.getSuggestedReplacements().get(0); + matchingWord = match.getSuggestedReplacements().get(0); } - sb.append(matching_word.toCharArray(), 0, (matching_word.length())); - string_index = match_to; + sb.append(matchingWord.toCharArray(), 0, (matchingWord.length())); + string_index = matchTo; } sb.append(buffer, string_index, (buffer.length - string_index)); return sb.toString(); -- GitLab From e9f48367e2ba4e27b68fcb4292733a76fbfd6657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:22:04 +0200 Subject: [PATCH 05/35] Pipeline test 2. --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e3f14cb..2301573 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,7 @@ checkstyle: image: clarinpl/openjdk:8 stage: check_style script: + - cd module - mvn checkstyle:checkstyle - cat checkstyle-result.xml allow_failure: false -- GitLab From 5b03055c034a13c891a3c7e0bc96f4b7c1d3c74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:25:51 +0200 Subject: [PATCH 06/35] Test 3 --- .gitlab-ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2301573..39ea5a8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,13 +2,13 @@ stages: - check_style - build -checkstyle: - image: clarinpl/openjdk:8 + checkstyle-lint: + image: clarinpl/openjdk:8 stage: check_style + before_script: + - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar script: - - cd module - - mvn checkstyle:checkstyle - - cat checkstyle-result.xml + - java -jar checkstyle-8.26-all.jar -c google_checks.xml src allow_failure: false build_image: -- GitLab From def18b0a25501991c623b310cbb221ea936dcf1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:28:48 +0200 Subject: [PATCH 07/35] Test 4. --- .gitlab-ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 39ea5a8..a01bdc9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,16 +1,16 @@ stages: - check_style - build - - checkstyle-lint: - image: clarinpl/openjdk:8 +checkstyle-lint: + image: 'clarinpl/openjdk:8' stage: check_style before_script: - - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar + - >- + curl -OL + https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar script: - java -jar checkstyle-8.26-all.jar -c google_checks.xml src allow_failure: false - build_image: stage: build image: 'docker:18.09.7' -- GitLab From ab889f6145f4b53a25e79bf6bb58f9d5a4606b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:31:09 +0200 Subject: [PATCH 08/35] Test 5 --- .gitlab-ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a01bdc9..764fc6f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,16 +1,16 @@ stages: - check_style - build -checkstyle-lint: - image: 'clarinpl/openjdk:8' + + checkstyle-lint: + image: clarinpl/openjdk:8 stage: check_style before_script: - - >- - curl -OL - https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar + - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar script: - java -jar checkstyle-8.26-all.jar -c google_checks.xml src allow_failure: false + build_image: stage: build image: 'docker:18.09.7' -- GitLab From 4ac33e10e6e34e71e64a2d5894f452da29d45a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:34:33 +0200 Subject: [PATCH 09/35] Test 6 --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 764fc6f..bf13a1c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,13 +2,14 @@ stages: - check_style - build - checkstyle-lint: +checkstyle-lint: image: clarinpl/openjdk:8 stage: check_style before_script: + - sudo apt install curl - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar script: - - java -jar checkstyle-8.26-all.jar -c google_checks.xml src + - java -jar checkstyle-8.26-all.jar -c google_checks.xml module/src allow_failure: false build_image: -- GitLab From cda995d481eefddb3213080788aabe612980474a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:36:46 +0200 Subject: [PATCH 10/35] Test 7 --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bf13a1c..0431e52 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,12 +1,12 @@ +image: clarinpl/openjdk:8 + stages: - check_style - build checkstyle-lint: - image: clarinpl/openjdk:8 stage: check_style before_script: - - sudo apt install curl - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar script: - java -jar checkstyle-8.26-all.jar -c google_checks.xml module/src -- GitLab From a6500b2071a08d4f287627d1f8216f46c4524693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:37:39 +0200 Subject: [PATCH 11/35] Test 8 --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0431e52..e207e77 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: clarinpl/openjdk:8 +image: openjdk:latest stages: - check_style -- GitLab From fc9b7ec05635005330c15f0f0a5259f77985a059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:46:46 +0200 Subject: [PATCH 12/35] Test 9 . --- .gitlab-ci.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e207e77..a11d1d6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: openjdk:latest +image: clarinpl/openjdk:8 stages: - check_style @@ -6,10 +6,9 @@ stages: checkstyle-lint: stage: check_style - before_script: - - curl -OL https://github.com/checkstyle/checkstyle/releases/download/checkstyle-8.26/checkstyle-8.26-all.jar script: - - java -jar checkstyle-8.26-all.jar -c google_checks.xml module/src + - cd module + - mvn checkstyle:checkstyle allow_failure: false build_image: -- GitLab From c4190b3036887e95bc3a9a2811bcf356e4b6605a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 13:50:07 +0200 Subject: [PATCH 13/35] Tes 10 --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a11d1d6..09f1d2a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -9,6 +9,7 @@ checkstyle-lint: script: - cd module - mvn checkstyle:checkstyle + - cat target/checkstyle-result.xml allow_failure: false build_image: -- GitLab From c35104d62e4e641ccd0c44b16074ebccd245e47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 14:08:29 +0200 Subject: [PATCH 14/35] Test 11 --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 09f1d2a..940008b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,7 +8,7 @@ checkstyle-lint: stage: check_style script: - cd module - - mvn checkstyle:checkstyle + - mvn checkstyle:check - cat target/checkstyle-result.xml allow_failure: false -- GitLab From 0c6d507f4a5a40cf41268f29d5ed9191e234711b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 14:20:09 +0200 Subject: [PATCH 15/35] Working spellchecker? --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 940008b..20ea932 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,7 +8,7 @@ checkstyle-lint: stage: check_style script: - cd module - - mvn checkstyle:check + - mvn checkstyle:check -DconfigLocation=google_checks.xml - cat target/checkstyle-result.xml allow_failure: false -- GitLab From ab73641a2fdc83d2036a741f063448f0ed093cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 14:29:48 +0200 Subject: [PATCH 16/35] Changed ruleset to be google java style. --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 20ea932..ca787b9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,7 +8,7 @@ checkstyle-lint: stage: check_style script: - cd module - - mvn checkstyle:check -DconfigLocation=google_checks.xml + - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml - cat target/checkstyle-result.xml allow_failure: false -- GitLab From b6fc7143fbc870d9231c7ad65cb838d254d52f41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 14:41:04 +0200 Subject: [PATCH 17/35] Changed job name. --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ca787b9..9336cf5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: - check_style - build -checkstyle-lint: +google_checks: stage: check_style script: - cd module -- GitLab From 20f9085c5295f7f5acf96fd2be9ac8c70e415952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 14:52:07 +0200 Subject: [PATCH 18/35] Removed redundant line from -ci --- .gitlab-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9336cf5..3676e88 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,6 @@ google_checks: - cd module - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml - cat target/checkstyle-result.xml - allow_failure: false build_image: stage: build -- GitLab From 34fac3ba592283fcaf57b389c67946be5a0284ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 16:39:22 +0200 Subject: [PATCH 19/35] Changed vilolation severity to warning. --- .gitlab-ci.yml | 3 +-- module/src/main/java/pl/clarin/speller/TextEdit.java | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3676e88..3cb8b5b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,8 +8,7 @@ google_checks: stage: check_style script: - cd module - - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml - - cat target/checkstyle-result.xml + - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true build_image: stage: build diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 102dafd..33cbf0e 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -10,14 +10,14 @@ public class TextEdit { char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); List<RuleMatch> matches = langTool.check(inputString); - int string_index = 0; + int stringIndex = 0; for (RuleMatch match : matches) { int matchFrom = match.getFromPos(); int matchTo = match.getToPos(); - if (matchFrom > string_index) { - sb.append(buffer, string_index, (matchFrom - string_index)); - } else if (matchFrom < string_index) { + if (matchFrom > stringIndex) { + sb.append(buffer, stringIndex, (matchFrom - stringIndex)); + } else if (matchFrom < stringIndex) { throw new Exception("RuleMatches are not sorted for some reason."); } String matchingWord; @@ -28,9 +28,9 @@ public class TextEdit { } sb.append(matchingWord.toCharArray(), 0, (matchingWord.length())); - string_index = matchTo; + stringIndex = matchTo; } - sb.append(buffer, string_index, (buffer.length - string_index)); + sb.append(buffer, stringIndex, (buffer.length - stringIndex)); return sb.toString(); } } -- GitLab From 634616f4a54defba76df9bd73ed48cfc76b5f2f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 16:44:49 +0200 Subject: [PATCH 20/35] Fixed checkstyle violations. --- module/src/main/java/pl/clarin/speller/Speller.java | 4 +++- module/src/main/java/pl/clarin/speller/TextEdit.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 5c2c474..701a5d1 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -26,7 +26,9 @@ public class Speller extends Worker { } @Override - public void init() throws Exception {} + public void init() throws Exception { + + } // init objects shared by threads @Override diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 33cbf0e..71bb6fb 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -5,7 +5,7 @@ import org.languagetool.JLanguageTool; import org.languagetool.rules.RuleMatch; public class TextEdit { - // class that corrects input text + /**Class that corrects input text.*/ public static String edit(String inputString, JLanguageTool langTool) throws Exception { char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); -- GitLab From a66747642bcf44f7e6a34ac6c2afd2476f3dfc0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 16:53:29 +0200 Subject: [PATCH 21/35] Example commit with violation. --- module/src/main/java/pl/clarin/speller/Speller.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 701a5d1..ba05c94 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -26,9 +26,7 @@ public class Speller extends Worker { } @Override - public void init() throws Exception { - - } + public void init() throws Exception { } // init objects shared by threads @Override -- GitLab From f48aee568bd492a1d2faa16f6b77e586075d565e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 20 Jul 2020 16:54:00 +0200 Subject: [PATCH 22/35] Corrected example violation. --- module/src/main/java/pl/clarin/speller/Speller.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index ba05c94..0010e66 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -26,7 +26,9 @@ public class Speller extends Worker { } @Override - public void init() throws Exception { } + public void init() throws Exception { + + } // init objects shared by threads @Override -- GitLab From 047c02f0579115a31d4ca1ab6a496f386002f7dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Tue, 21 Jul 2020 15:40:05 +0200 Subject: [PATCH 23/35] Improved accuracy of speller by adding rules when not to change a word. --- .../main/java/pl/clarin/speller/Speller.java | 2 +- .../main/java/pl/clarin/speller/TextEdit.java | 168 +++++++++++++++++- 2 files changed, 167 insertions(+), 3 deletions(-) diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 0010e66..701a5d1 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -27,7 +27,7 @@ public class Speller extends Worker { @Override public void init() throws Exception { - + } // init objects shared by threads diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 71bb6fb..654a49b 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -1,6 +1,8 @@ package pl.clarin.speller; +import java.nio.charset.Charset; import java.util.List; + import org.languagetool.JLanguageTool; import org.languagetool.rules.RuleMatch; @@ -24,13 +26,175 @@ public class TextEdit { if (match.getSuggestedReplacements().isEmpty()) { matchingWord = inputString.substring(matchFrom, matchTo); } else { - matchingWord = match.getSuggestedReplacements().get(0); + boolean change = toChange(inputString,matchFrom,matchTo); + if (change){ + matchingWord = match.getSuggestedReplacements().get(0); + }else{ + matchingWord = inputString.substring(matchFrom, matchTo); + } } - sb.append(matchingWord.toCharArray(), 0, (matchingWord.length())); stringIndex = matchTo; } sb.append(buffer, stringIndex, (buffer.length - stringIndex)); return sb.toString(); } + + private static boolean toChange(String inputString, int matchFrom, int matchTo){ + if(isProperNoun(inputString,matchFrom) || isAcronym(inputString,matchFrom,matchTo) + || isFileOrExtension(inputString,matchFrom,matchTo) || checkFirstLetter(inputString,matchFrom) + || isNotPolish(inputString, matchFrom, matchTo) || isSurname(inputString, matchFrom, matchTo)){ + return false; + }else{ + return true; + } + } + + private static boolean isSurname(String inputString, int matchFrom, int matchTo){ + boolean isSurname = false; + int i = matchFrom; + if((i-4) > 0 && Character.isUpperCase(inputString.charAt(i)) + && Character.isWhitespace(inputString.charAt(i-1)) + && (inputString.charAt(i-2)) == '.' + && Character.isUpperCase(inputString.charAt(i-3))){ + i = i - 3; + while(i > 0 && Character.isUpperCase(inputString.charAt(i))){ + --i; + } + if(i == 0 || Character.isWhitespace(inputString.charAt(i))){ + i = matchFrom + 1; + isSurname = true; + while(i < matchTo){ + if(!Character.isLowerCase(inputString.charAt(i)) && !(i == matchTo-1 && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)){ + isSurname = false; + } + ++i; + } + } + } + return isSurname; + } + + private static boolean isProperNoun(String inputString, int matchFrom){ + boolean isProperNoun = false; + if (Character.isUpperCase(inputString.charAt(matchFrom))) { + isProperNoun = false; + int i = matchFrom - 1; + while (i > 0) { + if (Character.isLetterOrDigit(inputString.charAt(i))) { + isProperNoun = true; + break; + } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { + break; + } + --i; + } + } + return isProperNoun; + } + + private static boolean isAcronym(String inputString, int matchFrom, int matchTo){ + double breakPoint = 0.6; + boolean isAcronym = false; + int lowerCaseLetters = 0; + if (Character.isUpperCase(inputString.charAt(matchFrom)) || Character.isDigit(inputString.charAt(matchFrom))) { + int i = matchFrom + 1; + while(i < matchTo){ + if(Character.isLowerCase(inputString.charAt(i))){ + ++lowerCaseLetters; + } + ++i; + } + if(((matchTo-matchFrom) * breakPoint) >= lowerCaseLetters){ + isAcronym = true; + } + } + return isAcronym; + } + + private static boolean isNotPolish(String inputString, int matchFrom, int matchTo){ + String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ"; + boolean isNotPolish = false; + for (int i = matchFrom; i < matchTo; ++i){ + if(!isAscii(inputString.charAt(i))){ + if(polishLetterSet.indexOf(inputString.charAt(i)) == -1){ + isNotPolish = true; + break; + } + } + } + return isNotPolish; + } + + public static boolean isAscii(Character v) { + return Charset.forName("US-ASCII").newEncoder().canEncode(v); + } + + private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo){ + String character_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()"; + int extension_length = 4; + boolean isFileOrExtension = false; + boolean dot = false; + int lastIndex = lastIndex(inputString,matchFrom,matchTo); + int firstIndex = firstIndex(inputString,matchFrom); + int i = firstIndex + 1; + while (i < lastIndex) { + char current_char = inputString.charAt(i); + if(character_list.indexOf(current_char) == -1) { + if(current_char == '.'){ + dot = true; + ++i; + int j = extension_length; + while(i < lastIndex){ + current_char = inputString.charAt(i); + if(character_list.indexOf(current_char) == -1){ + if(current_char == '.'){ + j = extension_length; + } else if(!(i == lastIndex-1 && ".?!,".indexOf(current_char) != -1)){ + break; + } + } + --j; + if (j < 0){ + break; + } + ++i; + } + } + break; + } + ++i; + } + if (dot && i == lastIndex){ + isFileOrExtension = true; + } + return isFileOrExtension; + } + + private static int lastIndex(String inputString, int matchFrom, int matchTo){ + int i = matchFrom; + while(i < inputString.length() && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)){ + ++i; + } + return i; + } + + private static int firstIndex(String inputString, int matchFrom){ + int i = matchFrom-1; + if(i >= 0 && inputString.charAt(i) == '.'){ + while(i >= 0 && !Character.isWhitespace(inputString.charAt(i))){ + --i; + } + } + return i; + } + + private static boolean checkFirstLetter(String inputString, int at){ + String leaveCharacters = "§"; + if(leaveCharacters.indexOf(inputString.charAt(at)) != -1){ + return true; + } + return false; + } + } -- GitLab From 509e9f282774ed45da015560cffdddd603c75a81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Tue, 21 Jul 2020 16:18:36 +0200 Subject: [PATCH 24/35] Fixed checkstyle violations. --- .../main/java/pl/clarin/speller/TextEdit.java | 110 +++++++++--------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 654a49b..f0a7f49 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -2,7 +2,6 @@ package pl.clarin.speller; import java.nio.charset.Charset; import java.util.List; - import org.languagetool.JLanguageTool; import org.languagetool.rules.RuleMatch; @@ -27,9 +26,9 @@ public class TextEdit { matchingWord = inputString.substring(matchFrom, matchTo); } else { boolean change = toChange(inputString,matchFrom,matchTo); - if (change){ + if (change) { matchingWord = match.getSuggestedReplacements().get(0); - }else{ + } else { matchingWord = inputString.substring(matchFrom, matchTo); } } @@ -40,32 +39,36 @@ public class TextEdit { return sb.toString(); } - private static boolean toChange(String inputString, int matchFrom, int matchTo){ - if(isProperNoun(inputString,matchFrom) || isAcronym(inputString,matchFrom,matchTo) - || isFileOrExtension(inputString,matchFrom,matchTo) || checkFirstLetter(inputString,matchFrom) - || isNotPolish(inputString, matchFrom, matchTo) || isSurname(inputString, matchFrom, matchTo)){ + private static boolean toChange(String inputString, int matchFrom, int matchTo) { + if (isProperNoun(inputString,matchFrom) + || isAcronym(inputString,matchFrom,matchTo) + || isFileOrExtension(inputString,matchFrom,matchTo) + || checkFirstLetter(inputString,matchFrom) + || isNotPolish(inputString, matchFrom, matchTo) + || isSurname(inputString, matchFrom, matchTo)) { return false; - }else{ + } else { return true; } } - private static boolean isSurname(String inputString, int matchFrom, int matchTo){ + private static boolean isSurname(String inputString, int matchFrom, int matchTo) { boolean isSurname = false; int i = matchFrom; - if((i-4) > 0 && Character.isUpperCase(inputString.charAt(i)) - && Character.isWhitespace(inputString.charAt(i-1)) - && (inputString.charAt(i-2)) == '.' - && Character.isUpperCase(inputString.charAt(i-3))){ + if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i)) + && Character.isWhitespace(inputString.charAt(i - 1)) + && (inputString.charAt(i - 2)) == '.' + && Character.isUpperCase(inputString.charAt(i - 3))) { i = i - 3; - while(i > 0 && Character.isUpperCase(inputString.charAt(i))){ + while (i > 0 && Character.isUpperCase(inputString.charAt(i))) { --i; } - if(i == 0 || Character.isWhitespace(inputString.charAt(i))){ + if (i == 0 || Character.isWhitespace(inputString.charAt(i))) { i = matchFrom + 1; isSurname = true; - while(i < matchTo){ - if(!Character.isLowerCase(inputString.charAt(i)) && !(i == matchTo-1 && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)){ + while (i < matchTo) { + if (!Character.isLowerCase(inputString.charAt(i)) + && !(i == (matchTo - 1) && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)) { isSurname = false; } ++i; @@ -75,7 +78,7 @@ public class TextEdit { return isSurname; } - private static boolean isProperNoun(String inputString, int matchFrom){ + private static boolean isProperNoun(String inputString, int matchFrom) { boolean isProperNoun = false; if (Character.isUpperCase(inputString.charAt(matchFrom))) { isProperNoun = false; @@ -93,31 +96,32 @@ public class TextEdit { return isProperNoun; } - private static boolean isAcronym(String inputString, int matchFrom, int matchTo){ + private static boolean isAcronym(String inputString, int matchFrom, int matchTo) { double breakPoint = 0.6; boolean isAcronym = false; int lowerCaseLetters = 0; - if (Character.isUpperCase(inputString.charAt(matchFrom)) || Character.isDigit(inputString.charAt(matchFrom))) { + if (Character.isUpperCase(inputString.charAt(matchFrom)) + || Character.isDigit(inputString.charAt(matchFrom))) { int i = matchFrom + 1; - while(i < matchTo){ - if(Character.isLowerCase(inputString.charAt(i))){ + while (i < matchTo) { + if (Character.isLowerCase(inputString.charAt(i))) { ++lowerCaseLetters; } ++i; } - if(((matchTo-matchFrom) * breakPoint) >= lowerCaseLetters){ - isAcronym = true; - } + if (((matchTo - matchFrom) * breakPoint) >= lowerCaseLetters) { + isAcronym = true; + } } return isAcronym; } - private static boolean isNotPolish(String inputString, int matchFrom, int matchTo){ + private static boolean isNotPolish(String inputString, int matchFrom, int matchTo) { String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ"; boolean isNotPolish = false; - for (int i = matchFrom; i < matchTo; ++i){ - if(!isAscii(inputString.charAt(i))){ - if(polishLetterSet.indexOf(inputString.charAt(i)) == -1){ + for (int i = matchFrom; i < matchTo; ++i) { + if (!isAscii(inputString.charAt(i))) { + if (polishLetterSet.indexOf(inputString.charAt(i)) == -1) { isNotPolish = true; break; } @@ -130,32 +134,33 @@ public class TextEdit { return Charset.forName("US-ASCII").newEncoder().canEncode(v); } - private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo){ - String character_list = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()"; - int extension_length = 4; + private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) { + String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG" + + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()"; + int extensionLength = 4; boolean isFileOrExtension = false; boolean dot = false; int lastIndex = lastIndex(inputString,matchFrom,matchTo); int firstIndex = firstIndex(inputString,matchFrom); int i = firstIndex + 1; while (i < lastIndex) { - char current_char = inputString.charAt(i); - if(character_list.indexOf(current_char) == -1) { - if(current_char == '.'){ + char currentChar = inputString.charAt(i); + if (characterList.indexOf(currentChar) == -1) { + if (currentChar == '.') { dot = true; ++i; - int j = extension_length; - while(i < lastIndex){ - current_char = inputString.charAt(i); - if(character_list.indexOf(current_char) == -1){ - if(current_char == '.'){ - j = extension_length; - } else if(!(i == lastIndex-1 && ".?!,".indexOf(current_char) != -1)){ + int j = extensionLength; + while (i < lastIndex) { + currentChar = inputString.charAt(i); + if (characterList.indexOf(currentChar) == -1) { + if (currentChar == '.') { + j = extensionLength; + } else if (!(i == lastIndex - 1 && ".?!,".indexOf(currentChar) != -1)) { break; } } --j; - if (j < 0){ + if (j < 0) { break; } ++i; @@ -165,33 +170,34 @@ public class TextEdit { } ++i; } - if (dot && i == lastIndex){ + if (dot && i == lastIndex) { isFileOrExtension = true; } return isFileOrExtension; } - private static int lastIndex(String inputString, int matchFrom, int matchTo){ + private static int lastIndex(String inputString, int matchFrom, int matchTo) { int i = matchFrom; - while(i < inputString.length() && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)){ + while (i < inputString.length() + && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) { ++i; } return i; } - private static int firstIndex(String inputString, int matchFrom){ - int i = matchFrom-1; - if(i >= 0 && inputString.charAt(i) == '.'){ - while(i >= 0 && !Character.isWhitespace(inputString.charAt(i))){ + private static int firstIndex(String inputString, int matchFrom) { + int i = matchFrom - 1; + if (i >= 0 && inputString.charAt(i) == '.') { + while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) { --i; } } return i; } - private static boolean checkFirstLetter(String inputString, int at){ + private static boolean checkFirstLetter(String inputString, int at) { String leaveCharacters = "§"; - if(leaveCharacters.indexOf(inputString.charAt(at)) != -1){ + if (leaveCharacters.indexOf(inputString.charAt(at)) != -1) { return true; } return false; -- GitLab From 14968dc33dc0a2c1900edcd2df6848af597c1447 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Fri, 24 Jul 2020 16:15:05 +0200 Subject: [PATCH 25/35] Moved JLanguageTool to init, left lang inicialization in static init. --- module/src/main/java/pl/clarin/speller/Speller.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 701a5d1..403dfe9 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -15,25 +15,23 @@ import pl.clarin.ws.worker.IniFile; import pl.clarin.ws.worker.Service; import pl.clarin.ws.worker.Worker; - - public class Speller extends Worker { - // init object for each thread - static JLanguageTool langTool = null; + static Polish lang = null; public static void main(String[] args) { new Service<>(Speller.class); } + // init object for each thread @Override public void init() throws Exception { - + JLanguageTool langTool = new JLanguageTool(lang); } // init objects shared by threads @Override public void static_init(IniFile init) throws Exception { - langTool = new JLanguageTool(new Polish()); + lang = new Polish(); } @Override -- GitLab From 360eeeb40422e82e8820a245a7b79bbdbad20114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 27 Jul 2020 08:49:53 +0200 Subject: [PATCH 26/35] Moved langTool variable declaration. --- module/src/main/java/pl/clarin/speller/Speller.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 403dfe9..d57ece2 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -17,6 +17,7 @@ import pl.clarin.ws.worker.Worker; public class Speller extends Worker { static Polish lang = null; + JLanguageTool langTool = null; public static void main(String[] args) { new Service<>(Speller.class); @@ -25,7 +26,7 @@ public class Speller extends Worker { // init object for each thread @Override public void init() throws Exception { - JLanguageTool langTool = new JLanguageTool(lang); + langTool = new JLanguageTool(lang); } // init objects shared by threads -- GitLab From a27bf30251d9c8071aa81132a07ae5a2473c41ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 27 Jul 2020 12:29:29 +0200 Subject: [PATCH 27/35] Fixed built on master. --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3cb8b5b..af39c1c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,8 +20,8 @@ build_image: before_script: - '' script: - - docker build -t clarinpl/liner2 . + - docker build -t clarinpl/speller . - echo $DOCKER_PASSWORD > pass.txt - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin - rm pass.txt - - docker push clarinpl/liner2 + - docker push clarinpl/speller -- GitLab From c472f4a80b4743c9fa20dcb3d95342763b838b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Fri, 31 Jul 2020 10:13:39 +0200 Subject: [PATCH 28/35] Added detecting languge using spaCy. --- Dockerfile | 9 +- SpacyDocConvert.py | 8 ++ module/pom.xml | 5 + .../main/java/pl/clarin/speller/SpaCy.java | 119 ++++++++++++++++++ .../main/java/pl/clarin/speller/Speller.java | 4 +- .../main/java/pl/clarin/speller/TextEdit.java | 52 ++++---- 6 files changed, 168 insertions(+), 29 deletions(-) create mode 100644 SpacyDocConvert.py create mode 100644 module/src/main/java/pl/clarin/speller/SpaCy.java diff --git a/Dockerfile b/Dockerfile index 348cb06..ab231c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,6 @@ FROM clarinpl/openjdk:8 as builder LABEL application="Speller" LABEL description="Client - Workers - correcting mistakes in sentances in txt files" LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology" -LABEL maintainer="bartlomiej.koptyra@pwr.edu.pl" WORKDIR /home/install RUN cd nlp.worker && \ @@ -16,6 +15,14 @@ RUN cd module && \ mvn clean && \ mvn install +FROM clarinpl/python:3.6 + +WORKDIR /home/worker + +RUN python3.6 -m pip install jep +RUN python3.6 -m pip install spacy +RUN python3.6 -m spacy download pl_core_news_lg + FROM clarinpl/openjdk-jre:8 WORKDIR /home/worker diff --git a/SpacyDocConvert.py b/SpacyDocConvert.py new file mode 100644 index 0000000..b33f1c8 --- /dev/null +++ b/SpacyDocConvert.py @@ -0,0 +1,8 @@ +def convert(spacyDoc, sentence): + idx = 0 + proper_nouns_list = [] + for tok in spacyDoc: + idx = sentence.find(tok.text, idx) + if tok.ent_type_ != '': + proper_nouns_list.append((idx, idx+len(tok.text))) + return proper_nouns_list diff --git a/module/pom.xml b/module/pom.xml index 62d8f94..2522945 100644 --- a/module/pom.xml +++ b/module/pom.xml @@ -30,6 +30,11 @@ <artifactId>json</artifactId> <version>20141113</version> </dependency> + <dependency> + <groupId>black.ninia</groupId> + <artifactId>jep</artifactId> + <version>3.9.0</version> + </dependency> </dependencies> <build> diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java new file mode 100644 index 0000000..dccf91b --- /dev/null +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -0,0 +1,119 @@ +package pl.clarin.speller; + +import jep.SharedInterpreter; +import jep.JepException; +import org.languagetool.JLanguageTool; + +import java.lang.Integer; +import java.util.ArrayList; +import java.util.List; + +public class SpaCy { + boolean loaded = false; + boolean processed = false; + SharedInterpreter interp = null; + ArrayList<List<Long>> properNounArray = null; + ArrayList<List<Long>> foreignSentenceArray = null; + { + try{ + interp = new SharedInterpreter(); + interp.exec("import sys"); + interp.exec("sys.path.append(r'/home/worker')"); + interp.exec("import SpacyDocConvert"); + interp.exec("sys.argv=[]"); + interp.exec("import spacy"); + interp.exec("from spacy_langdetect import LanguageDetector"); + interp.exec("model = pl_core_news_lg"); + interp.exec("nlp = spacy.load(model)"); + interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)"); + loaded = true; + } + catch (JepException e) { + System.out.println("An error occurred: " + e.getMessage()); + } + } + + public boolean isLoaded() { + return loaded; + } + + public boolean isProcessed() { + return processed; + } + + public void process(String inputString) throws Exception { + processed = false; + if(loaded){ + try { + interp.exec("sentence = " + inputString); + interp.exec("spacyDoc = nlp(sentence)"); + Object properNounList = interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)"); + properNounArray = ((ArrayList<List<Long>>) properNounList); + Object foreignSentList = interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)"); + foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList); + processed = true; + } + catch (JepException e) { + System.out.println("An error occurred: " + e.getMessage()); + } + } + } + + public boolean isForeignSentence(String inputString, int matchFrom) { + boolean isForeginSent = false; + if(processed) { + for (List<Long> tuple : foreignSentenceArray){ + if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){ + if(Character.isUpperCase(inputString.charAt(matchFrom))){ + return true; + }else{ + return false; + } + } + if(matchFrom > tuple.get(1).intValue()){ + return false; + } + } + } + return isForeginSent; + } + + public boolean isProperNoun(String inputString, int matchFrom) { + boolean isProperNoun = false; + if(!processed) { + if (Character.isUpperCase(inputString.charAt(matchFrom))) { + isProperNoun = false; + int i = matchFrom - 1; + while (i > 0) { + if (Character.isLetterOrDigit(inputString.charAt(i))) { + isProperNoun = true; + break; + } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { + break; + } + --i; + } + } + return isProperNoun; + }else{ + for (List<Long> tuple : properNounArray){ + if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){ + if(Character.isUpperCase(inputString.charAt(matchFrom))){ + return true; + }else{ + return false; + } + } + if(matchFrom > tuple.get(1).intValue()){ + return false; + } + } + return isProperNoun; + } + } +} + + + + + diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index d57ece2..1296761 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -18,6 +18,7 @@ import pl.clarin.ws.worker.Worker; public class Speller extends Worker { static Polish lang = null; JLanguageTool langTool = null; + TextEdit textEditor = null; public static void main(String[] args) { new Service<>(Speller.class); @@ -27,6 +28,7 @@ public class Speller extends Worker { @Override public void init() throws Exception { langTool = new JLanguageTool(lang); + textEditor = new TextEdit(); } // init objects shared by threads @@ -47,7 +49,7 @@ public class Speller extends Worker { String line = null; while ((line = br.readLine()) != null) { try { - String correctedLine = TextEdit.edit(line, langTool); + String correctedLine = textEditor.edit(line, langTool); sb.append(correctedLine).append('\n'); } catch (Exception exception) { Logger.getLogger(Speller.class.getName()) diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index f0a7f49..a42c18b 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -6,8 +6,15 @@ import org.languagetool.JLanguageTool; import org.languagetool.rules.RuleMatch; public class TextEdit { + SpaCy spacy = new SpaCy(); /**Class that corrects input text.*/ - public static String edit(String inputString, JLanguageTool langTool) throws Exception { + public String edit(String inputString, JLanguageTool langTool) throws Exception { + try { + spacy.process(inputString); + } + catch(Exception e){ + System.out.println("Spacy did not process the input correctly! : " + e.getMessage()); + } char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); List<RuleMatch> matches = langTool.check(inputString); @@ -39,8 +46,9 @@ public class TextEdit { return sb.toString(); } - private static boolean toChange(String inputString, int matchFrom, int matchTo) { - if (isProperNoun(inputString,matchFrom) + private boolean toChange(String inputString, int matchFrom, int matchTo) { + if (isForeignSentence(inputString,matchFrom) + || isProperNoun(inputString,matchFrom) || isAcronym(inputString,matchFrom,matchTo) || isFileOrExtension(inputString,matchFrom,matchTo) || checkFirstLetter(inputString,matchFrom) @@ -52,7 +60,7 @@ public class TextEdit { } } - private static boolean isSurname(String inputString, int matchFrom, int matchTo) { + private boolean isSurname(String inputString, int matchFrom, int matchTo) { boolean isSurname = false; int i = matchFrom; if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i)) @@ -78,25 +86,15 @@ public class TextEdit { return isSurname; } - private static boolean isProperNoun(String inputString, int matchFrom) { - boolean isProperNoun = false; - if (Character.isUpperCase(inputString.charAt(matchFrom))) { - isProperNoun = false; - int i = matchFrom - 1; - while (i > 0) { - if (Character.isLetterOrDigit(inputString.charAt(i))) { - isProperNoun = true; - break; - } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { - break; - } - --i; - } - } - return isProperNoun; + private boolean isForeignSentence(String inputString, int matchFrom) { + return spacy.isForeignSentence(inputString, matchFrom); + } + + private boolean isProperNoun(String inputString, int matchFrom) { + return spacy.isProperNoun(inputString, matchFrom); } - private static boolean isAcronym(String inputString, int matchFrom, int matchTo) { + private boolean isAcronym(String inputString, int matchFrom, int matchTo) { double breakPoint = 0.6; boolean isAcronym = false; int lowerCaseLetters = 0; @@ -116,7 +114,7 @@ public class TextEdit { return isAcronym; } - private static boolean isNotPolish(String inputString, int matchFrom, int matchTo) { + private boolean isNotPolish(String inputString, int matchFrom, int matchTo) { String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ"; boolean isNotPolish = false; for (int i = matchFrom; i < matchTo; ++i) { @@ -130,11 +128,11 @@ public class TextEdit { return isNotPolish; } - public static boolean isAscii(Character v) { + public boolean isAscii(Character v) { return Charset.forName("US-ASCII").newEncoder().canEncode(v); } - private static boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) { + private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) { String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG" + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()"; int extensionLength = 4; @@ -176,7 +174,7 @@ public class TextEdit { return isFileOrExtension; } - private static int lastIndex(String inputString, int matchFrom, int matchTo) { + private int lastIndex(String inputString, int matchFrom, int matchTo) { int i = matchFrom; while (i < inputString.length() && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) { @@ -185,7 +183,7 @@ public class TextEdit { return i; } - private static int firstIndex(String inputString, int matchFrom) { + private int firstIndex(String inputString, int matchFrom) { int i = matchFrom - 1; if (i >= 0 && inputString.charAt(i) == '.') { while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) { @@ -195,7 +193,7 @@ public class TextEdit { return i; } - private static boolean checkFirstLetter(String inputString, int at) { + private boolean checkFirstLetter(String inputString, int at) { String leaveCharacters = "§"; if (leaveCharacters.indexOf(inputString.charAt(at)) != -1) { return true; -- GitLab From 8c8604c7d212da9d577a84523dc9b674304de61f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Fri, 31 Jul 2020 13:08:58 +0200 Subject: [PATCH 29/35] Added tox. Might not work yet . --- .gitlab-ci.yml | 18 +++++++++++++++++- SpacyDocConvert.py | 29 +++++++++++++++++++++++++++-- tox.ini | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 tox.ini diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index af39c1c..9b653ad 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,21 @@ -image: clarinpl/openjdk:8 +image: 'clarinpl/python:3.6' +stages: + - check_style +cache: + paths: + - .tox +before_script: + - pip install tox==2.9.1 +pep8: + stage: check_style + script: + - tox -v -e pep8 +docstyle: + stage: check_style + script: + - tox -v -e docstyle +image: clarinpl/openjdk:8 stages: - check_style - build diff --git a/SpacyDocConvert.py b/SpacyDocConvert.py index b33f1c8..5987e4c 100644 --- a/SpacyDocConvert.py +++ b/SpacyDocConvert.py @@ -1,8 +1,33 @@ -def convert(spacyDoc, sentence): +"""Utility file for using spacyDoc.""" + + +def find_proper_nouns(spacyDoc, sentence): + """Function returns indices of words that are proper nouns. + + :param spacyDoc: SpacyDoc with ner tags. + :param sentence: Text spacyDoc is made from. + :return:list with proper nouns indices + """ idx = 0 proper_nouns_list = [] for tok in spacyDoc: idx = sentence.find(tok.text, idx) if tok.ent_type_ != '': - proper_nouns_list.append((idx, idx+len(tok.text))) + proper_nouns_list.append((idx, idx + len(tok.text))) return proper_nouns_list + + +def find_foreign_sentences(spacyDoc, text): + """Function returns indices of sentences that are not in Polish. + + :param spacyDoc: SpacyDoc with ner tags. + :param text: Text spacyDoc is made from. + :return: list with foreign sentences indices + """ + idx = 0 + foreign_sentences_list = [] + for sent in spacyDoc.sents: + if sent._.language.get('language', 'no') != 'pl': + idx = text.find(sent.text, idx) + foreign_sentences_list.append((idx, idx + len(sent.text))) + return foreign_sentences_list diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..819e612 --- /dev/null +++ b/tox.ini @@ -0,0 +1,44 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 +basepython = python3 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python3 +commands = + pydocstyle --verbose {posargs} + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 +match-dir = ^(?!\.tox|venv).* +match = ^(?!setup).*\.py \ No newline at end of file -- GitLab From da993cedfe4e8e25d4ae34a5cca92f8c1f484a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Fri, 31 Jul 2020 16:49:35 +0200 Subject: [PATCH 30/35] Sucessfully added spaCy to speller. First version that works on tester. --- Dockerfile | 28 ++- docker-compose.yml | 6 +- entrypoint.sh | 5 + .../main/java/pl/clarin/speller/SpaCy.java | 191 +++++++++--------- .../main/java/pl/clarin/speller/Speller.java | 26 ++- .../main/java/pl/clarin/speller/TextEdit.java | 30 ++- requirements.txt | 3 + 7 files changed, 147 insertions(+), 142 deletions(-) create mode 100644 entrypoint.sh create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile index ab231c7..b2aaee2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM clarinpl/openjdk:8 as builder +FROM clarinpl/openjdk:8 LABEL application="Speller" LABEL description="Client - Workers - correcting mistakes in sentances in txt files" @@ -15,16 +15,24 @@ RUN cd module && \ mvn clean && \ mvn install -FROM clarinpl/python:3.6 +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3.5 && \ + apt-get install -y python3-pip + +ENV LD_LIBRARY_PATH=/usr/local/lib/python3.5/dist-packages/jep/ WORKDIR /home/worker +COPY ./SpacyDocConvert.py . +COPY ./requirements.txt . +COPY ./entrypoint.sh ./entrypoint.sh -RUN python3.6 -m pip install jep -RUN python3.6 -m pip install spacy -RUN python3.6 -m spacy download pl_core_news_lg - -FROM clarinpl/openjdk-jre:8 +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install -r requirements.txt -WORKDIR /home/worker -COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar . -CMD ["java", "-jar", "nlp.worker.speller-1.0-SNAPSHOT.jar"] \ No newline at end of file +RUN ["chmod", "+x", "./entrypoint.sh"] + +RUN cp /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar ./nlp.worker.speller-1.0-SNAPSHOT.jar +CMD ["./entrypoint.sh"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f0a01ae..06c895f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,7 @@ services: - './config.ini:/home/worker/config.ini' working_dir: /home/worker/ entrypoint: - - java - - '-jar' - - nlp.worker.speller-1.0-SNAPSHOT.jar + - ./entrypoint.sh + environment: + - PYTHONUNBUFFERED=0 restart: always diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..d864430 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,5 @@ +#!/bin/sh +cd /home/worker +echo "Downloading model" +python3 -m spacy download "pl_core_news_lg" +java -jar nlp.worker.speller-1.0-SNAPSHOT.jar \ No newline at end of file diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java index dccf91b..b14de22 100644 --- a/module/src/main/java/pl/clarin/speller/SpaCy.java +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -1,119 +1,108 @@ package pl.clarin.speller; -import jep.SharedInterpreter; -import jep.JepException; -import org.languagetool.JLanguageTool; - -import java.lang.Integer; import java.util.ArrayList; import java.util.List; +import jep.JepException; +import jep.SharedInterpreter; public class SpaCy { - boolean loaded = false; - boolean processed = false; - SharedInterpreter interp = null; - ArrayList<List<Long>> properNounArray = null; - ArrayList<List<Long>> foreignSentenceArray = null; - { - try{ - interp = new SharedInterpreter(); - interp.exec("import sys"); - interp.exec("sys.path.append(r'/home/worker')"); - interp.exec("import SpacyDocConvert"); - interp.exec("sys.argv=[]"); - interp.exec("import spacy"); - interp.exec("from spacy_langdetect import LanguageDetector"); - interp.exec("model = pl_core_news_lg"); - interp.exec("nlp = spacy.load(model)"); - interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)"); - loaded = true; - } - catch (JepException e) { - System.out.println("An error occurred: " + e.getMessage()); - } - } + /**Class that handles spacy processing.*/ + boolean loaded = false; + boolean processed = false; + SharedInterpreter interp = null; + ArrayList<List<Long>> properNounArray = null; + ArrayList<List<Long>> foreignSentenceArray = null; - public boolean isLoaded() { - return loaded; + { + try { + interp = new SharedInterpreter(); + interp.exec("import sys"); + interp.exec("sys.path.append(r'/home/worker')"); + interp.exec("import SpacyDocConvert"); + interp.exec("sys.argv=[]"); + interp.exec("import spacy"); + interp.exec("from spacy_langdetect import LanguageDetector"); + interp.exec("model = 'pl_core_news_lg'"); + interp.exec("nlp = spacy.load(model)"); + interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)"); + loaded = true; + } catch (JepException e) { + System.out.println("An error occurred: " + e.getMessage()); } + } - public boolean isProcessed() { - return processed; - } + public boolean isLoaded() { + return loaded; + } - public void process(String inputString) throws Exception { - processed = false; - if(loaded){ - try { - interp.exec("sentence = " + inputString); - interp.exec("spacyDoc = nlp(sentence)"); - Object properNounList = interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)"); - properNounArray = ((ArrayList<List<Long>>) properNounList); - Object foreignSentList = interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)"); - foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList); - processed = true; - } - catch (JepException e) { - System.out.println("An error occurred: " + e.getMessage()); - } - } + public boolean isProcessed() { + return processed; + } + + /**Javadoc.*/ + public void process(String inputString) throws Exception { + processed = false; + if (loaded) { + try { + interp.exec("sentence = r'" + inputString.replace("'", "\\'") + "'"); + interp.exec("spacyDoc = nlp(sentence)"); + Object properNounList = + interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)"); + properNounArray = ((ArrayList<List<Long>>) properNounList); + Object foreignSentList = + interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)"); + foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList); + processed = true; + } catch (JepException e) { + System.out.println("An error occurred: " + e.getMessage()); + } } + } - public boolean isForeignSentence(String inputString, int matchFrom) { - boolean isForeginSent = false; - if(processed) { - for (List<Long> tuple : foreignSentenceArray){ - if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){ - if(Character.isUpperCase(inputString.charAt(matchFrom))){ - return true; - }else{ - return false; - } - } - if(matchFrom > tuple.get(1).intValue()){ - return false; - } - } + /**Javadoc.*/ + public boolean isForeignSentence(String inputString, int matchFrom) { + boolean isForeginSent = false; + if (processed) { + for (List<Long> tuple : foreignSentenceArray) { + if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) { + return Character.isUpperCase(inputString.charAt(matchFrom)); + } + if (matchFrom > tuple.get(1).intValue()) { + return false; } - return isForeginSent; + } } + return isForeginSent; + } - public boolean isProperNoun(String inputString, int matchFrom) { - boolean isProperNoun = false; - if(!processed) { - if (Character.isUpperCase(inputString.charAt(matchFrom))) { - isProperNoun = false; - int i = matchFrom - 1; - while (i > 0) { - if (Character.isLetterOrDigit(inputString.charAt(i))) { - isProperNoun = true; - break; - } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { - break; - } - --i; - } - } - return isProperNoun; - }else{ - for (List<Long> tuple : properNounArray){ - if(matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()){ - if(Character.isUpperCase(inputString.charAt(matchFrom))){ - return true; - }else{ - return false; - } - } - if(matchFrom > tuple.get(1).intValue()){ - return false; - } - } - return isProperNoun; + /**Javadoc.*/ + public boolean isProperNoun(String inputString, int matchFrom) { + boolean isProperNoun = false; + if (!processed) { + if (Character.isUpperCase(inputString.charAt(matchFrom))) { + isProperNoun = false; + int i = matchFrom - 1; + while (i > 0) { + if (Character.isLetterOrDigit(inputString.charAt(i))) { + isProperNoun = true; + break; + } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { + break; + } + --i; + } + } + return isProperNoun; + } else { + for (List<Long> tuple : properNounArray) { + if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) { + return Character.isUpperCase(inputString.charAt(matchFrom)); } + if (matchFrom > tuple.get(1).intValue()) { + return false; + } + } + return isProperNoun; } + } } - - - - - diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 1296761..5b396a7 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -2,10 +2,13 @@ package pl.clarin.speller; import java.io.BufferedReader; import java.io.BufferedWriter; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.util.logging.Level; import java.util.logging.Logger; import org.json.JSONObject; @@ -39,13 +42,16 @@ public class Speller extends Worker { @Override public void process(String fileIn, String fileOut, JSONObject param) { - File file = new File(fileIn); try { - FileWriter fileWriter = new FileWriter(fileOut); + FileInputStream fstream = new FileInputStream(fileIn); + + Writer out = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(fileOut), StandardCharsets.UTF_8)); StringBuilder sb = new StringBuilder(); - try (BufferedReader br = new BufferedReader(new FileReader(file))) { + try (BufferedReader br = new BufferedReader( + new InputStreamReader(fstream, StandardCharsets.UTF_8))) { String line = null; while ((line = br.readLine()) != null) { try { @@ -58,12 +64,12 @@ public class Speller extends Worker { } } - try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) { - bufferedWriter.write(sb.toString()); + try { + out.write(sb.toString()); + } finally { + out.close(); } - fileWriter.close(); - } catch (IOException exception) { Logger.getLogger(Speller.class.getName()) .log(Level.SEVERE, "Problems with writing: " + fileOut, exception); diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index a42c18b..9239c73 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -1,6 +1,7 @@ package pl.clarin.speller; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.List; import org.languagetool.JLanguageTool; import org.languagetool.rules.RuleMatch; @@ -8,11 +9,11 @@ import org.languagetool.rules.RuleMatch; public class TextEdit { SpaCy spacy = new SpaCy(); /**Class that corrects input text.*/ + public String edit(String inputString, JLanguageTool langTool) throws Exception { try { spacy.process(inputString); - } - catch(Exception e){ + } catch (Exception e) { System.out.println("Spacy did not process the input correctly! : " + e.getMessage()); } char[] buffer = inputString.toCharArray(); @@ -47,17 +48,13 @@ public class TextEdit { } private boolean toChange(String inputString, int matchFrom, int matchTo) { - if (isForeignSentence(inputString,matchFrom) - || isProperNoun(inputString,matchFrom) - || isAcronym(inputString,matchFrom,matchTo) - || isFileOrExtension(inputString,matchFrom,matchTo) - || checkFirstLetter(inputString,matchFrom) - || isNotPolish(inputString, matchFrom, matchTo) - || isSurname(inputString, matchFrom, matchTo)) { - return false; - } else { - return true; - } + return !isForeignSentence(inputString, matchFrom) + && !isProperNoun(inputString, matchFrom) + && !isAcronym(inputString, matchFrom, matchTo) + && !isFileOrExtension(inputString, matchFrom, matchTo) + && !checkFirstLetter(inputString, matchFrom) + && !isNotPolish(inputString, matchFrom, matchTo) + && !isSurname(inputString, matchFrom, matchTo); } private boolean isSurname(String inputString, int matchFrom, int matchTo) { @@ -129,7 +126,7 @@ public class TextEdit { } public boolean isAscii(Character v) { - return Charset.forName("US-ASCII").newEncoder().canEncode(v); + return StandardCharsets.US_ASCII.newEncoder().canEncode(v); } private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) { @@ -195,10 +192,7 @@ public class TextEdit { private boolean checkFirstLetter(String inputString, int at) { String leaveCharacters = "§"; - if (leaveCharacters.indexOf(inputString.charAt(at)) != -1) { - return true; - } - return false; + return leaveCharacters.indexOf(inputString.charAt(at)) != -1; } } diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1c9e461 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +jep +spacy +spacy-langdetect \ No newline at end of file -- GitLab From ea02135d2171a95a8c697f298338cd34ef0648bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Mon, 3 Aug 2020 08:38:25 +0200 Subject: [PATCH 31/35] Changed .gitlab-cI --- .gitlab-ci.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9b653ad..f117f7f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,32 +1,33 @@ -image: 'clarinpl/python:3.6' stages: - check_style + - build cache: paths: - .tox -before_script: - - pip install tox==2.9.1 pep8: + image: 'clarinpl/python:3.6' + before_script: + - pip install tox==2.9.1 stage: check_style script: - tox -v -e pep8 docstyle: + image: 'clarinpl/python:3.6' + before_script: + - pip install tox==2.9.1 stage: check_style script: - tox -v -e docstyle -image: clarinpl/openjdk:8 -stages: - - check_style - - build - google_checks: + image: clarinpl/openjdk:8 stage: check_style script: - cd module - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true build_image: + image: clarinpl/openjdk:8 stage: build image: 'docker:18.09.7' only: -- GitLab From 484901e67d73e8307928baf94bb46ae0f96a4cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Tue, 4 Aug 2020 15:04:29 +0200 Subject: [PATCH 32/35] Changed speller to take input piped from spacy service instead of running its own spaCy. --- .gitlab-ci.yml | 22 +--- Dockerfile | 23 +--- SpacyDocConvert.py | 33 ------ docker-compose.yml | 2 - entrypoint.sh | 2 - module/pom.xml | 5 - .../main/java/pl/clarin/speller/SpaCy.java | 112 +++++++++--------- .../main/java/pl/clarin/speller/Speller.java | 32 ++++- .../main/java/pl/clarin/speller/TextEdit.java | 6 - tox.ini | 44 ------- 10 files changed, 90 insertions(+), 191 deletions(-) delete mode 100644 SpacyDocConvert.py delete mode 100644 tox.ini diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f117f7f..71b6d71 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,33 +1,13 @@ +image: clarinpl/openjdk:8 stages: - check_style - build -cache: - paths: - - .tox -pep8: - image: 'clarinpl/python:3.6' - before_script: - - pip install tox==2.9.1 - stage: check_style - script: - - tox -v -e pep8 -docstyle: - image: 'clarinpl/python:3.6' - before_script: - - pip install tox==2.9.1 - stage: check_style - script: - - tox -v -e docstyle - google_checks: - image: clarinpl/openjdk:8 stage: check_style script: - cd module - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true - build_image: - image: clarinpl/openjdk:8 stage: build image: 'docker:18.09.7' only: diff --git a/Dockerfile b/Dockerfile index b2aaee2..ed8e888 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM clarinpl/openjdk:8 +FROM clarinpl/openjdk:8 as builder LABEL application="Speller" LABEL description="Client - Workers - correcting mistakes in sentances in txt files" @@ -15,24 +15,9 @@ RUN cd module && \ mvn clean && \ mvn install -RUN apt-get update && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get update && \ - apt-get install -y python3.5 && \ - apt-get install -y python3-pip - -ENV LD_LIBRARY_PATH=/usr/local/lib/python3.5/dist-packages/jep/ - +FROM clarinpl/openjdk-jre:8 WORKDIR /home/worker -COPY ./SpacyDocConvert.py . -COPY ./requirements.txt . -COPY ./entrypoint.sh ./entrypoint.sh - -RUN python3 -m pip install --upgrade pip && \ - python3 -m pip install -r requirements.txt - +COPY ./entrypoint.sh ./entrypoint.sh RUN ["chmod", "+x", "./entrypoint.sh"] - -RUN cp /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar ./nlp.worker.speller-1.0-SNAPSHOT.jar +COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar . CMD ["./entrypoint.sh"] \ No newline at end of file diff --git a/SpacyDocConvert.py b/SpacyDocConvert.py deleted file mode 100644 index 5987e4c..0000000 --- a/SpacyDocConvert.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Utility file for using spacyDoc.""" - - -def find_proper_nouns(spacyDoc, sentence): - """Function returns indices of words that are proper nouns. - - :param spacyDoc: SpacyDoc with ner tags. - :param sentence: Text spacyDoc is made from. - :return:list with proper nouns indices - """ - idx = 0 - proper_nouns_list = [] - for tok in spacyDoc: - idx = sentence.find(tok.text, idx) - if tok.ent_type_ != '': - proper_nouns_list.append((idx, idx + len(tok.text))) - return proper_nouns_list - - -def find_foreign_sentences(spacyDoc, text): - """Function returns indices of sentences that are not in Polish. - - :param spacyDoc: SpacyDoc with ner tags. - :param text: Text spacyDoc is made from. - :return: list with foreign sentences indices - """ - idx = 0 - foreign_sentences_list = [] - for sent in spacyDoc.sents: - if sent._.language.get('language', 'no') != 'pl': - idx = text.find(sent.text, idx) - foreign_sentences_list.append((idx, idx + len(sent.text))) - return foreign_sentences_list diff --git a/docker-compose.yml b/docker-compose.yml index 06c895f..c28c48b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,6 +9,4 @@ services: working_dir: /home/worker/ entrypoint: - ./entrypoint.sh - environment: - - PYTHONUNBUFFERED=0 restart: always diff --git a/entrypoint.sh b/entrypoint.sh index d864430..cefa313 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,5 +1,3 @@ #!/bin/sh cd /home/worker -echo "Downloading model" -python3 -m spacy download "pl_core_news_lg" java -jar nlp.worker.speller-1.0-SNAPSHOT.jar \ No newline at end of file diff --git a/module/pom.xml b/module/pom.xml index 2522945..62d8f94 100644 --- a/module/pom.xml +++ b/module/pom.xml @@ -30,11 +30,6 @@ <artifactId>json</artifactId> <version>20141113</version> </dependency> - <dependency> - <groupId>black.ninia</groupId> - <artifactId>jep</artifactId> - <version>3.9.0</version> - </dependency> </dependencies> <build> diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java index b14de22..0fdc61a 100644 --- a/module/src/main/java/pl/clarin/speller/SpaCy.java +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -1,73 +1,79 @@ package pl.clarin.speller; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.ArrayList; +import java.util.Enumeration; import java.util.List; -import jep.JepException; -import jep.SharedInterpreter; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; public class SpaCy { - /**Class that handles spacy processing.*/ + /**Class that handles spacy input.*/ boolean loaded = false; - boolean processed = false; - SharedInterpreter interp = null; - ArrayList<List<Long>> properNounArray = null; - ArrayList<List<Long>> foreignSentenceArray = null; - - { - try { - interp = new SharedInterpreter(); - interp.exec("import sys"); - interp.exec("sys.path.append(r'/home/worker')"); - interp.exec("import SpacyDocConvert"); - interp.exec("sys.argv=[]"); - interp.exec("import spacy"); - interp.exec("from spacy_langdetect import LanguageDetector"); - interp.exec("model = 'pl_core_news_lg'"); - interp.exec("nlp = spacy.load(model)"); - interp.exec("nlp.add_pipe(LanguageDetector(), name=\"language_detector\", last=True)"); - loaded = true; - } catch (JepException e) { - System.out.println("An error occurred: " + e.getMessage()); - } - } + ArrayList<ArrayList<Integer>> properNounArray = null; + ArrayList<ArrayList<Integer>> foreignSentenceArray = null; public boolean isLoaded() { return loaded; } - public boolean isProcessed() { - return processed; + /**Loading spaCy input files.*/ + public InputStream load(ZipFile zipFile) throws IOException { + InputStream inputText = null; + for (Enumeration<? extends ZipEntry> entries = zipFile.entries(); entries.hasMoreElements();) { + ZipEntry entry = entries.nextElement(); + if (!entry.isDirectory()) { + if (entry.getName().equals("text.txt")) { + inputText = zipFile.getInputStream(entry); + } else if (entry.getName().equals("proper_nouns.txt")) { + InputStream in = zipFile.getInputStream(entry); + properNounArray = processSpacyFiles(in); + in.close(); + } else if (entry.getName().equals("foreign_sentences.txt")) { + InputStream in = zipFile.getInputStream(entry); + foreignSentenceArray = processSpacyFiles(in); + in.close(); + } else { + System.out.println("Zip from spaCy contains unexpected files!"); + } + } else { + System.out.println("Zip from spaCy contains unexpected directories!"); + } + } + loaded = true; + return inputText; } - /**Javadoc.*/ - public void process(String inputString) throws Exception { - processed = false; - if (loaded) { - try { - interp.exec("sentence = r'" + inputString.replace("'", "\\'") + "'"); - interp.exec("spacyDoc = nlp(sentence)"); - Object properNounList = - interp.getValue("SpacyDocConvert.find_proper_nouns(spacyDoc, sentence)"); - properNounArray = ((ArrayList<List<Long>>) properNounList); - Object foreignSentList = - interp.getValue("SpacyDocConvert.find_foreign_sentences(spacyDoc, sentence)"); - foreignSentenceArray = ((ArrayList<List<Long>>) foreignSentList); - processed = true; - } catch (JepException e) { - System.out.println("An error occurred: " + e.getMessage()); + private static ArrayList<ArrayList<Integer>> processSpacyFiles(InputStream in) { + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + String line; + ArrayList<ArrayList<Integer>> array = new ArrayList<>(); + try { + while ((line = reader.readLine()) != null) { + String[] str = line.split(" "); + ArrayList<Integer> list = new ArrayList<>(); + list.add(Integer.parseInt(str[0])); + list.add(Integer.parseInt(str[1])); + array.add(list); } + } catch (IOException e) { + System.out.println("The text file contains incorrect data." + e.getMessage()); } + return array; } /**Javadoc.*/ public boolean isForeignSentence(String inputString, int matchFrom) { boolean isForeginSent = false; - if (processed) { - for (List<Long> tuple : foreignSentenceArray) { - if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) { + if (loaded) { + for (List<Integer> tuple : foreignSentenceArray) { + if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { return Character.isUpperCase(inputString.charAt(matchFrom)); } - if (matchFrom > tuple.get(1).intValue()) { + if (matchFrom > tuple.get(1)) { return false; } } @@ -78,9 +84,8 @@ public class SpaCy { /**Javadoc.*/ public boolean isProperNoun(String inputString, int matchFrom) { boolean isProperNoun = false; - if (!processed) { + if (!loaded) { if (Character.isUpperCase(inputString.charAt(matchFrom))) { - isProperNoun = false; int i = matchFrom - 1; while (i > 0) { if (Character.isLetterOrDigit(inputString.charAt(i))) { @@ -92,17 +97,16 @@ public class SpaCy { --i; } } - return isProperNoun; } else { - for (List<Long> tuple : properNounArray) { - if (matchFrom >= tuple.get(0).intValue() && matchFrom <= tuple.get(1).intValue()) { + for (List<Integer> tuple : properNounArray) { + if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { return Character.isUpperCase(inputString.charAt(matchFrom)); } - if (matchFrom > tuple.get(1).intValue()) { + if (matchFrom > tuple.get(1)) { return false; } } - return isProperNoun; } + return isProperNoun; } } diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 5b396a7..9a7b2d8 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -11,6 +11,7 @@ import java.io.Writer; import java.nio.charset.StandardCharsets; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.zip.ZipFile; import org.json.JSONObject; import org.languagetool.JLanguageTool; import org.languagetool.language.Polish; @@ -41,17 +42,34 @@ public class Speller extends Worker { } @Override - public void process(String fileIn, String fileOut, JSONObject param) { + public void process(String fileIn, String fileOut, JSONObject options) { + String inputformat = "text"; try { - FileInputStream fstream = new FileInputStream(fileIn); + if (options.has("format")) { + inputformat = options.getString("format"); + } + InputStreamReader reader = null; + ZipFile zipFile = null; + if (inputformat.equals("spacy")) { + try { + zipFile = new ZipFile(fileIn); + reader = new InputStreamReader(textEditor.spacy.load(zipFile), StandardCharsets.UTF_8); + } catch (IOException e) { + System.out.println("Problems reading zip file!" + e.getStackTrace()); + throw e; + } + } else { + FileInputStream fstream = new FileInputStream(fileIn); + reader = new InputStreamReader(fstream, StandardCharsets.UTF_8); + } + Writer out = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(fileOut), StandardCharsets.UTF_8)); StringBuilder sb = new StringBuilder(); - try (BufferedReader br = new BufferedReader( - new InputStreamReader(fstream, StandardCharsets.UTF_8))) { + try (BufferedReader br = new BufferedReader(reader)) { String line = null; while ((line = br.readLine()) != null) { try { @@ -62,6 +80,10 @@ public class Speller extends Worker { .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception); } } + reader.close(); + if (inputformat.equals("spacy")) { + zipFile.close(); + } } try { @@ -72,7 +94,7 @@ public class Speller extends Worker { } catch (IOException exception) { Logger.getLogger(Speller.class.getName()) - .log(Level.SEVERE, "Problems with writing: " + fileOut, exception); + .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception); } } } diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 9239c73..66de742 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -1,6 +1,5 @@ package pl.clarin.speller; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.List; import org.languagetool.JLanguageTool; @@ -11,11 +10,6 @@ public class TextEdit { /**Class that corrects input text.*/ public String edit(String inputString, JLanguageTool langTool) throws Exception { - try { - spacy.process(inputString); - } catch (Exception e) { - System.out.println("Spacy did not process the input correctly! : " + e.getMessage()); - } char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); List<RuleMatch> matches = langTool.check(inputString); diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 819e612..0000000 --- a/tox.ini +++ /dev/null @@ -1,44 +0,0 @@ -[tox] -envlist = pep8,docstyle -skipsdist = True - -[testenv:pep8] -deps = - flake8 -basepython = python3 -commands = - flake8 {posargs} - -[testenv:docstyle] -deps = - pydocstyle -basepython = python3 -commands = - pydocstyle --verbose {posargs} - -[flake8] -# W504 skipped because it is overeager and unnecessary -ignore = W504 -show-source = True -exclude = .git,.venv,.tox,dist,doc,*egg,build,venv -import-order-style = pep8 -max-line-length = 80 - - -[pydocstyle] -# D104 Missing docstring in public package -# D203 1 blank line required before class docstring -# D213 Multi-line docstring summary should start at the second line -# D214 Section is over-indented -# D215 Section underline is over-indented -# D401 First line should be in imperative mood; try rephrasing -# D405 Section name should be properly capitalized -# D406 Section name should end with a newline -# D407 Missing dashed underline after section -# D408 Section underline should be in the line following the section’s name -# D409 Section underline should match the length of its name -# D410 Missing blank line after section -# D411 Missing blank line before section -ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411 -match-dir = ^(?!\.tox|venv).* -match = ^(?!setup).*\.py \ No newline at end of file -- GitLab From 9bbe45f9f8619e584140063e192349c30b50425c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Koptyra?= <rikain@e-science.pl> Date: Wed, 5 Aug 2020 11:47:14 +0200 Subject: [PATCH 33/35] Added unloading spacy. Changed temporary JavaDocs. --- module/src/main/java/pl/clarin/speller/SpaCy.java | 12 ++++++++++-- module/src/main/java/pl/clarin/speller/Speller.java | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java index 0fdc61a..b4b92cd 100644 --- a/module/src/main/java/pl/clarin/speller/SpaCy.java +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -16,10 +16,18 @@ public class SpaCy { ArrayList<ArrayList<Integer>> properNounArray = null; ArrayList<ArrayList<Integer>> foreignSentenceArray = null; + /**Checks if spaCy correctly loaded input.*/ public boolean isLoaded() { return loaded; } + /**Unloading input from file.*/ + public void unload() { + properNounArray = null; + foreignSentenceArray = null; + loaded = false; + } + /**Loading spaCy input files.*/ public InputStream load(ZipFile zipFile) throws IOException { InputStream inputText = null; @@ -65,7 +73,7 @@ public class SpaCy { return array; } - /**Javadoc.*/ + /**Checks if input sentence is from a different language.*/ public boolean isForeignSentence(String inputString, int matchFrom) { boolean isForeginSent = false; if (loaded) { @@ -81,7 +89,7 @@ public class SpaCy { return isForeginSent; } - /**Javadoc.*/ + /**Checks if input sentence is from a proper noun.*/ public boolean isProperNoun(String inputString, int matchFrom) { boolean isProperNoun = false; if (!loaded) { diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 9a7b2d8..39d0e54 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -83,6 +83,7 @@ public class Speller extends Worker { reader.close(); if (inputformat.equals("spacy")) { zipFile.close(); + textEditor.spacy.unload(); } } @@ -96,5 +97,8 @@ public class Speller extends Worker { Logger.getLogger(Speller.class.getName()) .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception); } + if (inputformat.equals("spacy")) { + textEditor.spacy.unload(); + } } } -- GitLab From d365a5a7148fe38c575bf780e2cd9da778a7304c Mon Sep 17 00:00:00 2001 From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com> Date: Wed, 26 Aug 2020 13:35:25 +0200 Subject: [PATCH 34/35] Added procettors to some variables maybe it will fiz the internal error (no idea). --- .gitlab-ci.yml | 48 +-- Dockerfile | 44 +- config.ini | 22 +- docker-compose.yml | 24 +- .../main/java/pl/clarin/speller/SpaCy.java | 240 +++++------ .../main/java/pl/clarin/speller/Speller.java | 204 +++++----- .../main/java/pl/clarin/speller/TextEdit.java | 384 +++++++++--------- requirements.txt | 4 +- 8 files changed, 483 insertions(+), 487 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 71b6d71..1b993af 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,24 +1,24 @@ -image: clarinpl/openjdk:8 -stages: - - check_style - - build -google_checks: - stage: check_style - script: - - cd module - - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true -build_image: - stage: build - image: 'docker:18.09.7' - only: - - master - services: - - 'docker:18.09.7-dind' - before_script: - - '' - script: - - docker build -t clarinpl/speller . - - echo $DOCKER_PASSWORD > pass.txt - - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin - - rm pass.txt - - docker push clarinpl/speller +image: clarinpl/openjdk:8 +stages: + - check_style + - build +google_checks: + stage: check_style + script: + - cd module + - mvn checkstyle:check -Dcheckstyle.config.location=google_checks.xml -Dcheckstyle.violationSeverity=warning -Dcheckstyle.checkstyle.consoleOutput=true +build_image: + stage: build + image: 'docker:18.09.7' + only: + - master + services: + - 'docker:18.09.7-dind' + before_script: + - '' + script: + - docker build -t clarinpl/speller . + - echo $DOCKER_PASSWORD > pass.txt + - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin + - rm pass.txt + - docker push clarinpl/speller diff --git a/Dockerfile b/Dockerfile index ed8e888..ee7dcfe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,23 @@ -FROM clarinpl/openjdk:8 as builder - -LABEL application="Speller" -LABEL description="Client - Workers - correcting mistakes in sentances in txt files" -LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology" - -WORKDIR /home/install -RUN cd nlp.worker && \ - mvn clean && \ - mvn install - -WORKDIR /home/install -COPY ./module ./module -RUN cd module && \ - mvn clean && \ - mvn install - -FROM clarinpl/openjdk-jre:8 -WORKDIR /home/worker -COPY ./entrypoint.sh ./entrypoint.sh -RUN ["chmod", "+x", "./entrypoint.sh"] -COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar . +FROM clarinpl/openjdk:8 as builder + +LABEL application="Speller" +LABEL description="Client - Workers - correcting mistakes in sentances in txt files" +LABEL organiztation="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology" + +WORKDIR /home/install +RUN cd nlp.worker && \ + mvn clean && \ + mvn install + +WORKDIR /home/install +COPY ./module ./module +RUN cd module && \ + mvn clean && \ + mvn install + +FROM clarinpl/openjdk-jre:8 +WORKDIR /home/worker +COPY ./entrypoint.sh ./entrypoint.sh +RUN ["chmod", "+x", "./entrypoint.sh"] +COPY --from=builder /home/install/module/target/nlp.worker.speller-1.0-SNAPSHOT.jar . CMD ["./entrypoint.sh"] \ No newline at end of file diff --git a/config.ini b/config.ini index 856bea7..45ad6bc 100644 --- a/config.ini +++ b/config.ini @@ -1,11 +1,11 @@ -[service] -tool = speller - -root = /samba/requests/ -rabbit_host = rabbitmq -rabbit_user = test -rabbit_password = test -queue_prefix = nlp_ - -[tool] -workers_number = 1 +[service] +tool = speller + +root = /samba/requests/ +rabbit_host = rabbitmq +rabbit_user = test +rabbit_password = test +queue_prefix = nlp_ + +[tool] +workers_number = 1 diff --git a/docker-compose.yml b/docker-compose.yml index c28c48b..7a93952 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,12 +1,12 @@ -version: '3' -services: - speller: - container_name: clarin_speller - build: ./ - volumes: - - '/samba:/samba' - - './config.ini:/home/worker/config.ini' - working_dir: /home/worker/ - entrypoint: - - ./entrypoint.sh - restart: always +version: '3' +services: + speller: + container_name: clarin_speller + build: ./ + volumes: + - '/samba:/samba' + - './config.ini:/home/worker/config.ini' + working_dir: /home/worker/ + entrypoint: + - ./entrypoint.sh + restart: always diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java index b4b92cd..82fa6c6 100644 --- a/module/src/main/java/pl/clarin/speller/SpaCy.java +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -1,120 +1,120 @@ -package pl.clarin.speller; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Enumeration; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipFile; - -public class SpaCy { - /**Class that handles spacy input.*/ - boolean loaded = false; - ArrayList<ArrayList<Integer>> properNounArray = null; - ArrayList<ArrayList<Integer>> foreignSentenceArray = null; - - /**Checks if spaCy correctly loaded input.*/ - public boolean isLoaded() { - return loaded; - } - - /**Unloading input from file.*/ - public void unload() { - properNounArray = null; - foreignSentenceArray = null; - loaded = false; - } - - /**Loading spaCy input files.*/ - public InputStream load(ZipFile zipFile) throws IOException { - InputStream inputText = null; - for (Enumeration<? extends ZipEntry> entries = zipFile.entries(); entries.hasMoreElements();) { - ZipEntry entry = entries.nextElement(); - if (!entry.isDirectory()) { - if (entry.getName().equals("text.txt")) { - inputText = zipFile.getInputStream(entry); - } else if (entry.getName().equals("proper_nouns.txt")) { - InputStream in = zipFile.getInputStream(entry); - properNounArray = processSpacyFiles(in); - in.close(); - } else if (entry.getName().equals("foreign_sentences.txt")) { - InputStream in = zipFile.getInputStream(entry); - foreignSentenceArray = processSpacyFiles(in); - in.close(); - } else { - System.out.println("Zip from spaCy contains unexpected files!"); - } - } else { - System.out.println("Zip from spaCy contains unexpected directories!"); - } - } - loaded = true; - return inputText; - } - - private static ArrayList<ArrayList<Integer>> processSpacyFiles(InputStream in) { - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - String line; - ArrayList<ArrayList<Integer>> array = new ArrayList<>(); - try { - while ((line = reader.readLine()) != null) { - String[] str = line.split(" "); - ArrayList<Integer> list = new ArrayList<>(); - list.add(Integer.parseInt(str[0])); - list.add(Integer.parseInt(str[1])); - array.add(list); - } - } catch (IOException e) { - System.out.println("The text file contains incorrect data." + e.getMessage()); - } - return array; - } - - /**Checks if input sentence is from a different language.*/ - public boolean isForeignSentence(String inputString, int matchFrom) { - boolean isForeginSent = false; - if (loaded) { - for (List<Integer> tuple : foreignSentenceArray) { - if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { - return Character.isUpperCase(inputString.charAt(matchFrom)); - } - if (matchFrom > tuple.get(1)) { - return false; - } - } - } - return isForeginSent; - } - - /**Checks if input sentence is from a proper noun.*/ - public boolean isProperNoun(String inputString, int matchFrom) { - boolean isProperNoun = false; - if (!loaded) { - if (Character.isUpperCase(inputString.charAt(matchFrom))) { - int i = matchFrom - 1; - while (i > 0) { - if (Character.isLetterOrDigit(inputString.charAt(i))) { - isProperNoun = true; - break; - } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { - break; - } - --i; - } - } - } else { - for (List<Integer> tuple : properNounArray) { - if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { - return Character.isUpperCase(inputString.charAt(matchFrom)); - } - if (matchFrom > tuple.get(1)) { - return false; - } - } - } - return isProperNoun; - } -} +package pl.clarin.speller; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +public class SpaCy { + /**Class that handles spacy input.*/ + private boolean loaded = false; + private ArrayList<ArrayList<Integer>> properNounArray = null; + private ArrayList<ArrayList<Integer>> foreignSentenceArray = null; + + /**Checks if spaCy correctly loaded input.*/ + public boolean isLoaded() { + return loaded; + } + + /**Unloading input from file.*/ + public void unload() { + properNounArray = null; + foreignSentenceArray = null; + loaded = false; + } + + /**Loading spaCy input files.*/ + public InputStream load(ZipFile zipFile) throws IOException { + InputStream inputText = null; + for (Enumeration<? extends ZipEntry> entries = zipFile.entries(); entries.hasMoreElements();) { + ZipEntry entry = entries.nextElement(); + if (!entry.isDirectory()) { + if (entry.getName().equals("text.txt")) { + inputText = zipFile.getInputStream(entry); + } else if (entry.getName().equals("proper_nouns.txt")) { + InputStream in = zipFile.getInputStream(entry); + properNounArray = processSpacyFiles(in); + in.close(); + } else if (entry.getName().equals("foreign_sentences.txt")) { + InputStream in = zipFile.getInputStream(entry); + foreignSentenceArray = processSpacyFiles(in); + in.close(); + } else { + System.out.println("Zip from spaCy contains unexpected files!"); + } + } else { + System.out.println("Zip from spaCy contains unexpected directories!"); + } + } + loaded = true; + return inputText; + } + + private ArrayList<ArrayList<Integer>> processSpacyFiles(InputStream in) { + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + String line; + ArrayList<ArrayList<Integer>> array = new ArrayList<>(); + try { + while ((line = reader.readLine()) != null) { + String[] str = line.split(" "); + ArrayList<Integer> list = new ArrayList<>(); + list.add(Integer.parseInt(str[0])); + list.add(Integer.parseInt(str[1])); + array.add(list); + } + } catch (IOException e) { + System.out.println("The text file contains incorrect data." + e.getMessage()); + } + return array; + } + + /**Checks if input sentence is from a different language.*/ + public boolean isForeignSentence(String inputString, int matchFrom) { + boolean isForeginSent = false; + if (loaded) { + for (List<Integer> tuple : foreignSentenceArray) { + if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { + return Character.isUpperCase(inputString.charAt(matchFrom)); + } + if (matchFrom > tuple.get(1)) { + return false; + } + } + } + return isForeginSent; + } + + /**Checks if input sentence is from a proper noun.*/ + public boolean isProperNoun(String inputString, int matchFrom) { + boolean isProperNoun = false; + if (!loaded) { + if (Character.isUpperCase(inputString.charAt(matchFrom))) { + int i = matchFrom - 1; + while (i > 0) { + if (Character.isLetterOrDigit(inputString.charAt(i))) { + isProperNoun = true; + break; + } else if (".!?\n\t".indexOf(inputString.charAt(i)) != -1) { + break; + } + --i; + } + } + } else { + for (List<Integer> tuple : properNounArray) { + if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { + return Character.isUpperCase(inputString.charAt(matchFrom)); + } + if (matchFrom > tuple.get(1)) { + return false; + } + } + } + return isProperNoun; + } +} diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 39d0e54..2e01af8 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -1,104 +1,100 @@ -package pl.clarin.speller; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.charset.StandardCharsets; -import java.util.logging.Level; -import java.util.logging.Logger; -import java.util.zip.ZipFile; -import org.json.JSONObject; -import org.languagetool.JLanguageTool; -import org.languagetool.language.Polish; -import pl.clarin.ws.worker.IniFile; -import pl.clarin.ws.worker.Service; -import pl.clarin.ws.worker.Worker; - -public class Speller extends Worker { - static Polish lang = null; - JLanguageTool langTool = null; - TextEdit textEditor = null; - - public static void main(String[] args) { - new Service<>(Speller.class); - } - - // init object for each thread - @Override - public void init() throws Exception { - langTool = new JLanguageTool(lang); - textEditor = new TextEdit(); - } - - // init objects shared by threads - @Override - public void static_init(IniFile init) throws Exception { - lang = new Polish(); - } - - @Override - public void process(String fileIn, String fileOut, JSONObject options) { - String inputformat = "text"; - try { - if (options.has("format")) { - inputformat = options.getString("format"); - } - InputStreamReader reader = null; - ZipFile zipFile = null; - if (inputformat.equals("spacy")) { - try { - zipFile = new ZipFile(fileIn); - reader = new InputStreamReader(textEditor.spacy.load(zipFile), StandardCharsets.UTF_8); - } catch (IOException e) { - System.out.println("Problems reading zip file!" + e.getStackTrace()); - throw e; - } - } else { - FileInputStream fstream = new FileInputStream(fileIn); - reader = new InputStreamReader(fstream, StandardCharsets.UTF_8); - } - - - Writer out = new BufferedWriter(new OutputStreamWriter( - new FileOutputStream(fileOut), StandardCharsets.UTF_8)); - - StringBuilder sb = new StringBuilder(); - - try (BufferedReader br = new BufferedReader(reader)) { - String line = null; - while ((line = br.readLine()) != null) { - try { - String correctedLine = textEditor.edit(line, langTool); - sb.append(correctedLine).append('\n'); - } catch (Exception exception) { - Logger.getLogger(Speller.class.getName()) - .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception); - } - } - reader.close(); - if (inputformat.equals("spacy")) { - zipFile.close(); - textEditor.spacy.unload(); - } - } - - try { - out.write(sb.toString()); - } finally { - out.close(); - } - - } catch (IOException exception) { - Logger.getLogger(Speller.class.getName()) - .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception); - } - if (inputformat.equals("spacy")) { - textEditor.spacy.unload(); - } - } -} +package pl.clarin.speller; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.zip.ZipFile; +import org.json.JSONObject; +import org.languagetool.JLanguageTool; +import org.languagetool.language.Polish; +import pl.clarin.ws.worker.IniFile; +import pl.clarin.ws.worker.Service; +import pl.clarin.ws.worker.Worker; + +public class Speller extends Worker { + static Polish lang = null; + private JLanguageTool langTool = null; + private TextEdit textEditor = null; + + public static void main(String[] args) { + new Service<>(Speller.class); + } + + // init object for each thread + @Override + public void init() throws Exception { + langTool = new JLanguageTool(lang); + textEditor = new TextEdit(); + } + + // init objects shared by threads + @Override + public void static_init(IniFile init) throws Exception { + lang = new Polish(); + } + + @Override + public void process(String fileIn, String fileOut, JSONObject options) { + String inputformat = "text"; + try { + if (options.has("format")) { + inputformat = options.getString("format"); + } + InputStreamReader reader = null; + ZipFile zipFile = null; + if (inputformat.equals("spacy")) { + try { + zipFile = new ZipFile(fileIn); + reader = new InputStreamReader(textEditor.spacy.load(zipFile), StandardCharsets.UTF_8); + } catch (IOException e) { + System.out.println("Problems reading zip file!" + e.getStackTrace()); + throw e; + } + } else { + FileInputStream fstream = new FileInputStream(fileIn); + reader = new InputStreamReader(fstream, StandardCharsets.UTF_8); + } + + + Writer out = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(fileOut), StandardCharsets.UTF_8)); + + StringBuilder sb = new StringBuilder(); + + try (BufferedReader br = new BufferedReader(reader)) { + String line = null; + while ((line = br.readLine()) != null) { + try { + String correctedLine = textEditor.edit(line, langTool); + sb.append(correctedLine).append('\n'); + } catch (Exception exception) { + Logger.getLogger(Speller.class.getName()) + .log(Level.SEVERE, "Problems with TextEdit class: " + fileOut, exception); + } + } + reader.close(); + } + + try { + out.write(sb.toString()); + } finally { + out.close(); + } + + } catch (IOException exception) { + Logger.getLogger(Speller.class.getName()) + .log(Level.SEVERE, "Problems with reading or writing: " + fileOut, exception); + } + if (inputformat.equals("spacy")) { + textEditor.spacy.unload(); + } + } +} diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 66de742..45847f0 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -1,192 +1,192 @@ -package pl.clarin.speller; - -import java.nio.charset.StandardCharsets; -import java.util.List; -import org.languagetool.JLanguageTool; -import org.languagetool.rules.RuleMatch; - -public class TextEdit { - SpaCy spacy = new SpaCy(); - /**Class that corrects input text.*/ - - public String edit(String inputString, JLanguageTool langTool) throws Exception { - char[] buffer = inputString.toCharArray(); - StringBuilder sb = new StringBuilder(); - List<RuleMatch> matches = langTool.check(inputString); - int stringIndex = 0; - for (RuleMatch match : matches) { - int matchFrom = match.getFromPos(); - int matchTo = match.getToPos(); - - if (matchFrom > stringIndex) { - sb.append(buffer, stringIndex, (matchFrom - stringIndex)); - } else if (matchFrom < stringIndex) { - throw new Exception("RuleMatches are not sorted for some reason."); - } - String matchingWord; - if (match.getSuggestedReplacements().isEmpty()) { - matchingWord = inputString.substring(matchFrom, matchTo); - } else { - boolean change = toChange(inputString,matchFrom,matchTo); - if (change) { - matchingWord = match.getSuggestedReplacements().get(0); - } else { - matchingWord = inputString.substring(matchFrom, matchTo); - } - } - sb.append(matchingWord.toCharArray(), 0, (matchingWord.length())); - stringIndex = matchTo; - } - sb.append(buffer, stringIndex, (buffer.length - stringIndex)); - return sb.toString(); - } - - private boolean toChange(String inputString, int matchFrom, int matchTo) { - return !isForeignSentence(inputString, matchFrom) - && !isProperNoun(inputString, matchFrom) - && !isAcronym(inputString, matchFrom, matchTo) - && !isFileOrExtension(inputString, matchFrom, matchTo) - && !checkFirstLetter(inputString, matchFrom) - && !isNotPolish(inputString, matchFrom, matchTo) - && !isSurname(inputString, matchFrom, matchTo); - } - - private boolean isSurname(String inputString, int matchFrom, int matchTo) { - boolean isSurname = false; - int i = matchFrom; - if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i)) - && Character.isWhitespace(inputString.charAt(i - 1)) - && (inputString.charAt(i - 2)) == '.' - && Character.isUpperCase(inputString.charAt(i - 3))) { - i = i - 3; - while (i > 0 && Character.isUpperCase(inputString.charAt(i))) { - --i; - } - if (i == 0 || Character.isWhitespace(inputString.charAt(i))) { - i = matchFrom + 1; - isSurname = true; - while (i < matchTo) { - if (!Character.isLowerCase(inputString.charAt(i)) - && !(i == (matchTo - 1) && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)) { - isSurname = false; - } - ++i; - } - } - } - return isSurname; - } - - private boolean isForeignSentence(String inputString, int matchFrom) { - return spacy.isForeignSentence(inputString, matchFrom); - } - - private boolean isProperNoun(String inputString, int matchFrom) { - return spacy.isProperNoun(inputString, matchFrom); - } - - private boolean isAcronym(String inputString, int matchFrom, int matchTo) { - double breakPoint = 0.6; - boolean isAcronym = false; - int lowerCaseLetters = 0; - if (Character.isUpperCase(inputString.charAt(matchFrom)) - || Character.isDigit(inputString.charAt(matchFrom))) { - int i = matchFrom + 1; - while (i < matchTo) { - if (Character.isLowerCase(inputString.charAt(i))) { - ++lowerCaseLetters; - } - ++i; - } - if (((matchTo - matchFrom) * breakPoint) >= lowerCaseLetters) { - isAcronym = true; - } - } - return isAcronym; - } - - private boolean isNotPolish(String inputString, int matchFrom, int matchTo) { - String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ"; - boolean isNotPolish = false; - for (int i = matchFrom; i < matchTo; ++i) { - if (!isAscii(inputString.charAt(i))) { - if (polishLetterSet.indexOf(inputString.charAt(i)) == -1) { - isNotPolish = true; - break; - } - } - } - return isNotPolish; - } - - public boolean isAscii(Character v) { - return StandardCharsets.US_ASCII.newEncoder().canEncode(v); - } - - private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) { - String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG" - + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()"; - int extensionLength = 4; - boolean isFileOrExtension = false; - boolean dot = false; - int lastIndex = lastIndex(inputString,matchFrom,matchTo); - int firstIndex = firstIndex(inputString,matchFrom); - int i = firstIndex + 1; - while (i < lastIndex) { - char currentChar = inputString.charAt(i); - if (characterList.indexOf(currentChar) == -1) { - if (currentChar == '.') { - dot = true; - ++i; - int j = extensionLength; - while (i < lastIndex) { - currentChar = inputString.charAt(i); - if (characterList.indexOf(currentChar) == -1) { - if (currentChar == '.') { - j = extensionLength; - } else if (!(i == lastIndex - 1 && ".?!,".indexOf(currentChar) != -1)) { - break; - } - } - --j; - if (j < 0) { - break; - } - ++i; - } - } - break; - } - ++i; - } - if (dot && i == lastIndex) { - isFileOrExtension = true; - } - return isFileOrExtension; - } - - private int lastIndex(String inputString, int matchFrom, int matchTo) { - int i = matchFrom; - while (i < inputString.length() - && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) { - ++i; - } - return i; - } - - private int firstIndex(String inputString, int matchFrom) { - int i = matchFrom - 1; - if (i >= 0 && inputString.charAt(i) == '.') { - while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) { - --i; - } - } - return i; - } - - private boolean checkFirstLetter(String inputString, int at) { - String leaveCharacters = "§"; - return leaveCharacters.indexOf(inputString.charAt(at)) != -1; - } - -} +package pl.clarin.speller; + +import java.nio.charset.StandardCharsets; +import java.util.List; +import org.languagetool.JLanguageTool; +import org.languagetool.rules.RuleMatch; + +public class TextEdit { + public SpaCy spacy = new SpaCy(); + /**Class that corrects input text.*/ + + public String edit(String inputString, JLanguageTool langTool) throws Exception { + char[] buffer = inputString.toCharArray(); + StringBuilder sb = new StringBuilder(); + List<RuleMatch> matches = langTool.check(inputString); + int stringIndex = 0; + for (RuleMatch match : matches) { + int matchFrom = match.getFromPos(); + int matchTo = match.getToPos(); + + if (matchFrom > stringIndex) { + sb.append(buffer, stringIndex, (matchFrom - stringIndex)); + } else if (matchFrom < stringIndex) { + throw new Exception("RuleMatches are not sorted for some reason."); + } + String matchingWord; + if (match.getSuggestedReplacements().isEmpty()) { + matchingWord = inputString.substring(matchFrom, matchTo); + } else { + boolean change = toChange(inputString,matchFrom,matchTo); + if (change) { + matchingWord = match.getSuggestedReplacements().get(0); + } else { + matchingWord = inputString.substring(matchFrom, matchTo); + } + } + sb.append(matchingWord.toCharArray(), 0, (matchingWord.length())); + stringIndex = matchTo; + } + sb.append(buffer, stringIndex, (buffer.length - stringIndex)); + return sb.toString(); + } + + private boolean toChange(String inputString, int matchFrom, int matchTo) { + return !isForeignSentence(inputString, matchFrom) + && !isProperNoun(inputString, matchFrom) + && !isAcronym(inputString, matchFrom, matchTo) + && !isFileOrExtension(inputString, matchFrom, matchTo) + && !checkFirstLetter(inputString, matchFrom) + && !isNotPolish(inputString, matchFrom, matchTo) + && !isSurname(inputString, matchFrom, matchTo); + } + + private boolean isSurname(String inputString, int matchFrom, int matchTo) { + boolean isSurname = false; + int i = matchFrom; + if ((i - 4) > 0 && Character.isUpperCase(inputString.charAt(i)) + && Character.isWhitespace(inputString.charAt(i - 1)) + && (inputString.charAt(i - 2)) == '.' + && Character.isUpperCase(inputString.charAt(i - 3))) { + i = i - 3; + while (i > 0 && Character.isUpperCase(inputString.charAt(i))) { + --i; + } + if (i == 0 || Character.isWhitespace(inputString.charAt(i))) { + i = matchFrom + 1; + isSurname = true; + while (i < matchTo) { + if (!Character.isLowerCase(inputString.charAt(i)) + && !(i == (matchTo - 1) && ".!?,\n\t".indexOf(inputString.charAt(i)) == -1)) { + isSurname = false; + } + ++i; + } + } + } + return isSurname; + } + + private boolean isForeignSentence(String inputString, int matchFrom) { + return spacy.isForeignSentence(inputString, matchFrom); + } + + private boolean isProperNoun(String inputString, int matchFrom) { + return spacy.isProperNoun(inputString, matchFrom); + } + + private boolean isAcronym(String inputString, int matchFrom, int matchTo) { + double breakPoint = 0.6; + boolean isAcronym = false; + int lowerCaseLetters = 0; + if (Character.isUpperCase(inputString.charAt(matchFrom)) + || Character.isDigit(inputString.charAt(matchFrom))) { + int i = matchFrom + 1; + while (i < matchTo) { + if (Character.isLowerCase(inputString.charAt(i))) { + ++lowerCaseLetters; + } + ++i; + } + if (((matchTo - matchFrom) * breakPoint) >= lowerCaseLetters) { + isAcronym = true; + } + } + return isAcronym; + } + + private boolean isNotPolish(String inputString, int matchFrom, int matchTo) { + String polishLetterSet = "ąĄćĆęĘłŁńŃóÓśŚźŹżŻ"; + boolean isNotPolish = false; + for (int i = matchFrom; i < matchTo; ++i) { + if (!isAscii(inputString.charAt(i))) { + if (polishLetterSet.indexOf(inputString.charAt(i)) == -1) { + isNotPolish = true; + break; + } + } + } + return isNotPolish; + } + + public boolean isAscii(Character v) { + return StandardCharsets.US_ASCII.newEncoder().canEncode(v); + } + + private boolean isFileOrExtension(String inputString, int matchFrom, int matchTo) { + String characterList = "abcdefghijklmnopqrstuvwxyzABCDEFG" + + "HIJKLMNOPQRSTUVWXYZ0123456789_-^=+~()"; + int extensionLength = 4; + boolean isFileOrExtension = false; + boolean dot = false; + int lastIndex = lastIndex(inputString,matchFrom,matchTo); + int firstIndex = firstIndex(inputString,matchFrom); + int i = firstIndex + 1; + while (i < lastIndex) { + char currentChar = inputString.charAt(i); + if (characterList.indexOf(currentChar) == -1) { + if (currentChar == '.') { + dot = true; + ++i; + int j = extensionLength; + while (i < lastIndex) { + currentChar = inputString.charAt(i); + if (characterList.indexOf(currentChar) == -1) { + if (currentChar == '.') { + j = extensionLength; + } else if (!(i == lastIndex - 1 && ".?!,".indexOf(currentChar) != -1)) { + break; + } + } + --j; + if (j < 0) { + break; + } + ++i; + } + } + break; + } + ++i; + } + if (dot && i == lastIndex) { + isFileOrExtension = true; + } + return isFileOrExtension; + } + + private int lastIndex(String inputString, int matchFrom, int matchTo) { + int i = matchFrom; + while (i < inputString.length() + && (!Character.isWhitespace(inputString.charAt(i)) || i < matchTo)) { + ++i; + } + return i; + } + + private int firstIndex(String inputString, int matchFrom) { + int i = matchFrom - 1; + if (i >= 0 && inputString.charAt(i) == '.') { + while (i >= 0 && !Character.isWhitespace(inputString.charAt(i))) { + --i; + } + } + return i; + } + + private boolean checkFirstLetter(String inputString, int at) { + String leaveCharacters = "§"; + return leaveCharacters.indexOf(inputString.charAt(at)) != -1; + } + +} diff --git a/requirements.txt b/requirements.txt index 1c9e461..2136c37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -jep -spacy +jep +spacy spacy-langdetect \ No newline at end of file -- GitLab From 1cf03999de57021e978e08ef85b32f3f30e202d6 Mon Sep 17 00:00:00 2001 From: Bartlomiej Koptyra <bartlomiej.koptyra@gmail.com> Date: Mon, 31 Aug 2020 13:36:48 +0200 Subject: [PATCH 35/35] Spacy method is now handled correctly. --- config.ini | 2 +- entrypoint.sh | 2 +- .../main/java/pl/clarin/speller/SpaCy.java | 26 ++++++++++--------- .../main/java/pl/clarin/speller/Speller.java | 5 ++-- .../main/java/pl/clarin/speller/TextEdit.java | 18 ++++++------- 5 files changed, 28 insertions(+), 25 deletions(-) diff --git a/config.ini b/config.ini index 45ad6bc..b649cb3 100644 --- a/config.ini +++ b/config.ini @@ -8,4 +8,4 @@ rabbit_password = test queue_prefix = nlp_ [tool] -workers_number = 1 +workers_number = 12 diff --git a/entrypoint.sh b/entrypoint.sh index cefa313..a25740c 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,3 +1,3 @@ #!/bin/sh cd /home/worker -java -jar nlp.worker.speller-1.0-SNAPSHOT.jar \ No newline at end of file +java -jar nlp.worker.speller-1.0-SNAPSHOT.jar diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java index 82fa6c6..4dfbc50 100644 --- a/module/src/main/java/pl/clarin/speller/SpaCy.java +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -61,11 +61,13 @@ public class SpaCy { ArrayList<ArrayList<Integer>> array = new ArrayList<>(); try { while ((line = reader.readLine()) != null) { - String[] str = line.split(" "); - ArrayList<Integer> list = new ArrayList<>(); - list.add(Integer.parseInt(str[0])); - list.add(Integer.parseInt(str[1])); - array.add(list); + if (!line.equals("")) { + String[] str = line.split(" "); + ArrayList<Integer> list = new ArrayList<>(); + list.add(Integer.parseInt(str[0])); + list.add(Integer.parseInt(str[1])); + array.add(list); + } } } catch (IOException e) { System.out.println("The text file contains incorrect data." + e.getMessage()); @@ -74,14 +76,14 @@ public class SpaCy { } /**Checks if input sentence is from a different language.*/ - public boolean isForeignSentence(String inputString, int matchFrom) { + public boolean isForeignSentence(String inputString, int idx, int matchFrom) { boolean isForeginSent = false; if (loaded) { for (List<Integer> tuple : foreignSentenceArray) { - if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { - return Character.isUpperCase(inputString.charAt(matchFrom)); + if (matchFrom >= (tuple.get(0) - idx) && matchFrom < (tuple.get(1) - idx)) { + return true; } - if (matchFrom > tuple.get(1)) { + if ((tuple.get(1) - idx) > matchFrom) { return false; } } @@ -90,7 +92,7 @@ public class SpaCy { } /**Checks if input sentence is from a proper noun.*/ - public boolean isProperNoun(String inputString, int matchFrom) { + public boolean isProperNoun(String inputString, int idx, int matchFrom) { boolean isProperNoun = false; if (!loaded) { if (Character.isUpperCase(inputString.charAt(matchFrom))) { @@ -107,10 +109,10 @@ public class SpaCy { } } else { for (List<Integer> tuple : properNounArray) { - if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { + if (matchFrom >= (tuple.get(0) - idx) && matchFrom < (tuple.get(1) - idx)) { return Character.isUpperCase(inputString.charAt(matchFrom)); } - if (matchFrom > tuple.get(1)) { + if ((tuple.get(1) - idx) > matchFrom) { return false; } } diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 2e01af8..c26afdb 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -63,7 +63,6 @@ public class Speller extends Worker { reader = new InputStreamReader(fstream, StandardCharsets.UTF_8); } - Writer out = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(fileOut), StandardCharsets.UTF_8)); @@ -71,9 +70,11 @@ public class Speller extends Worker { try (BufferedReader br = new BufferedReader(reader)) { String line = null; + int idx = 0; while ((line = br.readLine()) != null) { try { - String correctedLine = textEditor.edit(line, langTool); + String correctedLine = textEditor.edit(line, idx, langTool); + idx = idx + line.length() + "\n".length(); sb.append(correctedLine).append('\n'); } catch (Exception exception) { Logger.getLogger(Speller.class.getName()) diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 45847f0..fb39132 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -9,7 +9,7 @@ public class TextEdit { public SpaCy spacy = new SpaCy(); /**Class that corrects input text.*/ - public String edit(String inputString, JLanguageTool langTool) throws Exception { + public String edit(String inputString, int idx, JLanguageTool langTool) throws Exception { char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); List<RuleMatch> matches = langTool.check(inputString); @@ -27,7 +27,7 @@ public class TextEdit { if (match.getSuggestedReplacements().isEmpty()) { matchingWord = inputString.substring(matchFrom, matchTo); } else { - boolean change = toChange(inputString,matchFrom,matchTo); + boolean change = toChange(inputString, idx, matchFrom, matchTo); if (change) { matchingWord = match.getSuggestedReplacements().get(0); } else { @@ -41,9 +41,9 @@ public class TextEdit { return sb.toString(); } - private boolean toChange(String inputString, int matchFrom, int matchTo) { - return !isForeignSentence(inputString, matchFrom) - && !isProperNoun(inputString, matchFrom) + private boolean toChange(String inputString, int idx, int matchFrom, int matchTo) { + return !isForeignSentence(inputString, idx, matchFrom) + && !isProperNoun(inputString, idx, matchFrom) && !isAcronym(inputString, matchFrom, matchTo) && !isFileOrExtension(inputString, matchFrom, matchTo) && !checkFirstLetter(inputString, matchFrom) @@ -77,12 +77,12 @@ public class TextEdit { return isSurname; } - private boolean isForeignSentence(String inputString, int matchFrom) { - return spacy.isForeignSentence(inputString, matchFrom); + private boolean isForeignSentence(String inputString, int idx, int matchFrom) { + return spacy.isForeignSentence(inputString, idx, matchFrom); } - private boolean isProperNoun(String inputString, int matchFrom) { - return spacy.isProperNoun(inputString, matchFrom); + private boolean isProperNoun(String inputString, int idx, int matchFrom) { + return spacy.isProperNoun(inputString, idx, matchFrom); } private boolean isAcronym(String inputString, int matchFrom, int matchTo) { -- GitLab