diff --git a/config.ini b/config.ini index 45ad6bce768d4b23d037b8594b3f8b0c290f05ea..b649cb37fd392fcaaf61c54b33229eaacdf04d45 100644 --- a/config.ini +++ b/config.ini @@ -8,4 +8,4 @@ rabbit_password = test queue_prefix = nlp_ [tool] -workers_number = 1 +workers_number = 12 diff --git a/entrypoint.sh b/entrypoint.sh index cefa313c5facb789eff87ef2d3ff4cbddcd3a25e..a25740c8101ee284b5f5b9ac1481967f84cda6b4 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -1,3 +1,3 @@ #!/bin/sh cd /home/worker -java -jar nlp.worker.speller-1.0-SNAPSHOT.jar \ No newline at end of file +java -jar nlp.worker.speller-1.0-SNAPSHOT.jar diff --git a/module/src/main/java/pl/clarin/speller/SpaCy.java b/module/src/main/java/pl/clarin/speller/SpaCy.java index 82fa6c6ccbd3e72b429f0f64a3b7ed2843fed4b0..4dfbc50ef0349354ad5f9180583207ebb484529d 100644 --- a/module/src/main/java/pl/clarin/speller/SpaCy.java +++ b/module/src/main/java/pl/clarin/speller/SpaCy.java @@ -61,11 +61,13 @@ public class SpaCy { ArrayList> array = new ArrayList<>(); try { while ((line = reader.readLine()) != null) { - String[] str = line.split(" "); - ArrayList list = new ArrayList<>(); - list.add(Integer.parseInt(str[0])); - list.add(Integer.parseInt(str[1])); - array.add(list); + if (!line.equals("")) { + String[] str = line.split(" "); + ArrayList list = new ArrayList<>(); + list.add(Integer.parseInt(str[0])); + list.add(Integer.parseInt(str[1])); + array.add(list); + } } } catch (IOException e) { System.out.println("The text file contains incorrect data." + e.getMessage()); @@ -74,14 +76,14 @@ public class SpaCy { } /**Checks if input sentence is from a different language.*/ - public boolean isForeignSentence(String inputString, int matchFrom) { + public boolean isForeignSentence(String inputString, int idx, int matchFrom) { boolean isForeginSent = false; if (loaded) { for (List tuple : foreignSentenceArray) { - if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { - return Character.isUpperCase(inputString.charAt(matchFrom)); + if (matchFrom >= (tuple.get(0) - idx) && matchFrom < (tuple.get(1) - idx)) { + return true; } - if (matchFrom > tuple.get(1)) { + if ((tuple.get(1) - idx) > matchFrom) { return false; } } @@ -90,7 +92,7 @@ public class SpaCy { } /**Checks if input sentence is from a proper noun.*/ - public boolean isProperNoun(String inputString, int matchFrom) { + public boolean isProperNoun(String inputString, int idx, int matchFrom) { boolean isProperNoun = false; if (!loaded) { if (Character.isUpperCase(inputString.charAt(matchFrom))) { @@ -107,10 +109,10 @@ public class SpaCy { } } else { for (List tuple : properNounArray) { - if (matchFrom >= tuple.get(0) && matchFrom < tuple.get(1)) { + if (matchFrom >= (tuple.get(0) - idx) && matchFrom < (tuple.get(1) - idx)) { return Character.isUpperCase(inputString.charAt(matchFrom)); } - if (matchFrom > tuple.get(1)) { + if ((tuple.get(1) - idx) > matchFrom) { return false; } } diff --git a/module/src/main/java/pl/clarin/speller/Speller.java b/module/src/main/java/pl/clarin/speller/Speller.java index 2e01af86af185fdaed2241282d798434cd462c59..c26afdbf058fd11efabae6a844d484ef09ff5da4 100644 --- a/module/src/main/java/pl/clarin/speller/Speller.java +++ b/module/src/main/java/pl/clarin/speller/Speller.java @@ -63,7 +63,6 @@ public class Speller extends Worker { reader = new InputStreamReader(fstream, StandardCharsets.UTF_8); } - Writer out = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(fileOut), StandardCharsets.UTF_8)); @@ -71,9 +70,11 @@ public class Speller extends Worker { try (BufferedReader br = new BufferedReader(reader)) { String line = null; + int idx = 0; while ((line = br.readLine()) != null) { try { - String correctedLine = textEditor.edit(line, langTool); + String correctedLine = textEditor.edit(line, idx, langTool); + idx = idx + line.length() + "\n".length(); sb.append(correctedLine).append('\n'); } catch (Exception exception) { Logger.getLogger(Speller.class.getName()) diff --git a/module/src/main/java/pl/clarin/speller/TextEdit.java b/module/src/main/java/pl/clarin/speller/TextEdit.java index 45847f0cca4e7a41b49246a96633947125fa96f2..fb39132460d294c8d79589f9e1b867a17dbe91d5 100644 --- a/module/src/main/java/pl/clarin/speller/TextEdit.java +++ b/module/src/main/java/pl/clarin/speller/TextEdit.java @@ -9,7 +9,7 @@ public class TextEdit { public SpaCy spacy = new SpaCy(); /**Class that corrects input text.*/ - public String edit(String inputString, JLanguageTool langTool) throws Exception { + public String edit(String inputString, int idx, JLanguageTool langTool) throws Exception { char[] buffer = inputString.toCharArray(); StringBuilder sb = new StringBuilder(); List matches = langTool.check(inputString); @@ -27,7 +27,7 @@ public class TextEdit { if (match.getSuggestedReplacements().isEmpty()) { matchingWord = inputString.substring(matchFrom, matchTo); } else { - boolean change = toChange(inputString,matchFrom,matchTo); + boolean change = toChange(inputString, idx, matchFrom, matchTo); if (change) { matchingWord = match.getSuggestedReplacements().get(0); } else { @@ -41,9 +41,9 @@ public class TextEdit { return sb.toString(); } - private boolean toChange(String inputString, int matchFrom, int matchTo) { - return !isForeignSentence(inputString, matchFrom) - && !isProperNoun(inputString, matchFrom) + private boolean toChange(String inputString, int idx, int matchFrom, int matchTo) { + return !isForeignSentence(inputString, idx, matchFrom) + && !isProperNoun(inputString, idx, matchFrom) && !isAcronym(inputString, matchFrom, matchTo) && !isFileOrExtension(inputString, matchFrom, matchTo) && !checkFirstLetter(inputString, matchFrom) @@ -77,12 +77,12 @@ public class TextEdit { return isSurname; } - private boolean isForeignSentence(String inputString, int matchFrom) { - return spacy.isForeignSentence(inputString, matchFrom); + private boolean isForeignSentence(String inputString, int idx, int matchFrom) { + return spacy.isForeignSentence(inputString, idx, matchFrom); } - private boolean isProperNoun(String inputString, int matchFrom) { - return spacy.isProperNoun(inputString, matchFrom); + private boolean isProperNoun(String inputString, int idx, int matchFrom) { + return spacy.isProperNoun(inputString, idx, matchFrom); } private boolean isAcronym(String inputString, int matchFrom, int matchTo) {