From 405a05dccd5d19f1e780869bebea0056125ae73d Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 30 Mar 2023 14:23:23 +0200
Subject: [PATCH 01/90] Add support & tests for de,es,pt,fr,ru

---
 .gitignore                           |   1 +
 pos_tagger.yaml                      |  50 ++++++
 tests/test.py                        | 236 ++++++++++++++++++++++++++-
 tests/testdata/input/pos_tagger.yaml |  50 ++++++
 4 files changed, 333 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 91d3885..f46ef3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 *__pycache__
 htmlcov
 config-test.ini
+/tests/tmp-test.py
diff --git a/pos_tagger.yaml b/pos_tagger.yaml
index 4418310..cc7bf6c 100644
--- a/pos_tagger.yaml
+++ b/pos_tagger.yaml
@@ -9,6 +9,31 @@ taggers:
         lpmn: [{"spacy":{"lang":"en"}}]
         output: json
         tagset: ud
+    de:
+      default:
+        lpmn: [{"spacy":{"lang":"de"}}]
+        output: json
+        tagset: ud
+    es:
+      default:
+        lpmn: [{"spacy":{"lang":"es"}}]
+        output: json
+        tagset: ud
+    pt:
+      default:
+        lpmn: [{"spacy":{"lang":"pt"}}]
+        output: json
+        tagset: ud
+    fr:
+      default:
+        lpmn: [{"spacy":{"lang":"fr"}}]
+        output: json
+        tagset: ud
+    ru:
+      default:
+        lpmn: [{"spacy":{"lang":"ru"}}]
+        output: json
+        tagset: ud
 ners:
     pl:
       default:
@@ -20,3 +45,28 @@ ners:
         lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
         output: json
         tagset: ud
+    de:
+      default:
+        lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    es:
+      default:
+        lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    pt:
+      default:
+        lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    fr:
+      default:
+        lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    ru:
+      default:
+        lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}]
+        output: json
+        tagset: ud
\ No newline at end of file
diff --git a/tests/test.py b/tests/test.py
index d79f410..174a0d0 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -22,7 +22,7 @@ def test_init():
     assert type(worker).__name__ == 'TaggerWorker'
 
 
-def test_base_process_file(mocker, worker, input_dir, input_file1,
+def test_base_process_file_en(mocker, worker, input_dir, input_file1,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -34,14 +34,14 @@ def test_base_process_file(mocker, worker, input_dir, input_file1,
     )
     worker.process(
         os.path.join(input_dir, input_file1),
-        {}, os.path.join(output_dir, input_file1)
+        {"lang": "en"}, os.path.join(output_dir, input_file1)
     )
     assert cmp(os.path.join(output_dir, input_file1),
                os.path.join(expected_dir, input_file1))
     os.remove(os.path.join(output_dir, input_file1))
 
 
-def test_base_process_file_small_limit(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input_file_small,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -53,7 +53,235 @@ def test_base_process_file_small_limit(mocker, worker_small, input_dir, input_fi
     )
     worker_small.process(
         os.path.join(input_dir, input_file_small),
-        {}, os.path.join(output_dir, input_file_small)
+        {"lang": "en"}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
+def test_base_process_file_pl(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "pl"}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "pl"}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
+def test_base_process_file_de(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "de"}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "de"}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
+def test_base_process_file_es(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "es"}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "es"}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
+def test_base_process_file_pt(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "pt"}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "pt"}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
+def test_base_process_file_fr(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "fr"}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "fr"}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
+def test_base_process_file_ru(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "ru"}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
+    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
+    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
+    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "ru"}, os.path.join(output_dir, input_file_small)
     )
     assert cmp(os.path.join(output_dir, input_file_small),
                os.path.join(expected_dir, input_file_small))
diff --git a/tests/testdata/input/pos_tagger.yaml b/tests/testdata/input/pos_tagger.yaml
index 4418310..cc7bf6c 100644
--- a/tests/testdata/input/pos_tagger.yaml
+++ b/tests/testdata/input/pos_tagger.yaml
@@ -9,6 +9,31 @@ taggers:
         lpmn: [{"spacy":{"lang":"en"}}]
         output: json
         tagset: ud
+    de:
+      default:
+        lpmn: [{"spacy":{"lang":"de"}}]
+        output: json
+        tagset: ud
+    es:
+      default:
+        lpmn: [{"spacy":{"lang":"es"}}]
+        output: json
+        tagset: ud
+    pt:
+      default:
+        lpmn: [{"spacy":{"lang":"pt"}}]
+        output: json
+        tagset: ud
+    fr:
+      default:
+        lpmn: [{"spacy":{"lang":"fr"}}]
+        output: json
+        tagset: ud
+    ru:
+      default:
+        lpmn: [{"spacy":{"lang":"ru"}}]
+        output: json
+        tagset: ud
 ners:
     pl:
       default:
@@ -20,3 +45,28 @@ ners:
         lpmn: [{"spacy":{"lang":"en", 'method': 'ner'}}]
         output: json
         tagset: ud
+    de:
+      default:
+        lpmn: [{"spacy":{"lang":"de", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    es:
+      default:
+        lpmn: [{"spacy":{"lang":"es", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    pt:
+      default:
+        lpmn: [{"spacy":{"lang":"pt", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    fr:
+      default:
+        lpmn: [{"spacy":{"lang":"fr", 'method': 'ner'}}]
+        output: json
+        tagset: ud
+    ru:
+      default:
+        lpmn: [{"spacy":{"lang":"ru", 'method': 'ner'}}]
+        output: json
+        tagset: ud
\ No newline at end of file
-- 
GitLab


From 2ffb0e428c5ee119bbb9292bf893aa3e117749dc Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 4 Apr 2023 10:41:04 +0200
Subject: [PATCH 02/90] Add debug print

---
 tests/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test.py b/tests/test.py
index 174a0d0..3f0c5e9 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -14,6 +14,7 @@ def prepare_subtask(parameters: dict, process_id: int):
 
 
 def get_output_path(self, timeout=0):
+    print(self.task)
     return "tests/testdata/output/tmp-subtask-result"
 
 
-- 
GitLab


From ed23a6a5455e6df6235a483a02d57a417687838c Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 4 Apr 2023 10:45:26 +0200
Subject: [PATCH 03/90] Add [popsute] keyword to tmp-subtask-result file

---
 tests/testdata/output/tmp-subtask-result | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testdata/output/tmp-subtask-result b/tests/testdata/output/tmp-subtask-result
index 1a1c303..433fd91 100644
--- a/tests/testdata/output/tmp-subtask-result
+++ b/tests/testdata/output/tmp-subtask-result
@@ -1 +1 @@
-foobar, baz
\ No newline at end of file
+foobar, baz[popsute]
\ No newline at end of file
-- 
GitLab


From 232f360b3fc279cef9f894ce73ad2b9bebb4366f Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 4 Apr 2023 12:54:51 +0200
Subject: [PATCH 04/90] Add separate expects for tests

---
 tests/conftest.py                             |  60 +++++++
 tests/test.py                                 | 157 +++++++++---------
 tests/testdata/expected/post_spacy_input_de   |   1 +
 tests/testdata/expected/post_spacy_input_es   |   1 +
 tests/testdata/expected/post_spacy_input_fr   |   1 +
 tests/testdata/expected/post_spacy_input_pl   |   1 +
 tests/testdata/expected/post_spacy_input_pt   |   1 +
 tests/testdata/expected/post_spacy_input_ru   |   1 +
 .../expected/post_spacy_small_limit_input_de  |  15 ++
 .../expected/post_spacy_small_limit_input_es  |  15 ++
 .../expected/post_spacy_small_limit_input_fr  |  15 ++
 .../expected/post_spacy_small_limit_input_pl  |  15 ++
 .../expected/post_spacy_small_limit_input_pt  |  15 ++
 .../expected/post_spacy_small_limit_input_ru  |  15 ++
 tests/testdata/input/post_spacy_input_de      |   1 +
 tests/testdata/input/post_spacy_input_es      |   1 +
 tests/testdata/input/post_spacy_input_fr      |   1 +
 tests/testdata/input/post_spacy_input_pl      |   1 +
 tests/testdata/input/post_spacy_input_pt      |   1 +
 tests/testdata/input/post_spacy_input_ru      |   1 +
 .../input/post_spacy_small_limit_input_de     |   1 +
 .../input/post_spacy_small_limit_input_es     |   1 +
 .../input/post_spacy_small_limit_input_fr     |   1 +
 .../input/post_spacy_small_limit_input_pl     |   1 +
 .../input/post_spacy_small_limit_input_pt     |   1 +
 .../input/post_spacy_small_limit_input_ru     |   1 +
 tests/testdata/output/tmp-subtask-result      |   2 +-
 tests/testdata/output/tmp-subtask-result-de   |   1 +
 tests/testdata/output/tmp-subtask-result-es   |   1 +
 tests/testdata/output/tmp-subtask-result-fr   |   1 +
 tests/testdata/output/tmp-subtask-result-pl   |   1 +
 tests/testdata/output/tmp-subtask-result-pt   |   1 +
 tests/testdata/output/tmp-subtask-result-ru   |   1 +
 33 files changed, 258 insertions(+), 75 deletions(-)
 create mode 100644 tests/testdata/expected/post_spacy_input_de
 create mode 100644 tests/testdata/expected/post_spacy_input_es
 create mode 100644 tests/testdata/expected/post_spacy_input_fr
 create mode 100644 tests/testdata/expected/post_spacy_input_pl
 create mode 100644 tests/testdata/expected/post_spacy_input_pt
 create mode 100644 tests/testdata/expected/post_spacy_input_ru
 create mode 100644 tests/testdata/expected/post_spacy_small_limit_input_de
 create mode 100644 tests/testdata/expected/post_spacy_small_limit_input_es
 create mode 100644 tests/testdata/expected/post_spacy_small_limit_input_fr
 create mode 100644 tests/testdata/expected/post_spacy_small_limit_input_pl
 create mode 100644 tests/testdata/expected/post_spacy_small_limit_input_pt
 create mode 100644 tests/testdata/expected/post_spacy_small_limit_input_ru
 create mode 100644 tests/testdata/input/post_spacy_input_de
 create mode 100644 tests/testdata/input/post_spacy_input_es
 create mode 100644 tests/testdata/input/post_spacy_input_fr
 create mode 100644 tests/testdata/input/post_spacy_input_pl
 create mode 100644 tests/testdata/input/post_spacy_input_pt
 create mode 100644 tests/testdata/input/post_spacy_input_ru
 create mode 100644 tests/testdata/input/post_spacy_small_limit_input_de
 create mode 100644 tests/testdata/input/post_spacy_small_limit_input_es
 create mode 100644 tests/testdata/input/post_spacy_small_limit_input_fr
 create mode 100644 tests/testdata/input/post_spacy_small_limit_input_pl
 create mode 100644 tests/testdata/input/post_spacy_small_limit_input_pt
 create mode 100644 tests/testdata/input/post_spacy_small_limit_input_ru
 create mode 100644 tests/testdata/output/tmp-subtask-result-de
 create mode 100644 tests/testdata/output/tmp-subtask-result-es
 create mode 100644 tests/testdata/output/tmp-subtask-result-fr
 create mode 100644 tests/testdata/output/tmp-subtask-result-pl
 create mode 100644 tests/testdata/output/tmp-subtask-result-pt
 create mode 100644 tests/testdata/output/tmp-subtask-result-ru

diff --git a/tests/conftest.py b/tests/conftest.py
index e19c946..22ea48e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -34,6 +34,66 @@ def input_file_small():
     return 'post_spacy_small_limit_input'
 
 
+@pytest.fixture
+def input_file1_pl():
+    return 'post_spacy_input_pl'
+
+
+@pytest.fixture
+def input_file_small_pl():
+    return 'post_spacy_small_limit_input_pl'
+
+
+@pytest.fixture
+def input_file1_de():
+    return 'post_spacy_input_de'
+
+
+@pytest.fixture
+def input_file_small_de():
+    return 'post_spacy_small_limit_input_de'
+
+
+@pytest.fixture
+def input_file1_es():
+    return 'post_spacy_input_es'
+
+
+@pytest.fixture
+def input_file_small_es():
+    return 'post_spacy_small_limit_input_es'
+
+
+@pytest.fixture
+def input_file1_pt():
+    return 'post_spacy_input_pt'
+
+
+@pytest.fixture
+def input_file_small_pt():
+    return 'post_spacy_small_limit_input_pt'
+
+
+@pytest.fixture
+def input_file1_fr():
+    return 'post_spacy_input_fr'
+
+
+@pytest.fixture
+def input_file_small_fr():
+    return 'post_spacy_small_limit_input_fr'
+
+
+@pytest.fixture
+def input_file1_ru():
+    return 'post_spacy_input_ru'
+
+
+@pytest.fixture
+def input_file_small_ru():
+    return 'post_spacy_small_limit_input_ru'
+
+
 @pytest.fixture
 def input_dir2():
     return 'input_dir2'
diff --git a/tests/test.py b/tests/test.py
index 3f0c5e9..53c9582 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -14,8 +14,17 @@ def prepare_subtask(parameters: dict, process_id: int):
 
 
 def get_output_path(self, timeout=0):
-    print(self.task)
-    return "tests/testdata/output/tmp-subtask-result"
+    tmp_subtask_result_file = {
+        "en": "tests/testdata/output/tmp-subtask-result",
+        "pl": "tests/testdata/output/tmp-subtask-result-pl",
+        "de": "tests/testdata/output/tmp-subtask-result-de",
+        "es": "tests/testdata/output/tmp-subtask-result-es",
+        "pt": "tests/testdata/output/tmp-subtask-result-pt",
+        "fr": "tests/testdata/output/tmp-subtask-result-fr",
+        "ru": "tests/testdata/output/tmp-subtask-result-ru",
+    }
+    dict_key = self.task[0].get("spacy", {"lang": "pl"})["lang"]
+    return tmp_subtask_result_file[dict_key]
 
 
 def test_init():
@@ -61,7 +70,7 @@ def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small))
 
 
-def test_base_process_file_pl(mocker, worker, input_dir, input_file1,
+def test_base_process_file_pl(mocker, worker, input_dir, input_file1_pl,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -72,15 +81,15 @@ def test_base_process_file_pl(mocker, worker, input_dir, input_file1,
         os.getpid()
     )
     worker.process(
-        os.path.join(input_dir, input_file1),
-        {"lang": "pl"}, os.path.join(output_dir, input_file1)
+        os.path.join(input_dir, input_file1_pl),
+        {"lang": "pl"}, os.path.join(output_dir, input_file1_pl)
     )
-    assert cmp(os.path.join(output_dir, input_file1),
-               os.path.join(expected_dir, input_file1))
-    os.remove(os.path.join(output_dir, input_file1))
+    assert cmp(os.path.join(output_dir, input_file1_pl),
+               os.path.join(expected_dir, input_file1_pl))
+    os.remove(os.path.join(output_dir, input_file1_pl))
 
 
-def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input_file_small_pl,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -91,15 +100,15 @@ def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input
         os.getpid()
     )
     worker_small.process(
-        os.path.join(input_dir, input_file_small),
-        {"lang": "pl"}, os.path.join(output_dir, input_file_small)
+        os.path.join(input_dir, input_file_small_pl),
+        {"lang": "pl"}, os.path.join(output_dir, input_file_small_pl)
     )
-    assert cmp(os.path.join(output_dir, input_file_small),
-               os.path.join(expected_dir, input_file_small))
-    os.remove(os.path.join(output_dir, input_file_small))
+    assert cmp(os.path.join(output_dir, input_file_small_pl),
+               os.path.join(expected_dir, input_file_small_pl))
+    os.remove(os.path.join(output_dir, input_file_small_pl))
 
 
-def test_base_process_file_de(mocker, worker, input_dir, input_file1,
+def test_base_process_file_de(mocker, worker, input_dir, input_file1_de,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -110,15 +119,15 @@ def test_base_process_file_de(mocker, worker, input_dir, input_file1,
         os.getpid()
     )
     worker.process(
-        os.path.join(input_dir, input_file1),
-        {"lang": "de"}, os.path.join(output_dir, input_file1)
+        os.path.join(input_dir, input_file1_de),
+        {"lang": "de"}, os.path.join(output_dir, input_file1_de)
     )
-    assert cmp(os.path.join(output_dir, input_file1),
-               os.path.join(expected_dir, input_file1))
-    os.remove(os.path.join(output_dir, input_file1))
+    assert cmp(os.path.join(output_dir, input_file1_de),
+               os.path.join(expected_dir, input_file1_de))
+    os.remove(os.path.join(output_dir, input_file1_de))
 
 
-def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input_file_small_de,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -129,15 +138,15 @@ def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input
         os.getpid()
     )
     worker_small.process(
-        os.path.join(input_dir, input_file_small),
-        {"lang": "de"}, os.path.join(output_dir, input_file_small)
+        os.path.join(input_dir, input_file_small_de),
+        {"lang": "de"}, os.path.join(output_dir, input_file_small_de)
     )
-    assert cmp(os.path.join(output_dir, input_file_small),
-               os.path.join(expected_dir, input_file_small))
-    os.remove(os.path.join(output_dir, input_file_small))
+    assert cmp(os.path.join(output_dir, input_file_small_de),
+               os.path.join(expected_dir, input_file_small_de))
+    os.remove(os.path.join(output_dir, input_file_small_de))
 
 
-def test_base_process_file_es(mocker, worker, input_dir, input_file1,
+def test_base_process_file_es(mocker, worker, input_dir, input_file1_es,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -148,15 +157,15 @@ def test_base_process_file_es(mocker, worker, input_dir, input_file1,
         os.getpid()
     )
     worker.process(
-        os.path.join(input_dir, input_file1),
-        {"lang": "es"}, os.path.join(output_dir, input_file1)
+        os.path.join(input_dir, input_file1_es),
+        {"lang": "es"}, os.path.join(output_dir, input_file1_es)
     )
-    assert cmp(os.path.join(output_dir, input_file1),
-               os.path.join(expected_dir, input_file1))
-    os.remove(os.path.join(output_dir, input_file1))
+    assert cmp(os.path.join(output_dir, input_file1_es),
+               os.path.join(expected_dir, input_file1_es))
+    os.remove(os.path.join(output_dir, input_file1_es))
 
 
-def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input_file_small_es,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -167,15 +176,15 @@ def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input
         os.getpid()
     )
     worker_small.process(
-        os.path.join(input_dir, input_file_small),
-        {"lang": "es"}, os.path.join(output_dir, input_file_small)
+        os.path.join(input_dir, input_file_small_es),
+        {"lang": "es"}, os.path.join(output_dir, input_file_small_es)
     )
-    assert cmp(os.path.join(output_dir, input_file_small),
-               os.path.join(expected_dir, input_file_small))
-    os.remove(os.path.join(output_dir, input_file_small))
+    assert cmp(os.path.join(output_dir, input_file_small_es),
+               os.path.join(expected_dir, input_file_small_es))
+    os.remove(os.path.join(output_dir, input_file_small_es))
 
 
-def test_base_process_file_pt(mocker, worker, input_dir, input_file1,
+def test_base_process_file_pt(mocker, worker, input_dir, input_file1_pt,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -186,15 +195,15 @@ def test_base_process_file_pt(mocker, worker, input_dir, input_file1,
         os.getpid()
     )
     worker.process(
-        os.path.join(input_dir, input_file1),
-        {"lang": "pt"}, os.path.join(output_dir, input_file1)
+        os.path.join(input_dir, input_file1_pt),
+        {"lang": "pt"}, os.path.join(output_dir, input_file1_pt)
     )
-    assert cmp(os.path.join(output_dir, input_file1),
-               os.path.join(expected_dir, input_file1))
-    os.remove(os.path.join(output_dir, input_file1))
+    assert cmp(os.path.join(output_dir, input_file1_pt),
+               os.path.join(expected_dir, input_file1_pt))
+    os.remove(os.path.join(output_dir, input_file1_pt))
 
 
-def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input_file_small_pt,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -205,15 +214,15 @@ def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input
         os.getpid()
     )
     worker_small.process(
-        os.path.join(input_dir, input_file_small),
-        {"lang": "pt"}, os.path.join(output_dir, input_file_small)
+        os.path.join(input_dir, input_file_small_pt),
+        {"lang": "pt"}, os.path.join(output_dir, input_file_small_pt)
     )
-    assert cmp(os.path.join(output_dir, input_file_small),
-               os.path.join(expected_dir, input_file_small))
-    os.remove(os.path.join(output_dir, input_file_small))
+    assert cmp(os.path.join(output_dir, input_file_small_pt),
+               os.path.join(expected_dir, input_file_small_pt))
+    os.remove(os.path.join(output_dir, input_file_small_pt))
 
 
-def test_base_process_file_fr(mocker, worker, input_dir, input_file1,
+def test_base_process_file_fr(mocker, worker, input_dir, input_file1_fr,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -224,15 +233,15 @@ def test_base_process_file_fr(mocker, worker, input_dir, input_file1,
         os.getpid()
     )
     worker.process(
-        os.path.join(input_dir, input_file1),
-        {"lang": "fr"}, os.path.join(output_dir, input_file1)
+        os.path.join(input_dir, input_file1_fr),
+        {"lang": "fr"}, os.path.join(output_dir, input_file1_fr)
     )
-    assert cmp(os.path.join(output_dir, input_file1),
-               os.path.join(expected_dir, input_file1))
-    os.remove(os.path.join(output_dir, input_file1))
+    assert cmp(os.path.join(output_dir, input_file1_fr),
+               os.path.join(expected_dir, input_file1_fr))
+    os.remove(os.path.join(output_dir, input_file1_fr))
 
 
-def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input_file_small_fr,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -243,15 +252,15 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
         os.getpid()
     )
     worker_small.process(
-        os.path.join(input_dir, input_file_small),
-        {"lang": "fr"}, os.path.join(output_dir, input_file_small)
+        os.path.join(input_dir, input_file_small_fr),
+        {"lang": "fr"}, os.path.join(output_dir, input_file_small_fr)
     )
-    assert cmp(os.path.join(output_dir, input_file_small),
-               os.path.join(expected_dir, input_file_small))
-    os.remove(os.path.join(output_dir, input_file_small))
+    assert cmp(os.path.join(output_dir, input_file_small_fr),
+               os.path.join(expected_dir, input_file_small_fr))
+    os.remove(os.path.join(output_dir, input_file_small_fr))
 
 
-def test_base_process_file_ru(mocker, worker, input_dir, input_file1,
+def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -262,15 +271,15 @@ def test_base_process_file_ru(mocker, worker, input_dir, input_file1,
         os.getpid()
     )
     worker.process(
-        os.path.join(input_dir, input_file1),
-        {"lang": "ru"}, os.path.join(output_dir, input_file1)
+        os.path.join(input_dir, input_file1_ru),
+        {"lang": "ru"}, os.path.join(output_dir, input_file1_ru)
     )
-    assert cmp(os.path.join(output_dir, input_file1),
-               os.path.join(expected_dir, input_file1))
-    os.remove(os.path.join(output_dir, input_file1))
+    assert cmp(os.path.join(output_dir, input_file1_ru),
+               os.path.join(expected_dir, input_file1_ru))
+    os.remove(os.path.join(output_dir, input_file1_ru))
 
 
-def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input_file_small_ru,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
@@ -281,9 +290,9 @@ def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input
         os.getpid()
     )
     worker_small.process(
-        os.path.join(input_dir, input_file_small),
-        {"lang": "ru"}, os.path.join(output_dir, input_file_small)
+        os.path.join(input_dir, input_file_small_ru),
+        {"lang": "ru"}, os.path.join(output_dir, input_file_small_ru)
     )
-    assert cmp(os.path.join(output_dir, input_file_small),
-               os.path.join(expected_dir, input_file_small))
-    os.remove(os.path.join(output_dir, input_file_small))
+    assert cmp(os.path.join(output_dir, input_file_small_ru),
+               os.path.join(expected_dir, input_file_small_ru))
+    os.remove(os.path.join(output_dir, input_file_small_ru))
diff --git a/tests/testdata/expected/post_spacy_input_de b/tests/testdata/expected/post_spacy_input_de
new file mode 100644
index 0000000..941f92a
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_input_de
@@ -0,0 +1 @@
+fööbär, bäzßü
diff --git a/tests/testdata/expected/post_spacy_input_es b/tests/testdata/expected/post_spacy_input_es
new file mode 100644
index 0000000..8d95aa5
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_input_es
@@ -0,0 +1 @@
+fóóbár, bézíñú¿¡
diff --git a/tests/testdata/expected/post_spacy_input_fr b/tests/testdata/expected/post_spacy_input_fr
new file mode 100644
index 0000000..c6624ee
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_input_fr
@@ -0,0 +1 @@
+fæèêbàr, bâzœ
diff --git a/tests/testdata/expected/post_spacy_input_pl b/tests/testdata/expected/post_spacy_input_pl
new file mode 100644
index 0000000..11c7a71
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_input_pl
@@ -0,0 +1 @@
+fóbąr, baźż
diff --git a/tests/testdata/expected/post_spacy_input_pt b/tests/testdata/expected/post_spacy_input_pt
new file mode 100644
index 0000000..4eef2ea
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_input_pt
@@ -0,0 +1 @@
+fõbár, bâz
diff --git a/tests/testdata/expected/post_spacy_input_ru b/tests/testdata/expected/post_spacy_input_ru
new file mode 100644
index 0000000..f3a908b
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_input_ru
@@ -0,0 +1 @@
+фубар, баз
diff --git a/tests/testdata/expected/post_spacy_small_limit_input_de b/tests/testdata/expected/post_spacy_small_limit_input_de
new file mode 100644
index 0000000..bd91f84
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_small_limit_input_de
@@ -0,0 +1,15 @@
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
+fööbär, bäzßü
diff --git a/tests/testdata/expected/post_spacy_small_limit_input_es b/tests/testdata/expected/post_spacy_small_limit_input_es
new file mode 100644
index 0000000..51179d5
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_small_limit_input_es
@@ -0,0 +1,15 @@
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
+fóóbár, bézíñú¿¡
diff --git a/tests/testdata/expected/post_spacy_small_limit_input_fr b/tests/testdata/expected/post_spacy_small_limit_input_fr
new file mode 100644
index 0000000..e3c8c5d
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_small_limit_input_fr
@@ -0,0 +1,15 @@
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
+fæèêbàr, bâzœ
diff --git a/tests/testdata/expected/post_spacy_small_limit_input_pl b/tests/testdata/expected/post_spacy_small_limit_input_pl
new file mode 100644
index 0000000..6f9787d
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_small_limit_input_pl
@@ -0,0 +1,15 @@
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
+fóbąr, baźż
diff --git a/tests/testdata/expected/post_spacy_small_limit_input_pt b/tests/testdata/expected/post_spacy_small_limit_input_pt
new file mode 100644
index 0000000..0af0035
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_small_limit_input_pt
@@ -0,0 +1,15 @@
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
+fõbár, bâz
diff --git a/tests/testdata/expected/post_spacy_small_limit_input_ru b/tests/testdata/expected/post_spacy_small_limit_input_ru
new file mode 100644
index 0000000..410acfc
--- /dev/null
+++ b/tests/testdata/expected/post_spacy_small_limit_input_ru
@@ -0,0 +1,15 @@
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
+фубар, баз
diff --git a/tests/testdata/input/post_spacy_input_de b/tests/testdata/input/post_spacy_input_de
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_input_de
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_input_es b/tests/testdata/input/post_spacy_input_es
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_input_es
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_input_fr b/tests/testdata/input/post_spacy_input_fr
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_input_fr
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_input_pl b/tests/testdata/input/post_spacy_input_pl
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_input_pl
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_input_pt b/tests/testdata/input/post_spacy_input_pt
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_input_pt
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_input_ru b/tests/testdata/input/post_spacy_input_ru
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_input_ru
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_small_limit_input_de b/tests/testdata/input/post_spacy_small_limit_input_de
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_small_limit_input_de
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_small_limit_input_es b/tests/testdata/input/post_spacy_small_limit_input_es
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_small_limit_input_es
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_small_limit_input_fr b/tests/testdata/input/post_spacy_small_limit_input_fr
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_small_limit_input_fr
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_small_limit_input_pl b/tests/testdata/input/post_spacy_small_limit_input_pl
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_small_limit_input_pl
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_small_limit_input_pt b/tests/testdata/input/post_spacy_small_limit_input_pt
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_small_limit_input_pt
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/input/post_spacy_small_limit_input_ru b/tests/testdata/input/post_spacy_small_limit_input_ru
new file mode 100644
index 0000000..46d611b
--- /dev/null
+++ b/tests/testdata/input/post_spacy_small_limit_input_ru
@@ -0,0 +1 @@
+When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.
diff --git a/tests/testdata/output/tmp-subtask-result b/tests/testdata/output/tmp-subtask-result
index 433fd91..1a1c303 100644
--- a/tests/testdata/output/tmp-subtask-result
+++ b/tests/testdata/output/tmp-subtask-result
@@ -1 +1 @@
-foobar, baz[popsute]
\ No newline at end of file
+foobar, baz
\ No newline at end of file
diff --git a/tests/testdata/output/tmp-subtask-result-de b/tests/testdata/output/tmp-subtask-result-de
new file mode 100644
index 0000000..231232f
--- /dev/null
+++ b/tests/testdata/output/tmp-subtask-result-de
@@ -0,0 +1 @@
+fööbär, bäzßü
\ No newline at end of file
diff --git a/tests/testdata/output/tmp-subtask-result-es b/tests/testdata/output/tmp-subtask-result-es
new file mode 100644
index 0000000..a99a38a
--- /dev/null
+++ b/tests/testdata/output/tmp-subtask-result-es
@@ -0,0 +1 @@
+fóóbár, bézíñú¿¡
\ No newline at end of file
diff --git a/tests/testdata/output/tmp-subtask-result-fr b/tests/testdata/output/tmp-subtask-result-fr
new file mode 100644
index 0000000..8759587
--- /dev/null
+++ b/tests/testdata/output/tmp-subtask-result-fr
@@ -0,0 +1 @@
+fæèêbàr, bâzœ
\ No newline at end of file
diff --git a/tests/testdata/output/tmp-subtask-result-pl b/tests/testdata/output/tmp-subtask-result-pl
new file mode 100644
index 0000000..efa5be7
--- /dev/null
+++ b/tests/testdata/output/tmp-subtask-result-pl
@@ -0,0 +1 @@
+fóbąr, baźż
\ No newline at end of file
diff --git a/tests/testdata/output/tmp-subtask-result-pt b/tests/testdata/output/tmp-subtask-result-pt
new file mode 100644
index 0000000..c2a919c
--- /dev/null
+++ b/tests/testdata/output/tmp-subtask-result-pt
@@ -0,0 +1 @@
+fõbár, bâz
\ No newline at end of file
diff --git a/tests/testdata/output/tmp-subtask-result-ru b/tests/testdata/output/tmp-subtask-result-ru
new file mode 100644
index 0000000..5e9d869
--- /dev/null
+++ b/tests/testdata/output/tmp-subtask-result-ru
@@ -0,0 +1 @@
+фубар, баз
\ No newline at end of file
-- 
GitLab


From d0016f9f3a4b44eada97a55ed5c63cb9dc63bbd8 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 4 Apr 2023 13:02:21 +0200
Subject: [PATCH 05/90] Quickfix for pl

---
 tests/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index 53c9582..6edb025 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -23,7 +23,9 @@ def get_output_path(self, timeout=0):
         "fr": "tests/testdata/output/tmp-subtask-result-fr",
         "ru": "tests/testdata/output/tmp-subtask-result-ru",
     }
-    dict_key = self.task[0].get("spacy", {"lang": "pl"})["lang"]
+    dict_key = "pl"
+    if len(self.task) == 1 and 'spacy' in self.task[0]:
+        dict_key = self.task[0]["spacy"]["lang"]
     return tmp_subtask_result_file[dict_key]
 
 
-- 
GitLab


From 59f29a343c2efc8dba265d5e4f3d780753262ce0 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 5 Apr 2023 11:44:12 +0200
Subject: [PATCH 06/90] Add integration tests

---
 lpmn_queries.json                                    | 12 +++++++++++-
 tests/testdata/expected/elasmucha_grisea_ru.json     |  1 +
 .../testdata/expected/turritopsis_nutricula_de.json  |  1 +
 .../testdata/expected/turritopsis_nutricula_es.json  |  1 +
 .../testdata/expected/turritopsis_nutricula_fr.json  |  1 +
 .../testdata/expected/turritopsis_nutricula_pt.json  |  1 +
 tests/testdata/input/elasmucha_grisea_ru.txt         |  1 +
 tests/testdata/input/turritopsis_nutricula_de.txt    |  1 +
 tests/testdata/input/turritopsis_nutricula_es.txt    |  1 +
 tests/testdata/input/turritopsis_nutricula_fr.txt    |  1 +
 tests/testdata/input/turritopsis_nutricula_pt.txt    |  1 +
 11 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 tests/testdata/expected/elasmucha_grisea_ru.json
 create mode 100644 tests/testdata/expected/turritopsis_nutricula_de.json
 create mode 100644 tests/testdata/expected/turritopsis_nutricula_es.json
 create mode 100644 tests/testdata/expected/turritopsis_nutricula_fr.json
 create mode 100644 tests/testdata/expected/turritopsis_nutricula_pt.json
 create mode 100644 tests/testdata/input/elasmucha_grisea_ru.txt
 create mode 100644 tests/testdata/input/turritopsis_nutricula_de.txt
 create mode 100644 tests/testdata/input/turritopsis_nutricula_es.txt
 create mode 100644 tests/testdata/input/turritopsis_nutricula_fr.txt
 create mode 100644 tests/testdata/input/turritopsis_nutricula_pt.txt

diff --git a/lpmn_queries.json b/lpmn_queries.json
index 67b852d..a764318 100644
--- a/lpmn_queries.json
+++ b/lpmn_queries.json
@@ -11,5 +11,15 @@
 
     "ner_for_pl": {"task": [{"postagger": {"lang": "pl", "output": "json", "method": "ner"}}], "input": "post_postagger_input", "expected": "ner_for_pl.json"},
 
-    "ner_for_en": {"task": [{"postagger": {"lang": "en", "output": "json", "method": "ner"}}], "input": "post_spacy_input", "expected": "ner_for_en.json"}
+    "ner_for_en": {"task": [{"postagger": {"lang": "en", "output": "json", "method": "ner"}}], "input": "post_spacy_input", "expected": "ner_for_en.json"},
+
+    "postagger_lone_json_de": {"task": [{"postagger": {"lang": "de", "output": "json", "method": "tagger"}}], "input": "turritopsis_nutricula_de.txt", "expected": "turritopsis_nutricula_de.json"},
+
+    "postagger_lone_json_es": {"task": [{"postagger": {"lang": "es", "output": "json", "method": "tagger"}}], "input": "turritopsis_nutricula_es.txt", "expected": "turritopsis_nutricula_es.json"},
+
+    "postagger_lone_json_pt": {"task": [{"postagger": {"lang": "pt", "output": "json", "method": "tagger"}}], "input": "turritopsis_nutricula_pt.txt", "expected": "turritopsis_nutricula_pt.json"},
+
+    "postagger_lone_json_fr": {"task": [{"postagger": {"lang": "fr", "output": "json", "method": "tagger"}}], "input": "turritopsis_nutricula_fr.txt", "expected": "turritopsis_nutricula_fr.json"},
+
+    "postagger_lone_json_ru": {"task": [{"postagger": {"lang": "ru", "output": "json", "method": "tagger"}}], "input": "elasmucha_grisea_ru.txt", "expected": "elasmucha_grisea_ru.json"}
 }
diff --git a/tests/testdata/expected/elasmucha_grisea_ru.json b/tests/testdata/expected/elasmucha_grisea_ru.json
new file mode 100644
index 0000000..debf8e2
--- /dev/null
+++ b/tests/testdata/expected/elasmucha_grisea_ru.json
@@ -0,0 +1 @@
+{"filename": "1a863079-6aec-457f-8681-f670d716a19c", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 3], "orth": "Для", "lexemes": [{"lemma": "для", "mstag": "ADP", "disamb": true}]}, {"index": 2, "position": [4, 8], "orth": "вида", "lexemes": [{"lemma": "вид", "mstag": "NOUN", "disamb": true}]}, {"index": 3, "position": [9, 22], "orth": "зафиксирована", "lexemes": [{"lemma": "зафиксировать", "mstag": "VERB", "disamb": true}]}, {"index": 4, "position": [23, 34], "orth": "материнская", "lexemes": [{"lemma": "материнский", "mstag": "ADJ", "disamb": true}]}, {"index": 5, "position": [35, 41], "orth": "забота", "lexemes": [{"lemma": "забота", "mstag": "NOUN", "disamb": true}]}, {"index": 6, "position": [42, 43], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "PUNCT", "disamb": true}]}, {"index": 7, "position": [44, 52], "orth": "поведение", "lexemes": [{"lemma": "поведение", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [53, 55], "orth": "по", "lexemes": [{"lemma": "по", "mstag": "ADP", "disamb": true}]}, {"index": 9, "position": [56, 62], "orth": "охране", "lexemes": [{"lemma": "охрана", "mstag": "NOUN", "disamb": true}]}, {"index": 10, "position": [63, 66], "orth": "яиц", "lexemes": [{"lemma": "яйцо", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [67, 68], "orth": "и", "lexemes": [{"lemma": "и", "mstag": "CCONJ", "disamb": true}]}, {"index": 12, "position": [69, 76], "orth": "личинок", "lexemes": [{"lemma": "личинка", "mstag": "NOUN", "disamb": true}]}, {"index": 13, "position": [77, 77], "orth": "-", "lexemes": [{"lemma": "-", "mstag": "NOUN", "disamb": true}]}, {"index": 14, "position": [78, 81], "orth": "нимф", "lexemes": [{"lemma": "нимф", "mstag": "NOUN", "disamb": true}]}, {"index": 15, "position": [82, 82], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "PUNCT", "disamb": true}]}, {"index": 16, "position": [83, 83], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 17, "position": 
[84, 89], "orth": "После", "lexemes": [{"lemma": "после", "mstag": "ADP", "disamb": true}]}, {"index": 18, "position": [90, 100], "orth": "яйцекладки", "lexemes": [{"lemma": "яйцекладка", "mstag": "NOUN", "disamb": true}]}, {"index": 19, "position": [101, 106], "orth": "самка", "lexemes": [{"lemma": "самка", "mstag": "NOUN", "disamb": true}]}, {"index": 20, "position": [107, 112], "orth": "стоит", "lexemes": [{"lemma": "стоить", "mstag": "VERB", "disamb": true}]}, {"index": 21, "position": [113, 116], "orth": "над", "lexemes": [{"lemma": "над", "mstag": "ADP", "disamb": true}]}, {"index": 22, "position": [117, 124], "orth": "кладкой", "lexemes": [{"lemma": "кладка", "mstag": "NOUN", "disamb": true}]}, {"index": 23, "position": [125, 128], "orth": "яиц", "lexemes": [{"lemma": "яйцо", "mstag": "NOUN", "disamb": true}]}, {"index": 24, "position": [129, 130], "orth": "и", "lexemes": [{"lemma": "и", "mstag": "CCONJ", "disamb": true}]}, {"index": 25, "position": [131, 139], "orth": "защищает", "lexemes": [{"lemma": "защищать", "mstag": "VERB", "disamb": true}]}, {"index": 26, "position": [140, 142], "orth": "её", "lexemes": [{"lemma": "её", "mstag": "PRON", "disamb": true}]}, {"index": 27, "position": [143, 145], "orth": "на", "lexemes": [{"lemma": "на", "mstag": "ADP", "disamb": true}]}, {"index": 28, "position": [146, 156], "orth": "протяжении", "lexemes": [{"lemma": "протяжение", "mstag": "NOUN", "disamb": true}]}, {"index": 29, "position": [157, 162], "orth": "всего", "lexemes": [{"lemma": "весь", "mstag": "DET", "disamb": true}]}, {"index": 30, "position": [163, 171], "orth": "развития", "lexemes": [{"lemma": "развитие", "mstag": "NOUN", "disamb": true}]}, {"index": 31, "position": [172, 175], "orth": "яиц", "lexemes": [{"lemma": "яйцо", "mstag": "NOUN", "disamb": true}]}, {"index": 32, "position": [176, 176], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 33, "position": [177, 186], "orth": "Репертуар", "lexemes": [{"lemma": 
"репертуар", "mstag": "NOUN", "disamb": true}]}, {"index": 34, "position": [187, 196], "orth": "защитного", "lexemes": [{"lemma": "защитный", "mstag": "ADJ", "disamb": true}]}, {"index": 35, "position": [197, 206], "orth": "поведения", "lexemes": [{"lemma": "поведение", "mstag": "NOUN", "disamb": true}]}, {"index": 36, "position": [207, 212], "orth": "самок", "lexemes": [{"lemma": "самка", "mstag": "NOUN", "disamb": true}]}, {"index": 37, "position": [213, 213], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 38, "position": [214, 221], "orth": "стоящих", "lexemes": [{"lemma": "стоить", "mstag": "VERB", "disamb": true}]}, {"index": 39, "position": [222, 225], "orth": "над", "lexemes": [{"lemma": "над", "mstag": "ADP", "disamb": true}]}, {"index": 40, "position": [226, 233], "orth": "пакетом", "lexemes": [{"lemma": "пакет", "mstag": "NOUN", "disamb": true}]}, {"index": 41, "position": [234, 237], "orth": "яиц", "lexemes": [{"lemma": "яйцо", "mstag": "NOUN", "disamb": true}]}, {"index": 42, "position": [238, 238], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 43, "position": [239, 247], "orth": "включает", "lexemes": [{"lemma": "включать", "mstag": "VERB", "disamb": true}]}, {"index": 44, "position": [248, 249], "orth": "в", "lexemes": [{"lemma": "в", "mstag": "ADP", "disamb": true}]}, {"index": 45, "position": [250, 254], "orth": "себя", "lexemes": [{"lemma": "себя", "mstag": "PRON", "disamb": true}]}, {"index": 46, "position": [255, 261], "orth": "взмахи", "lexemes": [{"lemma": "взмахи", "mstag": "NOUN", "disamb": true}]}, {"index": 47, "position": [262, 270], "orth": "крыльями", "lexemes": [{"lemma": "крыло", "mstag": "NOUN", "disamb": true}]}, {"index": 48, "position": [271, 271], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 49, "position": [272, 284], "orth": "подёргивание", "lexemes": [{"lemma": "подёргивание", "mstag": "NOUN", "disamb": 
true}]}, {"index": 50, "position": [285, 289], "orth": "тела", "lexemes": [{"lemma": "тело", "mstag": "NOUN", "disamb": true}]}, {"index": 51, "position": [290, 290], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 52, "position": [291, 297], "orth": "наклон", "lexemes": [{"lemma": "наклон", "mstag": "NOUN", "disamb": true}]}, {"index": 53, "position": [298, 299], "orth": "в", "lexemes": [{"lemma": "в", "mstag": "ADP", "disamb": true}]}, {"index": 54, "position": [300, 307], "orth": "сторону", "lexemes": [{"lemma": "сторона", "mstag": "NOUN", "disamb": true}]}, {"index": 55, "position": [308, 313], "orth": "врага", "lexemes": [{"lemma": "враг", "mstag": "NOUN", "disamb": true}]}, {"index": 56, "position": [314, 315], "orth": "и", "lexemes": [{"lemma": "и", "mstag": "CCONJ", "disamb": true}]}, {"index": 57, "position": [316, 316], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [317, 324], "orth": "наконец", "lexemes": [{"lemma": "наконец", "mstag": "ADV", "disamb": true}]}, {"index": 59, "position": [325, 325], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 60, "position": [326, 335], "orth": "выделение", "lexemes": [{"lemma": "выделение", "mstag": "NOUN", "disamb": true}]}, {"index": 61, "position": [336, 346], "orth": "неприятных", "lexemes": [{"lemma": "неприятный", "mstag": "ADJ", "disamb": true}]}, {"index": 62, "position": [347, 354], "orth": "запахов", "lexemes": [{"lemma": "запах", "mstag": "NOUN", "disamb": true}]}, {"index": 63, "position": [355, 357], "orth": "из", "lexemes": [{"lemma": "из", "mstag": "ADP", "disamb": true}]}, {"index": 64, "position": [358, 371], "orth": "ароматических", "lexemes": [{"lemma": "ароматический", "mstag": "ADJ", "disamb": true}]}, {"index": 65, "position": [372, 377], "orth": "желёз", "lexemes": [{"lemma": "желёз", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [378, 378], 
"orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 67, "position": [379, 385], "orth": "Иногда", "lexemes": [{"lemma": "иногда", "mstag": "ADV", "disamb": true}]}, {"index": 68, "position": [386, 391], "orth": "самки", "lexemes": [{"lemma": "самка", "mstag": "NOUN", "disamb": true}]}, {"index": 69, "position": [392, 401], "orth": "совместно", "lexemes": [{"lemma": "совместно", "mstag": "ADV", "disamb": true}]}, {"index": 70, "position": [402, 410], "orth": "охраняют", "lexemes": [{"lemma": "охранять", "mstag": "VERB", "disamb": true}]}, {"index": 71, "position": [411, 415], "orth": "свои", "lexemes": [{"lemma": "свой", "mstag": "DET", "disamb": true}]}, {"index": 72, "position": [416, 422], "orth": "кладки", "lexemes": [{"lemma": "кладка", "mstag": "NOUN", "disamb": true}]}, {"index": 73, "position": [423, 426], "orth": "бок", "lexemes": [{"lemma": "бок", "mstag": "ADV", "disamb": true}]}, {"index": 74, "position": [427, 428], "orth": "о", "lexemes": [{"lemma": "о", "mstag": "ADP", "disamb": true}]}, {"index": 75, "position": [429, 432], "orth": "бок", "lexemes": [{"lemma": "бок", "mstag": "NOUN", "disamb": true}]}, {"index": 76, "position": [433, 435], "orth": "на", "lexemes": [{"lemma": "на", "mstag": "ADP", "disamb": true}]}, {"index": 77, "position": [436, 441], "orth": "одном", "lexemes": [{"lemma": "один", "mstag": "DET", "disamb": true}]}, {"index": 78, "position": [442, 443], "orth": "и", "lexemes": [{"lemma": "и", "mstag": "CCONJ", "disamb": true}]}, {"index": 79, "position": [444, 447], "orth": "том", "lexemes": [{"lemma": "тот", "mstag": "DET", "disamb": true}]}, {"index": 80, "position": [448, 450], "orth": "же", "lexemes": [{"lemma": "же", "mstag": "PART", "disamb": true}]}, {"index": 81, "position": [451, 456], "orth": "листе", "lexemes": [{"lemma": "лист", "mstag": "NOUN", "disamb": true}]}, {"index": 82, "position": [457, 463], "orth": "берёзы", "lexemes": [{"lemma": "берёзы", "mstag": "NOUN", "disamb": true}]}, 
{"index": 83, "position": [464, 464], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 84, "position": [465, 468], "orth": "что", "lexemes": [{"lemma": "что", "mstag": "PRON", "disamb": true}]}, {"index": 85, "position": [469, 480], "orth": "увеличивает", "lexemes": [{"lemma": "увеличивать", "mstag": "VERB", "disamb": true}]}, {"index": 86, "position": [481, 486], "orth": "шансы", "lexemes": [{"lemma": "шанс", "mstag": "NOUN", "disamb": true}]}, {"index": 87, "position": [487, 496], "orth": "потомства", "lexemes": [{"lemma": "потомство", "mstag": "NOUN", "disamb": true}]}, {"index": 88, "position": [497, 499], "orth": "на", "lexemes": [{"lemma": "на", "mstag": "ADP", "disamb": true}]}, {"index": 89, "position": [500, 509], "orth": "выживание", "lexemes": [{"lemma": "выживание", "mstag": "NOUN", "disamb": true}]}, {"index": 90, "position": [510, 510], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 91, "position": [511, 514], "orth": "Эти", "lexemes": [{"lemma": "этот", "mstag": "DET", "disamb": true}]}, {"index": 92, "position": [515, 524], "orth": "насекомые", "lexemes": [{"lemma": "насекомое", "mstag": "NOUN", "disamb": true}]}, {"index": 93, "position": [525, 531], "orth": "служат", "lexemes": [{"lemma": "служить", "mstag": "VERB", "disamb": true}]}, {"index": 94, "position": [532, 540], "orth": "примером", "lexemes": [{"lemma": "пример", "mstag": "NOUN", "disamb": true}]}, {"index": 95, "position": [541, 546], "orth": "самых", "lexemes": [{"lemma": "самых", "mstag": "ADJ", "disamb": true}]}, {"index": 96, "position": [547, 553], "orth": "ранних", "lexemes": [{"lemma": "ранний", "mstag": "ADJ", "disamb": true}]}, {"index": 97, "position": [554, 560], "orth": "стадий", "lexemes": [{"lemma": "стадия", "mstag": "NOUN", "disamb": true}]}, {"index": 98, "position": [561, 575], "orth": "эусоциальности", "lexemes": [{"lemma": "эусоциальности", "mstag": "NOUN", "disamb": true}]}, {"index": 99, 
"position": [576, 576], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 100, "position": [577, 577], "orth": "\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "Для вида зафиксирована материнская забота (поведение по охране яиц и личинок-нимф). После яйцекладки самка стоит над кладкой яиц и защищает её на протяжении всего развития яиц. Репертуар защитного поведения самок, стоящих над пакетом яиц, включает в себя взмахи крыльями, подёргивание тела, наклон в сторону врага и, наконец, выделение неприятных запахов из ароматических желёз. Иногда самки совместно охраняют свои кладки бок о бок на одном и том же листе берёзы, что увеличивает шансы потомства на выживание. Эти насекомые служат примером самых ранних стадий эусоциальности.\n"}
diff --git a/tests/testdata/expected/turritopsis_nutricula_de.json b/tests/testdata/expected/turritopsis_nutricula_de.json
new file mode 100644
index 0000000..8ee7b67
--- /dev/null
+++ b/tests/testdata/expected/turritopsis_nutricula_de.json
@@ -0,0 +1 @@
+{"filename": "e7498228-90ac-4b09-81b7-a82f2ac02f55", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 11], "orth": "Turritopsis", "lexemes": [{"lemma": "Turritopsis", "mstag": "NOUN", "disamb": true}]}, {"index": 2, "position": [12, 21], "orth": "nutricula", "lexemes": [{"lemma": "nutricula", "mstag": "ADV", "disamb": true}]}, {"index": 3, "position": [22, 28], "orth": "bildet", "lexemes": [{"lemma": "bilden", "mstag": "VERB", "disamb": true}]}, {"index": 4, "position": [29, 38], "orth": "aufrechte", "lexemes": [{"lemma": "aufrechen", "mstag": "VERB", "disamb": true}]}, {"index": 5, "position": [39, 39], "orth": ",", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 6, "position": [40, 50], "orth": "verzweigte", "lexemes": [{"lemma": "verzweigt", "mstag": "ADJ", "disamb": true}]}, {"index": 7, "position": [51, 69], "orth": "Hydroiden-Kolonien", "lexemes": [{"lemma": "Hydroiden-Kolonien", "mstag": "NOUN", "disamb": true}]}, {"index": 8, "position": [70, 70], "orth": ".", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 9, "position": [71, 74], "orth": "Die", "lexemes": [{"lemma": "der", "mstag": "DET", "disamb": true}]}, {"index": 10, "position": [75, 90], "orth": "Polypenköpfchen", "lexemes": [{"lemma": "Polypenköpfche", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [91, 92], "orth": "(", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 12, "position": [93, 102], "orth": "Hydranthen", "lexemes": [{"lemma": "Hydranthe", "mstag": "NOUN", "disamb": true}]}, {"index": 13, "position": [103, 103], "orth": ")", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 14, "position": [104, 108], "orth": "sind", "lexemes": [{"lemma": "sein", "mstag": "AUX", "disamb": true}]}, {"index": 15, "position": [109, 117], "orth": "spindel-", "lexemes": [{"lemma": "spindel", "mstag": "X", "disamb": true}]}, {"index": 16, "position": [118, 121], "orth": "bis", 
"lexemes": [{"lemma": "bis", "mstag": "CCONJ", "disamb": true}]}, {"index": 17, "position": [122, 134], "orth": "keulenförmig", "lexemes": [{"lemma": "keulenförmig", "mstag": "ADV", "disamb": true}]}, {"index": 18, "position": [135, 135], "orth": ",", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 19, "position": [136, 140], "orth": "ihre", "lexemes": [{"lemma": "ihr", "mstag": "DET", "disamb": true}]}, {"index": 20, "position": [141, 154], "orth": "fadenförmigen", "lexemes": [{"lemma": "fadenförmig", "mstag": "ADJ", "disamb": true}]}, {"index": 21, "position": [155, 163], "orth": "Tentakel", "lexemes": [{"lemma": "Tentakel", "mstag": "NOUN", "disamb": true}]}, {"index": 22, "position": [164, 168], "orth": "sind", "lexemes": [{"lemma": "sein", "mstag": "AUX", "disamb": true}]}, {"index": 23, "position": [169, 181], "orth": "unregelmäßig", "lexemes": [{"lemma": "unregelmäßig", "mstag": "ADV", "disamb": true}]}, {"index": 24, "position": [182, 186], "orth": "über", "lexemes": [{"lemma": "über", "mstag": "ADP", "disamb": true}]}, {"index": 25, "position": [187, 190], "orth": "den", "lexemes": [{"lemma": "der", "mstag": "DET", "disamb": true}]}, {"index": 26, "position": [191, 207], "orth": "Hydranthenkörper", "lexemes": [{"lemma": "Hydranthenkörper", "mstag": "NOUN", "disamb": true}]}, {"index": 27, "position": [208, 216], "orth": "verteilt", "lexemes": [{"lemma": "verteilen", "mstag": "VERB", "disamb": true}]}, {"index": 28, "position": [217, 217], "orth": ".", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 29, "position": [218, 221], "orth": "Die", "lexemes": [{"lemma": "der", "mstag": "DET", "disamb": true}]}, {"index": 30, "position": [222, 227], "orth": "Hülle", "lexemes": [{"lemma": "Hülle", "mstag": "NOUN", "disamb": true}]}, {"index": 31, "position": [228, 229], "orth": "(", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 32, "position": [230, 237], "orth": "Periderm", 
"lexemes": [{"lemma": "Periderm", "mstag": "NOUN", "disamb": true}]}, {"index": 33, "position": [238, 238], "orth": ")", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 34, "position": [239, 241], "orth": "um", "lexemes": [{"lemma": "um", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [242, 245], "orth": "den", "lexemes": [{"lemma": "der", "mstag": "DET", "disamb": true}]}, {"index": 36, "position": [246, 251], "orth": "Stiel", "lexemes": [{"lemma": "Stiel", "mstag": "NOUN", "disamb": true}]}, {"index": 37, "position": [252, 253], "orth": "(", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 38, "position": [254, 264], "orth": "Hydrocaulus", "lexemes": [{"lemma": "Hydrocaulus", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [265, 265], "orth": ")", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 40, "position": [266, 269], "orth": "ist", "lexemes": [{"lemma": "sein", "mstag": "AUX", "disamb": true}]}, {"index": 41, "position": [270, 279], "orth": "zweilagig", "lexemes": [{"lemma": "zweilagig", "mstag": "ADV", "disamb": true}]}, {"index": 42, "position": [280, 280], "orth": ".", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 43, "position": [281, 284], "orth": "Die", "lexemes": [{"lemma": "der", "mstag": "DET", "disamb": true}]}, {"index": 44, "position": [285, 299], "orth": "Medusenknospen", "lexemes": [{"lemma": "Medusenknospe", "mstag": "NOUN", "disamb": true}]}, {"index": 45, "position": [300, 301], "orth": "(", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 46, "position": [302, 311], "orth": "Gonophoren", "lexemes": [{"lemma": "Gonophor", "mstag": "NOUN", "disamb": true}]}, {"index": 47, "position": [312, 312], "orth": ")", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 48, "position": [313, 323], "orth": "entwickeln", "lexemes": [{"lemma": "entwickeln", "mstag": "VERB", 
"disamb": true}]}, {"index": 49, "position": [324, 328], "orth": "sich", "lexemes": [{"lemma": "sich", "mstag": "PRON", "disamb": true}]}, {"index": 50, "position": [329, 332], "orth": "auf", "lexemes": [{"lemma": "auf", "mstag": "ADP", "disamb": true}]}, {"index": 51, "position": [333, 336], "orth": "den", "lexemes": [{"lemma": "der", "mstag": "DET", "disamb": true}]}, {"index": 52, "position": [337, 344], "orth": "Stielen", "lexemes": [{"lemma": "Stiel", "mstag": "NOUN", "disamb": true}]}, {"index": 53, "position": [345, 347], "orth": "in", "lexemes": [{"lemma": "in", "mstag": "ADP", "disamb": true}]}, {"index": 54, "position": [348, 353], "orth": "einer", "lexemes": [{"lemma": "ein", "mstag": "DET", "disamb": true}]}, {"index": 55, "position": [354, 357], "orth": "mit", "lexemes": [{"lemma": "mit", "mstag": "ADP", "disamb": true}]}, {"index": 56, "position": [358, 366], "orth": "Perisarc", "lexemes": [{"lemma": "Perisarc", "mstag": "PROPN", "disamb": true}]}, {"index": 57, "position": [367, 379], "orth": "eingehüllten", "lexemes": [{"lemma": "eingehüllt", "mstag": "ADJ", "disamb": true}]}, {"index": 58, "position": [380, 386], "orth": "Region", "lexemes": [{"lemma": "Region", "mstag": "NOUN", "disamb": true}]}, {"index": 59, "position": [387, 387], "orth": ".", "lexemes": [{"lemma": "--", "mstag": "PUNCT", "disamb": true}]}, {"index": 60, "position": [388, 388], "orth": "\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "Turritopsis nutricula bildet aufrechte, verzweigte Hydroiden-Kolonien. Die Polypenköpfchen (Hydranthen) sind spindel- bis keulenförmig, ihre fadenförmigen Tentakel sind unregelmäßig über den Hydranthenkörper verteilt. Die Hülle (Periderm) um den Stiel (Hydrocaulus) ist zweilagig. Die Medusenknospen (Gonophoren) entwickeln sich auf den Stielen in einer mit Perisarc eingehüllten Region.\n"}
diff --git a/tests/testdata/expected/turritopsis_nutricula_es.json b/tests/testdata/expected/turritopsis_nutricula_es.json
new file mode 100644
index 0000000..45766e3
--- /dev/null
+++ b/tests/testdata/expected/turritopsis_nutricula_es.json
@@ -0,0 +1 @@
+{"filename": "4b30963c-3e8f-477f-aa7f-aa238618023f", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 5], "orth": "Tiene", "lexemes": [{"lemma": "tener", "mstag": "VERB", "disamb": true}]}, {"index": 2, "position": [6, 8], "orth": "un", "lexemes": [{"lemma": "uno", "mstag": "DET", "disamb": true}]}, {"index": 3, "position": [9, 17], "orth": "diámetro", "lexemes": [{"lemma": "diámetro", "mstag": "NOUN", "disamb": true}]}, {"index": 4, "position": [18, 20], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 5, "position": [21, 24], "orth": "4-5", "lexemes": [{"lemma": "4-5", "mstag": "NUM", "disamb": true}]}, {"index": 6, "position": [25, 27], "orth": "mm", "lexemes": [{"lemma": "mm", "mstag": "NOUN", "disamb": true}]}, {"index": 7, "position": [28, 28], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 8, "position": [29, 31], "orth": "Su", "lexemes": [{"lemma": "su", "mstag": "DET", "disamb": true}]}, {"index": 9, "position": [32, 38], "orth": "figura", "lexemes": [{"lemma": "figura", "mstag": "NOUN", "disamb": true}]}, {"index": 10, "position": [39, 41], "orth": "es", "lexemes": [{"lemma": "ser", "mstag": "AUX", "disamb": true}]}, {"index": 11, "position": [42, 46], "orth": "alta", "lexemes": [{"lemma": "alto", "mstag": "ADJ", "disamb": true}]}, {"index": 12, "position": [47, 48], "orth": "y", "lexemes": [{"lemma": "y", "mstag": "CCONJ", "disamb": true}]}, {"index": 13, "position": [49, 59], "orth": "acampanada", "lexemes": [{"lemma": "acampanado", "mstag": "ADJ", "disamb": true}]}, {"index": 14, "position": [60, 63], "orth": "con", "lexemes": [{"lemma": "con", "mstag": "ADP", "disamb": true}]}, {"index": 15, "position": [64, 71], "orth": "paredes", "lexemes": [{"lemma": "pared", "mstag": "NOUN", "disamb": true}]}, {"index": 16, "position": [72, 77], "orth": "finas", "lexemes": [{"lemma": "fino", "mstag": "ADJ", "disamb": true}]}, {"index": 17, "position": [78, 79], "orth": "y", 
"lexemes": [{"lemma": "y", "mstag": "CCONJ", "disamb": true}]}, {"index": 18, "position": [80, 89], "orth": "uniformes", "lexemes": [{"lemma": "uniforme", "mstag": "NOUN", "disamb": true}]}, {"index": 19, "position": [90, 90], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 20, "position": [91, 93], "orth": "Su", "lexemes": [{"lemma": "su", "mstag": "DET", "disamb": true}]}, {"index": 21, "position": [94, 98], "orth": "gran", "lexemes": [{"lemma": "gran", "mstag": "ADJ", "disamb": true}]}, {"index": 22, "position": [99, 107], "orth": "estómago", "lexemes": [{"lemma": "estómago", "mstag": "NOUN", "disamb": true}]}, {"index": 23, "position": [108, 109], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "PUNCT", "disamb": true}]}, {"index": 24, "position": [110, 116], "orth": "cavidad", "lexemes": [{"lemma": "cavidad", "mstag": "NOUN", "disamb": true}]}, {"index": 25, "position": [117, 131], "orth": "gastrovascular", "lexemes": [{"lemma": "gastrovascular", "mstag": "ADJ", "disamb": true}]}, {"index": 26, "position": [132, 132], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "PUNCT", "disamb": true}]}, {"index": 27, "position": [133, 133], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 28, "position": [134, 138], "orth": "rojo", "lexemes": [{"lemma": "rojo", "mstag": "NOUN", "disamb": true}]}, {"index": 29, "position": [139, 143], "orth": "vivo", "lexemes": [{"lemma": "vivo", "mstag": "ADJ", "disamb": true}]}, {"index": 30, "position": [144, 144], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 31, "position": [145, 150], "orth": "tiene", "lexemes": [{"lemma": "tener", "mstag": "VERB", "disamb": true}]}, {"index": 32, "position": [151, 156], "orth": "forma", "lexemes": [{"lemma": "forma", "mstag": "NOUN", "disamb": true}]}, {"index": 33, "position": [157, 167], "orth": "cruciforme", "lexemes": [{"lemma": "cruciforme", "mstag": "ADJ", "disamb": true}]}, 
{"index": 34, "position": [168, 170], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 35, "position": [171, 173], "orth": "su", "lexemes": [{"lemma": "su", "mstag": "DET", "disamb": true}]}, {"index": 36, "position": [174, 179], "orth": "corte", "lexemes": [{"lemma": "corte", "mstag": "NOUN", "disamb": true}]}, {"index": 37, "position": [180, 191], "orth": "transversal", "lexemes": [{"lemma": "transversal", "mstag": "ADJ", "disamb": true}]}, {"index": 38, "position": [192, 192], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 39, "position": [193, 196], "orth": "Los", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 40, "position": [197, 208], "orth": "especímenes", "lexemes": [{"lemma": "especímén", "mstag": "NOUN", "disamb": true}]}, {"index": 41, "position": [209, 216], "orth": "jóvenes", "lexemes": [{"lemma": "joven", "mstag": "ADJ", "disamb": true}]}, {"index": 42, "position": [217, 223], "orth": "tienen", "lexemes": [{"lemma": "tener", "mstag": "VERB", "disamb": true}]}, {"index": 43, "position": [224, 228], "orth": "ocho", "lexemes": [{"lemma": "ocho", "mstag": "NUM", "disamb": true}]}, {"index": 44, "position": [229, 239], "orth": "tentáculos", "lexemes": [{"lemma": "tentáculo", "mstag": "NOUN", "disamb": true}]}, {"index": 45, "position": [240, 242], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 46, "position": [243, 245], "orth": "el", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 47, "position": [246, 251], "orth": "borde", "lexemes": [{"lemma": "borde", "mstag": "NOUN", "disamb": true}]}, {"index": 48, "position": [252, 256], "orth": "pero", "lexemes": [{"lemma": "pero", "mstag": "CCONJ", "disamb": true}]}, {"index": 49, "position": [257, 260], "orth": "los", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 50, "position": [261, 268], "orth": "adultos", 
"lexemes": [{"lemma": "adulto", "mstag": "NOUN", "disamb": true}]}, {"index": 51, "position": [269, 275], "orth": "llegan", "lexemes": [{"lemma": "llegar", "mstag": "VERB", "disamb": true}]}, {"index": 52, "position": [276, 277], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [278, 283], "orth": "tener", "lexemes": [{"lemma": "tener", "mstag": "VERB", "disamb": true}]}, {"index": 54, "position": [284, 289], "orth": "hasta", "lexemes": [{"lemma": "hasta", "mstag": "ADP", "disamb": true}]}, {"index": 55, "position": [290, 297], "orth": "noventa", "lexemes": [{"lemma": "noventa", "mstag": "NUM", "disamb": true}]}, {"index": 56, "position": [298, 308], "orth": "tentáculos", "lexemes": [{"lemma": "tentáculo", "mstag": "NOUN", "disamb": true}]}, {"index": 57, "position": [309, 309], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 58, "position": [310, 313], "orth": "Los", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 59, "position": [314, 320], "orth": "huevos", "lexemes": [{"lemma": "huevo", "mstag": "NOUN", "disamb": true}]}, {"index": 60, "position": [321, 333], "orth": "fertilizados", "lexemes": [{"lemma": "fertilizado", "mstag": "ADJ", "disamb": true}]}, {"index": 61, "position": [334, 336], "orth": "se", "lexemes": [{"lemma": "él", "mstag": "PRON", "disamb": true}]}, {"index": 62, "position": [337, 348], "orth": "desarrollan", "lexemes": [{"lemma": "desarrollar", "mstag": "VERB", "disamb": true}]}, {"index": 63, "position": [349, 351], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 64, "position": [352, 354], "orth": "el", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 65, "position": [355, 363], "orth": "estómago", "lexemes": [{"lemma": "estómago", "mstag": "NOUN", "disamb": true}]}, {"index": 66, "position": [364, 365], "orth": "y", "lexemes": [{"lemma": "y", "mstag": "CCONJ", 
"disamb": true}]}, {"index": 67, "position": [366, 368], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 68, "position": [369, 378], "orth": "cavidades", "lexemes": [{"lemma": "cavidad", "mstag": "NOUN", "disamb": true}]}, {"index": 69, "position": [379, 381], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 70, "position": [382, 384], "orth": "la", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 71, "position": [385, 390], "orth": "larva", "lexemes": [{"lemma": "larva", "mstag": "NOUN", "disamb": true}]}, {"index": 72, "position": [391, 392], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "PUNCT", "disamb": true}]}, {"index": 73, "position": [393, 399], "orth": "plánula", "lexemes": [{"lemma": "plánular", "mstag": "VERB", "disamb": true}]}, {"index": 74, "position": [400, 400], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "PUNCT", "disamb": true}]}, {"index": 75, "position": [401, 401], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 76, "position": [402, 405], "orth": "Los", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 77, "position": [406, 412], "orth": "huevos", "lexemes": [{"lemma": "huevo", "mstag": "NOUN", "disamb": true}]}, {"index": 78, "position": [413, 427], "orth": "posteriormente", "lexemes": [{"lemma": "posteriormente", "mstag": "ADV", "disamb": true}]}, {"index": 79, "position": [428, 430], "orth": "se", "lexemes": [{"lemma": "él", "mstag": "PRON", "disamb": true}]}, {"index": 80, "position": [431, 438], "orth": "plantan", "lexemes": [{"lemma": "plantir", "mstag": "VERB", "disamb": true}]}, {"index": 81, "position": [439, 441], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 82, "position": [442, 444], "orth": "el", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 83, "position": [445, 450], "orth": "fondo", 
"lexemes": [{"lemma": "fondo", "mstag": "NOUN", "disamb": true}]}, {"index": 84, "position": [451, 454], "orth": "del", "lexemes": [{"lemma": "del", "mstag": "ADP", "disamb": true}]}, {"index": 85, "position": [455, 458], "orth": "mar", "lexemes": [{"lemma": "mar", "mstag": "NOUN", "disamb": true}]}, {"index": 86, "position": [459, 461], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 87, "position": [462, 470], "orth": "colonias", "lexemes": [{"lemma": "colonia", "mstag": "NOUN", "disamb": true}]}, {"index": 88, "position": [471, 473], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 89, "position": [474, 481], "orth": "pólipos", "lexemes": [{"lemma": "pólipo", "mstag": "NOUN", "disamb": true}]}, {"index": 90, "position": [482, 482], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 91, "position": [483, 485], "orth": "La", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 92, "position": [486, 492], "orth": "medusa", "lexemes": [{"lemma": "medusa", "mstag": "NOUN", "disamb": true}]}, {"index": 93, "position": [493, 499], "orth": "incuba", "lexemes": [{"lemma": "incubar", "mstag": "VERB", "disamb": true}]}, {"index": 94, "position": [500, 507], "orth": "después", "lexemes": [{"lemma": "después", "mstag": "ADV", "disamb": true}]}, {"index": 95, "position": [508, 510], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 96, "position": [511, 514], "orth": "dos", "lexemes": [{"lemma": "dos", "mstag": "NUM", "disamb": true}]}, {"index": 97, "position": [515, 519], "orth": "días", "lexemes": [{"lemma": "día", "mstag": "NOUN", "disamb": true}]}, {"index": 98, "position": [520, 520], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 99, "position": [521, 526], "orth": "Llega", "lexemes": [{"lemma": "llegar", "mstag": "VERB", "disamb": true}]}, {"index": 100, 
"position": [527, 528], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 101, "position": [529, 532], "orth": "ser", "lexemes": [{"lemma": "ser", "mstag": "AUX", "disamb": true}]}, {"index": 102, "position": [533, 539], "orth": "madura", "lexemes": [{"lemma": "madura", "mstag": "NOUN", "disamb": true}]}, {"index": 103, "position": [540, 551], "orth": "sexualmente", "lexemes": [{"lemma": "sexualmente", "mstag": "ADV", "disamb": true}]}, {"index": 104, "position": [552, 559], "orth": "después", "lexemes": [{"lemma": "después", "mstag": "ADV", "disamb": true}]}, {"index": 105, "position": [560, 562], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 106, "position": [563, 568], "orth": "pocas", "lexemes": [{"lemma": "poco", "mstag": "DET", "disamb": true}]}, {"index": 107, "position": [569, 576], "orth": "semanas", "lexemes": [{"lemma": "semana", "mstag": "NOUN", "disamb": true}]}, {"index": 108, "position": [577, 578], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "PUNCT", "disamb": true}]}, {"index": 109, "position": [579, 580], "orth": "su", "lexemes": [{"lemma": "su", "mstag": "DET", "disamb": true}]}, {"index": 110, "position": [581, 589], "orth": "duración", "lexemes": [{"lemma": "duración", "mstag": "NOUN", "disamb": true}]}, {"index": 111, "position": [590, 596], "orth": "exacta", "lexemes": [{"lemma": "exacto", "mstag": "ADJ", "disamb": true}]}, {"index": 112, "position": [597, 604], "orth": "depende", "lexemes": [{"lemma": "depender", "mstag": "VERB", "disamb": true}]}, {"index": 113, "position": [605, 607], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 114, "position": [608, 610], "orth": "la", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 115, "position": [611, 622], "orth": "temperatura", "lexemes": [{"lemma": "temperatura", "mstag": "NOUN", "disamb": true}]}, {"index": 116, "position": [623, 625], "orth": "de", 
"lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 117, "position": [626, 629], "orth": "las", "lexemes": [{"lemma": "el", "mstag": "DET", "disamb": true}]}, {"index": 118, "position": [630, 635], "orth": "aguas", "lexemes": [{"lemma": "agua", "mstag": "NOUN", "disamb": true}]}, {"index": 119, "position": [636, 636], "orth": ":", "lexemes": [{"lemma": ":", "mstag": "PUNCT", "disamb": true}]}, {"index": 120, "position": [637, 638], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 121, "position": [639, 641], "orth": "20", "lexemes": [{"lemma": "20", "mstag": "NUM", "disamb": true}]}, {"index": 122, "position": [642, 644], "orth": "°C", "lexemes": [{"lemma": "°C", "mstag": "PROPN", "disamb": true}]}, {"index": 123, "position": [645, 650], "orth": "entre", "lexemes": [{"lemma": "entre", "mstag": "ADP", "disamb": true}]}, {"index": 124, "position": [651, 662], "orth": "veinticinco", "lexemes": [{"lemma": "veinticinco", "mstag": "NUM", "disamb": true}]}, {"index": 125, "position": [663, 664], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 126, "position": [665, 672], "orth": "treinta", "lexemes": [{"lemma": "treinta", "mstag": "NUM", "disamb": true}]}, {"index": 127, "position": [673, 677], "orth": "días", "lexemes": [{"lemma": "día", "mstag": "NOUN", "disamb": true}]}, {"index": 128, "position": [678, 679], "orth": "y", "lexemes": [{"lemma": "y", "mstag": "CCONJ", "disamb": true}]}, {"index": 129, "position": [680, 681], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 130, "position": [682, 684], "orth": "22", "lexemes": [{"lemma": "22", "mstag": "NUM", "disamb": true}]}, {"index": 131, "position": [685, 687], "orth": "°C", "lexemes": [{"lemma": "°C", "mstag": "PROPN", "disamb": true}]}, {"index": 132, "position": [688, 690], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 133, "position": [691, 
700], "orth": "dieciocho", "lexemes": [{"lemma": "dieciocho", "mstag": "NUM", "disamb": true}]}, {"index": 134, "position": [701, 702], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 135, "position": [703, 712], "orth": "veintidós", "lexemes": [{"lemma": "veintidós", "mstag": "DET", "disamb": true}]}, {"index": 136, "position": [713, 717], "orth": "días", "lexemes": [{"lemma": "día", "mstag": "NOUN", "disamb": true}]}, {"index": 137, "position": [718, 718], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "PUNCT", "disamb": true}]}, {"index": 138, "position": [719, 719], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 139, "position": [720, 720], "orth": "\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "Tiene un diámetro de 4-5 mm. Su figura es alta y acampanada con paredes finas y uniformes. Su gran estómago (cavidad gastrovascular), rojo vivo, tiene forma cruciforme en su corte transversal. Los especímenes jóvenes tienen ocho tentáculos en el borde pero los adultos llegan a tener hasta noventa tentáculos. Los huevos fertilizados se desarrollan en el estómago y en cavidades de la larva (plánula). Los huevos posteriormente se plantan en el fondo del mar en colonias de pólipos. La medusa incuba después de dos días. Llega a ser madura sexualmente después de pocas semanas (su duración exacta depende de la temperatura de las aguas: a 20 °C entre veinticinco a treinta días y a 22 °C de dieciocho a veintidós días).\n"}
diff --git a/tests/testdata/expected/turritopsis_nutricula_fr.json b/tests/testdata/expected/turritopsis_nutricula_fr.json
new file mode 100644
index 0000000..152444e
--- /dev/null
+++ b/tests/testdata/expected/turritopsis_nutricula_fr.json
@@ -0,0 +1 @@
+{"filename": "df1f4f2c-a675-4c7b-be4f-3e751f969153", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 5], "orth": "Cette", "lexemes": [{"lemma": "ce", "mstag": "DET", "disamb": true}]}, {"index": 2, "position": [6, 12], "orth": "méduse", "lexemes": [{"lemma": "méduse", "mstag": "NOUN", "disamb": true}]}, {"index": 3, "position": [13, 16], "orth": "est", "lexemes": [{"lemma": "être", "mstag": "AUX", "disamb": true}]}, {"index": 4, "position": [17, 24], "orth": "capable", "lexemes": [{"lemma": "capable", "mstag": "ADJ", "disamb": true}]}, {"index": 5, "position": [25, 27], "orth": "d’", "lexemes": [{"lemma": "d’", "mstag": "ADV", "disamb": true}]}, {"index": 6, "position": [28, 35], "orth": "inverser", "lexemes": [{"lemma": "inverser", "mstag": "VERB", "disamb": true}]}, {"index": 7, "position": [36, 39], "orth": "son", "lexemes": [{"lemma": "son", "mstag": "DET", "disamb": true}]}, {"index": 8, "position": [40, 49], "orth": "processus", "lexemes": [{"lemma": "processus", "mstag": "NOUN", "disamb": true}]}, {"index": 9, "position": [50, 52], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 10, "position": [53, 67], "orth": "vieillissement", "lexemes": [{"lemma": "vieillissement", "mstag": "NOUN", "disamb": true}]}, {"index": 11, "position": [68, 69], "orth": "a", "lexemes": [{"lemma": "avoir", "mstag": "AUX", "disamb": true}]}, {"index": 12, "position": [70, 76], "orth": "priori", "lexemes": [{"lemma": "priori", "mstag": "X", "disamb": true}]}, {"index": 13, "position": [77, 89], "orth": "indéfiniment", "lexemes": [{"lemma": "indéfiniment", "mstag": "ADV", "disamb": true}]}, {"index": 14, "position": [90, 90], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 15, "position": [91, 96], "orth": "grâce", "lexemes": [{"lemma": "grâce", "mstag": "NOUN", "disamb": true}]}, {"index": 16, "position": [97, 99], "orth": "au", "lexemes": [{"lemma": "au", "mstag": "DET", "disamb": true}]}, 
{"index": 17, "position": [100, 109], "orth": "processus", "lexemes": [{"lemma": "processus", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [110, 112], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 19, "position": [113, 133], "orth": "transdifférenciation", "lexemes": [{"lemma": "transdifférenciation", "mstag": "NOUN", "disamb": true}]}, {"index": 20, "position": [134, 134], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 21, "position": [135, 142], "orth": "Presque", "lexemes": [{"lemma": "presque", "mstag": "ADV", "disamb": true}]}, {"index": 22, "position": [143, 149], "orth": "toutes", "lexemes": [{"lemma": "tout", "mstag": "ADJ", "disamb": true}]}, {"index": 23, "position": [150, 153], "orth": "les", "lexemes": [{"lemma": "le", "mstag": "DET", "disamb": true}]}, {"index": 24, "position": [154, 161], "orth": "méduses", "lexemes": [{"lemma": "méduse", "mstag": "NOUN", "disamb": true}]}, {"index": 25, "position": [162, 169], "orth": "peuvent", "lexemes": [{"lemma": "pouvoir", "mstag": "VERB", "disamb": true}]}, {"index": 26, "position": [170, 172], "orth": "se", "lexemes": [{"lemma": "se", "mstag": "PRON", "disamb": true}]}, {"index": 27, "position": [173, 183], "orth": "multiplier", "lexemes": [{"lemma": "multiplier", "mstag": "VERB", "disamb": true}]}, {"index": 28, "position": [184, 187], "orth": "par", "lexemes": [{"lemma": "par", "mstag": "ADP", "disamb": true}]}, {"index": 29, "position": [188, 195], "orth": "clonage", "lexemes": [{"lemma": "clonage", "mstag": "NOUN", "disamb": true}]}, {"index": 30, "position": [196, 203], "orth": "pendant", "lexemes": [{"lemma": "pendant", "mstag": "ADP", "disamb": true}]}, {"index": 31, "position": [204, 208], "orth": "leur", "lexemes": [{"lemma": "leur", "mstag": "DET", "disamb": true}]}, {"index": 32, "position": [209, 214], "orth": "stade", "lexemes": [{"lemma": "stade", "mstag": "NOUN", "disamb": true}]}, {"index": 33, 
"position": [215, 221], "orth": "polype", "lexemes": [{"lemma": "polyp", "mstag": "ADJ", "disamb": true}]}, {"index": 34, "position": [222, 223], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "PUNCT", "disamb": true}]}, {"index": 35, "position": [224, 226], "orth": "tel", "lexemes": [{"lemma": "tel", "mstag": "ADJ", "disamb": true}]}, {"index": 36, "position": [227, 229], "orth": "le", "lexemes": [{"lemma": "le", "mstag": "DET", "disamb": true}]}, {"index": 37, "position": [230, 244], "orth": "bourgeonnement", "lexemes": [{"lemma": "bourgeonnement", "mstag": "NOUN", "disamb": true}]}, {"index": 38, "position": [245, 248], "orth": "des", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 39, "position": [249, 261], "orth": "hydrozoaires", "lexemes": [{"lemma": "hydrozoaire", "mstag": "NOUN", "disamb": true}]}, {"index": 40, "position": [262, 262], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "PUNCT", "disamb": true}]}, {"index": 41, "position": [263, 263], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 42, "position": [264, 266], "orth": "ce", "lexemes": [{"lemma": "ce", "mstag": "PRON", "disamb": true}]}, {"index": 43, "position": [267, 270], "orth": "qui", "lexemes": [{"lemma": "qui", "mstag": "PRON", "disamb": true}]}, {"index": 44, "position": [271, 274], "orth": "est", "lexemes": [{"lemma": "être", "mstag": "AUX", "disamb": true}]}, {"index": 45, "position": [275, 280], "orth": "aussi", "lexemes": [{"lemma": "aussi", "mstag": "ADV", "disamb": true}]}, {"index": 46, "position": [281, 284], "orth": "une", "lexemes": [{"lemma": "un", "mstag": "DET", "disamb": true}]}, {"index": 47, "position": [285, 293], "orth": "certaine", "lexemes": [{"lemma": "certaine", "mstag": "NOUN", "disamb": true}]}, {"index": 48, "position": [294, 299], "orth": "forme", "lexemes": [{"lemma": "form", "mstag": "ADJ", "disamb": true}]}, {"index": 49, "position": [300, 302], "orth": "d'", "lexemes": [{"lemma": "de", 
"mstag": "ADP", "disamb": true}]}, {"index": 50, "position": [303, 313], "orth": "immortalité", "lexemes": [{"lemma": "immortalité", "mstag": "NOUN", "disamb": true}]}, {"index": 51, "position": [314, 314], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 52, "position": [315, 317], "orth": "Du", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [318, 322], "orth": "fait", "lexemes": [{"lemma": "fait", "mstag": "NOUN", "disamb": true}]}, {"index": 54, "position": [323, 325], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 55, "position": [326, 329], "orth": "ses", "lexemes": [{"lemma": "son", "mstag": "DET", "disamb": true}]}, {"index": 56, "position": [330, 346], "orth": "caractéristiques", "lexemes": [{"lemma": "caractéristique", "mstag": "NOUN", "disamb": true}]}, {"index": 57, "position": [347, 362], "orth": "exceptionnelles", "lexemes": [{"lemma": "exceptionnel", "mstag": "ADJ", "disamb": true}]}, {"index": 58, "position": [363, 363], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 59, "position": [364, 368], "orth": "elle", "lexemes": [{"lemma": "lui", "mstag": "PRON", "disamb": true}]}, {"index": 60, "position": [369, 373], "orth": "fait", "lexemes": [{"lemma": "faire", "mstag": "VERB", "disamb": true}]}, {"index": 61, "position": [374, 376], "orth": "l’", "lexemes": [{"lemma": "l’", "mstag": "SPACE", "disamb": true}]}, {"index": 62, "position": [377, 381], "orth": "objet", "lexemes": [{"lemma": "objet", "mstag": "ADJ", "disamb": true}]}, {"index": 63, "position": [382, 384], "orth": "d'", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 64, "position": [385, 390], "orth": "études", "lexemes": [{"lemma": "étude", "mstag": "NOUN", "disamb": true}]}, {"index": 65, "position": [391, 394], "orth": "par", "lexemes": [{"lemma": "par", "mstag": "ADP", "disamb": true}]}, {"index": 66, 
"position": [395, 398], "orth": "les", "lexemes": [{"lemma": "le", "mstag": "DET", "disamb": true}]}, {"index": 67, "position": [399, 410], "orth": "biologistes", "lexemes": [{"lemma": "biologiste", "mstag": "NOUN", "disamb": true}]}, {"index": 68, "position": [411, 413], "orth": "et", "lexemes": [{"lemma": "et", "mstag": "CCONJ", "disamb": true}]}, {"index": 69, "position": [414, 417], "orth": "les", "lexemes": [{"lemma": "le", "mstag": "DET", "disamb": true}]}, {"index": 70, "position": [418, 429], "orth": "généticiens", "lexemes": [{"lemma": "généticien", "mstag": "NOUN", "disamb": true}]}, {"index": 71, "position": [430, 430], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 72, "position": [431, 433], "orth": "Le", "lexemes": [{"lemma": "le", "mstag": "DET", "disamb": true}]}, {"index": 73, "position": [434, 443], "orth": "chercheur", "lexemes": [{"lemma": "chercheur", "mstag": "NOUN", "disamb": true}]}, {"index": 74, "position": [444, 452], "orth": "japonais", "lexemes": [{"lemma": "japoner", "mstag": "VERB", "disamb": true}]}, {"index": 75, "position": [453, 457], "orth": "Shin", "lexemes": [{"lemma": "Shin", "mstag": "PROPN", "disamb": true}]}, {"index": 76, "position": [458, 464], "orth": "Kubota", "lexemes": [{"lemma": "Kubota", "mstag": "PROPN", "disamb": true}]}, {"index": 77, "position": [465, 466], "orth": "a", "lexemes": [{"lemma": "avoir", "mstag": "VERB", "disamb": true}]}, {"index": 78, "position": [467, 469], "orth": "d'", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 79, "position": [470, 477], "orth": "ailleurs", "lexemes": [{"lemma": "ailleurs", "mstag": "ADV", "disamb": true}]}, {"index": 80, "position": [478, 485], "orth": "observé", "lexemes": [{"lemma": "observer", "mstag": "VERB", "disamb": true}]}, {"index": 81, "position": [486, 488], "orth": "en", "lexemes": [{"lemma": "en", "mstag": "ADP", "disamb": true}]}, {"index": 82, "position": [489, 493], "orth": "2011", 
"lexemes": [{"lemma": "2011", "mstag": "NUM", "disamb": true}]}, {"index": 83, "position": [494, 496], "orth": "ce", "lexemes": [{"lemma": "ce", "mstag": "DET", "disamb": true}]}, {"index": 84, "position": [497, 511], "orth": "rajeunissement", "lexemes": [{"lemma": "rajeunissement", "mstag": "NOUN", "disamb": true}]}, {"index": 85, "position": [512, 513], "orth": "à", "lexemes": [{"lemma": "à", "mstag": "ADP", "disamb": true}]}, {"index": 86, "position": [514, 517], "orth": "une", "lexemes": [{"lemma": "un", "mstag": "DET", "disamb": true}]}, {"index": 87, "position": [518, 525], "orth": "dizaine", "lexemes": [{"lemma": "dizaine", "mstag": "NOUN", "disamb": true}]}, {"index": 88, "position": [526, 528], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 89, "position": [529, 537], "orth": "reprises", "lexemes": [{"lemma": "reprise", "mstag": "NOUN", "disamb": true}]}, {"index": 90, "position": [538, 538], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}], "text": "Cette méduse est capable d’inverser son processus de vieillissement a priori indéfiniment, grâce au processus de transdifférenciation. Presque toutes les méduses peuvent se multiplier par clonage pendant leur stade polype (tel le bourgeonnement des hydrozoaires), ce qui est aussi une certaine forme d'immortalité. Du fait de ses caractéristiques exceptionnelles, elle fait l’objet d'études par les biologistes et les généticiens. Le chercheur japonais Shin Kubota a d'ailleurs observé en 2011 ce rajeunissement à une dizaine de reprises."}
diff --git a/tests/testdata/expected/turritopsis_nutricula_pt.json b/tests/testdata/expected/turritopsis_nutricula_pt.json
new file mode 100644
index 0000000..7c5c259
--- /dev/null
+++ b/tests/testdata/expected/turritopsis_nutricula_pt.json
@@ -0,0 +1 @@
+{"filename": "54553d40-690a-4faf-9dbf-2c9476412e02", "tagset": "ud", "tokens": [{"index": 1, "position": [0, 11], "orth": "Turritopsis", "lexemes": [{"lemma": "Turritopsis", "mstag": "NOUN", "disamb": true}]}, {"index": 2, "position": [12, 21], "orth": "nutricula", "lexemes": [{"lemma": "nutriculo", "mstag": "PROPN", "disamb": true}]}, {"index": 3, "position": [22, 23], "orth": "é", "lexemes": [{"lemma": "ser", "mstag": "AUX", "disamb": true}]}, {"index": 4, "position": [24, 26], "orth": "um", "lexemes": [{"lemma": "um", "mstag": "DET", "disamb": true}]}, {"index": 5, "position": [27, 38], "orth": "hidrozoário", "lexemes": [{"lemma": "hidrozoário", "mstag": "NOUN", "disamb": true}]}, {"index": 6, "position": [39, 39], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 7, "position": [40, 42], "orth": "As", "lexemes": [{"lemma": "o", "mstag": "DET", "disamb": true}]}, {"index": 8, "position": [43, 44], "orth": "\"", "lexemes": [{"lemma": "\"", "mstag": "SPACE", "disamb": true}]}, {"index": 9, "position": [45, 55], "orth": "águas-vivas", "lexemes": [{"lemma": "águas-viva", "mstag": "NOUN", "disamb": true}]}, {"index": 10, "position": [56, 64], "orth": "imortais", "lexemes": [{"lemma": "imortal", "mstag": "ADJ", "disamb": true}]}, {"index": 11, "position": [65, 65], "orth": "\"", "lexemes": [{"lemma": "\"", "mstag": "SPACE", "disamb": true}]}, {"index": 12, "position": [66, 66], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 13, "position": [67, 70], "orth": "com", "lexemes": [{"lemma": "com", "mstag": "ADP", "disamb": true}]}, {"index": 14, "position": [71, 73], "orth": "um", "lexemes": [{"lemma": "um", "mstag": "DET", "disamb": true}]}, {"index": 15, "position": [74, 79], "orth": "ciclo", "lexemes": [{"lemma": "ciclo", "mstag": "NOUN", "disamb": true}]}, {"index": 16, "position": [80, 82], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 17, "position": 
[83, 87], "orth": "vida", "lexemes": [{"lemma": "vida", "mstag": "NOUN", "disamb": true}]}, {"index": 18, "position": [88, 90], "orth": "no", "lexemes": [{"lemma": "em o", "mstag": "ADP", "disamb": true}]}, {"index": 19, "position": [91, 95], "orth": "qual", "lexemes": [{"lemma": "qual", "mstag": "PRON", "disamb": true}]}, {"index": 20, "position": [96, 106], "orth": "reverte-se", "lexemes": [{"lemma": "reverte-se", "mstag": "VERB", "disamb": true}]}, {"index": 21, "position": [107, 109], "orth": "ao", "lexemes": [{"lemma": "a o", "mstag": "ADP", "disamb": true}]}, {"index": 22, "position": [110, 117], "orth": "estágio", "lexemes": [{"lemma": "estágio", "mstag": "NOUN", "disamb": true}]}, {"index": 23, "position": [118, 120], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 24, "position": [121, 127], "orth": "pólipo", "lexemes": [{"lemma": "pólipo", "mstag": "NOUN", "disamb": true}]}, {"index": 25, "position": [128, 132], "orth": "após", "lexemes": [{"lemma": "após", "mstag": "SCONJ", "disamb": true}]}, {"index": 26, "position": [133, 142], "orth": "chegar-se", "lexemes": [{"lemma": "chegar se", "mstag": "VERB", "disamb": true}]}, {"index": 27, "position": [143, 144], "orth": "a", "lexemes": [{"lemma": "o", "mstag": "DET", "disamb": true}]}, {"index": 28, "position": [145, 155], "orth": "maturidade", "lexemes": [{"lemma": "maturidade", "mstag": "NOUN", "disamb": true}]}, {"index": 29, "position": [156, 162], "orth": "sexual", "lexemes": [{"lemma": "sexual", "mstag": "ADJ", "disamb": true}]}, {"index": 30, "position": [163, 163], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 31, "position": [164, 167], "orth": "são", "lexemes": [{"lemma": "ser", "mstag": "AUX", "disamb": true}]}, {"index": 32, "position": [168, 169], "orth": "o", "lexemes": [{"lemma": "o", "mstag": "DET", "disamb": true}]}, {"index": 33, "position": [170, 175], "orth": "único", "lexemes": [{"lemma": "único", "mstag": 
"ADJ", "disamb": true}]}, {"index": 34, "position": [176, 180], "orth": "caso", "lexemes": [{"lemma": "caso", "mstag": "NOUN", "disamb": true}]}, {"index": 35, "position": [181, 190], "orth": "conhecido", "lexemes": [{"lemma": "conhecer", "mstag": "VERB", "disamb": true}]}, {"index": 36, "position": [191, 193], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 37, "position": [194, 196], "orth": "um", "lexemes": [{"lemma": "um", "mstag": "DET", "disamb": true}]}, {"index": 38, "position": [197, 203], "orth": "animal", "lexemes": [{"lemma": "animal", "mstag": "NOUN", "disamb": true}]}, {"index": 39, "position": [204, 212], "orth": "ferfóide", "lexemes": [{"lemma": "ferfóide", "mstag": "ADJ", "disamb": true}]}, {"index": 40, "position": [213, 214], "orth": "(", "lexemes": [{"lemma": "(", "mstag": "PUNCT", "disamb": true}]}, {"index": 41, "position": [215, 221], "orth": "exemplo", "lexemes": [{"lemma": "exemplo", "mstag": "NOUN", "disamb": true}]}, {"index": 42, "position": [222, 222], "orth": ":", "lexemes": [{"lemma": ":", "mstag": "PUNCT", "disamb": true}]}, {"index": 43, "position": [223, 234], "orth": "Peixe-Ferfa", "lexemes": [{"lemma": "Peixe-Ferfa", "mstag": "NOUN", "disamb": true}]}, {"index": 44, "position": [235, 235], "orth": ")", "lexemes": [{"lemma": ")", "mstag": "PUNCT", "disamb": true}]}, {"index": 45, "position": [236, 241], "orth": "capaz", "lexemes": [{"lemma": "capaz", "mstag": "ADJ", "disamb": true}]}, {"index": 46, "position": [242, 244], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "SCONJ", "disamb": true}]}, {"index": 47, "position": [245, 253], "orth": "reverter", "lexemes": [{"lemma": "reverter", "mstag": "VERB", "disamb": true}]}, {"index": 48, "position": [254, 267], "orth": "completamente", "lexemes": [{"lemma": "completamente", "mstag": "ADV", "disamb": true}]}, {"index": 49, "position": [268, 269], "orth": "a", "lexemes": [{"lemma": "a", "mstag": "ADP", "disamb": true}]}, {"index": 50, 
"position": [270, 272], "orth": "um", "lexemes": [{"lemma": "um", "mstag": "DET", "disamb": true}]}, {"index": 51, "position": [273, 280], "orth": "estágio", "lexemes": [{"lemma": "estágio", "mstag": "NOUN", "disamb": true}]}, {"index": 52, "position": [281, 283], "orth": "de", "lexemes": [{"lemma": "de", "mstag": "ADP", "disamb": true}]}, {"index": 53, "position": [284, 295], "orth": "imaturidade", "lexemes": [{"lemma": "imaturidade", "mstag": "NOUN", "disamb": true}]}, {"index": 54, "position": [296, 302], "orth": "sexual", "lexemes": [{"lemma": "sexual", "mstag": "ADJ", "disamb": true}]}, {"index": 55, "position": [303, 303], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 56, "position": [304, 308], "orth": "Elas", "lexemes": [{"lemma": "elas", "mstag": "PRON", "disamb": true}]}, {"index": 57, "position": [309, 316], "orth": "estavam", "lexemes": [{"lemma": "estar", "mstag": "AUX", "disamb": true}]}, {"index": 58, "position": [317, 330], "orth": "anteriormente", "lexemes": [{"lemma": "anteriormente", "mstag": "ADV", "disamb": true}]}, {"index": 59, "position": [331, 344], "orth": "classificadas", "lexemes": [{"lemma": "classificar", "mstag": "VERB", "disamb": true}]}, {"index": 60, "position": [345, 350], "orth": "nessa", "lexemes": [{"lemma": "em esse", "mstag": "ADP", "disamb": true}]}, {"index": 61, "position": [351, 358], "orth": "espécie", "lexemes": [{"lemma": "espécie", "mstag": "NOUN", "disamb": true}]}, {"index": 62, "position": [359, 359], "orth": ",", "lexemes": [{"lemma": ",", "mstag": "PUNCT", "disamb": true}]}, {"index": 63, "position": [360, 361], "orth": "e", "lexemes": [{"lemma": "e", "mstag": "CCONJ", "disamb": true}]}, {"index": 64, "position": [362, 365], "orth": "são", "lexemes": [{"lemma": "ser", "mstag": "AUX", "disamb": true}]}, {"index": 65, "position": [366, 379], "orth": "classificadas", "lexemes": [{"lemma": "classificar", "mstag": "VERB", "disamb": true}]}, {"index": 66, "position": [380, 390], 
"orth": "atualmente", "lexemes": [{"lemma": "atualmente", "mstag": "ADV", "disamb": true}]}, {"index": 67, "position": [391, 395], "orth": "como", "lexemes": [{"lemma": "como", "mstag": "ADP", "disamb": true}]}, {"index": 68, "position": [396, 407], "orth": "Turritopsis", "lexemes": [{"lemma": "Turritopsi", "mstag": "NOUN", "disamb": true}]}, {"index": 69, "position": [408, 415], "orth": "dohrnii", "lexemes": [{"lemma": "dohrnii", "mstag": "NOUN", "disamb": true}]}, {"index": 70, "position": [416, 416], "orth": ".", "lexemes": [{"lemma": ".", "mstag": "PUNCT", "disamb": true}]}, {"index": 71, "position": [417, 417], "orth": "\n", "lexemes": [{"lemma": "\n", "mstag": "SPACE", "disamb": true}]}], "text": "Turritopsis nutricula é um hidrozoário. As \"águas-vivas imortais\", com um ciclo de vida no qual reverte-se ao estágio de pólipo após chegar-se a maturidade sexual, são o único caso conhecido de um animal ferfóide (exemplo: Peixe-Ferfa) capaz de reverter completamente a um estágio de imaturidade sexual. Elas estavam anteriormente classificadas nessa espécie, e são classificadas atualmente como Turritopsis dohrnii.\n"}
diff --git a/tests/testdata/input/elasmucha_grisea_ru.txt b/tests/testdata/input/elasmucha_grisea_ru.txt
new file mode 100644
index 0000000..25b41ea
--- /dev/null
+++ b/tests/testdata/input/elasmucha_grisea_ru.txt
@@ -0,0 +1 @@
+Для вида зафиксирована материнская забота (поведение по охране яиц и личинок-нимф). После яйцекладки самка стоит над кладкой яиц и защищает её на протяжении всего развития яиц. Репертуар защитного поведения самок, стоящих над пакетом яиц, включает в себя взмахи крыльями, подёргивание тела, наклон в сторону врага и, наконец, выделение неприятных запахов из ароматических желёз. Иногда самки совместно охраняют свои кладки бок о бок на одном и том же листе берёзы, что увеличивает шансы потомства на выживание. Эти насекомые служат примером самых ранних стадий эусоциальности.
diff --git a/tests/testdata/input/turritopsis_nutricula_de.txt b/tests/testdata/input/turritopsis_nutricula_de.txt
new file mode 100644
index 0000000..47b0595
--- /dev/null
+++ b/tests/testdata/input/turritopsis_nutricula_de.txt
@@ -0,0 +1 @@
+Turritopsis nutricula bildet aufrechte, verzweigte Hydroiden-Kolonien. Die Polypenköpfchen (Hydranthen) sind spindel- bis keulenförmig, ihre fadenförmigen Tentakel sind unregelmäßig über den Hydranthenkörper verteilt. Die Hülle (Periderm) um den Stiel (Hydrocaulus) ist zweilagig. Die Medusenknospen (Gonophoren) entwickeln sich auf den Stielen in einer mit Perisarc eingehüllten Region.
diff --git a/tests/testdata/input/turritopsis_nutricula_es.txt b/tests/testdata/input/turritopsis_nutricula_es.txt
new file mode 100644
index 0000000..9f6898e
--- /dev/null
+++ b/tests/testdata/input/turritopsis_nutricula_es.txt
@@ -0,0 +1 @@
+Tiene un diámetro de 4-5 mm. Su figura es alta y acampanada con paredes finas y uniformes. Su gran estómago (cavidad gastrovascular), rojo vivo, tiene forma cruciforme en su corte transversal. Los especímenes jóvenes tienen ocho tentáculos en el borde pero los adultos llegan a tener hasta noventa tentáculos. Los huevos fertilizados se desarrollan en el estómago y en cavidades de la larva (plánula). Los huevos posteriormente se plantan en el fondo del mar en colonias de pólipos. La medusa incuba después de dos días. Llega a ser madura sexualmente después de pocas semanas (su duración exacta depende de la temperatura de las aguas: a 20 °C entre veinticinco a treinta días y a 22 °C de dieciocho a veintidós días).
diff --git a/tests/testdata/input/turritopsis_nutricula_fr.txt b/tests/testdata/input/turritopsis_nutricula_fr.txt
new file mode 100644
index 0000000..0896689
--- /dev/null
+++ b/tests/testdata/input/turritopsis_nutricula_fr.txt
@@ -0,0 +1 @@
+Cette méduse est capable d’inverser son processus de vieillissement a priori indéfiniment, grâce au processus de transdifférenciation. Presque toutes les méduses peuvent se multiplier par clonage pendant leur stade polype (tel le bourgeonnement des hydrozoaires), ce qui est aussi une certaine forme d'immortalité. Du fait de ses caractéristiques exceptionnelles, elle fait l’objet d'études par les biologistes et les généticiens. Le chercheur japonais Shin Kubota a d'ailleurs observé en 2011 ce rajeunissement à une dizaine de reprises.
\ No newline at end of file
diff --git a/tests/testdata/input/turritopsis_nutricula_pt.txt b/tests/testdata/input/turritopsis_nutricula_pt.txt
new file mode 100644
index 0000000..bc296fe
--- /dev/null
+++ b/tests/testdata/input/turritopsis_nutricula_pt.txt
@@ -0,0 +1 @@
+Turritopsis nutricula é um hidrozoário. As "águas-vivas imortais", com um ciclo de vida no qual reverte-se ao estágio de pólipo após chegar-se a maturidade sexual, são o único caso conhecido de um animal ferfóide (exemplo: Peixe-Ferfa) capaz de reverter completamente a um estágio de imaturidade sexual. Elas estavam anteriormente classificadas nessa espécie, e são classificadas atualmente como Turritopsis dohrnii.
-- 
GitLab


From 157692aa7704edc937a808de04f4329f99870c84 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 6 Apr 2023 13:14:29 +0200
Subject: [PATCH 07/90] Run subtasks in parallel

---
 src/tagger.py |  6 +++++-
 src/utils.py  | 36 ++++++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/src/tagger.py b/src/tagger.py
index 04b2d7c..f6709f1 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -75,6 +75,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
         json_text: bool if json output should contain original
         text (default = True)
         method: method of processing (default = 'tagger', values: tagger, ner)
+        parallel_subtasks: number of parallel subtasks (default = 1)
         :type task_options: dict
 
         :param output_path: Path to directory where the
@@ -102,6 +103,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         json_text = task_options.get("json_text", True)
 
+        parallel_subtasks = task_options.get("parallel_subtasks", 1)
+
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
         convert_lpmn = self.get_converter_directive(
@@ -149,7 +152,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
                     destination_path,
                     splitted_corpus,
                     json_lpmn,
-                    _log
+                    _log,
+                    parallel_subtasks
                 )
                 # remove tmp directory
                 if os.path.exists(destination_path):
diff --git a/src/utils.py b/src/utils.py
index 05227d7..9dfd07f 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -39,7 +39,7 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 
 def merge_splits(output_path: str, destination_path: str,
                  splitted_corpus: List[str], json_lpmn: str,
-                 _log: logging.Logger):
+                 _log: logging.Logger, parallel_subtasks: int = 1):
     """Merges splitted corpus into one file.
 
     :param output_path: path to output file
@@ -52,6 +52,8 @@ def merge_splits(output_path: str, destination_path: str,
     :type json_lpmn: str
     :param _log: logger
     :type _log: logging.Logger
+    :param parallel_subtasks: number of parallel subtasks (default: 1)
+    :type parallel_subtasks: int
     """
     # remove output file if exists
     if os.path.isfile(output_path):
@@ -59,21 +61,31 @@ def merge_splits(output_path: str, destination_path: str,
     # create output file
     with open(output_path, "a") as f2:
         # run tagger on each chunk
+        subtask_pool_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
-                f"Running chunk {dbg_i}: {chunk}"
+                f"Spawning task {dbg_i}: {chunk}"
             )
-            subtask = SubTask(
-                os.path.join(destination_path, chunk),
-                json_lpmn
+            subtask_pool_awaiting.append(
+                SubTask(
+                    os.path.join(destination_path, chunk),
+                    json_lpmn
+                )
             )
-            subtask.run(blocking=False)
-            l_result = subtask.get_output_path()
-            _log.debug(f"Result of chunk: {l_result}")
-
-            # merge results
-            with open(l_result, "r") as f:
-                f2.write(f"{f.read()}\n")
+        subtask_pool_running = []
+        while subtask_pool_awaiting:
+            for _ in range(parallel_subtasks):
+                if subtask_pool_awaiting:
+                    subtask_pool_running.append(
+                        subtask_pool_awaiting.pop(0)
+                    )
+                    subtask_pool_running[-1].run(blocking=False)
+            for subtask in subtask_pool_running:
+                l_result = subtask.get_output_path()
+                _log.debug(f"Result of chunk: {l_result}")
+                with open(l_result, "r") as f:
+                    f2.write(f"{f.read()}\n")
+            subtask_pool_running.clear()
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
-- 
GitLab


From e579d3b7951f89f5ca6cdd920a5dd7e39d0314d5 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 10:35:28 +0200
Subject: [PATCH 08/90] Run subtasks using pool

---
 src/utils.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 9dfd07f..4f2de0d 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
+from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -59,7 +60,7 @@ def merge_splits(output_path: str, destination_path: str,
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
-    with open(output_path, "a") as f2:
+    with open(output_path, "a") as f2, Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
         subtask_pool_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
@@ -72,20 +73,16 @@ def merge_splits(output_path: str, destination_path: str,
                     json_lpmn
                 )
             )
-        subtask_pool_running = []
-        while subtask_pool_awaiting:
-            for _ in range(parallel_subtasks):
-                if subtask_pool_awaiting:
-                    subtask_pool_running.append(
-                        subtask_pool_awaiting.pop(0)
-                    )
-                    subtask_pool_running[-1].run(blocking=False)
-            for subtask in subtask_pool_running:
-                l_result = subtask.get_output_path()
-                _log.debug(f"Result of chunk: {l_result}")
-                with open(l_result, "r") as f:
-                    f2.write(f"{f.read()}\n")
-            subtask_pool_running.clear()
+        def _run_subtask(subtask):
+            subtask.run(blocking=False)
+            return subtask.get_output_path()
+        
+        l_results = pool.map(_run_subtask, subtask_pool_awaiting)
+        
+        for l_result in l_results:
+            _log.debug(f"Result of chunk: {l_result}")
+            with open(l_result, "r") as f:
+                f2.write(f"{f.read()}\n")
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
-- 
GitLab


From 8f3884d4095be8e6a4f71144cd6613c72c7d6a8b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 10:38:21 +0200
Subject: [PATCH 09/90] Fix style

---
 src/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 4f2de0d..42e24a5 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -60,7 +60,8 @@ def merge_splits(output_path: str, destination_path: str,
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
-    with open(output_path, "a") as f2, Pool(processes=parallel_subtasks) as pool:
+    with open(output_path, "a") as f2, \
+         Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
         subtask_pool_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
@@ -73,12 +74,13 @@ def merge_splits(output_path: str, destination_path: str,
                     json_lpmn
                 )
             )
+
         def _run_subtask(subtask):
             subtask.run(blocking=False)
             return subtask.get_output_path()
-        
+
         l_results = pool.map(_run_subtask, subtask_pool_awaiting)
-        
+
         for l_result in l_results:
             _log.debug(f"Result of chunk: {l_result}")
             with open(l_result, "r") as f:
-- 
GitLab


From 1deeb700d1175a4384469d4cbdaeb15f371334d4 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 10:58:48 +0200
Subject: [PATCH 10/90] Move _run_subtask declaration

---
 src/utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 42e24a5..7aec2ed 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -38,6 +38,11 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
+def _run_subtask(subtask):
+    subtask.run(blocking=False)
+    return subtask.get_output_path()
+
+
 def merge_splits(output_path: str, destination_path: str,
                  splitted_corpus: List[str], json_lpmn: str,
                  _log: logging.Logger, parallel_subtasks: int = 1):
@@ -75,10 +80,6 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        def _run_subtask(subtask):
-            subtask.run(blocking=False)
-            return subtask.get_output_path()
-
         l_results = pool.map(_run_subtask, subtask_pool_awaiting)
 
         for l_result in l_results:
-- 
GitLab


From 0c851334830dc1551205c34008afd97f3bca7012 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 11:40:01 +0200
Subject: [PATCH 11/90] Add type annotation

---
 src/utils.py  | 2 +-
 tests/test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 7aec2ed..7b737b2 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -38,7 +38,7 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
-def _run_subtask(subtask):
+def _run_subtask(subtask: SubTask):
     subtask.run(blocking=False)
     return subtask.get_output_path()
 
diff --git a/tests/test.py b/tests/test.py
index 6edb025..e3dd1b1 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -262,7 +262,7 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_fr))
 
 
-def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
+def test_ru(mocker, worker, input_dir, input_file1_ru,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-- 
GitLab


From d13456c88d331e807ee4fe02959778061ff67ec7 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 13:40:35 +0200
Subject: [PATCH 12/90] Debug thread.lock pickling

---
 src/utils.py  | 16 +++++++++-------
 tests/test.py |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 7b737b2..25e2093 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool
+from multiprocessing import Pool, Queue
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -40,7 +40,8 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 
 def _run_subtask(subtask: SubTask):
     subtask.run(blocking=False)
-    return subtask.get_output_path()
+    return ()
+    # return subtask.get_output_path()
 
 
 def merge_splits(output_path: str, destination_path: str,
@@ -80,12 +81,13 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        l_results = pool.map(_run_subtask, subtask_pool_awaiting)
+        #l_results = pool.map(_run_subtask, subtask_pool_awaiting)
+        pool.map(_run_subtask, subtask_pool_awaiting)
 
-        for l_result in l_results:
-            _log.debug(f"Result of chunk: {l_result}")
-            with open(l_result, "r") as f:
-                f2.write(f"{f.read()}\n")
+        # for l_result in l_results:
+        #     _log.debug(f"Result of chunk: {l_result}")
+        #     with open(l_result, "r") as f:
+        #         f2.write(f"{f.read()}\n")
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
diff --git a/tests/test.py b/tests/test.py
index e3dd1b1..6edb025 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -262,7 +262,7 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_fr))
 
 
-def test_ru(mocker, worker, input_dir, input_file1_ru,
+def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
                         output_dir, expected_dir):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-- 
GitLab


From 893804e1057a9f210085a8112cc91f98bb4d9797 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 14:17:37 +0200
Subject: [PATCH 13/90] debug queue

---
 src/utils.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 25e2093..8f325ee 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -69,20 +69,22 @@ def merge_splits(output_path: str, destination_path: str,
     with open(output_path, "a") as f2, \
          Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
-        subtask_pool_awaiting = []
+        subtask_queue_awaiting = Queue()
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
             )
-            subtask_pool_awaiting.append(
+            subtask_queue_awaiting.put(
                 SubTask(
                     os.path.join(destination_path, chunk),
                     json_lpmn
                 )
             )
 
-        #l_results = pool.map(_run_subtask, subtask_pool_awaiting)
-        pool.map(_run_subtask, subtask_pool_awaiting)
+        print(subtask_queue_awaiting)
+
+        # l_results = pool.map(_run_subtask, subtask_pool_awaiting)
+        # pool.map(_run_subtask, subtask_pool_awaiting)
 
         # for l_result in l_results:
         #     _log.debug(f"Result of chunk: {l_result}")
-- 
GitLab


From 53f1079ebe31ca73dcd4af1cbc709a820616119f Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 14:20:53 +0200
Subject: [PATCH 14/90] fix unused variables

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 8f325ee..e289a10 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -81,7 +81,7 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        print(subtask_queue_awaiting)
+        print(f"{subtask_queue_awaiting} {f2} {pool}")
 
         # l_results = pool.map(_run_subtask, subtask_pool_awaiting)
         # pool.map(_run_subtask, subtask_pool_awaiting)
-- 
GitLab


From 9781e3cd28dd841e45cca91f0a9fbaa7a7867d1d Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 14:36:14 +0200
Subject: [PATCH 15/90] Return to original approach

---
 src/utils.py | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index e289a10..4e1e702 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool, Queue
+from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -38,12 +38,6 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
-def _run_subtask(subtask: SubTask):
-    subtask.run(blocking=False)
-    return ()
-    # return subtask.get_output_path()
-
-
 def merge_splits(output_path: str, destination_path: str,
                  splitted_corpus: List[str], json_lpmn: str,
                  _log: logging.Logger, parallel_subtasks: int = 1):
@@ -62,6 +56,10 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
+    def _run_subtask(subtask: SubTask):
+        subtask.run(blocking=False)
+        return subtask.get_output_path()
+
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
@@ -69,7 +67,7 @@ def merge_splits(output_path: str, destination_path: str,
     with open(output_path, "a") as f2, \
          Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
-        subtask_queue_awaiting = Queue()
+        subtask_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
@@ -81,15 +79,13 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        print(f"{subtask_queue_awaiting} {f2} {pool}")
-
-        # l_results = pool.map(_run_subtask, subtask_pool_awaiting)
-        # pool.map(_run_subtask, subtask_pool_awaiting)
+        l_results = pool.map(_run_subtask, subtask_queue_awaiting)
+        pool.map(_run_subtask, subtask_queue_awaiting)
 
-        # for l_result in l_results:
-        #     _log.debug(f"Result of chunk: {l_result}")
-        #     with open(l_result, "r") as f:
-        #         f2.write(f"{f.read()}\n")
+        for l_result in l_results:
+            _log.debug(f"Result of chunk: {l_result}")
+            with open(l_result, "r") as f:
+                f2.write(f"{f.read()}\n")
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
-- 
GitLab


From 37a394a0de69cf76a9f3ae5d402a8b6fe135e8c7 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 14:38:05 +0200
Subject: [PATCH 16/90] Quickfix

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 4e1e702..1deaabb 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -72,7 +72,7 @@ def merge_splits(output_path: str, destination_path: str,
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
             )
-            subtask_queue_awaiting.put(
+            subtask_queue_awaiting.append(
                 SubTask(
                     os.path.join(destination_path, chunk),
                     json_lpmn
-- 
GitLab


From ce3ec823534302386242fa506ee19e3b4cb92a20 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 24 Apr 2023 14:41:02 +0200
Subject: [PATCH 17/90] add global keyword

---
 src/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utils.py b/src/utils.py
index 1deaabb..c391501 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -56,6 +56,8 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
+    global _run_subtask
+
     def _run_subtask(subtask: SubTask):
         subtask.run(blocking=False)
         return subtask.get_output_path()
-- 
GitLab


From c5b524f57ab64cf29e24544d9382cddbc979851d Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 09:07:55 +0200
Subject: [PATCH 18/90] Debug pool.map

---
 src/utils.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index c391501..1442d37 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool
+from multiprocessing import Pool, Queue
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -56,9 +56,9 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    global _run_subtask
+    global run_subtask
 
-    def _run_subtask(subtask: SubTask):
+    def run_subtask(subtask: SubTask):
         subtask.run(blocking=False)
         return subtask.get_output_path()
 
@@ -69,20 +69,19 @@ def merge_splits(output_path: str, destination_path: str,
     with open(output_path, "a") as f2, \
          Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
-        subtask_queue_awaiting = []
+        subtask_queue_awaiting = Queue()
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
             )
-            subtask_queue_awaiting.append(
+            subtask_queue_awaiting.put(
                 SubTask(
                     os.path.join(destination_path, chunk),
                     json_lpmn
                 )
             )
-
-        l_results = pool.map(_run_subtask, subtask_queue_awaiting)
-        pool.map(_run_subtask, subtask_queue_awaiting)
+        
+        l_results = pool.map(run_subtask, subtask_queue_awaiting)
 
         for l_result in l_results:
             _log.debug(f"Result of chunk: {l_result}")
-- 
GitLab


From 69516d46a32b875121353cf251bac6365426adb1 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 09:10:52 +0200
Subject: [PATCH 19/90] Fix style

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 1442d37..bef9092 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -80,7 +80,7 @@ def merge_splits(output_path: str, destination_path: str,
                     json_lpmn
                 )
             )
-        
+
         l_results = pool.map(run_subtask, subtask_queue_awaiting)
 
         for l_result in l_results:
-- 
GitLab


From 25bcf72245b0717cc51f69904e6a35ded125dfa0 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 09:14:52 +0200
Subject: [PATCH 20/90] Queue to standard list

---
 src/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index bef9092..0758131 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool, Queue
+from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -69,12 +69,12 @@ def merge_splits(output_path: str, destination_path: str,
     with open(output_path, "a") as f2, \
          Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
-        subtask_queue_awaiting = Queue()
+        subtask_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
             )
-            subtask_queue_awaiting.put(
+            subtask_queue_awaiting.append(
                 SubTask(
                     os.path.join(destination_path, chunk),
                     json_lpmn
-- 
GitLab


From b2fef207c4a19cc04f89e433c9415a1954521585 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 09:32:38 +0200
Subject: [PATCH 21/90] Debug get_output_path

---
 src/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 0758131..17a98ac 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -60,7 +60,8 @@ def merge_splits(output_path: str, destination_path: str,
 
     def run_subtask(subtask: SubTask):
         subtask.run(blocking=False)
-        return subtask.get_output_path()
+        # return subtask.get_output_path()
+        return "."
 
     # remove output file if exists
     if os.path.isfile(output_path):
-- 
GitLab


From 614dc9b1018b53224112c2c8bd5026c1c94d0559 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 09:35:21 +0200
Subject: [PATCH 22/90] Debug subtask run

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 17a98ac..ff3ffd3 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -59,7 +59,7 @@ def merge_splits(output_path: str, destination_path: str,
     global run_subtask
 
     def run_subtask(subtask: SubTask):
-        subtask.run(blocking=False)
+        # subtask.run(blocking=False)
         # return subtask.get_output_path()
         return "."
 
-- 
GitLab


From 5250a13072704c2e09a0d226a98cf77829f64cb4 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 09:57:49 +0200
Subject: [PATCH 23/90] Add manager

---
 src/utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index ff3ffd3..4cd4c78 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool
+from multiprocessing import Pool, Manager
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -59,18 +59,18 @@ def merge_splits(output_path: str, destination_path: str,
     global run_subtask
 
     def run_subtask(subtask: SubTask):
-        # subtask.run(blocking=False)
-        # return subtask.get_output_path()
-        return "."
+        subtask.run(blocking=False)
+        return subtask.get_output_path()
 
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
-         Pool(processes=parallel_subtasks) as pool:
+         Pool(processes=parallel_subtasks) as pool, \
+         Manager() as manager:
         # run tagger on each chunk
-        subtask_queue_awaiting = []
+        subtask_queue_awaiting = manager.list()
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
-- 
GitLab


From fbfd194f7f3322e88094e2e91f99cc4cf28b1e68 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 11:20:44 +0200
Subject: [PATCH 24/90] Add approach based on processes

---
 src/utils.py | 51 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 4cd4c78..2eace02 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool, Manager
+from multiprocessing import Process, Manager
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -56,18 +56,20 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    global run_subtask
+    def run_subtasks(subtask_queue_awaiting, offset, results, _log):
+        for ind, subtask in enumerate(
+                subtask_queue_awaiting[offset::parallel_subtasks]):
+            subtask.run(blocking=False)
 
-    def run_subtask(subtask: SubTask):
-        subtask.run(blocking=False)
-        return subtask.get_output_path()
+            result = subtask.get_output_path()
+            results.append(result)
+            _log.info(f"Finished task {ind*parallel_subtasks+offset}: {result}")
 
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
-         Pool(processes=parallel_subtasks) as pool, \
          Manager() as manager:
         # run tagger on each chunk
         subtask_queue_awaiting = manager.list()
@@ -82,12 +84,39 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        l_results = pool.map(run_subtask, subtask_queue_awaiting)
+        results = [list() for _ in range(parallel_subtasks)]
 
-        for l_result in l_results:
-            _log.debug(f"Result of chunk: {l_result}")
-            with open(l_result, "r") as f:
-                f2.write(f"{f.read()}\n")
+        processes = [
+            Process(
+                target=run_subtasks,
+                args=(subtask_queue_awaiting, offset, results[offset], _log)
+            )
+            for offset in range(parallel_subtasks)
+        ]
+
+        for process in processes:
+            process.start()
+
+        for process in processes:
+            process.join()
+
+        for i in range(len(results[0])):
+            for j in range(parallel_subtasks):
+                if i * parallel_subtasks + j < len(splitted_corpus):
+                    with open(results[j][i], "r") as f:
+                        f2.write(f"{f.read()}\n")
+
+        # for result in results:
+        #     for res in result:
+        #         with open(res, "r") as f:
+        #             f2.write(f"{f.read()}\n")
+
+        # l_results = pool.map(run_subtask, subtask_queue_awaiting)
+
+        # for l_result in l_results:
+        #     _log.debug(f"Result of chunk: {l_result}")
+        #     with open(l_result, "r") as f:
+        #         f2.write(f"{f.read()}\n")
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
-- 
GitLab


From b35a32effd4509205babf6666d7c6570a6808a39 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 11:53:27 +0200
Subject: [PATCH 25/90] Return to approach with pool

---
 src/utils.py | 51 ++++++++++-----------------------------------------
 1 file changed, 10 insertions(+), 41 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 2eace02..b052836 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Process, Manager
+from multiprocessing import Pool, Manager
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -56,20 +56,16 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    def run_subtasks(subtask_queue_awaiting, offset, results, _log):
-        for ind, subtask in enumerate(
-                subtask_queue_awaiting[offset::parallel_subtasks]):
-            subtask.run(blocking=False)
-
-            result = subtask.get_output_path()
-            results.append(result)
-            _log.info(f"Finished task {ind*parallel_subtasks+offset}: {result}")
+    def run_subtask(subtask: SubTask):
+        subtask.run(blocking=False)
+        return subtask.get_output_path()
 
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
+         Pool(parallel_subtasks) as pool, \
          Manager() as manager:
         # run tagger on each chunk
         subtask_queue_awaiting = manager.list()
@@ -84,39 +80,12 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        results = [list() for _ in range(parallel_subtasks)]
-
-        processes = [
-            Process(
-                target=run_subtasks,
-                args=(subtask_queue_awaiting, offset, results[offset], _log)
-            )
-            for offset in range(parallel_subtasks)
-        ]
-
-        for process in processes:
-            process.start()
-
-        for process in processes:
-            process.join()
-
-        for i in range(len(results[0])):
-            for j in range(parallel_subtasks):
-                if i * parallel_subtasks + j < len(splitted_corpus):
-                    with open(results[j][i], "r") as f:
-                        f2.write(f"{f.read()}\n")
-
-        # for result in results:
-        #     for res in result:
-        #         with open(res, "r") as f:
-        #             f2.write(f"{f.read()}\n")
-
-        # l_results = pool.map(run_subtask, subtask_queue_awaiting)
+        l_results = pool.map(run_subtask, subtask_queue_awaiting)
 
-        # for l_result in l_results:
-        #     _log.debug(f"Result of chunk: {l_result}")
-        #     with open(l_result, "r") as f:
-        #         f2.write(f"{f.read()}\n")
+        for l_result in l_results:
+            _log.debug(f"Result of chunk: {l_result}")
+            with open(l_result, "r") as f:
+                f2.write(f"{f.read()}\n")
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
-- 
GitLab


From 6987d1bfe5c038b116c85e047b1ef76752abc440 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 12:14:21 +0200
Subject: [PATCH 26/90] Add global keyword

---
 src/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utils.py b/src/utils.py
index b052836..a81962b 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -56,6 +56,8 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
+    global run_subtask
+
     def run_subtask(subtask: SubTask):
         subtask.run(blocking=False)
         return subtask.get_output_path()
-- 
GitLab


From f8591fa7832816193707c5be574b641ee71e6590 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Tue, 25 Apr 2023 13:42:34 +0200
Subject: [PATCH 27/90] Move subtask creation to different processes

---
 src/utils.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index a81962b..6c2b8fa 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -58,7 +58,8 @@ def merge_splits(output_path: str, destination_path: str,
     """
     global run_subtask
 
-    def run_subtask(subtask: SubTask):
+    def run_subtask(args: tuple) -> str:
+        subtask = SubTask(args[0], args[1])
         subtask.run(blocking=False)
         return subtask.get_output_path()
 
@@ -70,19 +71,19 @@ def merge_splits(output_path: str, destination_path: str,
          Pool(parallel_subtasks) as pool, \
          Manager() as manager:
         # run tagger on each chunk
-        subtask_queue_awaiting = manager.list()
+        subtask_args_queue_awaiting = manager.list()
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
             )
-            subtask_queue_awaiting.append(
-                SubTask(
+            subtask_args_queue_awaiting.append(
+                (
                     os.path.join(destination_path, chunk),
                     json_lpmn
                 )
             )
 
-        l_results = pool.map(run_subtask, subtask_queue_awaiting)
+        l_results = pool.map(run_subtask, subtask_args_queue_awaiting)
 
         for l_result in l_results:
             _log.debug(f"Result of chunk: {l_result}")
-- 
GitLab


From b9af7a269d18279c938b1c8dea144dd896f770ce Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 10:26:58 +0200
Subject: [PATCH 28/90] Approach with apply_async

---
 src/utils.py | 83 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 10 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 6c2b8fa..0e76b50 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -56,22 +56,36 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    global run_subtask
 
-    def run_subtask(args: tuple) -> str:
+    def run_subtask(args: tuple, i: int,
+                    l_results: str, _log: logging.Logger) -> str:
         subtask = SubTask(args[0], args[1])
         subtask.run(blocking=False)
-        return subtask.get_output_path()
+        result = subtask.get_output_path()
+        l_results.append((i, result))
+        _log.info(f"Finished task {i}: {result}")
+
+    # def run_subtasks(subtask_args_queue_awaiting, offset, results, _log):
+    #     for ind, args in enumerate(
+    #             subtask_args_queue_awaiting[offset::parallel_subtasks]):
+    #         subtask = Subtask()
+    #         subtask.run(blocking=False)
+
+    #         result = subtask.get_output_path()
+    #         results.append(result)
+    #         _log.info(
+    #               f"Finished task {ind*parallel_subtasks+offset}: {result}"
+    #         )
 
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
-         Pool(parallel_subtasks) as pool, \
+         Pool(processes=parallel_subtasks) as pool, \
          Manager() as manager:
         # run tagger on each chunk
-        subtask_args_queue_awaiting = manager.list()
+        subtask_args_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
             _log.info(
                 f"Spawning task {dbg_i}: {chunk}"
@@ -83,12 +97,61 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        l_results = pool.map(run_subtask, subtask_args_queue_awaiting)
+        # w petli pobierac kawalki wielkosci Poola
+        #     robić for i in kawalek :
+        #           pool.apply_async(run_subtask, args=...)
+        #     zrobic [res.get() for res in results]
+        #     zapis do pliku wynikowego
+
+        while len(subtask_args_queue_awaiting) > 0:
+            l_results = manager.list()
+            # l_results = pool.map(
+            #   run_subtask,
+            #   subtask_args_queue_awaiting[:parallel_subtasks]
+            # )
+            for i in range(parallel_subtasks):
+                if len(subtask_args_queue_awaiting) <= 0:
+                    break
+                pool.apply_async(
+                    run_subtask,
+                    args=(subtask_args_queue_awaiting[i], i, l_results, _log)
+                )
+            l_results.sort(key=lambda x: x[0])
+            for l_result in l_results:
+                _log.debug(f"Result of chunk: {l_result}")
+                with open(l_result, "r") as f:
+                    f2.write(f"{f.read()}\n")
+            del subtask_args_queue_awaiting[:parallel_subtasks]
+
+        # results = [list() for _ in range(parallel_subtasks)]
+
+        # processes = [
+        #     Process(
+        #         target=run_subtasks,
+        #         args=(subtask_args_queue_awaiting,
+        #               offset, results[offset], _log)
+        #     )
+        #     for offset in range(parallel_subtasks)
+        # ]
+
+        # for process in processes:
+        #     process.start()
+
+        # for process in processes:
+        #     process.join()
+
+        # for i in range(len(results[0])):
+        #     for j in range(parallel_subtasks):
+        #         if i * parallel_subtasks + j < len(splitted_corpus):
+        #             with open(results[j][i], "r") as f:
+        #                 f2.write(f"{f.read()}\n")
+
+        # l_results = pool.map(run_subtask, subtask_args_queue_awaiting)
 
-        for l_result in l_results:
-            _log.debug(f"Result of chunk: {l_result}")
-            with open(l_result, "r") as f:
-                f2.write(f"{f.read()}\n")
+        # for l_result in l_results:
+        #     _log.debug(f"Result of chunk: {l_result}")
+        #     with open(l_result, "r") as f:
+        #         f2.write(f"{f.read()}\n")
 
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
-- 
GitLab


From f0e3ac257d0a9e8df57bea9faf7bdd8adf056f60 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 11:35:17 +0200
Subject: [PATCH 29/90] Add asyncresult get

---
 src/utils.py | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 0e76b50..6635ec8 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import List
 
 from nlp_ws import SubTask
-from multiprocessing import Pool, Manager
+from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -57,13 +57,13 @@ def merge_splits(output_path: str, destination_path: str,
     :type parallel_subtasks: int
     """
 
-    def run_subtask(args: tuple, i: int,
-                    l_results: str, _log: logging.Logger) -> str:
+    def run_subtask(args: tuple[str, str], i: int) -> tuple[int, str]:
         subtask = SubTask(args[0], args[1])
         subtask.run(blocking=False)
         result = subtask.get_output_path()
-        l_results.append((i, result))
-        _log.info(f"Finished task {i}: {result}")
+        return (i, result)
+        # l_results.append((i, result))
+        # _log.info(f"Finished task {i}: {result}")
 
     # def run_subtasks(subtask_args_queue_awaiting, offset, results, _log):
     #     for ind, args in enumerate(
@@ -82,8 +82,7 @@ def merge_splits(output_path: str, destination_path: str,
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
-         Pool(processes=parallel_subtasks) as pool, \
-         Manager() as manager:
+         Pool(processes=parallel_subtasks) as pool:
         # run tagger on each chunk
         subtask_args_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
@@ -103,20 +102,31 @@ def merge_splits(output_path: str, destination_path: str,
         #     zrobic [res.get() for res in results]
         #     zapis do pliku wynikowego
 
+        # while len(subtask_args_queue_awaiting) > 0:
+
         while len(subtask_args_queue_awaiting) > 0:
-            l_results = manager.list()
+            # l_results = manager.list()
             # l_results = pool.map(
             #   run_subtask,
             #   subtask_args_queue_awaiting[:parallel_subtasks]
             # )
-            for i in range(parallel_subtasks):
-                if len(subtask_args_queue_awaiting) <= 0:
-                    break
-                pool.apply_async(
-                    run_subtask,
-                    args=(subtask_args_queue_awaiting[i], i, l_results, _log)
-                )
-            l_results.sort(key=lambda x: x[0])
+            args = subtask_args_queue_awaiting[:parallel_subtasks]
+            multiple_results = [
+                pool.apply_async(run_subtask, args=(args[i], i))
+                for i in range(len(args))
+            ]
+            multiple_results = [res.get() for res in multiple_results]
+            multiple_results.sort(key=lambda x: x[0])
+            l_results = [res[1] for res in multiple_results]
+
+            # for i in range(parallel_subtasks):
+            #     if len(subtask_args_queue_awaiting) <= 0:
+            #         break
+            #     pool.apply_async(
+            #         run_subtask,
+            #         args=(subtask_args_queue_awaiting[i], i, l_results, _log)
+            #     )
+            # l_results.sort(key=lambda x: x[0])
             for l_result in l_results:
                 _log.debug(f"Result of chunk: {l_result}")
                 with open(l_result, "r") as f:
-- 
GitLab


From cb7df491a83c392a8946ef0cd0c9054ad61a113f Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 11:41:46 +0200
Subject: [PATCH 30/90] Partial remove type annotation

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 6635ec8..e868b7d 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -57,7 +57,7 @@ def merge_splits(output_path: str, destination_path: str,
     :type parallel_subtasks: int
     """
 
-    def run_subtask(args: tuple[str, str], i: int) -> tuple[int, str]:
+    def run_subtask(args: tuple, i: int) -> tuple:
         subtask = SubTask(args[0], args[1])
         subtask.run(blocking=False)
         result = subtask.get_output_path()
-- 
GitLab


From 965939656f4b94a2cbb99f7d4264367b486e0f8b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 11:43:58 +0200
Subject: [PATCH 31/90] Add global function declaration

---
 src/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/utils.py b/src/utils.py
index e868b7d..edfe407 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -56,6 +56,7 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
+    global run_subtask
 
     def run_subtask(args: tuple, i: int) -> tuple:
         subtask = SubTask(args[0], args[1])
-- 
GitLab


From 5766aca3193cb946bfcf1cda79c6b1dbfcfbda13 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 12:25:25 +0200
Subject: [PATCH 32/90] Add timeout parameter

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index edfe407..3aaa64e 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -116,7 +116,7 @@ def merge_splits(output_path: str, destination_path: str,
                 pool.apply_async(run_subtask, args=(args[i], i))
                 for i in range(len(args))
             ]
-            multiple_results = [res.get() for res in multiple_results]
+            multiple_results = [res.get(timeout=10) for res in multiple_results]
             multiple_results.sort(key=lambda x: x[0])
             l_results = [res[1] for res in multiple_results]
 
-- 
GitLab


From 10a79f7f0b382c9ef472853d14115571e98e5aff Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 13:03:38 +0200
Subject: [PATCH 33/90] Add debug prints

---
 src/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 3aaa64e..d05974a 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -116,7 +116,10 @@ def merge_splits(output_path: str, destination_path: str,
                 pool.apply_async(run_subtask, args=(args[i], i))
                 for i in range(len(args))
             ]
-            multiple_results = [res.get(timeout=10) for res in multiple_results]
+            for res in multiple_results:
+                print(res)
+                print(type(res))
+            multiple_results = [res.get() for res in multiple_results]
             multiple_results.sort(key=lambda x: x[0])
             l_results = [res[1] for res in multiple_results]
 
-- 
GitLab


From c14b1d9d0263e9a3e9a82d23e8f98926be99c00a Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 13:40:56 +0200
Subject: [PATCH 34/90] Remove args

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index d05974a..5728c56 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -113,7 +113,7 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                pool.apply_async(run_subtask, args=(args[i], i))
+                pool.apply_async(run_subtask, (args[i], i))
                 for i in range(len(args))
             ]
             for res in multiple_results:
-- 
GitLab


From c168eea33010e80a0ee2b85fecf57d8c6ba28c9b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Wed, 26 Apr 2023 13:52:24 +0200
Subject: [PATCH 35/90] Better style

---
 src/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 5728c56..a584868 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -113,8 +113,8 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                pool.apply_async(run_subtask, (args[i], i))
-                for i in range(len(args))
+                pool.apply_async(run_subtask, (arg_tuple, i))
+                for i, arg_tuple in enumerate(args)
             ]
             for res in multiple_results:
                 print(res)
-- 
GitLab


From 9aeccecf577952bd3d6fe656241b8420620df615 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 27 Apr 2023 10:03:55 +0200
Subject: [PATCH 36/90] Debug res.get()

---
 src/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index a584868..1052cc0 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -119,7 +119,9 @@ def merge_splits(output_path: str, destination_path: str,
             for res in multiple_results:
                 print(res)
                 print(type(res))
-            multiple_results = [res.get() for res in multiple_results]
+            multiple_results = [res.get()
+                                if res.ready() and res.successful() else None
+                                for res in multiple_results]
             multiple_results.sort(key=lambda x: x[0])
             l_results = [res[1] for res in multiple_results]
 
-- 
GitLab


From 0e659c167583e5720e4d4d0326b5473f2e1be49b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:05:41 +0200
Subject: [PATCH 37/90] Debug res.get()

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 1052cc0..d6d0c22 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -120,7 +120,7 @@ def merge_splits(output_path: str, destination_path: str,
                 print(res)
                 print(type(res))
             multiple_results = [res.get()
-                                if res.ready() and res.successful() else None
+                                if res.ready() else None
                                 for res in multiple_results]
             multiple_results.sort(key=lambda x: x[0])
             l_results = [res[1] for res in multiple_results]
-- 
GitLab


From 594c478e9025fe5bda3aa0fd55fe735881f9a85e Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:13:56 +0200
Subject: [PATCH 38/90] Debug run_subtasks

---
 src/utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index d6d0c22..4727f94 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -59,10 +59,11 @@ def merge_splits(output_path: str, destination_path: str,
     global run_subtask
 
     def run_subtask(args: tuple, i: int) -> tuple:
-        subtask = SubTask(args[0], args[1])
-        subtask.run(blocking=False)
-        result = subtask.get_output_path()
-        return (i, result)
+        # subtask = SubTask(args[0], args[1])
+        # subtask.run(blocking=False)
+        # result = subtask.get_output_path()
+        # return (i, result)
+        return (0, "Ala ma kota.")
         # l_results.append((i, result))
         # _log.info(f"Finished task {i}: {result}")
 
-- 
GitLab


From 24248a1f060a00c2995e77d9c419a2ed098e8241 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:15:26 +0200
Subject: [PATCH 39/90] Comment import for debug

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 4727f94..94bcfcd 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,7 +5,7 @@ import os
 from pathlib import Path
 from typing import List
 
-from nlp_ws import SubTask
+# from nlp_ws import SubTask
 from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
-- 
GitLab


From 4df42cc98ba5a25b7324bf2f0edd8772d7b5c6c0 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:33:39 +0200
Subject: [PATCH 40/90] Debug run_subtask function passing

---
 src/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 94bcfcd..70515cb 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -114,15 +114,16 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                pool.apply_async(run_subtask, (arg_tuple, i))
+                pool.apply_async(os.getpid, ())
                 for i, arg_tuple in enumerate(args)
             ]
             for res in multiple_results:
                 print(res)
                 print(type(res))
             multiple_results = [res.get()
-                                if res.ready() else None
+                                # if res.ready() else None
                                 for res in multiple_results]
+            print(multiple_results)
             multiple_results.sort(key=lambda x: x[0])
             l_results = [res[1] for res in multiple_results]
 
-- 
GitLab


From e7c0b4db239e9848b22a0ffd69e77a6ccc8e50ec Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:43:56 +0200
Subject: [PATCH 41/90] Debug run_subtask

---
 src/utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 70515cb..5fabe44 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -56,7 +56,9 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    global run_subtask
+    parallel_subtasks = 2
+
+    # global run_subtask
 
     def run_subtask(args: tuple, i: int) -> tuple:
         # subtask = SubTask(args[0], args[1])
@@ -114,7 +116,7 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                pool.apply_async(os.getpid, ())
+                pool.apply_async(run_subtask, (arg_tuple, i))
                 for i, arg_tuple in enumerate(args)
             ]
             for res in multiple_results:
-- 
GitLab


From aaca80059a856e3ffe38278ec173236261f8d51d Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:47:40 +0200
Subject: [PATCH 42/90] Debug run_subtask passing

---
 src/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 5fabe44..f9bdd69 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -116,7 +116,8 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                pool.apply_async(run_subtask, (arg_tuple, i))
+                # pool.apply_async(run_subtask, (arg_tuple, i))
+                pool.apply_async(os.getpid, ())
                 for i, arg_tuple in enumerate(args)
             ]
             for res in multiple_results:
-- 
GitLab


From 15cc31d81cb547023e0dcddb0e3e7309156dd087 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 10:51:41 +0200
Subject: [PATCH 43/90] Run eight processes

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index f9bdd69..4521ac1 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -56,7 +56,7 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    parallel_subtasks = 2
+    parallel_subtasks = 8
 
     # global run_subtask
 
-- 
GitLab


From c8831d83360fc84a83c363b026625945e1c9948f Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 11:45:34 +0200
Subject: [PATCH 44/90] Move run_subtask out of merge_splits

---
 src/utils.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 4521ac1..e8ff1b8 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -38,6 +38,18 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
+def run_subtask(args: tuple, i: int) -> tuple:
+    """Run subtask."""
+    # subtask = SubTask(args[0], args[1])
+    # subtask.run(blocking=False)
+    # result = subtask.get_output_path()
+    # return (i, result)
+    print(f"{os.getpid()=} {i=} {args=}")
+    return (0, "Ala ma kota.")
+    # l_results.append((i, result))
+    # _log.info(f"Finished task {i}: {result}")
+
+
 def merge_splits(output_path: str, destination_path: str,
                  splitted_corpus: List[str], json_lpmn: str,
                  _log: logging.Logger, parallel_subtasks: int = 1):
@@ -60,15 +72,6 @@ def merge_splits(output_path: str, destination_path: str,
 
     # global run_subtask
 
-    def run_subtask(args: tuple, i: int) -> tuple:
-        # subtask = SubTask(args[0], args[1])
-        # subtask.run(blocking=False)
-        # result = subtask.get_output_path()
-        # return (i, result)
-        return (0, "Ala ma kota.")
-        # l_results.append((i, result))
-        # _log.info(f"Finished task {i}: {result}")
-
     # def run_subtasks(subtask_args_queue_awaiting, offset, results, _log):
     #     for ind, args in enumerate(
     #             subtask_args_queue_awaiting[offset::parallel_subtasks]):
@@ -116,8 +119,8 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                # pool.apply_async(run_subtask, (arg_tuple, i))
-                pool.apply_async(os.getpid, ())
+                pool.apply_async(run_subtask, (arg_tuple, i))
+                # pool.apply_async(os.getpid, ())
                 for i, arg_tuple in enumerate(args)
             ]
             for res in multiple_results:
-- 
GitLab


From 3669672db89c6faa52c65a6aadd283aa87f02776 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 11:50:39 +0200
Subject: [PATCH 45/90] Add underscore to name

---
 src/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index e8ff1b8..d1f655c 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -38,7 +38,7 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
-def run_subtask(args: tuple, i: int) -> tuple:
+def _run_subtask(args: tuple, i: int) -> tuple:
     """Run subtask."""
     # subtask = SubTask(args[0], args[1])
     # subtask.run(blocking=False)
@@ -119,7 +119,7 @@ def merge_splits(output_path: str, destination_path: str,
             # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
-                pool.apply_async(run_subtask, (arg_tuple, i))
+                pool.apply_async(_run_subtask, (arg_tuple, i))
                 # pool.apply_async(os.getpid, ())
                 for i, arg_tuple in enumerate(args)
             ]
-- 
GitLab


From e8504033f564c0e82b37eaab19e12260a462ddce Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 11:57:38 +0200
Subject: [PATCH 46/90] Return to original run_subtask implementation

---
 src/utils.py | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index d1f655c..ad5ef47 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,7 +5,7 @@ import os
 from pathlib import Path
 from typing import List
 
-# from nlp_ws import SubTask
+from nlp_ws import SubTask
 from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
@@ -40,14 +40,12 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 
 def _run_subtask(args: tuple, i: int) -> tuple:
     """Run subtask."""
-    # subtask = SubTask(args[0], args[1])
-    # subtask.run(blocking=False)
-    # result = subtask.get_output_path()
-    # return (i, result)
     print(f"{os.getpid()=} {i=} {args=}")
-    return (0, "Ala ma kota.")
-    # l_results.append((i, result))
-    # _log.info(f"Finished task {i}: {result}")
+    subtask = SubTask(args[0], args[1])
+    subtask.run(blocking=False)
+    result = subtask.get_output_path()
+    return (i, result)
+    # return (0, "Ala ma kota.")
 
 
 def merge_splits(output_path: str, destination_path: str,
@@ -70,20 +68,6 @@ def merge_splits(output_path: str, destination_path: str,
     """
     parallel_subtasks = 8
 
-    # global run_subtask
-
-    # def run_subtasks(subtask_args_queue_awaiting, offset, results, _log):
-    #     for ind, args in enumerate(
-    #             subtask_args_queue_awaiting[offset::parallel_subtasks]):
-    #         subtask = Subtask()
-    #         subtask.run(blocking=False)
-
-    #         result = subtask.get_output_path()
-    #         results.append(result)
-    #         _log.info(
-    #               f"Finished task {ind*parallel_subtasks+offset}: {result}"
-    #         )
-
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
-- 
GitLab


From 8dbd3de7c7bdd7c836162aec3312aa54f0dd838c Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 12:05:38 +0200
Subject: [PATCH 47/90] Return to dummy return values

---
 src/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index ad5ef47..046720c 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -44,8 +44,8 @@ def _run_subtask(args: tuple, i: int) -> tuple:
     subtask = SubTask(args[0], args[1])
     subtask.run(blocking=False)
     result = subtask.get_output_path()
-    return (i, result)
-    # return (0, "Ala ma kota.")
+    # return (i, result)
+    return (0, "Ala ma kota.")
 
 
 def merge_splits(output_path: str, destination_path: str,
-- 
GitLab


From e962dfc55fd0903ffe70c217d17de3cdd4658d3c Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 12:06:57 +0200
Subject: [PATCH 48/90] Fix style

---
 src/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.py b/src/utils.py
index 046720c..1add4f5 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -43,7 +43,7 @@ def _run_subtask(args: tuple, i: int) -> tuple:
     print(f"{os.getpid()=} {i=} {args=}")
     subtask = SubTask(args[0], args[1])
     subtask.run(blocking=False)
-    result = subtask.get_output_path()
+    # result = subtask.get_output_path()
     # return (i, result)
     return (0, "Ala ma kota.")
 
-- 
GitLab


From efb80b84fc5e6b994eff891b45db9fe9963e29bf Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 12:09:35 +0200
Subject: [PATCH 49/90] Remove subtask

---
 src/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 1add4f5..5feb68d 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,7 +5,7 @@ import os
 from pathlib import Path
 from typing import List
 
-from nlp_ws import SubTask
+# from nlp_ws import SubTask
 from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
@@ -41,8 +41,8 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 def _run_subtask(args: tuple, i: int) -> tuple:
     """Run subtask."""
     print(f"{os.getpid()=} {i=} {args=}")
-    subtask = SubTask(args[0], args[1])
-    subtask.run(blocking=False)
+    # subtask = SubTask(args[0], args[1])
+    # subtask.run(blocking=False)
     # result = subtask.get_output_path()
     # return (i, result)
     return (0, "Ala ma kota.")
-- 
GitLab


From 756b18596da8e0ff58d79fc04dcbdbf28c4a5503 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 12:24:29 +0200
Subject: [PATCH 50/90] Spawn subtask again

---
 src/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 5feb68d..1add4f5 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,7 +5,7 @@ import os
 from pathlib import Path
 from typing import List
 
-# from nlp_ws import SubTask
+from nlp_ws import SubTask
 from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
@@ -41,8 +41,8 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 def _run_subtask(args: tuple, i: int) -> tuple:
     """Run subtask."""
     print(f"{os.getpid()=} {i=} {args=}")
-    # subtask = SubTask(args[0], args[1])
-    # subtask.run(blocking=False)
+    subtask = SubTask(args[0], args[1])
+    subtask.run(blocking=False)
     # result = subtask.get_output_path()
     # return (i, result)
     return (0, "Ala ma kota.")
-- 
GitLab


From 1a8af0f6f655b7d538e793a99e509001e60a8919 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 28 Apr 2023 12:40:12 +0200
Subject: [PATCH 51/90] Remove comments and debug info

---
 src/utils.py | 66 ++--------------------------------------------------
 1 file changed, 2 insertions(+), 64 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 1add4f5..3374733 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -39,13 +39,10 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 
 
 def _run_subtask(args: tuple, i: int) -> tuple:
-    """Run subtask."""
-    print(f"{os.getpid()=} {i=} {args=}")
     subtask = SubTask(args[0], args[1])
     subtask.run(blocking=False)
-    # result = subtask.get_output_path()
-    # return (i, result)
-    return (0, "Ala ma kota.")
+    result = subtask.get_output_path()
+    return (i, result)
 
 
 def merge_splits(output_path: str, destination_path: str,
@@ -66,8 +63,6 @@ def merge_splits(output_path: str, destination_path: str,
     :param parallel_subtasks: number of parallel subtasks (default: 1)
     :type parallel_subtasks: int
     """
-    parallel_subtasks = 8
-
     # remove output file if exists
     if os.path.isfile(output_path):
         os.remove(output_path)
@@ -87,80 +82,23 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
-        # w petli pobierac kawalki wielkosci Poola
-        #     robić for i in kawalek :
-        #           pool.apply_async(run_subtask, args=...)
-        #     zrobic [res.get() for res in results]
-        #     zapis do pliku wynikowego
-
-        # while len(subtask_args_queue_awaiting) > 0:
-
         while len(subtask_args_queue_awaiting) > 0:
-            # l_results = manager.list()
-            # l_results = pool.map(
-            #   run_subtask,
-            #   subtask_args_queue_awaiting[:parallel_subtasks]
-            # )
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             multiple_results = [
                 pool.apply_async(_run_subtask, (arg_tuple, i))
-                # pool.apply_async(os.getpid, ())
                 for i, arg_tuple in enumerate(args)
             ]
-            for res in multiple_results:
-                print(res)
-                print(type(res))
             multiple_results = [res.get()
-                                # if res.ready() else None
                                 for res in multiple_results]
-            print(multiple_results)
             multiple_results.sort(key=lambda x: x[0])
             l_results = [res[1] for res in multiple_results]
 
-            # for i in range(parallel_subtasks):
-            #     if len(subtask_args_queue_awaiting) <= 0:
-            #         break
-            #     pool.apply_async(
-            #         run_subtask,
-            #         args=(subtask_args_queue_awaiting[i], i, l_results, _log)
-            #     )
-            # l_results.sort(key=lambda x: x[0])
             for l_result in l_results:
                 _log.debug(f"Result of chunk: {l_result}")
                 with open(l_result, "r") as f:
                     f2.write(f"{f.read()}\n")
             del subtask_args_queue_awaiting[:parallel_subtasks]
 
-        # results = [list() for _ in range(parallel_subtasks)]
-
-        # processes = [
-        #     Process(
-        #         target=run_subtasks,
-        #         args=(subtask_args_queue_awaiting,
-        #               offset, results[offset], _log)
-        #     )
-        #     for offset in range(parallel_subtasks)
-        # ]
-
-        # for process in processes:
-        #     process.start()
-
-        # for process in processes:
-        #     process.join()
-
-        # for i in range(len(results[0])):
-        #     for j in range(parallel_subtasks):
-        #         if i * parallel_subtasks + j < len(splitted_corpus):
-        #             with open(results[j][i], "r") as f:
-        #                 f2.write(f"{f.read()}\n")
-
-        # l_results = pool.map(run_subtask, subtask_args_queue_awaiting)
-
-        # for l_result in l_results:
-        #     _log.debug(f"Result of chunk: {l_result}")
-        #     with open(l_result, "r") as f:
-        #         f2.write(f"{f.read()}\n")
-
 
 def split_corpus(source_path: str, destination_path: str, file_name: str,
                  chunk_size: int, last_chunk_ratio=0.5, sep_diff_ratio=0.02,
-- 
GitLab


From 4481ba0707e8553addbfd6709e2f1b4b82e40cb6 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 4 May 2023 13:49:38 +0200
Subject: [PATCH 52/90] Debug # parallel subtasks

---
 src/tagger.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tagger.py b/src/tagger.py
index f6709f1..f4a503d 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -105,6 +105,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         parallel_subtasks = task_options.get("parallel_subtasks", 1)
 
+        parallel_subtasks = 4
+
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
         convert_lpmn = self.get_converter_directive(
-- 
GitLab


From 2b9154d785966c4406a7cb7dcc52cb8e2f166a9b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 4 May 2023 14:43:46 +0200
Subject: [PATCH 53/90] Remove debug

---
 src/tagger.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/tagger.py b/src/tagger.py
index f4a503d..f6709f1 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -105,8 +105,6 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         parallel_subtasks = task_options.get("parallel_subtasks", 1)
 
-        parallel_subtasks = 4
-
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
         convert_lpmn = self.get_converter_directive(
-- 
GitLab


From 73cf69514a4e5f0ef3257224fd5c42c45923beca Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 09:13:48 +0200
Subject: [PATCH 54/90] Test with range of spawned pids

---
 tests/test.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 6edb025..664d6a5 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -29,6 +29,9 @@ def get_output_path(self, timeout=0):
     return tmp_subtask_result_file[dict_key]
 
 
+def subtaskbase_init_gen():
+    pass
+
 def test_init():
     worker = TaggerWorker()
     assert type(worker).__name__ == 'TaggerWorker'
@@ -40,10 +43,12 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    # mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', )
+    for _ in range(15, 200):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid()
+        )
     worker.process(
         os.path.join(input_dir, input_file1),
         {"lang": "en"}, os.path.join(output_dir, input_file1)
-- 
GitLab


From ba4d906b14b474234245e0baf1e37ff3f64bdf63 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 09:59:59 +0200
Subject: [PATCH 55/90] Mock constructor from SubTaskBase

---
 tests/test.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 664d6a5..bd3df42 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -3,7 +3,8 @@ import shutil
 from filecmp import cmp
 
 import aioprocessing as ap
-from nlp_ws import SubTask
+import queue
+from nlp_ws import SubTask, SubTaskBase
 
 from src.tagger import TaggerWorker
 
@@ -29,8 +30,14 @@ def get_output_path(self, timeout=0):
     return tmp_subtask_result_file[dict_key]
 
 
-def subtaskbase_init_gen():
-    pass
+def subtaskbase_init_gen(pid):
+    def subtaskbase_init(self):
+        self.process_id = pid
+        self.q_in = queue.Queue()
+        self.q_out = SubTaskBase._processes[self.process_id]["q_out"]
+        self.idx = None
+
+    return subtaskbase_init
 
 def test_init():
     worker = TaggerWorker()
@@ -43,12 +50,11 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    # mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', )
-    for _ in range(15, 200):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid()
-        )
+    mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', subtaskbase_init_gen(29))
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
     worker.process(
         os.path.join(input_dir, input_file1),
         {"lang": "en"}, os.path.join(output_dir, input_file1)
-- 
GitLab


From cebb19e1cfbf7cb9daadf00c396aa20f6ccfc13d Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 10:24:22 +0200
Subject: [PATCH 56/90] Mock constructor for SubTask

---
 tests/test.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index bd3df42..d0f0ca0 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -4,7 +4,7 @@ from filecmp import cmp
 
 import aioprocessing as ap
 import queue
-from nlp_ws import SubTask, SubTaskBase
+from nlp_ws import SubTask
 
 from src.tagger import TaggerWorker
 
@@ -31,11 +31,14 @@ def get_output_path(self, timeout=0):
 
 
 def subtaskbase_init_gen(pid):
-    def subtaskbase_init(self):
+    def subtaskbase_init(self, input_path: str, task: dict, prefix="/samba"):
         self.process_id = pid
         self.q_in = queue.Queue()
-        self.q_out = SubTaskBase._processes[self.process_id]["q_out"]
+        self.q_out = self._processes[self.process_id]["q_out"]
         self.idx = None
+        self.task = task
+        self.input = self._transform_path(input_path, prefix)
+        self.response = None
 
     return subtaskbase_init
 
-- 
GitLab


From ae95121f29858dbbe48f2f924f574643da05ead1 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 10:35:28 +0200
Subject: [PATCH 57/90] Return to subtaskbase mocking

---
 tests/test.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index d0f0ca0..7239fd9 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -31,14 +31,11 @@ def get_output_path(self, timeout=0):
 
 
 def subtaskbase_init_gen(pid):
-    def subtaskbase_init(self, input_path: str, task: dict, prefix="/samba"):
+    def subtaskbase_init(self):
         self.process_id = pid
         self.q_in = queue.Queue()
         self.q_out = self._processes[self.process_id]["q_out"]
         self.idx = None
-        self.task = task
-        self.input = self._transform_path(input_path, prefix)
-        self.response = None
 
     return subtaskbase_init
 
-- 
GitLab


From 25e2130ba36671c6d84a5ee747a67caefc7f7dcb Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 10:42:01 +0200
Subject: [PATCH 58/90] Experimental

---
 tests/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index 7239fd9..fa1fa2a 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -53,7 +53,7 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', subtaskbase_init_gen(29))
     SubTask.prepare_subtask(
         {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
+        29
     )
     worker.process(
         os.path.join(input_dir, input_file1),
-- 
GitLab


From 7ae8148748a8a8735a2b9c1dadd3556dbb8d381a Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 10:49:24 +0200
Subject: [PATCH 59/90] Debug os.getpid()

---
 tests/test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index fa1fa2a..660eb41 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,7 +50,8 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', subtaskbase_init_gen(29))
+    # mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', subtaskbase_init_gen(29))
+    print(f"{os.getpid()}=")
     SubTask.prepare_subtask(
         {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
         29
-- 
GitLab


From 98ea276166f5c48eb92c6db6a7be3c67b8f71d8f Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 11:04:08 +0200
Subject: [PATCH 60/90] Return to os.getpid

---
 tests/test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 660eb41..0c13b9b 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,11 +50,10 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    # mocker.patch('nlp_ws._subtaskbase.SubTaskBase.__init__', subtaskbase_init_gen(29))
     print(f"{os.getpid()}=")
     SubTask.prepare_subtask(
         {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        29
+        os.getpid()
     )
     worker.process(
         os.path.join(input_dir, input_file1),
-- 
GitLab


From 16269cba682dbf7c5d85364f7af3b2b7c3bcd39a Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 11:26:45 +0200
Subject: [PATCH 61/90] Prepare subtask test

---
 tests/test.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 0c13b9b..38f3be2 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,11 +50,11 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    print(f"{os.getpid()}=")
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1),
         {"lang": "en"}, os.path.join(output_dir, input_file1)
-- 
GitLab


From 1b017a20fe452f33f53ae6bda2813482d201e8d2 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 11:28:52 +0200
Subject: [PATCH 62/90] Experimental

---
 tests/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index 38f3be2..b0ba7c6 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,7 +50,7 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
-- 
GitLab


From be4ccbb8394d0384ae85fa912402426796ecd8ca Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 11:39:04 +0200
Subject: [PATCH 63/90] Assign queues to pids in range

---
 tests/test.py | 119 ++++++++++++++++++++++++++++----------------------
 1 file changed, 66 insertions(+), 53 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index b0ba7c6..e567509 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,7 +50,7 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
+    for i in range(0, 10):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -70,10 +70,11 @@ def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small),
         {"lang": "en"}, os.path.join(output_dir, input_file_small)
@@ -89,10 +90,11 @@ def test_base_process_file_pl(mocker, worker, input_dir, input_file1_pl,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1_pl),
         {"lang": "pl"}, os.path.join(output_dir, input_file1_pl)
@@ -108,10 +110,11 @@ def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_pl),
         {"lang": "pl"}, os.path.join(output_dir, input_file_small_pl)
@@ -127,10 +130,11 @@ def test_base_process_file_de(mocker, worker, input_dir, input_file1_de,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1_de),
         {"lang": "de"}, os.path.join(output_dir, input_file1_de)
@@ -146,10 +150,11 @@ def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_de),
         {"lang": "de"}, os.path.join(output_dir, input_file_small_de)
@@ -165,10 +170,11 @@ def test_base_process_file_es(mocker, worker, input_dir, input_file1_es,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1_es),
         {"lang": "es"}, os.path.join(output_dir, input_file1_es)
@@ -184,10 +190,11 @@ def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_es),
         {"lang": "es"}, os.path.join(output_dir, input_file_small_es)
@@ -203,10 +210,11 @@ def test_base_process_file_pt(mocker, worker, input_dir, input_file1_pt,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1_pt),
         {"lang": "pt"}, os.path.join(output_dir, input_file1_pt)
@@ -222,10 +230,11 @@ def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_pt),
         {"lang": "pt"}, os.path.join(output_dir, input_file_small_pt)
@@ -241,10 +250,11 @@ def test_base_process_file_fr(mocker, worker, input_dir, input_file1_fr,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1_fr),
         {"lang": "fr"}, os.path.join(output_dir, input_file1_fr)
@@ -260,10 +270,11 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_fr),
         {"lang": "fr"}, os.path.join(output_dir, input_file_small_fr)
@@ -279,10 +290,11 @@ def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker.process(
         os.path.join(input_dir, input_file1_ru),
         {"lang": "ru"}, os.path.join(output_dir, input_file1_ru)
@@ -298,10 +310,11 @@ def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
+    for i in range(0, 10):
+        SubTask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid() + i
+        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_ru),
         {"lang": "ru"}, os.path.join(output_dir, input_file_small_ru)
-- 
GitLab


From 5b08f2465bda67f337f5881d1d85c1bca46592a0 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:04:28 +0200
Subject: [PATCH 64/90] Move common part to separate fixture

---
 tests/test.py | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index e567509..638baa6 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -44,17 +44,21 @@ def test_init():
     assert type(worker).__name__ == 'TaggerWorker'
 
 
-def test_base_process_file_en(mocker, worker, input_dir, input_file1,
-                        output_dir, expected_dir):
+@pytest.fixture(autouse=True)
+def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
         )
+
+def test_base_process_file_en(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    
     worker.process(
         os.path.join(input_dir, input_file1),
         {"lang": "en"}, os.path.join(output_dir, input_file1)
@@ -70,7 +74,7 @@ def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -90,7 +94,7 @@ def test_base_process_file_pl(mocker, worker, input_dir, input_file1_pl,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -110,7 +114,7 @@ def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -130,7 +134,7 @@ def test_base_process_file_de(mocker, worker, input_dir, input_file1_de,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -150,7 +154,7 @@ def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -170,7 +174,7 @@ def test_base_process_file_es(mocker, worker, input_dir, input_file1_es,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -190,7 +194,7 @@ def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -210,7 +214,7 @@ def test_base_process_file_pt(mocker, worker, input_dir, input_file1_pt,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -230,7 +234,7 @@ def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -250,7 +254,7 @@ def test_base_process_file_fr(mocker, worker, input_dir, input_file1_fr,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -270,7 +274,7 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -290,7 +294,7 @@ def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -310,7 +314,7 @@ def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 10):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
-- 
GitLab


From 4360959b5c3b7dd072cab86060a530dfa77f7983 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:06:32 +0200
Subject: [PATCH 65/90] Delete common part from a few more tests

---
 tests/test.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 638baa6..6303c98 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -58,7 +58,6 @@ def run_around_tests(mocker):
 
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
                         output_dir, expected_dir):
-    
     worker.process(
         os.path.join(input_dir, input_file1),
         {"lang": "en"}, os.path.join(output_dir, input_file1)
@@ -70,15 +69,6 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
 
 def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input_file_small,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small),
         {"lang": "en"}, os.path.join(output_dir, input_file_small)
@@ -90,15 +80,6 @@ def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input
 
 def test_base_process_file_pl(mocker, worker, input_dir, input_file1_pl,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker.process(
         os.path.join(input_dir, input_file1_pl),
         {"lang": "pl"}, os.path.join(output_dir, input_file1_pl)
-- 
GitLab


From 1d39247409f9ef2b0e3d55f489e8c47e1de616be Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:13:39 +0200
Subject: [PATCH 66/90] Remove common part from all tests

---
 tests/test.py | 99 ---------------------------------------------------
 1 file changed, 99 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 6303c98..1a26486 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -91,15 +91,6 @@ def test_base_process_file_pl(mocker, worker, input_dir, input_file1_pl,
 
 def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input_file_small_pl,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_pl),
         {"lang": "pl"}, os.path.join(output_dir, input_file_small_pl)
@@ -111,15 +102,6 @@ def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input
 
 def test_base_process_file_de(mocker, worker, input_dir, input_file1_de,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker.process(
         os.path.join(input_dir, input_file1_de),
         {"lang": "de"}, os.path.join(output_dir, input_file1_de)
@@ -131,15 +113,6 @@ def test_base_process_file_de(mocker, worker, input_dir, input_file1_de,
 
 def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input_file_small_de,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_de),
         {"lang": "de"}, os.path.join(output_dir, input_file_small_de)
@@ -151,15 +124,6 @@ def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input
 
 def test_base_process_file_es(mocker, worker, input_dir, input_file1_es,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker.process(
         os.path.join(input_dir, input_file1_es),
         {"lang": "es"}, os.path.join(output_dir, input_file1_es)
@@ -171,15 +135,6 @@ def test_base_process_file_es(mocker, worker, input_dir, input_file1_es,
 
 def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input_file_small_es,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_es),
         {"lang": "es"}, os.path.join(output_dir, input_file_small_es)
@@ -191,15 +146,6 @@ def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input
 
 def test_base_process_file_pt(mocker, worker, input_dir, input_file1_pt,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker.process(
         os.path.join(input_dir, input_file1_pt),
         {"lang": "pt"}, os.path.join(output_dir, input_file1_pt)
@@ -211,15 +157,6 @@ def test_base_process_file_pt(mocker, worker, input_dir, input_file1_pt,
 
 def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input_file_small_pt,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_pt),
         {"lang": "pt"}, os.path.join(output_dir, input_file_small_pt)
@@ -231,15 +168,6 @@ def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input
 
 def test_base_process_file_fr(mocker, worker, input_dir, input_file1_fr,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker.process(
         os.path.join(input_dir, input_file1_fr),
         {"lang": "fr"}, os.path.join(output_dir, input_file1_fr)
@@ -251,15 +179,6 @@ def test_base_process_file_fr(mocker, worker, input_dir, input_file1_fr,
 
 def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input_file_small_fr,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_fr),
         {"lang": "fr"}, os.path.join(output_dir, input_file_small_fr)
@@ -271,15 +190,6 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
 
 def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker.process(
         os.path.join(input_dir, input_file1_ru),
         {"lang": "ru"}, os.path.join(output_dir, input_file1_ru)
@@ -291,15 +201,6 @@ def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
 
 def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input_file_small_ru,
                         output_dir, expected_dir):
-    mocker.patch('nlp_ws._subtask.SubTask.run', return_value=None)
-    mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
-    mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
-    mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
     worker_small.process(
         os.path.join(input_dir, input_file_small_ru),
         {"lang": "ru"}, os.path.join(output_dir, input_file_small_ru)
-- 
GitLab


From e43c17a72b48724089c689545233846f0e62d8b1 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:20:29 +0200
Subject: [PATCH 67/90] Whole range of possible PIDs

---
 tests/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 1a26486..940a679 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,10 +50,10 @@ def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
+    for i in range(0, 100_000):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
+            i
         )
 
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
-- 
GitLab


From d71921bb4042d7a7371058e92d528ef3276320c9 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:24:02 +0200
Subject: [PATCH 68/90] Revert "Whole range of possible PIDs"

This reverts commit e43c17a72b48724089c689545233846f0e62d8b1.
---
 tests/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 940a679..1a26486 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,10 +50,10 @@ def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100_000):
+    for i in range(0, 100):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            i
+            os.getpid() + i
         )
 
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
-- 
GitLab


From 2b660c5f59f34b5720f781923b75c3514a375bc2 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:32:42 +0200
Subject: [PATCH 69/90] Add sample tests for parallel execution

---
 tests/test.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/test.py b/tests/test.py
index 1a26486..50cfcf0 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -78,6 +78,28 @@ def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small))
 
 
+def test_base_process_file_en(mocker, worker, input_dir, input_file1,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1),
+        {"lang": "en", "parallel_subtasks": 4}, os.path.join(output_dir, input_file1)
+    )
+    assert cmp(os.path.join(output_dir, input_file1),
+               os.path.join(expected_dir, input_file1))
+    os.remove(os.path.join(output_dir, input_file1))
+
+
+def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input_file_small,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small),
+        {"lang": "en", "parallel_subtasks": 4}, os.path.join(output_dir, input_file_small)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small),
+               os.path.join(expected_dir, input_file_small))
+    os.remove(os.path.join(output_dir, input_file_small))
+
+
 def test_base_process_file_pl(mocker, worker, input_dir, input_file1_pl,
                         output_dir, expected_dir):
     worker.process(
-- 
GitLab


From a4315f26de18d4aaa0382c1e624334a210fd67f5 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 12:42:01 +0200
Subject: [PATCH 70/90] Add some more tests for parallel execution

---
 tests/test.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 50cfcf0..f26425c 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,7 +50,7 @@ def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 100):
+    for i in range(0, 200):
         SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid() + i
@@ -78,7 +78,7 @@ def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small))
 
 
-def test_base_process_file_en(mocker, worker, input_dir, input_file1,
+def test_base_process_file_en_parallel(mocker, worker, input_dir, input_file1,
                         output_dir, expected_dir):
     worker.process(
         os.path.join(input_dir, input_file1),
@@ -89,7 +89,7 @@ def test_base_process_file_en(mocker, worker, input_dir, input_file1,
     os.remove(os.path.join(output_dir, input_file1))
 
 
-def test_base_process_file_small_limit_en(mocker, worker_small, input_dir, input_file_small,
+def test_base_process_file_small_limit_en_parallel(mocker, worker_small, input_dir, input_file_small,
                         output_dir, expected_dir):
     worker_small.process(
         os.path.join(input_dir, input_file_small),
@@ -122,6 +122,28 @@ def test_base_process_file_small_limit_pl(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_pl))
 
 
+def test_base_process_file_pl_parallel(mocker, worker, input_dir, input_file1_pl,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1_pl),
+        {"lang": "pl", "parallel_subtasks": 3}, os.path.join(output_dir, input_file1_pl)
+    )
+    assert cmp(os.path.join(output_dir, input_file1_pl),
+               os.path.join(expected_dir, input_file1_pl))
+    os.remove(os.path.join(output_dir, input_file1_pl))
+
+
+def test_base_process_file_small_limit_pl_parallel(mocker, worker_small, input_dir, input_file_small_pl,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small_pl),
+        {"lang": "pl", "parallel_subtasks": 3}, os.path.join(output_dir, input_file_small_pl)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small_pl),
+               os.path.join(expected_dir, input_file_small_pl))
+    os.remove(os.path.join(output_dir, input_file_small_pl))
+
+
 def test_base_process_file_de(mocker, worker, input_dir, input_file1_de,
                         output_dir, expected_dir):
     worker.process(
@@ -144,6 +166,28 @@ def test_base_process_file_small_limit_de(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_de))
 
 
+def test_base_process_file_de_parallel(mocker, worker, input_dir, input_file1_de,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1_de),
+        {"lang": "de", "parallel_subtasks": 8}, os.path.join(output_dir, input_file1_de)
+    )
+    assert cmp(os.path.join(output_dir, input_file1_de),
+               os.path.join(expected_dir, input_file1_de))
+    os.remove(os.path.join(output_dir, input_file1_de))
+
+
+def test_base_process_file_small_limit_de_parallel(mocker, worker_small, input_dir, input_file_small_de,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small_de),
+        {"lang": "de", "parallel_subtasks": 8}, os.path.join(output_dir, input_file_small_de)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small_de),
+               os.path.join(expected_dir, input_file_small_de))
+    os.remove(os.path.join(output_dir, input_file_small_de))
+
+
 def test_base_process_file_es(mocker, worker, input_dir, input_file1_es,
                         output_dir, expected_dir):
     worker.process(
-- 
GitLab


From ac18cbce977cb60c0140677934cd59fbd709a1b5 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 13:03:59 +0200
Subject: [PATCH 71/90] Add the rest of tests

---
 tests/test.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/tests/test.py b/tests/test.py
index f26425c..7ec421b 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -210,6 +210,28 @@ def test_base_process_file_small_limit_es(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_es))
 
 
+def test_base_process_file_es_parallel(mocker, worker, input_dir, input_file1_es,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1_es),
+        {"lang": "es", "parallel_subtasks": 2}, os.path.join(output_dir, input_file1_es)
+    )
+    assert cmp(os.path.join(output_dir, input_file1_es),
+               os.path.join(expected_dir, input_file1_es))
+    os.remove(os.path.join(output_dir, input_file1_es))
+
+
+def test_base_process_file_small_limit_es_parallel(mocker, worker_small, input_dir, input_file_small_es,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small_es),
+        {"lang": "es", "parallel_subtasks": 2}, os.path.join(output_dir, input_file_small_es)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small_es),
+               os.path.join(expected_dir, input_file_small_es))
+    os.remove(os.path.join(output_dir, input_file_small_es))
+
+
 def test_base_process_file_pt(mocker, worker, input_dir, input_file1_pt,
                         output_dir, expected_dir):
     worker.process(
@@ -232,6 +254,28 @@ def test_base_process_file_small_limit_pt(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_pt))
 
 
+def test_base_process_file_pt_parallel(mocker, worker, input_dir, input_file1_pt,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1_pt),
+        {"lang": "pt", "parallel_subtasks": 4}, os.path.join(output_dir, input_file1_pt)
+    )
+    assert cmp(os.path.join(output_dir, input_file1_pt),
+               os.path.join(expected_dir, input_file1_pt))
+    os.remove(os.path.join(output_dir, input_file1_pt))
+
+
+def test_base_process_file_small_limit_pt_parallel(mocker, worker_small, input_dir, input_file_small_pt,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small_pt),
+        {"lang": "pt", "parallel_subtasks": 4}, os.path.join(output_dir, input_file_small_pt)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small_pt),
+               os.path.join(expected_dir, input_file_small_pt))
+    os.remove(os.path.join(output_dir, input_file_small_pt))
+
+
 def test_base_process_file_fr(mocker, worker, input_dir, input_file1_fr,
                         output_dir, expected_dir):
     worker.process(
@@ -254,6 +298,28 @@ def test_base_process_file_small_limit_fr(mocker, worker_small, input_dir, input
     os.remove(os.path.join(output_dir, input_file_small_fr))
 
 
+def test_base_process_file_fr_parallel(mocker, worker, input_dir, input_file1_fr,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1_fr),
+        {"lang": "fr", "parallel_subtasks": 4}, os.path.join(output_dir, input_file1_fr)
+    )
+    assert cmp(os.path.join(output_dir, input_file1_fr),
+               os.path.join(expected_dir, input_file1_fr))
+    os.remove(os.path.join(output_dir, input_file1_fr))
+
+
+def test_base_process_file_small_limit_fr_parallel(mocker, worker_small, input_dir, input_file_small_fr,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small_fr),
+        {"lang": "fr", "parallel_subtasks": 4}, os.path.join(output_dir, input_file_small_fr)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small_fr),
+               os.path.join(expected_dir, input_file_small_fr))
+    os.remove(os.path.join(output_dir, input_file_small_fr))
+
+
 def test_base_process_file_ru(mocker, worker, input_dir, input_file1_ru,
                         output_dir, expected_dir):
     worker.process(
@@ -274,3 +340,25 @@ def test_base_process_file_small_limit_ru(mocker, worker_small, input_dir, input
     assert cmp(os.path.join(output_dir, input_file_small_ru),
                os.path.join(expected_dir, input_file_small_ru))
     os.remove(os.path.join(output_dir, input_file_small_ru))
+
+
+def test_base_process_file_ru_parallel(mocker, worker, input_dir, input_file1_ru,
+                        output_dir, expected_dir):
+    worker.process(
+        os.path.join(input_dir, input_file1_ru),
+        {"lang": "ru", "parallel_subtasks": 6}, os.path.join(output_dir, input_file1_ru)
+    )
+    assert cmp(os.path.join(output_dir, input_file1_ru),
+               os.path.join(expected_dir, input_file1_ru))
+    os.remove(os.path.join(output_dir, input_file1_ru))
+
+
+def test_base_process_file_small_limit_ru_parallel(mocker, worker_small, input_dir, input_file_small_ru,
+                        output_dir, expected_dir):
+    worker_small.process(
+        os.path.join(input_dir, input_file_small_ru),
+        {"lang": "ru", "parallel_subtasks": 6}, os.path.join(output_dir, input_file_small_ru)
+    )
+    assert cmp(os.path.join(output_dir, input_file_small_ru),
+               os.path.join(expected_dir, input_file_small_ru))
+    os.remove(os.path.join(output_dir, input_file_small_ru))
-- 
GitLab


From 864f609e592813bd2e99cbc9c639ff408c810905 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 5 May 2023 13:25:57 +0200
Subject: [PATCH 72/90] Add line

---
 tests/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test.py b/tests/test.py
index 7ec421b..f0263c2 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -56,6 +56,7 @@ def run_around_tests(mocker):
             os.getpid() + i
         )
 
+
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
                         output_dir, expected_dir):
     worker.process(
-- 
GitLab


From 022f65c0d6cf42110d469b585b849c16c2410d74 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 14:16:22 +0200
Subject: [PATCH 73/90] Add pid of child process from child

---
 src/utils.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 3374733..e67ba01 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,13 +5,15 @@ import os
 from pathlib import Path
 from typing import List
 
+import aioprocessing as ap
 from nlp_ws import SubTask
-from multiprocessing import Pool
+from multiprocessing import Pool, Lock
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
 SPACE = " "
 
+process_entry_lock = Lock()
 
 def utf8size(string: str) -> int:
     """Returns size of string in utf8 encoding.
@@ -38,6 +40,14 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
+
+def _add_process_entry():
+    with process_entry_lock:
+        Subtask.prepare_subtask(
+            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+            os.getpid()
+        )
+
 def _run_subtask(args: tuple, i: int) -> tuple:
     subtask = SubTask(args[0], args[1])
     subtask.run(blocking=False)
@@ -68,7 +78,7 @@ def merge_splits(output_path: str, destination_path: str,
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
-         Pool(processes=parallel_subtasks) as pool:
+         Pool(processes=parallel_subtasks, initializer=_add_process_entry) as pool:
         # run tagger on each chunk
         subtask_args_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
-- 
GitLab


From 63b19308343099c9a1b688bde4c5b4e32757d712 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 14:19:14 +0200
Subject: [PATCH 74/90] Fix pep8

---
 src/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index e67ba01..da36523 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -15,6 +15,7 @@ SPACE = " "
 
 process_entry_lock = Lock()
 
+
 def utf8size(string: str) -> int:
     """Returns size of string in utf8 encoding.
 
@@ -40,14 +41,14 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
-
 def _add_process_entry():
     with process_entry_lock:
-        Subtask.prepare_subtask(
+        SubTask.prepare_subtask(
             {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
             os.getpid()
         )
 
+
 def _run_subtask(args: tuple, i: int) -> tuple:
     subtask = SubTask(args[0], args[1])
     subtask.run(blocking=False)
@@ -78,7 +79,8 @@ def merge_splits(output_path: str, destination_path: str,
         os.remove(output_path)
     # create output file
     with open(output_path, "a") as f2, \
-         Pool(processes=parallel_subtasks, initializer=_add_process_entry) as pool:
+         Pool(processes=parallel_subtasks,
+              initializer=_add_process_entry) as pool:
         # run tagger on each chunk
         subtask_args_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
-- 
GitLab


From 1d5fac36455e88fb706c01800219dc6e6a32fe24 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 14:24:07 +0200
Subject: [PATCH 75/90] Remove range initialization from tests

---
 tests/test.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index f0263c2..b120603 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,11 +50,10 @@ def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    for i in range(0, 200):
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid() + i
-        )
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
 
 
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
-- 
GitLab


From b1db6d07a54a3aaa84dc4bbe83d3f846cc89c1fe Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 14:57:34 +0200
Subject: [PATCH 76/90] Remove parent prepare_subtask for tests

---
 tests/test.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index b120603..25db366 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,10 +50,6 @@ def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
 
 
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
-- 
GitLab


From d6de94bd51e077d6b73f562c701b9847aee7d578 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 15:02:54 +0200
Subject: [PATCH 77/90] Remove Lock

---
 src/utils.py  | 13 +++++--------
 tests/test.py |  4 ++++
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index da36523..87d142b 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -7,14 +7,12 @@ from typing import List
 
 import aioprocessing as ap
 from nlp_ws import SubTask
-from multiprocessing import Pool, Lock
+from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
 SPACE = " "
 
-process_entry_lock = Lock()
-
 
 def utf8size(string: str) -> int:
     """Returns size of string in utf8 encoding.
@@ -42,11 +40,10 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 
 
 def _add_process_entry():
-    with process_entry_lock:
-        SubTask.prepare_subtask(
-            {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-            os.getpid()
-        )
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
 
 
 def _run_subtask(args: tuple, i: int) -> tuple:
diff --git a/tests/test.py b/tests/test.py
index 25db366..b120603 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -50,6 +50,10 @@ def run_around_tests(mocker):
     mocker.patch('nlp_ws._subtask.SubTask.get_output_path', get_output_path)
     mocker.patch('nlp_ws._subtask.SubTask.prepare_subtask', prepare_subtask)
     mocker.patch('nlp_ws._worker.NLPWorker.update_progress')
+    SubTask.prepare_subtask(
+        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
+        os.getpid()
+    )
 
 
 def test_base_process_file_en(mocker, worker, input_dir, input_file1,
-- 
GitLab


From 0d02728eb583aaeefbd7ca6aa9a7f56274070bf0 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 15:18:50 +0200
Subject: [PATCH 78/90] Add logging debug

---
 src/utils.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/utils.py b/src/utils.py
index 87d142b..5d7e1f9 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -40,16 +40,22 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
 
 
 def _add_process_entry():
+    logging.debug(f"Adding process entry: {os.getpid()}")
     SubTask.prepare_subtask(
         {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
         os.getpid()
     )
+    logging.debug(f"Added process entry: {os.getpid()}")
 
 
 def _run_subtask(args: tuple, i: int) -> tuple:
+    logging.debug(f"Running subtask: {args}")
     subtask = SubTask(args[0], args[1])
+    logging.debug(f"Running subtask: {subtask}")
     subtask.run(blocking=False)
+    logging.debug(f"Getting output: {subtask}")
     result = subtask.get_output_path()
+    logging.debug(f"Got output: {result}")
     return (i, result)
 
 
@@ -75,9 +81,11 @@ def merge_splits(output_path: str, destination_path: str,
     if os.path.isfile(output_path):
         os.remove(output_path)
     # create output file
+    logging.debug(f"Creating output file: {output_path}")
     with open(output_path, "a") as f2, \
          Pool(processes=parallel_subtasks,
               initializer=_add_process_entry) as pool:
+        logging.debug(f"Created output file: {output_path}")
         # run tagger on each chunk
         subtask_args_queue_awaiting = []
         for dbg_i, chunk in enumerate(splitted_corpus):
@@ -91,16 +99,22 @@ def merge_splits(output_path: str, destination_path: str,
                 )
             )
 
+        logging.debug(f"Subtask args queue: {subtask_args_queue_awaiting}")
         while len(subtask_args_queue_awaiting) > 0:
             args = subtask_args_queue_awaiting[:parallel_subtasks]
+            logging.debug(f"Subtask args: {args}")
             multiple_results = [
                 pool.apply_async(_run_subtask, (arg_tuple, i))
                 for i, arg_tuple in enumerate(args)
             ]
+            logging.debug(f"Multiple results: {multiple_results}")
             multiple_results = [res.get()
                                 for res in multiple_results]
+            logging.debug(f"Multiple results2: {multiple_results}")
             multiple_results.sort(key=lambda x: x[0])
+            logging.debug(f"Multiple results3: {multiple_results}")
             l_results = [res[1] for res in multiple_results]
+            logging.debug(f"Multiple results4: {l_results}")
 
             for l_result in l_results:
                 _log.debug(f"Result of chunk: {l_result}")
-- 
GitLab


From 7cd54fd3f319b49d81008a6571c31b159c162712 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Thu, 11 May 2023 15:48:36 +0200
Subject: [PATCH 79/90] Change logging messages

---
 src/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 5d7e1f9..2e58205 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -49,9 +49,9 @@ def _add_process_entry():
 
 
 def _run_subtask(args: tuple, i: int) -> tuple:
-    logging.debug(f"Running subtask: {args}")
+    logging.debug(f"Running subtask for args: {args}")
     subtask = SubTask(args[0], args[1])
-    logging.debug(f"Running subtask: {subtask}")
+    logging.debug(f"Running following subtask: {subtask}")
     subtask.run(blocking=False)
     logging.debug(f"Getting output: {subtask}")
     result = subtask.get_output_path()
-- 
GitLab


From 0200386dbfcbd1dcf89522375e9c4973eb0d77bf Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 10:11:30 +0200
Subject: [PATCH 80/90] Remove multiprocessing

---
 src/utils.py | 44 ++++++++------------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 2e58205..d098fe5 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,9 +5,7 @@ import os
 from pathlib import Path
 from typing import List
 
-import aioprocessing as ap
 from nlp_ws import SubTask
-from multiprocessing import Pool
 
 WORDS_PATTERN = r'\w+'
 PARAGRAPH = ".\n"
@@ -39,26 +37,6 @@ def _update_last_chunk(tail_data: str, chunk_file_name: str,
     logging.debug(f'Updating last chunk({chunk_file_name})...')
 
 
-def _add_process_entry():
-    logging.debug(f"Adding process entry: {os.getpid()}")
-    SubTask.prepare_subtask(
-        {"q_in": ap.AioQueue(), "q_out": ap.AioQueue()},
-        os.getpid()
-    )
-    logging.debug(f"Added process entry: {os.getpid()}")
-
-
-def _run_subtask(args: tuple, i: int) -> tuple:
-    logging.debug(f"Running subtask for args: {args}")
-    subtask = SubTask(args[0], args[1])
-    logging.debug(f"Running following subtask: {subtask}")
-    subtask.run(blocking=False)
-    logging.debug(f"Getting output: {subtask}")
-    result = subtask.get_output_path()
-    logging.debug(f"Got output: {result}")
-    return (i, result)
-
-
 def merge_splits(output_path: str, destination_path: str,
                  splitted_corpus: List[str], json_lpmn: str,
                  _log: logging.Logger, parallel_subtasks: int = 1):
@@ -82,9 +60,7 @@ def merge_splits(output_path: str, destination_path: str,
         os.remove(output_path)
     # create output file
     logging.debug(f"Creating output file: {output_path}")
-    with open(output_path, "a") as f2, \
-         Pool(processes=parallel_subtasks,
-              initializer=_add_process_entry) as pool:
+    with open(output_path, "a") as f2:
         logging.debug(f"Created output file: {output_path}")
         # run tagger on each chunk
         subtask_args_queue_awaiting = []
@@ -103,18 +79,14 @@ def merge_splits(output_path: str, destination_path: str,
         while len(subtask_args_queue_awaiting) > 0:
             args = subtask_args_queue_awaiting[:parallel_subtasks]
             logging.debug(f"Subtask args: {args}")
-            multiple_results = [
-                pool.apply_async(_run_subtask, (arg_tuple, i))
-                for i, arg_tuple in enumerate(args)
+            subtasks = [
+                SubTask(arg_tuple[0], arg_tuple[1]) for arg_tuple in args
             ]
-            logging.debug(f"Multiple results: {multiple_results}")
-            multiple_results = [res.get()
-                                for res in multiple_results]
-            logging.debug(f"Multiple results2: {multiple_results}")
-            multiple_results.sort(key=lambda x: x[0])
-            logging.debug(f"Multiple results3: {multiple_results}")
-            l_results = [res[1] for res in multiple_results]
-            logging.debug(f"Multiple results4: {l_results}")
+            for subtask in subtasks:
+                subtask.run(blocking=False)
+
+            l_results = [subtask.get_output_path() for subtask in subtasks]
+            logging.debug(f"Multiple results: {l_results}")
 
             for l_result in l_results:
                 _log.debug(f"Result of chunk: {l_result}")
-- 
GitLab


From 4a59aa3da02019b1120d1f640e3f65f9b460aec6 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 11:10:02 +0200
Subject: [PATCH 81/90] Correct some details

---
 config.ini    | 1 +
 src/tagger.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/config.ini b/config.ini
index ad54b32..56e4d57 100644
--- a/config.ini
+++ b/config.ini
@@ -13,6 +13,7 @@ lpmn = lpmn
 [tool]
 config = pos_tagger.yaml
 workers_number = 20
+parallel_subtasks = 10
 chunking_limit = 50000
 
 [logging]
diff --git a/src/tagger.py b/src/tagger.py
index f6709f1..62a2298 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -103,7 +103,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         json_text = task_options.get("json_text", True)
 
-        parallel_subtasks = task_options.get("parallel_subtasks", 1)
+        parallel_subtasks = task_options.get("parallel_subtasks", 10)
 
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
-- 
GitLab


From 119d0026729ac7ae7aa825a746981f1fc416e728 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 13:02:04 +0200
Subject: [PATCH 82/90] Change default parallel_subtasks number

---
 src/tagger.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/tagger.py b/src/tagger.py
index 62a2298..f733307 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -39,6 +39,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
         _log.info(f"Config taggers from yaml: {cls._taggers}")
         _log.info(f"Config ners from yaml: {cls._ners}")
 
+        cls._parallel_subtasks = config.get('tool').get('parallel_subtasks', 10)
         cls._chunking_limit = config.get('tool').get('chunking_limit', 50000)
         if not isinstance(cls._chunking_limit, int):
             cls._chunking_limit = int(cls._chunking_limit)
@@ -75,7 +76,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
         json_text: bool if json output should contain original
         text (default = True)
         method: method of processing (default = 'tagger', values: tagger, ner)
-        parallel_subtasks: number of parallel subtasks (default = 1)
+        parallel_subtasks: number of parallel subtasks (default = 10)
         :type task_options: dict
 
         :param output_path: Path to directory where the
@@ -103,7 +104,10 @@ class TaggerWorker(nlp_ws.NLPWorker):
 
         json_text = task_options.get("json_text", True)
 
-        parallel_subtasks = task_options.get("parallel_subtasks", 10)
+        parallel_subtasks = task_options.get(
+            "parallel_subtasks",
+            self._parallel_subtasks
+        )
 
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
-- 
GitLab


From 33524592d5edeeee9cafe36dadf047cece26b059 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 13:46:28 +0200
Subject: [PATCH 83/90] Add parallelization for lemmas output

---
 src/tagger.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tagger.py b/src/tagger.py
index f733307..262c613 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -18,6 +18,7 @@ SubTask.turn_on()
 DEFAULT_TYPE = "default"
 OUTPUT = "output"
 JSON = "json"
+LEMMAS = "lemmas"
 TAGSET = "tagset"
 TAGGER = "tagger"
 NER = "ner"
@@ -129,7 +130,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
             _dir_style = True
             json_lpmn = [json_lpmn]
         _log.debug(f"Running LPMN: {json_lpmn}")
-        if output == JSON and not _dir_style:
+        if output in [JSON, LEMMAS] and not _dir_style:
             # split file into chunks
             chunk_size = int(self._chunking_limit * 0.5)
             destination_path = os.path.join(
-- 
GitLab


From ca58f61491d8a2fdb6f40d498ad02f7f06370a9b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 14:19:16 +0200
Subject: [PATCH 84/90] Debug parallel subtasks

---
 src/tagger.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tagger.py b/src/tagger.py
index 262c613..ce3eec7 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -110,6 +110,8 @@ class TaggerWorker(nlp_ws.NLPWorker):
             self._parallel_subtasks
         )
 
+        logging.debug(f"{parallel_subtasks=}")
+
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
         convert_lpmn = self.get_converter_directive(
-- 
GitLab


From 472cab9422d0a1a070a687f57be60b9921cb411a Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 14:34:16 +0200
Subject: [PATCH 85/90] Add more debug logs

---
 src/tagger.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/tagger.py b/src/tagger.py
index ce3eec7..86870de 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -110,7 +110,11 @@ class TaggerWorker(nlp_ws.NLPWorker):
             self._parallel_subtasks
         )
 
-        logging.debug(f"{parallel_subtasks=}")
+        _log.debug(f"{self._parallel_subtasks=}")
+        _log.debug(f"{parallel_subtasks=}")
+
+        _log.info(f"{self._parallel_subtasks=}")
+        _log.info(f"{parallel_subtasks=}")
 
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
-- 
GitLab


From e8ee685312ee4e6c9568dbfda92fb82332a39da2 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Fri, 12 May 2023 15:06:53 +0200
Subject: [PATCH 86/90] Add default for cls._parallel_subtasks

---
 src/tagger.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/tagger.py b/src/tagger.py
index 86870de..098c096 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -40,6 +40,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
         _log.info(f"Config taggers from yaml: {cls._taggers}")
         _log.info(f"Config ners from yaml: {cls._ners}")
 
+        cls._parallel_subtasks = 10
         cls._parallel_subtasks = config.get('tool').get('parallel_subtasks', 10)
         cls._chunking_limit = config.get('tool').get('chunking_limit', 50000)
         if not isinstance(cls._chunking_limit, int):
-- 
GitLab


From 0e4574d722ee2bfd9b3380d9f90b2b3aed9025c7 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 15 May 2023 09:02:18 +0200
Subject: [PATCH 87/90] Fix type error

---
 src/tagger.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tagger.py b/src/tagger.py
index 098c096..c109176 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -40,9 +40,10 @@ class TaggerWorker(nlp_ws.NLPWorker):
         _log.info(f"Config taggers from yaml: {cls._taggers}")
         _log.info(f"Config ners from yaml: {cls._ners}")
 
-        cls._parallel_subtasks = 10
         cls._parallel_subtasks = config.get('tool').get('parallel_subtasks', 10)
         cls._chunking_limit = config.get('tool').get('chunking_limit', 50000)
+        if not isinstance(cls._parallel_subtasks, int):
+            cls._parallel_subtasks = int(cls._parallel_subtasks)
         if not isinstance(cls._chunking_limit, int):
             cls._chunking_limit = int(cls._chunking_limit)
         _log.info(f"Chunk size: {cls._chunking_limit}")
-- 
GitLab


From 8dce7f1953b31ad0727e0a3c2f0e8e4e8f69cfb0 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 15 May 2023 09:13:34 +0200
Subject: [PATCH 88/90] Remove unnecessary logging msgs

---
 src/tagger.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/tagger.py b/src/tagger.py
index c109176..e0af179 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -112,12 +112,6 @@ class TaggerWorker(nlp_ws.NLPWorker):
             self._parallel_subtasks
         )
 
-        _log.debug(f"{self._parallel_subtasks=}")
-        _log.debug(f"{parallel_subtasks=}")
-
-        _log.info(f"{self._parallel_subtasks=}")
-        _log.info(f"{parallel_subtasks=}")
-
         tagger_opt = self._taggers[lang][DEFAULT_TYPE]
         ner_opt = self._ners[lang][DEFAULT_TYPE]
         convert_lpmn = self.get_converter_directive(
-- 
GitLab


From 948bb3e87af54706c1987176ba1ebff7e53e22b9 Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 15 May 2023 10:26:40 +0200
Subject: [PATCH 89/90] Find out where the trailing \n for lemmas comes from

---
 src/tagger.py                                       | 2 +-
 tests/testdata/expected/post_postagger_input_lemmas | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tagger.py b/src/tagger.py
index e0af179..b036c6b 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -132,7 +132,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
             _dir_style = True
             json_lpmn = [json_lpmn]
         _log.debug(f"Running LPMN: {json_lpmn}")
-        if output in [JSON, LEMMAS] and not _dir_style:
+        if output in [JSON] and not _dir_style:
             # split file into chunks
             chunk_size = int(self._chunking_limit * 0.5)
             destination_path = os.path.join(
diff --git a/tests/testdata/expected/post_postagger_input_lemmas b/tests/testdata/expected/post_postagger_input_lemmas
index deac1e9..c6ee9fc 100644
--- a/tests/testdata/expected/post_postagger_input_lemmas
+++ b/tests/testdata/expected/post_postagger_input_lemmas
@@ -11,3 +11,4 @@ woda występować w przyroda być roztwór sól i gaz .
 najwięcej sól mineralny zawierać woda morski i woda mineralny ; najmniej woda z opad atmosferyczny .
 woda o mały zawartość składnik mineralny nazywać woda miękki , natomiast zawierać znaczny ilość sól wapń i magnez – woda twardy .
 oprócz to woda naturalny zawierać rozpuścić substancja pochodzenie organiczny , na przykład . mocznik , kwas humusowy i tym podobne .
+
-- 
GitLab


From 02c23c934e3fe6e4fb3d6d89e63b508f2cec733b Mon Sep 17 00:00:00 2001
From: bmatysiak <bartosz.matysiak@pwr.edu.pl>
Date: Mon, 15 May 2023 10:36:17 +0200
Subject: [PATCH 90/90] Restore LEMMAS

---
 src/tagger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tagger.py b/src/tagger.py
index b036c6b..e0af179 100644
--- a/src/tagger.py
+++ b/src/tagger.py
@@ -132,7 +132,7 @@ class TaggerWorker(nlp_ws.NLPWorker):
             _dir_style = True
             json_lpmn = [json_lpmn]
         _log.debug(f"Running LPMN: {json_lpmn}")
-        if output in [JSON] and not _dir_style:
+        if output in [JSON, LEMMAS] and not _dir_style:
             # split file into chunks
             chunk_size = int(self._chunking_limit * 0.5)
             destination_path = os.path.join(
-- 
GitLab