From d9b6204e04a99abb350de2d7ab866094eeee0466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?=
 <bartlomiej.piotr.bojanowski@gmail.com>
Date: Wed, 31 Mar 2021 10:25:04 +0000
Subject: [PATCH] Developer

---
 requirements.txt |   3 +-
 src/mewex.py     | 226 ++++++++++++++++++++++++++++-------------------
 2 files changed, 135 insertions(+), 94 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e129cd1..7f4464b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ Cython
 nlp_ws
 lxml
 ujson
-configparser
\ No newline at end of file
+configparser
+pandas
diff --git a/src/mewex.py b/src/mewex.py
index ce80d3a..8c25ae9 100644
--- a/src/mewex.py
+++ b/src/mewex.py
@@ -1,93 +1,133 @@
-"""Implementation of MeWex Worker."""
-# !/usr/bin/python3
-import os
-import re
-import io
-
-
-import mewexlib as mwl
-import WrapLem
-from nlp_ws import NLPWorker
-
-
-class MewexWorker(NLPWorker):
-    """Implements mewex worker."""
-
-    def init(self):
-        """Initialize worker."""
-        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
-
-    def process(self, input_path, task_options, output_path):
-        """Running nlp process."""
-        args = _parse_mewex_options(task_options.get('mewex_options') or {})
-        args['input_files'] = (
-            [os.path.join(input_path, f) for f in os.listdir(input_path)]
-            if os.path.isdir(input_path)
-            else (input_path,)
-        )
-        if not os.path.exists(output_path):
-            os.makedirs(output_path)
-        args['output_file'] = output_path + "/mewex.csv"
-
-        mwl.call_mewex(**args)
-        self.lemmatize(output_path + "/mewex.csv",
-                       output_path + "/mewexlemmatized.csv")
-        self.cut_lines(output_path + "/mewexlemmatized.csv",
-                       output_path + "/mewexshort.csv", 1000)
-
-    def cut_lines(self, inf, outf, lines):
-        """."""
-        f = open(inf, "r")
-        copy = open(outf, "w")
-        n = 0
-        for line in f:
-            copy.write(line)
-            n = n + 1
-            if n > lines:
-                break
-        f.close()
-        copy.close()
-
-    def lemmatize(self, inf, outf):
-        """."""
-        input_file = io.open(inf, "r", encoding="utf-8")
-        output_file = open(outf, "w")
-        next(input_file)
-        next(input_file)  # First two rows are header rows, so just skip them
-        output_file.write(
-            "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")
-        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
-        basereg = re.compile(r'[^:]+:([^ ]+)')
-        for line in input_file:
-            splited = line.strip().split('\t')
-            orthtuple = orthreg.findall(splited[4])
-            baselist = basereg.findall(splited[3])
-            base = u' '.join(baselist)
-            orth = orthtuple[0][0].strip()
-            tag = orthtuple[0][1]
-            result = self._lemmatizer.lemmatizeS(orth, base, tag, False)
-            splited.insert(4, result)
-            output_file.write('\t'.join(splited) + '\n')
-        input_file.close()
-        output_file.close()
-
-
-_OPT_DISPATCH = dict(
-    ranker_func=lambda val: mwl.RankerFunction[val],
-    dispersion_func=lambda val: mwl.DispersionFunction[val],
-    wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
-)
-
-
-def _parse_mewex_options(opts):
-    args = {}
-
-    while opts:
-        key, val = opts.popitem()
-
-        if key in _OPT_DISPATCH:
-            val = _OPT_DISPATCH[key](val)
-
-        args[key] = val
-
-    return args
+"""Implementation of MeWex Worker."""
+# !/usr/bin/python3
+import os
+import re
+import io
+import shutil
+
+import pandas as pd
+import mewexlib as mwl
+import WrapLem
+from nlp_ws import NLPWorker
+
+
+class MewexWorker(NLPWorker):
+    """Implements mewex worker."""
+
+    def init(self):
+        """Assemble the WrapLem cascade lemmatizer used by lemmatize()."""
+        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
+
+    def process(self, input_path, task_options, output_path):
+        """Run mewex on the input, then lemmatize and shorten its output.
+
+        More than 1000 input files are processed in batches of 1000, each
+        batch writing to its own numbered subfolder of output_path; the
+        per-batch results are then combined by merge_data().
+        """
+        args = _parse_mewex_options(task_options.get('mewex_options') or {})
+        if os.path.isdir(input_path):
+            files = [os.path.join(input_path, name)
+                     for name in os.listdir(input_path)]
+        else:
+            files = (input_path,)
+        args['input_files'] = files
+        batch_size = 1000
+        batched = len(files) > batch_size
+        if batched:
+            for start in range(0, len(files), batch_size):
+                last = start // batch_size
+                batch_dir = output_path + f"/{last}"
+                if not os.path.exists(batch_dir):
+                    os.makedirs(batch_dir)
+                args['input_files'] = files[start:start + batch_size]
+                args['output_file'] = batch_dir + "/mewex.csv"
+                mwl.call_mewex(**args)
+            self.merge_data(output_path, last)
+        else:
+            if not os.path.exists(output_path):
+                os.makedirs(output_path)
+            args['output_file'] = output_path + "/mewex.csv"
+            mwl.call_mewex(**args)
+        raw = output_path + "/mewex.csv"
+        lemmatized = output_path + "/mewexlemmatized.csv"
+        self.lemmatize(raw, lemmatized, option=batched)
+        self.cut_lines(lemmatized, output_path + "/mewexshort.csv", 1000)
+
+    def merge_data(self, path, counter):
+        """Merge path/<i>/mewex.csv for i in 0..counter into path/mewex.csv.
+
+        process() numbers batch folders from 0, so merging starts at 0 too
+        (starting at 1 dropped the first batch).  Two header rows are skipped
+        per part, each merged folder is removed, and rows are concatenated in
+        folder order without re-ranking across batches.
+        """
+        parts = []
+        for index in range(counter + 1):
+            folder = f'{path}/{index}'
+            part = pd.read_csv(f'{folder}/mewex.csv', header=None, skiprows=2,
+                               delimiter='\t', engine='python')
+            parts.append(part)
+            shutil.rmtree(folder)
+        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
+        header = ['Rank', 'Quantity', 'Realtion', 'Base form', 'All forms']
+        pd.concat(parts).to_csv(f'{path}/mewex.csv', index=False,
+                                header=header, sep='\t')
+
+    def cut_lines(self, inf, outf, lines):
+        """Copy the first lines + 1 lines of inf to outf.
+
+        process() passes 1000, so mewexshort.csv receives the header row plus
+        up to 1000 data rows.  Both files are opened in a with-block so they
+        are closed even if reading or writing raises.
+        """
+        with open(inf, "r") as source, open(outf, "w") as copy:
+            for count, line in enumerate(source, start=1):
+                copy.write(line)
+                if count > lines:
+                    break
+
+    def lemmatize(self, inf, outf, option=False):
+        """Write inf to outf with a 'Lemmatized form' column inserted.
+
+        One header row is skipped when option is true (merge_data() output),
+        two otherwise (raw mewex output).  Both files are handled as UTF-8
+        and are closed by the with-block even if a row fails to parse.
+        """
+        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
+        basereg = re.compile(r'[^:]+:([^ ]+)')
+        with io.open(inf, "r", encoding="utf-8") as input_file, \
+                io.open(outf, "w", encoding="utf-8") as output_file:
+            for _ in range(1 if option else 2):
+                next(input_file, None)  # tolerate a short file
+            output_file.write("Rank\tQuantity\tRealtion\tBase form\t"
+                              "Lemmatized form\tAll forms\n")
+            for line in input_file:
+                splited = line.strip().split('\t')
+                orth, tag = orthreg.findall(splited[4])[0]
+                base = u' '.join(basereg.findall(splited[3]))
+                result = self._lemmatizer.lemmatizeS(orth.strip(), base, tag,
+                                                     False)
+                splited.insert(4, result)
+                output_file.write('\t'.join(splited) + '\n')
+
+
+_OPT_DISPATCH = {  # option name -> converter applied to its raw value
+    'ranker_func': lambda val: mwl.RankerFunction[val],
+    'dispersion_func': lambda val: mwl.DispersionFunction[val],
+    'wccl_rels': lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
+}
+
+
+def _parse_mewex_options(opts):
+    """Return mewex call arguments built from the raw options mapping.
+
+    Values whose key appears in _OPT_DISPATCH are passed through the matching
+    converter; all others are copied unchanged.  opts itself is left intact
+    (it used to be drained with popitem(), emptying the caller's dict).
+    """
+    args = {}
+    for key, val in opts.items():
+        convert = _OPT_DISPATCH.get(key)
+        args[key] = val if convert is None else convert(val)
+    return args
-- 
GitLab