Developer

d9b6204e · Bartłomiej Bojanowski · Mateusz Gniewkowski · d3d60e06 · d9b6204e · d9b6204e
Commit d9b6204e authored Mar 31, 2021 by Bartłomiej Bojanowski Committed by Mateusz Gniewkowski Mar 31, 2021
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ nlp_ws
 lxml
 ujson
 configparser
+pandas
--- a/src/mewex.py
+++ b/src/mewex.py
@@ -3,8 +3,9 @@
 import os
 import re
 import io
+import shutil
+import pandas as pd
 import mewexlib as mwl
 import WrapLem
 from nlp_ws import NLPWorker
@@ -25,16 +26,54 @@ class MewexWorker(NLPWorker):
            if os.path.isdir(input_path)
            else (input_path,)
        )
+        files = args['input_files']
+        option = True
+        step = 1000
+        counter = 0
+        if len(files) > step:
+            for counter, chunk in enumerate(
+                    [files[x:x + step] for x in range(0, len(files), step)]
+            ):
+                args['input_files'] = chunk
+                output = output_path + f"/{counter}"
+                if not os.path.exists(output):
+                    os.makedirs(output)
+                args['output_file'] = output + "/mewex.csv"
+                mwl.call_mewex(**args)
+                option = True
+            self.merge_data(output_path, counter)
+        else:
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            args['output_file'] = output_path + "/mewex.csv"
            mwl.call_mewex(**args)
+            option = False
        self.lemmatize(output_path + "/mewex.csv",
-                       output_path + "/mewexlemmatized.csv")
+                       output_path + "/mewexlemmatized.csv",
+                       option=option)
        self.cut_lines(output_path + "/mewexlemmatized.csv",
                       output_path + "/mewexshort.csv", 1000)
+    def merge_data(self, path, counter):
+        """Merge the per-chunk ``<path>/<i>/mewex.csv`` files into one.
+
+        The caller numbers the chunk directories with ``enumerate`` and
+        passes the last index, so the directories run from 0 to ``counter``
+        inclusive.  Each chunk file is read as tab-separated data with its
+        first two rows skipped, the chunks are concatenated in order, and
+        the result is written to ``<path>/mewex.csv`` with one header row.
+        The chunk directories are removed afterwards.
+
+        :param path: directory holding the numbered chunk subdirectories;
+            the merged file is written directly into it.
+        :param counter: index of the last chunk subdirectory.
+        """
+        chunk_dirs = [f'{path}/{index}' for index in range(counter + 1)]
+        # Every chunk is parsed with the same options (the original used the
+        # python engine for all chunks except the first one it read).
+        frames = [pd.read_csv(f'{chunk_dir}/mewex.csv',
+                              header=None,
+                              skiprows=2,
+                              delimiter='\t',
+                              engine='python')
+                  for chunk_dir in chunk_dirs]
+        # DataFrame.append was removed in pandas 2.0; concat stacks the
+        # chunks in the same order.
+        mewex = pd.concat(frames)
+        mewex.to_csv(f'{path}/mewex.csv',
+                     index=False,
+                     header=['Rank', 'Quantity',
+                             'Realtion', 'Base form', 'All forms'],
+                     sep='\t')
+        # Delete the chunk directories only once the merged file has been
+        # written, so a failure part-way through does not discard results.
+        for chunk_dir in chunk_dirs:
+            shutil.rmtree(chunk_dir)
    def cut_lines(self, inf, outf, lines):
        """."""
        f = open(inf, "r")
@@ -48,12 +87,13 @@ class MewexWorker(NLPWorker):
        f.close()
        copy.close()
-    def lemmatize(self, inf, outf):
+    def lemmatize(self, inf, outf, option):
        """."""
        input_file = io.open(inf, "r", encoding="utf-8")
        output_file = open(outf, "w")
        next(input_file)
-        next(input_file)  # First two rows are header rows, so just skip them
+        if not option:
+            next(input_file)
        output_file.write(
            "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")
        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')