From ff9632a242af932946688ba75976902247c0a05d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?= <bartlomiej.bojanowski@pwr.edu.pl> Date: Wed, 3 Feb 2021 13:22:53 +0100 Subject: [PATCH 1/4] Add feature to split data --- Dockerfile | 1 + config.ini | 8 +- requirements.txt | 4 +- src/mewex.py | 216 +++++++++++++++++++++++++++-------------------- 4 files changed, 131 insertions(+), 98 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9b1e7c9..9604158 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,5 +54,6 @@ RUN cd mewex/mewexlib/ && \ WORKDIR /home/worker COPY ./src ./src COPY ./main.py . +COPY ./config.ini ./config.ini CMD ["python3.6", "main.py", "service"] diff --git a/config.ini b/config.ini index 2504e02..4aa96d6 100644 --- a/config.ini +++ b/config.ini @@ -1,10 +1,10 @@ [service] -tool = mewex1 +tool = mewex_tester root = /samba/requests/ -rabbit_host = rabbitmq -rabbit_user = test -rabbit_password = test +rabbit_host = 10.17.0.85 +rabbit_user = clarin +rabbit_password = clarin123 queue_prefix =nlp_ [tool] diff --git a/requirements.txt b/requirements.txt index e129cd1..b474d0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ Cython nlp_ws lxml ujson -configparser \ No newline at end of file +configparser +tqdm +pandas diff --git a/src/mewex.py b/src/mewex.py index ce80d3a..8ce9c04 100644 --- a/src/mewex.py +++ b/src/mewex.py @@ -1,93 +1,123 @@ -"""Implementation of MeWex Worker.""" -# !/usr/bin/python3 -import os -import re -import io - - -import mewexlib as mwl -import WrapLem -from nlp_ws import NLPWorker - - -class MewexWorker(NLPWorker): - """Implements mewex worker.""" - - def init(self): - """Initialize worker.""" - self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer() - - def process(self, input_path, task_options, output_path): - """Running nlp process.""" - args = _parse_mewex_options(task_options.get('mewex_options') or {}) - args['input_files'] = ( - [os.path.join(input_path, f) for f in os.listdir(input_path)] - if os.path.isdir(input_path) - else (input_path,) - ) - if not os.path.exists(output_path): - os.makedirs(output_path) - args['output_file'] = output_path + "/mewex.csv" - - mwl.call_mewex(**args) - self.lemmatize(output_path + "/mewex.csv", - output_path + "/mewexlemmatized.csv") - self.cut_lines(output_path + "/mewexlemmatized.csv", - output_path + "/mewexshort.csv", 1000) - - def cut_lines(self, inf, outf, lines): - """.""" - f = open(inf, "r") - copy = open(outf, "w") - n = 0 - for line in f: - copy.write(line) - n = n + 1 - if n > lines: - break - f.close() - copy.close() - - def lemmatize(self, inf, outf): - """.""" - input_file = io.open(inf, "r", encoding="utf-8") - output_file = open(outf, "w") - next(input_file) - next(input_file) # First two rows are header rows, so just skip them - output_file.write( - "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n") - orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*') - basereg = re.compile(r'[^:]+:([^ ]+)') - for line in input_file: - splited = line.strip().split('\t') - orthtuple = orthreg.findall(splited[4]) - baselist = basereg.findall(splited[3]) - base = u' '.join(baselist) - orth = orthtuple[0][0].strip() - tag = orthtuple[0][1] - result = self._lemmatizer.lemmatizeS(orth, base, tag, False) - splited.insert(4, result) - output_file.write('\t'.join(splited) + '\n') - input_file.close() - output_file.close() - - -_OPT_DISPATCH = dict( - ranker_func=lambda val: mwl.RankerFunction[val], - dispersion_func=lambda val: mwl.DispersionFunction[val], - wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val), -) - - -def _parse_mewex_options(opts): - args = {} - - while opts: - key, val = opts.popitem() - - if key in _OPT_DISPATCH: - val = _OPT_DISPATCH[key](val) - - args[key] = val - - return args +"""Implementation of MeWex Worker.""" +# !/usr/bin/python3 +import os +import re +import io +from zipfile import ZipFile +from tqdm import tqdm +import pandas as pd + +import mewexlib as mwl +import WrapLem +from nlp_ws import NLPWorker + + +class MewexWorker(NLPWorker): + """Implements mewex worker.""" + + def init(self): + """Initialize worker.""" + self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer() + + def process(self, input_path, task_options, output_path): + """Running nlp process.""" + args = _parse_mewex_options(task_options.get('mewex_options') or {}) + args['input_files'] = ( + [os.path.join(input_path, f) for f in os.listdir(input_path)] + if os.path.isdir(input_path) + else (input_path,) + ) + i = 0 + if len(args['input_files']) > 10000: + listOfFiles = [args['input_files'][x:x+1000] for x in range(0, len(args['input_files']), 1000)] + for file in listOfFiles: + i = i + 1 + args['input_files'] = file + output = output_path + f"/{i}" + if not os.path.exists(output): + os.makedirs(output) + args['output_file'] = output + "/mewex.csv" + mwl.call_mewex(**args) + self.merge_data(output_path) + option = True + else: + if not os.path.exists(output_path): + os.makedirs(output_path) + args['output_file'] = output_path + "/mewex.csv" + mwl.call_mewex(**args) + option = False + self.lemmatize(output_path + "/mewex.csv", + output_path + "/mewexlemmatized.csv", + option=option) + self.cut_lines(output_path + "/mewexlemmatized.csv", + output_path + "/mewexshort.csv", 1000) + + def merge_data(self, path): + all_files = os.listdir(path) + mewex = pd.read_csv(f'{path}/1/mewex.csv', header=None, skiprows=2, delimiter='\t') + all_files.remove('1') + for filename in tqdm(all_files): + mewex = mewex.append(pd.read_csv(f'{path}/{filename}/mewex.csv', header=None, skiprows=2, delimiter='\t')) + mewex.to_csv(f'{path}/mewex.csv', index=False, + header=['Rank', 'Quantity', 'Realtion', 'Base form', 'All forms'], sep='\t') + + + def cut_lines(self, inf, outf, lines): + """.""" + f = open(inf, "r") + copy = open(outf, "w") + n = 0 + for line in f: + copy.write(line) + n = n + 1 + if n > lines: + break + f.close() + copy.close() + + def lemmatize(self, inf, outf, option): + """.""" + input_file = io.open(inf, "r", encoding="utf-8") + output_file = open(outf, "w") + if option: + next(input_file) + else: + next(input_file) + next(input_file) + output_file.write( + "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n") + orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*') + basereg = re.compile(r'[^:]+:([^ ]+)') + for line in input_file: + splited = line.strip().split('\t') + orthtuple = orthreg.findall(splited[4]) + baselist = basereg.findall(splited[3]) + base = u' '.join(baselist) + orth = orthtuple[0][0].strip() + tag = orthtuple[0][1] + result = self._lemmatizer.lemmatizeS(orth, base, tag, False) + splited.insert(4, result) + output_file.write('\t'.join(splited) + '\n') + input_file.close() + output_file.close() + + +_OPT_DISPATCH = dict( + ranker_func=lambda val: mwl.RankerFunction[val], + dispersion_func=lambda val: mwl.DispersionFunction[val], + wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val), +) + + +def _parse_mewex_options(opts): + args = {} + + while opts: + key, val = opts.popitem() + + if key in _OPT_DISPATCH: + val = _OPT_DISPATCH[key](val) + + args[key] = val + + return args -- GitLab From 2faafac36aae142cfcce428badcd4023d8c5fb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?= <bartlomiej.bojanowski@pwr.edu.pl> Date: Thu, 4 Mar 2021 15:03:57 +0100 Subject: [PATCH 2/4] Fixed feature split data --- Dockerfile | 1 - config.ini | 8 ++++---- requirements.txt | 1 - src/mewex.py | 45 ++++++++++++++++++++++++++++----------------- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9604158..9b1e7c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,6 +54,5 @@ RUN cd mewex/mewexlib/ && \ WORKDIR /home/worker COPY ./src ./src COPY ./main.py . -COPY ./config.ini ./config.ini CMD ["python3.6", "main.py", "service"] diff --git a/config.ini b/config.ini index 4aa96d6..2504e02 100644 --- a/config.ini +++ b/config.ini @@ -1,10 +1,10 @@ [service] -tool = mewex_tester +tool = mewex1 root = /samba/requests/ -rabbit_host = 10.17.0.85 -rabbit_user = clarin -rabbit_password = clarin123 +rabbit_host = rabbitmq +rabbit_user = test +rabbit_password = test queue_prefix =nlp_ [tool] diff --git a/requirements.txt b/requirements.txt index b474d0c..7f4464b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,4 @@ nlp_ws lxml ujson configparser -tqdm pandas diff --git a/src/mewex.py b/src/mewex.py index 8ce9c04..35d0314 100644 --- a/src/mewex.py +++ b/src/mewex.py @@ -3,7 +3,7 @@ import os import re import io -from zipfile import ZipFile +import shutil from tqdm import tqdm import pandas as pd @@ -27,19 +27,20 @@ class MewexWorker(NLPWorker): if os.path.isdir(input_path) else (input_path,) ) - i = 0 - if len(args['input_files']) > 10000: - listOfFiles = [args['input_files'][x:x+1000] for x in range(0, len(args['input_files']), 1000)] - for file in listOfFiles: - i = i + 1 + counter = 0 + files = args['input_files'] + if len(files) > 1000: + list_files = [files[x:x + 1000] for x in range(0, len(files), 1000)] + for file in list_files: + counter += 1 args['input_files'] = file - output = output_path + f"/{i}" + output = output_path + f"/{counter}" if not os.path.exists(output): os.makedirs(output) args['output_file'] = output + "/mewex.csv" mwl.call_mewex(**args) - self.merge_data(output_path) option = True + self.merge_data(output_path, counter) else: if not os.path.exists(output_path): os.makedirs(output_path) @@ -52,15 +53,25 @@ class MewexWorker(NLPWorker): self.cut_lines(output_path + "/mewexlemmatized.csv", output_path + "/mewexshort.csv", 1000) - def merge_data(self, path): - all_files = os.listdir(path) - mewex = pd.read_csv(f'{path}/1/mewex.csv', header=None, skiprows=2, delimiter='\t') - all_files.remove('1') - for filename in tqdm(all_files): - mewex = mewex.append(pd.read_csv(f'{path}/{filename}/mewex.csv', header=None, skiprows=2, delimiter='\t')) - mewex.to_csv(f'{path}/mewex.csv', index=False, - header=['Rank', 'Quantity', 'Realtion', 'Base form', 'All forms'], sep='\t') - + def merge_data(self, path, counter): + """Merge subfolder/mewex.csv to one mewex.csv.""" + mewex = pd.read_csv(f'{path}/1/mewex.csv', + header=None, + skiprows=2, + delimiter='\t') + shutil.rmtree(f'{path}/1') + for filename in tqdm(range(2, counter + 1)): + mewex = mewex.append(pd.read_csv(f'{path}/{filename}/mewex.csv', + header=None, + skiprows=2, + delimiter='\t', + engine='python')) + shutil.rmtree(f'{path}/{filename}') + mewex.to_csv(f'{path}/mewex.csv', + index=False, + header=['Rank', 'Quantity', + 'Realtion', 'Base form', 'All forms'], + sep='\t') def cut_lines(self, inf, outf, lines): """.""" -- GitLab From 22c707ac9d5151f5022c77808857e1f22e0e2b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?= <bartlomiej.bojanowski@pwr.edu.pl> Date: Tue, 30 Mar 2021 15:42:17 +0200 Subject: [PATCH 3/4] Refactor. --- src/mewex.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/mewex.py b/src/mewex.py index 35d0314..8c25ae9 100644 --- a/src/mewex.py +++ b/src/mewex.py @@ -4,9 +4,8 @@ import os import re import io import shutil -from tqdm import tqdm -import pandas as pd +import pandas as pd import mewexlib as mwl import WrapLem from nlp_ws import NLPWorker @@ -27,13 +26,15 @@ class MewexWorker(NLPWorker): if os.path.isdir(input_path) else (input_path,) ) - counter = 0 files = args['input_files'] - if len(files) > 1000: - list_files = [files[x:x + 1000] for x in range(0, len(files), 1000)] - for file in list_files: - counter += 1 - args['input_files'] = file + option = True + step = 1000 + counter = 0 + if len(files) > step: + for counter, chunk in enumerate( + [files[x:x + step] for x in range(0, len(files), step)] + ): + args['input_files'] = chunk output = output_path + f"/{counter}" if not os.path.exists(output): os.makedirs(output) @@ -60,7 +61,7 @@ class MewexWorker(NLPWorker): skiprows=2, delimiter='\t') shutil.rmtree(f'{path}/1') - for filename in tqdm(range(2, counter + 1)): + for filename in range(2, counter + 1): mewex = mewex.append(pd.read_csv(f'{path}/{filename}/mewex.csv', header=None, skiprows=2, @@ -90,10 +91,8 @@ class MewexWorker(NLPWorker): """.""" input_file = io.open(inf, "r", encoding="utf-8") output_file = open(outf, "w") - if option: - next(input_file) - else: - next(input_file) + next(input_file) + if not option: next(input_file) output_file.write( "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n") -- GitLab From f28903c25cfbff677acac27a47f85a9d03edea19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?= <bartlomiej.bojanowski@pwr.edu.pl> Date: Thu, 29 Apr 2021 10:09:44 +0200 Subject: [PATCH 4/4] Fix mewex --- src/mewex.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mewex.py b/src/mewex.py index 8c25ae9..f8869fc 100644 --- a/src/mewex.py +++ b/src/mewex.py @@ -28,7 +28,7 @@ class MewexWorker(NLPWorker): ) files = args['input_files'] option = True - step = 1000 + step = 10 counter = 0 if len(files) > step: for counter, chunk in enumerate( @@ -56,12 +56,12 @@ class MewexWorker(NLPWorker): def merge_data(self, path, counter): """Merge subfolder/mewex.csv to one mewex.csv.""" - mewex = pd.read_csv(f'{path}/1/mewex.csv', + mewex = pd.read_csv(f'{path}/0/mewex.csv', header=None, skiprows=2, delimiter='\t') - shutil.rmtree(f'{path}/1') - for filename in range(2, counter + 1): + shutil.rmtree(f'{path}/0') + for filename in range(1, counter + 1): mewex = mewex.append(pd.read_csv(f'{path}/{filename}/mewex.csv', header=None, skiprows=2, -- GitLab