From d9b6204e04a99abb350de2d7ab866094eeee0466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Bojanowski?= <bartlomiej.piotr.bojanowski@gmail.com>
Date: Wed, 31 Mar 2021 10:25:04 +0000
Subject: [PATCH] Developer

---
 requirements.txt |   3 +-
 src/mewex.py     | 233 ++++++++++++++++++++++++++++------------------
 2 files changed, 142 insertions(+), 94 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e129cd1..7f4464b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ Cython
 nlp_ws
 lxml
 ujson
-configparser
\ No newline at end of file
+configparser
+pandas
diff --git a/src/mewex.py b/src/mewex.py
index ce80d3a..8c25ae9 100644
--- a/src/mewex.py
+++ b/src/mewex.py
@@ -1,93 +1,140 @@
-"""Implementation of MeWex Worker."""
-# !/usr/bin/python3
-import os
-import re
-import io
-
-
-import mewexlib as mwl
-import WrapLem
-from nlp_ws import NLPWorker
-
-
-class MewexWorker(NLPWorker):
-    """Implements mewex worker."""
-
-    def init(self):
-        """Initialize worker."""
-        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
-
-    def process(self, input_path, task_options, output_path):
-        """Running nlp process."""
-        args = _parse_mewex_options(task_options.get('mewex_options') or {})
-        args['input_files'] = (
-            [os.path.join(input_path, f) for f in os.listdir(input_path)]
-            if os.path.isdir(input_path)
-            else (input_path,)
-        )
-        if not os.path.exists(output_path):
-            os.makedirs(output_path)
-        args['output_file'] = output_path + "/mewex.csv"
-
-        mwl.call_mewex(**args)
-        self.lemmatize(output_path + "/mewex.csv",
-                       output_path + "/mewexlemmatized.csv")
-        self.cut_lines(output_path + "/mewexlemmatized.csv",
-                       output_path + "/mewexshort.csv", 1000)
-
-    def cut_lines(self, inf, outf, lines):
-        """."""
-        f = open(inf, "r")
-        copy = open(outf, "w")
-        n = 0
-        for line in f:
-            copy.write(line)
-            n = n + 1
-            if n > lines:
-                break
-        f.close()
-        copy.close()
-
-    def lemmatize(self, inf, outf):
-        """."""
-        input_file = io.open(inf, "r", encoding="utf-8")
-        output_file = open(outf, "w")
-        next(input_file)
-        next(input_file)  # First two rows are header rows, so just skip them
-        output_file.write(
-            "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")
-        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
-        basereg = re.compile(r'[^:]+:([^ ]+)')
-        for line in input_file:
-            splited = line.strip().split('\t')
-            orthtuple = orthreg.findall(splited[4])
-            baselist = basereg.findall(splited[3])
-            base = u' '.join(baselist)
-            orth = orthtuple[0][0].strip()
-            tag = orthtuple[0][1]
-            result = self._lemmatizer.lemmatizeS(orth, base, tag, False)
-            splited.insert(4, result)
-            output_file.write('\t'.join(splited) + '\n')
-        input_file.close()
-        output_file.close()
-
-
-_OPT_DISPATCH = dict(
-    ranker_func=lambda val: mwl.RankerFunction[val],
-    dispersion_func=lambda val: mwl.DispersionFunction[val],
-    wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
-)
-
-
-def _parse_mewex_options(opts):
-    args = {}
-
-    while opts:
-        key, val = opts.popitem()
-
-        if key in _OPT_DISPATCH:
-            val = _OPT_DISPATCH[key](val)
-
-        args[key] = val
-
-    return args
+"""Implementation of MeWex Worker."""
+# !/usr/bin/python3
+import os
+import re
+import io
+import shutil
+
+import pandas as pd
+import mewexlib as mwl
+import WrapLem
+from nlp_ws import NLPWorker
+
+
+class MewexWorker(NLPWorker):
+    """Implements mewex worker."""
+
+    def init(self):
+        """Initialize worker."""
+        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
+
+    def process(self, input_path, task_options, output_path):
+        """Run MeWeX on the input, then lemmatize and shorten the result.
+
+        More than ``step`` input files are processed in chunks, one
+        numbered subfolder of ``output_path`` per chunk, and the chunk
+        results are merged into a single ``mewex.csv`` afterwards.
+        """
+        args = _parse_mewex_options(task_options.get('mewex_options') or {})
+        args['input_files'] = (
+            [os.path.join(input_path, f) for f in os.listdir(input_path)]
+            if os.path.isdir(input_path)
+            else (input_path,)
+        )
+        files = args['input_files']
+        step = 1000
+        # A merged mewex.csv has one header row, a direct one has two.
+        merged = len(files) > step
+        if merged:
+            counter = 0
+            for counter, start in enumerate(range(0, len(files), step)):
+                output = output_path + f"/{counter}"
+                os.makedirs(output, exist_ok=True)
+                args['input_files'] = files[start:start + step]
+                args['output_file'] = output + "/mewex.csv"
+                mwl.call_mewex(**args)
+            self.merge_data(output_path, counter)
+        else:
+            os.makedirs(output_path, exist_ok=True)
+            args['output_file'] = output_path + "/mewex.csv"
+            mwl.call_mewex(**args)
+        self.lemmatize(output_path + "/mewex.csv",
+                       output_path + "/mewexlemmatized.csv",
+                       option=merged)
+        self.cut_lines(output_path + "/mewexlemmatized.csv",
+                       output_path + "/mewexshort.csv", 1000)
+
+    def merge_data(self, path, counter):
+        """Merge the per-chunk mewex.csv files into ``path``/mewex.csv.
+
+        Chunk folders are numbered ``0`` .. ``counter`` (inclusive) and
+        are removed once the merged file, which has a single header
+        row, has been written.
+        """
+        folders = [f'{path}/{number}' for number in range(counter + 1)]
+        # Read every cell as text so numbers are not re-formatted.
+        frames = [pd.read_csv(f'{folder}/mewex.csv',
+                              header=None,
+                              skiprows=2,
+                              delimiter='\t',
+                              dtype=str,
+                              keep_default_na=False,
+                              engine='python')
+                  for folder in folders]
+        # DataFrame.append was removed in pandas 2.0; concat instead.
+        mewex = pd.concat(frames, ignore_index=True)
+        mewex.to_csv(f'{path}/mewex.csv',
+                     index=False,
+                     header=['Rank', 'Quantity',
+                             'Realtion', 'Base form', 'All forms'],
+                     sep='\t')
+        for folder in folders:
+            shutil.rmtree(folder)
+
+    def cut_lines(self, inf, outf, lines):
+        """Copy the header row and the first ``lines`` rows to ``outf``."""
+        with open(inf, "r", encoding="utf-8") as source, \
+                open(outf, "w", encoding="utf-8") as copy:
+            for n, line in enumerate(source):
+                if n > lines:
+                    break
+                copy.write(line)
+
+    def lemmatize(self, inf, outf, option):
+        """Add a lemmatized-form column to the MeWeX result.
+
+        ``option`` tells how many header rows ``inf`` starts with: one
+        when true (file written by merge_data), two when false (file
+        written directly by MeWeX).
+        """
+        header = ("Rank\tQuantity\tRealtion\tBase form\t"
+                  "Lemmatized form\tAll forms\n")
+        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
+        basereg = re.compile(r'[^:]+:([^ ]+)')
+        with io.open(inf, "r", encoding="utf-8") as input_file, \
+                io.open(outf, "w", encoding="utf-8") as output_file:
+            next(input_file)
+            if not option:
+                next(input_file)
+            output_file.write(header)
+            for line in input_file:
+                splited = line.strip().split('\t')
+                orthtuple = orthreg.findall(splited[4])
+                baselist = basereg.findall(splited[3])
+                base = u' '.join(baselist)
+                orth = orthtuple[0][0].strip()
+                tag = orthtuple[0][1]
+                result = self._lemmatizer.lemmatizeS(orth, base, tag, False)
+                splited.insert(4, result)
+                output_file.write('\t'.join(splited) + '\n')
+
+
+_OPT_DISPATCH = dict(
+    ranker_func=lambda val: mwl.RankerFunction[val],
+    dispersion_func=lambda val: mwl.DispersionFunction[val],
+    wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
+)
+
+
+def _parse_mewex_options(opts):
+    args = {}
+
+    while opts:
+        key, val = opts.popitem()
+
+        if key in _OPT_DISPATCH:
+            val = _OPT_DISPATCH[key](val)
+
+        args[key] = val
+
+    return args
-- 
GitLab