Add feature to split data

parent d3d60e06
Pipeline #2440 failed with stage
in 32 seconds
......@@ -54,5 +54,6 @@ RUN cd mewex/mewexlib/ && \
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY ./config.ini ./config.ini
CMD ["python3.6", "main.py", "service"]
[service]
tool = mewex1
tool = mewex_tester
root = /samba/requests/
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
queue_prefix = nlp_
[tool]
......
......@@ -3,4 +3,6 @@ Cython
nlp_ws
lxml
ujson
configparser
\ No newline at end of file
configparser
tqdm
pandas
"""Implementation of MeWex Worker."""
# !/usr/bin/python3
import os
import re
import io
import mewexlib as mwl
import WrapLem
from nlp_ws import NLPWorker
class MewexWorker(NLPWorker):
    """Implements mewex worker.

    Runs the MeWeX collocation extractor over the request's input files,
    lemmatizes the extracted phrases and produces a truncated preview file.
    """

    def init(self):
        """Initialize worker: build the cascade lemmatizer once per worker."""
        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()

    def process(self, input_path, task_options, output_path):
        """Running nlp process.

        :param input_path: a single input file or a directory of input files.
        :param task_options: request options; ``mewex_options`` (if present)
            is translated to ``mewexlib`` arguments.
        :param output_path: directory that receives ``mewex.csv``,
            ``mewexlemmatized.csv`` and ``mewexshort.csv``.
        """
        args = _parse_mewex_options(task_options.get('mewex_options') or {})
        # A directory is expanded to all of its entries; a plain file is
        # wrapped in a one-element tuple so call_mewex always gets a sequence.
        args['input_files'] = (
            [os.path.join(input_path, f) for f in os.listdir(input_path)]
            if os.path.isdir(input_path)
            else (input_path,)
        )
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        raw_csv = os.path.join(output_path, "mewex.csv")
        lemmatized_csv = os.path.join(output_path, "mewexlemmatized.csv")
        args['output_file'] = raw_csv
        mwl.call_mewex(**args)
        self.lemmatize(raw_csv, lemmatized_csv)
        self.cut_lines(lemmatized_csv,
                       os.path.join(output_path, "mewexshort.csv"), 1000)

    def cut_lines(self, inf, outf, lines):
        """Copy the header row plus the first ``lines`` data rows of *inf*.

        Writes ``lines + 1`` physical lines in total (the extra one is the
        header produced by :meth:`lemmatize`).
        """
        # Context managers guarantee both handles close even if write fails
        # (the original left them open on any exception).
        with open(inf, "r") as src, open(outf, "w") as dst:
            for n, line in enumerate(src, start=1):
                dst.write(line)
                if n > lines:
                    break

    def lemmatize(self, inf, outf):
        """Lemmatize each collocation row of *inf* into *outf*.

        Input columns (tab-separated): rank, quantity, relation, base forms,
        orth forms.  A "Lemmatized form" column is inserted at index 4.
        """
        # Hoisted out of the loop: compile once, reuse per line.
        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
        basereg = re.compile(r'[^:]+:([^ ]+)')
        # Output opened as UTF-8 explicitly: lemmas may be non-ASCII and the
        # platform default encoding is not guaranteed to handle them.
        with io.open(inf, "r", encoding="utf-8") as input_file, \
                open(outf, "w", encoding="utf-8") as output_file:
            next(input_file)
            next(input_file)  # First two rows are header rows, so just skip them
            # NOTE: "Realtion" typo kept byte-for-byte — downstream consumers
            # may match on this exact header.
            output_file.write(
                "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")
            for line in input_file:
                splited = line.strip().split('\t')
                orthtuple = orthreg.findall(splited[4])
                baselist = basereg.findall(splited[3])
                base = u' '.join(baselist)
                orth = orthtuple[0][0].strip()
                tag = orthtuple[0][1]
                result = self._lemmatizer.lemmatizeS(orth, base, tag, False)
                splited.insert(4, result)
                output_file.write('\t'.join(splited) + '\n')
# Converters from raw request option values to mewexlib enum values,
# keyed by option name; options absent from this table pass through as-is.
_OPT_DISPATCH = {
    'ranker_func': lambda name: mwl.RankerFunction[name],
    'dispersion_func': lambda name: mwl.DispersionFunction[name],
    'wccl_rels': lambda names: frozenset(mwl.WCCLRelation[n] for n in names),
}
def _parse_mewex_options(opts):
    """Translate raw request options into ``call_mewex`` keyword arguments.

    Values whose key appears in ``_OPT_DISPATCH`` are converted to the
    corresponding mewexlib enum value; everything else passes through.

    :param opts: mapping of option name to raw value; NOT mutated (the
        previous implementation drained it with ``popitem``, emptying the
        caller's ``task_options['mewex_options']`` dict as a side effect).
    :return: a new dict of keyword arguments for ``mwl.call_mewex``.
    """
    args = {}
    for key, val in opts.items():
        if key in _OPT_DISPATCH:
            val = _OPT_DISPATCH[key](val)
        args[key] = val
    return args
"""Implementation of MeWex Worker."""
# !/usr/bin/python3
import os
import re
import io
from zipfile import ZipFile
from tqdm import tqdm
import pandas as pd
import mewexlib as mwl
import WrapLem
from nlp_ws import NLPWorker
class MewexWorker(NLPWorker):
    """Implements mewex worker.

    For very large requests the input files are split into batches, MeWeX is
    run per batch into numbered subdirectories, and the per-batch results are
    merged back into a single CSV before lemmatization.
    """

    # Number of input files processed by one call_mewex run when batching.
    _BATCH_SIZE = 1000
    # Above this many input files the batched path is taken.
    _BATCH_THRESHOLD = 10000

    def init(self):
        """Initialize worker: build the cascade lemmatizer once per worker."""
        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()

    def process(self, input_path, task_options, output_path):
        """Running nlp process.

        :param input_path: a single input file or a directory of input files.
        :param task_options: request options; ``mewex_options`` (if present)
            is translated to ``mewexlib`` arguments.
        :param output_path: directory that receives ``mewex.csv``,
            ``mewexlemmatized.csv`` and ``mewexshort.csv`` (plus numbered
            per-batch subdirectories when batching kicks in).
        """
        args = _parse_mewex_options(task_options.get('mewex_options') or {})
        args['input_files'] = (
            [os.path.join(input_path, f) for f in os.listdir(input_path)]
            if os.path.isdir(input_path)
            else (input_path,)
        )
        batched = len(args['input_files']) > self._BATCH_THRESHOLD
        if batched:
            all_inputs = args['input_files']
            batches = [all_inputs[x:x + self._BATCH_SIZE]
                       for x in range(0, len(all_inputs), self._BATCH_SIZE)]
            # Batch directories are named "1", "2", ... under output_path;
            # merge_data later globs them back together.
            for i, batch in enumerate(batches, start=1):
                args['input_files'] = batch
                batch_dir = os.path.join(output_path, str(i))
                if not os.path.exists(batch_dir):
                    os.makedirs(batch_dir)
                args['output_file'] = os.path.join(batch_dir, "mewex.csv")
                mwl.call_mewex(**args)
            self.merge_data(output_path)
        else:
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            args['output_file'] = os.path.join(output_path, "mewex.csv")
            mwl.call_mewex(**args)
        self.lemmatize(os.path.join(output_path, "mewex.csv"),
                       os.path.join(output_path, "mewexlemmatized.csv"),
                       option=batched)
        self.cut_lines(os.path.join(output_path, "mewexlemmatized.csv"),
                       os.path.join(output_path, "mewexshort.csv"), 1000)

    def merge_data(self, path):
        """Merge every per-batch ``<path>/<n>/mewex.csv`` into ``<path>/mewex.csv``.

        Assumes *path* contains only the numbered batch directories at this
        point (it is listed before the merged CSV is written).
        """
        batch_dirs = os.listdir(path)
        # BUGFIX: the previous code re-read a literal, broken path for every
        # batch instead of interpolating the directory name, and grew the
        # frame with the deprecated (removed in pandas 2.0), quadratic
        # DataFrame.append; read each batch once and concat in one pass.
        frames = [
            pd.read_csv(os.path.join(path, d, 'mewex.csv'),
                        header=None, skiprows=2, delimiter='\t')
            for d in tqdm(batch_dirs)
        ]
        merged = pd.concat(frames, ignore_index=True)
        # NOTE: "Realtion" typo kept byte-for-byte — downstream consumers
        # may match on this exact header.
        merged.to_csv(
            os.path.join(path, 'mewex.csv'), index=False,
            header=['Rank', 'Quantity', 'Realtion', 'Base form', 'All forms'],
            sep='\t')

    def cut_lines(self, inf, outf, lines):
        """Copy the header row plus the first ``lines`` data rows of *inf*.

        Writes ``lines + 1`` physical lines in total (the extra one is the
        header produced by :meth:`lemmatize`).
        """
        # Context managers guarantee both handles close even if write fails
        # (the original left them open on any exception).
        with open(inf, "r") as src, open(outf, "w") as dst:
            for n, line in enumerate(src, start=1):
                dst.write(line)
                if n > lines:
                    break

    def lemmatize(self, inf, outf, option):
        """Lemmatize each collocation row of *inf* into *outf*.

        :param option: True when *inf* came from :meth:`merge_data` (one
            header row written by ``to_csv``); False for raw MeWeX output,
            which carries two header rows.
        """
        # Hoisted out of the loop: compile once, reuse per line.
        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
        basereg = re.compile(r'[^:]+:([^ ]+)')
        # Output opened as UTF-8 explicitly: lemmas may be non-ASCII and the
        # platform default encoding is not guaranteed to handle them.
        with io.open(inf, "r", encoding="utf-8") as input_file, \
                open(outf, "w", encoding="utf-8") as output_file:
            header_rows = 1 if option else 2
            for _ in range(header_rows):
                next(input_file)
            output_file.write(
                "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")
            for line in input_file:
                splited = line.strip().split('\t')
                orthtuple = orthreg.findall(splited[4])
                baselist = basereg.findall(splited[3])
                base = u' '.join(baselist)
                orth = orthtuple[0][0].strip()
                tag = orthtuple[0][1]
                result = self._lemmatizer.lemmatizeS(orth, base, tag, False)
                splited.insert(4, result)
                output_file.write('\t'.join(splited) + '\n')
# Converters from raw request option values to mewexlib enum values,
# keyed by option name; options absent from this table pass through as-is.
_OPT_DISPATCH = {
    'ranker_func': lambda name: mwl.RankerFunction[name],
    'dispersion_func': lambda name: mwl.DispersionFunction[name],
    'wccl_rels': lambda names: frozenset(mwl.WCCLRelation[n] for n in names),
}
def _parse_mewex_options(opts):
    """Translate raw request options into ``call_mewex`` keyword arguments.

    Values whose key appears in ``_OPT_DISPATCH`` are converted to the
    corresponding mewexlib enum value; everything else passes through.

    :param opts: mapping of option name to raw value; NOT mutated (the
        previous implementation drained it with ``popitem``, emptying the
        caller's ``task_options['mewex_options']`` dict as a side effect).
    :return: a new dict of keyword arguments for ``mwl.call_mewex``.
    """
    args = {}
    for key, val in opts.items():
        if key in _OPT_DISPATCH:
            val = _OPT_DISPATCH[key](val)
        args[key] = val
    return args
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment