Commit 47844db7 authored by Mateusz Gniewkowski

Merge branch 'developer' into 'master'

Developer

See merge request !3
parents d3d60e06 d9b6204e
Pipeline #2800 passed with stages
in 13 minutes and 34 seconds
......@@ -3,4 +3,5 @@ Cython
nlp_ws
lxml
ujson
configparser
\ No newline at end of file
configparser
pandas
"""Implementation of MeWex Worker."""
# !/usr/bin/python3
import os
import re
import io
import mewexlib as mwl
import WrapLem
from nlp_ws import NLPWorker
class MewexWorker(NLPWorker):
    """Worker that runs MeWeX on its input and lemmatizes the results.

    Each task writes three tab-separated files into the output directory:
    ``mewex.csv`` (written by ``mwl.call_mewex``), ``mewexlemmatized.csv``
    (the same rows with a lemmatized-form column inserted) and
    ``mewexshort.csv`` (the leading lines of the lemmatized file).
    """

    def init(self):
        """Initialize worker by assembling the cascade lemmatizer."""
        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()

    def process(self, input_path, task_options, output_path):
        """Run MeWeX on ``input_path`` and post-process its output.

        :param input_path: a single input file, or a directory whose
            entries are all handed to MeWeX.
        :param task_options: mapping of task settings; its optional
            ``mewex_options`` entry is converted by
            ``_parse_mewex_options`` and passed to ``mwl.call_mewex`` as
            keyword arguments.
        :param output_path: directory for the result files; created when
            it does not exist yet.
        """
        args = _parse_mewex_options(task_options.get('mewex_options') or {})
        if os.path.isdir(input_path):
            args['input_files'] = [
                os.path.join(input_path, name)
                for name in os.listdir(input_path)
            ]
        else:
            args['input_files'] = (input_path,)

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_path, exist_ok=True)

        mewex_csv = os.path.join(output_path, "mewex.csv")
        lemmatized_csv = os.path.join(output_path, "mewexlemmatized.csv")
        short_csv = os.path.join(output_path, "mewexshort.csv")

        args['output_file'] = mewex_csv
        mwl.call_mewex(**args)
        self.lemmatize(mewex_csv, lemmatized_csv)
        self.cut_lines(lemmatized_csv, short_csv, 1000)

    def cut_lines(self, inf, outf, lines):
        """Copy the beginning of ``inf`` to ``outf``.

        Copying stops once more than ``lines`` lines have been written, so
        up to ``lines + 1`` lines are copied (presumably the header row
        plus ``lines`` data rows -- TODO confirm this is intended).

        :param inf: path of the file to read.
        :param outf: path of the file to write (overwritten).
        :param lines: line count after which copying stops.
        """
        # Both files are UTF-8, matching what lemmatize() writes; the
        # context managers close them even if copying fails half-way.
        with open(inf, "r", encoding="utf-8") as source, \
                open(outf, "w", encoding="utf-8") as target:
            for count, line in enumerate(source, start=1):
                target.write(line)
                if count > lines:
                    break

    def lemmatize(self, inf, outf):
        """Add a lemmatized-form column to a MeWeX result file.

        The first two rows of ``inf`` are treated as headers and replaced
        by a single header row. Every following row is split on tabs; the
        lemmatizer result is inserted as a new fifth column and the row is
        written to ``outf``.

        :param inf: path of the tab-separated MeWeX output (UTF-8).
        :param outf: path of the file to write (UTF-8, overwritten).
        :raises IndexError: if a row has fewer than five columns or its
            fifth column does not match the expected ``N:orth(tag)`` form.
        """
        # Fifth column ("All forms"): "<digits>:<orth>(<tag>)..." -- only
        # the first match in the cell is used.
        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
        # Fourth column ("Base form"): the text after each "prefix:" up to
        # the next space.
        basereg = re.compile(r'[^:]+:([^ ]+)')

        # The output is written as UTF-8 explicitly: the input is UTF-8,
        # and relying on the platform default encoding could fail on
        # non-ASCII forms.
        with open(inf, "r", encoding="utf-8") as input_file, \
                open(outf, "w", encoding="utf-8") as output_file:
            # First two rows are header rows, so just skip them. The
            # default keeps an empty or truncated input from raising
            # StopIteration.
            next(input_file, None)
            next(input_file, None)
            # NOTE: the "Realtion" misspelling is kept on purpose; files
            # with this header may already be consumed elsewhere.
            output_file.write(
                "Rank\tQuantity\tRealtion\tBase form\t"
                "Lemmatized form\tAll forms\n")
            for line in input_file:
                fields = line.strip().split('\t')
                orth_matches = orthreg.findall(fields[4])
                base = u' '.join(basereg.findall(fields[3]))
                orth = orth_matches[0][0].strip()
                tag = orth_matches[0][1]
                lemma = self._lemmatizer.lemmatizeS(orth, base, tag, False)
                fields.insert(4, lemma)
                output_file.write('\t'.join(fields) + '\n')
# Converters for option values that need translating before they are
# passed on: each one looks the incoming value up by subscript in the
# matching ``mewexlib`` object (presumably enumerations keyed by name --
# verify against mewexlib). Options without an entry are passed unchanged.
_OPT_DISPATCH = dict(
    ranker_func=lambda val: mwl.RankerFunction[val],
    dispersion_func=lambda val: mwl.DispersionFunction[val],
    wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
)


def _parse_mewex_options(opts):
    """Translate raw ``mewex_options`` into keyword arguments.

    Values whose key appears in ``_OPT_DISPATCH`` are run through the
    matching converter; every other value is copied as is.

    :param opts: mapping of option name to raw value; a falsy value
        (``None`` or an empty mapping) yields an empty result.
    :return: a new dict of converted options.

    The input mapping is left untouched. Draining it with ``popitem()``
    would wipe the ``mewex_options`` entry of the caller's task options.
    """
    if not opts:
        return {}
    args = {}
    for key, val in opts.items():
        convert = _OPT_DISPATCH.get(key)
        args[key] = val if convert is None else convert(val)
    return args
"""Implementation of MeWex Worker."""
# !/usr/bin/python3
import os
import re
import io
import shutil
import pandas as pd
import mewexlib as mwl
import WrapLem
from nlp_ws import NLPWorker
class MewexWorker(NLPWorker):
    """Worker that runs MeWeX on its input and lemmatizes the results.

    Each task writes three tab-separated files into the output directory:
    ``mewex.csv`` (MeWeX output, merged from several runs when the input
    is large), ``mewexlemmatized.csv`` (the same rows with a
    lemmatized-form column inserted) and ``mewexshort.csv`` (the leading
    lines of the lemmatized file).
    """

    # Largest number of input files handed to a single MeWeX call.
    _CHUNK_SIZE = 1000
    # Line limit passed to cut_lines() for the short result file.
    _SHORT_LINES = 1000

    def init(self):
        """Initialize worker by assembling the cascade lemmatizer."""
        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()

    def process(self, input_path, task_options, output_path):
        """Run MeWeX on ``input_path`` and post-process its output.

        With more than ``_CHUNK_SIZE`` input files, MeWeX is called once
        per chunk of files, each call writing into its own numbered
        sub-directory of ``output_path``; the partial results are then
        combined by ``merge_data``.

        :param input_path: a single input file, or a directory whose
            entries are all handed to MeWeX.
        :param task_options: mapping of task settings; its optional
            ``mewex_options`` entry is converted by
            ``_parse_mewex_options`` and passed to ``mwl.call_mewex`` as
            keyword arguments.
        :param output_path: directory for the result files; created when
            it does not exist yet.
        """
        args = _parse_mewex_options(task_options.get('mewex_options') or {})
        if os.path.isdir(input_path):
            files = [os.path.join(input_path, name)
                     for name in os.listdir(input_path)]
        else:
            files = (input_path,)

        os.makedirs(output_path, exist_ok=True)
        mewex_csv = os.path.join(output_path, "mewex.csv")
        lemmatized_csv = os.path.join(output_path, "mewexlemmatized.csv")
        short_csv = os.path.join(output_path, "mewexshort.csv")

        step = self._CHUNK_SIZE
        merged = len(files) > step
        if merged:
            last_chunk = 0
            # Chunk directories are numbered from 1 because merge_data()
            # reads directories 1..counter. Numbering from 0 would leave
            # the first chunk out of the merged result.
            for last_chunk, start in enumerate(
                    range(0, len(files), step), start=1):
                chunk_dir = os.path.join(output_path, str(last_chunk))
                os.makedirs(chunk_dir, exist_ok=True)
                args['input_files'] = files[start:start + step]
                args['output_file'] = os.path.join(chunk_dir, "mewex.csv")
                mwl.call_mewex(**args)
            self.merge_data(output_path, last_chunk)
        else:
            args['input_files'] = files
            args['output_file'] = mewex_csv
            mwl.call_mewex(**args)

        # A merged file carries one header row (written by merge_data);
        # the file written directly by MeWeX is read as having two.
        self.lemmatize(mewex_csv, lemmatized_csv, option=merged)
        self.cut_lines(lemmatized_csv, short_csv, self._SHORT_LINES)

    def merge_data(self, path, counter):
        """Merge ``<path>/<n>/mewex.csv`` for n = 1..counter into one file.

        The first two rows of every partial file are skipped as headers,
        the remaining rows are concatenated in directory order and written
        to ``<path>/mewex.csv`` with a single header row. The numbered
        directories are removed afterwards.

        NOTE(review): rows are concatenated as they are; the ``Rank``
        values of the partial results are not recomputed -- confirm this
        is the intended way of combining chunks.

        :param path: directory containing the numbered sub-directories.
        :param counter: number of the last sub-directory to merge.
        :raises ValueError: if ``counter`` is smaller than 1.
        """
        if counter < 1:
            raise ValueError("counter must be at least 1")

        chunk_dirs = [os.path.join(path, str(index))
                      for index in range(1, counter + 1)]
        # The same parser engine is used for every chunk so that all
        # partial files are read under identical rules.
        frames = [
            pd.read_csv(os.path.join(chunk_dir, 'mewex.csv'),
                        header=None,
                        skiprows=2,
                        delimiter='\t',
                        engine='python')
            for chunk_dir in chunk_dirs
        ]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        pd.concat(frames).to_csv(
            os.path.join(path, 'mewex.csv'),
            index=False,
            header=['Rank', 'Quantity',
                    'Realtion', 'Base form', 'All forms'],
            sep='\t')
        # Partial results are deleted only after the merged file has been
        # written, so a failure above does not lose data.
        for chunk_dir in chunk_dirs:
            shutil.rmtree(chunk_dir)

    def cut_lines(self, inf, outf, lines):
        """Copy the beginning of ``inf`` to ``outf``.

        Copying stops once more than ``lines`` lines have been written, so
        up to ``lines + 1`` lines are copied (presumably the header row
        plus ``lines`` data rows -- TODO confirm this is intended).

        :param inf: path of the file to read.
        :param outf: path of the file to write (overwritten).
        :param lines: line count after which copying stops.
        """
        # Both files are UTF-8, matching what lemmatize() writes; the
        # context managers close them even if copying fails half-way.
        with open(inf, "r", encoding="utf-8") as source, \
                open(outf, "w", encoding="utf-8") as target:
            for count, line in enumerate(source, start=1):
                target.write(line)
                if count > lines:
                    break

    def lemmatize(self, inf, outf, option=False):
        """Add a lemmatized-form column to a MeWeX result file.

        The header rows of ``inf`` are replaced by a single header row.
        Every following row is split on tabs; the lemmatizer result is
        inserted as a new fifth column and the row is written to ``outf``.

        :param inf: path of the tab-separated MeWeX output (UTF-8).
        :param outf: path of the file to write (UTF-8, overwritten).
        :param option: ``True`` when ``inf`` has one header row (a file
            produced by ``merge_data``), ``False`` when it has two.
            Defaults to ``False`` so the two-argument call still works.
        :raises IndexError: if a row has fewer than five columns or its
            fifth column does not match the expected ``N:orth(tag)`` form.
        """
        # Fifth column ("All forms"): "<digits>:<orth>(<tag>)..." -- only
        # the first match in the cell is used.
        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
        # Fourth column ("Base form"): the text after each "prefix:" up to
        # the next space.
        basereg = re.compile(r'[^:]+:([^ ]+)')

        # The output is written as UTF-8 explicitly: the input is UTF-8,
        # and relying on the platform default encoding could fail on
        # non-ASCII forms.
        with open(inf, "r", encoding="utf-8") as input_file, \
                open(outf, "w", encoding="utf-8") as output_file:
            # The default keeps an empty or truncated input from raising
            # StopIteration while the header rows are skipped.
            next(input_file, None)
            if not option:
                next(input_file, None)
            # NOTE: the "Realtion" misspelling is kept on purpose; files
            # with this header may already be consumed elsewhere.
            output_file.write(
                "Rank\tQuantity\tRealtion\tBase form\t"
                "Lemmatized form\tAll forms\n")
            for line in input_file:
                fields = line.strip().split('\t')
                orth_matches = orthreg.findall(fields[4])
                base = u' '.join(basereg.findall(fields[3]))
                orth = orth_matches[0][0].strip()
                tag = orth_matches[0][1]
                lemma = self._lemmatizer.lemmatizeS(orth, base, tag, False)
                fields.insert(4, lemma)
                output_file.write('\t'.join(fields) + '\n')
# Converters for option values that need translating before they are
# passed on: each one looks the incoming value up by subscript in the
# matching ``mewexlib`` object (presumably enumerations keyed by name --
# verify against mewexlib). Options without an entry are passed unchanged.
_OPT_DISPATCH = dict(
    ranker_func=lambda val: mwl.RankerFunction[val],
    dispersion_func=lambda val: mwl.DispersionFunction[val],
    wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
)


def _parse_mewex_options(opts):
    """Translate raw ``mewex_options`` into keyword arguments.

    Values whose key appears in ``_OPT_DISPATCH`` are run through the
    matching converter; every other value is copied as is.

    :param opts: mapping of option name to raw value; a falsy value
        (``None`` or an empty mapping) yields an empty result.
    :return: a new dict of converted options.

    The input mapping is left untouched. Draining it with ``popitem()``
    would wipe the ``mewex_options`` entry of the caller's task options.
    """
    if not opts:
        return {}
    args = {}
    for key, val in opts.items():
        convert = _OPT_DISPATCH.get(key)
        args[key] = val if convert is None else convert(val)
    return args
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment