Skip to content
Snippets Groups Projects
Commit d9b6204e authored by Bartłomiej Bojanowski, committed by Mateusz Gniewkowski
Browse files

Developer

parent d3d60e06
Branches
No related tags found
No related merge requests found
...@@ -4,3 +4,4 @@ nlp_ws ...@@ -4,3 +4,4 @@ nlp_ws
lxml lxml
ujson ujson
configparser configparser
pandas
...@@ -3,8 +3,9 @@ ...@@ -3,8 +3,9 @@
import os import os
import re import re
import io import io
import shutil
import pandas as pd
import mewexlib as mwl import mewexlib as mwl
import WrapLem import WrapLem
from nlp_ws import NLPWorker from nlp_ws import NLPWorker
...@@ -25,16 +26,54 @@ class MewexWorker(NLPWorker): ...@@ -25,16 +26,54 @@ class MewexWorker(NLPWorker):
if os.path.isdir(input_path) if os.path.isdir(input_path)
else (input_path,) else (input_path,)
) )
files = args['input_files']
option = True
step = 1000
counter = 0
if len(files) > step:
for counter, chunk in enumerate(
[files[x:x + step] for x in range(0, len(files), step)]
):
args['input_files'] = chunk
output = output_path + f"/{counter}"
if not os.path.exists(output):
os.makedirs(output)
args['output_file'] = output + "/mewex.csv"
mwl.call_mewex(**args)
option = True
self.merge_data(output_path, counter)
else:
if not os.path.exists(output_path): if not os.path.exists(output_path):
os.makedirs(output_path) os.makedirs(output_path)
args['output_file'] = output_path + "/mewex.csv" args['output_file'] = output_path + "/mewex.csv"
mwl.call_mewex(**args) mwl.call_mewex(**args)
option = False
self.lemmatize(output_path + "/mewex.csv", self.lemmatize(output_path + "/mewex.csv",
output_path + "/mewexlemmatized.csv") output_path + "/mewexlemmatized.csv",
option=option)
self.cut_lines(output_path + "/mewexlemmatized.csv", self.cut_lines(output_path + "/mewexlemmatized.csv",
output_path + "/mewexshort.csv", 1000) output_path + "/mewexshort.csv", 1000)
def merge_data(self, path, counter):
    """Merge every ``<path>/<i>/mewex.csv`` chunk into ``<path>/mewex.csv``.

    Chunk folders are numbered ``0 .. counter`` (inclusive), matching the
    ``enumerate`` numbering used when the chunks are produced. Each chunk
    file is read without its first two rows, the rows of all chunks are
    concatenated in folder order, and the chunk folder is deleted once it
    has been read. The merged file is written tab-separated with a single
    header row.

    :param path: directory that contains the numbered chunk folders and
        receives the merged ``mewex.csv``.
    :param counter: index of the last chunk folder.
    """
    frames = []
    # Start at 0: the first chunk lives in "<path>/0" and must be merged
    # (and cleaned up) like every other chunk.
    for index in range(counter + 1):
        chunk_dir = f'{path}/{index}'
        frames.append(pd.read_csv(f'{chunk_dir}/mewex.csv',
                                  header=None,
                                  skiprows=2,
                                  delimiter='\t',
                                  engine='python'))
        shutil.rmtree(chunk_dir)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    mewex = pd.concat(frames)
    # NOTE: the 'Realtion' spelling is kept on purpose so this header stays
    # identical to the one written by lemmatize().
    mewex.to_csv(f'{path}/mewex.csv',
                 index=False,
                 header=['Rank', 'Quantity',
                         'Realtion', 'Base form', 'All forms'],
                 sep='\t')
def cut_lines(self, inf, outf, lines): def cut_lines(self, inf, outf, lines):
""".""" """."""
f = open(inf, "r") f = open(inf, "r")
...@@ -48,12 +87,13 @@ class MewexWorker(NLPWorker): ...@@ -48,12 +87,13 @@ class MewexWorker(NLPWorker):
f.close() f.close()
copy.close() copy.close()
def lemmatize(self, inf, outf): def lemmatize(self, inf, outf, option):
""".""" """."""
input_file = io.open(inf, "r", encoding="utf-8") input_file = io.open(inf, "r", encoding="utf-8")
output_file = open(outf, "w") output_file = open(outf, "w")
next(input_file) next(input_file)
next(input_file) # First two rows are header rows, so just skip them if not option:
next(input_file)
output_file.write( output_file.write(
"Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n") "Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")
orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*') orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment