From 6013eddc38c924ed10173f4eed8ed314c4a46ddc Mon Sep 17 00:00:00 2001
From: bbojanowski <bartlomiej.piotr.bojanowski@gmail.com>
Date: Thu, 2 Jan 2020 14:28:54 +0100
Subject: [PATCH] Refactoring: move the cluto worker into a src package, add
 tox and GitLab CI configs

---
 docker-compose.yml => .docker-compose.yml |   6 +-
 .gitignore                                |   5 +
 .gitlab-ci.yml                            |  18 ++
 cluto.py                                  | 368 ----------------------
 cluto_worker.py                           |  17 -
 main.py                                   |  23 ++
 src/__init__.py                           |   0
 src/cluto.py                              | 374 ++++++++++++++++++++++
 tox.ini                                   |  45 +++
 9 files changed, 468 insertions(+), 388 deletions(-)
 rename docker-compose.yml => .docker-compose.yml (77%)
 create mode 100644 .gitignore
 create mode 100644 .gitlab-ci.yml
 delete mode 100644 cluto.py
 delete mode 100755 cluto_worker.py
 create mode 100755 main.py
 create mode 100644 src/__init__.py
 create mode 100644 src/cluto.py
 create mode 100644 tox.ini

diff --git a/docker-compose.yml b/.docker-compose.yml
similarity index 77%
rename from docker-compose.yml
rename to .docker-compose.yml
index 6693a3d..b96f838 100644
--- a/docker-compose.yml
+++ b/.docker-compose.yml
@@ -7,10 +7,10 @@ services:
     working_dir: /home/worker
     entrypoint:
       - python2
-      - cluto_worker.py
+      - main.py
     volumes:
      - /samba:/samba
      - ./config.ini:/home/worker/config.ini
-      - ./cluto_worker.py:/home/worker/cluto_worker.py
-      - ./cluto.py:/home/worker/cluto.py
+      - ./main.py:/home/worker/main.py
+      - ./src:/home/worker/src
     restart: always
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0bf0524
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.pyc
+
+venv/
+.idea/
+.tox/
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..b90b766
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,18 @@
+image: clarinpl/python:2.7
+
+cache:
+  paths:
+    - .tox
+
+before_script:
+  - pip install tox==2.9.1
+
+pep8:
+  script:
+    - tox -v -e pep8
+
+docstyle:
+  script:
+    - tox -v -e docstyle
+
+
diff --git a/cluto.py b/cluto.py
deleted file mode 100644
index 0b5f373..0000000
--- a/cluto.py
+++ /dev/null
@@ -1,368 +0,0 @@
-#!/usr/bin/python
-from __future__ import print_function
-import argparse as _argp
-import csv as _csv
-import json,re,os
-import io
-
-import numpy as _np
-import time,glob,os,shutil,tempfile
-from subprocess import call
-from sklearn.externals import joblib
-import xlsxwriter
-
-verbose = False
-
-def loadData(inputFile):
-    with open(inputFile) as json_ifs:
-        jsonVal = json.load(json_ifs)
-        rowlabels=_np.asarray(jsonVal["rowlabels"])
-        data=_np.asarray(jsonVal["arr"]);
-        jsonVal["arr"]=None
-    return data, rowlabels
-
-
-
-def saveXLSX(names,clustering_path,outfile):
-    srow=3;scol=4;
-    with open(clustering_path) as f:
-        groups = f.readlines()
-    names_out = []
-    ind=0
-    workbook = xlsxwriter.Workbook(outfile)
-    worksheet = workbook.add_worksheet("result");
-    worksheet.write(srow,scol,'Nazwy')
-    worksheet.write(srow,scol+1,'Grupa')
-    srow+=1
-    for name in names:
-        worksheet.write(srow,scol,name)
-        worksheet.write(srow,scol+1,groups[ind])
-        srow+=1
-        ind=ind+1
-    workbook.close()
-
-
-def toHeatMapJSON(cluto_path,clustering_path,names,outfile):
-    with open(clustering_path) as f:
-        groups = f.readlines()
-    names_out = []
-    ind=0
-    for name in names:
-        tmp_hsh = {
-            'name': name,
-            'group': groups[ind].strip()
-        }
-        names_out.append(tmp_hsh)
-        ind=ind+1
-
-    array = []
-    line_num=0
-    with open(cluto_path) as f:
-        content = f.readlines()
-
-    regex = r"\d+\s[0-9]*\.?[0-9]+"
-    for line in content[1:]:
-        arr = re.findall(regex, line)
-        for node in arr:
-            node = node.split()
-            tmp_hsh = {
-                'source': str(line_num),
-                'target': str((int(node[0]) - 1)),
-                'value': str(float(node[1]))
-            }
-            array.append(tmp_hsh)
-        line_num += 1
-
-    out = {'nodes': names_out, 'links': array}
-    json_hsh = json.dumps(out)
-    with open(outfile, 'w') as outfile:
-        outfile.write(json_hsh)
-
-
-# Reads data from set of csvs from fextor
-# Creats matrix and normalise it (divides by tok_count)
-
-def number_of_clusters(options,rowlabels):
-    if 'no_clusters' in options:
-        no_clusters=options['no_clusters']
-        if not isinstance( no_clusters, int ):
-            no_clusters=2
-        if no_clusters<2:
-            no_clusters=2
-    else:
-        no_clusters=2
-
-    if int(no_clusters) > len(rowlabels):
-        no_clusters = str(len(rowlabels))
-
-    return no_clusters
-
-def save_clutofiles(mat,rlabels,clabels,cluto_path,rlabel_path,clabel_path):
-    # Save cluto file
-    with open(cluto_path, 'w') as cluto_ofs:
-        # Print header:
-        # <num_rows> <num_cols> <num_nonzero>
-        print(
-            len(rlabels),
-            len(clabels),
-            _np.count_nonzero(mat),
-            file=cluto_ofs,
-        )
-
-        for row in mat:
-            buf = []
-
-            for idx in row.nonzero()[0]:
-                buf.append('{} {}'.format(idx+1, row[idx]))
-
-            print(' '.join(buf), file=cluto_ofs)
-
-    # Save label files
-    with io.open(rlabel_path, 'w') as rlabel_ofs:
-        for lab in rlabels:
-            print(lab, file=rlabel_ofs)
-
-    with io.open(clabel_path, 'w') as clabel_ofs:
-        for lab in clabels:
-            print(lab, file=clabel_ofs)
-
-def run_cluto(options,no_clusters,cluto_input_file,rlabel_path,cl_out_file,clutoout):
-
-    cluto_path="./cluto-2.1.2/Linux-x86_64/scluster"
-    with open(clutoout, "w") as outfile:
-        call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree',
-              '-rlabelfile', rlabel_path,
-              '-plotformat','ps',
-              '-'+options['analysis_type']+'=' + cl_out_file],stdout=outfile)
-
-    #print("fulltree")
-
-
-def write_node(node_id, tree_dict,name2group):
-
-    child_node_strings = []
-
-    if node_id in tree_dict:
-        for child in tree_dict[node_id]:
-            child_node_strings.append(write_node(child, tree_dict,name2group))
-
-    if len(child_node_strings)==0:
-        node_str = '{"id":"node_' + node_id + '", "group":' + str(name2group[node_id]) + ', "name":"' + node_id + '", "data":{}, "children":['
-    else:
-        node_str = '{"id":"node_' + node_id + '", "name":"' + node_id + '", "data":{}, "children":['
-    node_str += ', '.join(child_node_strings)
-    node_str += ']}'
-    return node_str
-
-def run_convert2json(cl_out_file,out_file,labels,clustering_path):
-    with open(clustering_path) as f:
-        groups = f.readlines()
-    name2group={}
-    for i,gr in enumerate(groups):
-        name2group[labels[i]]=int(gr)
-
-    tree_dict = {}
-    with open(cl_out_file, 'rb') as infile:
-        for i, line in enumerate(infile.readlines()):
-            if i < len(labels):
-                child = labels[i]
-            else:
-                child = str(i)
-            parent = line.split(' ')[0]
-            if parent not in tree_dict:
-                tree_dict[parent] = [child]
-            else:
-                tree_dict[parent].append(child)
-
-    out_string = ''
-    out_string += write_node(tree_dict['-1'][0], tree_dict,name2group)
-    out_string += ''
-
-    with io.open(out_file, 'wb') as outfile:
-        outfile.write(out_string.encode("utf8"))
-
-
-def run_convert(cl_out_file,out_file,options,rowlabels):
-    density='150'
-    if options['analysis_type']!='plottree':
-        density='300'
-
-    if len(rowlabels)<50:
-        density='100'
-
-    if len(rowlabels)<25:
-        density='50'
-
-
-    if options['analysis_type']=='plottree':
-        resize='50%'
-    else:
-        resize='100%'
-
-    #print density
-    call(['convert','-density',density,cl_out_file,'png:'+out_file])
-
-def run(inputFile, outputFile, options):
-
-    data,rowlabels=loadData(inputFile+"/similarity.json");
-    if not "analysis_type" in options:
-        options["analysis_type"]="plottree" ;
-    no_clusters=number_of_clusters(options,rowlabels)
-    temp_folder = tempfile.mkdtemp()
-
-    if not os.path.exists(temp_folder):
-        os.mkdir(temp_folder)
-
-    cluto_path=os.path.join(temp_folder, 'matrix.txt');
-    rlabel_path=os.path.join(temp_folder, 'documents_ids.txt');
-    cluto_out_path=os.path.join(temp_folder, 'cluto.ps');
-
-    shutil.copy2(os.path.join(inputFile, 'matrix.txt'),os.path.join(temp_folder, 'matrix.txt'))
-    with io.open(rlabel_path, 'w') as rlabel_ofs:
-        for lab in rowlabels:
-            print(lab, file=rlabel_ofs)
-
-
-    run_cluto(options,no_clusters,cluto_path,rlabel_path,cluto_out_path,os.path.join(temp_folder, 'clutoout.txt'))
-
-    if not os.path.exists(outputFile):
-        os.mkdir(outputFile)
-    shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),os.path.join(outputFile,'clutoout.txt'))
-    run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),os.path.join(outputFile,'result.json'),rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)))
-    run_convert(cluto_out_path,os.path.join(outputFile,'result.png'),options,rowlabels);
-
-    #for heatmap
-    toHeatMapJSON(cluto_path,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),rowlabels,outputFile+"/data.json");
-
-
-    #Check if they are required by any tool
-    shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.clustering'))
-    shutil.copyfile(cluto_path,os.path.join(outputFile,'matrix.txt'))
-    joblib.dump(rowlabels,outputFile+"/rowlabels.pkl");
-
-    #Results in JSON:
-    with open(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)), 'rb') as f:
-        clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()]
-
-
-    #to be deleted, but now required by visualisation
-    res={"clusters":clusters,"rowlabels":rowlabels.tolist()}
-    with open(os.path.join(outputFile,'clusters.json'), 'w') as outfile:
-        json.dump(res, outfile)
-
-    labels=getLablesFromNames(rowlabels);
-    labels["groupnames"]["clusters"]=list(set(clusters));
-    labels["groups"]["clusters"]=clusters;
-    with open(os.path.join(outputFile,'labels.json'), 'w') as outfile:
-        json.dump(labels, outfile)
-
-
-    #results in XLSX
-    saveXLSX(rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.xlsx'))
-
-    #Coping results for next tools
-    #for visulisation (mds)
-    #similarity matrix
-    shutil.copyfile(os.path.join(inputFile, 'similarity.json'),os.path.join(outputFile,'similarity.json'))
-    shutil.copyfile(os.path.join(inputFile, 'distance.json'),os.path.join(outputFile,'distance.json'))
-
-    #for featsel
-    #matrix after selection and weighting
-    shutil.copyfile(os.path.join(inputFile, 'weighted.json'),os.path.join(outputFile,'weighted.json'))
-
-    #remove temp_folder
-    shutil.rmtree(temp_folder)
-
-
-def getLablesFromNames(row_labels):
-    #data, data_cleaned,shortest_row_len, row_labels = get_data(row)
-    shortest_row_len = 10000000
-
-    data=[];
-    for i, t in enumerate(row_labels):
-        t=str(t.encode('utf-8'));
-        t = re.split("[,._\-:]", t)
-        t = list(map(str.strip, t))
-        data.append(t)
-        if shortest_row_len > len(t):
-            shortest_row_len = len(t)
-
-    repeating = set(data[0])
-    for s in data[1:]:
-        repeating.intersection_update(s)
-    repeating = list(repeating)
-
-    for i, d in enumerate(data):
-        for r in repeating:
-            if r in d:
-                d.remove(r)
-        data[i] = d
-
-
-    first_lvl_categories = set()
-    first_lvl_name = 'first level'
-
-    second_lvl_categories = set()
-    second_lvl_name = 'second level'
-
-    last_lvl_categories = set()
-    last_lvl_name = 'last level'
-
-    second_lvl_idx = 1
-    if shortest_row_len < 2:
-        second_lvl_idx = 0
-
-
-    for row in data:
-        if len(row)<=second_lvl_idx:
-            second_lvl_idx=0;
-        first_lvl_categories.add(row[0])
-
-        second_lvl_categories.add(row[second_lvl_idx])
-        last_lvl_categories.add('_'.join(row[0:-1]))
-
-    group_names = {
-        first_lvl_name: list(first_lvl_categories),
-        second_lvl_name: list(second_lvl_categories),
-        last_lvl_name: list(last_lvl_categories)
-    }
-
-    groups = {
-        first_lvl_name: [],
-        second_lvl_name: [],
-        last_lvl_name: []
-    }
-
-    for i, row in enumerate(data):
-        groups[first_lvl_name].append(row[0])
-        groups[second_lvl_name].append(row[second_lvl_idx])
-        groups[last_lvl_name].append('_'.join(row[0:-1]))
-
-
-    return {
-        'rowlabels':row_labels.tolist(),
-        'groups': groups,
-        'groupnames': group_names
-    }
-
-
-def test0():
-    cluto_path="./cluto-2.1.2/Linux-x86_64/scluster"
-    no_clusters=2;
-    options={};
-    cluto_input_file="test/dane.bin"
-    options['analysis_type']='plottree'
-    cl_out_file="out.ps"
-    rowlabels=[];
-    out_file="out.png"
-    call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree',
-          '-plotformat','ps',
-          '-'+options['analysis_type']+'=' + cl_out_file])
-    run_convert(cl_out_file,out_file,options,rowlabels)
-
-def test1():
-    run("in","out",{})
-
-if __name__ == '__main__':
-    test1();
-    
\ No newline at end of file
diff --git a/cluto_worker.py b/cluto_worker.py
deleted file mode 100755
index 4343af4..0000000
--- a/cluto_worker.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-import nlp_ws
-import cluto
-
-class ClutoWorker(nlp_ws.NLPWorker):
-    #def init(self):
-        #self.logger.log(INFO, "Iobber model loaded form "+ self.config['model-dir'])
-
-    def process(self, inputFile, taskOptions, outputFile):
-        cluto.run(inputFile,outputFile,taskOptions)
-
-
-if __name__ == '__main__':
-    nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True)
-
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..2b75021
--- /dev/null
+++ b/main.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Implementation of Cluto Worker."""
+
+from src import cluto
+
+import nlp_ws
+
+
+class ClutoWorker(nlp_ws.NLPWorker):
+    """Implementation class of Cluto Worker."""
+
+#    def init(self):
+#        self.logger.log(INFO, "Iobber model loaded from " +
+#                        self.config['model-dir'])
+
+    def process(self, inputFile, taskOptions, outputFile):
+        """Starting process."""
+        cluto.run(inputFile, outputFile, taskOptions)
+
+
+if __name__ == '__main__':
+    nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True)
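Note: this refactor drops the test0()/test1() smoke entry points that the old
cluto.py carried in its __main__ block. A minimal equivalent against the new
layout might look like the sketch below (not part of the patch; the "in" and
"out" paths are hypothetical, and "in" must already hold the similarity.json,
matrix.txt, distance.json and weighted.json files that src.cluto.run() reads):

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    """Hypothetical standalone smoke run, mirroring the removed test1()."""

    from src import cluto

    if __name__ == '__main__':
        # "out" is created by run() if missing; options keys are the ones
        # run() and number_of_clusters() actually look at.
        cluto.run("in", "out", {"analysis_type": "plottree",
                                "no_clusters": 2})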
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cluto.py b/src/cluto.py
new file mode 100644
index 0000000..c10bfe5
--- /dev/null
+++ b/src/cluto.py
@@ -0,0 +1,374 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Implementation of cluto worker."""
+
+from __future__ import print_function
+import json
+import re
+import io
+
+import numpy as _np
+import os
+import shutil
+import tempfile
+
+from subprocess import call
+from sklearn.externals import joblib
+import xlsxwriter
+
+verbose = False
+
+
+def loadData(inputFile):
+    """Loading data."""
+    with open(inputFile) as json_ifs:
+        jsonVal = json.load(json_ifs)
+        rowlabels = _np.asarray(jsonVal["rowlabels"])
+        data = _np.asarray(jsonVal["arr"])
+        jsonVal["arr"] = None
+    return data, rowlabels
+
+
+def saveXLSX(names, clustering_path, outfile):
+    """Saving to XLSX."""
+    srow = 3
+    scol = 4
+    with open(clustering_path) as f:
+        groups = f.readlines()
+    ind = 0
+    workbook = xlsxwriter.Workbook(outfile)
+    worksheet = workbook.add_worksheet("result")
+    worksheet.write(srow, scol, 'Nazwy')
+    worksheet.write(srow, scol + 1, 'Grupa')
+    srow += 1
+    for name in names:
+        worksheet.write(srow, scol, name)
+        worksheet.write(srow, scol + 1, groups[ind])
+        srow += 1
+        ind = ind + 1
+    workbook.close()
+
+
+def toHeatMapJSON(cluto_path, clustering_path, names, outfile):
+    """Saving to JSON."""
+    with open(clustering_path) as f:
+        groups = f.readlines()
+    names_out = []
+    ind = 0
+    for name in names:
+        tmp_hsh = {
+            'name': name,
+            'group': groups[ind].strip()
+        }
+        names_out.append(tmp_hsh)
+        ind = ind + 1
+
+    array = []
+    line_num = 0
+    with open(cluto_path) as f:
+        content = f.readlines()
+
+    regex = r"\d+\s[0-9]*\.?[0-9]+"
+    for line in content[1:]:
+        arr = re.findall(regex, line)
+        for node in arr:
+            node = node.split()
+            tmp_hsh = {
+                'source': str(line_num),
+                'target': str((int(node[0]) - 1)),
+                'value': str(float(node[1]))
+            }
+            array.append(tmp_hsh)
+        line_num += 1
+
+    out = {'nodes': names_out, 'links': array}
+    json_hsh = json.dumps(out)
+    with open(outfile, 'w') as outfile:
+        outfile.write(json_hsh)
+
+
+# Reads data from a set of csvs from fextor
+# Creates a matrix and normalises it (divides by tok_count)
+
+def number_of_clusters(options, rowlabels):
+    """Calculation of the number of clusters."""
+    if 'no_clusters' in options:
+        no_clusters = options['no_clusters']
+        if not isinstance(no_clusters, int):
+            no_clusters = 2
+        if no_clusters < 2:
+            no_clusters = 2
+    else:
+        no_clusters = 2
+    if int(no_clusters) > len(rowlabels):
+        no_clusters = str(len(rowlabels))
+    return no_clusters
+
+
+def save_clutofiles(mat, rlabels, clabels, cluto_path, rlabel_path,
+                    clabel_path):
+    """Saving cluto file."""
+    with open(cluto_path, 'w') as cluto_ofs:
+        # Print header:
+        # <num_rows> <num_cols> <num_nonzero>
+        print(
+            len(rlabels),
+            len(clabels),
+            _np.count_nonzero(mat),
+            file=cluto_ofs,
+        )
+        for row in mat:
+            buf = []
+            for idx in row.nonzero()[0]:
+                buf.append('{} {}'.format(idx + 1, row[idx]))
+            print(' '.join(buf), file=cluto_ofs)
+    # Save label files
+    with io.open(rlabel_path, 'w') as rlabel_ofs:
+        for lab in rlabels:
+            print(lab, file=rlabel_ofs)
+
+    with io.open(clabel_path, 'w') as clabel_ofs:
+        for lab in clabels:
+            print(lab, file=clabel_ofs)
+
+
+def run_cluto(options, no_clusters, cluto_input_file, rlabel_path,
+              cl_out_file, clutoout):
+    """Running cluto."""
+    cluto_path = "./cluto-2.1.2/Linux-x86_64/scluster"
+    with open(clutoout, "w") as outfile:
+        call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree',
+              '-rlabelfile', rlabel_path,
+              '-plotformat', 'ps',
+              '-' + options['analysis_type'] +
+              '=' + cl_out_file], stdout=outfile)
+
+    # print("fulltree")
+
+
+def write_node(node_id, tree_dict, name2group):
+    """Writing node."""
+    child_node_strings = []
+
+    if node_id in tree_dict:
+        for child in tree_dict[node_id]:
+            child_node_strings.append(write_node(child, tree_dict,
+                                                 name2group))
+    if len(child_node_strings) == 0:
+        node_str = '{"id":"node_' + node_id + '", "group":' + \
+            str(name2group[node_id]) + \
+            ', "name":"' + \
+            node_id + \
+            '", "data":{}, "children":['
+    else:
+        node_str = '{"id":"node_' + node_id + '", "name":"' + node_id + \
+            '", "data":{}, "children":['
+    node_str += ', '.join(child_node_strings)
+    node_str += ']}'
+    return node_str
+
+
+def run_convert2json(cl_out_file, out_file, labels, clustering_path):
+    """Converting to json."""
+    with open(clustering_path) as f:
+        groups = f.readlines()
+    name2group = {}
+    for i, gr in enumerate(groups):
+        name2group[labels[i]] = int(gr)
+
+    tree_dict = {}
+    with open(cl_out_file, 'rb') as infile:
+        for i, line in enumerate(infile.readlines()):
+            if i < len(labels):
+                child = labels[i]
+            else:
+                child = str(i)
+            parent = line.split(' ')[0]
+            if parent not in tree_dict:
+                tree_dict[parent] = [child]
+            else:
+                tree_dict[parent].append(child)
+
+    out_string = ''
+    out_string += write_node(tree_dict['-1'][0], tree_dict, name2group)
+    out_string += ''
+
+    with io.open(out_file, 'wb') as outfile:
+        outfile.write(out_string.encode("utf8"))
+
+
+def run_convert(cl_out_file, out_file, options, rowlabels):
+    """Running convert."""
+    density = '150'
+    if options['analysis_type'] != 'plottree':
+        density = '300'
+
+    if len(rowlabels) < 50:
+        density = '100'
+
+    if len(rowlabels) < 25:
+        density = '50'
+
+#   if options['analysis_type'] == 'plottree':
+#       resize = '50%'
+#   else:
+#       resize = '100%'
+
+    # print density
+    call(['convert', '-density', density, cl_out_file, 'png:' + out_file])
+
+
+def run(inputFile, outputFile, options):
+    """Running cluto worker."""
+    data, rowlabels = loadData(inputFile + "/similarity.json")
+    if "analysis_type" not in options:
+        options["analysis_type"] = "plottree"
+    no_clusters = number_of_clusters(options, rowlabels)
+    temp_folder = tempfile.mkdtemp()
+
+    if not os.path.exists(temp_folder):
+        os.mkdir(temp_folder)
+
+    cluto_path = os.path.join(temp_folder, 'matrix.txt')
+    rlabel_path = os.path.join(temp_folder, 'documents_ids.txt')
+    cluto_out_path = os.path.join(temp_folder, 'cluto.ps')
+
+    shutil.copy2(os.path.join(inputFile, 'matrix.txt'),
+                 os.path.join(temp_folder, 'matrix.txt'))
+    with io.open(rlabel_path, 'w') as rlabel_ofs:
+        for lab in rowlabels:
+            print(lab, file=rlabel_ofs)
+
+    run_cluto(options, no_clusters, cluto_path, rlabel_path,
+              cluto_out_path, os.path.join(temp_folder, 'clutoout.txt'))
+
+    if not os.path.exists(outputFile):
+        os.mkdir(outputFile)
+    shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),
+                    os.path.join(outputFile, 'clutoout.txt'))
+    run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),
+                     os.path.join(outputFile, 'result.json'), rowlabels,
+                     os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                  str(no_clusters)))
+    run_convert(cluto_out_path, os.path.join(outputFile, 'result.png'),
+                options, rowlabels)
+
+    # for heatmap
+    toHeatMapJSON(cluto_path, os.path.join(temp_folder,
+                                           'matrix.txt.clustering.' +
+                                           str(no_clusters)), rowlabels,
+                  outputFile + "/data.json")
+
+    # Check if they are required by any tool
+    shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                 str(no_clusters)),
+                    os.path.join(outputFile, 'result.clustering'))
+    shutil.copyfile(cluto_path, os.path.join(outputFile, 'matrix.txt'))
+    joblib.dump(rowlabels, outputFile + "/rowlabels.pkl")
+
+    # Results in JSON:
+    with open(os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                           str(no_clusters)), 'rb') as f:
+        clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()]
+
+    # to be deleted, but now required by visualisation
+    res = {"clusters": clusters, "rowlabels": rowlabels.tolist()}
+    with open(os.path.join(outputFile, 'clusters.json'), 'w') as outfile:
+        json.dump(res, outfile)
+
+    labels = getLablesFromNames(rowlabels)
+    labels["groupnames"]["clusters"] = list(set(clusters))
+    labels["groups"]["clusters"] = clusters
+    with open(os.path.join(outputFile, 'labels.json'), 'w') as outfile:
+        json.dump(labels, outfile)
+
+    # results in XLSX
+    saveXLSX(rowlabels, os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                     str(no_clusters)),
+             os.path.join(outputFile, 'result.xlsx'))
+
+    # Copying results for next tools
+    # for visualisation (mds)
+    # similarity matrix
+    shutil.copyfile(os.path.join(inputFile, 'similarity.json'),
+                    os.path.join(outputFile, 'similarity.json'))
+    shutil.copyfile(os.path.join(inputFile, 'distance.json'),
+                    os.path.join(outputFile, 'distance.json'))
+
+    # for featsel
+    # matrix after selection and weighting
+    shutil.copyfile(os.path.join(inputFile, 'weighted.json'),
+                    os.path.join(outputFile, 'weighted.json'))
+
+    # remove temp_folder
+    shutil.rmtree(temp_folder)
+
+
+def getLablesFromNames(row_labels):
+    """Getting labels from names."""
+    # data, data_cleaned,shortest_row_len, row_labels = get_data(row)
+    shortest_row_len = 10000000
+
+    data = []
+    for i, t in enumerate(row_labels):
+        t = str(t.encode('utf-8'))
+        t = re.split(r"[,._\-:]", t)
+        t = list(map(str.strip, t))
+        data.append(t)
+        if shortest_row_len > len(t):
+            shortest_row_len = len(t)
+
+    repeating = set(data[0])
+    for s in data[1:]:
+        repeating.intersection_update(s)
+    repeating = list(repeating)
+
+    for i, d in enumerate(data):
+        for r in repeating:
+            if r in d:
+                d.remove(r)
+        data[i] = d
+
+    first_lvl_categories = set()
+    first_lvl_name = 'first level'
+
+    second_lvl_categories = set()
+    second_lvl_name = 'second level'
+
+    last_lvl_categories = set()
+    last_lvl_name = 'last level'
+
+    second_lvl_idx = 1
+    if shortest_row_len < 2:
+        second_lvl_idx = 0
+
+    for row in data:
+        if len(row) <= second_lvl_idx:
+            second_lvl_idx = 0
+        first_lvl_categories.add(row[0])
+
+        second_lvl_categories.add(row[second_lvl_idx])
+        last_lvl_categories.add('_'.join(row[0:-1]))
+
+    group_names = {
+        first_lvl_name: list(first_lvl_categories),
+        second_lvl_name: list(second_lvl_categories),
+        last_lvl_name: list(last_lvl_categories)
+    }
+
+    groups = {
+        first_lvl_name: [],
+        second_lvl_name: [],
+        last_lvl_name: []
+    }
+
+    for i, row in enumerate(data):
+        groups[first_lvl_name].append(row[0])
+        groups[second_lvl_name].append(row[second_lvl_idx])
+        groups[last_lvl_name].append('_'.join(row[0:-1]))
+
+    return {
+        'rowlabels': row_labels.tolist(),
+        'groups': groups,
+        'groupnames': group_names
+    }
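A note on number_of_clusters() above: every invalid option collapses to 2, but
the upper clamp returns str(len(rowlabels)) rather than an int, and callers
only get away with that because the value is passed through str() again before
reaching scluster. A small sketch of the observed behaviour (not part of the
patch; the three-document label list is hypothetical):

    # -*- coding: utf-8 -*-
    """Sketch of number_of_clusters() edge cases, run from the repo root."""

    from src.cluto import number_of_clusters

    labels = ['doc1', 'doc2', 'doc3']  # hypothetical row labels

    assert number_of_clusters({}, labels) == 2                    # no option
    assert number_of_clusters({'no_clusters': '7'}, labels) == 2  # non-int
    assert number_of_clusters({'no_clusters': 0}, labels) == 2    # below 2
    # more clusters than rows: clamped, but note the str return type
    assert number_of_clusters({'no_clusters': 9}, labels) == '3'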
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..517bd1c
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,45 @@
+[tox]
+envlist = pep8,docstyle
+skipsdist = True
+
+[testenv:pep8]
+deps =
+    flake8
+basepython = python2.7
+commands =
+    flake8 {posargs}
+
+[testenv:docstyle]
+deps =
+    pydocstyle
+basepython = python2.7
+commands =
+    pydocstyle --verbose {posargs}
+
+[flake8]
+# W504 skipped because it is overeager and unnecessary
+ignore = W504
+show-source = True
+exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
+import-order-style = pep8
+max-line-length = 80
+
+
+[pydocstyle]
+# D104 Missing docstring in public package
+# D203 1 blank line required before class docstring
+# D213 Multi-line docstring summary should start at the second line
+# D214 Section is over-indented
+# D215 Section underline is over-indented
+# D401 First line should be in imperative mood; try rephrasing
+# D405 Section name should be properly capitalized
+# D406 Section name should end with a newline
+# D407 Missing dashed underline after section
+# D408 Section underline should be in the line following the section's name
+# D409 Section underline should match the length of its name
+# D410 Missing blank line after section
+# D411 Missing blank line before section
+ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
+match-dir = ^(?!\.tox|venv).*
+match = ^(?!setup).*\.py
+
-- 
GitLab
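For reference, run() leaves the following artifacts in the output directory
(names taken from the src/cluto.py diff above). A quick post-run sanity check
might look like the sketch below (not part of the patch; the 'out' path is the
hypothetical output directory passed to run()):

    # -*- coding: utf-8 -*-
    """Quick existence check for the files run() writes (sketch)."""

    import os

    EXPECTED = [
        'clutoout.txt', 'result.json', 'result.png', 'data.json',
        'result.clustering', 'matrix.txt', 'rowlabels.pkl',
        'clusters.json', 'labels.json', 'result.xlsx',
        'similarity.json', 'distance.json', 'weighted.json',
    ]

    for name in EXPECTED:
        path = os.path.join('out', name)  # hypothetical output directory
        print('{} {}'.format('OK  ' if os.path.exists(path) else 'MISS',
                             path))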