From 6013eddc38c924ed10173f4eed8ed314c4a46ddc Mon Sep 17 00:00:00 2001
From: bbojanowski <bartlomiej.piotr.bojanowski@gmail.com>
Date: Thu, 2 Jan 2020 14:28:54 +0100
Subject: [PATCH] Refactor layout, add CI and tox-based linting

Move the clustering code into a src/ package (cluto.py -> src/cluto.py),
rename the worker entry point cluto_worker.py -> main.py and update the
compose file accordingly (docker-compose.yml -> .docker-compose.yml),
and clean the code up to pass flake8 and pydocstyle. Add a .gitignore,
a GitLab CI pipeline with pep8 and docstyle jobs, and the tox
configuration those jobs run.

---
 docker-compose.yml => .docker-compose.yml |   4 +-
 .gitignore                                |   5 +
 .gitlab-ci.yml                            |  18 ++
 cluto.py                                  | 368 ---------------------
 cluto_worker.py                           |  17 -
 main.py                                   |  23 ++
 src/__init__.py                           |   0
 src/cluto.py                              | 374 ++++++++++++++++++++++
 tox.ini                                   |  45 +++
 9 files changed, 467 insertions(+), 387 deletions(-)
 rename docker-compose.yml => .docker-compose.yml (77%)
 create mode 100644 .gitignore
 create mode 100644 .gitlab-ci.yml
 delete mode 100644 cluto.py
 delete mode 100755 cluto_worker.py
 create mode 100755 main.py
 create mode 100644 src/__init__.py
 create mode 100644 src/cluto.py
 create mode 100644 tox.ini

diff --git a/docker-compose.yml b/.docker-compose.yml
similarity index 77%
rename from docker-compose.yml
rename to .docker-compose.yml
index 6693a3d..b96f838 100644
--- a/docker-compose.yml
+++ b/.docker-compose.yml
@@ -7,10 +7,10 @@ services:
     working_dir: /home/worker
     entrypoint:
           - python2
-          - cluto_worker.py
+          - main.py
     volumes:
         - /samba:/samba
         - ./config.ini:/home/worker/config.ini        
-        - ./cluto_worker.py:/home/worker/cluto_worker.py        
+        - ./main.py:/home/worker/main.py
         - ./cluto.py:/home/worker/cluto.py           
     restart: always    
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0bf0524
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.pyc
+
+venv/
+.idea/
+.tox/
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..b90b766
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,18 @@
+image: clarinpl/python:2.7
+
+cache:
+  paths:
+  - .tox
+
+before_script:
+  - pip install tox==2.9.1
+
+pep8:
+  script:
+   - tox -v -e pep8
+
+docstyle:
+  script:
+   - tox -v -e docstyle
+
+
diff --git a/cluto.py b/cluto.py
deleted file mode 100644
index 0b5f373..0000000
--- a/cluto.py
+++ /dev/null
@@ -1,368 +0,0 @@
-#!/usr/bin/python
-from __future__ import print_function
-import argparse as _argp
-import csv as _csv
-import json,re,os
-import io
-
-import numpy as _np
-import time,glob,os,shutil,tempfile
-from subprocess import call
-from sklearn.externals import joblib
-import xlsxwriter
-
-verbose = False
-
-def loadData(inputFile):
-	with open(inputFile) as json_ifs:
-			jsonVal = json.load(json_ifs)
-			rowlabels=_np.asarray(jsonVal["rowlabels"])
-			data=_np.asarray(jsonVal["arr"]);
-			jsonVal["arr"]=None
-			return data, rowlabels
-
-
-
-def saveXLSX(names,clustering_path,outfile):
-	srow=3;scol=4;
-	with open(clustering_path) as f:
-			groups = f.readlines()
-	names_out = []
-	ind=0
-	workbook = xlsxwriter.Workbook(outfile)
-	worksheet = workbook.add_worksheet("result");
-	worksheet.write(srow,scol,'Nazwy')
-	worksheet.write(srow,scol+1,'Grupa')
-	srow+=1
-	for name in names:
-		worksheet.write(srow,scol,name)
-		worksheet.write(srow,scol+1,groups[ind])
-		srow+=1
-		ind=ind+1
-	workbook.close()
-		
-
-def toHeatMapJSON(cluto_path,clustering_path,names,outfile):
-		with open(clustering_path) as f:
-			groups = f.readlines()
-		names_out = []
-		ind=0
-		for name in names:
-			tmp_hsh = {
-				'name': name,
-				'group': groups[ind].strip()
-			}
-			names_out.append(tmp_hsh)
-			ind=ind+1
-
-		array = []
-		line_num=0
-		with open(cluto_path) as f:
-			content = f.readlines()
-
-		regex = r"\d+\s[0-9]*\.?[0-9]+"
-		for line in content[1:]:
-			arr = re.findall(regex, line)
-			for node in arr:
-				node = node.split()
-				tmp_hsh = {
-					'source': str(line_num),
-					'target': str((int(node[0]) - 1)),
-					'value': str(float(node[1]))
-				}
-				array.append(tmp_hsh)
-			line_num += 1
-
-		out = {'nodes': names_out, 'links': array}
-		json_hsh = json.dumps(out)
-		with open(outfile, 'w') as outfile:
-			outfile.write(json_hsh)
-
-	
-# Reads data from set of csvs from fextor
-# Creats matrix and normalise it (divides by tok_count)
-
-def number_of_clusters(options,rowlabels):
-	if 'no_clusters' in options:
-		no_clusters=options['no_clusters']
-		if not isinstance( no_clusters, int ):
-			no_clusters=2
-		if no_clusters<2:
-			no_clusters=2
-	else:
-		no_clusters=2
-	
-	if int(no_clusters) > len(rowlabels):
-		no_clusters = str(len(rowlabels))
-		
-	return no_clusters
-
-def save_clutofiles(mat,rlabels,clabels,cluto_path,rlabel_path,clabel_path):
-	# Save cluto file
-	with open(cluto_path, 'w') as cluto_ofs:
-		# Print header:
-		# <num_rows> <num_cols> <num_nonzero>
-		print(
-				len(rlabels),
-				len(clabels),
-				_np.count_nonzero(mat),
-				file=cluto_ofs,
-				)
-
-		for row in mat:
-			buf = []
-
-			for idx in row.nonzero()[0]:
-				buf.append('{} {}'.format(idx+1, row[idx]))
-
-			print(' '.join(buf), file=cluto_ofs)
-
-	# Save label files
-	with io.open(rlabel_path, 'w') as rlabel_ofs:
-		for lab in rlabels:
-			print(lab, file=rlabel_ofs)
-
-	with io.open(clabel_path, 'w') as clabel_ofs:
-		for lab in clabels:
-			print(lab, file=clabel_ofs)
-
-def run_cluto(options,no_clusters,cluto_input_file,rlabel_path,cl_out_file,clutoout):
-	
-	cluto_path="./cluto-2.1.2/Linux-x86_64/scluster"
-	with open(clutoout, "w") as outfile:
-		call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree',
-			  '-rlabelfile', rlabel_path,
-			  '-plotformat','ps',
-			  '-'+options['analysis_type']+'=' + cl_out_file],stdout=outfile)
-		
-		#print("fulltree")
-				
-
-def write_node(node_id, tree_dict,name2group):
-	
-	child_node_strings = []
-
-	if node_id in tree_dict:
-		for child in tree_dict[node_id]:
-			child_node_strings.append(write_node(child, tree_dict,name2group))
-			
-	if len(child_node_strings)==0:
-		node_str = '{"id":"node_' + node_id + '", "group":' + str(name2group[node_id]) + ', "name":"' + node_id + '", "data":{}, "children":['
-	else:		
-		node_str = '{"id":"node_' + node_id + '", "name":"' + node_id + '", "data":{}, "children":['
-	node_str += ', '.join(child_node_strings)
-	node_str += ']}'
-	return node_str
-				
-def run_convert2json(cl_out_file,out_file,labels,clustering_path):
-	with open(clustering_path) as f:
-		groups = f.readlines()
-	name2group={}
-	for i,gr in enumerate(groups):
-		name2group[labels[i]]=int(gr)
-	
-	tree_dict = {}
-	with open(cl_out_file, 'rb') as infile:
-		for i, line in enumerate(infile.readlines()):
-			if i < len(labels):
-				child = labels[i]
-			else:
-				child = str(i)
-			parent = line.split(' ')[0]
-			if parent not in tree_dict:
-				tree_dict[parent] = [child]
-			else:
-				tree_dict[parent].append(child)
-
-	out_string = ''
-	out_string += write_node(tree_dict['-1'][0], tree_dict,name2group)
-	out_string += ''
-
-	with io.open(out_file, 'wb') as outfile:
-		outfile.write(out_string.encode("utf8"))
-
-				
-def run_convert(cl_out_file,out_file,options,rowlabels):
-	density='150'
-	if options['analysis_type']!='plottree':
-		density='300'
-	
-	if len(rowlabels)<50:
-		density='100'
-
-	if len(rowlabels)<25:
-		density='50'
-	
-		
-	if options['analysis_type']=='plottree':
-			resize='50%'
-	else:
-			resize='100%'
-	
-	#print density
-	call(['convert','-density',density,cl_out_file,'png:'+out_file])
-	
-def run(inputFile, outputFile, options):
-	
-	data,rowlabels=loadData(inputFile+"/similarity.json");
-	if not "analysis_type" in options:
-		options["analysis_type"]="plottree"	;
-	no_clusters=number_of_clusters(options,rowlabels)	
-	temp_folder = tempfile.mkdtemp()
-	
-	if not os.path.exists(temp_folder):
-			os.mkdir(temp_folder)
-	
-	cluto_path=os.path.join(temp_folder, 'matrix.txt');
-	rlabel_path=os.path.join(temp_folder, 'documents_ids.txt');
-	cluto_out_path=os.path.join(temp_folder, 'cluto.ps');
-
-	shutil.copy2(os.path.join(inputFile, 'matrix.txt'),os.path.join(temp_folder, 'matrix.txt'))
-	with io.open(rlabel_path, 'w') as rlabel_ofs:
-		for lab in rowlabels:
-			print(lab, file=rlabel_ofs)
-	
-	
-	run_cluto(options,no_clusters,cluto_path,rlabel_path,cluto_out_path,os.path.join(temp_folder, 'clutoout.txt')) 
-	
-	if not os.path.exists(outputFile):
-		os.mkdir(outputFile)
-	shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),os.path.join(outputFile,'clutoout.txt'))	
-	run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),os.path.join(outputFile,'result.json'),rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)))
-	run_convert(cluto_out_path,os.path.join(outputFile,'result.png'),options,rowlabels);
-	
-	#for heatmap
-	toHeatMapJSON(cluto_path,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),rowlabels,outputFile+"/data.json");
-	
-	
-	#Check if they are required by any tool
-	shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.clustering'))
-	shutil.copyfile(cluto_path,os.path.join(outputFile,'matrix.txt'))
-	joblib.dump(rowlabels,outputFile+"/rowlabels.pkl");
-	
-	#Results in JSON: 
-	with open(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)), 'rb') as f:
-		clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()]
-	
-	
-	#to be deleted, but now required by visualisation
-	res={"clusters":clusters,"rowlabels":rowlabels.tolist()}
-	with open(os.path.join(outputFile,'clusters.json'), 'w') as outfile:
-		json.dump(res, outfile)
-	
-	labels=getLablesFromNames(rowlabels);
-	labels["groupnames"]["clusters"]=list(set(clusters));
-	labels["groups"]["clusters"]=clusters;	
-	with open(os.path.join(outputFile,'labels.json'), 'w') as outfile:
-		json.dump(labels, outfile)
-	
-	
-	#results in XLSX
-	saveXLSX(rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.xlsx'))
-	
-	#Coping results for next tools
-	#for visulisation (mds)
-	#similarity matrix 
-	shutil.copyfile(os.path.join(inputFile, 'similarity.json'),os.path.join(outputFile,'similarity.json'))
-	shutil.copyfile(os.path.join(inputFile, 'distance.json'),os.path.join(outputFile,'distance.json'))
-	
-	#for featsel
-	#matrix after selection and weighting
-	shutil.copyfile(os.path.join(inputFile, 'weighted.json'),os.path.join(outputFile,'weighted.json'))
-	
-	#remove temp_folder	
-	shutil.rmtree(temp_folder)	
-
-	
-def getLablesFromNames(row_labels):
-    #data, data_cleaned,shortest_row_len, row_labels = get_data(row)
-    shortest_row_len = 10000000
-
-    data=[];
-    for i, t in enumerate(row_labels):
-        t=str(t.encode('utf-8'));
-        t = re.split("[,._\-:]", t)
-        t = list(map(str.strip, t))
-        data.append(t)
-        if shortest_row_len > len(t):
-            shortest_row_len = len(t)
-
-    repeating = set(data[0])
-    for s in data[1:]:
-        repeating.intersection_update(s)
-    repeating = list(repeating)
-
-    for i, d in enumerate(data):
-        for r in repeating:
-            if r in d:
-                d.remove(r)
-                data[i] = d
-
-	
-    first_lvl_categories = set()
-    first_lvl_name = 'first level'
-
-    second_lvl_categories = set()
-    second_lvl_name = 'second level'
-
-    last_lvl_categories = set()
-    last_lvl_name = 'last level'
-
-    second_lvl_idx = 1
-    if shortest_row_len < 2:
-        second_lvl_idx = 0
-    
-    
-    for row in data:
-        if len(row)<=second_lvl_idx:
-            second_lvl_idx=0;
-        first_lvl_categories.add(row[0])
-        
-        second_lvl_categories.add(row[second_lvl_idx])
-        last_lvl_categories.add('_'.join(row[0:-1]))
-
-    group_names = {
-        first_lvl_name: list(first_lvl_categories),
-        second_lvl_name: list(second_lvl_categories),
-        last_lvl_name: list(last_lvl_categories)
-    }
-
-    groups = {
-        first_lvl_name: [],
-        second_lvl_name: [],
-        last_lvl_name: []
-    }
-
-    for i, row in enumerate(data):
-        groups[first_lvl_name].append(row[0])
-        groups[second_lvl_name].append(row[second_lvl_idx])
-        groups[last_lvl_name].append('_'.join(row[0:-1]))
-
-
-    return {
-		'rowlabels':row_labels.tolist(),
-        'groups': groups,
-        'groupnames': group_names
-    }
-	
-	
-def test0():
-	cluto_path="./cluto-2.1.2/Linux-x86_64/scluster"
-	no_clusters=2;
-	options={};
-	cluto_input_file="test/dane.bin"
-	options['analysis_type']='plottree'
-	cl_out_file="out.ps"
-	rowlabels=[];
-	out_file="out.png"
-	call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree',
-			  '-plotformat','ps',
-			  '-'+options['analysis_type']+'=' + cl_out_file])
-	run_convert(cl_out_file,out_file,options,rowlabels)		  
-
-def test1():
-	run("in","out",{})
-	
-if __name__ == '__main__':
-	test1();
-	
\ No newline at end of file
diff --git a/cluto_worker.py b/cluto_worker.py
deleted file mode 100755
index 4343af4..0000000
--- a/cluto_worker.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-import nlp_ws
-import cluto
-
-class ClutoWorker(nlp_ws.NLPWorker):
-	#def init(self):
-		#self.logger.log(INFO, "Iobber model loaded form "+ self.config['model-dir'])
-
-	def process(self, inputFile, taskOptions, outputFile):	
-		cluto.run(inputFile,outputFile,taskOptions)
-			
-
-if __name__ == '__main__':
-	nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True)
-	
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..2b75021
--- /dev/null
+++ b/main.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Implementation of Cluto Worker."""
+
+from src import cluto
+
+import nlp_ws
+
+
+class ClutoWorker(nlp_ws.NLPWorker):
+    """Implementation class of Cluto Worker."""
+
+#   def init(self):
+#       self.logger.log(INFO, "Iobber model loaded from "+
+#       self.config['model-dir'])
+
+    def process(self, inputFile, taskOptions, outputFile):
+        """Starting process."""
+        cluto.run(inputFile, outputFile, taskOptions)
+
+
+if __name__ == '__main__':
+    nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True)
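
The worker interface is unchanged by the rename: nlp_ws instantiates
ClutoWorker and calls process(), which delegates to cluto.run(). For a
quick local smoke test without the nlp_ws service, cluto.run() can also
be called directly, much as the removed test1() in the old cluto.py did.
A minimal sketch (Python 2.7; the "in"/"out" directory names and the
option values are placeholders, not a fixed API):

    # Run the clustering pipeline directly, bypassing nlp_ws.
    # "in" must contain the files run() reads (see src/cluto.py below);
    # "out" is created by run() if it does not exist.
    from src import cluto

    cluto.run("in", "out", {"analysis_type": "plottree", "no_clusters": 2})
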
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cluto.py b/src/cluto.py
new file mode 100644
index 0000000..c10bfe5
--- /dev/null
+++ b/src/cluto.py
@@ -0,0 +1,374 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Implementation of cluto worker."""
+
+from __future__ import print_function
+import json
+import re
+import io
+
+import numpy as _np
+import os
+import shutil
+import tempfile
+
+from subprocess import call
+from sklearn.externals import joblib
+import xlsxwriter
+
+verbose = False
+
+
+def loadData(inputFile):
+    """Loading data."""
+    with open(inputFile) as json_ifs:
+        jsonVal = json.load(json_ifs)
+        rowlabels = _np.asarray(jsonVal["rowlabels"])
+        data = _np.asarray(jsonVal["arr"])
+        jsonVal["arr"] = None
+        return data, rowlabels
+
+
+def saveXLSX(names, clustering_path, outfile):
+    """Saving to XLSX."""
+    srow = 3
+    scol = 4
+    with open(clustering_path) as f:
+        groups = f.readlines()
+    ind = 0
+    workbook = xlsxwriter.Workbook(outfile)
+    worksheet = workbook.add_worksheet("result")
+    worksheet.write(srow, scol, 'Nazwy')
+    worksheet.write(srow, scol + 1, 'Grupa')
+    srow += 1
+    for name in names:
+        worksheet.write(srow, scol, name)
+        worksheet.write(srow, scol + 1, groups[ind])
+        srow += 1
+        ind = ind + 1
+    workbook.close()
+
+
+def toHeatMapJSON(cluto_path, clustering_path, names, outfile):
+    """Saving to JSON."""
+    with open(clustering_path) as f:
+        groups = f.readlines()
+    names_out = []
+    ind = 0
+    for name in names:
+        tmp_hsh = {
+            'name': name,
+            'group': groups[ind].strip()
+        }
+        names_out.append(tmp_hsh)
+        ind = ind + 1
+
+    array = []
+    line_num = 0
+    with open(cluto_path) as f:
+        content = f.readlines()
+
+    regex = r"\d+\s[0-9]*\.?[0-9]+"
+    for line in content[1:]:
+        arr = re.findall(regex, line)
+        for node in arr:
+            node = node.split()
+            tmp_hsh = {
+                'source': str(line_num),
+                'target': str((int(node[0]) - 1)),
+                'value': str(float(node[1]))
+            }
+            array.append(tmp_hsh)
+        line_num += 1
+
+    out = {'nodes': names_out, 'links': array}
+    json_hsh = json.dumps(out)
+    with open(outfile, 'w') as outfile:
+        outfile.write(json_hsh)
+
+
+# Reads data from a set of CSVs from fextor.
+# Creates a matrix and normalises it (divides by tok_count).
+
+def number_of_clusters(options, rowlabels):
+    """Calculation of the number of clusters."""
+    if 'no_clusters' in options:
+        no_clusters = options['no_clusters']
+        if not isinstance(no_clusters, int):
+            no_clusters = 2
+        if no_clusters < 2:
+            no_clusters = 2
+    else:
+        no_clusters = 2
+    if int(no_clusters) > len(rowlabels):
+        no_clusters = str(len(rowlabels))
+    return no_clusters
+
+
+def save_clutofiles(mat, rlabels, clabels, cluto_path, rlabel_path,
+                    clabel_path):
+    """Saving cluto file."""
+    with open(cluto_path, 'w') as cluto_ofs:
+        # Print header:
+        # <num_rows> <num_cols> <num_nonzero>
+        print(
+            len(rlabels),
+            len(clabels),
+            _np.count_nonzero(mat),
+            file=cluto_ofs,
+        )
+        for row in mat:
+            buf = []
+            for idx in row.nonzero()[0]:
+                buf.append('{} {}'.format(idx + 1, row[idx]))
+            print(' '.join(buf), file=cluto_ofs)
+    # Save label files
+    with io.open(rlabel_path, 'w') as rlabel_ofs:
+        for lab in rlabels:
+            print(lab, file=rlabel_ofs)
+
+    with io.open(clabel_path, 'w') as clabel_ofs:
+        for lab in clabels:
+            print(lab, file=clabel_ofs)
+
+
+def run_cluto(options, no_clusters, cluto_input_file, rlabel_path, cl_out_file,
+              clutoout):
+    """Running cluto."""
+    cluto_path = "./cluto-2.1.2/Linux-x86_64/scluster"
+    with open(clutoout, "w") as outfile:
+        call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree',
+              '-rlabelfile', rlabel_path,
+              '-plotformat', 'ps',
+              '-' + options['analysis_type'] +
+              '=' + cl_out_file], stdout=outfile)
+
+    # print("fulltree")
+
+
+def write_node(node_id, tree_dict, name2group):
+    """Writing node."""
+    child_node_strings = []
+
+    if node_id in tree_dict:
+        for child in tree_dict[node_id]:
+            child_node_strings.append(write_node(child, tree_dict,
+                                                 name2group))
+    if len(child_node_strings) == 0:
+        node_str = '{"id":"node_' + node_id + '", "group":' + \
+                   str(name2group[node_id]) +\
+                   ', "name":"' + \
+                   node_id + \
+                   '", "data":{}, "children":['
+    else:
+        node_str = '{"id":"node_' + node_id + '", "name":"' + node_id +\
+                   '", "data":{}, "children":['
+    node_str += ', '.join(child_node_strings)
+    node_str += ']}'
+    return node_str
+
+
+def run_convert2json(cl_out_file, out_file, labels, clustering_path):
+    """Converting to json."""
+    with open(clustering_path) as f:
+        groups = f.readlines()
+    name2group = {}
+    for i, gr in enumerate(groups):
+        name2group[labels[i]] = int(gr)
+
+    tree_dict = {}
+    with open(cl_out_file, 'rb') as infile:
+        for i, line in enumerate(infile.readlines()):
+            if i < len(labels):
+                child = labels[i]
+            else:
+                child = str(i)
+            parent = line.split(' ')[0]
+            if parent not in tree_dict:
+                tree_dict[parent] = [child]
+            else:
+                tree_dict[parent].append(child)
+
+    out_string = ''
+    out_string += write_node(tree_dict['-1'][0], tree_dict, name2group)
+    out_string += ''
+
+    with io.open(out_file, 'wb') as outfile:
+        outfile.write(out_string.encode("utf8"))
+
+
+def run_convert(cl_out_file, out_file, options, rowlabels):
+    """Running convert."""
+    density = '150'
+    if options['analysis_type'] != 'plottree':
+        density = '300'
+
+    if len(rowlabels) < 50:
+        density = '100'
+
+    if len(rowlabels) < 25:
+        density = '50'
+
+#    if options['analysis_type'] == 'plottree':
+#       resize = '50%'
+#   else:
+#       resize = '100%'
+
+    # print density
+    call(['convert', '-density', density, cl_out_file, 'png:' + out_file])
+
+
+def run(inputFile, outputFile, options):
+    """Running cluto worker."""
+    data, rowlabels = loadData(inputFile + "/similarity.json")
+    if "analysis_type" not in options:
+        options["analysis_type"] = "plottree"
+    no_clusters = number_of_clusters(options, rowlabels)
+    temp_folder = tempfile.mkdtemp()
+
+    if not os.path.exists(temp_folder):
+        os.mkdir(temp_folder)
+
+    cluto_path = os.path.join(temp_folder, 'matrix.txt')
+    rlabel_path = os.path.join(temp_folder, 'documents_ids.txt')
+    cluto_out_path = os.path.join(temp_folder, 'cluto.ps')
+
+    shutil.copy2(os.path.join(inputFile, 'matrix.txt'),
+                 os.path.join(temp_folder, 'matrix.txt'))
+    with io.open(rlabel_path, 'w') as rlabel_ofs:
+        for lab in rowlabels:
+            print(lab, file=rlabel_ofs)
+
+    run_cluto(options, no_clusters, cluto_path, rlabel_path,
+              cluto_out_path, os.path.join(temp_folder, 'clutoout.txt'))
+
+    if not os.path.exists(outputFile):
+        os.mkdir(outputFile)
+    shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),
+                    os.path.join(outputFile, 'clutoout.txt'))
+    run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),
+                     os.path.join(outputFile, 'result.json'), rowlabels,
+                     os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                  str(no_clusters)))
+    run_convert(cluto_out_path, os.path.join(outputFile, 'result.png'),
+                options, rowlabels)
+
+    # for heatmap
+    toHeatMapJSON(cluto_path, os.path.join(temp_folder,
+                                           'matrix.txt.clustering.' +
+                                           str(no_clusters)), rowlabels,
+                  outputFile + "/data.json")
+
+    # Check if they are required by any tool
+    shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                 str(no_clusters)),
+                    os.path.join(outputFile, 'result.clustering'))
+    shutil.copyfile(cluto_path, os.path.join(outputFile, 'matrix.txt'))
+    joblib.dump(rowlabels, outputFile + "/rowlabels.pkl")
+
+    # Results in JSON:
+    with open(os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                        str(no_clusters)), 'rb') as f:
+        clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()]
+
+    # to be deleted, but now required by visualisation
+    res = {"clusters": clusters, "rowlabels": rowlabels.tolist()}
+    with open(os.path.join(outputFile, 'clusters.json'), 'w') as outfile:
+        json.dump(res, outfile)
+
+    labels = getLablesFromNames(rowlabels)
+    labels["groupnames"]["clusters"] = list(set(clusters))
+    labels["groups"]["clusters"] = clusters
+    with open(os.path.join(outputFile, 'labels.json'), 'w') as outfile:
+        json.dump(labels, outfile)
+
+    # results in XLSX
+    saveXLSX(rowlabels, os.path.join(temp_folder, 'matrix.txt.clustering.' +
+                                     str(no_clusters)),
+             os.path.join(outputFile, 'result.xlsx'))
+
+    # Copying results for next tools
+    # for visualisation (mds)
+    # similarity matrix
+    shutil.copyfile(os.path.join(inputFile, 'similarity.json'),
+                    os.path.join(outputFile, 'similarity.json'))
+    shutil.copyfile(os.path.join(inputFile, 'distance.json'),
+                    os.path.join(outputFile, 'distance.json'))
+
+    # for featsel
+    # matrix after selection and weighting
+    shutil.copyfile(os.path.join(inputFile, 'weighted.json'),
+                    os.path.join(outputFile, 'weighted.json'))
+
+    # remove temp_folder
+    shutil.rmtree(temp_folder)
+
+
+def getLablesFromNames(row_labels):
+    """Getting labels from names."""
+    # data, data_cleaned,shortest_row_len, row_labels = get_data(row)
+    shortest_row_len = 10000000
+
+    data = []
+    for i, t in enumerate(row_labels):
+        t = str(t.encode('utf-8'))
+        t = re.split(r"[,._\-:]", t)
+        t = list(map(str.strip, t))
+        data.append(t)
+        if shortest_row_len > len(t):
+            shortest_row_len = len(t)
+
+    repeating = set(data[0])
+    for s in data[1:]:
+        repeating.intersection_update(s)
+    repeating = list(repeating)
+
+    for i, d in enumerate(data):
+        for r in repeating:
+            if r in d:
+                d.remove(r)
+                data[i] = d
+
+    first_lvl_categories = set()
+    first_lvl_name = 'first level'
+
+    second_lvl_categories = set()
+    second_lvl_name = 'second level'
+
+    last_lvl_categories = set()
+    last_lvl_name = 'last level'
+
+    second_lvl_idx = 1
+    if shortest_row_len < 2:
+        second_lvl_idx = 0
+
+    for row in data:
+        if len(row) <= second_lvl_idx:
+            second_lvl_idx = 0
+        first_lvl_categories.add(row[0])
+
+        second_lvl_categories.add(row[second_lvl_idx])
+        last_lvl_categories.add('_'.join(row[0:-1]))
+
+    group_names = {
+        first_lvl_name: list(first_lvl_categories),
+        second_lvl_name: list(second_lvl_categories),
+        last_lvl_name: list(last_lvl_categories)
+    }
+
+    groups = {
+        first_lvl_name: [],
+        second_lvl_name: [],
+        last_lvl_name: []
+    }
+
+    for i, row in enumerate(data):
+        groups[first_lvl_name].append(row[0])
+        groups[second_lvl_name].append(row[second_lvl_idx])
+        groups[last_lvl_name].append('_'.join(row[0:-1]))
+
+    return {
+        'rowlabels': row_labels.tolist(),
+        'groups': groups,
+        'groupnames': group_names
+    }
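
run() assumes a specific input layout: inputFile is a directory holding
similarity.json (with "rowlabels" and "arr" keys, as loadData() expects),
a matrix.txt in the sparse "index value" format that save_clutofiles()
writes, and distance.json/weighted.json, which are only copied through
for downstream tools. A toy input directory might be built like this
(illustrative contents only; an end-to-end run additionally needs the
bundled scluster binary and ImageMagick's convert, per run_cluto() and
run_convert()):

    # Build a minimal input directory matching what run() reads.
    import json
    import os

    os.mkdir('in')
    with open('in/similarity.json', 'w') as f:
        json.dump({'rowlabels': ['doc_a', 'doc_b'],
                   'arr': [[1.0, 0.5], [0.5, 1.0]]}, f)
    # Header "<rows> <cols> <nonzeros>", then one line of 1-based
    # "index value" pairs per row, as in save_clutofiles().
    with open('in/matrix.txt', 'w') as f:
        f.write('2 2 4\n1 1.0 2 0.5\n1 0.5 2 1.0\n')
    for name in ('distance.json', 'weighted.json'):
        with open('in/' + name, 'w') as f:
            json.dump({}, f)
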
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..517bd1c
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,45 @@
+[tox]
+envlist = pep8,docstyle
+skipsdist = True
+
+[testenv:pep8]
+deps =
+    flake8
+basepython = python2.7
+commands =
+    flake8 {posargs}
+
+[testenv:docstyle]
+deps =
+    pydocstyle
+basepython = python2.7
+commands =
+    pydocstyle --verbose {posargs}
+
+[flake8]
+# W504 skipped because it is overeager and unnecessary
+ignore = W504
+show-source = True
+exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
+import-order-style = pep8
+max-line-length = 80
+
+
+[pydocstyle]
+# D104 Missing docstring in public package
+# D203 1 blank line required before class docstring
+# D213 Multi-line docstring summary should start at the second line
+# D214 Section is over-indented
+# D215 Section underline is over-indented
+# D401 First line should be in imperative mood; try rephrasing
+# D405 Section name should be properly capitalized
+# D406 Section name should end with a newline
+# D407 Missing dashed underline after section
+# D408 Section underline should be in the line following the section's name
+# D409 Section underline should match the length of its name
+# D410 Missing blank line after section
+# D411 Missing blank line before section
+ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
+match-dir = ^(?!\.tox|venv).*
+match = ^(?!setup).*\.py
+
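
The two CI jobs above map one-to-one onto these tox environments, so the
same checks can be reproduced locally with "tox -v -e pep8" and
"tox -v -e docstyle" (using tox==2.9.1, as pinned in .gitlab-ci.yml).
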
-- 
GitLab