diff --git a/docker-compose.yml b/.docker-compose.yml similarity index 64% rename from docker-compose.yml rename to .docker-compose.yml index 6693a3dfcced575a6f7bf80c0052660f1ad54215..45c4e7f9e537df8abe06521a4197d9fd3df541bf 100644 --- a/docker-compose.yml +++ b/.docker-compose.yml @@ -7,10 +7,9 @@ services: working_dir: /home/worker entrypoint: - python2 - - cluto_worker.py + - main.py volumes: - /samba:/samba - ./config.ini:/home/worker/config.ini - - ./cluto_worker.py:/home/worker/cluto_worker.py - - ./cluto.py:/home/worker/cluto.py + restart: always \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0bf0524ef90f45be5040bc72c52c493b2cfde332 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.pyc + +venv/ +.idea/ +.tox/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..b90b766d2e4f5a92c36e8329393a0b067b2c54cd --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,18 @@ +image: clarinpl/python:2.7 + +cache: + paths: + - .tox + +before_script: + - pip install tox==2.9.1 + +pep8: + script: + - tox -v -e pep8 + +docstyle: + script: + - tox -v -e docstyle + + diff --git a/Dockerfile b/Dockerfile index 098189ddc2f122602c8722f152d1bdb8f8c9205e..4ec0f53218ebe47e2545c1dbe092d37a2377b7ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,19 @@ FROM clarinpl/python:2.7 -COPY requirements.txt . -RUN pip install -r requirements.txt - - - RUN mkdir /home/worker && \ cd /home/worker && \ wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/cluto-2.1.2a.tar.gz && \ tar -xvf cluto-2.1.2a.tar.gz - - -RUN apt-get -y update && apt-get -y install imagemagick -RUN sed -i 's/\(<policy domain="coder" rights=\)"none" \(pattern="PS" \/>\)/\1"read|write"\2/g' /etc/ImageMagick-6/policy.xml -RUN less /etc/ImageM*/policy.xml + +RUN apt-get -y update && apt-get -y install imagemagick && \ + sed -i 's/\(<policy domain="coder" rights=\)"none" \(pattern="PS" \/>\)/\1"read|write"\2/g' /etc/ImageMagick-6/policy.xml && \ + less /etc/ImageM*/policy.xml + +WORKDIR /home/worker +COPY ./src ./src +COPY ./main.py . +COPY requirements.txt . + +RUN pip install -r requirements.txt + +CMD ["python","main.py"] \ No newline at end of file diff --git a/cluto.py b/cluto.py deleted file mode 100644 index 0b5f373623c61bbdf46fb0fe2276cb8ec8f20431..0000000000000000000000000000000000000000 --- a/cluto.py +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/python -from __future__ import print_function -import argparse as _argp -import csv as _csv -import json,re,os -import io - -import numpy as _np -import time,glob,os,shutil,tempfile -from subprocess import call -from sklearn.externals import joblib -import xlsxwriter - -verbose = False - -def loadData(inputFile): - with open(inputFile) as json_ifs: - jsonVal = json.load(json_ifs) - rowlabels=_np.asarray(jsonVal["rowlabels"]) - data=_np.asarray(jsonVal["arr"]); - jsonVal["arr"]=None - return data, rowlabels - - - -def saveXLSX(names,clustering_path,outfile): - srow=3;scol=4; - with open(clustering_path) as f: - groups = f.readlines() - names_out = [] - ind=0 - workbook = xlsxwriter.Workbook(outfile) - worksheet = workbook.add_worksheet("result"); - worksheet.write(srow,scol,'Nazwy') - worksheet.write(srow,scol+1,'Grupa') - srow+=1 - for name in names: - worksheet.write(srow,scol,name) - worksheet.write(srow,scol+1,groups[ind]) - srow+=1 - ind=ind+1 - workbook.close() - - -def toHeatMapJSON(cluto_path,clustering_path,names,outfile): - with open(clustering_path) as f: - groups = f.readlines() - names_out = [] - ind=0 - for name in names: - tmp_hsh = { - 'name': name, - 'group': groups[ind].strip() - } - names_out.append(tmp_hsh) - ind=ind+1 - - array = [] - line_num=0 - with open(cluto_path) as f: - content = f.readlines() - - regex = r"\d+\s[0-9]*\.?[0-9]+" - for line in content[1:]: - arr = re.findall(regex, line) - for node in arr: - node = node.split() - tmp_hsh = { - 'source': str(line_num), - 'target': str((int(node[0]) - 1)), - 'value': str(float(node[1])) - } - array.append(tmp_hsh) - line_num += 1 - - out = {'nodes': names_out, 'links': array} - json_hsh = json.dumps(out) - with open(outfile, 'w') as outfile: - outfile.write(json_hsh) - - -# Reads data from set of csvs from fextor -# Creats matrix and normalise it (divides by tok_count) - -def number_of_clusters(options,rowlabels): - if 'no_clusters' in options: - no_clusters=options['no_clusters'] - if not isinstance( no_clusters, int ): - no_clusters=2 - if no_clusters<2: - no_clusters=2 - else: - no_clusters=2 - - if int(no_clusters) > len(rowlabels): - no_clusters = str(len(rowlabels)) - - return no_clusters - -def save_clutofiles(mat,rlabels,clabels,cluto_path,rlabel_path,clabel_path): - # Save cluto file - with open(cluto_path, 'w') as cluto_ofs: - # Print header: - # <num_rows> <num_cols> <num_nonzero> - print( - len(rlabels), - len(clabels), - _np.count_nonzero(mat), - file=cluto_ofs, - ) - - for row in mat: - buf = [] - - for idx in row.nonzero()[0]: - buf.append('{} {}'.format(idx+1, row[idx])) - - print(' '.join(buf), file=cluto_ofs) - - # Save label files - with io.open(rlabel_path, 'w') as rlabel_ofs: - for lab in rlabels: - print(lab, file=rlabel_ofs) - - with io.open(clabel_path, 'w') as clabel_ofs: - for lab in clabels: - print(lab, file=clabel_ofs) - -def run_cluto(options,no_clusters,cluto_input_file,rlabel_path,cl_out_file,clutoout): - - cluto_path="./cluto-2.1.2/Linux-x86_64/scluster" - with open(clutoout, "w") as outfile: - call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree', - '-rlabelfile', rlabel_path, - '-plotformat','ps', - '-'+options['analysis_type']+'=' + cl_out_file],stdout=outfile) - - #print("fulltree") - - -def write_node(node_id, tree_dict,name2group): - - child_node_strings = [] - - if node_id in tree_dict: - for child in tree_dict[node_id]: - child_node_strings.append(write_node(child, tree_dict,name2group)) - - if len(child_node_strings)==0: - node_str = '{"id":"node_' + node_id + '", "group":' + str(name2group[node_id]) + ', "name":"' + node_id + '", "data":{}, "children":[' - else: - node_str = '{"id":"node_' + node_id + '", "name":"' + node_id + '", "data":{}, "children":[' - node_str += ', '.join(child_node_strings) - node_str += ']}' - return node_str - -def run_convert2json(cl_out_file,out_file,labels,clustering_path): - with open(clustering_path) as f: - groups = f.readlines() - name2group={} - for i,gr in enumerate(groups): - name2group[labels[i]]=int(gr) - - tree_dict = {} - with open(cl_out_file, 'rb') as infile: - for i, line in enumerate(infile.readlines()): - if i < len(labels): - child = labels[i] - else: - child = str(i) - parent = line.split(' ')[0] - if parent not in tree_dict: - tree_dict[parent] = [child] - else: - tree_dict[parent].append(child) - - out_string = '' - out_string += write_node(tree_dict['-1'][0], tree_dict,name2group) - out_string += '' - - with io.open(out_file, 'wb') as outfile: - outfile.write(out_string.encode("utf8")) - - -def run_convert(cl_out_file,out_file,options,rowlabels): - density='150' - if options['analysis_type']!='plottree': - density='300' - - if len(rowlabels)<50: - density='100' - - if len(rowlabels)<25: - density='50' - - - if options['analysis_type']=='plottree': - resize='50%' - else: - resize='100%' - - #print density - call(['convert','-density',density,cl_out_file,'png:'+out_file]) - -def run(inputFile, outputFile, options): - - data,rowlabels=loadData(inputFile+"/similarity.json"); - if not "analysis_type" in options: - options["analysis_type"]="plottree" ; - no_clusters=number_of_clusters(options,rowlabels) - temp_folder = tempfile.mkdtemp() - - if not os.path.exists(temp_folder): - os.mkdir(temp_folder) - - cluto_path=os.path.join(temp_folder, 'matrix.txt'); - rlabel_path=os.path.join(temp_folder, 'documents_ids.txt'); - cluto_out_path=os.path.join(temp_folder, 'cluto.ps'); - - shutil.copy2(os.path.join(inputFile, 'matrix.txt'),os.path.join(temp_folder, 'matrix.txt')) - with io.open(rlabel_path, 'w') as rlabel_ofs: - for lab in rowlabels: - print(lab, file=rlabel_ofs) - - - run_cluto(options,no_clusters,cluto_path,rlabel_path,cluto_out_path,os.path.join(temp_folder, 'clutoout.txt')) - - if not os.path.exists(outputFile): - os.mkdir(outputFile) - shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),os.path.join(outputFile,'clutoout.txt')) - run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),os.path.join(outputFile,'result.json'),rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters))) - run_convert(cluto_out_path,os.path.join(outputFile,'result.png'),options,rowlabels); - - #for heatmap - toHeatMapJSON(cluto_path,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),rowlabels,outputFile+"/data.json"); - - - #Check if they are required by any tool - shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.clustering')) - shutil.copyfile(cluto_path,os.path.join(outputFile,'matrix.txt')) - joblib.dump(rowlabels,outputFile+"/rowlabels.pkl"); - - #Results in JSON: - with open(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)), 'rb') as f: - clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()] - - - #to be deleted, but now required by visualisation - res={"clusters":clusters,"rowlabels":rowlabels.tolist()} - with open(os.path.join(outputFile,'clusters.json'), 'w') as outfile: - json.dump(res, outfile) - - labels=getLablesFromNames(rowlabels); - labels["groupnames"]["clusters"]=list(set(clusters)); - labels["groups"]["clusters"]=clusters; - with open(os.path.join(outputFile,'labels.json'), 'w') as outfile: - json.dump(labels, outfile) - - - #results in XLSX - saveXLSX(rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.xlsx')) - - #Coping results for next tools - #for visulisation (mds) - #similarity matrix - shutil.copyfile(os.path.join(inputFile, 'similarity.json'),os.path.join(outputFile,'similarity.json')) - shutil.copyfile(os.path.join(inputFile, 'distance.json'),os.path.join(outputFile,'distance.json')) - - #for featsel - #matrix after selection and weighting - shutil.copyfile(os.path.join(inputFile, 'weighted.json'),os.path.join(outputFile,'weighted.json')) - - #remove temp_folder - shutil.rmtree(temp_folder) - - -def getLablesFromNames(row_labels): - #data, data_cleaned,shortest_row_len, row_labels = get_data(row) - shortest_row_len = 10000000 - - data=[]; - for i, t in enumerate(row_labels): - t=str(t.encode('utf-8')); - t = re.split("[,._\-:]", t) - t = list(map(str.strip, t)) - data.append(t) - if shortest_row_len > len(t): - shortest_row_len = len(t) - - repeating = set(data[0]) - for s in data[1:]: - repeating.intersection_update(s) - repeating = list(repeating) - - for i, d in enumerate(data): - for r in repeating: - if r in d: - d.remove(r) - data[i] = d - - - first_lvl_categories = set() - first_lvl_name = 'first level' - - second_lvl_categories = set() - second_lvl_name = 'second level' - - last_lvl_categories = set() - last_lvl_name = 'last level' - - second_lvl_idx = 1 - if shortest_row_len < 2: - second_lvl_idx = 0 - - - for row in data: - if len(row)<=second_lvl_idx: - second_lvl_idx=0; - first_lvl_categories.add(row[0]) - - second_lvl_categories.add(row[second_lvl_idx]) - last_lvl_categories.add('_'.join(row[0:-1])) - - group_names = { - first_lvl_name: list(first_lvl_categories), - second_lvl_name: list(second_lvl_categories), - last_lvl_name: list(last_lvl_categories) - } - - groups = { - first_lvl_name: [], - second_lvl_name: [], - last_lvl_name: [] - } - - for i, row in enumerate(data): - groups[first_lvl_name].append(row[0]) - groups[second_lvl_name].append(row[second_lvl_idx]) - groups[last_lvl_name].append('_'.join(row[0:-1])) - - - return { - 'rowlabels':row_labels.tolist(), - 'groups': groups, - 'groupnames': group_names - } - - -def test0(): - cluto_path="./cluto-2.1.2/Linux-x86_64/scluster" - no_clusters=2; - options={}; - cluto_input_file="test/dane.bin" - options['analysis_type']='plottree' - cl_out_file="out.ps" - rowlabels=[]; - out_file="out.png" - call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree', - '-plotformat','ps', - '-'+options['analysis_type']+'=' + cl_out_file]) - run_convert(cl_out_file,out_file,options,rowlabels) - -def test1(): - run("in","out",{}) - -if __name__ == '__main__': - test1(); - \ No newline at end of file diff --git a/cluto_worker.py b/cluto_worker.py deleted file mode 100755 index 4343af4c9698b51d608f123d06a0b1a91e150f7d..0000000000000000000000000000000000000000 --- a/cluto_worker.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import nlp_ws -import cluto - -class ClutoWorker(nlp_ws.NLPWorker): - #def init(self): - #self.logger.log(INFO, "Iobber model loaded form "+ self.config['model-dir']) - - def process(self, inputFile, taskOptions, outputFile): - cluto.run(inputFile,outputFile,taskOptions) - - -if __name__ == '__main__': - nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True) - diff --git a/config.ini b/config.ini index d220ce2a5f4d3819fd81b7273e1aff4cbbcbd7e8..c7284103f0f2d74aad29448c8b6da788c917b053 100644 --- a/config.ini +++ b/config.ini @@ -1,9 +1,9 @@ [service] root = /samba/requests/ tool = cluto -rabbit_host =rabbit.clarin.ws -rabbit_user =clarin -rabbit_password =clarin123 +rabbit_host = rabbit.clarin.ws +rabbit_user = clarin +rabbit_password = clarin123 [tool] workers_number = 4 diff --git a/main.py b/main.py new file mode 100755 index 0000000000000000000000000000000000000000..1e0477dcf5e32bd9261d4ccde50b1eab0578ebc0 --- /dev/null +++ b/main.py @@ -0,0 +1,23 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Implementation of Cluto Worker.""" + +from src import cluto + +import nlp_ws + + +class ClutoWorker(nlp_ws.NLPWorker): + """Implementation class of Cluto Worker.""" + +# def init(self): +# self.logger.log(INFO, "Iobber model loaded form "+ +# self.config['model-dir']) + + def process(self, input_file, task_options, output_file): + """Starting process.""" + cluto.run(input_file, output_file, task_options) + + +if __name__ == '__main__': + nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/cluto.py b/src/cluto.py new file mode 100644 index 0000000000000000000000000000000000000000..b88fa851c3574234fdb5e9a56214c269900fe409 --- /dev/null +++ b/src/cluto.py @@ -0,0 +1,373 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +"""Implementation of cluto worker.""" + +from __future__ import print_function +import json +import re +import io +import os +import shutil +import tempfile +from subprocess import call + +import numpy as _np +from sklearn.externals import joblib +import xlsxwriter + +verbose = False + + +def load_data(input_file): + """Loading data.""" + with open(input_file) as json_ifs: + json_val = json.load(json_ifs) + rowlabels = _np.asarray(json_val["rowlabels"]) + data = _np.asarray(json_val["arr"]) + json_val["arr"] = None + return data, rowlabels + + +def save_xlsx(names, clustering_path, outfile): + """Saving to XLSX.""" + srow = 3 + scol = 4 + with open(clustering_path) as f: + groups = f.readlines() + ind = 0 + workbook = xlsxwriter.Workbook(outfile) + worksheet = workbook.add_worksheet("result") + worksheet.write(srow, scol, 'Nazwy') + worksheet.write(srow, scol + 1, 'Grupa') + srow += 1 + for name in names: + worksheet.write(srow, scol, name) + worksheet.write(srow, scol + 1, groups[ind]) + srow += 1 + ind = ind + 1 + workbook.close() + + +def to_heat_map_json(cluto_path, clustering_path, names, outfile): + """Saving to JSON.""" + with open(clustering_path) as f: + groups = f.readlines() + names_out = [] + ind = 0 + for name in names: + tmp_hsh = { + 'name': name, + 'group': groups[ind].strip() + } + names_out.append(tmp_hsh) + ind = ind + 1 + + array = [] + line_num = 0 + with open(cluto_path) as f: + content = f.readlines() + + regex = r"\d+\s[0-9]*\.?[0-9]+" + for line in content[1:]: + arr = re.findall(regex, line) + for node in arr: + node = node.split() + tmp_hsh = { + 'source': str(line_num), + 'target': str((int(node[0]) - 1)), + 'value': str(float(node[1])) + } + array.append(tmp_hsh) + line_num += 1 + + out = {'nodes': names_out, 'links': array} + json_hsh = json.dumps(out) + with open(outfile, 'w') as outfile: + outfile.write(json_hsh) + + +# Reads data from set of csvs from fextor +# Creats matrix and normalise it (divides by tok_count) + +def number_of_clusters(options, rowlabels): + """Calculation of the number of clusters.""" + if 'no_clusters' in options: + no_clusters = options['no_clusters'] + if not isinstance(no_clusters, int): + no_clusters = 2 + if no_clusters < 2: + no_clusters = 2 + else: + no_clusters = 2 + if int(no_clusters) > len(rowlabels): + no_clusters = str(len(rowlabels)) + return no_clusters + + +def save_cluto_files(mat, rlabels, clabels, cluto_path, rlabel_path, + clabel_path): + """Saving cluto file.""" + with open(cluto_path, 'w') as cluto_ofs: + # Print header: + # <num_rows> <num_cols> <num_nonzero> + print( + len(rlabels), + len(clabels), + _np.count_nonzero(mat), + file=cluto_ofs, + ) + for row in mat: + buf = [] + for idx in row.nonzero()[0]: + buf.append('{} {}'.format(idx + 1, row[idx])) + print(' '.join(buf), file=cluto_ofs) + # Save label files + with io.open(rlabel_path, 'w') as rlabel_ofs: + for lab in rlabels: + print(lab, file=rlabel_ofs) + + with io.open(clabel_path, 'w') as clabel_ofs: + for lab in clabels: + print(lab, file=clabel_ofs) + + +def run_cluto(options, no_clusters, cluto_input_file, rlabel_path, cl_out_file, + clutoout): + """Running cluto.""" + cluto_path = "./cluto-2.1.2/Linux-x86_64/scluster" + with open(clutoout, "w") as outfile: + call([cluto_path, cluto_input_file, str(no_clusters), '-fulltree', + '-rlabelfile', rlabel_path, + '-plotformat', 'ps', + '-' + options['analysis_type'] + + '=' + cl_out_file], stdout=outfile) + + # print("fulltree") + + +def write_node(node_id, tree_dict, name2group): + """Writing node.""" + child_node_strings = [] + + if node_id in tree_dict: + for child in tree_dict[node_id]: + child_node_strings.append(write_node(child, tree_dict, + name2group)) + if len(child_node_strings) == 0: + node_str = '{"id":"node_' + node_id + '", "group":' + \ + str(name2group[node_id]) + \ + ', "name":"' + \ + node_id + \ + '", "data":{}, "children":[' + else: + node_str = '{"id":"node_' + node_id + '", "name":"' + node_id + \ + '", "data":{}, "children":[' + node_str += ', '.join(child_node_strings) + node_str += ']}' + return node_str + + +def run_convert2json(cl_out_file, out_file, labels, clustering_path): + """Converting to json.""" + with open(clustering_path) as f: + groups = f.readlines() + name2group = {} + for i, gr in enumerate(groups): + name2group[labels[i]] = int(gr) + + tree_dict = {} + with open(cl_out_file, 'rb') as infile: + for i, line in enumerate(infile.readlines()): + if i < len(labels): + child = labels[i] + else: + child = str(i) + parent = line.split(' ')[0] + if parent not in tree_dict: + tree_dict[parent] = [child] + else: + tree_dict[parent].append(child) + + out_string = '' + out_string += write_node(tree_dict['-1'][0], tree_dict, name2group) + out_string += '' + + with io.open(out_file, 'wb') as outfile: + outfile.write(out_string.encode("utf8")) + + +def run_convert(cl_out_file, out_file, options, rowlabels): + """Running convert.""" + density = '150' + if options['analysis_type'] != 'plottree': + density = '300' + + if len(rowlabels) < 50: + density = '100' + + if len(rowlabels) < 25: + density = '50' + + # if options['analysis_type'] == 'plottree': + # resize = '50%' + # else: + # resize = '100%' + + # print density + call(['convert', '-density', density, cl_out_file, 'png:' + out_file]) + + +def run(input_file, output_file, options): + """Running cluto worker.""" + data, rowlabels = load_data(input_file + "/similarity.json") + if "analysis_type" not in options: + options["analysis_type"] = "plottree" + no_clusters = number_of_clusters(options, rowlabels) + temp_folder = tempfile.mkdtemp() + + if not os.path.exists(temp_folder): + os.mkdir(temp_folder) + + cluto_path = os.path.join(temp_folder, 'matrix.txt') + rlabel_path = os.path.join(temp_folder, 'documents_ids.txt') + cluto_out_path = os.path.join(temp_folder, 'cluto.ps') + + shutil.copy2(os.path.join(input_file, 'matrix.txt'), + os.path.join(temp_folder, 'matrix.txt')) + with io.open(rlabel_path, 'w') as rlabel_ofs: + for lab in rowlabels: + print(lab, file=rlabel_ofs) + + run_cluto(options, no_clusters, cluto_path, rlabel_path, + cluto_out_path, os.path.join(temp_folder, 'clutoout.txt')) + + if not os.path.exists(output_file): + os.mkdir(output_file) + shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'), + os.path.join(output_file, 'clutoout.txt')) + run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'), + os.path.join(output_file, 'result.json'), rowlabels, + os.path.join(temp_folder, 'matrix.txt.clustering.' + + str(no_clusters))) + run_convert(cluto_out_path, os.path.join(output_file, 'result.png'), + options, rowlabels) + + # for heatmap + to_heat_map_json(cluto_path, os.path.join(temp_folder, + 'matrix.txt.clustering.' + + str(no_clusters)), rowlabels, + output_file + "/data.json") + + # Check if they are required by any tool + shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.' + + str(no_clusters)), + os.path.join(output_file, 'result.clustering')) + shutil.copyfile(cluto_path, os.path.join(output_file, 'matrix.txt')) + joblib.dump(rowlabels, output_file + "/rowlabels.pkl") + + # Results in JSON: + with open(os.path.join(temp_folder, 'matrix.txt.clustering.' + + str(no_clusters)), 'rb') as f: + clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()] + + # to be deleted, but now required by visualisation + res = {"clusters": clusters, "rowlabels": rowlabels.tolist()} + with open(os.path.join(output_file, 'clusters.json'), 'w') as outfile: + json.dump(res, outfile) + + labels = get_lables_from_names(rowlabels) + labels["groupnames"]["clusters"] = list(set(clusters)) + labels["groups"]["clusters"] = clusters + with open(os.path.join(output_file, 'labels.json'), 'w') as outfile: + json.dump(labels, outfile) + + # results in XLSX + save_xlsx(rowlabels, os.path.join(temp_folder, 'matrix.txt.clustering.' + + str(no_clusters)), + os.path.join(output_file, 'result.xlsx')) + + # Coping results for next tools + # for visulisation (mds) + # similarity matrix + shutil.copyfile(os.path.join(input_file, 'similarity.json'), + os.path.join(output_file, 'similarity.json')) + shutil.copyfile(os.path.join(input_file, 'distance.json'), + os.path.join(output_file, 'distance.json')) + + # for featsel + # matrix after selection and weighting + shutil.copyfile(os.path.join(input_file, 'weighted.json'), + os.path.join(output_file, 'weighted.json')) + + # remove temp_folder + shutil.rmtree(temp_folder) + + +def get_lables_from_names(row_labels): + """Getting labels from names.""" + # data, data_cleaned,shortest_row_len, row_labels = get_data(row) + shortest_row_len = 10000000 + + data = [] + for i, t in enumerate(row_labels): + t = str(t.encode('utf-8')) + t = re.split(r"[,._\-:]", t) + t = list(map(str.strip, t)) + data.append(t) + if shortest_row_len > len(t): + shortest_row_len = len(t) + + repeating = set(data[0]) + for s in data[1:]: + repeating.intersection_update(s) + repeating = list(repeating) + + for i, d in enumerate(data): + for r in repeating: + if r in d: + d.remove(r) + data[i] = d + + first_lvl_categories = set() + first_lvl_name = 'first level' + + second_lvl_categories = set() + second_lvl_name = 'second level' + + last_lvl_categories = set() + last_lvl_name = 'last level' + + second_lvl_idx = 1 + if shortest_row_len < 2: + second_lvl_idx = 0 + + for row in data: + if len(row) <= second_lvl_idx: + second_lvl_idx = 0 + first_lvl_categories.add(row[0]) + + second_lvl_categories.add(row[second_lvl_idx]) + last_lvl_categories.add('_'.join(row[0:-1])) + + group_names = { + first_lvl_name: list(first_lvl_categories), + second_lvl_name: list(second_lvl_categories), + last_lvl_name: list(last_lvl_categories) + } + + groups = { + first_lvl_name: [], + second_lvl_name: [], + last_lvl_name: [] + } + + for i, row in enumerate(data): + groups[first_lvl_name].append(row[0]) + groups[second_lvl_name].append(row[second_lvl_idx]) + groups[last_lvl_name].append('_'.join(row[0:-1])) + + return { + 'rowlabels': row_labels.tolist(), + 'groups': groups, + 'groupnames': group_names + } diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000000000000000000000000000000..e2fafce9dca86d232f55aed4ed3e852c76ad7c46 --- /dev/null +++ b/tox.ini @@ -0,0 +1,46 @@ +[tox] +envlist = pep8,docstyle +skipsdist = True + +[testenv:pep8] +deps = + flake8 + pep8-naming +basepython = python2.7 +commands = + flake8 {posargs} + +[testenv:docstyle] +deps = + pydocstyle +basepython = python2.7 +commands = + pydocstyle --verbose {posargs} + +[flake8] +# W504 skipped because it is overeager and unnecessary +ignore = W504 +show-source = True +exclude = .git,.venv,.tox,dist,doc,*egg,build,venv +import-order-style = pep8 +max-line-length = 80 + + +[pydocstyle] +# D104 Missing docstring in public package +# D203 1 blank line required before class docstring +# D213 Multi-line docstring summary should start at the second line +# D214 Section is over-indented +# D215 Section underline is over-indented +# D401 First line should be in imperative mood; try rephrasing +# D405 Section name should be properly capitalized +# D406 Section name should end with a newline +# D407 Missing dashed underline after section +# D408 Section underline should be in the line following the section’s name +# D409 Section underline should match the length of its name +# D410 Missing blank line after section +# D411 Missing blank line before section +ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411,D100 +match-dir = ^(?!\.tox|venv).* +match = ^(?!setup).*\.py +