Commit a4c48b7f authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

Initial commit

parents
FROM clarinpl/python:2.7

# Install Python dependencies first so this layer stays cached until
# requirements.txt changes. --no-cache-dir keeps pip's cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Fetch and unpack the CLUTO clustering toolkit (pre-built binaries).
# The tarball is removed in the same layer so it does not bloat the image.
WORKDIR /home/worker
RUN wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/cluto-2.1.2a.tar.gz && \
    tar -xvf cluto-2.1.2a.tar.gz && \
    rm cluto-2.1.2a.tar.gz

# ImageMagick renders CLUTO's PostScript output to PNG. Its default security
# policy forbids the PS coder, so grant it read|write via sed on policy.xml.
# apt lists are removed in the same layer that created them.
RUN apt-get -y update && apt-get -y install --no-install-recommends imagemagick && \
    rm -rf /var/lib/apt/lists/*
RUN sed -i 's/\(<policy domain="coder" rights=\)"none" \(pattern="PS" \/>\)/\1"read|write"\2/g' /etc/ImageMagick-6/policy.xml
#!/usr/bin/python
# cluto.py: drives CLUTO document clustering and converts its outputs
# (tree, clustering, PostScript plot) into JSON/PNG/XLSX artifacts.
from __future__ import print_function
import argparse as _argp
import csv as _csv
import json,re,os
import io
import numpy as _np
# NOTE(review): `os` is imported twice (here and above), and argparse, csv,
# time and glob appear unused in this module -- candidates for cleanup.
import time,glob,os,shutil,tempfile
from subprocess import call
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn >= 0.23.
# This works with the pinned scikit-learn==0.19.0, but migrate to a plain
# `import joblib` when upgrading.
from sklearn.externals import joblib
import xlsxwriter

# Global verbosity flag (currently not consulted anywhere in this module).
verbose = False
def loadData(inputFile):
    """Load the similarity JSON produced by the previous pipeline step.

    The file must contain "rowlabels" (document names) and "arr" (the
    similarity matrix as nested lists).

    Returns a (matrix, rowlabels) pair of numpy arrays.
    """
    with open(inputFile) as json_ifs:
        payload = json.load(json_ifs)
    labels = _np.asarray(payload["rowlabels"])
    matrix = _np.asarray(payload["arr"])
    # Drop the (potentially large) raw list so it can be garbage-collected.
    payload["arr"] = None
    return matrix, labels
def saveXLSX(names, clustering_path, outfile):
    """Write an XLSX report mapping each document name to its cluster id.

    names           -- iterable of document names, one per matrix row
    clustering_path -- CLUTO .clustering file, one cluster id per line,
                       aligned with `names`
    outfile         -- path of the workbook to create
    """
    with open(clustering_path) as f:
        groups = f.readlines()
    workbook = xlsxwriter.Workbook(outfile)
    worksheet = workbook.add_worksheet("result")
    # Table starts at row 3, column 4 (cell E4); header row first.
    srow = 3
    scol = 4
    worksheet.write(srow, scol, 'Nazwy')
    worksheet.write(srow, scol + 1, 'Grupa')
    srow += 1
    for ind, name in enumerate(names):
        worksheet.write(srow, scol, name)
        # BUG FIX: strip the trailing newline so the cell holds "3" rather
        # than "3\n" (consistent with toHeatMapJSON's handling).
        worksheet.write(srow, scol + 1, groups[ind].strip())
        srow += 1
    workbook.close()
def toHeatMapJSON(cluto_path, clustering_path, names, outfile):
    """Combine a CLUTO matrix and a clustering assignment into heat-map JSON.

    cluto_path      -- CLUTO sparse matrix file (first line is the header)
    clustering_path -- one cluster id per line, aligned with `names`
    names           -- document names, in matrix row order
    outfile         -- destination path for the {"nodes": ..., "links": ...}
                       JSON document
    """
    with open(clustering_path) as clust_ifs:
        group_lines = clust_ifs.readlines()

    # One node per document: display name plus its assigned cluster id.
    nodes = [{'name': name, 'group': group_lines[idx].strip()}
             for idx, name in enumerate(names)]

    # Links: every "column value" pair on each data row of the matrix.
    pair_re = r"\d+\s[0-9]*\.?[0-9]+"
    with open(cluto_path) as mat_ifs:
        matrix_lines = mat_ifs.readlines()
    links = []
    for row_idx, row in enumerate(matrix_lines[1:]):
        for pair in re.findall(pair_re, row):
            col, value = pair.split()
            links.append({
                'source': str(row_idx),
                # CLUTO columns are 1-based; the heat map wants 0-based.
                'target': str(int(col) - 1),
                'value': str(float(value)),
            })

    with open(outfile, 'w') as json_ofs:
        json_ofs.write(json.dumps({'nodes': nodes, 'links': links}))
def number_of_clusters(options, rowlabels):
    """Resolve the requested number of clusters from the task options.

    Accepts any value convertible with int() -- options arrive from JSON, so
    the count may legitimately be a numeric string; anything unconvertible
    falls back to the default of 2. The result is clamped to at least 2 and
    at most len(rowlabels) (CLUTO cannot produce more clusters than there
    are documents).

    BUG FIX: the original returned a str in the capped branch but an int
    everywhere else; the return type is now consistently int.
    """
    default = 2
    value = options.get('no_clusters', default)
    try:
        no_clusters = int(value)
    except (TypeError, ValueError):
        no_clusters = default
    if no_clusters < default:
        no_clusters = default
    if no_clusters > len(rowlabels):
        no_clusters = len(rowlabels)
    return no_clusters
def save_clutofiles(mat,rlabels,clabels,cluto_path,rlabel_path,clabel_path):
# Save cluto file
with open(cluto_path, 'w') as cluto_ofs:
# Print header:
# <num_rows> <num_cols> <num_nonzero>
print(
len(rlabels),
len(clabels),
_np.count_nonzero(mat),
file=cluto_ofs,
)
for row in mat:
buf = []
for idx in row.nonzero()[0]:
buf.append('{} {}'.format(idx+1, row[idx]))
print(' '.join(buf), file=cluto_ofs)
# Save label files
with io.open(rlabel_path, 'w') as rlabel_ofs:
for lab in rlabels:
print(lab, file=rlabel_ofs)
with io.open(clabel_path, 'w') as clabel_ofs:
for lab in clabels:
print(lab, file=clabel_ofs)
def run_cluto(options, no_clusters, cluto_input_file, rlabel_path,
              cl_out_file, clutoout):
    """Invoke the CLUTO scluster binary on a prepared matrix file.

    The analysis output (e.g. a plottree PostScript file) is written to
    cl_out_file; scluster's stdout is captured into the clutoout log file.
    """
    scluster = "./cluto-2.1.2/Linux-x86_64/scluster"
    cmd = [
        scluster,
        cluto_input_file,
        str(no_clusters),
        '-fulltree',
        '-rlabelfile', rlabel_path,
        '-plotformat', 'ps',
        '-' + options['analysis_type'] + '=' + cl_out_file,
    ]
    with open(clutoout, "w") as log_ofs:
        call(cmd, stdout=log_ofs)
def write_node(node_id, tree_dict, name2group):
    """Recursively serialize one dendrogram node to a JSON fragment (string).

    tree_dict maps a node id to the list of its child ids. Leaves (ids with
    no entry in tree_dict) additionally carry a "group" field looked up in
    name2group; internal nodes carry only their children.
    """
    children = [write_node(child_id, tree_dict, name2group)
                for child_id in tree_dict.get(node_id, [])]
    if children:
        head = ('{"id":"node_' + node_id + '", "name":"' + node_id +
                '", "data":{}, "children":[')
    else:
        head = ('{"id":"node_' + node_id + '", "group":' +
                str(name2group[node_id]) + ', "name":"' + node_id +
                '", "data":{}, "children":[')
    return head + ', '.join(children) + ']}'
def run_convert2json(cl_out_file, out_file, labels, clustering_path):
    """Convert CLUTO's -fulltree parent list into a nested JSON tree file.

    cl_out_file     -- CLUTO tree file: line i holds the parent id of node i;
                       nodes 0..len(labels)-1 are documents, larger ids are
                       internal cluster nodes, and parent '-1' marks the root
    out_file        -- destination JSON file (written as UTF-8 bytes)
    labels          -- document names, in matrix row order
    clustering_path -- flat clustering file used to attach a "group" id to
                       each leaf
    """
    with open(clustering_path) as f:
        groups = f.readlines()
    # Leaf name -> cluster id, consumed by write_node for the "group" field.
    name2group = {}
    for i, gr in enumerate(groups):
        name2group[labels[i]] = int(gr)
    # parent id -> list of child ids.
    tree_dict = {}
    # BUG FIX: the tree file was opened in binary mode ('rb'); under
    # Python 3, line.split(' ') then fails on bytes. Text mode behaves
    # identically on Python 2 and also works on Python 3.
    with open(cl_out_file, 'r') as infile:
        for i, line in enumerate(infile.readlines()):
            # Rows for documents use their label as the node id; rows for
            # internal cluster nodes use the row index itself.
            if i < len(labels):
                child = labels[i]
            else:
                child = str(i)
            parent = line.split(' ')[0]
            tree_dict.setdefault(parent, []).append(child)
    # The tree root is the single child of the virtual node '-1'.
    out_string = write_node(tree_dict['-1'][0], tree_dict, name2group)
    with io.open(out_file, 'wb') as outfile:
        outfile.write(out_string.encode("utf8"))
def run_convert(cl_out_file, out_file, options, rowlabels):
    """Render CLUTO's PostScript output to PNG with ImageMagick `convert`.

    The rasterization density (DPI) is scaled down for small corpora so the
    resulting image is not needlessly large: plottree renders at 150 DPI,
    other analysis types at 300, and corpora under 50/25 documents drop to
    100/50 DPI regardless of analysis type.
    """
    if options['analysis_type'] == 'plottree':
        density = '150'
    else:
        density = '300'
    if len(rowlabels) < 50:
        density = '100'
    if len(rowlabels) < 25:
        density = '50'
    # BUG FIX: removed the dead `resize` variable -- it was computed from the
    # analysis type but never passed to convert.
    call(['convert', '-density', density, cl_out_file, 'png:' + out_file])
def run(inputFile, outputFile, options):
    """End-to-end clustering pipeline.

    Loads the similarity data from `inputFile` (a directory containing
    similarity.json, matrix.txt, distance.json and weighted.json), runs
    CLUTO in a scratch directory, and writes all result artifacts (PNG,
    JSON trees, XLSX, labels, copied inputs) into the `outputFile`
    directory, which is created if missing.

    options -- task options dict; "analysis_type" defaults to "plottree",
               "no_clusters" is resolved by number_of_clusters().
    """
    data,rowlabels=loadData(inputFile+"/similarity.json");
    if not "analysis_type" in options:
        options["analysis_type"]="plottree" ;
    no_clusters=number_of_clusters(options,rowlabels)
    # Scratch directory: CLUTO writes several side files next to its input
    # matrix (matrix.txt.tree, matrix.txt.clustering.<k>).
    temp_folder = tempfile.mkdtemp()
    if not os.path.exists(temp_folder):
        os.mkdir(temp_folder)
    cluto_path=os.path.join(temp_folder, 'matrix.txt');
    rlabel_path=os.path.join(temp_folder, 'documents_ids.txt');
    cluto_out_path=os.path.join(temp_folder, 'cluto.ps');
    shutil.copy2(os.path.join(inputFile, 'matrix.txt'),os.path.join(temp_folder, 'matrix.txt'))
    # Row-label file: one document name per line, in matrix row order.
    with io.open(rlabel_path, 'w') as rlabel_ofs:
        for lab in rowlabels:
            print(lab, file=rlabel_ofs)
    run_cluto(options,no_clusters,cluto_path,rlabel_path,cluto_out_path,os.path.join(temp_folder, 'clutoout.txt'))
    if not os.path.exists(outputFile):
        os.mkdir(outputFile)
    # Keep CLUTO's console output for diagnostics.
    shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),os.path.join(outputFile,'clutoout.txt'))
    # Dendrogram JSON from the .tree side file, plus the rendered PNG.
    run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),os.path.join(outputFile,'result.json'),rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)))
    run_convert(cluto_out_path,os.path.join(outputFile,'result.png'),options,rowlabels);
    #for heatmap
    toHeatMapJSON(cluto_path,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),rowlabels,outputFile+"/data.json");
    #Check if they are required by any tool
    shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.clustering'))
    shutil.copyfile(cluto_path,os.path.join(outputFile,'matrix.txt'))
    joblib.dump(rowlabels,outputFile+"/rowlabels.pkl");
    #Results in JSON:
    # NOTE(review): the file is opened in binary mode but strip('\n') expects
    # str -- this only works under Python 2; switch to text mode if this
    # module is ever migrated to Python 3.
    with open(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)), 'rb') as f:
        clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()]
    #to be deleted, but now required by visualisation
    res={"clusters":clusters,"rowlabels":rowlabels.tolist()}
    with open(os.path.join(outputFile,'clusters.json'), 'w') as outfile:
        json.dump(res, outfile)
    # Grouping metadata derived from document names, with the computed
    # clustering added as an extra "clusters" grouping.
    labels=getLablesFromNames(rowlabels);
    labels["groupnames"]["clusters"]=list(set(clusters));
    labels["groups"]["clusters"]=clusters;
    with open(os.path.join(outputFile,'labels.json'), 'w') as outfile:
        json.dump(labels, outfile)
    #results in XLSX
    saveXLSX(rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.xlsx'))
    #Coping results for next tools
    #for visulisation (mds)
    #similarity matrix
    shutil.copyfile(os.path.join(inputFile, 'similarity.json'),os.path.join(outputFile,'similarity.json'))
    shutil.copyfile(os.path.join(inputFile, 'distance.json'),os.path.join(outputFile,'distance.json'))
    #for featsel
    #matrix after selection and weighting
    shutil.copyfile(os.path.join(inputFile, 'weighted.json'),os.path.join(outputFile,'weighted.json'))
    #remove temp_folder
    shutil.rmtree(temp_folder)
def getLablesFromNames(row_labels):
#data, data_cleaned,shortest_row_len, row_labels = get_data(row)
shortest_row_len = 10000000
data=[];
for i, t in enumerate(row_labels):
t=str(t.encode('utf-8'));
t = re.split("[,._\-:]", t)
t = list(map(str.strip, t))
data.append(t)
if shortest_row_len > len(t):
shortest_row_len = len(t)
repeating = set(data[0])
for s in data[1:]:
repeating.intersection_update(s)
repeating = list(repeating)
for i, d in enumerate(data):
for r in repeating:
if r in d:
d.remove(r)
data[i] = d
first_lvl_categories = set()
first_lvl_name = 'first level'
second_lvl_categories = set()
second_lvl_name = 'second level'
last_lvl_categories = set()
last_lvl_name = 'last level'
second_lvl_idx = 1
if shortest_row_len < 2:
second_lvl_idx = 0
for row in data:
if len(row)<=second_lvl_idx:
second_lvl_idx=0;
first_lvl_categories.add(row[0])
second_lvl_categories.add(row[second_lvl_idx])
last_lvl_categories.add('_'.join(row[0:-1]))
group_names = {
first_lvl_name: list(first_lvl_categories),
second_lvl_name: list(second_lvl_categories),
last_lvl_name: list(last_lvl_categories)
}
groups = {
first_lvl_name: [],
second_lvl_name: [],
last_lvl_name: []
}
for i, row in enumerate(data):
groups[first_lvl_name].append(row[0])
groups[second_lvl_name].append(row[second_lvl_idx])
groups[last_lvl_name].append('_'.join(row[0:-1]))
return {
'rowlabels':row_labels.tolist(),
'groups': groups,
'groupnames': group_names
}
def test0():
    """Manual smoke test: cluster the bundled test matrix and render a PNG.

    Requires the CLUTO binary and test/dane.bin to be present; not run
    automatically.
    """
    scluster = "./cluto-2.1.2/Linux-x86_64/scluster"
    cluto_input_file = "test/dane.bin"
    options = {'analysis_type': 'plottree'}
    no_clusters = 2
    cl_out_file = "out.ps"
    out_file = "out.png"
    rowlabels = []
    call([scluster, cluto_input_file, str(no_clusters), '-fulltree',
          '-plotformat', 'ps',
          '-' + options['analysis_type'] + '=' + cl_out_file])
    run_convert(cl_out_file, out_file, options, rowlabels)
def test1():
    """Smoke test: run the full pipeline on the local 'in'/'out' directories."""
    run("in", "out", {})


if __name__ == '__main__':
    test1()
\ No newline at end of file
#!/usr/bin/python
# -*- coding: utf-8 -*-
import nlp_ws
import cluto
class ClutoWorker(nlp_ws.NLPWorker):
    """NLP-WS worker that wraps the CLUTO clustering pipeline (cluto.run)."""
    #def init(self):
    #self.logger.log(INFO, "Iobber model loaded form "+ self.config['model-dir'])
    def process(self, inputFile, taskOptions, outputFile):
        # Delegate the whole job to cluto.run: it reads the similarity data
        # from inputFile and writes all result artifacts into outputFile.
        cluto.run(inputFile,outputFile,taskOptions)


if __name__ == '__main__':
    # Start the NLP-WS service loop for this worker class.
    nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True)
[service]
root = /samba/requests/
tool = cluto
rabbit_host =rabbit.clarin.ws
rabbit_user =clarin
rabbit_password =clarin123
[tool]
workers_number = 4
[logging]
port = 9993
local_log_level = INFO
# --- Build stage: compile the Supermatrix native library --------------------
# NOTE(review): the stage was named "builder", shadowing the base image name
# ("FROM builder AS builder"); renamed for clarity.
FROM builder AS supermatrix-build
WORKDIR /tmp

## Supermatrix
COPY supermatrix ./supermatrix
# WORKDIR instead of `cd` inside RUN (hadolint DL3003).
WORKDIR /tmp/supermatrix/build
RUN cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j6 && \
    make install DESTDIR="/install" && \
    ldconfig

# --- Runtime stage ----------------------------------------------------------
FROM worker-python

# Native build/runtime dependencies; apt lists removed in the same layer.
RUN apt-get -y update && apt-get install -y \
    gettext \
    libboost-filesystem-dev \
    libboost-iostreams-dev \
    libboost-program-options-dev \
    libboost-regex-dev \
    libboost-system-dev \
    libicu-dev \
    libgettextpo-dev \
    libloki-dev \
    libxml++2.6-dev \
    r-base \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image.
# NOTE(review): "sklearn" on PyPI is a deprecated shim that now refuses to
# install; the real package is scikit-learn.
RUN pip install --no-cache-dir \
    gensim \
    joblib \
    pyyaml \
    rpy2==2.7.2 \
    scikit-learn \
    scipy \
    sqlalchemy \
    xlsxwriter

# Locally-built Python packages shipped with the repository.
COPY external/ external/
RUN cd external/ && pip install --no-cache-dir \
    colte-0.1.tar.gz \
    corpus2_filtering.tar.gz \
    Desifex-4.0.tar.gz \
    growdict-0.1.tar.gz

# Native artifacts built in the first stage.
COPY --from=supermatrix-build /install/usr/ /usr/
# RUN ln /usr/lib/x86_64-linux-gnu/libicuuc.so.57 /usr/lib/x86_64-linux-gnu/libicuuc.so.55 && \
#    ln /usr/lib/x86_64-linux-gnu/libicuio.so.57 /usr/lib/x86_64-linux-gnu/libicuio.so.55
# ENV PYTHONPATH="${PYTHONPATH}:/usr/local/lib/python2.7/dist-packages/"
RUN ldconfig

COPY workers/ /workers/
COPY init.sh /

# Expose ports (documentation only; default AMQP/RabbitMQ port).
EXPOSE 5672
ENTRYPOINT ["/init.sh"]
# Compose definition for the CLUTO clustering worker service.
version: '3'
services:
  cluto:
    container_name: clarin_cluto
    build: ./
    working_dir: /home/worker
    # Run the worker directly with the image's Python 2 interpreter.
    entrypoint:
      - python2
      - cluto_worker.py
    volumes:
      # Shared request/response directory used by the service framework.
      - /samba:/samba
      # Config and sources are bind-mounted so they can be changed without
      # rebuilding the image.
      - ./config.ini:/home/worker/config.ini
      - ./cluto_worker.py:/home/worker/cluto_worker.py
      - ./cluto.py:/home/worker/cluto.py
    restart: always
\ No newline at end of file
# Python dependencies for the CLUTO worker (installed in the Python 2.7 image).
# NOTE(review): nlp-ws and xlsxwriter are unpinned -- consider pinning them
# for reproducible builds, matching the other entries.
nlp-ws
numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
xlsxwriter
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment