Commit a4c48b7f authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

Initial commit

parents
FROM clarinpl/python:2.7

# Install Python dependencies first so this layer stays cached until
# requirements.txt changes. --no-cache-dir keeps pip's cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Fetch and unpack the CLUTO clustering toolkit (pre-built binaries).
# The tarball is removed in the same layer so it does not bloat the image.
WORKDIR /home/worker
RUN wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/cluto/cluto-2.1.2a.tar.gz && \
    tar -xvf cluto-2.1.2a.tar.gz && \
    rm cluto-2.1.2a.tar.gz

# ImageMagick renders CLUTO's PostScript output to PNG. Its default security
# policy forbids the PS coder, so grant it read|write via sed on policy.xml.
# apt lists are removed in the same layer that created them.
RUN apt-get -y update && apt-get -y install --no-install-recommends imagemagick && \
    rm -rf /var/lib/apt/lists/*
RUN sed -i 's/\(<policy domain="coder" rights=\)"none" \(pattern="PS" \/>\)/\1"read|write"\2/g' /etc/ImageMagick-6/policy.xml
#!/usr/bin/python
# cluto.py: drives CLUTO document clustering and converts its outputs
# (tree, clustering, PostScript plot) into JSON/PNG/XLSX artifacts.
from __future__ import print_function
import argparse as _argp
import csv as _csv
import json,re,os
import io
import numpy as _np
# NOTE(review): `os` is imported twice (here and above), and argparse, csv,
# time and glob appear unused in this module -- candidates for cleanup.
import time,glob,os,shutil,tempfile
from subprocess import call
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn >= 0.23.
# This works with the pinned scikit-learn==0.19.0, but migrate to a plain
# `import joblib` when upgrading.
from sklearn.externals import joblib
import xlsxwriter

# Global verbosity flag (currently not consulted anywhere in this module).
verbose = False
def loadData(inputFile):
    """Load the similarity JSON produced by the previous pipeline step.

    The file must contain "rowlabels" (document names) and "arr" (the
    similarity matrix as nested lists).

    Returns a (matrix, rowlabels) pair of numpy arrays.
    """
    with open(inputFile) as json_ifs:
        payload = json.load(json_ifs)
    labels = _np.asarray(payload["rowlabels"])
    matrix = _np.asarray(payload["arr"])
    # Drop the (potentially large) raw list so it can be garbage-collected.
    payload["arr"] = None
    return matrix, labels
def saveXLSX(names, clustering_path, outfile):
    """Write an XLSX report mapping each document name to its cluster id.

    names           -- iterable of document names, one per matrix row
    clustering_path -- CLUTO .clustering file, one cluster id per line,
                       aligned with `names`
    outfile         -- path of the workbook to create
    """
    with open(clustering_path) as f:
        groups = f.readlines()
    workbook = xlsxwriter.Workbook(outfile)
    worksheet = workbook.add_worksheet("result")
    # Table starts at row 3, column 4 (cell E4); header row first.
    srow = 3
    scol = 4
    worksheet.write(srow, scol, 'Nazwy')
    worksheet.write(srow, scol + 1, 'Grupa')
    srow += 1
    for ind, name in enumerate(names):
        worksheet.write(srow, scol, name)
        # BUG FIX: strip the trailing newline so the cell holds "3" rather
        # than "3\n" (consistent with toHeatMapJSON's handling).
        worksheet.write(srow, scol + 1, groups[ind].strip())
        srow += 1
    workbook.close()
def toHeatMapJSON(cluto_path, clustering_path, names, outfile):
    """Combine a CLUTO matrix and a clustering assignment into heat-map JSON.

    cluto_path      -- CLUTO sparse matrix file (first line is the header)
    clustering_path -- one cluster id per line, aligned with `names`
    names           -- document names, in matrix row order
    outfile         -- destination path for the {"nodes": ..., "links": ...}
                       JSON document
    """
    with open(clustering_path) as clust_ifs:
        group_lines = clust_ifs.readlines()

    # One node per document: display name plus its assigned cluster id.
    nodes = [{'name': name, 'group': group_lines[idx].strip()}
             for idx, name in enumerate(names)]

    # Links: every "column value" pair on each data row of the matrix.
    pair_re = r"\d+\s[0-9]*\.?[0-9]+"
    with open(cluto_path) as mat_ifs:
        matrix_lines = mat_ifs.readlines()
    links = []
    for row_idx, row in enumerate(matrix_lines[1:]):
        for pair in re.findall(pair_re, row):
            col, value = pair.split()
            links.append({
                'source': str(row_idx),
                # CLUTO columns are 1-based; the heat map wants 0-based.
                'target': str(int(col) - 1),
                'value': str(float(value)),
            })

    with open(outfile, 'w') as json_ofs:
        json_ofs.write(json.dumps({'nodes': nodes, 'links': links}))
def number_of_clusters(options, rowlabels):
    """Resolve the requested number of clusters from the task options.

    Accepts any value convertible with int() -- options arrive from JSON, so
    the count may legitimately be a numeric string; anything unconvertible
    falls back to the default of 2. The result is clamped to at least 2 and
    at most len(rowlabels) (CLUTO cannot produce more clusters than there
    are documents).

    BUG FIX: the original returned a str in the capped branch but an int
    everywhere else; the return type is now consistently int.
    """
    default = 2
    value = options.get('no_clusters', default)
    try:
        no_clusters = int(value)
    except (TypeError, ValueError):
        no_clusters = default
    if no_clusters < default:
        no_clusters = default
    if no_clusters > len(rowlabels):
        no_clusters = len(rowlabels)
    return no_clusters
def save_clutofiles(mat,rlabels,clabels,cluto_path,rlabel_path,clabel_path):
# Save cluto file
with open(cluto_path, 'w') as cluto_ofs:
# Print header:
# <num_rows> <num_cols> <num_nonzero>
print(
len(rlabels),
len(clabels),
_np.count_nonzero(mat),
file=cluto_ofs,
)
for row in mat:
buf = []
for idx in row.nonzero()[0]:
buf.append('{} {}'.format(idx+1, row[idx]))
print(' '.join(buf), file=cluto_ofs)
# Save label files
with io.open(rlabel_path, 'w') as rlabel_ofs:
for lab in rlabels:
print(lab, file=rlabel_ofs)
with io.open(clabel_path, 'w') as clabel_ofs:
for lab in clabels:
print(lab, file=clabel_ofs)
def run_cluto(options, no_clusters, cluto_input_file, rlabel_path,
              cl_out_file, clutoout):
    """Invoke the CLUTO scluster binary on a prepared matrix file.

    The analysis output (e.g. a plottree PostScript file) is written to
    cl_out_file; scluster's stdout is captured into the clutoout log file.
    """
    scluster = "./cluto-2.1.2/Linux-x86_64/scluster"
    cmd = [
        scluster,
        cluto_input_file,
        str(no_clusters),
        '-fulltree',
        '-rlabelfile', rlabel_path,
        '-plotformat', 'ps',
        '-' + options['analysis_type'] + '=' + cl_out_file,
    ]
    with open(clutoout, "w") as log_ofs:
        call(cmd, stdout=log_ofs)
def write_node(node_id, tree_dict, name2group):
    """Recursively serialize one dendrogram node to a JSON fragment (string).

    tree_dict maps a node id to the list of its child ids. Leaves (ids with
    no entry in tree_dict) additionally carry a "group" field looked up in
    name2group; internal nodes carry only their children.
    """
    children = [write_node(child_id, tree_dict, name2group)
                for child_id in tree_dict.get(node_id, [])]
    if children:
        head = ('{"id":"node_' + node_id + '", "name":"' + node_id +
                '", "data":{}, "children":[')
    else:
        head = ('{"id":"node_' + node_id + '", "group":' +
                str(name2group[node_id]) + ', "name":"' + node_id +
                '", "data":{}, "children":[')
    return head + ', '.join(children) + ']}'
def run_convert2json(cl_out_file, out_file, labels, clustering_path):
    """Convert CLUTO's -fulltree parent list into a nested JSON tree file.

    cl_out_file     -- CLUTO tree file: line i holds the parent id of node i;
                       nodes 0..len(labels)-1 are documents, larger ids are
                       internal cluster nodes, and parent '-1' marks the root
    out_file        -- destination JSON file (written as UTF-8 bytes)
    labels          -- document names, in matrix row order
    clustering_path -- flat clustering file used to attach a "group" id to
                       each leaf
    """
    with open(clustering_path) as f:
        groups = f.readlines()
    # Leaf name -> cluster id, consumed by write_node for the "group" field.
    name2group = {}
    for i, gr in enumerate(groups):
        name2group[labels[i]] = int(gr)
    # parent id -> list of child ids.
    tree_dict = {}
    # BUG FIX: the tree file was opened in binary mode ('rb'); under
    # Python 3, line.split(' ') then fails on bytes. Text mode behaves
    # identically on Python 2 and also works on Python 3.
    with open(cl_out_file, 'r') as infile:
        for i, line in enumerate(infile.readlines()):
            # Rows for documents use their label as the node id; rows for
            # internal cluster nodes use the row index itself.
            if i < len(labels):
                child = labels[i]
            else:
                child = str(i)
            parent = line.split(' ')[0]
            tree_dict.setdefault(parent, []).append(child)
    # The tree root is the single child of the virtual node '-1'.
    out_string = write_node(tree_dict['-1'][0], tree_dict, name2group)
    with io.open(out_file, 'wb') as outfile:
        outfile.write(out_string.encode("utf8"))
def run_convert(cl_out_file, out_file, options, rowlabels):
    """Render CLUTO's PostScript output to PNG with ImageMagick `convert`.

    The rasterization density (DPI) is scaled down for small corpora so the
    resulting image is not needlessly large: plottree renders at 150 DPI,
    other analysis types at 300, and corpora under 50/25 documents drop to
    100/50 DPI regardless of analysis type.
    """
    if options['analysis_type'] == 'plottree':
        density = '150'
    else:
        density = '300'
    if len(rowlabels) < 50:
        density = '100'
    if len(rowlabels) < 25:
        density = '50'
    # BUG FIX: removed the dead `resize` variable -- it was computed from the
    # analysis type but never passed to convert.
    call(['convert', '-density', density, cl_out_file, 'png:' + out_file])
def run(inputFile, outputFile, options):
    """End-to-end clustering pipeline.

    Loads the similarity data from `inputFile` (a directory containing
    similarity.json, matrix.txt, distance.json and weighted.json), runs
    CLUTO in a scratch directory, and writes all result artifacts (PNG,
    JSON trees, XLSX, labels, copied inputs) into the `outputFile`
    directory, which is created if missing.

    options -- task options dict; "analysis_type" defaults to "plottree",
               "no_clusters" is resolved by number_of_clusters().
    """
    data,rowlabels=loadData(inputFile+"/similarity.json");
    if not "analysis_type" in options:
        options["analysis_type"]="plottree" ;
    no_clusters=number_of_clusters(options,rowlabels)
    # Scratch directory: CLUTO writes several side files next to its input
    # matrix (matrix.txt.tree, matrix.txt.clustering.<k>).
    temp_folder = tempfile.mkdtemp()
    if not os.path.exists(temp_folder):
        os.mkdir(temp_folder)
    cluto_path=os.path.join(temp_folder, 'matrix.txt');
    rlabel_path=os.path.join(temp_folder, 'documents_ids.txt');
    cluto_out_path=os.path.join(temp_folder, 'cluto.ps');
    shutil.copy2(os.path.join(inputFile, 'matrix.txt'),os.path.join(temp_folder, 'matrix.txt'))
    # Row-label file: one document name per line, in matrix row order.
    with io.open(rlabel_path, 'w') as rlabel_ofs:
        for lab in rowlabels:
            print(lab, file=rlabel_ofs)
    run_cluto(options,no_clusters,cluto_path,rlabel_path,cluto_out_path,os.path.join(temp_folder, 'clutoout.txt'))
    if not os.path.exists(outputFile):
        os.mkdir(outputFile)
    # Keep CLUTO's console output for diagnostics.
    shutil.copyfile(os.path.join(temp_folder, 'clutoout.txt'),os.path.join(outputFile,'clutoout.txt'))
    # Dendrogram JSON from the .tree side file, plus the rendered PNG.
    run_convert2json(os.path.join(temp_folder, 'matrix.txt.tree'),os.path.join(outputFile,'result.json'),rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)))
    run_convert(cluto_out_path,os.path.join(outputFile,'result.png'),options,rowlabels);
    #for heatmap
    toHeatMapJSON(cluto_path,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),rowlabels,outputFile+"/data.json");
    #Check if they are required by any tool
    shutil.copyfile(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.clustering'))
    shutil.copyfile(cluto_path,os.path.join(outputFile,'matrix.txt'))
    joblib.dump(rowlabels,outputFile+"/rowlabels.pkl");
    #Results in JSON:
    # NOTE(review): the file is opened in binary mode but strip('\n') expects
    # str -- this only works under Python 2; switch to text mode if this
    # module is ever migrated to Python 3.
    with open(os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)), 'rb') as f:
        clusters = [cluster_id.strip('\n') for cluster_id in f.readlines()]
    #to be deleted, but now required by visualisation
    res={"clusters":clusters,"rowlabels":rowlabels.tolist()}
    with open(os.path.join(outputFile,'clusters.json'), 'w') as outfile:
        json.dump(res, outfile)
    # Grouping metadata derived from document names, with the computed
    # clustering added as an extra "clusters" grouping.
    labels=getLablesFromNames(rowlabels);
    labels["groupnames"]["clusters"]=list(set(clusters));
    labels["groups"]["clusters"]=clusters;
    with open(os.path.join(outputFile,'labels.json'), 'w') as outfile:
        json.dump(labels, outfile)
    #results in XLSX
    saveXLSX(rowlabels,os.path.join(temp_folder, 'matrix.txt.clustering.'+str(no_clusters)),os.path.join(outputFile,'result.xlsx'))
    #Coping results for next tools
    #for visulisation (mds)
    #similarity matrix
    shutil.copyfile(os.path.join(inputFile, 'similarity.json'),os.path.join(outputFile,'similarity.json'))
    shutil.copyfile(os.path.join(inputFile, 'distance.json'),os.path.join(outputFile,'distance.json'))
    #for featsel
    #matrix after selection and weighting
    shutil.copyfile(os.path.join(inputFile, 'weighted.json'),os.path.join(outputFile,'weighted.json'))
    #remove temp_folder
    shutil.rmtree(temp_folder)
def getLablesFromNames(row_labels):
#data, data_cleaned,shortest_row_len, row_labels = get_data(row)
shortest_row_len = 10000000
data=[];
for i, t in enumerate(row_labels):
t=str(t.encode('utf-8'));
t = re.split("[,._\-:]", t)
t = list(map(str.strip, t))
data.append(t)
if shortest_row_len > len(t):
shortest_row_len = len(t)
repeating = set(data[0])
for s in data[1:]:
repeating.intersection_update(s)
repeating = list(repeating)
for i, d in enumerate(data):
for r in repeating:
if r in d:
d.remove(r)
data[i] = d
first_lvl_categories = set()
first_lvl_name = 'first level'
second_lvl_categories = set()
second_lvl_name = 'second level'
last_lvl_categories = set()
last_lvl_name = 'last level'
second_lvl_idx = 1
if shortest_row_len < 2:
second_lvl_idx = 0
for row in data:
if len(row)<=second_lvl_idx:
second_lvl_idx=0;
first_lvl_categories.add(row[0])
second_lvl_categories.add(row[second_lvl_idx])
last_lvl_categories.add('_'.join(row[0:-1]))
group_names = {
first_lvl_name: list(first_lvl_categories),
second_lvl_name: list(second_lvl_categories),
last_lvl_name: list(last_lvl_categories)
}
groups = {
first_lvl_name: [],
second_lvl_name: [],
last_lvl_name: []
}
for i, row in enumerate(data):
groups[first_lvl_name].append(row[0])
groups[second_lvl_name].append(row[second_lvl_idx])
groups[last_lvl_name].append('_'.join(row[0:-1]))
return {
'rowlabels':row_labels.tolist(),
'groups': groups,
'groupnames': group_names
}
def test0():
    """Manual smoke test: cluster the bundled test matrix and render a PNG.

    Requires the CLUTO binary and test/dane.bin to be present; not run
    automatically.
    """
    scluster = "./cluto-2.1.2/Linux-x86_64/scluster"
    cluto_input_file = "test/dane.bin"
    options = {'analysis_type': 'plottree'}
    no_clusters = 2
    cl_out_file = "out.ps"
    out_file = "out.png"
    rowlabels = []
    call([scluster, cluto_input_file, str(no_clusters), '-fulltree',
          '-plotformat', 'ps',
          '-' + options['analysis_type'] + '=' + cl_out_file])
    run_convert(cl_out_file, out_file, options, rowlabels)
def test1():
    """Smoke test: run the full pipeline on the local 'in'/'out' directories."""
    run("in", "out", {})


if __name__ == '__main__':
    test1()
\ No newline at end of file
#!/usr/bin/python
# -*- coding: utf-8 -*-
import nlp_ws
import cluto
class ClutoWorker(nlp_ws.NLPWorker):
    """NLP-WS worker that wraps the CLUTO clustering pipeline (cluto.run)."""
    #def init(self):
    #self.logger.log(INFO, "Iobber model loaded form "+ self.config['model-dir'])
    def process(self, inputFile, taskOptions, outputFile):
        # Delegate the whole job to cluto.run: it reads the similarity data
        # from inputFile and writes all result artifacts into outputFile.
        cluto.run(inputFile,outputFile,taskOptions)


if __name__ == '__main__':
    # Start the NLP-WS service loop for this worker class.
    nlp_ws.NLPService.main(ClutoWorker, pause_at_exit=True)
[service]
root = /samba/requests/
tool = cluto
rabbit_host =rabbit.clarin.ws
rabbit_user =clarin
rabbit_password =clarin123
[tool]
workers_number = 4
[logging]
port = 9993
local_log_level = INFO
# --- Build stage: compile the Supermatrix native library --------------------
# NOTE(review): the stage was named "builder", shadowing the base image name
# ("FROM builder AS builder"); renamed for clarity.
FROM builder AS supermatrix-build
WORKDIR /tmp

## Supermatrix
COPY supermatrix ./supermatrix
# WORKDIR instead of `cd` inside RUN (hadolint DL3003).
WORKDIR /tmp/supermatrix/build
RUN cmake -DCMAKE_BUILD_TYPE=Release .. && \
    make -j6 && \
    make install DESTDIR="/install" && \
    ldconfig

# --- Runtime stage ----------------------------------------------------------
FROM worker-python

# Native build/runtime dependencies; apt lists removed in the same layer.
RUN apt-get -y update && apt-get install -y \
    gettext \
    libboost-filesystem-dev \
    libboost-iostreams-dev \
    libboost-program-options-dev \
    libboost-regex-dev \
    libboost-system-dev \
    libicu-dev \
    libgettextpo-dev \
    libloki-dev \
    libxml++2.6-dev \
    r-base \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image.
# NOTE(review): "sklearn" on PyPI is a deprecated shim that now refuses to
# install; the real package is scikit-learn.
RUN pip install --no-cache-dir \
    gensim \
    joblib \
    pyyaml \
    rpy2==2.7.2 \
    scikit-learn \
    scipy \
    sqlalchemy \
    xlsxwriter

# Locally-built Python packages shipped with the repository.
COPY external/ external/
RUN cd external/ && pip install --no-cache-dir \
    colte-0.1.tar.gz \
    corpus2_filtering.tar.gz \
    Desifex-4.0.tar.gz \
    growdict-0.1.tar.gz

# Native artifacts built in the first stage.
COPY --from=supermatrix-build /install/usr/ /usr/
# RUN ln /usr/lib/x86_64-linux-gnu/libicuuc.so.57 /usr/lib/x86_64-linux-gnu/libicuuc.so.55 && \
#    ln /usr/lib/x86_64-linux-gnu/libicuio.so.57 /usr/lib/x86_64-linux-gnu/libicuio.so.55
# ENV PYTHONPATH="${PYTHONPATH}:/usr/local/lib/python2.7/dist-packages/"
RUN ldconfig

COPY workers/ /workers/
COPY init.sh /

# Expose ports (documentation only; default AMQP/RabbitMQ port).
EXPOSE 5672
ENTRYPOINT ["/init.sh"]
# Compose definition for the CLUTO clustering worker service.
version: '3'
services:
  cluto:
    container_name: clarin_cluto
    build: ./
    working_dir: /home/worker
    # Run the worker directly with the image's Python 2 interpreter.
    entrypoint:
      - python2
      - cluto_worker.py
    volumes:
      # Shared request/response directory used by the service framework.
      - /samba:/samba
      # Config and sources are bind-mounted so they can be changed without
      # rebuilding the image.
      - ./config.ini:/home/worker/config.ini
      - ./cluto_worker.py:/home/worker/cluto_worker.py
      - ./cluto.py:/home/worker/cluto.py
    restart: always
\ No newline at end of file
# Python dependencies for the CLUTO worker (installed in the Python 2.7 image).
# NOTE(review): nlp-ws and xlsxwriter are unpinned -- consider pinning them
# for reproducible builds, matching the other entries.
nlp-ws
numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
xlsxwriter
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment