Commit 7381213e authored by Szymon Ciombor

added CI, refactored

parent bf9ffbc3
Pipeline #1684 passed in 36 seconds
.gitlab-ci.yml

image: clarinpl/python:2.7

cache:
  paths:
    - .tox

before_script:
  - pip install tox==2.9.1

pep8:
  script:
    - tox -v -e pep8

build_image:
  stage: build
  image: 'docker:18.09.7'
  only:
    - master
  services:
    - 'docker:18.09.7-dind'
  before_script:
    - ''
  script:
    - docker build -t clarinpl/subfeatsel .
    - echo $DOCKER_PASSWORD > pass.txt
    - cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
    - rm pass.txt
    - docker push clarinpl/subfeatsel
Dockerfile

@@ -21,10 +21,6 @@ RUN wget http://prdownloads.sourceforge.net/weka/weka-3-8-3.zip && \
     unzip weka-3-8-3.zip
 COPY requirements.txt .
 RUN pip install -r requirements.txt
-# Change it to pypi
-COPY module/ /home/worker/
-RUN python -m easy_install /home/worker/ltcore*
@@ -35,4 +31,11 @@ WORKDIR /tmp
 RUN wget http://www.nlp.pwr.wroc.pl/download/lexcsd/ltlearn-0.2.0.tar.gz && \
     tar -xvf ltlearn-0.2.0.tar.gz && \
     cd ltlearn-0.2.0 && \
-    python setup.py install
\ No newline at end of file
+    python setup.py install
+WORKDIR /home/worker
+COPY ./src ./src
+COPY ./main.py .
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+CMD ["python","main.py"]
\ No newline at end of file
config.ini

@@ -2,12 +2,12 @@
 tool = subfeatsel
 root = /samba/requests/
-rabbit_host = rabbit.clarin.ws
-rabbit_user = clarin
-rabbit_password = clarin123
+rabbit_host = rabbitmq
+rabbit_user = test
+rabbit_password = test
 [tool]
-workers_number = 12
+workers_number = 2
 weka_jar=/tmp/weka-3-8-3/weka.jar
 [logging]
main.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Implementation of SubFeatSel worker."""
import nlp_ws

from src.subfeatsel_worker import FeatSelSubWorker

if __name__ == "__main__":
    nlp_ws.NLPService.main(FeatSelSubWorker)
src/subfeatsel_worker.py

#!/usr/bin/python2
import base64
import json
import shutil
import sys
import tempfile
import time
from argparse import Namespace
from collections import namedtuple

import numpy
import scipy
from scipy import stats
from scipy.sparse import lil_matrix

import ltlearn.transformations.selection.weka.wekaattributeselection as was
import nlp_ws
from ltcore import SparseMatrix
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Python 2 workaround so implicit str/unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

SCIPY_METHODS = ['MannWhitney', 'WilcoxonRankSum', 'KolmogorovSmirnov',
                 'KruskalWallis', 'TTest']
WEKA_METHODS = ['InfoGainAttributeEval', 'CfsSubsetEval',
                'ConsistencySubsetEval', 'GainRatioAttributeEval',
                'ChiSquaredAttributeEval']
ENSEMBLE_METHODS = ['ExtraTrees', 'RandomForest', 'GradientBoosting',
                    'AdaBoost']
RFE_METHODS = ['RFE_LogisticRegression', 'RFE_SVM', 'RFE_NaiveBayes']

stats_file = 'descriptive_features.csv'
verbose = True
class FeatureSelection(object):
    """A class for feature selection with Weka, scipy or scikit-learn."""

    def __init__(self, attr_eval, no_features_to_select, weka_jar, tmp_dir):
        self.attr_eval = attr_eval
        self.no_feats = no_features_to_select
        self.weka_jar = weka_jar
        self.tmp_dir = tmp_dir
        if self.attr_eval in WEKA_METHODS:
            if 'Subset' in self.attr_eval:
                self.search_method = 'GreedyStepwise'
            else:
                self.search_method = 'Ranker'

    def select_features(self, matrix):
        """
        Select features that differentiate the classes of the matrix.

        The matrix has to contain exactly two classes. Returns a list of
        dicts, one per selected feature, each holding the feature name, its
        score and the mean/std inside and outside the positive class.
        """
        if self.attr_eval in WEKA_METHODS:
            return self.weka_feature_selection(matrix)
        elif self.attr_eval in SCIPY_METHODS:
            return self.scipy_feature_selection(matrix)
        elif self.attr_eval in ENSEMBLE_METHODS:
            return self.ensemble_feature_selection(matrix)
        elif self.attr_eval in RFE_METHODS:
            return self.rfe_feature_selection(matrix)
        else:
            raise Exception('Unrecognized feature selection method: %s.'
                            % self.attr_eval)
    def rfe_feature_selection(self, matrix):
        """Select features with recursive feature elimination (RFE) from sklearn."""
        print matrix.head_word
        if self.attr_eval == 'RFE_LogisticRegression':
            model = LogisticRegression()
        elif self.attr_eval == 'RFE_SVM':
            model = SVC(kernel='linear')
        elif self.attr_eval == 'RFE_NaiveBayes':
            model = MultinomialNB()
        rfe = RFE(model, 2)
        print type(matrix.data)
        print "start"
        rfe = rfe.fit(matrix.data, matrix.class_labels)
        print "finished"
        # Map the RFE ranking (1 = best) onto a descending importance score.
        importances = (self.no_feats - (rfe.ranking_ - 1)) / (self.no_feats * 1.0)
        indices = numpy.argsort(rfe.ranking_)
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        results = []
        for f in range(len(matrix.column_labels)):
            feature = matrix.column_labels[indices[f]]
            if importances[indices[f]] < 0:
                break
            sample1 = get_column(subclasses[0], feature)
            sample1 = complete_with_zeros(sample1, subclasses[0].data.shape[0])
            sample2 = get_column(subclasses[1], feature)
            sample2 = complete_with_zeros(sample2, subclasses[1].data.shape[0])
            if subclasses[0].head_word == 'true':
                mean_in = numpy.mean(sample1)
                std_in = numpy.std(sample1)
                mean_out = numpy.mean(sample2)
                std_out = numpy.std(sample2)
            else:
                mean_in = numpy.mean(sample2)
                std_in = numpy.std(sample2)
                mean_out = numpy.mean(sample1)
                std_out = numpy.std(sample1)
            results.append({"feature": feature.encode('utf-8'),
                            "importance": importances[indices[f]],
                            "mean_in": mean_in.item(), "std_in": std_in.item(),
                            "mean_out": mean_out.item(),
                            "std_out": std_out.item()})
        return results
    def ensemble_feature_selection(self, matrix):
        """Select features with ensemble methods from sklearn."""
        print matrix.head_word
        if self.attr_eval == 'ExtraTrees':
            forest = ExtraTreesClassifier(n_estimators=250,
                                          max_features=self.no_feats,
                                          random_state=0, n_jobs=2)
        elif self.attr_eval == 'RandomForest':
            forest = RandomForestClassifier(n_estimators=250,
                                            max_features=self.no_feats,
                                            random_state=0, n_jobs=2)
        elif self.attr_eval == 'GradientBoosting':
            forest = GradientBoostingClassifier(n_estimators=250,
                                                max_features=self.no_feats,
                                                random_state=0)
        elif self.attr_eval == 'AdaBoost':
            forest = AdaBoostClassifier()
        forest.fit(matrix.data, matrix.class_labels)
        importances = forest.feature_importances_
        indices = numpy.argsort(importances)[::-1]
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        results = []
        ind = 0
        for f in range(len(matrix.column_labels)):
            feature = matrix.column_labels[indices[f]]
            # Stop at negligible importances or once no_feats features are kept.
            if importances[indices[f]] < 0.0001:
                break
            ind = ind + 1
            if ind > self.no_feats:
                break
            sample1 = get_column(subclasses[0], feature)
            sample1 = complete_with_zeros(sample1, subclasses[0].data.shape[0])
            sample2 = get_column(subclasses[1], feature)
            sample2 = complete_with_zeros(sample2, subclasses[1].data.shape[0])
            if subclasses[0].head_word == 'true':
                mean_in = numpy.mean(sample1)
                std_in = numpy.std(sample1)
                mean_out = numpy.mean(sample2)
                std_out = numpy.std(sample2)
            else:
                mean_in = numpy.mean(sample2)
                std_in = numpy.std(sample2)
                mean_out = numpy.mean(sample1)
                std_out = numpy.std(sample1)
            results.append({"feature": feature.encode('utf-8'),
                            "importance": importances[indices[f]],
                            "mean_in": mean_in.item(), "std_in": std_in.item(),
                            "mean_out": mean_out.item(),
                            "std_out": std_out.item()})
        return results
    def scipy_feature_selection(self, matrix):
        """Select features with scipy statistical tests."""
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        matrix = []  # release the original matrix; only subclasses are needed
        results = []
        for col_name in subclasses[0].column_labels:
            sample1 = get_column(subclasses[0], col_name)
            sample1 = complete_with_zeros(sample1, subclasses[0].data.shape[0])
            sample2 = get_column(subclasses[1], col_name)
            sample2 = complete_with_zeros(sample2, subclasses[1].data.shape[0])
            try:
                if self.attr_eval == 'MannWhitney':
                    res = stats.mannwhitneyu(sample1, sample2)
                elif self.attr_eval == 'WilcoxonRankSum':
                    res = stats.ranksums(sample1, sample2)
                elif self.attr_eval == 'KolmogorovSmirnov':
                    res = stats.ks_2samp(sample1, sample2)
                elif self.attr_eval == 'KruskalWallis':
                    res = stats.kruskal(sample1, sample2)
                elif self.attr_eval == 'TTest':
                    res = stats.ttest_ind(sample1, sample2)
                else:
                    print('No correlation test named %s' % self.attr_eval)
                    res = None
                if res is not None and res[1] < 0.05:  # res[1] is the p-value
                    if subclasses[0].head_word == 'true':
                        mean_in = numpy.mean(sample1)
                        std_in = numpy.std(sample1)
                        mean_out = numpy.mean(sample2)
                        std_out = numpy.std(sample2)
                    else:
                        mean_in = numpy.mean(sample2)
                        std_in = numpy.std(sample2)
                        mean_out = numpy.mean(sample1)
                        std_out = numpy.std(sample1)
                    results.append((col_name, res[0], res[1], mean_in, std_in,
                                    mean_out, std_out))
            except ValueError:
                print '%s error for %s' % (self.attr_eval, col_name), \
                    set(sample1), set(sample2)
        # Keep the no_feats features with the lowest p-values.
        return [{"feature": r[0].encode('utf-8'), "stat": r[1],
                 "p-value": r[2], "mean_in": r[3], "std_in": r[4],
                 "mean_out": r[5], "std_out": r[6]}
                for r in sorted(results, key=byThirdTerm)[:self.no_feats]]
    def weka_feature_selection(self, matrix):
        """Select features with Weka."""
        print 'Selecting features for %s' % matrix.head_word
        tmp_mat = SparseMatrix(matrix.head_word, matrix.class_labels,
                               matrix.column_labels, matrix.contexts,
                               matrix.data.tolil(),
                               boundaries=matrix.boundaries,
                               possible_classes=matrix.possible_classes)
        w = was.AttributeSelection()
        w._weka_jar = self.weka_jar
        w._work_dir = self.tmp_dir
        w.set_attribute_evaluator(self.attr_eval)
        w.set_search_method(self.search_method)
        if self.no_feats is not None and 'Subset' not in self.attr_eval:
            w.set_search_method_options('-N %s' % self.no_feats)
        elif 'Subset' not in self.attr_eval:
            print 'WARNING: number of features to select is not given in config file'
        w.make(tmp_mat)
        feats = w._selected_features[:-1]
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        results = []
        for feature in feats:
            sample1 = get_column(subclasses[0], feature)
            sample1 = complete_with_zeros(sample1, subclasses[0].data.shape[0])
            sample2 = get_column(subclasses[1], feature)
            sample2 = complete_with_zeros(sample2, subclasses[1].data.shape[0])
            if subclasses[0].head_word == 'true':
                mean_in = numpy.mean(sample1)
                std_in = numpy.std(sample1)
                mean_out = numpy.mean(sample2)
                std_out = numpy.std(sample2)
            else:
                mean_in = numpy.mean(sample2)
                std_in = numpy.std(sample2)
                mean_out = numpy.mean(sample1)
                std_out = numpy.std(sample1)
            results.append({"feature": feature.encode('utf-8'),
                            "mean_in": mean_in.item(), "std_in": std_in.item(),
                            "mean_out": mean_out.item(),
                            "std_out": std_out.item()})
        return results
    def split_matrix_by_classes(self, matrix):
        """Split the matrix into one submatrix per class label."""
        oneclass_matrices = []
        cls_dict = {}
        for i, cls in enumerate(matrix.class_labels):
            for c in cls.split(':'):
                if c in cls_dict:
                    cls_dict[c].append(i)
                else:
                    cls_dict[c] = [i]
        for cls in cls_dict:
            matrix.head_word = cls
            oneclass_matrices.append(cut_matrix_by_indices(matrix,
                                                           cls_dict[cls]))
        return oneclass_matrices
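
# A minimal, untested usage sketch of FeatureSelection (not part of the
# original commit). The Namespace stand-in is an assumption for illustration:
# it carries exactly the attributes the methods above read, with
# attributes/boundaries left as None; the real pipeline passes a matrix
# deserialized by JSONfile2csr() further down.
def _example_select_features():
    toy = Namespace(
        data=scipy.sparse.csr_matrix(
            numpy.array([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0], [0.0, 4.0]])),
        class_labels=['true', 'false', 'true', 'false'],
        column_labels=[u'feat_a', u'feat_b'],
        contexts=[None] * 4,
        documents_ids=[0, 1, 2, 3],
        attributes=None,
        boundaries=None,
        head_word=u'example')
    fs = FeatureSelection('MannWhitney', 10, weka_jar=None, tmp_dir='/tmp')
    # Returns a list of dicts such as
    # {"feature": "feat_a", "stat": ..., "p-value": ..., "mean_in": ..., ...}
    return fs.select_features(toy)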
def complete_with_zeros(sample, length):
    """Pad a sample with zeros so that it reaches the specified length."""
    return numpy.append(sample, numpy.zeros(length - len(sample)), 0)
def cut_matrix_by_indices(matrix, indices):
    """Return a SparseMatrix with the rows of the given matrix at the given indices."""
    documents_ids = []
    class_labels = []
    contexts = []
    data = lil_matrix((len(indices), matrix.data.shape[1]))
    row = 0
    for i in indices:
        documents_ids.append(matrix.documents_ids[i])
        class_labels.append(matrix.class_labels[i])
        contexts.append(matrix.contexts[i])
        data.data[row] = matrix.data.data[i]
        data.rows[row] = matrix.data.rows[i]
        row += 1
    new_matrix = SparseMatrix(matrix.head_word, class_labels,
                              matrix.column_labels, contexts, data,
                              matrix.attributes, matrix.boundaries,
                              list(set(class_labels)), documents_ids)
    # CSC format is what get_column() below expects.
    new_matrix.data = new_matrix.data.tocsc()
    return new_matrix
def get_column(matrix, col_name):
    """Return the nonzero data of a specific column (matrix.data must be CSC)."""
    mat = matrix.data
    for i in range(len(matrix.column_labels)):
        if matrix.column_labels[i] == col_name:
            return mat.data[mat.indptr[i]:mat.indptr[i + 1]]
    return []
def get_size(obj, seen=None):
    """Recursively find the size of an object (debugging helper)."""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully
    # handle self-referential objects.
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size
class FeatSelSubWorker(nlp_ws.NLPWorker):

    @classmethod
    def static_init(cls, config):
        cls.weka_jar = config['tool']['weka_jar']

    def init(self):
        return

    def process(self, input_path, task_options, output_path):
        # Default to the MannWhitney test; another evaluator such as
        # 'InfoGainAttributeEval' can be requested via task_options.
        attr_eval = 'MannWhitney'
        if 'method' in task_options:
            attr_eval = task_options['method']
        no_features_to_select = 100
        if 'no_features_to_select' in task_options:
            no_features_to_select = task_options['no_features_to_select']
        temp_folder = tempfile.mkdtemp()
        fs = FeatureSelection(attr_eval, no_features_to_select,
                              self.weka_jar, temp_folder)
        cluster_matrix = JSONfile2csr(input_path)
        cluster_matrix.possible_classes = [u'true', u'false']
        cluster_matrix.class_labels = task_options["class_labels"]
        cluster_matrix.head_word = task_options["head_word"]
        res = fs.select_features(cluster_matrix)
        shutil.rmtree(temp_folder)
        res = {"result": res, "id": task_options["id"], "method": attr_eval}
        with open(output_path, 'w') as result_ofs:
            json.dump(res, result_ofs)
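
# For reference, the request/response shapes process() works with, inferred
# from the code above; all concrete values below are illustrative, not part
# of the original commit.
EXAMPLE_TASK_OPTIONS = {
    "method": "MannWhitney",           # any SCIPY/WEKA/ENSEMBLE/RFE method
    "no_features_to_select": 100,
    "class_labels": ["true", "false", "true"],
    "head_word": "class2",
    "id": "class2",
}
# The JSON written to output_path looks like:
# {"result": [{"feature": "...", "stat": 1.0, "p-value": 0.01,
#              "mean_in": 0.5, "std_in": 0.1,
#              "mean_out": 0.0, "std_out": 0.0}, ...],
#  "id": "class2", "method": "MannWhitney"}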
def byThirdTerm(feat_tuple):
    """Sort key: the p-value is the third element of each result tuple."""
    return feat_tuple[2]
def get_group_class_labels(class_labels):
    """
    Split the class labels of a matrix into one true/false labelling per class.

    The outcome is a list with one entry per possible class, each holding
    class_labels of 'true' or 'false'. E.g., if the input matrix has these
    class_labels:
        class1:class2:class3
        class1:class4
        class2:class3
    the entry for class2 will have these class_labels:
        true
        false
        true

    @type class_labels: list(str)
    @rtype: list(dict) with 'class_labels' and 'head_word' keys
    """
    classes = []
    for line in class_labels:
        classes.extend(line.split(':'))
    classes = set(classes)
    # Create a dict: {tag: ['true', 'false', 'false', ...]}
    label_list_dict = {}
    for tag in classes:
        new_label_list = []
        for line in class_labels:
            if tag in line.split(':'):
                new_label_list.append('true')
            else:
                new_label_list.append('false')
        label_list_dict[tag] = new_label_list
    res = []
    for tag in classes:
        matrix = {}
        matrix["class_labels"] = label_list_dict[tag]
        matrix["head_word"] = tag
        res.append(matrix)
    return res
def sparse2JSON(filename, cluster_matrix):
    """Serialize a matrix dict with LIL-format sparse data to a JSON file."""
    cluster_matrix["data"] = scipy.sparse.lil_matrix(cluster_matrix["data"])
    array = cluster_matrix["data"]
    print array.shape
    cluster_matrix["data"] = dict()
    cluster_matrix["data"]["data"] = array.data.tolist()
    cluster_matrix["data"]["rows"] = array.rows.tolist()
    cluster_matrix["data"]["shape"] = array.shape
    cluster_matrix["data"]["dtype"] = array.dtype.str
    with open(filename, 'w') as outfile:
        json.dump(cluster_matrix, outfile)
def csr2JSONfile(filename, cluster_matrix):
    """Serialize a matrix dict with CSR-format sparse data to a JSON file."""
    array = cluster_matrix["data"]
    cluster_matrix["data"] = dict()
    cluster_matrix["data"]["data"] = array.data.tolist()
    cluster_matrix["data"]["indices"] = array.indices.tolist()
    cluster_matrix["data"]["shape"] = array.shape
    cluster_matrix["data"]["indptr"] = array.indptr.tolist()
    with open(filename, 'w') as outfile:
        json.dump(cluster_matrix, outfile)


def csrObject2JSONfile(filename, cluster_matrix):
    """Serialize a matrix object with CSR-format sparse data to a JSON file."""
    array = cluster_matrix.data
    cluster_matrix.data = dict()
    cluster_matrix.data["data"] = array.data.tolist()
    cluster_matrix.data["indices"] = array.indices.tolist()
    cluster_matrix.data["shape"] = array.shape
    cluster_matrix.data["indptr"] = array.indptr.tolist()
    with open(filename, 'w') as outfile:
        json.dump(cluster_matrix.__dict__, outfile)
def JSONfile2csr(inputFile):
    """Load a JSON matrix file and rebuild its data as a scipy CSR matrix."""
    with open(inputFile) as json_data:
        cluster_matrix = json.load(json_data,
                                   object_hook=lambda d: Namespace(**d))
    array = cluster_matrix.data
    cluster_matrix.data = scipy.sparse.csr_matrix(
        (numpy.asarray(array.data), array.indices, array.indptr),
        shape=array.shape)
    return cluster_matrix
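
# A minimal, untested round-trip sketch (not part of the original commit)
# showing the JSON layout the two helpers above exchange; the file path and
# field values are illustrative assumptions.
def _roundtrip_example(path='/tmp/example_matrix.json'):
    mat = {"head_word": u"example",
           "class_labels": [u"true", u"false"],
           "column_labels": [u"feat_a", u"feat_b"],
           "contexts": [None, None],
           "documents_ids": [0, 1],
           "data": scipy.sparse.csr_matrix(numpy.eye(2))}
    csr2JSONfile(path, mat)    # flattens "data" into data/indices/indptr/shape
    return JSONfile2csr(path)  # Namespace whose .data is a csr_matrix again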
def create():
    """Convert a dense example matrix into the CSR JSON format."""
    inputFile = 'data/input_matrix.json'
    with open(inputFile) as json_data:
        input_matrix = json.load(json_data)
    input_matrix["data"] = scipy.sparse.csr_matrix(input_matrix["data"])
    csr2JSONfile("datas1.json", input_matrix)
def test():
    """Run the worker once on a local example matrix."""
    inputFile = 'data1.json'
    input_matrix = JSONfile2csr(inputFile)
    matrices_labels = get_group_class_labels(input_matrix.class_labels)
    submatrix = matrices_labels[1]
    option = {'method': 'MannWhitney',
              'tree_tmp': 'RFE_NaiveBayes ExtraTrees MannWhitney',
              'no_features_to_select': 100,
              "class_labels": submatrix["class_labels"],
              "head_word": submatrix["head_word"],
              "id": submatrix["head_word"]}
    start_time = time.time()
    w = FeatSelSubWorker()
    w.process(inputFile, option, 'res.json')
    print "Processed in %f" % (time.time() - start_time)


if __name__ == '__main__':
    nlp_ws.NLPService.main(FeatSelSubWorker)
    # test()
tox.ini

[tox]
envlist = pep8,docstyle
skipsdist = True