Skip to content
Snippets Groups Projects
Commit 7af2488f authored by Mateusz Gniewkowski's avatar Mateusz Gniewkowski
Browse files

Merge branch 'dev' into 'master'

added CI, refactored

See merge request !1
parents bf9ffbc3 e1fe275d
Branches
No related tags found
1 merge request: !1 added CI, refactored
Pipeline #1741 passed
image: clarinpl/python:2.7
cache:
paths:
- .tox
before_script:
- pip install tox==2.9.1
pep8:
script:
- tox -v -e pep8
build_image:
stage: build
image: 'docker:18.09.7'
only:
- master
services:
- 'docker:18.09.7-dind'
before_script:
- ''
script:
- docker build -t clarinpl/subfeatsel .
- echo $DOCKER_PASSWORD > pass.txt
- cat pass.txt | docker login --username $DOCKER_USERNAME --password-stdin
- rm pass.txt
- docker push clarinpl/subfeatsel
......@@ -20,19 +20,9 @@ WORKDIR /tmp
RUN wget http://prdownloads.sourceforge.net/weka/weka-3-8-3.zip && \
unzip weka-3-8-3.zip
WORKDIR /home/worker
COPY ./src ./src
COPY ./main.py .
COPY requirements.txt .
RUN pip install -r requirements.txt
# Change it to pypi
COPY module/ /home/worker/
RUN python -m easy_install /home/worker/ltcore*
# Change it to pypi
WORKDIR /tmp
RUN wget http://www.nlp.pwr.wroc.pl/download/lexcsd/ltlearn-0.2.0.tar.gz && \
tar -xvf ltlearn-0.2.0.tar.gz && \
cd ltlearn-0.2.0 && \
python setup.py install
\ No newline at end of file
CMD ["python","main.py"]
......@@ -2,12 +2,12 @@
tool = subfeatsel
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
rabbit_host = rabbitmq
rabbit_user = test
rabbit_password = test
[tool]
workers_number = 12
workers_number = 2
weka_jar=/tmp/weka-3-8-3/weka.jar
[logging]
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Implementation of SubFeatSel worker."""
import nlp_ws
from src.subfeatsel_worker import FeatSelSubWorker
if __name__ == "__main__":
nlp_ws.NLPService.main(FeatSelSubWorker)
......@@ -3,3 +3,5 @@ numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
sqlalchemy
ltcore
ltlearn
This diff is collapsed.
#!/usr/bin/python2
import json
import nlp_ws
from scipy import stats
import numpy,tempfile,scipy
from scipy.sparse import lil_matrix
import ltlearn.transformations.selection.weka.wekaattributeselection as was
from argparse import Namespace
from ltcore import SparseMatrix
import sys,shutil,time
from collections import namedtuple
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import base64
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
# Method names that FeatureSelection.select_features routes to
# scipy_feature_selection (two-sample tests from scipy.stats).
SCIPY_METHODS = ['MannWhitney', 'WilcoxonRankSum', 'KolmogorovSmirnov', 'KruskalWallis', 'TTest']
# Method names routed to weka_feature_selection; names containing 'Subset'
# use the 'GreedyStepwise' search method, the others use 'Ranker'.
WEKA_METHODS = ['InfoGainAttributeEval', 'CfsSubsetEval', 'ConsistencySubsetEval',
'GainRatioAttributeEval', 'ChiSquaredAttributeEval']
# Method names routed to ensamble_feature_selection (sklearn.ensemble models).
ENSAMBLE_METHODS =['ExtraTrees','RandomForest','GradientBoosting','AdaBoost']
# Method names routed to rfe_feature_selection (sklearn RFE around a model).
RFE_METHODS=['RFE_LogisticRegression','RFE_SVM','RFE_NaiveBayes']
# NOTE(review): stats_file and verbose are not referenced anywhere else in the
# visible source -- confirm whether they are still needed.
stats_file = 'descriptive_features.csv'
verbose=True
class FeatureSelection(object):
    """
    Rank the features that separate the two classes of a matrix.

    The method name given as ``attr_eval`` selects the backend: Weka attribute
    selection, a SciPy two-sample test, an sklearn ensemble classifier or
    sklearn recursive feature elimination (RFE).
    """

    # p-value below which the result of a SciPy test is reported.
    P_VALUE_THRESHOLD = 0.05
    # The ensemble ranking stops at the first importance below this value.
    MIN_IMPORTANCE = 0.0001
    # Number of features RFE keeps at rank 1 before ranking the remaining ones.
    RFE_FEATURES_TO_SELECT = 2

    def __init__(self, attr_eval, no_features_to_select, weka_jar, tmp_dir):
        """
        :param attr_eval: name of the selection method (see *_METHODS lists)
        :param no_features_to_select: upper bound on reported features
        :param weka_jar: path to weka.jar, used only by the Weka methods
        :param tmp_dir: working directory handed to the Weka wrapper
        """
        self.attr_eval = attr_eval
        self.no_feats = no_features_to_select
        self.weka_jar = weka_jar
        self.tmp_dir = tmp_dir
        # search_method is only meaningful (and only set) for Weka methods.
        if self.attr_eval in WEKA_METHODS:
            self.search_method = (
                'GreedyStepwise' if 'Subset' in self.attr_eval else 'Ranker')

    def select_features(self, matrix):
        """
        Select features differing classes from the matrix. Matrix has to have
        only two classes inside.

        Note: the chosen backend replaces ``matrix.data`` with its LIL form and
        overwrites ``matrix.head_word`` while splitting the matrix by class.

        :return list of dicts, one per selected feature, each holding the
            feature name plus "mean_in", "std_in", "mean_out", "std_out"
            (and "importance" or "stat"/"p-value" depending on the method)
        :raise ValueError: if ``attr_eval`` is not a known method name
        """
        if self.attr_eval in WEKA_METHODS:
            return self.weka_feature_selection(matrix)
        if self.attr_eval in SCIPY_METHODS:
            return self.scipy_feature_selection(matrix)
        if self.attr_eval in ENSAMBLE_METHODS:
            return self.ensamble_feature_selection(matrix)
        if self.attr_eval in RFE_METHODS:
            return self.rfe_feature_selection(matrix)
        # ValueError is an Exception subclass, so existing handlers still match.
        raise ValueError(
            'Unrecognized feature selection method: %s.' % self.attr_eval)

    @staticmethod
    def _class_samples(subclasses, feature):
        """Return the zero-padded value vectors of ``feature`` per class."""
        first, second = subclasses[0], subclasses[1]
        sample1 = complete_with_zeros(get_column(first, feature),
                                      first.data.shape[0])
        sample2 = complete_with_zeros(get_column(second, feature),
                                      second.data.shape[0])
        return sample1, sample2

    @staticmethod
    def _in_out_stats(subclasses, sample1, sample2):
        """Return (mean_in, std_in, mean_out, std_out) as numpy scalars.

        "in" is whichever sub-matrix carries the head word 'true'.
        """
        if subclasses[0].head_word == 'true':
            inside, outside = sample1, sample2
        else:
            inside, outside = sample2, sample1
        return (numpy.mean(inside), numpy.std(inside),
                numpy.mean(outside), numpy.std(outside))

    def _feature_record(self, subclasses, feature, importance=None):
        """Build the result dict of one feature (optionally with importance)."""
        sample1, sample2 = self._class_samples(subclasses, feature)
        mean_in, std_in, mean_out, std_out = self._in_out_stats(
            subclasses, sample1, sample2)
        # Feature names are emitted as UTF-8 byte strings (Python 2 str).
        record = {"feature": feature.encode('utf-8'),
                  "mean_in": mean_in.item(), "std_in": std_in.item(),
                  "mean_out": mean_out.item(), "std_out": std_out.item()}
        if importance is not None:
            record["importance"] = importance
        return record

    def rfe_feature_selection(self, matrix):
        """
        Select features with recursive feature elimination (RFE) from sklearn.
        """
        print(matrix.head_word)
        if self.attr_eval == 'RFE_LogisticRegression':
            model = LogisticRegression()
        elif self.attr_eval == 'RFE_SVM':
            model = SVC(kernel='linear')
        elif self.attr_eval == 'RFE_NaiveBayes':
            model = MultinomialNB()
        else:
            raise ValueError('Unrecognized RFE method: %s.' % self.attr_eval)
        # Keyword form: newer sklearn no longer accepts this positionally.
        rfe = RFE(model, n_features_to_select=self.RFE_FEATURES_TO_SELECT)
        print(type(matrix.data))
        print("start")
        rfe = rfe.fit(matrix.data, matrix.class_labels)
        print("finished")
        # Map ranking 1, 2, 3, ... onto a score that decreases from 1.0 and
        # turns negative once the rank exceeds no_feats + 1.
        importances = (self.no_feats - (rfe.ranking_ - 1)) / (self.no_feats * 1.0)
        indices = numpy.argsort(rfe.ranking_)
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        results = []
        for idx in indices[:len(matrix.column_labels)]:
            if importances[idx] < 0:
                break
            results.append(self._feature_record(
                subclasses, matrix.column_labels[idx], importances[idx]))
        return results

    def ensamble_feature_selection(self, matrix):
        """
        Select features with ENSAMBLE methods from sklearn.
        """
        print(matrix.head_word)
        # sklearn rejects an integer max_features larger than the number of
        # columns, so clamp the requested count to what the matrix offers.
        max_features = self.no_feats
        n_columns = matrix.data.shape[1]
        if isinstance(max_features, int) and max_features > n_columns:
            max_features = n_columns
        if self.attr_eval == 'ExtraTrees':
            forest = ExtraTreesClassifier(n_estimators=250,
                                          max_features=max_features,
                                          random_state=0, n_jobs=2)
        elif self.attr_eval == 'RandomForest':
            forest = RandomForestClassifier(n_estimators=250,
                                            max_features=max_features,
                                            random_state=0, n_jobs=2)
        elif self.attr_eval == 'GradientBoosting':
            forest = GradientBoostingClassifier(n_estimators=250,
                                                max_features=max_features,
                                                random_state=0)
        elif self.attr_eval == 'AdaBoost':
            forest = AdaBoostClassifier()
        else:
            raise ValueError(
                'Unrecognized ensemble method: %s.' % self.attr_eval)
        forest.fit(matrix.data, matrix.class_labels)
        importances = forest.feature_importances_
        # Most important feature first.
        indices = numpy.argsort(importances)[::-1]
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        results = []
        for idx in indices[:len(matrix.column_labels)]:
            if importances[idx] < self.MIN_IMPORTANCE:
                break
            if len(results) >= self.no_feats:
                break
            results.append(self._feature_record(
                subclasses, matrix.column_labels[idx], importances[idx]))
        return results

    def scipy_feature_selection(self, matrix):
        """
        Select features with Scipy.

        Every column is tested between the two classes; columns whose p-value
        is below P_VALUE_THRESHOLD are returned sorted by p-value, at most
        ``no_feats`` of them.
        """
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        tests = {
            'MannWhitney': stats.mannwhitneyu,
            'WilcoxonRankSum': stats.ranksums,
            'KolmogorovSmirnov': stats.ks_2samp,
            'KruskalWallis': stats.kruskal,
            'TTest': stats.ttest_ind,
        }
        test = tests.get(self.attr_eval)
        results = []
        for col_name in subclasses[0].column_labels:
            sample1, sample2 = self._class_samples(subclasses, col_name)
            if test is None:
                print('No correlation test named %s' % self.attr_eval)
                continue
            try:
                res = test(sample1, sample2)
            except ValueError:
                # The test rejected this column's samples; skip the column.
                print('%s error for %s %s %s' % (
                    self.attr_eval, col_name, set(sample1), set(sample2)))
                continue
            if res[1] < self.P_VALUE_THRESHOLD:  # res[1] --> p-value
                mean_in, std_in, mean_out, std_out = self._in_out_stats(
                    subclasses, sample1, sample2)
                results.append((col_name, res[0], res[1],
                                mean_in, std_in, mean_out, std_out))
        ranked = sorted(results, key=byThirdTerm)[:self.no_feats]
        return [{"feature": r[0].encode('utf-8'), "stat": r[1],
                 "p-value": r[2], "mean_in": r[3], "std_in": r[4],
                 "mean_out": r[5], "std_out": r[6]} for r in ranked]

    def weka_feature_selection(self, matrix):
        """
        Select features with Weka.
        """
        print('Selecting features for %s' % matrix.head_word)
        tmp_mat = SparseMatrix(matrix.head_word, matrix.class_labels,
                               matrix.column_labels, matrix.contexts,
                               matrix.data.tolil(),
                               boundaries=matrix.boundaries,
                               possible_classes=matrix.possible_classes)
        w = was.AttributeSelection()
        w._weka_jar = self.weka_jar
        w._work_dir = self.tmp_dir
        w.set_attribute_evaluator(self.attr_eval)
        w.set_search_method(self.search_method)
        # The feature count option is only passed for non-'Subset' evaluators.
        if 'Subset' not in self.attr_eval:
            if self.no_feats is not None:
                w.set_search_method_options('-N %s' % self.no_feats)
            else:
                print('WARNING: number of features to select is not given '
                      'in config file')
        w.make(tmp_mat)
        # The last selected entry is dropped, as in the original
        # implementation (presumably the class attribute -- TODO confirm).
        feats = w._selected_features[:-1]
        matrix.data = matrix.data.tolil()
        subclasses = self.split_matrix_by_classes(matrix)
        return [self._feature_record(subclasses, feature) for feature in feats]

    def split_matrix_by_classes(self, matrix):
        """
        Split the matrix into one sub-matrix per class label.

        A row labelled 'a:b' lands in both the 'a' and the 'b' sub-matrix.
        ``matrix.data`` must already be a LIL matrix. As a side effect
        ``matrix.head_word`` is overwritten with each class name in turn,
        because cut_matrix_by_indices copies the head word from the source.
        """
        cls_dict = {}
        for i, labels in enumerate(matrix.class_labels):
            for c in labels.split(':'):
                cls_dict.setdefault(c, []).append(i)
        oneclass_matrices = []
        for cls in cls_dict:
            matrix.head_word = cls
            oneclass_matrices.append(
                cut_matrix_by_indices(matrix, cls_dict[cls]))
        return oneclass_matrices
def complete_with_zeros(sample, length):
    """
    Pad ``sample`` with trailing zeros until it holds ``length`` entries.

    Returns a new numpy array; ``sample`` itself is left untouched.
    """
    missing = length - len(sample)
    padding = numpy.zeros(missing)
    return numpy.concatenate((numpy.asanyarray(sample), padding), axis=0)
def cut_matrix_by_indices(matrix, indices):
    """
    Build a SparseMatrix holding only the rows of ``matrix`` at ``indices``.

    ``matrix.data`` must be a LIL matrix: its per-row ``data`` and ``rows``
    lists are re-used (not copied) by the new matrix. The result's data is
    converted to CSC before it is returned.
    """
    picked = lil_matrix((len(indices), matrix.data.shape[1]))
    doc_ids, labels, ctxs = [], [], []
    for target_row, source_row in enumerate(indices):
        doc_ids.append(matrix.documents_ids[source_row])
        labels.append(matrix.class_labels[source_row])
        ctxs.append(matrix.contexts[source_row])
        picked.data[target_row] = matrix.data.data[source_row]
        picked.rows[target_row] = matrix.data.rows[source_row]
    distinct_labels = list(set(labels))
    subset = SparseMatrix(matrix.head_word, labels, matrix.column_labels,
                          ctxs, picked, matrix.attributes, matrix.boundaries,
                          distinct_labels, doc_ids)
    subset.data = subset.data.tocsc()
    return subset
def get_column(matrix, col_name):
    """
    Return the stored values of the column labelled ``col_name``.

    Assumes ``matrix.data`` is column-compressed (CSC), as produced by
    cut_matrix_by_indices. An unknown label yields an empty list.
    """
    compressed = matrix.data
    for position, label in enumerate(matrix.column_labels):
        if label != col_name:
            continue
        start = compressed.indptr[position]
        stop = compressed.indptr[position + 1]
        return compressed.data[start:stop]
    return []
def get_size(obj, seen=None):
    """Return the ``sys.getsizeof`` total of ``obj`` and what it contains.

    ``seen`` collects the ids already counted; an object whose id is in it
    contributes 0, which also keeps self-referential structures finite.
    """
    total = sys.getsizeof(obj)
    visited = set() if seen is None else seen
    marker = id(obj)
    if marker in visited:
        return 0
    # Register the object before descending so cycles terminate.
    visited.add(marker)
    if isinstance(obj, dict):
        total += sum(get_size(value, visited) for value in obj.values())
        total += sum(get_size(key, visited) for key in obj.keys())
    elif hasattr(obj, '__dict__'):
        total += get_size(obj.__dict__, visited)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        total += sum(get_size(item, visited) for item in obj)
    return total
class FeatSelSubWorker(nlp_ws.NLPWorker):
    """Worker that runs feature selection on a JSON-serialised matrix."""

    @classmethod
    def static_init(cls, config):
        """Store the Weka jar path found under ``config['tool']['weka_jar']``."""
        cls.weka_jar = config['tool']['weka_jar']

    def init(self):
        """Per-instance initialisation hook; this worker needs none."""
        return

    def process(self, input_path, task_options, output_path):
        """Select features for one task and write the result as JSON.

        :param input_path: JSON file readable by JSONfile2csr
        :param task_options: mapping with the required keys "class_labels",
            "head_word" and "id", and the optional keys "method"
            (default 'MannWhitney') and "no_features_to_select" (default 100)
        :param output_path: file that receives
            {"result": ..., "id": ..., "method": ...}
        """
        attr_eval = task_options.get('method', 'MannWhitney')
        no_features_to_select = task_options.get('no_features_to_select', 100)
        temp_folder = tempfile.mkdtemp()
        try:
            fs = FeatureSelection(attr_eval, no_features_to_select,
                                  self.weka_jar, temp_folder)
            cluster_matrix = JSONfile2csr(input_path)
            cluster_matrix.possible_classes = [u'true', u'false']
            cluster_matrix.class_labels = task_options["class_labels"]
            cluster_matrix.head_word = task_options["head_word"]
            selected = fs.select_features(cluster_matrix)
        finally:
            # Remove the scratch directory even when loading or selection
            # fails, so failed tasks do not leave temp directories behind.
            shutil.rmtree(temp_folder)
        res = {"result": selected, "id": task_options["id"],
               "method": attr_eval}
        with open(output_path, 'w') as result_ofs:
            json.dump(res, result_ofs)
def byThirdTerm(feat_tuple):
    """Sort key: the element at index 2 of a result tuple.

    scipy_feature_selection stores the p-value at that position.
    """
    third = feat_tuple[2]
    return third
def get_group_class_lables(class_labels):
    """
    Build one true/false label list per class found in ``class_labels``.

    Each input label may name several classes separated by ':'. For every
    distinct class a dict is produced whose "class_labels" list says, row by
    row, whether that row belongs to the class.
    e.g. if the input class_labels look like this:
    class1:class2:class3
    class1:class4
    class2:class3
    The entry for class2 will have class_labels looking like this:
    true
    false
    true

    Entries are ordered by class name, so the result does not depend on set
    iteration order.

    @type class_labels: list of str
    @rtype: list of dict with keys "class_labels" and "head_word"
    """
    # Split every label once instead of once per class.
    split_labels = [line.split(':') for line in class_labels]
    tags = sorted(set(tag for parts in split_labels for tag in parts))
    res = []
    for tag in tags:
        flags = ['true' if tag in parts else 'false' for parts in split_labels]
        res.append({"class_labels": flags, "head_word": tag})
    return res
def sparse2JSON(filename, cluster_matrix):
    """Write ``cluster_matrix`` to ``filename`` as JSON in LIL layout.

    ``cluster_matrix`` is a dict whose "data" entry is anything lil_matrix
    accepts (dense array or sparse matrix). In the output, "data" becomes a
    dict with the keys "data", "rows", "shape" and "dtype"; all other entries
    are written unchanged. The caller's dict is not modified.
    """
    # The original read an undefined name here and always raised NameError.
    array = lil_matrix(cluster_matrix["data"])
    print(array.shape)
    payload = dict(cluster_matrix)
    payload["data"] = {
        "data": array.data.tolist(),
        "rows": array.rows.tolist(),
        "shape": array.shape,
        "dtype": array.dtype.str,
    }
    with open(filename, 'w') as outfile:
        json.dump(payload, outfile)
def csr2JSONfile(filename, cluster_matrix):
    """Write a dict holding a CSR matrix under "data" to ``filename`` as JSON.

    In the output, "data" becomes a dict with the keys "data", "indices",
    "shape" and "indptr" (the layout JSONfile2csr reads back); all other
    entries are written unchanged. A shallow copy is serialised, so the
    caller's dict keeps its matrix.
    """
    array = cluster_matrix["data"]
    payload = dict(cluster_matrix)
    payload["data"] = {
        "data": array.data.tolist(),
        "indices": array.indices.tolist(),
        "shape": array.shape,
        "indptr": array.indptr.tolist(),
    }
    with open(filename, 'w') as outfile:
        json.dump(payload, outfile)
def csrObject2JSONfile(filename, cluster_matrix):
    """Write an object holding a CSR matrix in ``.data`` to ``filename``.

    The object's ``__dict__`` is serialised as JSON with "data" replaced by a
    dict holding "data", "indices", "shape" and "indptr". A copy of the
    attribute dict is serialised, so the object keeps its matrix.
    """
    array = cluster_matrix.data
    payload = dict(cluster_matrix.__dict__)
    payload["data"] = {
        "data": array.data.tolist(),
        "indices": array.indices.tolist(),
        "shape": array.shape,
        "indptr": array.indptr.tolist(),
    }
    with open(filename, 'w') as outfile:
        json.dump(payload, outfile)
def JSONfile2csr(inputFile):
    """Load a matrix description from JSON and rebuild its CSR data.

    Every JSON object becomes an argparse Namespace. The "data" object must
    provide "data", "indices", "indptr" and "shape"; it is replaced by the
    corresponding scipy CSR matrix before the Namespace is returned.
    """
    def _as_namespace(mapping):
        return Namespace(**mapping)

    with open(inputFile) as handle:
        loaded = json.load(handle, object_hook=_as_namespace)
    raw = loaded.data
    values = numpy.asarray(raw.data)
    loaded.data = scipy.sparse.csr_matrix(
        (values, raw.indices, raw.indptr), shape=raw.shape)
    return loaded
def create():
    """Convert 'data/input_matrix.json' into the CSR JSON layout.

    The "data" entry of the loaded dict is turned into a CSR matrix and the
    whole dict is written to "datas1.json" through csr2JSONfile.
    NOTE(review): test() reads 'data1.json', not "datas1.json" -- confirm the
    output name is intended.
    """
    source_path = 'data/input_matrix.json'
    with open(source_path) as handle:
        matrix = json.load(handle)
    dense = matrix["data"]
    matrix["data"] = scipy.sparse.csr_matrix(dense)
    csr2JSONfile("datas1.json", matrix)
def test():
    """Manual smoke run: process 'data1.json' locally and time it.

    Uses the second entry returned by get_group_class_lables as the class to
    select features for and writes the outcome to 'res.json'.
    """
    input_file = 'data1.json'
    loaded = JSONfile2csr(input_file)
    per_class = get_group_class_lables(loaded.class_labels)
    chosen = per_class[1]
    options = {
        'method': 'MannWhitney',
        'tree_tmp': 'RFE_NaiveBayes ExtraTrees MannWhitney',
        'no_features_to_select': 100,
        "class_labels": chosen["class_labels"],
        "head_word": chosen["head_word"],
        "id": chosen["head_word"],
    }
    started = time.time()
    worker = FeatSelSubWorker()
    worker.process(input_file, options, 'res.json')
    elapsed = time.time() - started
    print("Processed in %f" % elapsed)
if __name__ == '__main__':
nlp_ws.NLPService.main(FeatSelSubWorker)
#test()
tox.ini 0 → 100644
[tox]
envlist = pep8,docstyle
skipsdist = True
[testenv:pep8]
deps =
flake8
pep8-naming
basepython = python2.7
commands =
flake8 {posargs}
[flake8]
# W504 skipped because it is overeager and unnecessary
ignore = W504
show-source = True
exclude = .git,.venv,.tox,dist,doc,*egg,build,venv
import-order-style = pep8
max-line-length = 80
[pydocstyle]
# D104 Missing docstring in public package
# D203 1 blank line required before class docstring
# D213 Multi-line docstring summary should start at the second line
# D214 Section is over-indented
# D215 Section underline is over-indented
# D401 First line should be in imperative mood; try rephrasing
# D405 Section name should be properly capitalized
# D406 Section name should end with a newline
# D407 Missing dashed underline after section
# D408 Section underline should be in the line following the section’s name
# D409 Section underline should match the length of its name
# D410 Missing blank line after section
# D411 Missing blank line before section
ignore = D104,D203,D213,D214,D215,D401,D405,D406,D407,D408,D409,D410,D411
match-dir = ^(?!\.tox|venv).*
match = ^(?!setup).*\.py
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment