Commit 86b1a4db authored by Tomasz Walkowiak

Initial commit
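# Multi-stage build: the clarinpl/builder stage compiles corpus2 (with
# Poliqarp) and SuperMatrix; their installed artifacts are then copied into
# the Python 2.7 runtime image together with R/stylo and the worker code.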
FROM clarinpl/builder AS builder
WORKDIR /tmp
RUN apt-get update && apt-get install -y libmysql++-dev
# Build corpus2 with Poliqarp enabled (required by SuperMatrix)
RUN git clone https://gitlab.clarin-pl.eu/analysers/corpus2.git && \
cd corpus2 && \
mkdir bin && \
cd bin && \
cmake -D CORPUS2_BUILD_POLIQARP:BOOL=True .. && \
make -j && \
make install DESTDIR="/install" && \
ldconfig
RUN apt-get install -y liblog4cxx-dev
## Supermatrix
RUN git clone https://gitlab.clarin-pl.eu/libraries/supermatrix && \
mkdir supermatrix/build && \
cd supermatrix/build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j6 && \
make install DESTDIR="/install" && \
ldconfig
FROM clarinpl/python:2.7
# STYLO
RUN apt-get update && apt-get install -y r-base
RUN R -e "install.packages('rjson',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
R -e "install.packages('versions',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
R -e "library('versions'); install.versions('stylo','0.6.4')"
# Python dependencies: LexCSD, then pip requirements
RUN git clone http://nlp.pwr.wroc.pl/lexcsd.git
RUN cd lexcsd && \
./install.sh
COPY requirements.txt .
RUN pip install -r requirements.txt
# SuperMatrix binaries and runtime dependencies from the builder stage
COPY --from=builder /install/usr/ /usr/
COPY --from=builder /tmp/supermatrix/build /home/supermatrix
RUN cp /home/supermatrix/parallel/mpi_handler/*.so /usr/lib/ && \
ldconfig
RUN apt-get install -y libboost-filesystem-dev liblog4cxx-dev libboost-regex-dev libmysql++-dev libopenmpi-dev libboost-program-options-dev
[service]
tool = featfilt2_test
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 6
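# Paths to the SuperMatrix binaries copied from the builder stage
# (see COPY --from=builder in the Dockerfile)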
SM_WEIGHTER_BIN = /home/supermatrix/tools/matrix_weighter/weighter
SM_SIMILARITY_BIN = /home/supermatrix/parallel/similarity/parallelSimilarity
SM_CLUTOCONV_BIN = /home/supermatrix/tools/ClutoConv/ClutoConv
STYLO_PATH = twstylo.r
[logging]
port = 9991
local_log_level = INFO
version: '3'
services:
  featfilt:
    container_name: clarin_featfilt
    build: ./
    working_dir: /home/worker
    entrypoint:
      - python2
      - featfilt_worker.py
    volumes:
      - /samba:/samba
      - ./module:/home/worker
      - ./config.ini:/home/worker/config.ini
    #restart: always
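# Start the worker with: docker-compose up --build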
#!/usr/bin/python2
import json, re, numpy, glob, sys, os, math
import logging
import nlp_ws
from sklearn.externals import joblib  # only used by the commented-out .pkl dumps below
from filtrowanie import MatrixProcessor
from loadfextor import Dictionary, readFeatures

reload(sys)
sys.setdefaultencoding('utf8')

_log = logging.getLogger(__name__)


class FeatFiltWorker(nlp_ws.NLPWorker):

    @classmethod
    def static_init(cls, config):
        # Keep the [tool] section of config.ini (worker count, binary paths).
        cls.configtool = config['tool']

    def init(self):
        #self._docex = DocExtractor.from_config_dict(self._desconfig)
        return
    def toJSON(self, arr, rowlabels, collabels, fname):
        res = {}
        res["arr"] = arr.tolist()
        res["rowlabels"] = rowlabels
        if collabels is not None:
            res["collabels"] = collabels
        with open(fname, 'w') as fp:
            json.dump(res, fp)
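
    # Builds one Dictionary per feature group listed in taskOptions["keep_n"];
    # e.g. {"keep_n": {"base": 1000, "orth": 500}} keeps the 1000 most frequent
    # "base" features and the 500 most frequent "orth" features. Both groups
    # default to 1000 when not given.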
    def parseFilterOptions(self, taskOptions, inputDir):
        result = []
        if "keep_n" not in taskOptions:
            taskOptions["keep_n"] = {"base": 1000, "orth": 1000}
        if "base" not in taskOptions["keep_n"]:
            taskOptions["keep_n"]["base"] = 1000
        if "orth" not in taskOptions["keep_n"]:
            taskOptions["keep_n"]["orth"] = 1000
        for dictName in taskOptions["keep_n"]:
            keep_n = taskOptions["keep_n"][dictName]
            d = Dictionary(dictName, inputDir).filter(keep_n)
            _log.info("Filter %s keep_n=%s", dictName, keep_n)
            result.append(d)
        return result
    def removeNaN(self, arr, rowlabels, collabels):
        # Drop rows containing NaN and keep the row labels aligned with the
        # filtered array.
        max_arr = numpy.amax(arr, axis=1)
        arr = arr[~numpy.isnan(arr).any(axis=1)]
        rows = []
        for i, max_val in enumerate(max_arr):
            if math.isnan(max_val):
                _log.info("dropping NaN row: %s", rowlabels[i])
            else:
                rows.append(rowlabels[i])
        return arr, rows, collabels
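
    # taskOptions comes from the request JSON; the recognised keys mirror the
    # defaults below, e.g.
    #   {"keep_n": {"base": 1000, "orth": 1000},
    #    "filter": "min_df-2", "weighting": "all:sm-mi", "similarity": "jaccard"}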
    def process(self, inputFile, taskOptions, outputFile):
        if not taskOptions:
            taskOptions = {'filter': 'min_df-2', 'weighting': 'all:sm-mi', 'similarity': 'jaccard'}
        # Reuse an already weighted matrix if a previous run left one behind;
        # in that case filtering and weighting are skipped.
        if os.path.exists(inputFile + "/weighted.json"):
            with open(inputFile + "/weighted.json") as json_ifs:
                jsonVal = json.load(json_ifs)
            rowlabels = jsonVal["rowlabels"]
            collabels = jsonVal["collabels"]
            arr = numpy.asarray(jsonVal["arr"])
            if 'filter' in taskOptions:
                del taskOptions['filter']
            if 'weighting' in taskOptions:
                del taskOptions['weighting']
        else:
            # Default path: build the matrix from the raw feature files.
            filters = self.parseFilterOptions(taskOptions, inputFile)
            arr, rowlabels, collabels = readFeatures(inputFile, filters).read()
        if not os.path.exists(outputFile):
            os.mkdir(outputFile)
        self.toJSON(arr, rowlabels, collabels, outputFile + '/data.json')
        #TO REMOVE, BACKWARD COMPATIBILITY
        #joblib.dump(collabels,outputFile+"/columns.pkl");
        #joblib.dump(rowlabels,outputFile+"/rowlabels.pkl");
        #joblib.dump(arr,outputFile+"/data.pkl");
        cluto_path = os.path.join(outputFile, 'matrix.txt')
        matrix_processor = MatrixProcessor(taskOptions, self.configtool)
        if matrix_processor.filter is not None:
            arr, rowlabels, collabels = matrix_processor.filter_matrix(arr, rowlabels, collabels)
        if matrix_processor.weighting_operations is not None:
            arr, rowlabels, collabels = matrix_processor.weight_matrix(arr, rowlabels, collabels)
            self.toJSON(arr, rowlabels, collabels, outputFile + '/weighted.json')
        if matrix_processor.transform_operations is not None:
            arr, rowlabels, collabels = matrix_processor.transform_matrix(arr, rowlabels, collabels)
            self.toJSON(arr, rowlabels, collabels, outputFile + '/transform.json')
        if matrix_processor.similarity is not None:
            arr, rowlabels, collabels = self.removeNaN(arr, rowlabels, collabels)
            dist = matrix_processor.make_similarity_matrix(arr, rowlabels, collabels, inputFile, cluto_path)
            arr = None
            self.toJSON(dist, rowlabels, None, outputFile + '/distance.json')
            self.matrix2json(rowlabels, cluto_path, outputFile + '/similarity.json')
            self.matrix2json2(rowlabels, cluto_path, outputFile + '/simsparse.json')
    # Converts a CLUTO similarity matrix to dense JSON.
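    # The matrix file is read as CLUTO-style sparse output: a header line whose
    # first field is the row count, then one line per row consisting of 1-based
    # "column value" pairs. A hypothetical 3x3 matrix with 5 non-zeros:
    #   3 3 5
    #   1 1.0 2 0.5
    #   2 1.0
    #   1 0.5 3 1.0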
    def matrix2json(self, rowlabels, inpath, outpath):
        with open(inpath) as f:
            content = f.readlines()
        number = int(content[0].split(" ")[0])
        matrix = numpy.zeros((number, number), dtype=numpy.dtype(float))
        l_n = 0
        regex = r"\d+\s[0-9]*\.?[0-9]+"
        for line in content[1:]:
            for node in re.findall(regex, line):
                node = node.split()
                i = l_n
                j = int(node[0]) - 1
                v = float(node[1])
                matrix[i, j] = v
            l_n += 1
        res = {}
        res["arr"] = matrix.tolist()
        res["rowlabels"] = rowlabels
        with open(outpath, 'w') as fp:
            json.dump(res, fp)
    # Converts a CLUTO similarity matrix to sparse JSON (non-zero cells only).
def matrix2json2(self,rowlabels, inpath, outpath):
with open(inpath) as f:
content = f.readlines()
number = int(content[0].split(" ")[0])
#print number
matrix = []
l_n = 0
regex = r"\d+\s[0-9]*\.?[0-9]+"
for line in content[1:]:
arr = re.findall(regex, line)
for node in arr:
node = node.split()
i = l_n
j = int(node[0]) - 1
v = float(node[1])
if v != 0:
matrix.append({'r': i, 'c': j, 'v': v})
l_n += 1
res = {}
res["arr"] = matrix
res["rowlabels"] = rowlabels
with open(outpath, 'w') as fp:
json.dump(res, fp)
if __name__ == '__main__':
nlp_ws.NLPService.main(FeatFiltWorker)
#!/usr/bin/python2
import json, sys, os, glob, time
import logging, operator
import numpy as _np

reload(sys)
sys.setdefaultencoding('utf8')

_log = logging.getLogger(__name__)
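
# Each input file is a JSON dict of feature groups mapping feature names to
# values, e.g. (hypothetical) {"base": {"kot": 3.0}, "orth": {"Kot": 1.0}};
# group and feature names are joined into column labels such as "base:kot".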
def read_json(inputDir):
# File names are in rows, feature names are in columns.
# Column labels are unknown until all files have been read.
colnames = set()
rowvals = []
files = []
start_time = time.time()
for filename in glob.glob(inputDir + '/*'):
files.append(filename.decode('utf8'))
rowlabels = tuple(files)
for filename in rowlabels:
with open(filename) as json_ifs:
featdict = json.load(json_ifs)
flatdict = {
':'.join((fetname, fieldname)): fieldval
for fetname, fields in featdict.iteritems()
for fieldname, fieldval in fields.iteritems()
}
colnames.update(flatdict)
rowvals.append(flatdict)
# Any ordering will do, so use alphabetical for consistency.
collabels = sorted(colnames)
# Indexes of column names must be known to efficiently insert non-zero
# values.
colname2colnum = {name: num for num, name in enumerate(collabels)}
# The array; R does not have unsigned integers, so it needs to store
# signed.
arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
for rownum, flatdict in enumerate(rowvals):
for colname, colval in flatdict.iteritems():
arr[rownum, colname2colnum[colname]] = colval
collabels = [colname.replace(';','') for colname in tuple(collabels)]
rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
return arr, rowlabels, collabels
class readFeatures:
    def __init__(self, inputDir, dictionaryList):
        self.inputDir = inputDir
        self.dictionaryList = dictionaryList
        # Map each feature group name to its (filtered) Dictionary.
        self.dictionaryGroup = {}
        if dictionaryList is not None:
            for el in dictionaryList:
                self.dictionaryGroup[el.getGroupName()] = el
    def filterByDictionary(self, featdict, dictionary):
        # Keep only the features present in the group's filtered dictionary.
        return {k: v for k, v in featdict.iteritems() if k in dictionary.dictionary}

    def filterByDictionaries(self, featdict):
        for key in featdict:
            if key in self.dictionaryGroup:
                featdict[key] = self.filterByDictionary(featdict[key], self.dictionaryGroup[key])
    def readFromDictOl(self, collabels):
        # Older variant of readFromDict: flattens every file up front, then
        # fills the array from the flattened dicts.
        start_time = time.time()
        files = []
        for filename in glob.glob(self.inputDir + '/*'):
            files.append(filename.decode('utf8'))
        rowlabels = tuple(files)
        rowvals = []
        for filename in rowlabels:
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            rowvals.append({
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            })
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
        for rownum, flatdict in enumerate(rowvals):
            for colname, colval in flatdict.iteritems():
                colname = colname.replace(';', '')
                if colname in colname2colnum:
                    arr[rownum, colname2colnum[colname]] = colval
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels
    def readFromDict(self, collabels):
        # File names are in rows, feature names are in columns; only the
        # columns already present in collabels are filled in.
        start_time = time.time()
        files = []
        for filename in glob.glob(self.inputDir + '/*'):
            files.append(filename.decode('utf8'))
        rowlabels = tuple(files)
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        for rownum, filename in enumerate(rowlabels):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            for colname, colval in flatdict.iteritems():
                colname = colname.replace(';', '')
                if colname in colname2colnum:
                    arr[rownum, colname2colnum[colname]] = colval
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels
    def read(self):
        # File names are in rows, feature names are in columns.
        # Column labels are unknown until all files have been read.
        start_time = time.time()
        colnames = set()
        rowvals = []
        rowlabels = []
        for filename in glob.glob(self.inputDir + '/*'):
            filename = filename.decode('utf8')
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
                if self.dictionaryList is not None:
                    self.filterByDictionaries(featdict)
                flatdict = {
                    ':'.join((fetname, fieldname)): fieldval
                    for fetname, fields in featdict.iteritems()
                    for fieldname, fieldval in fields.iteritems()
                }
                # Skip files left empty after filtering; collecting the labels
                # here keeps them aligned with the rows of the array.
                if len(flatdict) < 1:
                    continue
                colnames.update(flatdict)
                rowvals.append(flatdict)
                rowlabels.append(filename)
        # Any ordering will do, so use alphabetical for consistency.
        collabels = sorted(colnames)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowvals), len(collabels)), _np.float)
        for rownum, flatdict in enumerate(rowvals):
            for colname, colval in flatdict.iteritems():
                arr[rownum, colname2colnum[colname]] = colval
        collabels = [colname.replace(';', '') for colname in collabels]
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels, collabels
def operatorDf(a, b):
    # Document-frequency accumulator: b is a raw feature count from one file,
    # a is the number of files seen so far that contain the feature.
    if b <= 0:
        return a
    if a <= 0:
        return 1
    return a + 1
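
# A Dictionary accumulates, for one feature group (e.g. "base" or "orth"),
# the total count and the document frequency of every feature across all
# input files; filter(keep_n) then keeps the keep_n most frequent features.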
class Dictionary():

    def combine_dicts1(self, a, b, op=operator.add):
        # Functional variant of combine_dicts (builds a new dict; not used by
        # load_data below).
        return dict(a.items() + b.items() +
                    [(k, op(a[k], b[k])) for k in set(b) & set(a)])

    def combine_dicts(self, a, b, op=operator.add):
        # Merge b into a in place, combining values of shared keys with op.
        for el in b:
            if el in a:
                a[el] = op(a[el], b[el])
            else:
                a[el] = op(0, b[el])
        return a

    def __init__(self, groupName, input_dir):
        self.groupName = groupName
        self.inputDir = input_dir
        self.counts = dict()
        self.df = dict()
    def filter(self, keep_n=1000):
        self.load_data()
        start_time = time.time()
        if len(self.counts) > keep_n:
            # Keep only the keep_n most frequent features of this group.
            self.dictionary = sorted(self.counts, key=self.counts.get, reverse=True)[:keep_n]
        else:
            self.dictionary = self.df.keys()
        _log.info(" filtered dictionary in %s seconds ---" % (time.time() - start_time))
        return self

    def getGroupName(self):
        return self.groupName
    def load_data(self):
        start_time = time.time()
        self.counts = dict()
        self.df = dict()
        self.size = 0
        for filename in glob.glob(self.inputDir + '/*'):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            groupDict = featdict.get(self.groupName, {})
            # Accumulate raw counts and document frequencies for this group.
            self.counts = self.combine_dicts(self.counts, groupDict)
            self.df = self.combine_dicts(self.df, groupDict, operatorDf)
            self.size = self.size + 1
        _log.info(" loaded in %s seconds ---" % (time.time() - start_time))
nlp-ws
numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
sqlalchemy
rpy2==2.7.2
gensim