Commit 86b1a4db authored by Tomasz Walkowiak

Initial commit

Dockerfile
FROM clarinpl/builder AS builder
WORKDIR /tmp
RUN apt-get update && apt-get install -y libmysql++-dev
# corpus2 with Poliqarp support (required by SuperMatrix)
RUN git clone https://gitlab.clarin-pl.eu/analysers/corpus2.git && \
cd corpus2 && \
mkdir bin && \
cd bin && \
cmake -D CORPUS2_BUILD_POLIQARP:BOOL=True .. && \
make -j && \
make install DESTDIR="/install" && \
ldconfig
RUN apt-get install -y liblog4cxx-dev
## Supermatrix
RUN git clone https://gitlab.clarin-pl.eu/libraries/supermatrix && \
mkdir supermatrix/build && \
cd supermatrix/build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j6 && \
make install DESTDIR="/install" && \
ldconfig
FROM clarinpl/python:2.7
# STYLO
RUN apt-get update && apt-get install -y r-base
RUN R -e "install.packages('rjson',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
R -e "install.packages('versions',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
R -e "library('versions'); install.versions('stylo','0.6.4')"
#python
RUN git clone http://nlp.pwr.wroc.pl/lexcsd.git
RUN cd lexcsd && \
./install.sh
COPY requirements.txt .
RUN pip install -r requirements.txt
# SuperMatrix dependencies
COPY --from=builder /install/usr/ /usr/
COPY --from=builder /tmp/supermatrix/build /home/supermatrix
RUN cp /home/supermatrix/parallel/mpi_handler/*.so /usr/lib/ && \
ldconfig
RUN apt-get install -y libboost-filesystem-dev liblog4cxx-dev libboost-regex-dev libmysql++-dev libopenmpi-dev libboost-program-options-dev
config.ini
[service]
tool = featfilt2_test
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 6
SM_WEIGHTER_BIN = /home/supermatrix/tools/matrix_weighter/weighter
SM_SIMILARITY_BIN = /home/supermatrix/parallel/similarity/parallelSimilarity
SM_CLUTOCONV_BIN = /home/supermatrix/tools/ClutoConv/ClutoConv
STYLO_PATH = twstylo.r
[logging]
port = 9991
local_log_level = INFO
docker-compose.yml
version: '3'
services:
featfilt:
container_name: clarin_featfilt
build: ./
working_dir: /home/worker
entrypoint:
- python2
- featfilt_worker.py
volumes:
- /samba:/samba
- ./module:/home/worker
- ./config.ini:/home/worker/config.ini
#restart: always
featfilt_worker.py
#!/usr/bin/python2
import json
import logging
import math
import os
import re
import sys

import numpy
import nlp_ws

#from sklearn.externals import joblib  # only needed by the commented-out .pkl dumps below
from filtrowanie import MatrixProcessor
from loadfextor import Dictionary, readFeatures

reload(sys)
sys.setdefaultencoding('utf8')

_log = logging.getLogger(__name__)
class FeatFiltWorker(nlp_ws.NLPWorker):

    @classmethod
    def static_init(cls, config):
        cls.configtool = config['tool']

    def init(self):
        #self._docex = DocExtractor.from_config_dict(self._desconfig)
        pass
    def toJSON(self, arr, rowlabels, collabels, fname):
        # Serialize the matrix together with its row (and optional column) labels.
        res = {}
        res["arr"] = arr.tolist()
        res["rowlabels"] = rowlabels
        if collabels is not None:
            res["collabels"] = collabels
        with open(fname, 'w') as fp:
            json.dump(res, fp)
    def parseFilterOptions(self, taskOptions, inputDir):
        # Build a frequency-filtered Dictionary for every feature group listed
        # under "keep_n" (defaults: 1000 base and 1000 orth features).
        result = []
        if "keep_n" not in taskOptions:
            taskOptions["keep_n"] = {"base": 1000, "orth": 1000}
        if "base" not in taskOptions["keep_n"]:
            taskOptions["keep_n"]["base"] = 1000
        if "orth" not in taskOptions["keep_n"]:
            taskOptions["keep_n"]["orth"] = 1000
        for dictName in taskOptions["keep_n"]:
            keep_n = taskOptions["keep_n"][dictName]
            d = Dictionary(dictName, inputDir).filter(keep_n)
            _log.info("Filter " + dictName + " keep_n=" + str(keep_n))
            result.append(d)
        return result
    def removeNaN(self, arr, rowlabels, collabels):
        # Drop rows containing NaN values and keep the row labels in sync.
        max_arr = numpy.amax(arr, axis=1)
        arr = arr[~numpy.isnan(arr).any(axis=1)]
        rows = []
        for i, max_val in enumerate(max_arr):
            if math.isnan(max_val):
                _log.warning("Dropping row with NaN values: " + rowlabels[i])
            else:
                rows.append(rowlabels[i])
        return arr, rows, collabels
    def process(self, inputFile, taskOptions, outputFile):
        if not taskOptions:
            taskOptions = {'filter': 'min_df-2', 'weighting': 'all:sm-mi', 'similarity': 'jaccard'}
        if not os.path.exists(outputFile):
            os.mkdir(outputFile)
        ## Check whether the data are already available as a weighted matrix.
        if os.path.exists(inputFile + "/weighted.json"):
            with open(inputFile + "/weighted.json") as json_ifs:
                jsonVal = json.load(json_ifs)
            rowlabels = jsonVal["rowlabels"]
            collabels = jsonVal["collabels"]
            arr = numpy.asarray(jsonVal["arr"])
            # Filtering and weighting have already been applied; skip them.
            if 'filter' in taskOptions:
                del taskOptions['filter']
            if 'weighting' in taskOptions:
                del taskOptions['weighting']
        else:
            # Default path: read raw feature files through the requested filters.
            filters = self.parseFilterOptions(taskOptions, inputFile)
            arr, rowlabels, collabels = readFeatures(inputFile, filters).read()
            self.toJSON(arr, rowlabels, collabels, outputFile + '/data.json')
            #TO REMOVE, BACKWARD COMPATIBILITY
            #joblib.dump(collabels, outputFile + "/columns.pkl")
            #joblib.dump(rowlabels, outputFile + "/rowlabels.pkl")
            #joblib.dump(arr, outputFile + "/data.pkl")
        #print self.configtool
        cluto_path = os.path.join(outputFile, 'matrix.txt')
        matrix_processor = MatrixProcessor(taskOptions, self.configtool)
        if matrix_processor.filter is not None:
            arr, rowlabels, collabels = matrix_processor.filter_matrix(arr, rowlabels, collabels)
        if matrix_processor.weighting_operations is not None:
            arr, rowlabels, collabels = matrix_processor.weight_matrix(arr, rowlabels, collabels)
            self.toJSON(arr, rowlabels, collabels, outputFile + '/weighted.json')
        if matrix_processor.transform_operations is not None:
            arr, rowlabels, collabels = matrix_processor.transform_matrix(arr, rowlabels, collabels)
            self.toJSON(arr, rowlabels, collabels, outputFile + '/transform.json')
        if matrix_processor.similarity is not None:
            arr, rowlabels, collabels = self.removeNaN(arr, rowlabels, collabels)
            dist = matrix_processor.make_similarity_matrix(arr, rowlabels, collabels, inputFile, cluto_path)
            arr = None
            self.toJSON(dist, rowlabels, None, outputFile + '/distance.json')
            self.matrix2json(rowlabels, cluto_path, outputFile + '/similarity.json')
            self.matrix2json2(rowlabels, cluto_path, outputFile + '/simsparse.json')
        #self.logger.log(INFO, "FeatFilter similarity finished")
    # Converts a similarity matrix written in sparse text form to dense JSON.
    def matrix2json(self, rowlabels, inpath, outpath):
        with open(inpath) as f:
            content = f.readlines()
        # The first token of the header line gives the matrix dimension.
        number = int(content[0].split(" ")[0])
        matrix = numpy.zeros((number, number), dtype=numpy.dtype(float))
        l_n = 0
        # Every data line holds "column value" pairs with 1-based columns.
        regex = r"\d+\s[0-9]*\.?[0-9]+"
        for line in content[1:]:
            arr = re.findall(regex, line)
            for node in arr:
                node = node.split()
                i = l_n
                j = int(node[0]) - 1
                v = float(node[1])
                matrix[i, j] = v
            l_n += 1
        res = {}
        res["arr"] = matrix.tolist()
        res["rowlabels"] = rowlabels
        with open(outpath, 'w') as fp:
            json.dump(res, fp)
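    # For illustration only (the exact header layout is an assumption based
    # on the parsing above): a 3x3 similarity file could look like
    #
    #   3 3 4
    #   2 0.5 3 0.25
    #   1 0.5
    #   1 0.25
    #
    # i.e. a header whose first token is the matrix size, followed by one
    # line per row of 1-based "column value" pairs.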
#converts similarity to json, sparse
def matrix2json2(self,rowlabels, inpath, outpath):
with open(inpath) as f:
content = f.readlines()
number = int(content[0].split(" ")[0])
#print number
matrix = []
l_n = 0
regex = r"\d+\s[0-9]*\.?[0-9]+"
for line in content[1:]:
arr = re.findall(regex, line)
for node in arr:
node = node.split()
i = l_n
j = int(node[0]) - 1
v = float(node[1])
if v != 0:
matrix.append({'r': i, 'c': j, 'v': v})
l_n += 1
res = {}
res["arr"] = matrix
res["rowlabels"] = rowlabels
with open(outpath, 'w') as fp:
json.dump(res, fp)
if __name__ == '__main__':
nlp_ws.NLPService.main(FeatFiltWorker)
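A sketch of the task options the worker understands (the key names come from the code above; the concrete values are illustrative, and the semantics of the option strings are defined in the filtrowanie module):

    taskOptions = {
        "keep_n": {"base": 500, "orth": 200},  # dictionary size per feature group
        "filter": "min_df-2",                  # matrix filtering operation
        "weighting": "all:sm-mi",              # SuperMatrix weighting scheme
        "similarity": "jaccard",               # similarity measure
    }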
loadfextor.py
#!/usr/bin/python2
import glob
import json
import logging
import operator
import os
import sys
import time

import numpy as _np

reload(sys)
sys.setdefaultencoding('utf8')

_log = logging.getLogger(__name__)
def read_json(inputDir):
    # File names are in rows, feature names are in columns.
# Column labels are unknown until all files have been read.
colnames = set()
rowvals = []
files = []
start_time = time.time()
for filename in glob.glob(inputDir + '/*'):
files.append(filename.decode('utf8'))
rowlabels = tuple(files)
for filename in rowlabels:
with open(filename) as json_ifs:
featdict = json.load(json_ifs)
flatdict = {
':'.join((fetname, fieldname)): fieldval
for fetname, fields in featdict.iteritems()
for fieldname, fieldval in fields.iteritems()
}
colnames.update(flatdict)
rowvals.append(flatdict)
    # Any ordering will do, so use alphabetical for consistency.
collabels = sorted(colnames)
# Indexes of column names must be known to efficiently insert non-zero
# values.
colname2colnum = {name: num for num, name in enumerate(collabels)}
# The array; R does not have unsigned integers, so it needs to store
# signed.
arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
for rownum, flatdict in enumerate(rowvals):
for colname, colval in flatdict.iteritems():
arr[rownum, colname2colnum[colname]] = colval
collabels = [colname.replace(';','') for colname in tuple(collabels)]
rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
return arr, rowlabels, collabels
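# For illustration (an assumed input shape, inferred from the flattening in
# read_json and readFeatures.read below): each input file is a JSON object
# mapping feature-group names to feature/value dictionaries, e.g.
#
#   {"base": {"kot": 2, "dom": 1},
#    "orth": {"Kot": 1}}
#
# which flattens into the columns "base:kot", "base:dom" and "orth:Kot".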
class readFeatures:
    def __init__(self, inputDir, dictionaryList):
        self.inputDir = inputDir
        self.dictionaryList = dictionaryList
        # Map each feature-group name to its (filtered) Dictionary.
        self.dictionaryGroup = {}
        for el in dictionaryList:
            self.dictionaryGroup[el.getGroupName()] = el

    def filterByDictionary(self, featdict, dictionary):
        #_log.info(" Filtering by " + dictionary.getGroupName())
        return {k: v for k, v in featdict.iteritems() if k in dictionary.dictionary}

    def filterByDictionaries(self, featdict):
        # Filter every known feature group in place.
        for key in featdict:
            if key in self.dictionaryGroup:
                featdict[key] = self.filterByDictionary(featdict[key], self.dictionaryGroup[key])
    def readFromDictOl(self, collabels):
        # Older variant of readFromDict with the same behaviour.
        inputDir = self.inputDir
        files = []
        start_time = time.time()
        for filename in glob.glob(inputDir + '/*'):
            files.append(filename.decode('utf8'))
        rowlabels = tuple(files)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
        for rownum, filename in enumerate(rowlabels):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            for colname, colval in flatdict.iteritems():
                colname = colname.replace(';', '')
                if colname in colname2colnum:
                    arr[rownum, colname2colnum[colname]] = colval
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels
    def readFromDict(self, collabels):
        # File names are in rows, feature names are in columns.
        # Column labels are fixed in advance by the caller.
        inputDir = self.inputDir
        files = []
        start_time = time.time()
        for filename in glob.glob(inputDir + '/*'):
            files.append(filename.decode('utf8'))
        rowlabels = tuple(files)
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        for rownum, filename in enumerate(rowlabels):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            for colname, colval in flatdict.iteritems():
                colname = colname.replace(';', '')
                if colname in colname2colnum:
                    arr[rownum, colname2colnum[colname]] = colval
        #collabels = [colname.replace(';','') for colname in tuple(collabels)]
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels
    def read(self):
        # File names are in rows, feature names are in columns.
        # Column labels are unknown until all files have been read.
        start_time = time.time()
        colnames = set()
        rowvals = []
        inputDir = self.inputDir
        files = []
        for filename in glob.glob(inputDir + '/*'):
            files.append(filename.decode('utf8'))
        keptlabels = []
        for filename in tuple(files):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            if self.dictionaryList is not None:
                self.filterByDictionaries(featdict)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            # Skip files with no surviving features, keeping the labels
            # aligned with the rows of the matrix.
            if len(flatdict) < 1:
                continue
            colnames.update(flatdict)
            rowvals.append(flatdict)
            keptlabels.append(filename)
        # Any ordering will do, so use alphabetical for consistency.
        collabels = sorted(colnames)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(keptlabels), len(collabels)), _np.float)
        for rownum, flatdict in enumerate(rowvals):
            for colname, colval in flatdict.iteritems():
                arr[rownum, colname2colnum[colname]] = colval
        collabels = [colname.replace(';', '') for colname in collabels]
        rowlabels = [os.path.basename(rowname) for rowname in keptlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels, collabels
def operatorDf(a, b):
    # Document-frequency accumulator: a is the running df, b is the feature
    # value in the current document; increment df only when the feature occurs.
    if b <= 0:
        return a
    if a <= 0:
        return 1
    return a + 1
class Dictionary():
    def combine_dicts1(self, a, b, op=operator.add):
        # Functional variant of combine_dicts (builds a new dict).
        return dict(a.items() + b.items() +
                    [(k, op(a[k], b[k])) for k in set(b) & set(a)])

    def combine_dicts(self, a, b, op=operator.add):
        # Merge b into a, combining the values of shared keys with op.
        for el in b:
            if el in a:
                a[el] = op(a[el], b[el])
            else:
                a[el] = op(0, b[el])
        return a
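    # For illustration (hypothetical values): combine_dicts({'a': 1},
    # {'a': 2, 'b': 3}) with the default operator.add yields
    # {'a': 3, 'b': 3}; passed operatorDf instead, it counts in how many
    # documents each key has occurred so far.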
    def __init__(self, groupName, input_dir):
        self.groupName = groupName
        self.inputDir = input_dir
        self.counts = dict()
        self.df = dict()
    def filter(self, keep_n=1000):
        # Keep at most keep_n features of this group, ranked by total count.
        self.load_data()
        start_time = time.time()
        if len(self.counts) > keep_n:
            self.dictionary = sorted(self.counts, key=self.counts.get, reverse=True)[:keep_n]
        else:
            self.dictionary = self.df.keys()
        _log.info(" filtered dictionary in %s seconds ---" % (time.time() - start_time))
        return self

    def getGroupName(self):
        return self.groupName
    def load_data(self):
        # Accumulate per-feature totals (counts) and document frequencies (df)
        # over all files in the input directory.
        start_time = time.time()
        self.counts = dict()
        self.df = dict()
        self.size = 0
        for filename in glob.glob(self.inputDir + '/*'):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            groupDict = {}
            if self.groupName in featdict:
                groupDict = featdict[self.groupName]
            featdict = None
            self.counts = self.combine_dicts(self.counts, groupDict)
            self.df = self.combine_dicts(self.df, groupDict, operatorDf)
            self.size = self.size + 1
        _log.info(" loaded in %s seconds ---" % (time.time() - start_time))
requirements.txt
nlp-ws
numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
sqlalchemy
rpy2==2.7.2
gensim