Commit 86b1a4db authored by Tomasz Walkowiak

Initial commit

Dockerfile
FROM clarinpl/builder AS builder
WORKDIR /tmp
RUN apt-get update && apt-get install -y libmysql++-dev
# corpus2 with Poliqarp support (required by SuperMatrix)
RUN git clone https://gitlab.clarin-pl.eu/analysers/corpus2.git && \
cd corpus2 && \
mkdir bin && \
cd bin && \
cmake -D CORPUS2_BUILD_POLIQARP:BOOL=True .. && \
make -j && \
make install DESTDIR="/install" && \
ldconfig
RUN apt-get install -y liblog4cxx-dev
## Supermatrix
RUN git clone https://gitlab.clarin-pl.eu/libraries/supermatrix && \
mkdir supermatrix/build && \
cd supermatrix/build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j6 && \
make install DESTDIR="/install" && \
ldconfig
FROM clarinpl/python:2.7
# STYLO
RUN apt-get update && apt-get install -y r-base
RUN R -e "install.packages('rjson',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
R -e "install.packages('versions',dependencies=TRUE, repos='http://cran.rstudio.com/')" && \
R -e "library('versions'); install.versions('stylo','0.6.4')"
#python
RUN git clone http://nlp.pwr.wroc.pl/lexcsd.git
RUN cd lexcsd && \
./install.sh
COPY requirements.txt .
RUN pip install -r requirements.txt
# SuperMatrix dependencies
COPY --from=builder /install/usr/ /usr/
COPY --from=builder /tmp/supermatrix/build /home/supermatrix
RUN cp /home/supermatrix/parallel/mpi_handler/*.so /usr/lib/ && \
ldconfig
RUN apt-get install -y libboost-filesystem-dev liblog4cxx-dev libboost-regex-dev libmysql++-dev libopenmpi-dev libboost-program-options-dev
config.ini
[service]
tool = featfilt2_test
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 6
SM_WEIGHTER_BIN = /home/supermatrix/tools/matrix_weighter/weighter
SM_SIMILARITY_BIN = /home/supermatrix/parallel/similarity/parallelSimilarity
SM_CLUTOCONV_BIN = /home/supermatrix/tools/ClutoConv/ClutoConv
STYLO_PATH = twstylo.r
[logging]
port = 9991
local_log_level = INFO
docker-compose.yml
version: '3'
services:
featfilt:
container_name: clarin_featfilt
build: ./
working_dir: /home/worker
entrypoint:
- python2
- featfilt_worker.py
volumes:
- /samba:/samba
- ./module:/home/worker
- ./config.ini:/home/worker/config.ini
#restart: always
featfilt_worker.py
#!/usr/bin/python2
import json
import logging
import math
import os
import re
import sys

import numpy
import nlp_ws

#from sklearn.externals import joblib  # only needed by the commented-out .pkl dumps below
from filtrowanie import MatrixProcessor
from loadfextor import Dictionary, readFeatures

reload(sys)
sys.setdefaultencoding('utf8')

_log = logging.getLogger(__name__)
class FeatFiltWorker(nlp_ws.NLPWorker):

    @classmethod
    def static_init(cls, config):
        cls.configtool = config['tool']

    def init(self):
        #self._docex = DocExtractor.from_config_dict(self._desconfig)
        pass
    def toJSON(self, arr, rowlabels, collabels, fname):
        # Serialize the matrix together with its row (and optional column) labels.
        res = {}
        res["arr"] = arr.tolist()
        res["rowlabels"] = rowlabels
        if collabels is not None:
            res["collabels"] = collabels
        with open(fname, 'w') as fp:
            json.dump(res, fp)
    def parseFilterOptions(self, taskOptions, inputDir):
        # Build a frequency-filtered Dictionary for every feature group listed
        # under "keep_n" (defaults: 1000 base and 1000 orth features).
        result = []
        if "keep_n" not in taskOptions:
            taskOptions["keep_n"] = {"base": 1000, "orth": 1000}
        if "base" not in taskOptions["keep_n"]:
            taskOptions["keep_n"]["base"] = 1000
        if "orth" not in taskOptions["keep_n"]:
            taskOptions["keep_n"]["orth"] = 1000
        for dictName in taskOptions["keep_n"]:
            keep_n = taskOptions["keep_n"][dictName]
            d = Dictionary(dictName, inputDir).filter(keep_n)
            _log.info("Filter " + dictName + " keep_n=" + str(keep_n))
            result.append(d)
        return result
    def removeNaN(self, arr, rowlabels, collabels):
        # Drop rows containing NaN values and keep the row labels in sync.
        max_arr = numpy.amax(arr, axis=1)
        arr = arr[~numpy.isnan(arr).any(axis=1)]
        rows = []
        for i, max_val in enumerate(max_arr):
            if math.isnan(max_val):
                _log.warning("Dropping row with NaN values: " + rowlabels[i])
            else:
                rows.append(rowlabels[i])
        return arr, rows, collabels
    def process(self, inputFile, taskOptions, outputFile):
        if not taskOptions:
            taskOptions = {'filter': 'min_df-2', 'weighting': 'all:sm-mi', 'similarity': 'jaccard'}
        if not os.path.exists(outputFile):
            os.mkdir(outputFile)
        ## Check whether the data are already available as a weighted matrix.
        if os.path.exists(inputFile + "/weighted.json"):
            with open(inputFile + "/weighted.json") as json_ifs:
                jsonVal = json.load(json_ifs)
            rowlabels = jsonVal["rowlabels"]
            collabels = jsonVal["collabels"]
            arr = numpy.asarray(jsonVal["arr"])
            # Filtering and weighting have already been applied; skip them.
            if 'filter' in taskOptions:
                del taskOptions['filter']
            if 'weighting' in taskOptions:
                del taskOptions['weighting']
        else:
            # Default path: read raw feature files through the requested filters.
            filters = self.parseFilterOptions(taskOptions, inputFile)
            arr, rowlabels, collabels = readFeatures(inputFile, filters).read()
            self.toJSON(arr, rowlabels, collabels, outputFile + '/data.json')
            #TO REMOVE, BACKWARD COMPATIBILITY
            #joblib.dump(collabels, outputFile + "/columns.pkl")
            #joblib.dump(rowlabels, outputFile + "/rowlabels.pkl")
            #joblib.dump(arr, outputFile + "/data.pkl")
        #print self.configtool
        cluto_path = os.path.join(outputFile, 'matrix.txt')
        matrix_processor = MatrixProcessor(taskOptions, self.configtool)
        if matrix_processor.filter is not None:
            arr, rowlabels, collabels = matrix_processor.filter_matrix(arr, rowlabels, collabels)
        if matrix_processor.weighting_operations is not None:
            arr, rowlabels, collabels = matrix_processor.weight_matrix(arr, rowlabels, collabels)
            self.toJSON(arr, rowlabels, collabels, outputFile + '/weighted.json')
        if matrix_processor.transform_operations is not None:
            arr, rowlabels, collabels = matrix_processor.transform_matrix(arr, rowlabels, collabels)
            self.toJSON(arr, rowlabels, collabels, outputFile + '/transform.json')
        if matrix_processor.similarity is not None:
            arr, rowlabels, collabels = self.removeNaN(arr, rowlabels, collabels)
            dist = matrix_processor.make_similarity_matrix(arr, rowlabels, collabels, inputFile, cluto_path)
            arr = None
            self.toJSON(dist, rowlabels, None, outputFile + '/distance.json')
            self.matrix2json(rowlabels, cluto_path, outputFile + '/similarity.json')
            self.matrix2json2(rowlabels, cluto_path, outputFile + '/simsparse.json')
        #self.logger.log(INFO, "FeatFilter similarity finished")
    # Converts a similarity matrix written in sparse text form to dense JSON.
    def matrix2json(self, rowlabels, inpath, outpath):
        with open(inpath) as f:
            content = f.readlines()
        # The first token of the header line gives the matrix dimension.
        number = int(content[0].split(" ")[0])
        matrix = numpy.zeros((number, number), dtype=numpy.dtype(float))
        l_n = 0
        # Every data line holds "column value" pairs with 1-based columns.
        regex = r"\d+\s[0-9]*\.?[0-9]+"
        for line in content[1:]:
            arr = re.findall(regex, line)
            for node in arr:
                node = node.split()
                i = l_n
                j = int(node[0]) - 1
                v = float(node[1])
                matrix[i, j] = v
            l_n += 1
        res = {}
        res["arr"] = matrix.tolist()
        res["rowlabels"] = rowlabels
        with open(outpath, 'w') as fp:
            json.dump(res, fp)
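    # For illustration only (the exact header layout is an assumption based
    # on the parsing above): a 3x3 similarity file could look like
    #
    #   3 3 4
    #   2 0.5 3 0.25
    #   1 0.5
    #   1 0.25
    #
    # i.e. a header whose first token is the matrix size, followed by one
    # line per row of 1-based "column value" pairs.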
#converts similarity to json, sparse
def matrix2json2(self,rowlabels, inpath, outpath):
with open(inpath) as f:
content = f.readlines()
number = int(content[0].split(" ")[0])
#print number
matrix = []
l_n = 0
regex = r"\d+\s[0-9]*\.?[0-9]+"
for line in content[1:]:
arr = re.findall(regex, line)
for node in arr:
node = node.split()
i = l_n
j = int(node[0]) - 1
v = float(node[1])
if v != 0:
matrix.append({'r': i, 'c': j, 'v': v})
l_n += 1
res = {}
res["arr"] = matrix
res["rowlabels"] = rowlabels
with open(outpath, 'w') as fp:
json.dump(res, fp)
if __name__ == '__main__':
nlp_ws.NLPService.main(FeatFiltWorker)
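A sketch of the task options the worker understands (the key names come from the code above; the concrete values are illustrative, and the semantics of the option strings are defined in the filtrowanie module):

    taskOptions = {
        "keep_n": {"base": 500, "orth": 200},  # dictionary size per feature group
        "filter": "min_df-2",                  # matrix filtering operation
        "weighting": "all:sm-mi",              # SuperMatrix weighting scheme
        "similarity": "jaccard",               # similarity measure
    }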
loadfextor.py
#!/usr/bin/python2
import glob
import json
import logging
import operator
import os
import sys
import time

import numpy as _np

reload(sys)
sys.setdefaultencoding('utf8')

_log = logging.getLogger(__name__)
def read_json(inputDir):
    # File names are in rows, feature names are in columns.
# Column labels are unknown until all files have been read.
colnames = set()
rowvals = []
files = []
start_time = time.time()
for filename in glob.glob(inputDir + '/*'):
files.append(filename.decode('utf8'))
rowlabels = tuple(files)
for filename in rowlabels:
with open(filename) as json_ifs:
featdict = json.load(json_ifs)
flatdict = {
':'.join((fetname, fieldname)): fieldval
for fetname, fields in featdict.iteritems()
for fieldname, fieldval in fields.iteritems()
}
colnames.update(flatdict)
rowvals.append(flatdict)
    # Any ordering will do, so use alphabetical for consistency.
collabels = sorted(colnames)
# Indexes of column names must be known to efficiently insert non-zero
# values.
colname2colnum = {name: num for num, name in enumerate(collabels)}
# The array; R does not have unsigned integers, so it needs to store
# signed.
arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
for rownum, flatdict in enumerate(rowvals):
for colname, colval in flatdict.iteritems():
arr[rownum, colname2colnum[colname]] = colval
collabels = [colname.replace(';','') for colname in tuple(collabels)]
rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
return arr, rowlabels, collabels
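# For illustration (an assumed input shape, inferred from the flattening in
# read_json and readFeatures.read below): each input file is a JSON object
# mapping feature-group names to feature/value dictionaries, e.g.
#
#   {"base": {"kot": 2, "dom": 1},
#    "orth": {"Kot": 1}}
#
# which flattens into the columns "base:kot", "base:dom" and "orth:Kot".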
class readFeatures:
    def __init__(self, inputDir, dictionaryList):
        self.inputDir = inputDir
        self.dictionaryList = dictionaryList
        # Map each feature-group name to its (filtered) Dictionary.
        self.dictionaryGroup = {}
        for el in dictionaryList:
            self.dictionaryGroup[el.getGroupName()] = el

    def filterByDictionary(self, featdict, dictionary):
        #_log.info(" Filtering by " + dictionary.getGroupName())
        return {k: v for k, v in featdict.iteritems() if k in dictionary.dictionary}

    def filterByDictionaries(self, featdict):
        # Filter every known feature group in place.
        for key in featdict:
            if key in self.dictionaryGroup:
                featdict[key] = self.filterByDictionary(featdict[key], self.dictionaryGroup[key])
    def readFromDictOl(self, collabels):
        # Older variant of readFromDict with the same behaviour.
        inputDir = self.inputDir
        files = []
        start_time = time.time()
        for filename in glob.glob(inputDir + '/*'):
            files.append(filename.decode('utf8'))
        rowlabels = tuple(files)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
        for rownum, filename in enumerate(rowlabels):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            for colname, colval in flatdict.iteritems():
                colname = colname.replace(';', '')
                if colname in colname2colnum:
                    arr[rownum, colname2colnum[colname]] = colval
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels
    def readFromDict(self, collabels):
        # File names are in rows, feature names are in columns.
        # Column labels are fixed in advance by the caller.
        inputDir = self.inputDir
        files = []
        start_time = time.time()
        for filename in glob.glob(inputDir + '/*'):
            files.append(filename.decode('utf8'))
        rowlabels = tuple(files)
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(rowlabels), len(collabels)), _np.float)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        for rownum, filename in enumerate(rowlabels):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            for colname, colval in flatdict.iteritems():
                colname = colname.replace(';', '')
                if colname in colname2colnum:
                    arr[rownum, colname2colnum[colname]] = colval
        #collabels = [colname.replace(';','') for colname in tuple(collabels)]
        rowlabels = [os.path.basename(rowname) for rowname in rowlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels
    def read(self):
        # File names are in rows, feature names are in columns.
        # Column labels are unknown until all files have been read.
        start_time = time.time()
        colnames = set()
        rowvals = []
        inputDir = self.inputDir
        files = []
        for filename in glob.glob(inputDir + '/*'):
            files.append(filename.decode('utf8'))
        keptlabels = []
        for filename in tuple(files):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            if self.dictionaryList is not None:
                self.filterByDictionaries(featdict)
            flatdict = {
                ':'.join((fetname, fieldname)): fieldval
                for fetname, fields in featdict.iteritems()
                for fieldname, fieldval in fields.iteritems()
            }
            # Skip files with no surviving features, keeping the labels
            # aligned with the rows of the matrix.
            if len(flatdict) < 1:
                continue
            colnames.update(flatdict)
            rowvals.append(flatdict)
            keptlabels.append(filename)
        # Any ordering will do, so use alphabetical for consistency.
        collabels = sorted(colnames)
        # Indexes of column names must be known to efficiently insert non-zero
        # values.
        colname2colnum = {name: num for num, name in enumerate(collabels)}
        # The array; R does not have unsigned integers, so it needs to store
        # signed.
        arr = _np.zeros((len(keptlabels), len(collabels)), _np.float)
        for rownum, flatdict in enumerate(rowvals):
            for colname, colval in flatdict.iteritems():
                arr[rownum, colname2colnum[colname]] = colval
        collabels = [colname.replace(';', '') for colname in collabels]
        rowlabels = [os.path.basename(rowname) for rowname in keptlabels]
        _log.info(" read filtered data in %s seconds ---" % (time.time() - start_time))
        return arr, rowlabels, collabels
def operatorDf(a, b):
    # Document-frequency accumulator: a is the running df, b is the feature
    # value in the current document; increment df only when the feature occurs.
    if b <= 0:
        return a
    if a <= 0:
        return 1
    return a + 1
class Dictionary():
    def combine_dicts1(self, a, b, op=operator.add):
        # Functional variant of combine_dicts (builds a new dict).
        return dict(a.items() + b.items() +
                    [(k, op(a[k], b[k])) for k in set(b) & set(a)])

    def combine_dicts(self, a, b, op=operator.add):
        # Merge b into a, combining the values of shared keys with op.
        for el in b:
            if el in a:
                a[el] = op(a[el], b[el])
            else:
                a[el] = op(0, b[el])
        return a
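    # For illustration (hypothetical values): combine_dicts({'a': 1},
    # {'a': 2, 'b': 3}) with the default operator.add yields
    # {'a': 3, 'b': 3}; passed operatorDf instead, it counts in how many
    # documents each key has occurred so far.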
    def __init__(self, groupName, input_dir):
        self.groupName = groupName
        self.inputDir = input_dir
        self.counts = dict()
        self.df = dict()
    def filter(self, keep_n=1000):
        # Keep at most keep_n features of this group, ranked by total count.
        self.load_data()
        start_time = time.time()
        if len(self.counts) > keep_n:
            self.dictionary = sorted(self.counts, key=self.counts.get, reverse=True)[:keep_n]
        else:
            self.dictionary = self.df.keys()
        _log.info(" filtered dictionary in %s seconds ---" % (time.time() - start_time))
        return self

    def getGroupName(self):
        return self.groupName
    def load_data(self):
        # Accumulate per-feature totals (counts) and document frequencies (df)
        # over all files in the input directory.
        start_time = time.time()
        self.counts = dict()
        self.df = dict()
        self.size = 0
        for filename in glob.glob(self.inputDir + '/*'):
            with open(filename) as json_ifs:
                featdict = json.load(json_ifs)
            groupDict = {}
            if self.groupName in featdict:
                groupDict = featdict[self.groupName]
            featdict = None
            self.counts = self.combine_dicts(self.counts, groupDict)
            self.df = self.combine_dicts(self.df, groupDict, operatorDf)
            self.size = self.size + 1
        _log.info(" loaded in %s seconds ---" % (time.time() - start_time))
requirements.txt
nlp-ws
numpy==1.14.3
scikit-learn==0.19.0
scipy==0.19.1
sqlalchemy
rpy2==2.7.2
gensim