diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..550d67d12af98aaf7df56061a2a382ddecc1e982
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+src/__pycache__
diff --git a/docker-compose.yml b/docker-compose.yml
index 83c6a4472b51022902331319552e2e8152cc451b..25b173ed05892946dc30bc94aca7755046cbcbfb 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,9 +7,11 @@ services:
     working_dir: /home/worker
     entrypoint:
       - python3.6
-      - ner2json.py
+      - main.py
+      - service
     volumes:
       - /samba:/samba
       - ./config.ini:/home/worker/config.ini
-      - ./ner2json.py:/home/worker/ner2json.py
+      - ./src:/home/worker/src
+      - ./main.py:/home/worker/main.py
     restart: always
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ef34ff4de142ed5abe5fc7bd3e9b2b87a6e94db
--- /dev/null
+++ b/main.py
@@ -0,0 +1,36 @@
+"""Implementation of the ner2json service entry point."""
+import argparse
+
+import nlp_ws
+
+from src.worker import Worker
+
+
+def get_args():
+    """Gets command line arguments."""
+    parser = argparse.ArgumentParser(description="ner2json")
+
+    subparsers = parser.add_subparsers(dest="mode")
+    subparsers.required = True
+
+    subparsers.add_parser(
+        "service",
+        help="Run as a service")
+
+    return parser.parse_args()
+
+
+def main():
+    """Runs the program."""
+    args = get_args()
+
+    generators = {
+        "service": lambda: nlp_ws.NLPService.main(Worker),
+    }
+
+    gen_fn = generators.get(args.mode, lambda: None)
+    gen_fn()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index 8e004234a7f3ac494439a1363e98d6dc5fb76b10..c7d698564239ef33235c7c45b75f0a86669437be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 nlp-ws
 lxml
 configparser
-ujson
\ No newline at end of file
+ujson
+networkx
\ No newline at end of file
diff --git a/ner2json.py b/src/ner2json.py
similarity index 67%
rename from ner2json.py
rename to src/ner2json.py
index 487c4da93b28fd3dbb268c7ae0364da2d61583f7..62d710e09ee046a4736559027fa6a8765a084e26 100644
--- a/ner2json.py
+++ b/src/ner2json.py
@@ -1,6 +1,5 @@
 import WrapLem
-import nlp_ws
 import logging
 import lxml.etree as ET
 import os,shutil,ujson
 
@@ -135,66 +134,3 @@ def getAnnotations(ccl,lemmatizer):
     annotations=ccl_ner(ccl,lemmatizer)
     #_log.info("Starting grouping...");
     return count_annotations(annotations)
-
-class NER2JSONWorker(nlp_ws.NLPWorker):
-
-
-
-    def saveResult(self,keywords_dict,outputFile):
-        json_dict = []
-        file = open(outputFile, 'w')
-        file.write('[')
-
-        for idx in range(len(keywords_dict[0])):
-            element_dict = {'keyword': keywords_dict[0][idx], 'score': keywords_dict[1][idx], 'alias': keywords_dict[2][idx]}
-            json_dict.append(element_dict)
-            file.write(str(element_dict)+', ')
-
-        file.write(']')
-        file.close()
-
-
-
-    @classmethod
-    def static_init(cls, config):
-        _log.info("Worker started loading models")
-        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
-
-        _log.info("Worker finished loading models ")
-
-
-    def process(self, inputFile, taskOptions,outputFile):
-
-
-        try:
-
-            if os.path.isdir(inputFile):
-                shutil.copytree(inputFile,outputFile)
-                annotation_lemma=getAnnotations(inputFile+"/text.ccl",self.lemmatizer)
-
-            else:
-                try:
-                    os.makedirs(outputFile)
-                except:
-                    pass
-                annotation_lemma=getAnnotations(inputFile,self.lemmatizer)
-                shutil.copy2(inputFile,outputFile+"/text.ccl")
-
-        finally:
-            pass
-
-
-        #print(annotation_lemma[:20])
-        #res={"ner":annotation_lemma[:20]};
-        res=annotation_lemma[:20]
-        ofn = outputFile + "/ner.json"
-        with open(ofn,"w") as f:
-            ujson.dump(res,f)
-
-
-
-def main():
-    nlp_ws.NLPService.main(NER2JSONWorker)
-
-if __name__ == '__main__':
-    main()
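For reference, a minimal sketch of consuming the ner.json file produced by the keyword chain above (NER2JSONWorker.process, moved to Worker.keywords_chain in src/worker.py below). The "output" directory name is hypothetical, and the exact element structure is whatever count_annotations() returns, which this diff does not show:

import ujson

# keywords_chain() dumps the first 20 entries of getAnnotations() here;
# "output" is a placeholder for the task's output directory.
with open("output/ner.json") as f:
    entries = ujson.load(f)

for entry in entries:
    print(entry)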
diff --git a/src/ner2sim.py b/src/ner2sim.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea6e29e63eefacfc43095f07ca0230fa667ea166
--- /dev/null
+++ b/src/ner2sim.py
@@ -0,0 +1,249 @@
+from src.ner2json import Token,Annotation
+import os
+
+
+#file(/requests/liner2/72187e41-38ae-43dc-bc55-73f64e21303f)|ner2json_tmp({"type":"relations"})
+#file(/requests/liner2/377e6a30-b89c-4cf9-9f33-bf3721e17556)|ner2json_tmp({"type":"relations"})
+
+
+import lxml.etree as ET
+import networkx as nx
+
+class FindRelations:
+    def __init__(self,lemmatizer):
+        self.lemmatizer=lemmatizer
+        self.count={}
+        self.beings={}
+
+    def sentence_ner(self,sentence,type):
+        channels = {}
+        for token in sentence.iter("tok"):
+            orth = token.find("./orth").text
+            base = token.find("./lex/base").text
+            ctag = token.find("./lex/ctag").text
+            t = Token(orth, base, ctag)
+            for channel in token.iter("ann"):
+                index = int(channel.text)
+                chan = channel.attrib["chan"]
+
+                if index > 0 and chan.startswith("nam_"):
+                    channels.setdefault(chan, {}) \
+                        .setdefault(index, []) \
+                        .append(t)
+
+        annotations = []
+        for (ann_type, group) in channels.items():
+            for tokens in group.values():
+                if ann_type != type:
+                    continue
+                an = Annotation(ann_type, tokens)
+                lemma = self.lemmatizer.lemmatizeS(an.get_orth(), an.get_base(), an.get_ctag(), an.get_space(), an.get_category(), False)
+                # drop named entities that are too short to be meaningful
+                l = len(lemma)
+                if l < 3:
+                    continue
+                if l < 4 and " " in lemma:
+                    continue
+                if lemma not in self.count:
+                    self.count[lemma] = 1
+                else:
+                    self.count[lemma] += 1
+                #an.set_lemma(lemma)
+                annotations.append(lemma)
+
+        return annotations
+
+    def add_connect(self,ne1,ne2):
+        b = self.beings
+        if ne1 not in b:
+            b[ne1] = {}
+        if ne2 not in b[ne1]:
+            b[ne1][ne2] = 0
+        b[ne1][ne2] = b[ne1][ne2]+1
+
+
+    def process_annots2(self,annotations):
+        flat = [j for i in annotations for j in i]
+        n = len(flat)
+        for i in range(n):
+            for j in range(n):
+                if i != j:
+                    self.add_connect(flat[i],flat[j])
+
+
+
+    def process_annots(self,annotations):
+        if len(annotations) > 0:
+            print(annotations)
+            for an in annotations:
+                self.beings[an] = 0
+        annotations.clear()
+
+    def saverowlabels(self,rawlabels,file):
+        file.write('"rowlabels":')
+        file.write('[')
+        first = True
+        for el in rawlabels:
+            if not first:
+                file.write(',')
+            else:
+                first = False
+            file.write('"'+el+'"')
+
+        file.write(']')
+
+    def saveprobabilities(self,rawlabels,file):
+        file.write('"arr":[\n')
+
+        first = True
+        for el in rawlabels:
+            if not first:
+                file.write(',\n')
+            else:
+                first = False
+            self.saveprobability(el,rawlabels,file)
+        file.write('\n]')
+
+    def saveprobability(self,el,rawlabels,file):
+        total = 0.0
+        neighbours = {}
+        if el in self.beings:
+            neighbours = self.beings[el]
+        vector = []
+        for i in range(len(rawlabels)):
+            value = 0
+            label = rawlabels[i]
+            if label != el and label in neighbours:
+                value = neighbours[label]
+            total += value
+            vector.append(value)
+
+        file.write('[')
+
+        first = True
+        for v in vector:
+            if not first:
+                file.write(',')
+            else:
+                first = False
+            if total > 0:
+                file.write(str(v/total))
+            else:
+                file.write(str(v))
+        file.write(']')
+
+    def saveRelation(self,rawlabels,file):
+        file.write('"arr":[\n')
+
+        first = True
+        for el in rawlabels:
+            if not first:
+                file.write(',\n')
+            else:
+                first = False
+            file.write('[')
+            firstNext = True
+            for i in range(len(rawlabels)):
+                value = 0
+                label = rawlabels[i]
+                neighbours = {}
+                if el in self.beings:
+                    neighbours = self.beings[el]
+                if label != el and label in neighbours:
+                    value = neighbours[label]
+                if not firstNext:
+                    file.write(',')
+                else:
+                    firstNext = False
+                file.write(str(value))
+            file.write(']')
+        file.write('\n]')
+
+
+    def saveGEXF(self,rawlabels,out):
+        dg = nx.DiGraph()
+        for key,val in self.count.items():
+            if rawlabels is not None and key in rawlabels:
+                dg.add_node(key,value=val)
+
+        for node1,neighbours in self.beings.items():
+            if rawlabels is not None and node1 in rawlabels:
+                for node2,weight in neighbours.items():
+                    if rawlabels is not None and node2 in rawlabels:
+                        dg.add_edge(node1,node2,weight=weight)
+
+        nx.write_gexf(dg, out)
+
+    def saveAllSimilarities(self,rawlabels,out):
+        file = open(out, 'w')
+        file.write('{')
+
+        self.saverowlabels(rawlabels,file)
+        file.write(',\n')
+        self.saveprobabilities(rawlabels,file)
+
+
+        file.write('}')
+        file.close()
+
+    def saveAllRelations(self,rawlabels,out):
+        file = open(out, 'w')
+        file.write('{')
+
+        self.saverowlabels(rawlabels,file)
+        file.write(',\n')
+        self.saveRelation(rawlabels,file)
+
+
+        file.write('}')
+        file.close()
+
+
+    def results(self,out,limit=100):
+
+        rawlabels = [x[0] for x in sorted(self.count.items(), key=lambda x: x[1],reverse=True)[:limit]]
+
+        try:
+            os.mkdir(out)
+        except Exception as ex:
+            print(ex)
+        self.saveAllSimilarities(rawlabels,out+"/probability.json")
+        self.saveAllRelations(rawlabels,out+"/relation.json")
+        self.saveGEXF(rawlabels,out+"/relation.gexf")
+
+
+
+
+
+    def process(self, inputFile, taskOptions,outputFile):
+        type = "nam_liv"
+        if "annots" in taskOptions:
+            type = taskOptions["annots"]
+
+        window = 5
+        if "window" in taskOptions:
+            window = taskOptions["window"]
+
+        limit = 1000
+        if "limit" in taskOptions:
+            limit = taskOptions["limit"]
+        if os.path.isdir(inputFile):
+            for name in os.listdir(inputFile):
+                filename = os.path.join(inputFile, name)
+                self.process_file(filename,type,window)
+        else:
+            self.process_file(inputFile,type,window)
+        self.results(outputFile,limit)
+
+
+    def process_file(self,filename,type,window):
+        annotations = []
+        tree = ET.parse(filename)
+        for sentence in tree.iter("sentence"):
+            annotations.append(self.sentence_ner(sentence,type))
+            self.process_annots2(annotations)
+
+            if len(annotations) >= window:
+                annotations.pop(0)
+
\ No newline at end of file
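A minimal sketch of reading back the relation.gexf emitted by FindRelations.saveGEXF() above; the "output" directory name is an assumption, not part of this diff:

import networkx as nx

# Nodes carry "value" (mention counts from self.count); edges carry
# "weight" (directed co-occurrence counts from self.beings).
g = nx.read_gexf("output/relation.gexf")  # hypothetical output path

# The ten most frequently mentioned named entities.
top = sorted(g.nodes(data=True), key=lambda kv: kv[1].get("value", 0), reverse=True)[:10]
for name, attrs in top:
    print(name, attrs.get("value"))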
diff --git a/src/worker.py b/src/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f58606bb5960df56606805a7b36f0b6e090d12e7
--- /dev/null
+++ b/src/worker.py
@@ -0,0 +1,73 @@
+"""Implementation of nlp_worker."""
+import logging
+
+import nlp_ws
+import os,shutil,ujson
+
+from src.ner2json import getAnnotations
+from src.ner2sim import FindRelations
+
+
+import WrapLem
+
+_log = logging.getLogger(__name__)
+
+class Worker(nlp_ws.NLPWorker):
+    """Implements nlp_worker for ner2json service."""
+
+    def saveResult(self,keywords_dict,outputFile):
+        json_dict = []
+        file = open(outputFile, 'w')
+        file.write('[')
+
+        for idx in range(len(keywords_dict[0])):
+            element_dict = {'keyword': keywords_dict[0][idx], 'score': keywords_dict[1][idx], 'alias': keywords_dict[2][idx]}
+            json_dict.append(element_dict)
+            file.write(str(element_dict)+', ')
+
+        file.write(']')
+        file.close()
+
+
+
+    @classmethod
+    def static_init(cls, config):
+        _log.info("Worker started loading models")
+        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
+
+        _log.info("Worker finished loading models")
+
+    def keywords_chain(self,inputFile, taskOptions,outputFile):
+        if os.path.isdir(inputFile):
+            shutil.copytree(inputFile,outputFile)
+            annotation_lemma = getAnnotations(inputFile+"/text.ccl",self.lemmatizer)
+        else:
+            try:
+                os.makedirs(outputFile)
+            except OSError:
+                pass
+            annotation_lemma = getAnnotations(inputFile,self.lemmatizer)
+            shutil.copy2(inputFile,outputFile+"/text.ccl")
+
+        res = annotation_lemma[:20]
+        ofn = outputFile + "/ner.json"
+        with open(ofn,"w") as f:
+            ujson.dump(res,f)
+
+
+    def process(self, inputFile, taskOptions,outputFile):
+        # "relations" routes to the co-occurrence pipeline; every other
+        # request falls through to the keyword chain.
+        if taskOptions.get("type") == "relations":
+            p = FindRelations(self.lemmatizer)
+            p.process(inputFile, taskOptions,outputFile)
+        else:
+            self.keywords_chain(inputFile, taskOptions,outputFile)
+
+
+
\ No newline at end of file
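A hedged local smoke test for the routing in Worker.process(), assuming nlp_ws.NLPWorker can be constructed without arguments and a WrapLem model is available (in production nlp_ws drives the worker through the queue instead); all paths are placeholders:

from src.worker import Worker

# static_init() loads the shared WrapLem cascade lemmatizer once per process.
Worker.static_init(config=None)
w = Worker()  # assumption: the nlp_ws base class needs no constructor arguments

# {"type": "relations"} routes to FindRelations; anything else runs keywords_chain.
w.process("input/text.ccl", {"type": "relations", "annots": "nam_loc"}, "out_relations")
w.process("input/text.ccl", {}, "out_keywords")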