Commit d95a3afd authored by Tomasz Walkowiak's avatar Tomasz Walkowiak

Merge branch 'develop' into 'master'

ner2sim

See merge request !1
parents 72c68b2a d6442b83
src/__pycache__
@@ -7,9 +7,11 @@ services:
     working_dir: /home/worker
     entrypoint:
       - python3.6
-      - ner2json.py
+      - main.py
+      - service
     volumes:
       - /samba:/samba
       - ./config.ini:/home/worker/config.ini
-      - ./ner2json.py:/home/worker/ner2json.py
+      - ./src:/home/worker/src
+      - ./main.py:/home/worker/main.py
     restart: always
\ No newline at end of file
"""Implementation of clustering service."""
import argparse

import nlp_ws

from src.worker import Worker


def get_args():
    """Gets command line arguments."""
    parser = argparse.ArgumentParser(description="ner2json")
    subparsers = parser.add_subparsers(dest="mode")
    subparsers.required = True
    subparsers.add_parser(
        "service",
        help="Run as a service")
    return parser.parse_args()


def main():
    """Runs the program."""
    args = get_args()
    generators = {
        "service": lambda: nlp_ws.NLPService.main(Worker),
    }
    gen_fn = generators.get(args.mode, lambda: None)
    gen_fn()


if __name__ == "__main__":
    main()
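With the compose change above, the container now runs `python3.6 main.py service`, and the `service` subcommand dispatches to nlp_ws. A minimal sanity check of the argparse dispatch (illustrative only: it rebuilds the same parser and feeds it a hard-coded argv instead of sys.argv):

# Illustrative check of the subcommand dispatch in main.py.
import argparse

parser = argparse.ArgumentParser(description="ner2json")
subparsers = parser.add_subparsers(dest="mode")
subparsers.required = True
subparsers.add_parser("service", help="Run as a service")
args = parser.parse_args(["service"])
assert args.mode == "service"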
nlp-ws
lxml
configparser
ujson
\ No newline at end of file
ujson
networkx
\ No newline at end of file
import WrapLem
import nlp_ws
import logging
import lxml.etree as ET
import os
import shutil
import ujson
@@ -135,66 +134,3 @@ def getAnnotations(ccl,lemmatizer):
    annotations = ccl_ner(ccl, lemmatizer)
    return count_annotations(annotations)
class NER2JSONWorker(nlp_ws.NLPWorker):

    def saveResult(self, keywords_dict, outputFile):
        # Serialize the parallel keyword/score/alias lists as a JSON array.
        json_dict = []
        for idx in range(len(keywords_dict[0])):
            json_dict.append({'keyword': keywords_dict[0][idx],
                              'score': keywords_dict[1][idx],
                              'alias': keywords_dict[2][idx]})
        with open(outputFile, 'w') as file:
            ujson.dump(json_dict, file)

    @classmethod
    def static_init(cls, config):
        _log.info("Worker started loading models")
        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
        _log.info("Worker finished loading models")

    def process(self, inputFile, taskOptions, outputFile):
        if os.path.isdir(inputFile):
            shutil.copytree(inputFile, outputFile)
            annotation_lemma = getAnnotations(inputFile + "/text.ccl",
                                              self.lemmatizer)
        else:
            try:
                os.makedirs(outputFile)
            except OSError:  # output directory may already exist
                pass
            annotation_lemma = getAnnotations(inputFile, self.lemmatizer)
            shutil.copy2(inputFile, outputFile + "/text.ccl")
        # Keep only the 20 most frequent annotations.
        res = annotation_lemma[:20]
        ofn = outputFile + "/ner.json"
        with open(ofn, "w") as f:
            ujson.dump(res, f)


def main():
    nlp_ws.NLPService.main(NER2JSONWorker)


if __name__ == '__main__':
    main()
from src.ner2json import Token, Annotation
from src.ner2json import sentence_ner
import os
import lxml.etree as ET
import networkx as nx

# Example LPMN pipeline calls exercising the relations mode:
# file(/requests/liner2/72187e41-38ae-43dc-bc55-73f64e21303f)|ner2json_tmp({"type":"relations"})
# file(/requests/liner2/377e6a30-b89c-4cf9-9f33-bf3721e17556)|ner2json_tmp({"type":"relations"})
class FindRelations:

    def __init__(self, lemmatizer):
        self.lemmatizer = lemmatizer
        self.count = {}    # lemma -> number of occurrences
        self.beings = {}   # lemma -> {co-occurring lemma -> count}
    def sentence_ner(self, sentence, target_type):
        # Group tokens into annotation channels (nam_* only).
        channels = {}
        for token in sentence.iter("tok"):
            orth = token.find("./orth").text
            base = token.find("./lex/base").text
            ctag = token.find("./lex/ctag").text
            t = Token(orth, base, ctag)
            for channel in token.iter("ann"):
                index = int(channel.text)
                chan = channel.attrib["chan"]
                if index > 0 and chan.startswith("nam_"):
                    channels.setdefault(chan, {}) \
                            .setdefault(index, []) \
                            .append(t)
        annotations = []
        for ann_type, group in channels.items():
            if ann_type != target_type:
                continue
            for tokens in group.values():
                an = Annotation(ann_type, tokens)
                lemma = self.lemmatizer.lemmatizeS(
                    an.get_orth(), an.get_base(), an.get_ctag(),
                    an.get_space(), an.get_category(), False)
                # Drop named entities that are too short to be meaningful.
                length = len(lemma)
                if length < 3:
                    continue
                if length < 4 and " " in lemma:
                    continue
                self.count[lemma] = self.count.get(lemma, 0) + 1
                annotations.append(lemma)
        return annotations
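sentence_ner assumes CCL-style XML in which each token carries <ann chan="nam_...">N</ann> channels, and tokens sharing a channel name and index form one named entity. A self-contained sketch of that grouping on a made-up two-token sentence (orthographic forms only, no lemmatizer):

# Minimal illustration of the CCL channel grouping used in sentence_ner.
# The XML below is a fabricated two-token sentence, not real pipeline output.
import lxml.etree as ET

ccl = ET.fromstring(
    '<sentence>'
    '<tok><orth>Jan</orth><lex><base>Jan</base><ctag>subst</ctag></lex>'
    '<ann chan="nam_liv">1</ann></tok>'
    '<tok><orth>Kowalski</orth><lex><base>Kowalski</base><ctag>subst</ctag></lex>'
    '<ann chan="nam_liv">1</ann></tok>'
    '</sentence>')
channels = {}
for token in ccl.iter("tok"):
    orth = token.find("./orth").text
    for channel in token.iter("ann"):
        index = int(channel.text)
        chan = channel.attrib["chan"]
        if index > 0 and chan.startswith("nam_"):
            channels.setdefault(chan, {}).setdefault(index, []).append(orth)
print(channels)  # {'nam_liv': {1: ['Jan', 'Kowalski']}}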
    def add_connect(self, ne1, ne2):
        # Increment the directed co-occurrence count ne1 -> ne2.
        b = self.beings
        b.setdefault(ne1, {}).setdefault(ne2, 0)
        b[ne1][ne2] += 1
    def process_annots2(self, annotations):
        # Link every ordered pair of entities in the current window.
        flat = [j for i in annotations for j in i]
        n = len(flat)
        for i in range(n):
            for j in range(n):
                if i != j:
                    self.add_connect(flat[i], flat[j])
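Note that process_annots2 records ordered pairs, so each unordered co-occurrence contributes one count in each direction. A toy trace with made-up names:

# Toy trace of the directed pair counting (hypothetical entities).
annotations = [["Jan Kowalski"], ["Anna Nowak"]]
flat = [j for i in annotations for j in i]
beings = {}
for i in range(len(flat)):
    for j in range(len(flat)):
        if i != j:
            beings.setdefault(flat[i], {}).setdefault(flat[j], 0)
            beings[flat[i]][flat[j]] += 1
print(beings)
# {'Jan Kowalski': {'Anna Nowak': 1}, 'Anna Nowak': {'Jan Kowalski': 1}}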
    def process_annots(self, annotations):
        if len(annotations) > 0:
            print(annotations)
        for an in annotations:
            self.beings[an] = 0
        annotations.clear()
    def saverowlabels(self, rawlabels, file):
        # Write the JSON key "rowlabels": ["label1", "label2", ...].
        # NOTE: labels are written verbatim; a quote inside a lemma
        # would break the hand-assembled JSON.
        file.write('"rowlabels":')
        file.write('[')
        first = True
        for el in rawlabels:
            if not first:
                file.write(',')
            else:
                first = False
            file.write('"' + el + '"')
        file.write(']')

    def saveprobabilities(self, rawlabels, file):
        file.write('"arr":[\n')
        first = True
        for el in rawlabels:
            if not first:
                file.write(',\n')
            else:
                first = False
            self.saveprobability(el, rawlabels, file)
        file.write('\n]')
    def saveprobability(self, el, rawlabels, file):
        # Write one row of the co-occurrence matrix, normalized to sum to 1.
        total = 0.0
        connections = self.beings.get(el, {})
        vector = []
        for label in rawlabels:
            value = 0
            if label != el and label in connections:
                value = connections[label]
            total += value
            vector.append(value)
        file.write('[')
        first = True
        for v in vector:
            if not first:
                file.write(',')
            else:
                first = False
            if total > 0:
                file.write(str(v / total))
            else:
                file.write(str(v))  # degenerate row: no co-occurrences
        file.write(']')
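Taken together, saverowlabels and saveprobability emit a JSON object holding a label list and a row-normalized matrix. The expected shape of probability.json, with illustrative values:

# Shape of probability.json as written by saveAllSimilarities below
# (values are made up; each row sums to 1 unless the row is all zeros).
import ujson

doc = {
    "rowlabels": ["Jan Kowalski", "Anna Nowak"],
    "arr": [[0, 1.0],
            [1.0, 0]],
}
print(ujson.dumps(doc))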
    def saveRelation(self, rawlabels, file):
        # Write the raw (unnormalized) co-occurrence matrix.
        file.write('"arr":[\n')
        first = True
        for el in rawlabels:
            if not first:
                file.write(',\n')
            else:
                first = False
            file.write('[')
            firstNext = True
            connections = self.beings.get(el, {})
            for label in rawlabels:
                value = 0
                if label != el and label in connections:
                    value = connections[label]
                if not firstNext:
                    file.write(',')
                else:
                    firstNext = False
                file.write(str(value))
            file.write(']')
        file.write('\n]')
    def saveGEXF(self, rawlabels, out):
        # Export the co-occurrence graph, restricted to the selected labels.
        dg = nx.DiGraph()
        for key, val in self.count.items():
            if rawlabels is not None and key in rawlabels:
                dg.add_node(key, value=val)
        for node1, connections in self.beings.items():
            if rawlabels is not None and node1 in rawlabels:
                for node2, weight in connections.items():
                    if rawlabels is not None and node2 in rawlabels:
                        dg.add_edge(node1, node2, weight=weight)
        nx.write_gexf(dg, out)
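The GEXF file can be loaded back with networkx for inspection; a minimal round trip (the graph here is made up):

# Round-trip sketch for the GEXF export (hypothetical nodes and weights).
import networkx as nx

dg = nx.DiGraph()
dg.add_node("Jan Kowalski", value=3)
dg.add_edge("Jan Kowalski", "Anna Nowak", weight=2)
nx.write_gexf(dg, "relation.gexf")
g = nx.read_gexf("relation.gexf")
print(list(g.nodes(data=True)), list(g.edges(data=True)))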
    def saveAllSimilarities(self, rawlabels, out):
        with open(out, 'w') as file:
            file.write('{')
            self.saverowlabels(rawlabels, file)
            file.write(',\n')
            self.saveprobabilities(rawlabels, file)
            file.write('}')

    def saveAllRelations(self, rawlabels, out):
        with open(out, 'w') as file:
            file.write('{')
            self.saverowlabels(rawlabels, file)
            file.write(',\n')
            self.saveRelation(rawlabels, file)
            file.write('}')
    def results(self, out, limit=100):
        # Keep the `limit` most frequent entities as matrix labels.
        rawlabels = [x[0] for x in sorted(self.count.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:limit]]
        try:
            os.mkdir(out)
        except OSError as ex:  # output directory may already exist
            print(ex)
        self.saveAllSimilarities(rawlabels, out + "/probability.json")
        self.saveAllRelations(rawlabels, out + "/relation.json")
        self.saveGEXF(rawlabels, out + "/relation.gexf")
    def process(self, inputFile, taskOptions, outputFile):
        target_type = taskOptions.get("annots", "nam_liv")
        window = taskOptions.get("window", 5)
        limit = taskOptions.get("limit", 1000)
        if os.path.isdir(inputFile):
            for name in os.listdir(inputFile):
                filename = os.path.join(inputFile, name)
                self.process_file(filename, target_type, window)
        else:
            self.process_file(inputFile, target_type, window)
        self.results(outputFile, limit)

    def process_file(self, filename, target_type, window):
        # Slide a window of `window` sentences over the document, linking
        # every ordered pair of entities visible in the window at each step.
        annotations = []
        tree = ET.parse(filename)
        for sentence in tree.iter("sentence"):
            annotations.append(self.sentence_ner(sentence, target_type))
            self.process_annots2(annotations)
            if len(annotations) >= window:
                annotations.pop(0)
\ No newline at end of file
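The window option bounds how far apart (in sentences) two entities may be and still be linked; with window=2, an entity never meets one from two sentences earlier. A toy trace of the loop in process_file:

# Toy trace of the sliding sentence window (window=2, fake annotations).
window = 2
annotations = []
for sent in (["A"], ["B"], ["C"]):
    annotations.append(sent)
    print([x for group in annotations for x in group])
    if len(annotations) >= window:
        annotations.pop(0)
# prints ['A'], then ['A', 'B'], then ['B', 'C']; 'A' and 'C' never co-occur.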
"""Implementation of nlp_worker."""
import logging
import sys
import nlp_ws
import os,shutil,ujson
from src.ner2json import getAnnotations
from src.ner2sim import FindRelations
import WrapLem
_log = logging.getLogger(__name__)
class Worker(nlp_ws.NLPWorker):
    """Implements nlp_worker for ner2json service."""

    def saveResult(self, keywords_dict, outputFile):
        # Serialize the parallel keyword/score/alias lists as a JSON array.
        json_dict = []
        for idx in range(len(keywords_dict[0])):
            json_dict.append({'keyword': keywords_dict[0][idx],
                              'score': keywords_dict[1][idx],
                              'alias': keywords_dict[2][idx]})
        with open(outputFile, 'w') as file:
            ujson.dump(json_dict, file)

    @classmethod
    def static_init(cls, config):
        _log.info("Worker started loading models")
        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
        _log.info("Worker finished loading models")
    def keywords_chain(self, inputFile, taskOptions, outputFile):
        if os.path.isdir(inputFile):
            shutil.copytree(inputFile, outputFile)
            annotation_lemma = getAnnotations(inputFile + "/text.ccl",
                                              self.lemmatizer)
        else:
            try:
                os.makedirs(outputFile)
            except OSError:  # output directory may already exist
                pass
            annotation_lemma = getAnnotations(inputFile, self.lemmatizer)
            shutil.copy2(inputFile, outputFile + "/text.ccl")
        # Keep only the 20 most frequent annotations.
        res = annotation_lemma[:20]
        ofn = outputFile + "/ner.json"
        with open(ofn, "w") as f:
            ujson.dump(res, f)
    def process(self, inputFile, taskOptions, outputFile):
        # Dispatch on the requested analysis type.
        if taskOptions.get("type") == "relations":
            p = FindRelations(self.lemmatizer)
            p.process(inputFile, taskOptions, outputFile)
        else:
            self.keywords_chain(inputFile, taskOptions, outputFile)
\ No newline at end of file
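For reference, these are the option keys consumed by Worker.process and FindRelations.process as read from the code above; the defaults are inferred from it and not separately documented, and the concrete values here are only examples:

# Option shapes inferred from the code above (illustrative values).
relations_opts = {
    "type": "relations",   # selects FindRelations
    "annots": "nam_liv",   # annotation channel to keep (default nam_liv)
    "window": 5,           # sentence window for co-occurrence (default 5)
    "limit": 1000,         # max labels in the output matrices (default 1000)
}
keywords_opts = {}  # any other request runs keywords_chain -> ner.json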