Commit 72c68b2a authored by Tomasz Walkowiak

Initial commit

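# Dockerfile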
FROM clarinpl/builder AS builder
FROM clarinpl/python:3.6
RUN apt-get update && apt-get install -y \
        libxml++2.6-dev \
        libloki-dev \
        libboost-all-dev \
        libicu-dev \
        libffi-dev \
        libssl-dev \
        libxml2-utils \
        swig \
        openjdk-8-jdk
WORKDIR /tmp/
RUN apt remove -y cmake && \
    wget https://github.com/Kitware/CMake/releases/download/v3.16.0-rc2/cmake-3.16.0-rc2-Linux-x86_64.tar.gz && \
    tar -xzf cmake*tar.gz && \
    ln -s $(pwd)/cmake*/bin/cmake /usr/bin/cmake
COPY --from=builder /install/corpus2 /
COPY --from=builder /install/wccl /
COPY --from=builder /usr/lib/libmorfeusz* /usr/lib/
RUN wget -O morf 'https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/download' && \
    dpkg -i ./morf && \
    git clone https://github.com/gkubon/Polem && \
    mkdir -p Polem/build && \
    cd Polem/build && \
    cmake .. && \
    make -j && \
    make install && \
    ldconfig && \
    cd / && rm -r /tmp/*
WORKDIR /home/worker
COPY requirements.txt .
RUN python3.6 -m pip install -r requirements.txt
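# config.ini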
[service]
tool = ner2json
root = /samba/requests/
rabbit_host = rabbit.clarin.ws
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 5
[logging]
port = 9998
local_log_level = INFO
[logging_levels]
__main__ = INFO
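# docker-compose.yml (assumed filename)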
version: '3'
services:
  ner2json:
    container_name: clarin_ner2json
    build: ./
    working_dir: /home/worker
    entrypoint:
      - python3.6
      - ner2json.py
    volumes:
      - /samba:/samba
      - ./config.ini:/home/worker/config.ini
      - ./ner2json.py:/home/worker/ner2json.py
    restart: always
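# ner2json.py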
import logging
import os
import shutil

import lxml.etree as ET
import nlp_ws
import ujson
import WrapLem

_log = logging.getLogger(__name__)

class Token:
    def __init__(self, orth, base, ctag):
        self.orth = orth
        self.base = base
        self.ctag = ctag

    def get_orth(self):
        return self.orth

    def get_base(self):
        return self.base

    def get_ctag(self):
        return self.ctag

class Annotation:
    def __init__(self, category, tokens):
        self.category = category
        self.tokens = tokens
        self.lemma = self.get_orth()
        self.count = 1

    def get_count(self):
        return self.count

    def inc(self):
        self.count += 1

    def toDict(self):
        # ujson serialises Annotation objects through this method when dumping
        return {"keyword": self.lemma, "type": self.category, "count": self.count}

    def get_category(self):
        return self.category

    def get_tokens(self):
        return self.tokens

    def get_orth(self):
        return " ".join(token.get_orth() for token in self.tokens)

    def get_base(self):
        return " ".join(token.get_base() for token in self.tokens)

    def get_ctag(self):
        return " ".join(token.get_ctag() for token in self.tokens)

    def get_space(self):
        return " ".join("True" for _ in self.tokens)

    def get_lemma(self):
        return self.lemma

    def set_lemma(self, lemma):
        self.lemma = lemma

    def __str__(self):
        return "[%s] %s" % (self.get_category(), self.get_lemma())

def sentence_ner(sentence, lemmatizer):
    channels = {}
    for token in sentence.iter("tok"):
        orth = token.find("./orth").text
        base = token.find("./lex/base").text
        ctag = token.find("./lex/ctag").text
        t = Token(orth, base, ctag)
        for channel in token.iter("ann"):
            index = int(channel.text)
            chan = channel.attrib["chan"]
            if index > 0 and chan.startswith("nam_"):
                channels.setdefault(chan, {}) \
                    .setdefault(index, []) \
                    .append(t)
    annotations = []
    for ann_type, group in channels.items():
        if ann_type == 'nam_adj':
            continue
        for tokens in group.values():
            an = Annotation(ann_type, tokens)
            lemma = lemmatizer.lemmatizeS(an.get_orth(), an.get_base(),
                                          an.get_ctag(), an.get_space(),
                                          an.get_category(), False)
            # skip named entities that are too short to be meaningful
            if len(lemma) < 3:
                continue
            if len(lemma) < 4 and " " in lemma:
                continue
            an.set_lemma(lemma)
            annotations.append(an)
    return annotations

def ccl_ner(ccl, lemmatizer):
    tree = ET.parse(ccl)
    annotations = []
    for sentence in tree.iter("sentence"):
        annotations += sentence_ner(sentence, lemmatizer)
    return annotations


def count_annotations(items):
    # group identical annotations (same category and lemma) and count occurrences
    counts = dict()
    for i in items:
        val = str(i)
        if val in counts:
            counts[val].inc()
        else:
            counts[val] = i
    # most frequent annotations first
    return sorted(counts.values(), key=lambda x: x.get_count(), reverse=True)


def getAnnotations(ccl, lemmatizer):
    annotations = ccl_ner(ccl, lemmatizer)
    return count_annotations(annotations)

class NER2JSONWorker(nlp_ws.NLPWorker):

    def saveResult(self, keywords_dict, outputFile):
        # build a list of {keyword, score, alias} entries and write it as valid JSON
        json_dict = []
        for idx in range(len(keywords_dict[0])):
            element_dict = {'keyword': keywords_dict[0][idx],
                            'score': keywords_dict[1][idx],
                            'alias': keywords_dict[2][idx]}
            json_dict.append(element_dict)
        with open(outputFile, 'w') as f:
            ujson.dump(json_dict, f)

    @classmethod
    def static_init(cls, config):
        _log.info("Worker started loading models")
        cls.lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()
        _log.info("Worker finished loading models")

    def process(self, inputFile, taskOptions, outputFile):
        if os.path.isdir(inputFile):
            shutil.copytree(inputFile, outputFile)
            annotation_lemma = getAnnotations(inputFile + "/text.ccl", self.lemmatizer)
        else:
            os.makedirs(outputFile, exist_ok=True)
            annotation_lemma = getAnnotations(inputFile, self.lemmatizer)
            shutil.copy2(inputFile, outputFile + "/text.ccl")
        # keep only the 20 most frequent named entities
        res = annotation_lemma[:20]
        ofn = outputFile + "/ner.json"
        with open(ofn, "w") as f:
            ujson.dump(res, f)


def main():
    nlp_ws.NLPService.main(NER2JSONWorker)


if __name__ == '__main__':
    main()
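# requirements.txt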
nlp-ws
lxml
configparser
ujson