Commit eabb2901 authored by Tomasz Walkowiak

Initial commit

parents
FROM ubuntu:xenial

# Install prerequisites.
# The package index is refreshed in the same layer as every install that uses
# it, so a cached `apt-get update` layer can never be paired with a newer
# install step. The index is removed again at the end of the layer; later
# apt steps in this file run their own `apt-get update` first.
RUN apt-get update && \
    apt-get -y upgrade && \
    apt-get -y install software-properties-common unzip && \
    add-apt-repository -y ppa:jonathonf/python-3.6 && \
    apt-get update && \
    apt-get -y install \
        build-essential \
        cmake \
        git-core \
        libantlr-dev \
        libboost-all-dev \
        libglibmm-2.4-dev \
        libicu-dev \
        libloki-dev \
        libsigc++-2.0-dev \
        libxml++2.6-dev \
        libxml2-dev \
        libxslt-dev \
        python-dev \
        python3-pip \
        python3-setuptools \
        python3.6 \
        python3.6-dev \
        python3.6-venv \
        swig \
        wget && \
    rm -rf /var/lib/apt/lists/* && \
    # pip 22.0 dropped Python 3.6 support, so cap the self-upgrade below it.
    python3.6 -m pip install --no-cache-dir --upgrade "pip<22" && \
    python3.6 -m pip install --no-cache-dir Cython
# Newer CMake than the one packaged by the distribution.
# Download, run and delete the installer in one layer so the installer script
# does not stay in the image. `--exclude-subdir` installs directly under the
# given prefix.
# NOTE(review): the download is not checksum-verified; consider pinning a
# sha256 for the installer.
RUN wget -O /tmp/cmake-3.12.0-Linux-x86_64.sh https://cmake.org/files/v3.12/cmake-3.12.0-Linux-x86_64.sh && \
    sh /tmp/cmake-3.12.0-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir && \
    rm /tmp/cmake-3.12.0-Linux-x86_64.sh
# Download tools.
# WORKDIR creates /build when it does not exist, so no separate mkdir is needed.
WORKDIR /build
# NOTE(review): both repositories are cloned over plain HTTP and at whatever
# revision is current; consider HTTPS and a pinned commit if the servers allow.
RUN git clone http://nlp.pwr.wroc.pl/corpus2.git
RUN git clone http://nlp.pwr.edu.pl/wccl.git
# Fetch, install and delete the Morfeusz package in one layer so the .deb
# archive is not kept in the image.
RUN wget -O morfeusz2-2.0.0-Linux-amd64.deb https://nextcloud.clarin-pl.eu/index.php/s/VVIvx4w20azcWbp/download && \
    dpkg -i morfeusz2-2.0.0-Linux-amd64.deb && \
    rm morfeusz2-2.0.0-Linux-amd64.deb
# corpus2
# A `cd` inside one RUN does not carry over to the next instruction, so the
# build directory is selected (and created) with WORKDIR instead.
WORKDIR /build/corpus2/bin
# Limit parallelism to the number of CPUs; a bare `make -j` starts an
# unlimited number of jobs.
RUN cmake .. && \
    make -j"$(nproc)" && \
    make install && \
    ldconfig
# wccl
# WORKDIR creates the out-of-tree build directory when it does not exist.
WORKDIR /build/wccl/bin
# Limit parallelism to the number of CPUs; a bare `make -j` starts an
# unlimited number of jobs.
RUN cmake .. && \
    make -j"$(nproc)" && \
    make install && \
    ldconfig
# Java runtime (OpenJDK 8, with the openjdk-r PPA enabled) and locale data.
# The package index is left in place in this layer; later install steps in
# this file may rely on it.
RUN apt-get update \
 && add-apt-repository ppa:openjdk-r/ppa \
 && apt-get update \
 && apt-get -y install \
        openjdk-8-jre \
        locales \
        locales-all
# Set locales
# Enable the Polish UTF-8 locale in /etc/locale.gen and generate it.
RUN sed -i 's/# pl_PL.UTF-8 UTF-8/pl_PL.UTF-8 UTF-8/' /etc/locale.gen && \
    locale-gen pl_PL.UTF-8
# key=value form; the space-separated `ENV key value` form is deprecated.
ENV LANG=pl_PL.UTF-8 \
    LANGUAGE=pl_PL \
    LC_ALL=pl_PL.UTF-8
RUN dpkg-reconfigure --frontend noninteractive locales
# Set environment
# JAVA_HOME previously pointed at /opt/jdk, which no step in this file creates.
# Point it at the OpenJDK 8 directory installed by the openjdk-8-jre package.
# NOTE(review): the path is amd64-specific, like the other downloads in this
# file; confirm nothing mounts a JDK at /opt/jdk at run time.
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
# Separate instruction: an ENV cannot reference a value set in the same instruction.
ENV PATH=${PATH}:${JAVA_HOME}/bin
# Installing Polem: JDK and Python 3.6 as the default python3.
# The JDK is installed before python3 is repointed, and with its own index
# refresh, so the step does not depend on package lists cached in an older layer.
RUN apt-get update && \
    apt-get -y install default-jdk && \
    rm -rf /var/lib/apt/lists/*
# Make `python3` resolve to Python 3.6. A symlink persists in the image;
# a shell alias or a `cd` in its own RUN has no effect on later instructions,
# so those no-op steps are dropped.
RUN ln -sf /usr/bin/python3.6 /usr/bin/python3
WORKDIR /build
RUN git clone https://github.com/gkubon/Polem
# Overlay the build context on top of the cloned sources.
# NOTE(review): this copies the whole context; a .dockerignore would keep
# unrelated files out of /build/Polem.
COPY . /build/Polem
# Discard any build directory that came with the clone or the context, then
# let WORKDIR recreate it empty for an out-of-tree build.
RUN rm -rf /build/Polem/build
WORKDIR /build/Polem/build
# Limit parallelism to the number of CPUs; a bare `make -j` starts an
# unlimited number of jobs.
RUN cmake .. && \
    make -j"$(nproc)" && \
    make install && \
    ldconfig
# Move the corpus directory to /data/corpus.
RUN mkdir /data && \
    mv /build/Polem/corpus /data
WORKDIR /app
# Installing nlp_ws.
# Every install goes through `python3.6 -m pip` so all packages land in the
# same interpreter, and no download cache is kept in the image.
# The extra index is only offered to the nlp_ws install.
RUN python3.6 -m pip install --no-cache-dir --extra-index-url https://pypi.clarin-pl.eu/ nlp_ws
# lxml, configparser and ujson
# NOTE(review): versions are unpinned; consider pinning for reproducible builds.
RUN python3.6 -m pip install --no-cache-dir \
        configparser \
        lxml \
        ujson
# MeWeX: native extractor first, then the Python bindings.
WORKDIR /build
RUN git clone https://github.com/MGniew/MeWeX.git
# Out-of-tree build directory for the extractor (created by WORKDIR).
WORKDIR /build/MeWeX/mwextractor/mwextractor/build
RUN cmake .. \
 && make install \
 && ldconfig
# Python package that wraps the extractor.
WORKDIR /build/MeWeX/mewexlib
RUN python3.6 setup.py install
# Leave the working directory where the original instructions left it.
WORKDIR /build
# Service settings.
# NOTE(review): presumably read by the nlp_ws framework that runs the worker - confirm.
[service]
tool = mewex1
root = /samba/requests/
rabbit_host = 10.17.0.85
rabbit_user = clarin
# NOTE(review): credential stored in plain text next to the code; consider
# supplying it at deploy time instead of committing it.
rabbit_password = clarin123
# Worker settings.
[tool]
workers_number = 4
# Logging settings.
[logging]
port = 9994
local_log_level = INFO
#!/usr/bin/python3
import os
import re
import io
import mewexlib as mwl
import WrapLem
from nlp_ws import NLPWorker, NLPService
class MewexWorker(NLPWorker):
    """Worker that runs MeWeX on the input and post-processes its CSV output.

    For every task three files are written to the output directory:
    ``mewex.csv`` (raw MeWeX result), ``mewexlemmatized.csv`` (same rows with
    a lemmatized column inserted) and ``mewexshort.csv`` (the beginning of the
    lemmatized file).
    """

    def init(self):
        """Build the cascade lemmatizer used by :meth:`lemmatize`."""
        self._lemmatizer = WrapLem.CascadeLemmatizer.assembleLemmatizer()

    def process(self, input_path, task_options, output_path):
        """Run MeWeX for one task and write the three result files.

        :param input_path: a single input file, or a directory whose entries
            are all passed to MeWeX.
        :param task_options: mapping of task options; the optional
            ``'mewex_options'`` entry is forwarded to ``mwl.call_mewex``
            after conversion by ``_parse_mewex_options``.
        :param output_path: directory for the result files; created when
            missing.
        """
        args = _parse_mewex_options(task_options.get('mewex_options') or {})
        args['input_files'] = (
            [os.path.join(input_path, f) for f in os.listdir(input_path)]
            if os.path.isdir(input_path)
            else (input_path,)
        )
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)

        raw_csv = os.path.join(output_path, "mewex.csv")
        lemmatized_csv = os.path.join(output_path, "mewexlemmatized.csv")
        short_csv = os.path.join(output_path, "mewexshort.csv")

        args['output_file'] = raw_csv
        mwl.call_mewex(**args)
        self.lemmatize(raw_csv, lemmatized_csv)
        self.cut_lines(lemmatized_csv, short_csv, 1000)

    def cut_lines(self, inf, outf, lines):
        """Copy the beginning of ``inf`` to ``outf``.

        Copying stops after the line that makes the count exceed ``lines``,
        so at most ``lines + 1`` lines are written (for the lemmatized file:
        the header row plus ``lines`` data rows).

        :param inf: path of the file to read.
        :param outf: path of the file to write (overwritten).
        :param lines: threshold described above.
        """
        # Context managers close both files even when a read or write fails.
        with io.open(inf, "r", encoding="utf-8") as source, \
                io.open(outf, "w", encoding="utf-8") as target:
            for count, line in enumerate(source, start=1):
                target.write(line)
                if count > lines:
                    break

    def lemmatize(self, inf, outf):
        """Write a copy of the MeWeX CSV with a lemmatized-form column added.

        The first two lines of ``inf`` are treated as headers and replaced by
        a single header row. For each remaining row the lemmatizer result is
        inserted as a new fifth tab-separated column.

        :param inf: path of the tab-separated MeWeX output (read as UTF-8).
        :param outf: path of the file to write (UTF-8, overwritten).
        :raises IndexError: when a non-blank row has fewer than five columns
            or its fifth column does not match the expected
            ``<number>:<orth>(<tag>)`` shape.
        """
        orthreg = re.compile(r'[0-9]+:([^(]+)\(([^)]+)\).*')
        basereg = re.compile(r'[^:]+:([^ ]+)')

        # Both files use UTF-8 explicitly so the result does not depend on the
        # process locale; context managers close them on every exit path.
        with io.open(inf, "r", encoding="utf-8") as input_file, \
                io.open(outf, "w", encoding="utf-8") as output_file:
            # First two rows are header rows, so just skip them. The default
            # keeps a shorter (e.g. empty) input from raising StopIteration.
            next(input_file, None)
            next(input_file, None)
            # NOTE(review): "Realtion" looks like a typo for "Relation"; left
            # unchanged because consumers may depend on the exact header.
            output_file.write("Rank\tQuantity\tRealtion\tBase form\tLemmatized form\tAll forms\n")

            for line in input_file:
                stripped = line.strip()
                if not stripped:
                    # Blank lines carry no data and would fail the column lookup.
                    continue
                splited = stripped.split('\t')
                orthtuple = orthreg.findall(splited[4])
                baselist = basereg.findall(splited[3])
                base = u' '.join(baselist)
                orth = orthtuple[0][0].strip()
                tag = orthtuple[0][1]
                result = self._lemmatizer.lemmatizeS(orth, base, tag, False)
                splited.insert(4, result)
                output_file.write('\t'.join(splited) + '\n')
# Converters for options whose values have to be translated before they are
# passed on: each maps the raw value to the entry (or entries) obtained by
# subscripting the corresponding mewexlib object.
_OPT_DISPATCH = dict(
    ranker_func=lambda val: mwl.RankerFunction[val],
    dispersion_func=lambda val: mwl.DispersionFunction[val],
    wccl_rels=lambda val: frozenset(mwl.WCCLRelation[el] for el in val),
)


def _parse_mewex_options(opts):
    """Return a new dict of MeWeX keyword arguments built from ``opts``.

    Values whose key has a converter in ``_OPT_DISPATCH`` are converted;
    all other values are passed through unchanged.

    :param opts: mapping of option name to raw value. It is only read, never
        modified, so the caller's task options stay intact.
    :returns: dict with the same keys and converted values.
    """
    args = {}
    for key, val in opts.items():
        convert = _OPT_DISPATCH.get(key)
        args[key] = val if convert is None else convert(val)
    return args
if __name__ == '__main__':
    # Start the service with MewexWorker as the worker class when run as a script.
    NLPService.main(MewexWorker, pause_at_exit=True)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment