Commit 0dffaf12 authored by Tomasz Walkowiak

Initial commit

parents
# ---- Stage 1 (builder): compile the shared worker library and this module with Maven ----
FROM clarinpl/openjdk:8 as builder
WORKDIR /home/install
# `mvn install` places the nlp.worker artifact in the local Maven repository so that
# the module build below can resolve its nlp.worker dependency.
RUN git clone https://gitlab.clarin-pl.eu/nlpworkers/nlp.worker.git && \
    cd nlp.worker && \
    mvn clean install
# The working directory is still /home/install here: `cd` inside RUN does not persist.
COPY ./module ./module
RUN cd module && \
    mvn clean install

# ---- Stage 2 (runtime): JRE-only image that carries just the built jar ----
FROM clarinpl/openjdk-jre:8
# Labels are declared in the final stage: labels set in the builder stage are
# discarded together with that stage and never reach the published image.
LABEL application="Any2txt"
LABEL description="Client - Workers - converting any files into txt format"
LABEL organization="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology"
LABEL maintainer="tomasz.walkowiak@pwr.edu.pl"
WORKDIR /home/worker
COPY --from=builder /home/install/module/target/nlp.worker.any2txt.jar .
# ---- Stage 1 (builder): compile the shared worker library and this module with Maven ----
FROM clarinpl/openjdk:8 as builder
WORKDIR /home/install
# `mvn install` places the nlp.worker artifact in the local Maven repository so that
# the module build below can resolve its nlp.worker dependency.
RUN git clone https://gitlab.clarin-pl.eu/nlpworkers/nlp.worker.git && \
    cd nlp.worker && \
    mvn clean install
# The working directory is still /home/install here: `cd` inside RUN does not persist.
COPY ./module ./module
RUN cd module && \
    mvn clean install

# ---- Stage 2 (runtime): JRE-only image that carries just the built jar ----
FROM clarinpl/openjdk-jre:8
# Labels are declared in the final stage: labels set in the builder stage are
# discarded together with that stage and never reach the published image.
LABEL application="Any2txt"
LABEL description="Client - Workers - converting any files into txt format"
LABEL organization="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology"
LABEL maintainer="tomasz.walkowiak@pwr.edu.pl"
WORKDIR /home/worker
COPY --from=builder /home/install/module/target/nlp.worker.any2txt.jar .
; WORKER CONFIGURATION FILE
; The file contains the configuration of both the web-service API and the tool.
;
; Author: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
; --------- SERVICE SECTION ---------
; NOTE(review): the rabbit_* keys presumably configure the message-broker
; connection used by the worker framework - confirm against nlp.worker.
; NOTE(review): the broker credentials below are stored in plain text and
; committed with the source - consider supplying them at deploy time instead.
[service]
root = /samba/requests/
tool = any2txt
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
; --------- TOOL SECTION ---------
; max_length is read by Converter.static_init and handed to Tika's
; setMaxStringLength; when the key is absent the converter falls back to
; its built-in default of 1000*1024*1024.
; NOTE(review): workers_number is not read by the converter itself;
; presumably the worker framework uses it - confirm.
[tool]
workers_number = 6
max_length=1000000000
# Docker Compose definition for the any2txt worker container.
version: '3'
services:
  # NOTE(review): the service key is "termopl" although the container is
  # clarin_any2txt - looks like a copy-paste leftover; kept unchanged because
  # the key is what `docker-compose` commands refer to.
  termopl:
    container_name: clarin_any2txt
    # Image is built from the Dockerfile in this directory.
    build: ./
    volumes:
      # Host share mounted at the same path inside the container.
      - /samba:/samba
      # Worker configuration mounted into the working directory.
      - ./config.ini:/home/worker/config.ini
    working_dir: /home/worker/
    # The image defines no entrypoint of its own; start the shaded jar here.
    entrypoint:
      - java
      - -jar
      - nlp.worker.any2txt.jar
    restart: always
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the any2txt worker.
  The shade plugin bundles the module and its dependencies into one jar whose
  file name is fixed by <finalName> (nlp.worker.any2txt.jar).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Artifact coordinates -->
    <groupId>pl.clarin</groupId>
    <artifactId>nlp.worker.any2txt</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Sources are UTF-8 and compiled for Java 8 -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Apache Tika: performs the actual document-to-text extraction -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-app</artifactId>
            <version>1.22</version>
        </dependency>
        <!-- Shared worker framework; must already be installed in the local Maven repository -->
        <dependency>
            <groupId>${project.groupId}</groupId>
            <artifactId>nlp.worker</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Build a single runnable jar during the package phase -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <!-- Record the entry point in the jar manifest so `java -jar` works -->
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>pl.clarin.any2txt.Converter</mainClass>
                                </transformer>
                            </transformers>
                            <!-- Drop the signature files of every bundled artifact from the merged jar -->
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <finalName>nlp.worker.any2txt</finalName>
    </build>
</project>
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package pl.clarin.any2txt;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.json.JSONObject;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
import pl.clarin.ws.worker.Worker;
/**
 * Worker that extracts the plain text of an input file with Apache Tika and
 * writes it to an output file.
 *
 * <p>The maximum length of the extracted text can be configured with the
 * {@code max_length} key of the {@code [tool]} section of the worker's ini file.
 *
 * @author Tomasz Walkowiak
 */
public class Converter extends Worker {

    private static final Logger LOGGER = Logger.getLogger(Converter.class.getName());

    /**
     * Characters that are replaced by a single space in the extracted text:
     * format, private-use, surrogate and unassigned code points plus part of
     * the C0 control range. Compiled once because the pattern never changes.
     *
     * <p>NOTE(review): the control ranges skip U+000A..U+0010, so besides LF the
     * characters VT, FF, CR, SO, SI and DLE are left untouched, while TAB
     * (U+0009) is replaced - confirm this is intended.
     */
    private static final Pattern UNWANTED_CHARS =
            Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");

    /**
     * Upper bound handed to {@link Tika#setMaxStringLength(int)}.
     *
     * <p>NOTE(review): this is an instance field that is only assigned in
     * {@link #static_init(IniFile)}; if the framework creates a separate
     * instance per thread, those instances would keep the default - verify
     * against the Service implementation.
     */
    private int maxLength = (1000 * 1024 * 1024);

    /**
     * Per-thread initialisation hook (per the original author's note);
     * this worker has nothing to set up.
     */
    @Override
    public void init() throws Exception {
    }

    /**
     * Initialisation hook for state shared by threads (per the original
     * author's note). Reads {@code [tool] max_length} when the key is present.
     *
     * @param init parsed worker configuration
     */
    @Override
    public void static_init(IniFile init) throws Exception {
        if (init.hasKey("tool", "max_length")) {
            maxLength = init.getInt("tool", "max_length", maxLength);
        }
    }

    /**
     * JSON-based processing variant; left without an implementation in this worker.
     */
    @Override
    public void process(JSONObject data, JSONObject param) throws Exception {
        // intentionally empty
    }

    /**
     * Converts {@code fileIn} to text and writes the result to {@code fileOut}
     * encoded as UTF-8.
     *
     * <p>Failures are logged and not rethrown: a missing input or a Tika
     * failure is logged at WARNING level and no output is written; a failure
     * while writing is logged at SEVERE level.
     *
     * @param fileIn  path of the document to convert
     * @param fileOut path of the text file to create or overwrite
     * @param param   task parameters; not used by this worker
     */
    @Override
    public void process(String fileIn, String fileOut, JSONObject param) {
        final String text;
        try {
            text = extractText(new File(fileIn));
        } catch (FileNotFoundException ex) {
            // Pass the exception itself: the (Level, String, Object) overload only
            // shows its argument when the message contains a {0} placeholder.
            LOGGER.log(Level.WARNING, "File not found", ex);
            return;
        } catch (IOException | TikaException ex) {
            LOGGER.log(Level.WARNING, "Porblems in Tika processing", ex);
            return;
        }

        // Explicit UTF-8 so the output does not depend on the JVM's default charset.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileOut), StandardCharsets.UTF_8))) {
            writer.write(text);
        } catch (IOException exception) {
            LOGGER.log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
        }
    }

    /**
     * Parses {@code file} with Tika and returns its text with the characters
     * matched by {@link #UNWANTED_CHARS} replaced by spaces.
     */
    private String extractText(File file) throws IOException, TikaException {
        Metadata metadata = new Metadata();
        // NOTE(review): presumably a charset hint for documents that do not
        // declare their own encoding - confirm.
        metadata.set("Content-Encoding", "CP1250");

        Tika tika = new Tika();
        tika.setMaxStringLength(maxLength);

        // try-with-resources guarantees the stream is released on every path.
        try (TikaInputStream input = TikaInputStream.get(file, metadata)) {
            String raw = tika.parseToString(input, metadata);
            return UNWANTED_CHARS.matcher(raw).replaceAll(" ");
        }
    }

    /**
     * Entry point: hands this worker class to the framework's {@link Service}.
     */
    public static void main(String[] args) {
        new Service<>(Converter.class);
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment