Skip to content
Snippets Groups Projects
Commit 0dffaf12 authored by Tomasz Walkowiak's avatar Tomasz Walkowiak
Browse files

Initial commit

parents
Branches
No related tags found
No related merge requests found
# ---- Build stage -----------------------------------------------------------
# Compiles the shared nlp.worker library and then the any2txt module.
# Nothing from this stage except the final jar reaches the runtime image.
FROM clarinpl/openjdk:8 AS builder

# Build the nlp.worker library and install it into the local Maven repository
# so the module build below can resolve it.
# NOTE(review): the clone follows the head of the default branch, so builds are
# not reproducible; consider pinning a tag or commit.
WORKDIR /home/install
RUN git clone --depth 1 https://gitlab.clarin-pl.eu/nlpworkers/nlp.worker.git
WORKDIR /home/install/nlp.worker
RUN mvn clean install

# Build the any2txt module itself. WORKDIR is used instead of "RUN cd ..." so
# the working directory is explicit for every instruction.
WORKDIR /home/install
COPY ./module ./module
WORKDIR /home/install/module
RUN mvn clean install

# ---- Runtime stage ---------------------------------------------------------
FROM clarinpl/openjdk-jre:8

# Labels are declared here because labels set on the builder stage are
# discarded together with that stage and never appear on the final image.
LABEL application="Any2txt" \
      description="Client - Workers - converting any files into txt format" \
      organization="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology" \
      maintainer="tomasz.walkowiak@pwr.edu.pl"

WORKDIR /home/worker
COPY --from=builder /home/install/module/target/nlp.worker.any2txt.jar .
# ---- Build stage -----------------------------------------------------------
# Compiles the shared nlp.worker library and then the any2txt module.
# Nothing from this stage except the final jar reaches the runtime image.
FROM clarinpl/openjdk:8 AS builder

# Build the nlp.worker library and install it into the local Maven repository
# so the module build below can resolve it.
# NOTE(review): the clone follows the head of the default branch, so builds are
# not reproducible; consider pinning a tag or commit.
WORKDIR /home/install
RUN git clone --depth 1 https://gitlab.clarin-pl.eu/nlpworkers/nlp.worker.git
WORKDIR /home/install/nlp.worker
RUN mvn clean install

# Build the any2txt module itself. WORKDIR is used instead of "RUN cd ..." so
# the working directory is explicit for every instruction.
WORKDIR /home/install
COPY ./module ./module
WORKDIR /home/install/module
RUN mvn clean install

# ---- Runtime stage ---------------------------------------------------------
FROM clarinpl/openjdk-jre:8

# Labels are declared here because labels set on the builder stage are
# discarded together with that stage and never appear on the final image.
LABEL application="Any2txt" \
      description="Client - Workers - converting any files into txt format" \
      organization="NLP Tools for Polish from G4.19 Group - Wroclaw University of Science and Technology" \
      maintainer="tomasz.walkowiak@pwr.edu.pl"

WORKDIR /home/worker
COPY --from=builder /home/install/module/target/nlp.worker.any2txt.jar .
; WORKER CONFIGURATION FILE
; This file contains the configuration of both the web-service API and the tool.
;
; Author: Tomasz Walkowiak
; email: tomasz.walkowiak@pwr.edu.pl
;
; [tool] max_length is read by the any2txt Converter and passed to Tika as the
; maximum length of the extracted text.
; NOTE(review): the RabbitMQ host and credentials below are stored in plain
; text; consider supplying them at deployment time instead of committing them.
; --------- SERVICE SECTION ---------
[service]
root = /samba/requests/
tool = any2txt
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 6
max_length=1000000000
# Compose definition for the any2txt worker container.
version: '3'

services:
  # NOTE(review): the service key is "termopl" although the container runs
  # any2txt; it looks copied from another project. It is left unchanged because
  # renaming it would change the name used in compose commands.
  termopl:
    build: ./
    container_name: clarin_any2txt
    restart: always
    working_dir: /home/worker/
    # Starts the shaded worker jar that the image places in /home/worker.
    entrypoint: ["java", "-jar", "nlp.worker.any2txt.jar"]
    volumes:
      # Shared directory; config.ini points the service root at /samba/requests/.
      - /samba:/samba
      # Worker configuration mounted next to the jar.
      - ./config.ini:/home/worker/config.ini
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the any2txt worker: produces a single executable (shaded) jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>pl.clarin</groupId>
    <artifactId>nlp.worker.any2txt</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Apache Tika: performs the actual document-to-text extraction. -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-app</artifactId>
            <version>1.22</version>
        </dependency>
        <!-- Shared worker framework; must be installed in the local repository first. -->
        <dependency>
            <groupId>${project.groupId}</groupId>
            <artifactId>nlp.worker</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <!-- Makes the shaded jar runnable with "java -jar". -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>pl.clarin.any2txt.Converter</mainClass>
                                </transformer>
                                <!-- Merges META-INF/services files from all dependencies instead of
                                     keeping only one of them, so every ServiceLoader-registered
                                     provider survives shading. -->
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                            </transformers>
                            <filters>
                                <!-- Drop dependency signature files; they would no longer match
                                     the contents of the repackaged jar. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <!-- Fixed jar name expected by the Dockerfile and the compose entrypoint. -->
        <finalName>nlp.worker.any2txt</finalName>
    </build>
</project>
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package pl.clarin.any2txt;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.json.JSONObject;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
import pl.clarin.ws.worker.Worker;
/**
 * Worker that extracts plain text from an input document with Apache Tika and
 * writes the cleaned-up text to an output file.
 *
 * @author Tomasz Walkowiak
 */
public class Converter extends Worker {

    private static final Logger LOGGER = Logger.getLogger(Converter.class.getName());

    /**
     * Characters that are replaced by a single space in the extracted text:
     * Unicode format, private-use, surrogate and unassigned code points plus
     * the listed control characters.
     * NOTE(review): the ranges are hexadecimal, so TAB (0x09) is replaced while
     * 0x0A-0x10 are kept; confirm this is the intended set.
     */
    private static final Pattern UNWANTED_CHARS =
            Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");

    /**
     * Maximum length of the text Tika may return. Static so that the value
     * loaded in {@link #static_init(IniFile)} is seen by every worker instance,
     * whichever instance the framework calls that method on.
     */
    private static int maxLength = (1000 * 1024 * 1024);

    /** Per-thread initialisation hook; nothing to set up here. */
    @Override
    public void init() throws Exception {
    }

    /**
     * Initialises state shared by all worker threads.
     *
     * @param init parsed configuration; {@code [tool] max_length}, when present,
     *             overrides the default maximum text length
     * @throws Exception declared by the overridden method
     */
    @Override
    public void static_init(IniFile init) throws Exception {
        if (init.hasKey("tool", "max_length")) {
            maxLength = init.getInt("tool", "max_length", maxLength);
        }
    }

    /** JSON-based processing is not implemented by this worker. */
    @Override
    public void process(JSONObject data, JSONObject param) throws Exception {
        // ....
    }

    /**
     * Converts {@code fileIn} to plain text and stores the result in
     * {@code fileOut} encoded as UTF-8. Failures are logged and no exception is
     * propagated; when extraction fails no output file is written.
     *
     * @param fileIn  path of the document to convert
     * @param fileOut path of the text file to create
     * @param param   task parameters (not used by this method)
     */
    @Override
    public void process(String fileIn, String fileOut, JSONObject param) {
        File file = new File(fileIn);
        Metadata metadata = new Metadata();
        metadata.set("Content-Encoding", "CP1250");

        String txt;
        // try-with-resources releases the stream even when extraction fails.
        try (TikaInputStream inS = TikaInputStream.get(file, metadata)) {
            Tika tika = new Tika();
            tika.setMaxStringLength(maxLength);
            String extracted = tika.parseToString(inS, metadata);
            txt = UNWANTED_CHARS.matcher(extracted).replaceAll(" ");
        } catch (FileNotFoundException ex) {
            // Pass the exception itself: a plain parameter is ignored when the
            // message contains no {0} placeholder.
            LOGGER.log(Level.WARNING, "File not found", ex);
            return;
        } catch (IOException | TikaException ex) {
            LOGGER.log(Level.WARNING, "Problems in Tika processing", ex);
            return;
        }

        // Explicit UTF-8 so the output does not depend on the platform default charset.
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileOut), StandardCharsets.UTF_8))) {
            bufferedWriter.write(txt);
        } catch (IOException exception) {
            LOGGER.log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
        }
    }

    /** Starts the worker service for this converter. */
    public static void main(String[] args) {
        new Service<>(Converter.class);
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment