Commit b94e9072 authored by Tomasz Naskret's avatar Tomasz Naskret

Fix detection of text files starting with "MZ", which were misidentified as MS-DOS executables

parent a84b9068
......@@ -17,7 +17,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-app</artifactId>
<version>1.22</version>
<version>1.24</version>
</dependency>
<dependency>
......@@ -25,6 +25,12 @@
<artifactId>nlp.worker</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
......
......@@ -20,83 +20,83 @@ import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.detect.TextDetector;
import org.json.JSONObject;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
import pl.clarin.ws.worker.Worker;
/**
* Worker that extracts text from an input file with Apache Tika, replaces
* characters matching a fixed character class with spaces, and writes the
* result to an output file (see process(String, String, JSONObject)).
*
* NOTE(review): this listing appears to be a rendered diff. Several statements
* occur twice in a row (old formatting followed by new formatting), so it is
* not compilable as shown.
*
* @author Tomasz Walkowiak
*/
public class Converter extends Worker {
//init object for each thread
//init object for each thread
// Intentionally empty: no per-instance setup is performed here.
@Override
public void init() throws Exception
{
public void init() throws Exception {
}
// Upper bound handed to Tika.setMaxStringLength(); defaults to 1000 * 1024 * 1024
// and can be overridden through the "tool" / "max_length" ini key in static_init.
private int maxLength=(1000*1024*1024);
private int maxLength = (1000 * 1024 * 1024);
//init objects shared by threads
// Reads "max_length" from the "tool" section when the key is present and
// prints the effective value. Only the assignment is guarded by the if;
// the println always runs.
// NOTE(review): maxLength is an instance field, while the comment above talks
// about objects shared by threads - confirm how Worker invokes static_init.
@Override
public void static_init(IniFile init) throws Exception
{ if (init.hasKey("tool", "max_length"))
maxLength=init.getInt("tool", "max_length", maxLength);
System.out.println("MaxLength: "+maxLength);
public void static_init(IniFile init) throws Exception {
if (init.hasKey("tool", "max_length"))
maxLength = init.getInt("tool", "max_length", maxLength);
System.out.println("MaxLength: " + maxLength);
}
// JSON-based entry point; its body is elided in this listing.
@Override
public void process(JSONObject data, JSONObject param) throws Exception {
// ....
}
// ....
}
/**
* Extracts the text of fileIn with Tika and writes it to fileOut.
* Failures are logged and not rethrown.
*
* @param fileIn path of the file to read
* @param fileOut path of the text file to write
* @param param not read by this method
*/
@Override
public void process(String fileIn, String fileOut, JSONObject param) {
File file = new File(fileIn);
// NOTE(review): inS is never closed explicitly in this method; consider
// try-with-resources so it is released when detect() throws.
TikaInputStream inS;
try {
try {
Metadata metadata = new Metadata();
metadata.set("Content-Encoding", "CP1250");
inS = TikaInputStream.get(file, metadata);
Tika tika =new Tika();
Tika tika = new Tika();
// Input detected as application/x-msdownload is re-handled with a Tika
// instance built on TextDetector only. Per the commit message this targets
// text files that start with "MZ".
if("application/x-msdownload".equals(tika.detect(inS))){
tika = new Tika(new TextDetector());
}
tika.setMaxStringLength(maxLength);
String txt=tika.parseToString(inS,metadata);
String txt = tika.parseToString(inS, metadata);
// Replace every match of the class below with a single space: Unicode
// categories Cf, Co, Cs, Cn plus the ranges \x00-\x09 and \x11-\x1f.
// NOTE(review): \x0a-\x10 are left untouched; if only the newline (\x0a,
// decimal 10) was meant to survive, the second range should start at \x0b -
// confirm intent.
Pattern p = Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");
txt = p.matcher(txt).replaceAll(" ");
try {
try {
FileWriter fileWriter = new FileWriter(fileOut);
try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
bufferedWriter.write(txt);
}
// Redundant: closing the BufferedWriter above already closes fileWriter.
fileWriter.close();
} catch (IOException exception) {
Logger.getLogger(Converter.class.getName()).log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
}
}
// NOTE(review): the two log calls below use Logger.log(Level, String, Object).
// Their messages contain no "{0}" placeholder, so ex.getMessage() is not
// rendered; passing ex itself would log the cause. The second message also
// contains the typo "Porblems".
} catch (FileNotFoundException ex) {
Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "File not found", ex.getMessage());
} catch (IOException | TikaException ex) {
Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Porblems in Tika processing", ex.getMessage());
}
}
}
// Entry point: constructs a Service for Converter.class (Service is defined
// outside this listing).
public static void main(String[] args) {
new Service<>(Converter.class);
new Service<>(Converter.class);
}
}
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.detect.TextDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.junit.Assert;
import org.junit.Test;
/**
 * Regression test for text input whose first bytes are "MZ" and which may
 * therefore be detected as application/x-msdownload.
 *
 * The extraction steps mirror Converter.process(String, String, JSONObject):
 * detect, fall back to a TextDetector-only Tika instance, parse, then replace
 * unwanted characters with spaces.
 */
public class TikaTest {

    /**
     * Same character class as the converter uses.
     * NOTE(review): the ranges leave \x0a-\x10 untouched; if only the newline
     * (\x0a) was meant to survive, the second range should start at \x0b.
     */
    private static final Pattern UNWANTED_CHARS =
            Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");

    /**
     * Parses the "test.odt" classpath resource and checks that its text,
     * including the leading "MZ", is extracted.
     *
     * @throws Exception on any detection or parsing failure, so that JUnit
     *         reports the test as failed instead of silently passing
     */
    @Test
    public void testTextWithMZ() throws Exception {
        String fileOut = "out.txt";
        URL path = ClassLoader.getSystemResource("test.odt");
        Assert.assertNotNull("test.odt is missing from the test classpath", path);
        // new File(URI) decodes %-escapes (e.g. spaces in the checkout path);
        // URL.getFile() returns them undecoded.
        File file = new File(path.toURI());

        Metadata metadata = new Metadata();
        metadata.set("Content-Encoding", "CP1250");

        String txt;
        try (TikaInputStream inS = TikaInputStream.get(file, metadata)) {
            Tika tika = new Tika();
            if ("application/x-msdownload".equals(tika.detect(inS))) {
                // Treat the "executable" as plain text.
                tika = new Tika(new TextDetector());
            }
            tika.setMaxStringLength(1000 * 1024 * 1024);
            txt = tika.parseToString(inS, metadata);
        }
        txt = UNWANTED_CHARS.matcher(txt).replaceAll(" ");
        System.out.println(txt);

        Assert.assertTrue("expected the text content (starting with MZ) to be extracted, got: " + txt,
                txt.contains("MZ"));

        // Writing the extracted text is best-effort: a failure here is logged,
        // not treated as a test failure.
        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(fileOut))) {
            bufferedWriter.write(txt);
        } catch (IOException exception) {
            Logger.getLogger(TikaTest.class.getName()).log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
        }
    }
}
MZ Jeśli ktoś/coś jest <x#subst:sg:inst:%>, to musi być <y#subst:sg:inst:%>
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment