Skip to content
Snippets Groups Projects
Commit b94e9072 authored by Tomasz Naskret's avatar Tomasz Naskret
Browse files

Fix misdetection of text files starting with "MZ" as MS-DOS executable files

parent a84b9068
Branches
No related tags found
No related merge requests found
......@@ -17,7 +17,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-app</artifactId>
<version>1.22</version>
<version>1.24</version>
</dependency>
<dependency>
......@@ -25,6 +25,12 @@
<artifactId>nlp.worker</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
......
......@@ -20,31 +20,29 @@ import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.detect.TextDetector;
import org.json.JSONObject;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
import pl.clarin.ws.worker.Worker;
/**
*
* @author Tomasz Walkowiak
*/
public class Converter extends Worker {
//init object for each thread
@Override
public void init() throws Exception
{
public void init() throws Exception {
}
private int maxLength = (1000 * 1024 * 1024);
//init objects shared by threads
@Override
public void static_init(IniFile init) throws Exception
{ if (init.hasKey("tool", "max_length"))
public void static_init(IniFile init) throws Exception {
if (init.hasKey("tool", "max_length"))
maxLength = init.getInt("tool", "max_length", maxLength);
System.out.println("MaxLength: " + maxLength);
......@@ -65,6 +63,11 @@ public class Converter extends Worker {
inS = TikaInputStream.get(file, metadata);
Tika tika = new Tika();
if("application/x-msdownload".equals(tika.detect(inS))){
tika = new Tika(new TextDetector());
}
tika.setMaxStringLength(maxLength);
String txt = tika.parseToString(inS, metadata);
Pattern p = Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");
......@@ -83,7 +86,6 @@ public class Converter extends Worker {
}
} catch (FileNotFoundException ex) {
Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "File not found", ex.getMessage());
} catch (IOException | TikaException ex) {
......@@ -96,7 +98,5 @@ public class Converter extends Worker {
new Service<>(Converter.class);
}
}
import org.apache.tika.detect.TextDetector;
import org.junit.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
public class TikaTest {
@Test
public void testTextWithMZ() {
String fileOut = "out.txt";
URL path = ClassLoader.getSystemResource("test.odt");
File file = new File(path.getFile());
TikaInputStream inS;
try {
Metadata metadata = new Metadata();
metadata.set("Content-Encoding", "CP1250");
inS = TikaInputStream.get(file, metadata);
Tika tika = new Tika();
if("application/x-msdownload".equals(tika.detect(inS))){
tika = new Tika(new TextDetector());
}
tika.setMaxStringLength(1000 * 1024 * 1024);
String txt = tika.parseToString(inS, metadata);
Pattern p = Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");
txt = p.matcher(txt).replaceAll(" ");
System.out.println(txt);
try {
FileWriter fileWriter = new FileWriter(fileOut);
try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
bufferedWriter.write(txt);
}
fileWriter.close();
} catch (IOException exception) {
Logger.getLogger(TikaTest.class.getName()).log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
}
} catch (Exception ex){
ex.printStackTrace();
}
}
}
File added
File added
MZ Jeśli ktoś/coś jest <x#subst:sg:inst:%>, to musi być <y#subst:sg:inst:%>
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment