Skip to content
Snippets Groups Projects
Commit b94e9072 authored by Tomasz Naskret's avatar Tomasz Naskret
Browse files

fix for detection of text file starting with MZ as MSDOS exec file

parent a84b9068
Branches
No related merge requests found
......@@ -17,7 +17,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-app</artifactId>
<version>1.22</version>
<version>1.24</version>
</dependency>
<dependency>
......@@ -25,6 +25,12 @@
<artifactId>nlp.worker</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
......
......@@ -20,83 +20,83 @@ import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.detect.TextDetector;
import org.json.JSONObject;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
import pl.clarin.ws.worker.Worker;
/**
*
* @author Tomasz Walkowiak
*/
public class Converter extends Worker {
//init object for each thread
//init object for each thread
@Override
public void init() throws Exception
{
public void init() throws Exception {
}
private int maxLength=(1000*1024*1024);
private int maxLength = (1000 * 1024 * 1024);
//init objects shared by threads
@Override
public void static_init(IniFile init) throws Exception
{ if (init.hasKey("tool", "max_length"))
maxLength=init.getInt("tool", "max_length", maxLength);
System.out.println("MaxLength: "+maxLength);
public void static_init(IniFile init) throws Exception {
if (init.hasKey("tool", "max_length"))
maxLength = init.getInt("tool", "max_length", maxLength);
System.out.println("MaxLength: " + maxLength);
}
@Override
public void process(JSONObject data, JSONObject param) throws Exception {
// ....
}
// ....
}
@Override
public void process(String fileIn, String fileOut, JSONObject param) {
File file = new File(fileIn);
TikaInputStream inS;
try {
try {
Metadata metadata = new Metadata();
metadata.set("Content-Encoding", "CP1250");
inS = TikaInputStream.get(file, metadata);
Tika tika =new Tika();
Tika tika = new Tika();
if("application/x-msdownload".equals(tika.detect(inS))){
tika = new Tika(new TextDetector());
}
tika.setMaxStringLength(maxLength);
String txt=tika.parseToString(inS,metadata);
String txt = tika.parseToString(inS, metadata);
Pattern p = Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");
txt = p.matcher(txt).replaceAll(" ");
try {
try {
FileWriter fileWriter = new FileWriter(fileOut);
try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
bufferedWriter.write(txt);
}
fileWriter.close();
} catch (IOException exception) {
Logger.getLogger(Converter.class.getName()).log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
}
}
} catch (FileNotFoundException ex) {
Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "File not found", ex.getMessage());
} catch (IOException | TikaException ex) {
Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Porblems in Tika processing", ex.getMessage());
}
}
}
public static void main(String[] args) {
new Service<>(Converter.class);
new Service<>(Converter.class);
}
}
import org.apache.tika.detect.TextDetector;
import org.junit.Test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
public class TikaTest {
@Test
public void testTextWithMZ() {
String fileOut = "out.txt";
URL path = ClassLoader.getSystemResource("test.odt");
File file = new File(path.getFile());
TikaInputStream inS;
try {
Metadata metadata = new Metadata();
metadata.set("Content-Encoding", "CP1250");
inS = TikaInputStream.get(file, metadata);
Tika tika = new Tika();
if("application/x-msdownload".equals(tika.detect(inS))){
tika = new Tika(new TextDetector());
}
tika.setMaxStringLength(1000 * 1024 * 1024);
String txt = tika.parseToString(inS, metadata);
Pattern p = Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");
txt = p.matcher(txt).replaceAll(" ");
System.out.println(txt);
try {
FileWriter fileWriter = new FileWriter(fileOut);
try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
bufferedWriter.write(txt);
}
fileWriter.close();
} catch (IOException exception) {
Logger.getLogger(TikaTest.class.getName()).log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
}
} catch (Exception ex){
ex.printStackTrace();
}
}
}
File added
File added
MZ Jeśli ktoś/coś jest <x#subst:sg:inst:%>, to musi być <y#subst:sg:inst:%>
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment