// Select Git revision
// Converter.java
//
// Paweł Walkowiak authored
// Converter.java 5.62 KiB
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package pl.clarin.any2txt;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.tika.Tika;
import org.apache.tika.detect.TextDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.XMLParser;
import org.json.JSONObject;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
import pl.clarin.ws.worker.Worker;
/**
* Class implementing file converter.
*
* Converts files such as doc, docx, xlsx, txt and others
* to a text file in UTF-8 encoding.
*
* The maximum file length is specified in the max_length configuration option.
*
* Class uses {@link Tika Apache Tika}
*
* @author Tomasz Walkowiak
*/
public class Converter extends Worker {

    /** Logger shared by all instances of this worker. */
    private static final Logger LOGGER = Logger.getLogger(Converter.class.getName());

    /**
     * Characters that are replaced by a single space in the extracted text:
     * Unicode categories Cf, Co, Cs and Cn plus the code points
     * U+0000-U+0009 and U+0011-U+001F. Compiled once because the pattern is
     * constant.
     *
     * NOTE(review): the two hexadecimal ranges leave U+000A-U+0010 (LF, VT,
     * FF, CR, SO, SI, DLE) untouched. If the intent was "everything except
     * line feed", the bounds were probably meant as decimal 9 and 11 --
     * confirm before changing, since it would alter CR handling.
     */
    private static final Pattern UNWANTED_CHARS =
            Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");

    /** Detected media type for which the plain {@link TextDetector} is used instead. */
    private static final String MS_DOWNLOAD_TYPE = "application/x-msdownload";

    /** Marker looked for in Tika's error message to trigger the DOM fallback. */
    private static final String XML_PARSE_ERROR = "XML parse error";

    /**
     * Limit handed to {@link Tika#setMaxStringLength(int)}; may be overridden
     * by the {@code max_length} key of the {@code tool} configuration section.
     */
    private static int maxLength = (1000 * 1024 * 1024);

    /** Per-thread initialisation hook; this worker has nothing to set up. */
    @Override
    public void init() throws Exception {
    }

    /**
     * Initialisation shared by all threads: reads the optional
     * {@code tool/max_length} setting and prints the effective value.
     *
     * @param init configuration file of the service
     * @throws Exception declared by the overridden method
     */
    @Override
    public void static_init(IniFile init) throws Exception {
        if (init.hasKey("tool", "max_length")) {
            maxLength = init.getInt("tool", "max_length", maxLength);
        }
        System.out.println("MaxLength: " + maxLength);
    }

    /**
     * JSON based entry point; it has no implementation in this class.
     */
    @Override
    public void process(JSONObject data, JSONObject param) throws Exception {
        // ....
    }

    /**
     * Extracts the text of {@code fileIn} with Apache Tika and stores it in
     * {@code fileOut} encoded as UTF-8.
     *
     * If the input cannot be found or Tika fails, an empty output file is
     * written. The one exception is a Tika failure whose message contains
     * "XML parse error": then the file is re-read with a plain DOM parser and
     * the text content of its root element is written instead.
     *
     * @param fileIn  path of the document to convert
     * @param fileOut path of the text file to create or overwrite
     * @param param   task parameters; not used by this worker
     */
    @Override
    public void process(String fileIn, String fileOut, JSONObject param) {
        LOGGER.log(Level.INFO, "Input file: " + fileIn);
        File file = new File(fileIn);
        Metadata metadata = new Metadata();
        metadata.set("Content-Encoding", "CP1250");

        String txt;
        // try-with-resources: the stream is released even when detection fails.
        try (TikaInputStream inS = TikaInputStream.get(file, metadata)) {
            Tika tika = new Tika();
            String detectedType = tika.detect(inS);
            LOGGER.log(Level.INFO, "Detected file type: " + detectedType);
            if (MS_DOWNLOAD_TYPE.equals(detectedType)) {
                LOGGER.log(Level.INFO, "Using TextDetector for type: " + detectedType);
                tika = new Tika(new TextDetector());
            }
            tika.setMaxStringLength(maxLength);
            txt = sanitize(tika.parseToString(inS, metadata));
        } catch (FileNotFoundException ex) {
            LOGGER.log(Level.WARNING, "File not found: " + fileIn, ex);
            saveEmptyFile(fileOut);
            return;
        } catch (IOException | TikaException ex) {
            LOGGER.log(Level.WARNING, "Problems in Tika processing " + ex.getMessage(), ex);
            String message = ex.getMessage();
            if (message != null && message.contains(XML_PARSE_ERROR)) {
                convertXmlWithDom(file, fileOut);
            } else {
                saveEmptyFile(fileOut);
            }
            return;
        }

        // Write failures are reported separately so that they are never
        // mistaken for an extraction problem.
        try {
            writeText(fileOut, txt);
        } catch (IOException exception) {
            LOGGER.log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
        }
    }

    /**
     * Starts the service with this worker class.
     *
     * @param args command line arguments; not used
     */
    public static void main(String[] args) {
        new Service<>(Converter.class);
    }

    /**
     * Fallback for inputs Tika rejected with an XML parse error: parses the
     * file with a DOM parser and writes the text content of the root element.
     * An empty output file is written when this parser fails as well.
     */
    private void convertXmlWithDom(File file, String fileOut) {
        String text;
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // The input is an arbitrary user file: keep the parser from
            // resolving external entities or fetching external DTDs.
            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(file);
            doc.getDocumentElement().normalize();
            LOGGER.log(Level.INFO, "Root element :" + doc.getDocumentElement().getNodeName());
            String content = doc.getDocumentElement().getTextContent();
            text = sanitize(content == null ? "" : content);
        } catch (IOException | SAXException | ParserConfigurationException e) {
            LOGGER.log(Level.WARNING, "Problems with XML parsing " + e.getMessage(), e);
            saveEmptyFile(fileOut);
            return;
        }
        try {
            writeText(fileOut, text);
        } catch (IOException exception) {
            LOGGER.log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
        }
    }

    /** Replaces every character matched by {@link #UNWANTED_CHARS} with a space. */
    private static String sanitize(String text) {
        return UNWANTED_CHARS.matcher(text).replaceAll(" ");
    }

    /** Writes {@code text} to {@code fileOut} as UTF-8, replacing any existing file. */
    private static void writeText(String fileOut, String text) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileOut), StandardCharsets.UTF_8))) {
            writer.write(text);
        }
    }

    /** Creates (or truncates) {@code fileOut} as an empty file; failures are only logged. */
    private void saveEmptyFile(String fileOut) {
        try {
            writeText(fileOut, "");
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, "Problems with writing: " + fileOut, ex);
        }
    }
}