Skip to content
Snippets Groups Projects
Select Git revision
  • 879398ed370aaa90fd587cc3644f692b7608418c
  • master default protected
  • support_xml
  • new_ws
  • repo_java protected
  • develop protected
6 results

Converter.java

Blame
  • Converter.java 5.62 KiB
    /*
     * To change this license header, choose License Headers in Project Properties.
     * To change this template file, choose Tools | Templates
     * and open the template in the editor.
     */
    
    package pl.clarin.any2txt;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.nio.charset.StandardCharsets;
    
    import java.util.logging.Level;
    import java.util.logging.Logger;
    import java.util.regex.Pattern;
    
    import org.apache.tika.Tika;
    import org.apache.tika.exception.TikaException;
    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.detect.TextDetector;
    import org.apache.tika.parser.xml.XMLParser;
    import org.apache.tika.parser.ParseContext;
    
    import org.xml.sax.SAXException;
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.parsers.DocumentBuilder;
    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Document;
    
    import org.json.JSONObject;
    import pl.clarin.ws.worker.IniFile;
    import pl.clarin.ws.worker.Service;
    import pl.clarin.ws.worker.Worker;
    
    
    /**
     * Worker that converts an input file to a plain-text file.
     *
     * Handles files such as doc, docx, xlsx, txt and others; the extracted
     * text is written to the output file in UTF-8 encoding.
     *
     * The maximum length of the extracted text is taken from the
     * {@code max_length} option of the {@code tool} configuration section.
     *
     * Text extraction is done with {@link Tika Apache Tika}. When Tika reports
     * an "XML parse error", the file is re-read with a DOM parser and the text
     * content of the root element is used instead. If no text can be obtained,
     * an empty output file is written so that an output file exists for every
     * failure that is handled here.
     *
     * @author Tomasz Walkowiak
     */
    public class Converter extends Worker {

        private static final Logger LOGGER = Logger.getLogger(Converter.class.getName());

        /** Detected MIME type for which parsing is retried with a plain {@link TextDetector}. */
        private static final String MS_DOWNLOAD_TYPE = "application/x-msdownload";

        /** Fragment of the Tika exception message that triggers the DOM fallback. */
        private static final String XML_PARSE_ERROR = "XML parse error";

        /**
         * Characters replaced by a single space in the extracted text: Unicode
         * format, private-use, surrogate and unassigned code points plus the
         * control characters {@code \x00-\x09} and {@code \x11-\x1f}.
         * Characters {@code \x0a-\x10} (which include LF and CR) are not part
         * of the class and are therefore kept.
         * NOTE(review): TAB ({@code \x09}) is replaced while {@code \x10} is
         * kept - confirm that these bounds are intended.
         */
        private static final Pattern UNWANTED_CHARS =
                Pattern.compile("[\\p{Cf}\\p{Co}\\p{Cs}\\p{Cn}\\x00-\\x09\\x11-\\x1f]");

        /** Upper bound passed to {@link Tika#setMaxStringLength(int)}; overridable from configuration. */
        private static int maxLength = (1000 * 1024 * 1024);

        //init object for each thread
        @Override
        public void init() throws Exception {
        }

        //init objects shared by threads
        /**
         * Reads the optional {@code tool/max_length} setting and prints the
         * effective value to standard output.
         *
         * @param init configuration file of the service
         * @throws Exception declared by the overridden method
         */
        @Override
        public void static_init(IniFile init) throws Exception {
            if (init.hasKey("tool", "max_length")) {
                maxLength = init.getInt("tool", "max_length", maxLength);
            }
            System.out.println("MaxLength: " + maxLength);
        }

        @Override
        public void process(JSONObject data, JSONObject param) throws Exception {
            // ....
        }

        /**
         * Extracts the text of {@code fileIn} and writes it to {@code fileOut}.
         *
         * An empty output file is written when the input file is missing, when
         * Tika fails for a reason other than an XML parse error, or when the
         * DOM fallback fails as well.
         *
         * @param fileIn  path of the file to convert
         * @param fileOut path of the text file to create
         * @param param   task parameters (not used by this method)
         */
        @Override
        public void process(String fileIn, String fileOut, JSONObject param) {
            LOGGER.log(Level.INFO, "Input file: " + fileIn);
            File file = new File(fileIn);

            String txt;
            try {
                txt = extractWithTika(file);
            } catch (FileNotFoundException ex) {
                LOGGER.log(Level.WARNING, "File not found", ex);
                saveEmptyFile(fileOut);
                return;
            } catch (IOException | TikaException ex) {
                LOGGER.log(Level.WARNING, "Problems in Tika processing " + ex.getMessage(), ex);
                // The exception message may be null, so guard before inspecting it.
                String message = ex.getMessage();
                if (message == null || !message.contains(XML_PARSE_ERROR)) {
                    saveEmptyFile(fileOut);
                    return;
                }
                try {
                    txt = extractWithDom(file);
                } catch (IOException | SAXException | ParserConfigurationException e) {
                    LOGGER.log(Level.WARNING, "Problems with XML parsing " + e.getMessage(), e);
                    saveEmptyFile(fileOut);
                    return;
                }
            }

            try {
                writeText(fileOut, txt);
            } catch (IOException exception) {
                LOGGER.log(Level.SEVERE, "Problems with writing: " + fileOut, exception);
            }
        }

        /**
         * Runs Tika on the file and returns the cleaned-up text.
         *
         * @param file file to convert
         * @return extracted text with unwanted characters replaced by spaces
         * @throws IOException   if the file cannot be opened or read
         * @throws TikaException if Tika fails to parse the file
         */
        private String extractWithTika(File file) throws IOException, TikaException {
            Metadata metadata = new Metadata();
            metadata.set("Content-Encoding", "CP1250");
            // try-with-resources releases the stream even when detection fails
            // before parseToString is reached.
            try (TikaInputStream inS = TikaInputStream.get(file, metadata)) {
                Tika tika = new Tika();
                String detectedType = tika.detect(inS);
                LOGGER.log(Level.INFO, "Detected file type: " + detectedType);
    //            || "text/plain".equals(detectedType)
                if (MS_DOWNLOAD_TYPE.equals(detectedType)) {
                    LOGGER.log(Level.INFO, "Using TextDetector for type: " + detectedType);
                    tika = new Tika(new TextDetector());
                }
                tika.setMaxStringLength(maxLength);
                return sanitize(tika.parseToString(inS, metadata));
            }
        }

        /**
         * Fallback used after a Tika XML parse error: parses the file with a
         * DOM parser and returns the text content of the root element.
         *
         * @param file XML file to read
         * @return text content of the document element, cleaned up like Tika output
         * @throws IOException                  if the file cannot be read
         * @throws SAXException                 if the file is not well-formed XML
         * @throws ParserConfigurationException if the parser cannot be configured
         */
        private String extractWithDom(File file)
                throws IOException, SAXException, ParserConfigurationException {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            // Input files are arbitrary documents: do not resolve external
            // entities or external DTDs (XXE hardening).
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
            factory.setXIncludeAware(false);
            factory.setExpandEntityReferences(false);

            Document doc = factory.newDocumentBuilder().parse(file);
            doc.getDocumentElement().normalize();
            LOGGER.log(Level.INFO, "Root element :" + doc.getDocumentElement().getNodeName());
            return sanitize(doc.getDocumentElement().getTextContent());
        }

        /** Replaces every character matched by {@link #UNWANTED_CHARS} with a space. */
        private static String sanitize(String text) {
            return UNWANTED_CHARS.matcher(text).replaceAll(" ");
        }

        /**
         * Writes {@code text} to {@code fileOut} in UTF-8, replacing any existing content.
         *
         * @throws IOException if the file cannot be created or written
         */
        private static void writeText(String fileOut, String text) throws IOException {
            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(fileOut), StandardCharsets.UTF_8))) {
                writer.write(text);
            }
        }


        public static void main(String[] args) {
            new Service<>(Converter.class);
        }

        /**
         * Creates (or truncates) {@code fileOut} as an empty file; a failure is
         * logged and not propagated.
         */
        private void saveEmptyFile(String fileOut) {
            try {
                writeText(fileOut, "");
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "Problems with writing empty file: " + fileOut, ex);
            }
        }

    }