Converter: remove the XML/DOM parsing and diagnostic logging code (and the imports only it used) from process().
Select TextDetector for "text/plain" as well as "application/x-msdownload"; on IOException/TikaException always write an empty output file and close the TikaInputStream.
Drop the fileWriter.close() call that followed the try-with-resources BufferedWriter in saveEmptyFile().
Note: the added stream declaration is spelled inS so that it matches every use of the variable in the method.

diff --git a/module/src/main/java/pl/clarin/any2txt/Converter.java b/module/src/main/java/pl/clarin/any2txt/Converter.java index bf844a2b549bb0ba416328d8d49f9a6cf4de482d..e214f6f3b0fd87546d6364982c5428cbbce7863d 100644 --- a/module/src/main/java/pl/clarin/any2txt/Converter.java +++ b/module/src/main/java/pl/clarin/any2txt/Converter.java @@ -24,14 +24,6 @@ import org.apache.tika.detect.TextDetector; import org.apache.tika.parser.xml.XMLParser; import org.apache.tika.parser.ParseContext; - -import org.apache.tika.sax.BodyContentHandler; -import org.xml.sax.SAXException; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import org.w3c.dom.Document; - import org.json.JSONObject; import pl.clarin.ws.worker.IniFile; import pl.clarin.ws.worker.Service; @@ -76,55 +68,19 @@ public class Converter extends Worker { public void process(String fileIn, String fileOut, JSONObject param) { Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Input file: " + fileIn); File file = new File(fileIn); - TikaInputStream inS = null; -// try { -// if (inS != null){ -// try { -// inS.close(); -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "TikaInputStream closed"); -// } -// catch (IOException exp) { -// Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with closing TikaInputStream " + exp.getMessage() , exp.getMessage()); -// } -// } -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename " + fileIn + " " + new File(".").getAbsolutePath()); -//// File f = new File(fileIn); -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists? " + file.exists() + " Can read? 
" + file.canRead()); -// DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); -// Document doc = builder.parse(file); -// doc.getDocumentElement().normalize(); -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Root element :" + doc.getDocumentElement().getNodeName() + " " + doc.getDocumentElement().getTextContent()); -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Child element :" + doc.getDocumentElement().getChildNodes().item(0).getTextContent()); -// } -// catch (IOException | SAXException | ParserConfigurationException e) { -// Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with XML parsing " + e.getMessage() , e.getMessage()); -// } + TikaInputStream inS = null; try { Metadata metadata = new Metadata(); metadata.set("Content-Encoding", "CP1250"); -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists before inS? " + file.exists()); inS = TikaInputStream.get(file, metadata); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists begin of try? 
" + file.exists()); - Tika tika = new Tika(); Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Detected file type: " + tika.detect(inS)); -// || "text/plain".equals(tika.detect(inS)) - if("application/x-msdownload".equals(tika.detect(inS))){ + if("application/x-msdownload".equals(tika.detect(inS)) || "text/plain".equals(tika.detect(inS))){ Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Using TextDetector for type: " + tika.detect(inS)); tika = new Tika(new TextDetector()); } - try { - BodyContentHandler handler = new BodyContentHandler(); - ParseContext pcontext = new ParseContext(); - XMLParser xmlparser = new XMLParser(); - xmlparser.parse(inS, handler, metadata, pcontext); - System.out.println("Contents of the document:" + handler.toString()); - } catch (SAXException ex) { - Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in Tika processing " + ex.getMessage(), ex.getMessage()); - } - tika.setMaxStringLength(maxLength); String txt = tika.parseToString(inS, metadata); @@ -143,37 +99,19 @@ public class Converter extends Worker { Logger.getLogger(Converter.class.getName()).log(Level.SEVERE, "Problems with writing: " + fileOut, exception); } -// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists end of try ? 
" + file.exists()); } catch (FileNotFoundException ex) { Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "File not found", ex.getMessage()); saveEmptyFile(fileOut); } catch (IOException | TikaException ex) { Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in Tika processing " + ex.getMessage() , ex.getMessage()); - if (ex.getMessage().contains("XML parse error")) { + saveEmptyFile(fileOut); + if (inS != null){ try { - if (inS != null){ - try { - inS.close(); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "TikaInputStream closed"); - } - catch (IOException exp) { - Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with closing TikaInputStream " + exp.getMessage() , exp.getMessage()); - } - } - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename " + fileIn + " " + new File(".").getAbsolutePath()); - File f = new File(fileIn); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists? " + f.exists() + "Can read?" 
+ f.canRead()); - DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); - Document doc = builder.parse(f); - doc.getDocumentElement().normalize(); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Root element :" + doc.getDocumentElement().getNodeName() + " " + doc.getDocumentElement().getTextContent()); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Child element :" + doc.getDocumentElement().getChildNodes().item(0).getTextContent()); + inS.close(); } - catch (IOException | SAXException | ParserConfigurationException e) { - Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with XML parsing " + e.getMessage() , e.getMessage()); + catch (IOException exp) { + Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with closing TikaInputStream " + exp.getMessage() , exp.getMessage()); } - } else { - saveEmptyFile(fileOut); } } } @@ -189,7 +127,6 @@ public class Converter extends Worker { try (BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) { bufferedWriter.write(""); } - fileWriter.close(); } catch (IOException ex) {