diff --git a/module/src/main/java/pl/clarin/any2txt/Converter.java b/module/src/main/java/pl/clarin/any2txt/Converter.java index c6bca9f63bdb4605a0ecd4f53ab23adcf7103a59..d368ebd0774ab70e73f7b06ed6261a5b103f9b5e 100644 --- a/module/src/main/java/pl/clarin/any2txt/Converter.java +++ b/module/src/main/java/pl/clarin/any2txt/Converter.java @@ -75,28 +75,28 @@ public class Converter extends Worker { Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Input file: " + fileIn); File file = new File(fileIn); TikaInputStream inS = null; - try { - if (inS != null){ - try { - inS.close(); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "TikaInputStream closed"); - } - catch (IOException exp) { - Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with closing TikaInputStream " + exp.getMessage() , exp.getMessage()); - } - } - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename " + fileIn + " " + new File(".").getAbsolutePath()); -// File f = new File(fileIn); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists? " + file.exists() + " Can read? " + file.canRead()); - DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); - Document doc = builder.parse(file); - doc.getDocumentElement().normalize(); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Root element :" + doc.getDocumentElement().getNodeName() + " " + doc.getDocumentElement().getTextContent()); - Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Child element :" + doc.getDocumentElement().getChildNodes().item(0).getTextContent()); - } - catch (IOException | SAXException | ParserConfigurationException e) { - Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with XML parsing " + e.getMessage() , e.getMessage()); - } +// try { +// if (inS != null){ +// try { +// inS.close(); +// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "TikaInputStream closed"); +// } +// catch (IOException exp) { +// Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with closing TikaInputStream " + exp.getMessage() , exp.getMessage()); +// } +// } +// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename " + fileIn + " " + new File(".").getAbsolutePath()); +//// File f = new File(fileIn); +// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Filename exists? " + file.exists() + " Can read? " + file.canRead()); +// DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); +// Document doc = builder.parse(file); +// doc.getDocumentElement().normalize(); +// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Root element :" + doc.getDocumentElement().getNodeName() + " " + doc.getDocumentElement().getTextContent()); +// Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Child element :" + doc.getDocumentElement().getChildNodes().item(0).getTextContent()); +// } +// catch (IOException | SAXException | ParserConfigurationException e) { +// Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems with XML parsing " + e.getMessage() , e.getMessage()); +// } try { Metadata metadata = new Metadata(); @@ -113,6 +113,15 @@ public class Converter extends Worker { Logger.getLogger(Converter.class.getName()).log(Level.INFO, "Using TextDetector for type: " + tika.detect(inS)); tika = new Tika(new TextDetector()); } + try { + BodyContentHandler handler = new BodyContentHandler(); + ParseContext pcontext = new ParseContext(); + XMLParser xmlparser = new XMLParser(); + xmlparser.parse(inS, handler, metadata, pcontext); + System.out.println("Contents of the document:" + handler.toString()); + } catch (SAXException ex) { + Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in Tika processing " + ex.getMessage(), ex.getMessage()); + } tika.setMaxStringLength(maxLength);