diff --git a/module/src/main/java/pl/clarin/any2txt/Converter.java b/module/src/main/java/pl/clarin/any2txt/Converter.java index e51595c674982ea94305b45509589fe0b7d5c7c9..3ecd3490d60c76c299aab24ae53cd79e4c0c91ea 100644 --- a/module/src/main/java/pl/clarin/any2txt/Converter.java +++ b/module/src/main/java/pl/clarin/any2txt/Converter.java @@ -22,6 +22,17 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.detect.TextDetector; +import org.apache.tika.sax.BodyContentHandler; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.traversal.DocumentTraversal; +import org.w3c.dom.traversal.NodeFilter; +import org.w3c.dom.traversal.NodeIterator; +import org.xml.sax.SAXException; + import org.json.JSONObject; import pl.clarin.ws.worker.IniFile; import pl.clarin.ws.worker.Service; @@ -78,6 +89,35 @@ public class Converter extends Worker { if("application/x-msdownload".equals(tika.detect(inS))){ tika = new Tika(new TextDetector()); } + if("text/plain".equals(tika.detect(inS))){ + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder loader = factory.newDocumentBuilder(); + Document document = loader.parse(inS); + + DocumentTraversal trav = (DocumentTraversal) document; + NodeIterator it = trav.createNodeIterator(document.getDocumentElement(), + NodeFilter.SHOW_ELEMENT, null, true); + + for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) { + + String text = n.getTextContent().trim(); + + if (!text.isEmpty()) { + System.out.println(text); + } + } + +// BodyContentHandler handler = new BodyContentHandler(); +// ParseContext pcontext = new ParseContext(); +// XMLParser xmlparser = new XMLParser(); +// xmlparser.parse(inS, handler, metadata, pcontext); +// System.out.println("Contents of the document:" + handler.toString()); + } catch (SAXException ex) { + Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in xml processing " + ex.getMessage(), ex.getMessage()); + tika = new Tika(new TextDetector()); + } + } tika.setMaxStringLength(maxLength); String txt = tika.parseToString(inS, metadata);