From d385e473e0421e9d74f5cb2e36db891ea03bea43 Mon Sep 17 00:00:00 2001
From: pwalkow <pawel.walkowiak@hotmail.com>
Date: Mon, 3 Jul 2023 11:42:27 +0200
Subject: [PATCH] Add document parser

---
Review notes (text in this section is not part of the commit message):
* When the detected type is "text/plain", the added block parses the
  input as an XML DOM, walks its elements and prints each non-empty
  trimmed text content; on a parser failure it logs a warning and
  switches to a Tika instance built on TextDetector.
* The NodeIterator local is named `iterator` so that it matches the
  loop calling iterator.nextNode(); with the name `it` the loop
  referenced an undefined symbol.
* The catch clause also covers ParserConfigurationException, which
  newDocumentBuilder() declares and which this patch imports.
* The logger receives the exception object (not its message string as
  a parameter) so the stack trace is kept.
* NOTE(review): loader.parse(inS) reads inS before
  tika.parseToString(inS, metadata) uses the same stream - confirm the
  stream is still readable at that point.
* NOTE(review): DocumentBuilderFactory runs with default settings; if
  the input is untrusted, consider disabling DOCTYPE declarations and
  external entities.
* Indentation and the position of blank context lines were
  reconstructed and may need adjusting before the patch applies.

 .../java/pl/clarin/any2txt/Converter.java     | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/module/src/main/java/pl/clarin/any2txt/Converter.java b/module/src/main/java/pl/clarin/any2txt/Converter.java
index e51595c..3ecd349 100644
--- a/module/src/main/java/pl/clarin/any2txt/Converter.java
+++ b/module/src/main/java/pl/clarin/any2txt/Converter.java
@@ -22,6 +22,17 @@
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.detect.TextDetector;
+import org.apache.tika.sax.BodyContentHandler;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.traversal.DocumentTraversal;
+import org.w3c.dom.traversal.NodeFilter;
+import org.w3c.dom.traversal.NodeIterator;
+import org.xml.sax.SAXException;
+
 import org.json.JSONObject;
 import pl.clarin.ws.worker.IniFile;
 import pl.clarin.ws.worker.Service;
@@ -78,6 +89,35 @@ public class Converter extends Worker {
             if("application/x-msdownload".equals(tika.detect(inS))){
                 tika = new Tika(new TextDetector());
             }
+            if("text/plain".equals(tika.detect(inS))){
+                try {
+                    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+                    DocumentBuilder loader = factory.newDocumentBuilder();
+                    Document document = loader.parse(inS);
+
+                    DocumentTraversal trav = (DocumentTraversal) document;
+                    NodeIterator iterator = trav.createNodeIterator(document.getDocumentElement(),
+                            NodeFilter.SHOW_ELEMENT, null, true);
+
+                    for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
+
+                        String text = n.getTextContent().trim();
+
+                        if (!text.isEmpty()) {
+                            System.out.println(text);
+                        }
+                    }
+
+//            BodyContentHandler handler = new BodyContentHandler();
+//            ParseContext pcontext = new ParseContext();
+//            XMLParser xmlparser = new XMLParser();
+//            xmlparser.parse(inS, handler, metadata, pcontext);
+//            System.out.println("Contents of the document:" + handler.toString());
+                } catch (ParserConfigurationException | SAXException ex) {
+                    Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in xml processing " + ex.getMessage(), ex);
+                    tika = new Tika(new TextDetector());
+                }
+            }
             tika.setMaxStringLength(maxLength);
             String txt = tika.parseToString(inS, metadata);
 
-- 
GitLab