From d385e473e0421e9d74f5cb2e36db891ea03bea43 Mon Sep 17 00:00:00 2001
From: pwalkow <pawel.walkowiak@hotmail.com>
Date: Mon, 3 Jul 2023 11:42:27 +0200
Subject: [PATCH] Add document parser

---
 .../java/pl/clarin/any2txt/Converter.java     | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/module/src/main/java/pl/clarin/any2txt/Converter.java b/module/src/main/java/pl/clarin/any2txt/Converter.java
index e51595c..3ecd349 100644
--- a/module/src/main/java/pl/clarin/any2txt/Converter.java
+++ b/module/src/main/java/pl/clarin/any2txt/Converter.java
@@ -22,6 +22,17 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.detect.TextDetector;
 
+import org.apache.tika.sax.BodyContentHandler;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.traversal.DocumentTraversal;
+import org.w3c.dom.traversal.NodeFilter;
+import org.w3c.dom.traversal.NodeIterator;
+import org.xml.sax.SAXException;
+
 import org.json.JSONObject;
 import pl.clarin.ws.worker.IniFile;
 import pl.clarin.ws.worker.Service;
@@ -78,6 +89,35 @@ public class Converter extends Worker {
             if("application/x-msdownload".equals(tika.detect(inS))){
                 tika = new Tika(new TextDetector());
             }
+            // If the input is detected as plain text, try to parse it as XML and print the
+            // non-empty, trimmed text content of each element; when it is not well-formed XML
+            // (or no parser can be built), fall back to a Tika instance using TextDetector.
+            // NOTE(review): loader.parse(inS) reads inS before tika.parseToString(inS, ...)
+            // below; confirm the stream is still readable afterwards.
+            // NOTE(review): the factory uses default settings; consider disabling DTDs and
+            // external entities if the input can be untrusted.
+            if ("text/plain".equals(tika.detect(inS))) {
+                try {
+                    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+                    DocumentBuilder loader = factory.newDocumentBuilder();
+                    Document document = loader.parse(inS);
+
+                    DocumentTraversal trav = (DocumentTraversal) document;
+                    NodeIterator iterator = trav.createNodeIterator(document.getDocumentElement(),
+                            NodeFilter.SHOW_ELEMENT, null, true);
+
+                    for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
+                        String text = n.getTextContent().trim();
+                        if (!text.isEmpty()) {
+                            System.out.println(text);
+                        }
+                    }
+                } catch (SAXException | ParserConfigurationException ex) {
+                    // Pass the exception itself so the logger records the stack trace.
+                    Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in xml processing " + ex.getMessage(), ex);
+                    tika = new Tika(new TextDetector());
+                }
+            }
 
             tika.setMaxStringLength(maxLength);
             String txt = tika.parseToString(inS, metadata);
-- 
GitLab