Skip to content
Snippets Groups Projects
Commit d385e473 authored by Paweł Walkowiak's avatar Paweł Walkowiak
Browse files

Add document parser

parent 5beb77ea
Branches
1 merge request!1Support xml
Pipeline #11405 failed with stage
in 55 seconds
......@@ -22,6 +22,17 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.detect.TextDetector;
import org.apache.tika.sax.BodyContentHandler;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;
import org.json.JSONObject;
import pl.clarin.ws.worker.IniFile;
import pl.clarin.ws.worker.Service;
......@@ -78,6 +89,35 @@ public class Converter extends Worker {
if("application/x-msdownload".equals(tika.detect(inS))){
tika = new Tika(new TextDetector());
}
if("text/plain".equals(tika.detect(inS))){
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder loader = factory.newDocumentBuilder();
Document document = loader.parse(inS);
DocumentTraversal trav = (DocumentTraversal) document;
NodeIterator it = trav.createNodeIterator(document.getDocumentElement(),
NodeFilter.SHOW_ELEMENT, null, true);
for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) {
String text = n.getTextContent().trim();
if (!text.isEmpty()) {
System.out.println(text);
}
}
// BodyContentHandler handler = new BodyContentHandler();
// ParseContext pcontext = new ParseContext();
// XMLParser xmlparser = new XMLParser();
// xmlparser.parse(inS, handler, metadata, pcontext);
// System.out.println("Contents of the document:" + handler.toString());
} catch (SAXException ex) {
Logger.getLogger(Converter.class.getName()).log(Level.WARNING, "Problems in xml processing " + ex.getMessage(), ex.getMessage());
tika = new Tika(new TextDetector());
}
}
tika.setMaxStringLength(maxLength);
String txt = tika.parseToString(inS, metadata);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment