From 66b9905314b88089e6d8cceaffbd2bd896d66e21 Mon Sep 17 00:00:00 2001 From: vikasrathee-cs Date: Thu, 4 Jan 2024 22:29:45 +0530 Subject: [PATCH] changes done for xml performance. Added saxon library changes done for xml performance. Added saxon library --- pom.xml | 5 ++ .../http/common/pagination/page/XmlPage.java | 64 ++++++++++++------- .../http/common/pagination/page/XmlUtil.java | 52 +++++++-------- 3 files changed, 70 insertions(+), 51 deletions(-) diff --git a/pom.xml b/pom.xml index eed6aa4a..e97cbc98 100644 --- a/pom.xml +++ b/pom.xml @@ -361,6 +361,11 @@ unxml ${unxml.version} + + net.sf.saxon + Saxon-HE + 12.4 + org.python jython-standalone diff --git a/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlPage.java b/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlPage.java index f7b81e07..1eb4b1a3 100644 --- a/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlPage.java +++ b/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlPage.java @@ -15,23 +15,24 @@ */ package io.cdap.plugin.http.common.pagination.page; -import com.fasterxml.jackson.databind.node.ArrayNode; import com.google.gson.JsonArray; import com.google.gson.JsonElement; -import com.nerdforge.unxml.Parsing; -import com.nerdforge.unxml.factory.ParsingFactory; -import com.nerdforge.unxml.parsers.Parser; -import com.nerdforge.unxml.parsers.builders.ObjectNodeParserBuilder; +import com.google.gson.JsonObject; import io.cdap.cdap.api.data.format.StructuredRecord; import io.cdap.cdap.api.data.schema.Schema; import io.cdap.cdap.format.StructuredRecordStringConverter; import io.cdap.plugin.http.common.http.HttpResponse; import io.cdap.plugin.http.source.common.BaseHttpSourceConfig; -import org.w3c.dom.Document; +import net.sf.saxon.s9api.Processor; +import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.XPathCompiler; +import net.sf.saxon.s9api.XdmItem; +import net.sf.saxon.s9api.XdmNode; +import net.sf.saxon.s9api.XdmValue; +import net.sf.saxon.trans.XPathException; import java.util.Iterator; import java.util.Map; -import javax.xml.xpath.XPathConstants; /** * Returns sub elements which are specified by XPath, one by one. @@ -41,15 +42,17 @@ class XmlPage extends BasePage { private final Map fieldsMapping; private final Iterator iterator; - private final Document document; + private final XdmNode document; private final Schema schema; private final BaseHttpSourceConfig config; + private final Processor processor = new Processor(false); + XmlPage(BaseHttpSourceConfig config, HttpResponse httpResponse) { super(httpResponse); this.config = config; this.fieldsMapping = config.getFullFieldsMapping(); - this.document = XmlUtil.createXmlDocument(httpResponse.getBody()); + this.document = XmlUtil.createXmlDocument(processor, httpResponse.getBody()); this.iterator = getDocumentElementsIterator(); this.schema = config.getSchema(); } @@ -79,33 +82,48 @@ public PageEntry next() { */ @Override public String getPrimitiveByPath(String path) { - return (String) XmlUtil.getByXPath(document, path, XPathConstants.STRING); + return XmlUtil.getByXPath(processor, document, path); } /** - * 1. Converts xml to a structure which is defined by "Fields Mapping" configuration. This is done using unxml. + * 1. Converts xml to a structure which is defined by "Fields Mapping" configuration. This is done using saxon. * 2. The result entity is a json array. * 3. An iterator for elements of json array is returned. * * @return an iterator for elements of result json array. */ private Iterator getDocumentElementsIterator() { - Parsing parsing = ParsingFactory.getInstance().create(); - ObjectNodeParserBuilder obj = parsing.obj(); - - for (Map.Entry entry : fieldsMapping.entrySet()) { - String schemaFieldName = entry.getKey(); - String fieldPath = entry.getValue(); - - obj = obj.attribute(schemaFieldName, fieldPath, XmlUtil.xmlTextNodeParser()); + XPathCompiler xPathCompiler = processor.newXPathCompiler(); + JsonArray jsonArray = new JsonArray(); + try { + for (XdmItem entry : xPathCompiler.evaluate(config.getResultPath(), document)) { + JsonObject jsonObject = new JsonObject(); + for (String schemaFieldName : fieldsMapping.keySet()) { + XdmValue xdmItems = xPathCompiler.evaluate(fieldsMapping.get(schemaFieldName), entry); + String value = getValueFromXdmItem(xdmItems); + jsonObject.addProperty(schemaFieldName, value); + } + jsonArray.add(jsonObject); + } + } catch (SaxonApiException | XPathException e) { + throw new RuntimeException(e); } - - Parser parser = parsing.arr(config.getResultPath(), obj).build(); - ArrayNode node = parser.apply(document); - JsonArray jsonArray = JSONUtil.toJsonArray(node.toString()); return jsonArray.iterator(); } + private String getValueFromXdmItem(XdmValue xdmItems) throws XPathException { + StringBuilder value = new StringBuilder(); + int[] i = new int[1]; + ((XdmNode) xdmItems).children().iterator().forEachRemaining(t -> i[0] = i[0] + 1); + // If main node contains child node, return full node else value of the node + if (i[0] > 1) { + value.append(xdmItems); + } else { + value.append(xdmItems.getUnderlyingValue().getStringValue()); + } + return value.toString(); + } + @Override public void close() { diff --git a/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlUtil.java b/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlUtil.java index 71ef81c9..ad17950a 100644 --- a/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlUtil.java +++ b/src/main/java/io/cdap/plugin/http/common/pagination/page/XmlUtil.java @@ -17,28 +17,24 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.TextNode; -import com.google.common.base.Charsets; import com.nerdforge.unxml.parsers.Parser; -import org.w3c.dom.Document; +import net.sf.saxon.s9api.DocumentBuilder; +import net.sf.saxon.s9api.Processor; +import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.XPathCompiler; +import net.sf.saxon.s9api.XdmNode; +import net.sf.saxon.trans.XPathException; import org.w3c.dom.Node; -import org.xml.sax.SAXException; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; +import java.io.StringReader; import java.io.StringWriter; -import javax.xml.namespace.QName; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathExpressionException; +import javax.xml.transform.stream.StreamSource; import javax.xml.xpath.XPathFactory; /** @@ -50,19 +46,19 @@ public class XmlUtil { /** * Create xml document instance out of a String. * + * @param processor Saxon processor with xml document configuration * @param xmlString xml in string format - * @return a Document instance representing input xml + * @return a XdmNode Document instance representing input xml */ - public static Document createXmlDocument(String xmlString) { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setIgnoringComments(true); - + public static XdmNode createXmlDocument(Processor processor, String xmlString) { + DocumentBuilder documentBuilder = processor.newDocumentBuilder(); + XdmNode document = null; try { - InputStream input = new ByteArrayInputStream(xmlString.getBytes(Charsets.UTF_8)); - return factory.newDocumentBuilder().parse(input); - } catch (ParserConfigurationException | SAXException | IOException e) { - throw new IllegalStateException("Failed to parse xml document", e); + document = documentBuilder.build(new StreamSource(new StringReader(xmlString))); + } catch (SaxonApiException e) { + throw new RuntimeException(e); } + return document; } /** @@ -106,17 +102,17 @@ public static String nodeToString(Node node) { * Throws an exception if element is not of given path. * Returns null if element not found * - * @param document document instance + * @param processor Saxon processor with xml document configuration + * @param document XdmNode document instance * @param path xpath string representation - * @param returnType a type of element expected to be returned * @return element found by XPath or null if not found. */ - public static Object getByXPath(Document document, String path, QName returnType) { - XPath xpath = xPathfactory.newXPath(); + public static String getByXPath(Processor processor, XdmNode document, String path) { + XPathCompiler xPathCompiler = processor.newXPathCompiler(); try { - XPathExpression expr = xpath.compile(path); - return expr.evaluate(document, returnType); - } catch (XPathExpressionException e) { + return xPathCompiler.evaluate(path, document).getUnderlyingValue() + .getStringValue(); + } catch (XPathException | SaxonApiException e) { return null; } }