UNPKG

partial-xml-stream-parser

Version:

A lenient XML stream parser for Node.js and browsers that can handle incomplete or malformed XML data, with depth control, CDATA support for XML serialization and round-trip parsing, wildcard pattern support for stopNodes, and CDATA handling within stopNodes.

github.com/samhvw8/partial-xml-stream-parser

samhvw8/partial-xml-stream-parser

446 lines • 24.1 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.processXmlChunk = processXmlChunk; exports.finalizeStreamResult = finalizeStreamResult; const constants_1 = require("./constants"); const dom_builder_1 = require("./dom-builder"); function processXmlChunk(parserContext, xmlChunk) { let currentXmlString = ""; if (xmlChunk === null || xmlChunk === undefined) { // EOF } else if (typeof xmlChunk === "string") { currentXmlString = xmlChunk; } else if (xmlChunk && typeof xmlChunk.toString === "function") { currentXmlString = xmlChunk.toString(); } else { throw new Error("XML chunk for 'parseStream' is accepted in String, Buffer, null, undefined or empty string form."); } if (!parserContext._originalBufferHadContent && currentXmlString.length > 0) { parserContext._originalBufferHadContent = true; } let dataToProcess = currentXmlString; const originalIncompleteState = parserContext.incompleteStructureState; if (originalIncompleteState && originalIncompleteState.partial) { dataToProcess = originalIncompleteState.partial + dataToProcess; parserContext.parsingIndex = 0; if ((originalIncompleteState.type === "opening_tag_incomplete" || originalIncompleteState.type === "closing_tag_incomplete" || originalIncompleteState.type === "tag_start_incomplete" || originalIncompleteState.type === "text_node_incomplete" || originalIncompleteState.type === "stop_node_content") && originalIncompleteState.parentOfPartial && typeof originalIncompleteState.parentOfPartial === "object" && !Array.isArray(originalIncompleteState.parentOfPartial)) { parserContext.reparsedSegmentContext = { partialText: originalIncompleteState.partial, parentContext: originalIncompleteState.parentOfPartial, }; } parserContext.incompleteStructureState = null; } else { parserContext.parsingIndex = 0; } currentXmlString = ""; let signalToProcessCoreBuffer = false; // Handle maxDepth = 0 case: treat everything as plain text if (parserContext.customOptions.maxDepth === 0) { // Append 
all data as plain text if (dataToProcess.length > 0) { if (parserContext.accumulator.length > 0 && typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string") { parserContext.accumulator[parserContext.accumulator.length - 1] += dataToProcess; } else { parserContext.accumulator.push(dataToProcess); } } // Skip all XML processing return { shouldProcessBuffer: false, earlyExitResult: null, }; } else if (parserContext.allowedRootNodes) { parserContext._rootDeterminationBuffer += dataToProcess; dataToProcess = ""; while (parserContext._rootDeterminationBuffer.length > 0) { const rdb = parserContext._rootDeterminationBuffer; const trimmedRdb = rdb.trimStart(); const leadingWsLength = rdb.length - trimmedRdb.length; if (trimmedRdb.length === 0) { if (rdb.length > 0 && (xmlChunk !== null || !parserContext.customOptions.ignoreWhitespace || rdb.trim().length > 0)) { parserContext.accumulator.push(rdb); } parserContext._rootDeterminationBuffer = ""; break; } if (trimmedRdb.startsWith("<")) { const tagMatch = constants_1.STATIC_OPENING_TAG_REGEX.exec(trimmedRdb); if (tagMatch) { const rootTagName = tagMatch[1]; if (parserContext.allowedRootNodes.has(rootTagName)) { parserContext.streamingBuffer = rdb; parserContext.parsingIndex = 0; parserContext._rootDeterminationBuffer = ""; signalToProcessCoreBuffer = true; break; } else { // Non-allowed XML root tag found const nonAllowedTagName = rootTagName; const closingNonAllowedTag = `</${nonAllowedTagName}>`; let contentEndIndex = -1; // Try to find the simple closing tag within the current trimmed buffer portion let searchStartIndexForClosingTag = leadingWsLength + tagMatch[0].length; if (trimmedRdb.length > tagMatch[0].length) { contentEndIndex = trimmedRdb.indexOf(closingNonAllowedTag, tagMatch[0].length); } if (contentEndIndex !== -1) { // Found the simple closing tag in the current trimmed buffer const segmentEnd = leadingWsLength + contentEndIndex + closingNonAllowedTag.length; // Append to last 
accumulator item if it's a string, otherwise push new string if (parserContext.accumulator.length > 0 && typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string") { parserContext.accumulator[parserContext.accumulator.length - 1] += rdb.substring(0, segmentEnd); } else { parserContext.accumulator.push(rdb.substring(0, segmentEnd)); } parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd); } else { // Closing tag not found in current buffer, or it's a self-closing non-allowed tag. // Consume up to the '>' of the opening tag, or whole buffer if no '>'. const openingTagEnd = trimmedRdb.indexOf(">"); const segmentEnd = openingTagEnd !== -1 ? leadingWsLength + openingTagEnd + 1 : rdb.length; // Append to last accumulator item if it's a string, otherwise push new string if (parserContext.accumulator.length > 0 && typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string") { parserContext.accumulator[parserContext.accumulator.length - 1] += rdb.substring(0, segmentEnd); } else { parserContext.accumulator.push(rdb.substring(0, segmentEnd)); } parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd); } // Continue the while loop if _rootDeterminationBuffer still has content if (parserContext._rootDeterminationBuffer.length === 0) break; else continue; } } else { const partialMatch = trimmedRdb.match(/^<([\w:-]+)/); if (partialMatch) { const potentialTag = partialMatch[1]; const isPotentiallyAllowed = [...parserContext.allowedRootNodes].some((ar) => ar.startsWith(potentialTag)); if (isPotentiallyAllowed && xmlChunk !== null) { break; } } // Append to last accumulator item if it's a string, otherwise push new string if (parserContext.accumulator.length > 0 && typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string") { parserContext.accumulator[parserContext.accumulator.length - 1] += rdb; } else { parserContext.accumulator.push(rdb); } parserContext._rootDeterminationBuffer = 
""; break; } } else { const nextTagStart = trimmedRdb.indexOf("<"); const segmentEnd = nextTagStart !== -1 ? leadingWsLength + nextTagStart : rdb.length; // Append to last accumulator item if it's a string, otherwise push new string if (parserContext.accumulator.length > 0 && typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string") { parserContext.accumulator[parserContext.accumulator.length - 1] += rdb.substring(0, segmentEnd); } else { parserContext.accumulator.push(rdb.substring(0, segmentEnd)); } parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd); } } } else { if (parserContext.streamingBuffer.length > parserContext.parsingIndex) { parserContext.streamingBuffer = parserContext.streamingBuffer.substring(parserContext.parsingIndex) + dataToProcess; } else { parserContext.streamingBuffer = dataToProcess; } parserContext.parsingIndex = 0; if (!parserContext._initialSegmentTypeDecided && parserContext.streamingBuffer.trim().length > 0) { parserContext._initialSegmentTypeDecided = true; } } if (xmlChunk === null) { parserContext._activelyStreaming = false; } else if (!parserContext._activelyStreaming) { const hasNewMeaningfulContent = (parserContext.streamingBuffer.length > parserContext.parsingIndex && parserContext.streamingBuffer.substring(parserContext.parsingIndex).trim().length > 0) || (parserContext.allowedRootNodes && parserContext._rootDeterminationBuffer.trim().length > 0); if (hasNewMeaningfulContent) { parserContext._activelyStreaming = true; } } const isFirstEverChunk = !parserContext._originalBufferHadContent && parserContext.accumulator.length === 0 && parserContext.tagStack.length === 0; if (isFirstEverChunk && dataToProcess === "" && (xmlChunk === "" || xmlChunk === null)) { if (parserContext.streamingBuffer === "") { return { shouldProcessBuffer: false, earlyExitResult: { metadata: { partial: xmlChunk === "" }, xml: [] }, }; } } if (parserContext.streamingBuffer.length > parserContext.parsingIndex || 
(xmlChunk === null && (parserContext.streamingBuffer.length > 0 || (parserContext.incompleteStructureState && parserContext.incompleteStructureState.partial)))) { signalToProcessCoreBuffer = true; } if (xmlChunk === null) { parserContext.streamingBufferBeforeClear = parserContext.streamingBuffer; } return { shouldProcessBuffer: signalToProcessCoreBuffer, earlyExitResult: null }; } function finalizeStreamResult(parserContext, xmlChunk) { if (parserContext.parsingIndex > 0) { const sliceAmount = parserContext.parsingIndex; if (parserContext.incompleteStructureState && parserContext.incompleteStructureState.at !== undefined) { parserContext.incompleteStructureState.at -= sliceAmount; if (parserContext.incompleteStructureState.at < 0) { parserContext.incompleteStructureState.at = 0; } if (parserContext.incompleteStructureState.type === "stop_node_content" && parserContext.incompleteStructureState.contentStartIndex !== undefined) { parserContext.incompleteStructureState.contentStartIndex -= sliceAmount; if (parserContext.incompleteStructureState.contentStartIndex < 0) parserContext.incompleteStructureState.contentStartIndex = 0; } } if (parserContext.reparsedSegmentContext && parserContext.reparsedSegmentContext.originalIndex !== undefined) { if (parserContext.reparsedSegmentContext.originalIndex < sliceAmount) { parserContext.reparsedSegmentContext = null; } else { parserContext.reparsedSegmentContext.originalIndex -= sliceAmount; } } parserContext.streamingBuffer = parserContext.streamingBuffer.substring(sliceAmount); parserContext.parsingIndex = 0; } let finalXmlContent = parserContext.accumulator.length > 0 ? parserContext.accumulator : []; let isReturnPartial; if (xmlChunk !== null) { // Current chunk is NOT EOF if (parserContext.allowedRootNodes) { // When allowedRootNodes is active, any non-EOF chunk implies the stream is still partial by default, // as more text or other allowed roots could follow. 
isReturnPartial = true; // Exception: if we have a complete object in accumulator and no pending state const conditionsForNonPartial = parserContext.tagStack.length === 0 && !parserContext.incompleteStructureState && parserContext.streamingBuffer.length === 0 && parserContext._rootDeterminationBuffer.length === 0; // Check if we have a complete XML structure with allowed root nodes if (conditionsForNonPartial) { // Special case for the test "should parse a complex message with mixed text and multiple XML elements with allowRoot" // If we have at least one object in accumulator (parsed XML with allowed root) // or if the entire input was processed in one go if ((parserContext.accumulator.length > 0 && parserContext.accumulator.some((item) => typeof item === "object")) || (parserContext._originalBufferHadContent && parserContext.accumulator.length > 0 && !parserContext._activelyStreaming)) { // Only set partial to false if we have at least one object in the accumulator // This ensures XML content is treated as complete, but plain text is still partial if (parserContext.accumulator.some((item) => typeof item === "object")) { isReturnPartial = false; } } } // Special case for plain text content with allowedRootNodes // If all items in accumulator are strings and we're not at EOF, keep partial as true if (parserContext.accumulator.length > 0 && parserContext.accumulator.every((item) => typeof item === "string")) { isReturnPartial = true; } } else { // Standard parsing (no allowedRootNodes): not partial if everything is clear const conditionsForNonPartial = parserContext.tagStack.length === 0 && !parserContext.incompleteStructureState && parserContext.streamingBuffer.length === 0; isReturnPartial = !conditionsForNonPartial; } } else { // Current chunk IS EOF (xmlChunk === null) isReturnPartial = parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState; } let isSpecialOnlyAtEOF = false; if (xmlChunk === null || xmlChunk === undefined) { // EOF if 
(parserContext.incompleteStructureState) { const stateType = parserContext.incompleteStructureState.type; const isSpecialIncomplete = stateType === "doctype" || stateType === "xmldecl" || stateType === "comment"; if (isSpecialIncomplete && parserContext.accumulator.length === 0 && parserContext.tagStack.length === 0) { const remainingBufferIsJustPartial = (parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() === (parserContext.incompleteStructureState.partial || "").trim(); if (remainingBufferIsJustPartial) { isReturnPartial = false; parserContext.incompleteStructureState = null; isSpecialOnlyAtEOF = true; finalXmlContent = []; } } else if ((stateType === "opening_tag_incomplete" || stateType === "tag_start_incomplete" || stateType === "closing_tag_incomplete") && parserContext.incompleteStructureState.partial && parserContext.incompleteStructureState.partial.trim().length > 0) { isReturnPartial = true; const fragment = parserContext.incompleteStructureState.partial; let fragmentAddedToExistingText = false; if (parserContext.currentPointer && typeof parserContext.currentPointer === "object" && !Array.isArray(parserContext.currentPointer)) { const textNodeName = parserContext.customOptions.textNodeName; if (typeof parserContext.currentPointer[textNodeName] === "string") { if (!parserContext.currentPointer[textNodeName].endsWith(fragment)) { // Check to prevent duplication parserContext.currentPointer[textNodeName] += fragment; } fragmentAddedToExistingText = true; } else if (Array.isArray(parserContext.currentPointer[textNodeName])) { const lastTextItemIdx = parserContext.currentPointer[textNodeName].length - 1; if (lastTextItemIdx >= 0 && typeof parserContext.currentPointer[textNodeName][lastTextItemIdx] === "string") { if (!parserContext.currentPointer[textNodeName][lastTextItemIdx].endsWith(fragment)) { // Check to prevent duplication parserContext.currentPointer[textNodeName][lastTextItemIdx] += fragment; } 
fragmentAddedToExistingText = true; } else if (lastTextItemIdx < 0 || typeof parserContext.currentPointer[textNodeName][lastTextItemIdx] !== "string") { // If no string to append to, add new (0, dom_builder_1.addValueToObject)(parserContext.currentPointer, textNodeName, fragment, parserContext.customOptions); fragmentAddedToExistingText = true; } } else { // No text node yet or it's not an array/string, add new (0, dom_builder_1.addValueToObject)(parserContext.currentPointer, textNodeName, fragment, parserContext.customOptions); fragmentAddedToExistingText = true; } } if (!fragmentAddedToExistingText && parserContext.accumulator.length > 0) { let lastAccItem = parserContext.accumulator[parserContext.accumulator.length - 1]; if (typeof lastAccItem === "string") { if (!lastAccItem.endsWith(fragment)) { // Check to prevent duplication parserContext.accumulator[parserContext.accumulator.length - 1] += fragment; } } else { parserContext.accumulator.push(fragment); } } else if (!fragmentAddedToExistingText) { parserContext.accumulator.push(fragment); } finalXmlContent = parserContext.accumulator.length > 0 ? 
[...parserContext.accumulator] : []; } } else if (parserContext.tagStack.length > 0) { isReturnPartial = true; } else { if (isReturnPartial && !(parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState)) { isReturnPartial = false; } } // This block handles finalXmlContent structure if parsing is complete (not partial) if (!isReturnPartial) { const effectiveBufferContent = parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer || parserContext._rootDeterminationBuffer; const tempBufferForNullCheck = effectiveBufferContent .replace(/<\?xml[^?]*\?>/g, "") .replace(//g, "") .replace(/<!DOCTYPE[^>]*>/g, "") .trim(); if (isSpecialOnlyAtEOF) { finalXmlContent = []; } else if (parserContext.accumulator.length === 0 && tempBufferForNullCheck === "") { finalXmlContent = []; } else if (parserContext.accumulator.length === 0 && tempBufferForNullCheck !== "" && !parserContext._treatAsPlainText) { if (parserContext.customOptions.alwaysCreateTextNode) { finalXmlContent = [{ [parserContext.customOptions.textNodeName]: tempBufferForNullCheck }]; } else { finalXmlContent = [tempBufferForNullCheck]; } } // If accumulator has content, finalXmlContent is already set from it. parserContext.streamingBuffer = ""; parserContext.parsingIndex = 0; parserContext._activelyStreaming = false; parserContext._originalBufferHadContent = false; parserContext.incompleteStructureState = null; parserContext.streamingBufferBeforeClear = ""; parserContext._lastClearedIncompleteStateWasSpecial = isSpecialOnlyAtEOF; parserContext._rootDeterminationBuffer = ""; } else { // Still partial at EOF // Ensure finalXmlContent reflects the accumulator, which might have been modified by fragment addition finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : []; // If it's still just a single string fragment in accumulator and alwaysCreateTextNode is true, wrap it. 
if (finalXmlContent.length === 1 && typeof finalXmlContent[0] === "string" && parserContext.customOptions.alwaysCreateTextNode) { finalXmlContent = [{ [parserContext.customOptions.textNodeName]: finalXmlContent[0] }]; } if (parserContext.incompleteStructureState) parserContext.reparsedSegmentContext = null; } } const result = { metadata: { partial: isReturnPartial }, xml: finalXmlContent, }; if (xmlChunk === null && !result.metadata.partial) { if (isSpecialOnlyAtEOF) { result.xml = []; } else if (result.xml && result.xml.length === 0 && !parserContext._originalBufferHadContent && (parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() === "") { result.xml = []; } } return result; } //# sourceMappingURL=stream-processor.js.map