partial-xml-stream-parser

Version:

A lenient XML stream parser for Node.js and browsers that can handle incomplete or malformed XML data, with depth control, CDATA support for XML serialization and round-trip parsing, wildcard pattern support for stopNodes, and CDATA handling within stopNodes.

github.com/samhvw8/partial-xml-stream-parser

samhvw8/partial-xml-stream-parser

515 lines (474 loc) • 19.5 kB

text/typescript

import { STATIC_OPENING_TAG_REGEX } from "./constants"
import { addValueToObject } from "./dom-builder"
import { ParserContext, ChunkProcessingResult, ParseResult } from "./types"

/**
 * Appends `text` to the accumulator as plain text.
 *
 * If the last accumulator entry is already a string, `text` is concatenated onto it;
 * otherwise `text` is pushed as a new entry. No emptiness check is performed, so an
 * empty `text` on an empty/non-string-tailed accumulator pushes an empty string.
 */
function appendTextToAccumulator(accumulator: ParserContext["accumulator"], text: string): void {
  const lastIndex = accumulator.length - 1
  if (lastIndex >= 0 && typeof accumulator[lastIndex] === "string") {
    accumulator[lastIndex] += text
  } else {
    accumulator.push(text)
  }
}

/**
 * Feeds one incoming chunk into the parser context and decides whether the core
 * buffer parser needs to run.
 *
 * Steps, in order:
 *  1. Normalizes the chunk to a string (`null`/`undefined` contribute no text).
 *  2. Prepends any `partial` text left in `incompleteStructureState` by a previous call
 *     and clears that state.
 *  3. Routes the data:
 *     - `customOptions.maxDepth === 0`: everything is appended to the accumulator as text
 *       and the function returns immediately with `shouldProcessBuffer: false`.
 *     - `allowedRootNodes` set: data goes through `_rootDeterminationBuffer`; text and
 *       non-allowed tags are appended to the accumulator as text, and the first allowed
 *       root tag hands the remaining buffer to `streamingBuffer`.
 *     - otherwise: data is appended to the unparsed tail of `streamingBuffer`.
 *  4. Updates `_activelyStreaming`, handles the "nothing ever received" early exit and
 *     snapshots `streamingBuffer` into `streamingBufferBeforeClear` on a `null` chunk.
 *
 * NOTE(review): `undefined` is accepted like `null` when extracting the text, but every
 * later end-of-stream check in this function tests `xmlChunk === null` only — confirm
 * whether callers rely on that distinction before unifying it.
 *
 * @param parserContext Mutable parser state; updated in place.
 * @param xmlChunk The next chunk; `null` marks end of stream.
 * @returns `shouldProcessBuffer` tells the caller whether to run the core parser;
 *   `earlyExitResult` is non-null only for the empty-first-chunk shortcut.
 * @throws Error if `xmlChunk` is neither a string, `null`/`undefined`, nor an object
 *   with a `toString` function.
 */
export function processXmlChunk(
  parserContext: ParserContext,
  xmlChunk: string | Buffer | null | undefined,
): ChunkProcessingResult {
  let currentXmlString = ""
  if (xmlChunk === null || xmlChunk === undefined) {
    // EOF
  } else if (typeof xmlChunk === "string") {
    currentXmlString = xmlChunk
  } else if (xmlChunk && typeof xmlChunk.toString === "function") {
    currentXmlString = xmlChunk.toString()
  } else {
    throw new Error(
      "XML chunk for 'parseStream' is accepted in String, Buffer, null, undefined or empty string form.",
    )
  }

  if (!parserContext._originalBufferHadContent && currentXmlString.length > 0) {
    parserContext._originalBufferHadContent = true
  }

  let dataToProcess = currentXmlString
  const originalIncompleteState = parserContext.incompleteStructureState
  if (originalIncompleteState && originalIncompleteState.partial) {
    // Re-parse the leftover fragment together with the new data.
    dataToProcess = originalIncompleteState.partial + dataToProcess
    parserContext.parsingIndex = 0
    if (
      (originalIncompleteState.type === "opening_tag_incomplete" ||
        originalIncompleteState.type === "closing_tag_incomplete" ||
        originalIncompleteState.type === "tag_start_incomplete" ||
        originalIncompleteState.type === "text_node_incomplete" ||
        originalIncompleteState.type === "stop_node_content") &&
      originalIncompleteState.parentOfPartial &&
      typeof originalIncompleteState.parentOfPartial === "object" &&
      !Array.isArray(originalIncompleteState.parentOfPartial)
    ) {
      // Remember which text is being re-parsed and under which parent object.
      parserContext.reparsedSegmentContext = {
        partialText: originalIncompleteState.partial,
        parentContext: originalIncompleteState.parentOfPartial,
      }
    }
    parserContext.incompleteStructureState = null
  } else {
    parserContext.parsingIndex = 0
  }

  let signalToProcessCoreBuffer = false

  // Handle maxDepth = 0 case: treat everything as plain text
  if (parserContext.customOptions.maxDepth === 0) {
    // Append all data as plain text
    if (dataToProcess.length > 0) {
      appendTextToAccumulator(parserContext.accumulator, dataToProcess)
    }
    // Skip all XML processing
    return {
      shouldProcessBuffer: false,
      earlyExitResult: null,
    }
  } else if (parserContext.allowedRootNodes) {
    parserContext._rootDeterminationBuffer += dataToProcess
    dataToProcess = ""

    while (parserContext._rootDeterminationBuffer.length > 0) {
      const rdb = parserContext._rootDeterminationBuffer
      const trimmedRdb = rdb.trimStart()
      const leadingWsLength = rdb.length - trimmedRdb.length

      if (trimmedRdb.length === 0) {
        // Buffer holds only whitespace.
        if (
          rdb.length > 0 &&
          (xmlChunk !== null || !parserContext.customOptions.ignoreWhitespace || rdb.trim().length > 0)
        ) {
          parserContext.accumulator.push(rdb)
        }
        parserContext._rootDeterminationBuffer = ""
        break
      }

      if (trimmedRdb.startsWith("<")) {
        const tagMatch = STATIC_OPENING_TAG_REGEX.exec(trimmedRdb)
        if (tagMatch) {
          const rootTagName = tagMatch[1]
          if (parserContext.allowedRootNodes.has(rootTagName)) {
            // Allowed root: hand the whole buffer (including leading whitespace) to the core parser.
            parserContext.streamingBuffer = rdb
            parserContext.parsingIndex = 0
            parserContext._rootDeterminationBuffer = ""
            signalToProcessCoreBuffer = true
            break
          } else {
            // Non-allowed XML root tag found
            const closingNonAllowedTag = `</${rootTagName}>`
            let contentEndIndex = -1
            // Try to find the simple closing tag within the current trimmed buffer portion
            if (trimmedRdb.length > tagMatch[0].length) {
              contentEndIndex = trimmedRdb.indexOf(closingNonAllowedTag, tagMatch[0].length)
            }

            let segmentEnd: number
            if (contentEndIndex !== -1) {
              // Found the simple closing tag in the current trimmed buffer
              segmentEnd = leadingWsLength + contentEndIndex + closingNonAllowedTag.length
            } else {
              // Closing tag not found in current buffer, or it's a self-closing non-allowed tag.
              // Consume up to the '>' of the opening tag, or whole buffer if no '>'.
              const openingTagEnd = trimmedRdb.indexOf(">")
              segmentEnd = openingTagEnd !== -1 ? leadingWsLength + openingTagEnd + 1 : rdb.length
            }
            appendTextToAccumulator(parserContext.accumulator, rdb.substring(0, segmentEnd))
            parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd)

            // Continue the while loop if _rootDeterminationBuffer still has content
            if (parserContext._rootDeterminationBuffer.length === 0) break
            else continue
          }
        } else {
          const partialMatch = trimmedRdb.match(/^<([\w:-]+)/)
          if (partialMatch) {
            const potentialTag = partialMatch[1]
            const isPotentiallyAllowed = [...parserContext.allowedRootNodes].some((ar) =>
              ar.startsWith(potentialTag),
            )
            if (isPotentiallyAllowed && xmlChunk !== null) {
              // Could still become an allowed root once more data arrives; keep it buffered.
              break
            }
          }
          appendTextToAccumulator(parserContext.accumulator, rdb)
          parserContext._rootDeterminationBuffer = ""
          break
        }
      } else {
        // Leading plain text: emit everything up to the next '<' (or the whole buffer).
        const nextTagStart = trimmedRdb.indexOf("<")
        const segmentEnd = nextTagStart !== -1 ? leadingWsLength + nextTagStart : rdb.length
        appendTextToAccumulator(parserContext.accumulator, rdb.substring(0, segmentEnd))
        parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd)
      }
    }
  } else {
    if (parserContext.streamingBuffer.length > parserContext.parsingIndex) {
      parserContext.streamingBuffer =
        parserContext.streamingBuffer.substring(parserContext.parsingIndex) + dataToProcess
    } else {
      parserContext.streamingBuffer = dataToProcess
    }
    parserContext.parsingIndex = 0
    if (!parserContext._initialSegmentTypeDecided && parserContext.streamingBuffer.trim().length > 0) {
      parserContext._initialSegmentTypeDecided = true
    }
  }

  if (xmlChunk === null) {
    parserContext._activelyStreaming = false
  } else if (!parserContext._activelyStreaming) {
    const hasNewMeaningfulContent =
      (parserContext.streamingBuffer.length > parserContext.parsingIndex &&
        parserContext.streamingBuffer.substring(parserContext.parsingIndex).trim().length > 0) ||
      (parserContext.allowedRootNodes && parserContext._rootDeterminationBuffer.trim().length > 0)
    if (hasNewMeaningfulContent) {
      parserContext._activelyStreaming = true
    }
  }

  const isFirstEverChunk =
    !parserContext._originalBufferHadContent &&
    parserContext.accumulator.length === 0 &&
    parserContext.tagStack.length === 0

  if (isFirstEverChunk && dataToProcess === "" && (xmlChunk === "" || xmlChunk === null)) {
    if (parserContext.streamingBuffer === "") {
      // Nothing has ever been received: an empty string is "still partial", null is "done".
      return {
        shouldProcessBuffer: false,
        earlyExitResult: { metadata: { partial: xmlChunk === "" }, xml: [] },
      }
    }
  }

  if (
    parserContext.streamingBuffer.length > parserContext.parsingIndex ||
    (xmlChunk === null &&
      (parserContext.streamingBuffer.length > 0 ||
        (parserContext.incompleteStructureState && parserContext.incompleteStructureState.partial)))
  ) {
    signalToProcessCoreBuffer = true
  }

  if (xmlChunk === null) {
    parserContext.streamingBufferBeforeClear = parserContext.streamingBuffer
  }

  return { shouldProcessBuffer: signalToProcessCoreBuffer, earlyExitResult: null }
}

/**
 * Builds the `ParseResult` for the chunk that was just processed and, when the stream
 * is complete, resets the per-stream fields of the parser context.
 *
 * - Drops the already-parsed prefix of `streamingBuffer` (`parsingIndex` characters) and
 *   rebases the stored indices (`incompleteStructureState.at` / `contentStartIndex`,
 *   `reparsedSegmentContext.originalIndex`) accordingly.
 * - Computes `metadata.partial` from the tag stack, pending incomplete state and buffers.
 * - At end of stream, an unfinished doctype / XML declaration / comment with nothing else
 *   parsed yields an empty, non-partial result; an unfinished tag fragment is appended
 *   as text to the current node (or to the accumulator) and the result stays partial.
 *
 * NOTE(review): the partial-flag computation treats only `null` as EOF, while the EOF
 * post-processing block also runs for `undefined` — confirm intended before changing.
 *
 * @param parserContext Mutable parser state; updated in place.
 * @param xmlChunk The chunk that was passed to the parser for this step.
 * @returns The parsed content so far and whether it is still partial.
 */
export function finalizeStreamResult(
  parserContext: ParserContext,
  xmlChunk: string | Buffer | null | undefined,
): ParseResult {
  if (parserContext.parsingIndex > 0) {
    const sliceAmount = parserContext.parsingIndex

    if (parserContext.incompleteStructureState && parserContext.incompleteStructureState.at !== undefined) {
      parserContext.incompleteStructureState.at -= sliceAmount
      if (parserContext.incompleteStructureState.at < 0) {
        parserContext.incompleteStructureState.at = 0
      }
      if (
        parserContext.incompleteStructureState.type === "stop_node_content" &&
        parserContext.incompleteStructureState.contentStartIndex !== undefined
      ) {
        parserContext.incompleteStructureState.contentStartIndex -= sliceAmount
        if (parserContext.incompleteStructureState.contentStartIndex < 0) {
          parserContext.incompleteStructureState.contentStartIndex = 0
        }
      }
    }

    if (parserContext.reparsedSegmentContext && parserContext.reparsedSegmentContext.originalIndex !== undefined) {
      if (parserContext.reparsedSegmentContext.originalIndex < sliceAmount) {
        // The re-parsed segment lies entirely in the discarded prefix.
        parserContext.reparsedSegmentContext = null
      } else {
        parserContext.reparsedSegmentContext.originalIndex -= sliceAmount
      }
    }

    parserContext.streamingBuffer = parserContext.streamingBuffer.substring(sliceAmount)
    parserContext.parsingIndex = 0
  }

  let finalXmlContent: any[] = parserContext.accumulator.length > 0 ? parserContext.accumulator : []
  let isReturnPartial: boolean

  if (xmlChunk !== null) {
    // Current chunk is NOT EOF
    if (parserContext.allowedRootNodes) {
      // When allowedRootNodes is active, any non-EOF chunk implies the stream is still partial by default,
      // as more text or other allowed roots could follow. The only exception: nothing is pending and the
      // accumulator holds at least one parsed object. An accumulator made of strings only stays partial.
      const conditionsForNonPartial =
        parserContext.tagStack.length === 0 &&
        !parserContext.incompleteStructureState &&
        parserContext.streamingBuffer.length === 0 &&
        parserContext._rootDeterminationBuffer.length === 0
      isReturnPartial = !(
        conditionsForNonPartial && parserContext.accumulator.some((item) => typeof item === "object")
      )
    } else {
      // Standard parsing (no allowedRootNodes): not partial if everything is clear
      const conditionsForNonPartial =
        parserContext.tagStack.length === 0 &&
        !parserContext.incompleteStructureState &&
        parserContext.streamingBuffer.length === 0
      isReturnPartial = !conditionsForNonPartial
    }
  } else {
    // Current chunk IS EOF (xmlChunk === null)
    isReturnPartial = parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState
  }

  let isSpecialOnlyAtEOF = false

  if (xmlChunk === null || xmlChunk === undefined) {
    // EOF
    if (parserContext.incompleteStructureState) {
      const stateType = parserContext.incompleteStructureState.type
      const isSpecialIncomplete = stateType === "doctype" || stateType === "xmldecl" || stateType === "comment"

      if (isSpecialIncomplete && parserContext.accumulator.length === 0 && parserContext.tagStack.length === 0) {
        const remainingBufferIsJustPartial =
          (parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() ===
          (parserContext.incompleteStructureState.partial || "").trim()
        if (remainingBufferIsJustPartial) {
          // The whole input was one unfinished doctype/declaration/comment: report empty and complete.
          isReturnPartial = false
          parserContext.incompleteStructureState = null
          isSpecialOnlyAtEOF = true
          finalXmlContent = []
        }
      } else if (
        (stateType === "opening_tag_incomplete" ||
          stateType === "tag_start_incomplete" ||
          stateType === "closing_tag_incomplete") &&
        parserContext.incompleteStructureState.partial &&
        parserContext.incompleteStructureState.partial.trim().length > 0
      ) {
        // An unfinished tag at EOF is kept as literal text.
        isReturnPartial = true
        const fragment = parserContext.incompleteStructureState.partial
        let fragmentAddedToExistingText = false

        if (
          parserContext.currentPointer &&
          typeof parserContext.currentPointer === "object" &&
          !Array.isArray(parserContext.currentPointer)
        ) {
          const textNodeName = parserContext.customOptions.textNodeName
          if (typeof parserContext.currentPointer[textNodeName] === "string") {
            if (!parserContext.currentPointer[textNodeName].endsWith(fragment)) {
              // Check to prevent duplication
              parserContext.currentPointer[textNodeName] += fragment
            }
          } else if (Array.isArray(parserContext.currentPointer[textNodeName])) {
            const lastTextItemIdx = parserContext.currentPointer[textNodeName].length - 1
            if (
              lastTextItemIdx >= 0 &&
              typeof parserContext.currentPointer[textNodeName][lastTextItemIdx] === "string"
            ) {
              if (!parserContext.currentPointer[textNodeName][lastTextItemIdx].endsWith(fragment)) {
                // Check to prevent duplication
                parserContext.currentPointer[textNodeName][lastTextItemIdx] += fragment
              }
            } else {
              // If no string to append to, add new
              addValueToObject(
                parserContext.currentPointer,
                textNodeName,
                fragment,
                parserContext.customOptions,
              )
            }
          } else {
            // No text node yet or it's not an array/string, add new
            addValueToObject(
              parserContext.currentPointer,
              textNodeName,
              fragment,
              parserContext.customOptions,
            )
          }
          // Every path above stored (or already contained) the fragment on the current node.
          fragmentAddedToExistingText = true
        }

        if (!fragmentAddedToExistingText && parserContext.accumulator.length > 0) {
          const lastAccItem = parserContext.accumulator[parserContext.accumulator.length - 1]
          if (typeof lastAccItem === "string") {
            if (!lastAccItem.endsWith(fragment)) {
              // Check to prevent duplication
              parserContext.accumulator[parserContext.accumulator.length - 1] += fragment
            }
          } else {
            parserContext.accumulator.push(fragment)
          }
        } else if (!fragmentAddedToExistingText) {
          parserContext.accumulator.push(fragment)
        }

        finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : []
      }
    } else if (parserContext.tagStack.length > 0) {
      isReturnPartial = true
    } else {
      if (isReturnPartial && !(parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState)) {
        isReturnPartial = false
      }
    }

    // This block handles finalXmlContent structure if parsing is complete (not partial)
    if (!isReturnPartial) {
      const effectiveBufferContent =
        parserContext.streamingBufferBeforeClear ||
        parserContext.streamingBuffer ||
        parserContext._rootDeterminationBuffer
      // Strip XML declarations, comments and DOCTYPE declarations before checking for leftover text.
      const tempBufferForNullCheck = effectiveBufferContent
        .replace(/<\?xml[^?]*\?>/g, "")
        .replace(/<!--[\s\S]*?-->/g, "")
        .replace(/<!DOCTYPE[^>]*>/g, "")
        .trim()

      if (isSpecialOnlyAtEOF) {
        finalXmlContent = []
      } else if (parserContext.accumulator.length === 0 && tempBufferForNullCheck === "") {
        finalXmlContent = []
      } else if (
        parserContext.accumulator.length === 0 &&
        tempBufferForNullCheck !== "" &&
        !parserContext._treatAsPlainText
      ) {
        if (parserContext.customOptions.alwaysCreateTextNode) {
          finalXmlContent = [{ [parserContext.customOptions.textNodeName]: tempBufferForNullCheck }]
        } else {
          finalXmlContent = [tempBufferForNullCheck]
        }
      }
      // If accumulator has content, finalXmlContent is already set from it.

      // Stream is complete: reset per-stream state.
      parserContext.streamingBuffer = ""
      parserContext.parsingIndex = 0
      parserContext._activelyStreaming = false
      parserContext._originalBufferHadContent = false
      parserContext.incompleteStructureState = null
      parserContext.streamingBufferBeforeClear = ""
      parserContext._lastClearedIncompleteStateWasSpecial = isSpecialOnlyAtEOF
      parserContext._rootDeterminationBuffer = ""
    } else {
      // Still partial at EOF
      // Ensure finalXmlContent reflects the accumulator, which might have been modified by fragment addition
      finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : []
      // If it's still just a single string fragment in accumulator and alwaysCreateTextNode is true, wrap it.
      if (
        finalXmlContent.length === 1 &&
        typeof finalXmlContent[0] === "string" &&
        parserContext.customOptions.alwaysCreateTextNode
      ) {
        finalXmlContent = [{ [parserContext.customOptions.textNodeName]: finalXmlContent[0] }]
      }
      if (parserContext.incompleteStructureState) parserContext.reparsedSegmentContext = null
    }
  }

  const result: ParseResult = {
    metadata: { partial: isReturnPartial },
    xml: finalXmlContent,
  }

  if (xmlChunk === null && !result.metadata.partial) {
    if (isSpecialOnlyAtEOF) {
      result.xml = []
    } else if (
      result.xml &&
      result.xml.length === 0 &&
      !parserContext._originalBufferHadContent &&
      (parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() === ""
    ) {
      result.xml = []
    }
  }

  return result
}