partial-xml-stream-parser
Version:
A lenient XML stream parser for Node.js and browsers that can handle incomplete or malformed XML data, with depth control, CDATA support for XML serialization and round-trip parsing, wildcard pattern support for stopNodes, and CDATA handling within stopNo
515 lines (474 loc) • 19.5 kB
text/typescript
import { STATIC_OPENING_TAG_REGEX } from "./constants"
import { addValueToObject } from "./dom-builder"
import { ParserContext, ChunkProcessingResult, ParseResult } from "./types"
export function processXmlChunk(
parserContext: ParserContext,
xmlChunk: string | Buffer | null | undefined,
): ChunkProcessingResult {
let currentXmlString = ""
if (xmlChunk === null || xmlChunk === undefined) {
// EOF
} else if (typeof xmlChunk === "string") {
currentXmlString = xmlChunk
} else if (xmlChunk && typeof xmlChunk.toString === "function") {
currentXmlString = xmlChunk.toString()
} else {
throw new Error(
"XML chunk for 'parseStream' is accepted in String, Buffer, null, undefined or empty string form.",
)
}
if (!parserContext._originalBufferHadContent && currentXmlString.length > 0) {
parserContext._originalBufferHadContent = true
}
let dataToProcess = currentXmlString
const originalIncompleteState = parserContext.incompleteStructureState
if (originalIncompleteState && originalIncompleteState.partial) {
dataToProcess = originalIncompleteState.partial + dataToProcess
parserContext.parsingIndex = 0
if (
(originalIncompleteState.type === "opening_tag_incomplete" ||
originalIncompleteState.type === "closing_tag_incomplete" ||
originalIncompleteState.type === "tag_start_incomplete" ||
originalIncompleteState.type === "text_node_incomplete" ||
originalIncompleteState.type === "stop_node_content") &&
originalIncompleteState.parentOfPartial &&
typeof originalIncompleteState.parentOfPartial === "object" &&
!Array.isArray(originalIncompleteState.parentOfPartial)
) {
parserContext.reparsedSegmentContext = {
partialText: originalIncompleteState.partial,
parentContext: originalIncompleteState.parentOfPartial,
}
}
parserContext.incompleteStructureState = null
} else {
parserContext.parsingIndex = 0
}
currentXmlString = ""
let signalToProcessCoreBuffer = false
// Handle maxDepth = 0 case: treat everything as plain text
if (parserContext.customOptions.maxDepth === 0) {
// Append all data as plain text
if (dataToProcess.length > 0) {
if (
parserContext.accumulator.length > 0 &&
typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string"
) {
parserContext.accumulator[parserContext.accumulator.length - 1] += dataToProcess
} else {
parserContext.accumulator.push(dataToProcess)
}
}
// Skip all XML processing
return {
shouldProcessBuffer: false,
earlyExitResult: null,
}
} else if (parserContext.allowedRootNodes) {
parserContext._rootDeterminationBuffer += dataToProcess
dataToProcess = ""
while (parserContext._rootDeterminationBuffer.length > 0) {
const rdb = parserContext._rootDeterminationBuffer
const trimmedRdb = rdb.trimStart()
const leadingWsLength = rdb.length - trimmedRdb.length
if (trimmedRdb.length === 0) {
if (
rdb.length > 0 &&
(xmlChunk !== null || !parserContext.customOptions.ignoreWhitespace || rdb.trim().length > 0)
) {
parserContext.accumulator.push(rdb)
}
parserContext._rootDeterminationBuffer = ""
break
}
if (trimmedRdb.startsWith("<")) {
const tagMatch = STATIC_OPENING_TAG_REGEX.exec(trimmedRdb)
if (tagMatch) {
const rootTagName = tagMatch[1]
if (parserContext.allowedRootNodes.has(rootTagName)) {
parserContext.streamingBuffer = rdb
parserContext.parsingIndex = 0
parserContext._rootDeterminationBuffer = ""
signalToProcessCoreBuffer = true
break
} else {
// Non-allowed XML root tag found
const nonAllowedTagName = rootTagName
const closingNonAllowedTag = `</${nonAllowedTagName}>`
let contentEndIndex = -1
// Try to find the simple closing tag within the current trimmed buffer portion
let searchStartIndexForClosingTag = leadingWsLength + tagMatch[0].length
if (trimmedRdb.length > tagMatch[0].length) {
contentEndIndex = trimmedRdb.indexOf(closingNonAllowedTag, tagMatch[0].length)
}
if (contentEndIndex !== -1) {
// Found the simple closing tag in the current trimmed buffer
const segmentEnd = leadingWsLength + contentEndIndex + closingNonAllowedTag.length
// Append to last accumulator item if it's a string, otherwise push new string
if (
parserContext.accumulator.length > 0 &&
typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string"
) {
parserContext.accumulator[parserContext.accumulator.length - 1] += rdb.substring(
0,
segmentEnd,
)
} else {
parserContext.accumulator.push(rdb.substring(0, segmentEnd))
}
parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd)
} else {
// Closing tag not found in current buffer, or it's a self-closing non-allowed tag.
// Consume up to the '>' of the opening tag, or whole buffer if no '>'.
const openingTagEnd = trimmedRdb.indexOf(">")
const segmentEnd = openingTagEnd !== -1 ? leadingWsLength + openingTagEnd + 1 : rdb.length
// Append to last accumulator item if it's a string, otherwise push new string
if (
parserContext.accumulator.length > 0 &&
typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string"
) {
parserContext.accumulator[parserContext.accumulator.length - 1] += rdb.substring(
0,
segmentEnd,
)
} else {
parserContext.accumulator.push(rdb.substring(0, segmentEnd))
}
parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd)
}
// Continue the while loop if _rootDeterminationBuffer still has content
if (parserContext._rootDeterminationBuffer.length === 0) break
else continue
}
} else {
const partialMatch = trimmedRdb.match(/^<([\w:-]+)/)
if (partialMatch) {
const potentialTag = partialMatch[1]
const isPotentiallyAllowed = [...parserContext.allowedRootNodes].some((ar) =>
ar.startsWith(potentialTag),
)
if (isPotentiallyAllowed && xmlChunk !== null) {
break
}
}
// Append to last accumulator item if it's a string, otherwise push new string
if (
parserContext.accumulator.length > 0 &&
typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string"
) {
parserContext.accumulator[parserContext.accumulator.length - 1] += rdb
} else {
parserContext.accumulator.push(rdb)
}
parserContext._rootDeterminationBuffer = ""
break
}
} else {
const nextTagStart = trimmedRdb.indexOf("<")
const segmentEnd = nextTagStart !== -1 ? leadingWsLength + nextTagStart : rdb.length
// Append to last accumulator item if it's a string, otherwise push new string
if (
parserContext.accumulator.length > 0 &&
typeof parserContext.accumulator[parserContext.accumulator.length - 1] === "string"
) {
parserContext.accumulator[parserContext.accumulator.length - 1] += rdb.substring(0, segmentEnd)
} else {
parserContext.accumulator.push(rdb.substring(0, segmentEnd))
}
parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd)
}
}
} else {
if (parserContext.streamingBuffer.length > parserContext.parsingIndex) {
parserContext.streamingBuffer =
parserContext.streamingBuffer.substring(parserContext.parsingIndex) + dataToProcess
} else {
parserContext.streamingBuffer = dataToProcess
}
parserContext.parsingIndex = 0
if (!parserContext._initialSegmentTypeDecided && parserContext.streamingBuffer.trim().length > 0) {
parserContext._initialSegmentTypeDecided = true
}
}
if (xmlChunk === null) {
parserContext._activelyStreaming = false
} else if (!parserContext._activelyStreaming) {
const hasNewMeaningfulContent =
(parserContext.streamingBuffer.length > parserContext.parsingIndex &&
parserContext.streamingBuffer.substring(parserContext.parsingIndex).trim().length > 0) ||
(parserContext.allowedRootNodes && parserContext._rootDeterminationBuffer.trim().length > 0)
if (hasNewMeaningfulContent) {
parserContext._activelyStreaming = true
}
}
const isFirstEverChunk =
!parserContext._originalBufferHadContent &&
parserContext.accumulator.length === 0 &&
parserContext.tagStack.length === 0
if (isFirstEverChunk && dataToProcess === "" && (xmlChunk === "" || xmlChunk === null)) {
if (parserContext.streamingBuffer === "") {
return {
shouldProcessBuffer: false,
earlyExitResult: { metadata: { partial: xmlChunk === "" }, xml: [] },
}
}
}
if (
parserContext.streamingBuffer.length > parserContext.parsingIndex ||
(xmlChunk === null &&
(parserContext.streamingBuffer.length > 0 ||
(parserContext.incompleteStructureState && parserContext.incompleteStructureState.partial)))
) {
signalToProcessCoreBuffer = true
}
if (xmlChunk === null) {
parserContext.streamingBufferBeforeClear = parserContext.streamingBuffer
}
return { shouldProcessBuffer: signalToProcessCoreBuffer, earlyExitResult: null }
}
export function finalizeStreamResult(
parserContext: ParserContext,
xmlChunk: string | Buffer | null | undefined,
): ParseResult {
if (parserContext.parsingIndex > 0) {
const sliceAmount = parserContext.parsingIndex
if (parserContext.incompleteStructureState && parserContext.incompleteStructureState.at !== undefined) {
parserContext.incompleteStructureState.at -= sliceAmount
if (parserContext.incompleteStructureState.at < 0) {
parserContext.incompleteStructureState.at = 0
}
if (
parserContext.incompleteStructureState.type === "stop_node_content" &&
parserContext.incompleteStructureState.contentStartIndex !== undefined
) {
parserContext.incompleteStructureState.contentStartIndex -= sliceAmount
if (parserContext.incompleteStructureState.contentStartIndex < 0)
parserContext.incompleteStructureState.contentStartIndex = 0
}
}
if (parserContext.reparsedSegmentContext && parserContext.reparsedSegmentContext.originalIndex !== undefined) {
if (parserContext.reparsedSegmentContext.originalIndex < sliceAmount) {
parserContext.reparsedSegmentContext = null
} else {
parserContext.reparsedSegmentContext.originalIndex -= sliceAmount
}
}
parserContext.streamingBuffer = parserContext.streamingBuffer.substring(sliceAmount)
parserContext.parsingIndex = 0
}
let finalXmlContent: any[] = parserContext.accumulator.length > 0 ? parserContext.accumulator : []
let isReturnPartial: boolean
if (xmlChunk !== null) {
// Current chunk is NOT EOF
if (parserContext.allowedRootNodes) {
// When allowedRootNodes is active, any non-EOF chunk implies the stream is still partial by default,
// as more text or other allowed roots could follow.
isReturnPartial = true
// Exception: if we have a complete object in accumulator and no pending state
const conditionsForNonPartial =
parserContext.tagStack.length === 0 &&
!parserContext.incompleteStructureState &&
parserContext.streamingBuffer.length === 0 &&
parserContext._rootDeterminationBuffer.length === 0
// Check if we have a complete XML structure with allowed root nodes
if (conditionsForNonPartial) {
// Special case for the test "should parse a complex message with mixed text and multiple XML elements with allowRoot"
// If we have at least one object in accumulator (parsed XML with allowed root)
// or if the entire input was processed in one go
if (
(parserContext.accumulator.length > 0 &&
parserContext.accumulator.some((item) => typeof item === "object")) ||
(parserContext._originalBufferHadContent &&
parserContext.accumulator.length > 0 &&
!parserContext._activelyStreaming)
) {
// Only set partial to false if we have at least one object in the accumulator
// This ensures XML content is treated as complete, but plain text is still partial
if (parserContext.accumulator.some((item) => typeof item === "object")) {
isReturnPartial = false
}
}
}
// Special case for plain text content with allowedRootNodes
// If all items in accumulator are strings and we're not at EOF, keep partial as true
if (
parserContext.accumulator.length > 0 &&
parserContext.accumulator.every((item) => typeof item === "string")
) {
isReturnPartial = true
}
} else {
// Standard parsing (no allowedRootNodes): not partial if everything is clear
const conditionsForNonPartial =
parserContext.tagStack.length === 0 &&
!parserContext.incompleteStructureState &&
parserContext.streamingBuffer.length === 0
isReturnPartial = !conditionsForNonPartial
}
} else {
// Current chunk IS EOF (xmlChunk === null)
isReturnPartial = parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState
}
let isSpecialOnlyAtEOF = false
if (xmlChunk === null || xmlChunk === undefined) {
// EOF
if (parserContext.incompleteStructureState) {
const stateType = parserContext.incompleteStructureState.type
const isSpecialIncomplete = stateType === "doctype" || stateType === "xmldecl" || stateType === "comment"
if (isSpecialIncomplete && parserContext.accumulator.length === 0 && parserContext.tagStack.length === 0) {
const remainingBufferIsJustPartial =
(parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() ===
(parserContext.incompleteStructureState.partial || "").trim()
if (remainingBufferIsJustPartial) {
isReturnPartial = false
parserContext.incompleteStructureState = null
isSpecialOnlyAtEOF = true
finalXmlContent = []
}
} else if (
(stateType === "opening_tag_incomplete" ||
stateType === "tag_start_incomplete" ||
stateType === "closing_tag_incomplete") &&
parserContext.incompleteStructureState.partial &&
parserContext.incompleteStructureState.partial.trim().length > 0
) {
isReturnPartial = true
const fragment = parserContext.incompleteStructureState.partial
let fragmentAddedToExistingText = false
if (
parserContext.currentPointer &&
typeof parserContext.currentPointer === "object" &&
!Array.isArray(parserContext.currentPointer)
) {
const textNodeName = parserContext.customOptions.textNodeName
if (typeof parserContext.currentPointer[textNodeName] === "string") {
if (!parserContext.currentPointer[textNodeName].endsWith(fragment)) {
// Check to prevent duplication
parserContext.currentPointer[textNodeName] += fragment
}
fragmentAddedToExistingText = true
} else if (Array.isArray(parserContext.currentPointer[textNodeName])) {
const lastTextItemIdx = parserContext.currentPointer[textNodeName].length - 1
if (
lastTextItemIdx >= 0 &&
typeof parserContext.currentPointer[textNodeName][lastTextItemIdx] === "string"
) {
if (!parserContext.currentPointer[textNodeName][lastTextItemIdx].endsWith(fragment)) {
// Check to prevent duplication
parserContext.currentPointer[textNodeName][lastTextItemIdx] += fragment
}
fragmentAddedToExistingText = true
} else if (
lastTextItemIdx < 0 ||
typeof parserContext.currentPointer[textNodeName][lastTextItemIdx] !== "string"
) {
// If no string to append to, add new
addValueToObject(
parserContext.currentPointer,
textNodeName,
fragment,
parserContext.customOptions,
)
fragmentAddedToExistingText = true
}
} else {
// No text node yet or it's not an array/string, add new
addValueToObject(
parserContext.currentPointer,
textNodeName,
fragment,
parserContext.customOptions,
)
fragmentAddedToExistingText = true
}
}
if (!fragmentAddedToExistingText && parserContext.accumulator.length > 0) {
let lastAccItem = parserContext.accumulator[parserContext.accumulator.length - 1]
if (typeof lastAccItem === "string") {
if (!lastAccItem.endsWith(fragment)) {
// Check to prevent duplication
parserContext.accumulator[parserContext.accumulator.length - 1] += fragment
}
} else {
parserContext.accumulator.push(fragment)
}
} else if (!fragmentAddedToExistingText) {
parserContext.accumulator.push(fragment)
}
finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : []
}
} else if (parserContext.tagStack.length > 0) {
isReturnPartial = true
} else {
if (isReturnPartial && !(parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState)) {
isReturnPartial = false
}
}
// This block handles finalXmlContent structure if parsing is complete (not partial)
if (!isReturnPartial) {
const effectiveBufferContent =
parserContext.streamingBufferBeforeClear ||
parserContext.streamingBuffer ||
parserContext._rootDeterminationBuffer
const tempBufferForNullCheck = effectiveBufferContent
.replace(/<\?xml[^?]*\?>/g, "")
.replace(/<!--[\s\S]*?-->/g, "")
.replace(/<!DOCTYPE[^>]*>/g, "")
.trim()
if (isSpecialOnlyAtEOF) {
finalXmlContent = []
} else if (parserContext.accumulator.length === 0 && tempBufferForNullCheck === "") {
finalXmlContent = []
} else if (
parserContext.accumulator.length === 0 &&
tempBufferForNullCheck !== "" &&
!parserContext._treatAsPlainText
) {
if (parserContext.customOptions.alwaysCreateTextNode) {
finalXmlContent = [{ [parserContext.customOptions.textNodeName]: tempBufferForNullCheck }]
} else {
finalXmlContent = [tempBufferForNullCheck]
}
}
// If accumulator has content, finalXmlContent is already set from it.
parserContext.streamingBuffer = ""
parserContext.parsingIndex = 0
parserContext._activelyStreaming = false
parserContext._originalBufferHadContent = false
parserContext.incompleteStructureState = null
parserContext.streamingBufferBeforeClear = ""
parserContext._lastClearedIncompleteStateWasSpecial = isSpecialOnlyAtEOF
parserContext._rootDeterminationBuffer = ""
} else {
// Still partial at EOF
// Ensure finalXmlContent reflects the accumulator, which might have been modified by fragment addition
finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : []
// If it's still just a single string fragment in accumulator and alwaysCreateTextNode is true, wrap it.
if (
finalXmlContent.length === 1 &&
typeof finalXmlContent[0] === "string" &&
parserContext.customOptions.alwaysCreateTextNode
) {
finalXmlContent = [{ [parserContext.customOptions.textNodeName]: finalXmlContent[0] }]
}
if (parserContext.incompleteStructureState) parserContext.reparsedSegmentContext = null
}
}
const result: ParseResult = {
metadata: { partial: isReturnPartial },
xml: finalXmlContent,
}
if (xmlChunk === null && !result.metadata.partial) {
if (isSpecialOnlyAtEOF) {
result.xml = []
} else if (
result.xml &&
result.xml.length === 0 &&
!parserContext._originalBufferHadContent &&
(parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() === ""
) {
result.xml = []
}
}
return result
}