partial-xml-stream-parser

Version:

A lenient XML stream parser for Node.js and browsers that can handle incomplete or malformed XML data, with depth control, CDATA support for XML serialization and round-trip parsing, wildcard pattern support for stopNodes, and CDATA handling within stopNo

github.com/samhvw8/partial-xml-stream-parser

samhvw8/partial-xml-stream-parser

690 lines (627 loc) • 24.4 kB

text/typescript

import { addValueToObject } from "./dom-builder" import { tryParsePrimitive, decodeXmlEntities, parseAttributes, processCDATAInStopnode } from "./utils" import { STATIC_OPENING_TAG_REGEX, STATIC_CLOSING_TAG_REGEX, COMMON_ENTITIES } from "./constants" import { ParserContext, SpecialPrefixResult, TagHandlerResult } from "./types" export function handleSpecialPrefixes( parserContext: ParserContext, buffer: string, charAfterLT: string, ): SpecialPrefixResult { const i = parserContext.parsingIndex const len = buffer.length const textNodeName = parserContext.customOptions.textNodeName if (charAfterLT === "?") { if (buffer.startsWith("<?xml", i)) { const endDeclaration = buffer.indexOf("?>", i + 5) if (endDeclaration === -1) { parserContext.incompleteStructureState = { type: "xmldecl", lookingFor: "?>", at: i, partial: buffer.substring(i, len), } parserContext.parsingIndex = len return { matched: true, shouldReturn: true, shouldContinue: false } } parserContext.parsingIndex = endDeclaration + 2 parserContext.incompleteStructureState = null return { matched: true, shouldReturn: false, shouldContinue: true } } } else if (charAfterLT === "!") { if (i + 3 < len && buffer[i + 2] === "-" && buffer[i + 3] === "-") { // ", i + 4) if (commentEnd === -1) { parserContext.incompleteStructureState = { type: "comment", lookingFor: "-->", at: i, partial: buffer.substring(i, len), } parserContext.parsingIndex = len return { matched: true, shouldReturn: true, shouldContinue: false } } parserContext.parsingIndex = commentEnd + 3 parserContext.incompleteStructureState = null return { matched: true, shouldReturn: false, shouldContinue: true } } else if (buffer.startsWith("<![CDATA[", i)) { const cdataOpenTagEnd = i + 9 const cdataCloseMarker = "]]>" const cdataEnd = buffer.indexOf(cdataCloseMarker, cdataOpenTagEnd) if (cdataEnd === -1) { // Incomplete CDATA const partialContent = buffer.substring(cdataOpenTagEnd, len) let currentPartialData = "" if ( parserContext.incompleteStructureState && parserContext.incompleteStructureState.type === "cdata" && parserContext.incompleteStructureState.at === i ) { currentPartialData = parserContext.incompleteStructureState.partialData || "" } parserContext.incompleteStructureState = { type: "cdata", lookingFor: cdataCloseMarker, at: i, partialData: currentPartialData + partialContent, } if (partialContent.length > 0) { if (parserContext.tagStack.length > 0 && parserContext.currentPointer) { addValueToObject( parserContext.currentPointer, textNodeName, partialContent, parserContext.customOptions, ) } } parserContext.parsingIndex = len return { matched: true, shouldReturn: true, shouldContinue: false } } else { // CDATA finished in this chunk let text = buffer.substring(cdataOpenTagEnd, cdataEnd) parserContext.parsingIndex = cdataEnd + cdataCloseMarker.length const prevPartialData = parserContext.incompleteStructureState && parserContext.incompleteStructureState.type === "cdata" && parserContext.incompleteStructureState.at === i ? parserContext.incompleteStructureState.partialData : "" const fullTextContent = (prevPartialData || "") + text const processedCDATA = parserContext.customOptions.parsePrimitives ? tryParsePrimitive(fullTextContent) : fullTextContent if (fullTextContent.length > 0) { if (parserContext.tagStack.length > 0 && parserContext.currentPointer) { addValueToObject( parserContext.currentPointer, textNodeName, processedCDATA, parserContext.customOptions, ) } else if (parserContext.tagStack.length === 0) { parserContext.accumulator.push(processedCDATA) } } parserContext.incompleteStructureState = null return { matched: true, shouldReturn: false, shouldContinue: true } } } else if (buffer.startsWith("<!DOCTYPE", i)) { const endDoctype = buffer.indexOf(">", i + 9) if (endDoctype === -1) { parserContext.incompleteStructureState = { type: "doctype", lookingFor: ">", at: i, partial: buffer.substring(i, len), } parserContext.parsingIndex = len return { matched: true, shouldReturn: true, shouldContinue: false } } parserContext.parsingIndex = endDoctype + 1 parserContext.incompleteStructureState = null return { matched: true, shouldReturn: false, shouldContinue: true } } } return { matched: false, shouldReturn: false, shouldContinue: false } } export function handleClosingTag(parserContext: ParserContext, tagString: string): boolean { const textNodeName = parserContext.customOptions.textNodeName // 1. Initial Check & Syntax Validation const match = tagString.match(STATIC_CLOSING_TAG_REGEX) if (!match) { // Not syntactically valid, return false to allow fallback to handleFallbackText return false } const closingTagName = match[1] // 2. Check if tagStack is empty if (parserContext.tagStack.length === 0) { // No open parent, return false return false } // 3. Search the tagStack for a Match let matchIndex = -1 for (let i = parserContext.tagStack.length - 1; i >= 0; i--) { if (parserContext.tagStack[i].tagName === closingTagName) { matchIndex = i break } } // 4. If a Match is Found if (matchIndex !== -1) { // Handle Interrupted Tags - only process the topmost instances of each tag name const processedTagNames = new Set<string>() for (let i = parserContext.tagStack.length - 1; i > matchIndex; i--) { const interruptedTagState = parserContext.tagStack[i] const parentOfInterruptedTagState = parserContext.tagStack[i - 1] // Skip if we've already processed a tag with this name (process only the topmost) if (processedTagNames.has(interruptedTagState.tagName)) { continue } processedTagNames.add(interruptedTagState.tagName) // Construct the text representation of the opening tag const openingTagText = "<" + interruptedTagState.tagName + ">" if (parentOfInterruptedTagState && parentOfInterruptedTagState.objPtr) { // Remove the interrupted tag's object from its parent if it exists const parentObj = parentOfInterruptedTagState.objPtr const tagName = interruptedTagState.tagName if (Object.prototype.hasOwnProperty.call(parentObj, tagName)) { // Only remove empty objects or objects that were just created const shouldRemove = !interruptedTagState.objPtr || Object.keys(interruptedTagState.objPtr).length === 0 || (Object.keys(interruptedTagState.objPtr).length === 1 && Object.prototype.hasOwnProperty.call(interruptedTagState.objPtr, tagName) && Object.keys(interruptedTagState.objPtr[tagName]).length === 0) if (shouldRemove) { // If it's an array, remove the interrupted tag's object if (Array.isArray(parentObj[tagName])) { const arr = parentObj[tagName] const idx = arr.indexOf(interruptedTagState.objPtr) if (idx !== -1) { arr.splice(idx, 1) // If array becomes empty, remove the property if (arr.length === 0) { delete parentObj[tagName] } } } else if (parentObj[tagName] === interruptedTagState.objPtr) { // If it's a direct reference, remove it delete parentObj[tagName] } } } // Add the opening tag as text content to the parent addValueToObject( parentOfInterruptedTagState.objPtr, textNodeName, openingTagText, parserContext.customOptions, ) parentOfInterruptedTagState.textOnly = false } } // Pop Tags: Pop all tags from matchIndex to the top of the stack (inclusive) parserContext.tagStack.length = matchIndex // Get the matched tag state before popping const matchedTagState = parserContext.tagStack[matchIndex] // Pop Tags: Pop all tags from matchIndex to the top of the stack (inclusive) parserContext.tagStack.length = matchIndex // Handle text-only optimization for the matched tag (similar to original logic) if ( !parserContext.customOptions.alwaysCreateTextNode && matchedTagState.textOnly && Object.prototype.hasOwnProperty.call(matchedTagState.objPtr, textNodeName) && Object.keys(matchedTagState.objPtr).length === 1 ) { const textVal = matchedTagState.objPtr[textNodeName] if (parserContext.tagStack.length > 0) { const currentParent = parserContext.tagStack[parserContext.tagStack.length - 1].objPtr for (const keyInParent in currentParent) { if (currentParent[keyInParent] === matchedTagState.objPtr) { currentParent[keyInParent] = textVal break } else if (Array.isArray(currentParent[keyInParent])) { const arr = currentParent[keyInParent] const idx = arr.indexOf(matchedTagState.objPtr) if (idx !== -1) { arr[idx] = textVal break } } } } else { for (let k = 0; k < parserContext.accumulator.length; k++) { if (typeof parserContext.accumulator[k] === "object" && parserContext.accumulator[k] !== null) { const rootTagNameFromAccumulator = Object.keys(parserContext.accumulator[k])[0] if ( rootTagNameFromAccumulator === matchedTagState.tagName && parserContext.accumulator[k][rootTagNameFromAccumulator] === matchedTagState.objPtr ) { parserContext.accumulator[k][rootTagNameFromAccumulator] = textVal break } } } } } // Update Context parserContext.currentPointer = parserContext.tagStack.length > 0 ? parserContext.tagStack[parserContext.tagStack.length - 1].objPtr : null parserContext.parsingIndex += tagString.length parserContext.incompleteStructureState = null parserContext.reparsedSegmentContext = null return true } // 5. If No Match is Found in the Entire Stack // The tagString itself becomes literal text if (parserContext.currentPointer) { addValueToObject(parserContext.currentPointer, textNodeName, tagString, parserContext.customOptions) // Mark the current top-of-stack tag as not textOnly parserContext.tagStack[parserContext.tagStack.length - 1].textOnly = false // Advance parsing index and clear incomplete states parserContext.parsingIndex += tagString.length parserContext.incompleteStructureState = null parserContext.reparsedSegmentContext = null return true } // This shouldn't happen since we checked tagStack.length > 0 earlier, // but return false as fallback return false } export function handleOpeningTag(parserContext: ParserContext, tagString: string, i: number): TagHandlerResult { const buffer = parserContext.streamingBuffer const len = buffer.length const textNodeName = parserContext.customOptions.textNodeName const attributeNamePrefix = parserContext.customOptions.attributeNamePrefix !== undefined ? parserContext.customOptions.attributeNamePrefix : "@" const match = tagString.match(STATIC_OPENING_TAG_REGEX) if (match) { const tagName = match[1] if ( parserContext.reparsedSegmentContext && parserContext.reparsedSegmentContext.parentContext && parserContext.reparsedSegmentContext.partialText !== undefined && parserContext.currentPointer === parserContext.reparsedSegmentContext.parentContext ) { const { partialText, parentContext } = parserContext.reparsedSegmentContext const textNodeNameToUse = parserContext.customOptions.textNodeName if (Object.prototype.hasOwnProperty.call(parentContext, textNodeNameToUse)) { const currentTextNodeValue = parentContext[textNodeNameToUse] if (typeof currentTextNodeValue === "string") { if (currentTextNodeValue.endsWith(partialText)) { const newTextValue = currentTextNodeValue.slice(0, -partialText.length) if (newTextValue === "") { delete parentContext[textNodeNameToUse] } else { parentContext[textNodeNameToUse] = newTextValue } } } else if (Array.isArray(currentTextNodeValue)) { let foundAndRemoved = false for (let k = currentTextNodeValue.length - 1; k >= 0; k--) { if (currentTextNodeValue[k] === partialText) { currentTextNodeValue.splice(k, 1) foundAndRemoved = true break } } if (foundAndRemoved && currentTextNodeValue.length === 0) { delete parentContext[textNodeNameToUse] } } } } parserContext.reparsedSegmentContext = null const attributesString = (match[2] || "").trim() const isSelfClosing = match[3] === "/" const parsedAttributes = parseAttributes( attributesString, attributeNamePrefix, parserContext.customOptions, parserContext.attrRegex, decodeXmlEntities, tryParsePrimitive, COMMON_ENTITIES, ) const parentPath = parserContext.tagStack.length > 0 ? parserContext.tagStack[parserContext.tagStack.length - 1].path : "" const currentPath = parentPath ? `${parentPath}.${tagName}` : tagName const isSimpleStopNode = parserContext.simpleStopNodes.has(tagName) // Check for path stopnode matches - exact matches, suffix matches, and wildcard patterns let isPathStopNode = parserContext.pathStopNodes.has(currentPath) // If no exact match, check for suffix matches and wildcard patterns if (!isPathStopNode) { for (const pathStopNode of parserContext.pathStopNodes) { // Check for wildcard patterns (e.g., "a.*", "*.suggest", "a.*.c") if (pathStopNode.includes("*")) { // Convert glob pattern to regex const regexPattern = pathStopNode .replace(/\./g, "\\.") // Escape dots .replace(/\*/g, "[^.]*") // Replace * with non-dot characters // Check both exact match and suffix match for wildcard patterns const exactRegex = new RegExp(`^${regexPattern}$`) const suffixRegex = new RegExp(`\\.${regexPattern}$`) if (exactRegex.test(currentPath) || suffixRegex.test(currentPath)) { isPathStopNode = true break } } // Check for suffix matches (existing logic) else if ( currentPath.endsWith(pathStopNode) && (currentPath === pathStopNode || currentPath.endsWith("." + pathStopNode)) ) { isPathStopNode = true break } } } // Check if maxDepth is exceeded - if so, treat as fallback text // tagStack.length represents the current nesting depth (0-based) // maxDepth=1: allow depth 0 only, treat depth 1+ as text (tagStack.length > 1) // maxDepth=2: allow depths 0,1,2 only, treat depth 3+ as text (tagStack.length > 2) // maxDepth=3: allow depths 0,1,2,3 only, treat depth 4+ as text (tagStack.length > 3) const isMaxDepthExceeded = parserContext.customOptions.maxDepth !== null && parserContext.customOptions.maxDepth !== undefined && parserContext.tagStack.length > parserContext.customOptions.maxDepth if (isMaxDepthExceeded) { // Treat the entire tag as fallback text instead of processing it return { processed: false, shouldReturn: false } } const isStopNode = !isSelfClosing && (isSimpleStopNode || isPathStopNode) if (isStopNode) { const stopNodeObject = { ...parsedAttributes } if (parserContext.tagStack.length === 0) { parserContext.accumulator.push({ [tagName]: stopNodeObject }) } else { addValueToObject(parserContext.currentPointer, tagName, stopNodeObject, parserContext.customOptions) } const openTagEndOffset = tagString.length const contentStartIndex = i + openTagEndOffset let depth = 1 let searchPos = contentStartIndex let rawContentEnd = -1 let closingTagLengthVal = 0 // CDATA-aware parsing instead of regex while (searchPos < len && depth > 0) { // Check for CDATA start if (buffer.startsWith("<![CDATA[", searchPos)) { // Skip over CDATA section const cdataEnd = buffer.indexOf("]]>", searchPos + 9) if (cdataEnd === -1) { // CDATA is incomplete, break out break } searchPos = cdataEnd + 3 continue } // Look for next tag const nextLT = buffer.indexOf("<", searchPos) if (nextLT === -1) break searchPos = nextLT // Check if it's a closing tag for our stopnode const closingTagPattern = new RegExp(`^<\\/\\s*${tagName}\\s*>`, "i") const openingTagPattern = new RegExp(`^<\\s*${tagName}(?:\\s[^>]*)?>`, "i") const closingMatch = buffer.substring(searchPos).match(closingTagPattern) if (closingMatch) { depth-- if (depth === 0) { rawContentEnd = searchPos closingTagLengthVal = closingMatch[0].length break } searchPos += closingMatch[0].length continue } const openingMatch = buffer.substring(searchPos).match(openingTagPattern) if (openingMatch) { // Check if it's self-closing if (!/\/\s*>$/.test(openingMatch[0])) { depth++ } searchPos += openingMatch[0].length continue } // Not a tag we care about, move to next character searchPos++ } if (rawContentEnd !== -1) { const rawContent = buffer.substring(contentStartIndex, rawContentEnd) // Process CDATA content in stopnodes const processedContent = processCDATAInStopnode(rawContent) addValueToObject(stopNodeObject, textNodeName, processedContent, parserContext.customOptions) parserContext.parsingIndex = rawContentEnd + closingTagLengthVal parserContext.incompleteStructureState = null } else { const newPartialContent = buffer.substring(contentStartIndex, len) // Process CDATA content in partial stopnode content const processedPartialContent = processCDATAInStopnode(newPartialContent) addValueToObject(stopNodeObject, textNodeName, processedPartialContent, parserContext.customOptions) parserContext.incompleteStructureState = { type: "stop_node_content", tagName, depth, contentStartIndex, stopNodeObjectRef: stopNodeObject, at: i, } parserContext.parsingIndex = len return { processed: true, shouldReturn: true } } } else { // Regular opening tag const newObjShell = { ...parsedAttributes } if (parserContext.tagStack.length === 0) { parserContext.accumulator.push({ [tagName]: newObjShell }) if (!isSelfClosing) { parserContext.tagStack.push({ tagName, objPtr: newObjShell, path: currentPath, textOnly: true, }) parserContext.currentPointer = newObjShell } else { parserContext.currentPointer = null } } else { if (parserContext.tagStack.length > 0) parserContext.tagStack[parserContext.tagStack.length - 1].textOnly = false addValueToObject(parserContext.currentPointer, tagName, newObjShell, parserContext.customOptions) if (!isSelfClosing) { parserContext.tagStack.push({ tagName, objPtr: newObjShell, path: currentPath, textOnly: true, }) parserContext.currentPointer = newObjShell } } parserContext.parsingIndex = i + tagString.length parserContext.incompleteStructureState = null } return { processed: true, shouldReturn: false } } return { processed: false, shouldReturn: false } } export function handleFallbackText( parserContext: ParserContext, buffer: string, startIndex: number, textNodeName: string, ): number { let endOfProblematicText = buffer.indexOf("<", startIndex + 1) if (endOfProblematicText === -1) endOfProblematicText = buffer.length const fullFallbackText = buffer.substring(startIndex, endOfProblematicText) let textToProcessAsContent = fullFallbackText parserContext.incompleteStructureState = null if (endOfProblematicText === buffer.length && fullFallbackText.startsWith("<")) { if (fullFallbackText === "<") { parserContext.incompleteStructureState = { type: "tag_start_incomplete", at: startIndex, partial: "<", } } else if (fullFallbackText.startsWith("</")) { if (fullFallbackText.indexOf(">") === -1) { parserContext.incompleteStructureState = { type: "closing_tag_incomplete", at: startIndex, partial: fullFallbackText, } } } else if (fullFallbackText.startsWith("<")) { const potentialTagNameMatch = fullFallbackText.match(/^<([\w:-]+)/) if (potentialTagNameMatch && fullFallbackText.indexOf(">") === -1) { parserContext.incompleteStructureState = { type: "opening_tag_incomplete", at: startIndex, partial: fullFallbackText, } } } if (parserContext.incompleteStructureState) { textToProcessAsContent = "" if (parserContext.tagStack.length > 0 && parserContext.currentPointer) { parserContext.incompleteStructureState.parentOfPartial = parserContext.currentPointer const fragmentText = parserContext.incompleteStructureState.partial if (fragmentText && fragmentText.length > 0) { const decodedFragment = decodeXmlEntities(fragmentText, COMMON_ENTITIES) let processedFragment = parserContext.customOptions.parsePrimitives && typeof decodedFragment === "string" ? tryParsePrimitive(decodedFragment) : decodedFragment parserContext.incompleteStructureState.processedPartialForCleanup = processedFragment let skipAddingProvisionalText = false if ( parserContext.reparsedSegmentContext && parserContext.reparsedSegmentContext.parentContext === parserContext.incompleteStructureState.parentOfPartial && parserContext.reparsedSegmentContext.partialText === processedFragment ) { skipAddingProvisionalText = true } if ( !skipAddingProvisionalText && parserContext.incompleteStructureState.parentOfPartial === parserContext.currentPointer ) { addValueToObject( parserContext.currentPointer, textNodeName, processedFragment, parserContext.customOptions, ) if ( parserContext.tagStack.length > 0 && parserContext.tagStack[parserContext.tagStack.length - 1].objPtr === parserContext.currentPointer ) { parserContext.tagStack[parserContext.tagStack.length - 1].textOnly = false } } } } else if (parserContext.tagStack.length === 0) { parserContext.incompleteStructureState.parentOfPartial = parserContext.accumulator } } } if (textToProcessAsContent.length > 0) { const decodedText = decodeXmlEntities(textToProcessAsContent, COMMON_ENTITIES) if (decodedText.trim().length > 0) { let processedContent = parserContext.customOptions.parsePrimitives ? tryParsePrimitive(decodedText) : decodedText if (parserContext.tagStack.length > 0 && parserContext.currentPointer) { if (parserContext.tagStack.length > 0) parserContext.tagStack[parserContext.tagStack.length - 1].textOnly = false addValueToObject( parserContext.currentPointer, textNodeName, processedContent, parserContext.customOptions, ) } else if (parserContext.tagStack.length === 0) { parserContext.accumulator.push(processedContent) } } } return endOfProblematicText } export function handleTextNode(parserContext: ParserContext, i: number): void { const buffer = parserContext.streamingBuffer const len = buffer.length const textNodeName = parserContext.customOptions.textNodeName let textEnd = buffer.indexOf("<", i) if (textEnd === -1) textEnd = len const rawText = buffer.substring(i, textEnd) if (rawText.length > 0) { const decodedText = decodeXmlEntities(rawText, COMMON_ENTITIES) if (decodedText.trim().length > 0) { let processedContent = parserContext.customOptions.parsePrimitives ? tryParsePrimitive(decodedText) : decodedText if (parserContext.tagStack.length > 0 && parserContext.currentPointer) { if (parserContext.tagStack.length > 0) { // Ensure parent tag is not marked as textOnly parserContext.tagStack[parserContext.tagStack.length - 1].textOnly = false } addValueToObject( parserContext.currentPointer, textNodeName, processedContent, parserContext.customOptions, ) } else if (parserContext.tagStack.length === 0) { // Text node at the root level parserContext.accumulator.push(processedContent) } } } parserContext.parsingIndex = textEnd parserContext.incompleteStructureState = null }