partial-xml-stream-parser
Version:
A lenient XML stream parser for Node.js and browsers that can handle incomplete or malformed XML data. It offers depth control, CDATA support for XML serialization and round-trip parsing, wildcard pattern support for stopNodes, and CDATA handling within stopNodes.
446 lines • 24.1 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.processXmlChunk = processXmlChunk;
exports.finalizeStreamResult = finalizeStreamResult;
const constants_1 = require("./constants");
const dom_builder_1 = require("./dom-builder");
/**
 * Converts an incoming chunk to a string.
 *
 * `null` and `undefined` (the end-of-stream markers) become the empty string,
 * strings are returned unchanged, and any other truthy value exposing a
 * `toString` function is stringified through it.
 *
 * @param {*} xmlChunk - The raw chunk handed to the parser.
 * @returns {string} The textual content of the chunk.
 * @throws {Error} When the chunk cannot be converted (for example `0` or `false`).
 */
function coerceChunkToString(xmlChunk) {
    if (xmlChunk === null || xmlChunk === undefined) {
        return "";
    }
    if (typeof xmlChunk === "string") {
        return xmlChunk;
    }
    if (xmlChunk && typeof xmlChunk.toString === "function") {
        return xmlChunk.toString();
    }
    throw new Error("XML chunk for 'parseStream' is accepted in String, Buffer, null, undefined or empty string form.");
}
/**
 * Appends `text` to the accumulator, merging it into the last entry when that
 * entry is already a string and pushing a new entry otherwise.
 *
 * @param {Array} accumulator - The parser's result accumulator (mutated in place).
 * @param {string} text - The text to append.
 */
function appendTextToAccumulator(accumulator, text) {
    const lastIndex = accumulator.length - 1;
    if (lastIndex >= 0 && typeof accumulator[lastIndex] === "string") {
        accumulator[lastIndex] += text;
    }
    else {
        accumulator.push(text);
    }
}
/**
 * Feeds one chunk into the parser context and decides whether the core buffer
 * processor has to run afterwards.
 *
 * The function
 *  - re-attaches the `partial` text of a pending incomplete structure in front
 *    of the new data and clears that pending state,
 *  - with `customOptions.maxDepth === 0`, stores the data verbatim as text in
 *    the accumulator and returns immediately,
 *  - with `allowedRootNodes` set, routes data through `_rootDeterminationBuffer`:
 *    text and non-allowed elements are copied to the accumulator as plain
 *    text, and the remaining data is handed to `streamingBuffer` once an
 *    allowed root tag is found,
 *  - otherwise appends the data to `streamingBuffer`.
 *
 * Both `null` and `undefined` are treated as end-of-stream everywhere in this
 * function (previously most checks only recognised `null`).
 *
 * @param {object} parserContext - Mutable parser state; updated in place.
 * @param {string|object|null|undefined} xmlChunk - New data, or `null`/`undefined` for EOF.
 * @returns {{shouldProcessBuffer: boolean, earlyExitResult: (object|null)}}
 *   `earlyExitResult` is non-null only for an empty first chunk on an untouched
 *   context; it then already holds the final `{ metadata, xml }` result.
 * @throws {Error} When the chunk cannot be converted to a string.
 */
function processXmlChunk(parserContext, xmlChunk) {
    const isEndOfStream = xmlChunk === null || xmlChunk === undefined;
    const incomingText = coerceChunkToString(xmlChunk);
    if (!parserContext._originalBufferHadContent && incomingText.length > 0) {
        parserContext._originalBufferHadContent = true;
    }
    let dataToProcess = incomingText;
    const originalIncompleteState = parserContext.incompleteStructureState;
    if (originalIncompleteState && originalIncompleteState.partial) {
        // Re-parse the unfinished fragment together with the new data.
        dataToProcess = originalIncompleteState.partial + dataToProcess;
        const reparsableTypes = [
            "opening_tag_incomplete",
            "closing_tag_incomplete",
            "tag_start_incomplete",
            "text_node_incomplete",
            "stop_node_content",
        ];
        const parentOfPartial = originalIncompleteState.parentOfPartial;
        if (reparsableTypes.includes(originalIncompleteState.type) &&
            parentOfPartial &&
            typeof parentOfPartial === "object" &&
            !Array.isArray(parentOfPartial)) {
            // Record which text is being parsed a second time and under which parent.
            parserContext.reparsedSegmentContext = {
                partialText: originalIncompleteState.partial,
                parentContext: parentOfPartial,
            };
        }
        parserContext.incompleteStructureState = null;
    }
    parserContext.parsingIndex = 0;
    let signalToProcessCoreBuffer = false;
    if (parserContext.customOptions.maxDepth === 0) {
        // maxDepth = 0: everything is plain text, no XML processing at all.
        if (dataToProcess.length > 0) {
            appendTextToAccumulator(parserContext.accumulator, dataToProcess);
        }
        return {
            shouldProcessBuffer: false,
            earlyExitResult: null,
        };
    }
    else if (parserContext.allowedRootNodes) {
        parserContext._rootDeterminationBuffer += dataToProcess;
        dataToProcess = "";
        while (parserContext._rootDeterminationBuffer.length > 0) {
            const rdb = parserContext._rootDeterminationBuffer;
            const trimmedRdb = rdb.trimStart();
            const leadingWsLength = rdb.length - trimmedRdb.length;
            if (trimmedRdb.length === 0) {
                // Whitespace-only buffer: kept unless we are at EOF with ignoreWhitespace on.
                if (!isEndOfStream || !parserContext.customOptions.ignoreWhitespace) {
                    parserContext.accumulator.push(rdb);
                }
                parserContext._rootDeterminationBuffer = "";
                break;
            }
            if (!trimmedRdb.startsWith("<")) {
                // Leading text: copy everything up to the next '<' (or all of it) as text.
                const nextTagStart = trimmedRdb.indexOf("<");
                const textEnd = nextTagStart !== -1 ? leadingWsLength + nextTagStart : rdb.length;
                appendTextToAccumulator(parserContext.accumulator, rdb.substring(0, textEnd));
                parserContext._rootDeterminationBuffer = rdb.substring(textEnd);
                continue;
            }
            const tagMatch = constants_1.STATIC_OPENING_TAG_REGEX.exec(trimmedRdb);
            if (!tagMatch) {
                // No complete opening tag yet. Wait for more data when the partial name
                // could still grow into an allowed root and the stream has not ended.
                const partialMatch = trimmedRdb.match(/^<([\w:-]+)/);
                if (partialMatch) {
                    const potentialTag = partialMatch[1];
                    const isPotentiallyAllowed = [...parserContext.allowedRootNodes].some((ar) => ar.startsWith(potentialTag));
                    if (isPotentiallyAllowed && !isEndOfStream) {
                        break;
                    }
                }
                appendTextToAccumulator(parserContext.accumulator, rdb);
                parserContext._rootDeterminationBuffer = "";
                break;
            }
            const rootTagName = tagMatch[1];
            if (parserContext.allowedRootNodes.has(rootTagName)) {
                // Allowed root found: hand the whole buffer over to the core processor.
                parserContext.streamingBuffer = rdb;
                parserContext.parsingIndex = 0;
                parserContext._rootDeterminationBuffer = "";
                signalToProcessCoreBuffer = true;
                break;
            }
            // Non-allowed root: copy it to the accumulator as text. Prefer the span up to
            // its simple closing tag; otherwise consume up to the '>' of the opening tag,
            // or the whole buffer when there is no '>'.
            const closingNonAllowedTag = `</${rootTagName}>`;
            let contentEndIndex = -1;
            if (trimmedRdb.length > tagMatch[0].length) {
                contentEndIndex = trimmedRdb.indexOf(closingNonAllowedTag, tagMatch[0].length);
            }
            let segmentEnd;
            if (contentEndIndex !== -1) {
                segmentEnd = leadingWsLength + contentEndIndex + closingNonAllowedTag.length;
            }
            else {
                const openingTagEnd = trimmedRdb.indexOf(">");
                segmentEnd = openingTagEnd !== -1 ? leadingWsLength + openingTagEnd + 1 : rdb.length;
            }
            appendTextToAccumulator(parserContext.accumulator, rdb.substring(0, segmentEnd));
            parserContext._rootDeterminationBuffer = rdb.substring(segmentEnd);
            // The loop condition re-checks whether anything is left to classify.
        }
    }
    else {
        if (parserContext.streamingBuffer.length > parserContext.parsingIndex) {
            parserContext.streamingBuffer =
                parserContext.streamingBuffer.substring(parserContext.parsingIndex) + dataToProcess;
        }
        else {
            parserContext.streamingBuffer = dataToProcess;
        }
        parserContext.parsingIndex = 0;
        if (!parserContext._initialSegmentTypeDecided && parserContext.streamingBuffer.trim().length > 0) {
            parserContext._initialSegmentTypeDecided = true;
        }
    }
    if (isEndOfStream) {
        parserContext._activelyStreaming = false;
    }
    else if (!parserContext._activelyStreaming) {
        const hasNewMeaningfulContent = (parserContext.streamingBuffer.length > parserContext.parsingIndex &&
            parserContext.streamingBuffer.substring(parserContext.parsingIndex).trim().length > 0) ||
            (parserContext.allowedRootNodes && parserContext._rootDeterminationBuffer.trim().length > 0);
        if (hasNewMeaningfulContent) {
            parserContext._activelyStreaming = true;
        }
    }
    // An empty chunk (or EOF) on a context that has never seen data yields an empty result at once.
    const isFirstEverChunk = !parserContext._originalBufferHadContent &&
        parserContext.accumulator.length === 0 &&
        parserContext.tagStack.length === 0;
    if (isFirstEverChunk &&
        dataToProcess === "" &&
        (xmlChunk === "" || isEndOfStream) &&
        parserContext.streamingBuffer === "") {
        return {
            shouldProcessBuffer: false,
            earlyExitResult: { metadata: { partial: xmlChunk === "" }, xml: [] },
        };
    }
    if (parserContext.streamingBuffer.length > parserContext.parsingIndex ||
        (isEndOfStream &&
            (parserContext.streamingBuffer.length > 0 ||
                (parserContext.incompleteStructureState && parserContext.incompleteStructureState.partial)))) {
        signalToProcessCoreBuffer = true;
    }
    if (isEndOfStream) {
        // Snapshot of the buffer as it was when EOF arrived.
        parserContext.streamingBufferBeforeClear = parserContext.streamingBuffer;
    }
    return { shouldProcessBuffer: signalToProcessCoreBuffer, earlyExitResult: null };
}
/**
 * Removes the already-consumed prefix (`parsingIndex` characters) from
 * `streamingBuffer` and shifts every stored buffer offset by the same amount.
 * Does nothing unless `parsingIndex` is greater than zero.
 *
 * @param {object} parserContext - Mutable parser state; updated in place.
 */
function discardParsedBufferPrefix(parserContext) {
    const sliceAmount = parserContext.parsingIndex;
    if (!(sliceAmount > 0)) {
        return;
    }
    const pending = parserContext.incompleteStructureState;
    if (pending && pending.at !== undefined) {
        pending.at -= sliceAmount;
        if (pending.at < 0) {
            pending.at = 0;
        }
        if (pending.type === "stop_node_content" && pending.contentStartIndex !== undefined) {
            pending.contentStartIndex -= sliceAmount;
            if (pending.contentStartIndex < 0) {
                pending.contentStartIndex = 0;
            }
        }
    }
    const reparsed = parserContext.reparsedSegmentContext;
    if (reparsed && reparsed.originalIndex !== undefined) {
        if (reparsed.originalIndex < sliceAmount) {
            // The re-parsed segment lies inside the discarded prefix.
            parserContext.reparsedSegmentContext = null;
        }
        else {
            reparsed.originalIndex -= sliceAmount;
        }
    }
    parserContext.streamingBuffer = parserContext.streamingBuffer.substring(sliceAmount);
    parserContext.parsingIndex = 0;
}
/**
 * Decides whether a result produced for a non-EOF chunk is still partial.
 *
 * Without `allowedRootNodes` the result is complete as soon as no tag is open,
 * no incomplete structure is pending and the streaming buffer is empty.
 * With `allowedRootNodes` the result additionally requires an empty root
 * determination buffer and at least one object in the accumulator; text-only
 * output always stays partial because more text or roots may follow.
 *
 * @param {object} parserContext - Parser state (read only).
 * @returns {boolean} `true` when the result must be flagged as partial.
 */
function isPartialMidStream(parserContext) {
    const nothingPending = parserContext.tagStack.length === 0 &&
        !parserContext.incompleteStructureState &&
        parserContext.streamingBuffer.length === 0;
    if (!parserContext.allowedRootNodes) {
        return !nothingPending;
    }
    const hasParsedObject = nothingPending &&
        parserContext._rootDeterminationBuffer.length === 0 &&
        parserContext.accumulator.some((item) => typeof item === "object");
    return !hasParsedObject;
}
/**
 * Stores the text of an unfinished tag so that it is not lost at EOF.
 *
 * When `currentPointer` is a non-array object the fragment is merged into (or
 * added as) its text node; otherwise it is merged into the accumulator's last
 * string or pushed as a new entry. Text that already ends with the fragment is
 * left untouched to avoid duplicating it.
 *
 * @param {object} parserContext - Mutable parser state; updated in place.
 * @param {string} fragment - The raw text of the incomplete tag.
 */
function appendIncompleteFragmentAsText(parserContext, fragment) {
    const pointer = parserContext.currentPointer;
    if (pointer && typeof pointer === "object" && !Array.isArray(pointer)) {
        const textNodeName = parserContext.customOptions.textNodeName;
        const existingText = pointer[textNodeName];
        if (typeof existingText === "string") {
            if (!existingText.endsWith(fragment)) {
                pointer[textNodeName] += fragment;
            }
            return;
        }
        if (Array.isArray(existingText)) {
            const lastTextItemIdx = existingText.length - 1;
            if (lastTextItemIdx >= 0 && typeof existingText[lastTextItemIdx] === "string") {
                if (!existingText[lastTextItemIdx].endsWith(fragment)) {
                    existingText[lastTextItemIdx] += fragment;
                }
                return;
            }
        }
        // No string to extend: add the fragment as a new text value.
        (0, dom_builder_1.addValueToObject)(pointer, textNodeName, fragment, parserContext.customOptions);
        return;
    }
    const accumulator = parserContext.accumulator;
    const lastIdx = accumulator.length - 1;
    if (lastIdx >= 0 && typeof accumulator[lastIdx] === "string") {
        if (!accumulator[lastIdx].endsWith(fragment)) {
            accumulator[lastIdx] += fragment;
        }
    }
    else {
        accumulator.push(fragment);
    }
}
/**
 * Builds the `{ metadata: { partial }, xml }` result for the chunk that was
 * just processed and tidies up the parser context.
 *
 * For a regular chunk only the consumed buffer prefix is dropped and the
 * accumulator is returned (by reference) together with the partial flag.
 * At end-of-stream (`null` or `undefined`) the function additionally
 *  - treats a lone unfinished doctype / XML declaration / comment as an empty,
 *    complete document,
 *  - keeps the text of an unfinished tag by appending it as text,
 *  - falls back to the leftover buffer text when nothing was accumulated,
 *  - resets the streaming state when the document is complete.
 *
 * @param {object} parserContext - Mutable parser state; updated in place.
 * @param {string|object|null|undefined} xmlChunk - The chunk that was processed;
 *   `null`/`undefined` mean end-of-stream.
 * @returns {{metadata: {partial: boolean}, xml: Array}} The parse result.
 */
function finalizeStreamResult(parserContext, xmlChunk) {
    const isEndOfStream = xmlChunk === null || xmlChunk === undefined;
    discardParsedBufferPrefix(parserContext);
    let finalXmlContent = parserContext.accumulator.length > 0 ? parserContext.accumulator : [];
    if (!isEndOfStream) {
        return {
            metadata: { partial: isPartialMidStream(parserContext) },
            xml: finalXmlContent,
        };
    }
    // At EOF the result is partial while a tag is open or a structure is unfinished.
    let isReturnPartial = parserContext.tagStack.length > 0 || !!parserContext.incompleteStructureState;
    let isSpecialOnlyAtEOF = false;
    const pending = parserContext.incompleteStructureState;
    if (pending) {
        const stateType = pending.type;
        const isSpecialIncomplete = stateType === "doctype" || stateType === "xmldecl" || stateType === "comment";
        if (isSpecialIncomplete && parserContext.accumulator.length === 0 && parserContext.tagStack.length === 0) {
            const remainingBufferIsJustPartial = (parserContext.streamingBufferBeforeClear || parserContext.streamingBuffer).trim() ===
                (pending.partial || "").trim();
            if (remainingBufferIsJustPartial) {
                // Nothing but an unfinished special construct: report an empty, complete result.
                isReturnPartial = false;
                parserContext.incompleteStructureState = null;
                isSpecialOnlyAtEOF = true;
                finalXmlContent = [];
            }
        }
        else if ((stateType === "opening_tag_incomplete" ||
            stateType === "tag_start_incomplete" ||
            stateType === "closing_tag_incomplete") &&
            pending.partial &&
            pending.partial.trim().length > 0) {
            isReturnPartial = true;
            appendIncompleteFragmentAsText(parserContext, pending.partial);
            finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : [];
        }
    }
    if (!isReturnPartial) {
        // `|| ""` guards against a context whose buffers were never initialised.
        const effectiveBufferContent = parserContext.streamingBufferBeforeClear ||
            parserContext.streamingBuffer ||
            parserContext._rootDeterminationBuffer ||
            "";
        // Leftover text once declarations, comments and doctypes are stripped.
        const tempBufferForNullCheck = effectiveBufferContent
            .replace(/<\?xml[^?]*\?>/g, "")
            .replace(/<!--[\s\S]*?-->/g, "")
            .replace(/<!DOCTYPE[^>]*>/g, "")
            .trim();
        if (isSpecialOnlyAtEOF) {
            finalXmlContent = [];
        }
        else if (parserContext.accumulator.length === 0 && tempBufferForNullCheck === "") {
            finalXmlContent = [];
        }
        else if (parserContext.accumulator.length === 0 &&
            tempBufferForNullCheck !== "" &&
            !parserContext._treatAsPlainText) {
            if (parserContext.customOptions.alwaysCreateTextNode) {
                finalXmlContent = [{ [parserContext.customOptions.textNodeName]: tempBufferForNullCheck }];
            }
            else {
                finalXmlContent = [tempBufferForNullCheck];
            }
        }
        // If the accumulator has content, finalXmlContent already refers to it.
        parserContext.streamingBuffer = "";
        parserContext.parsingIndex = 0;
        parserContext._activelyStreaming = false;
        parserContext._originalBufferHadContent = false;
        parserContext.incompleteStructureState = null;
        parserContext.streamingBufferBeforeClear = "";
        parserContext._lastClearedIncompleteStateWasSpecial = isSpecialOnlyAtEOF;
        parserContext._rootDeterminationBuffer = "";
    }
    else {
        // Still partial at EOF: return a copy of the accumulator, which may have
        // just received the unfinished-tag fragment.
        finalXmlContent = parserContext.accumulator.length > 0 ? [...parserContext.accumulator] : [];
        if (finalXmlContent.length === 1 &&
            typeof finalXmlContent[0] === "string" &&
            parserContext.customOptions.alwaysCreateTextNode) {
            finalXmlContent = [{ [parserContext.customOptions.textNodeName]: finalXmlContent[0] }];
        }
        if (parserContext.incompleteStructureState) {
            parserContext.reparsedSegmentContext = null;
        }
    }
    return {
        metadata: { partial: isReturnPartial },
        xml: finalXmlContent,
    };
}
//# sourceMappingURL=stream-processor.js.map