// fast-xml-parser
// Validate XML, Parse XML, Build XML without C/C++ based libraries
// (source listing: 729 lines, 652 loc, 25.2 kB, JavaScript)
'use strict';
///@ts-check
import { getAllMatches, isExist } from '../util.js';
import xmlNode from './xmlNode.js';
import DocTypeReader from './DocTypeReader.js';
import toNumber from "strnum";
import getIgnoreAttributesFn from "../ignoreAttributes.js";
// const regx =
// '<((!\\[CDATA\\[([\\s\\S]*?)(]]>))|((NAME:)?(NAME))([^>]*)>|((\\/)(NAME)\\s*>))([^<]*)'
// .replace(/NAME/g, util.nameRegexp);
//const tagsRegx = new RegExp("<(\\/?[\\w:\\-\._]+)([^>]*)>(\\s*"+cdataRegx+")*([^<]+)?","g");
//const tagsRegx = new RegExp("<(\\/?)((\\w*:)?([\\w:\\-\._]+))([^>]*)>([^<]*)("+cdataRegx+"([^<]*))*([^<]+)?","g");
/**
 * Parser state holder: keeps the options, the entity replacement tables, the
 * node stack used while walking a document, and the stop-node lookup sets.
 * The parsing routines themselves are module-level functions attached to each
 * instance so they run with `this` bound to the parser.
 */
export default class OrderedObjParser {
  /**
   * @param {object} options parser options; this constructor reads
   *   `ignoreAttributes` and `stopNodes`, the attached routines read the rest.
   */
  constructor(options) {
    this.options = options;
    this.currentNode = null;
    this.tagsNodeStack = [];
    this.docTypeEntities = {};

    // Predefined XML entities (named, decimal and hexadecimal spellings).
    this.lastEntities = {
      "apos": { regex: /&(apos|#39|#x27);/g, val: "'" },
      "gt": { regex: /&(gt|#62|#x3E);/g, val: ">" },
      "lt": { regex: /&(lt|#60|#x3C);/g, val: "<" },
      "quot": { regex: /&(quot|#34|#x22);/g, val: "\"" },
    };
    // Kept apart from the table above; replaceEntitiesValue applies it last.
    this.ampEntity = { regex: /&(amp|#38|#x26);/g, val: "&" };

    // Extra entities applied only when options.htmlEntities is set.
    // lt / gt / amp / quot / apos are covered by lastEntities and ampEntity.
    this.htmlEntities = {
      "space": { regex: /&(nbsp|#160);/g, val: " " },
      "cent": { regex: /&(cent|#162);/g, val: "¢" },
      "pound": { regex: /&(pound|#163);/g, val: "£" },
      "yen": { regex: /&(yen|#165);/g, val: "¥" },
      "euro": { regex: /&(euro|#8364);/g, val: "€" },
      "copyright": { regex: /&(copy|#169);/g, val: "©" },
      "reg": { regex: /&(reg|#174);/g, val: "®" },
      "inr": { regex: /&(inr|#8377);/g, val: "₹" },
      "num_dec": { regex: /&#([0-9]{1,7});/g, val: (_, str) => fromCodePoint(str, 10, "&#") },
      "num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val: (_, str) => fromCodePoint(str, 16, "&#x") },
    };

    // Attach the module-level routines as instance methods.
    Object.assign(this, {
      addExternalEntities,
      parseXml,
      parseTextData,
      resolveNameSpace,
      buildAttributesMap,
      isItStopNode,
      replaceEntitiesValue,
      readStopNodeData,
      saveTextToParentTag,
      addChild,
    });

    this.ignoreAttributesFn = getIgnoreAttributesFn(this.options.ignoreAttributes);
    this.entityExpansionCount = 0;
    this.currentExpandedLength = 0;

    // Split stop-node expressions into "*.name" wildcards (stored without the
    // "*." prefix) and exact paths. Non-string entries are skipped.
    const { stopNodes } = this.options;
    if (stopNodes && stopNodes.length > 0) {
      this.stopNodesExact = new Set();
      this.stopNodesWildcard = new Set();
      for (const expression of stopNodes) {
        if (typeof expression !== 'string') continue;
        if (expression.startsWith("*.")) {
          this.stopNodesWildcard.add(expression.substring(2));
        } else {
          this.stopNodesExact.add(expression);
        }
      }
    }
  }
}
/**
 * Register caller-supplied entities in `this.lastEntities`.
 * For every own enumerable key `name` of `externalEntities`, a global regex
 * matching the literal text `&name;` is stored together with its replacement.
 *
 * @param {Object<string, *>} externalEntities entity name -> replacement value
 */
function addExternalEntities(externalEntities) {
  for (const [name, value] of Object.entries(externalEntities)) {
    // Escape each regex metacharacter with a backslash so the name is matched
    // literally. (Substituting a fixed "\." for every special character would
    // turn "a-b" into a pattern that matches "&a.b;" and never "&a-b;".)
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    this.lastEntities[name] = {
      regex: new RegExp("&" + escapedName + ";", "g"),
      val: value,
    };
  }
}
/**
 * Turn raw text found inside a tag into the value stored on the node.
 *
 * Returns `undefined` when `val` is `undefined` or is empty after optional
 * trimming. Otherwise entities are replaced (unless `escapeEntities` is set)
 * and `options.tagValueProcessor` is consulted:
 *  - `null`/`undefined` result: the text is returned as is (no type parsing);
 *  - a result different from the text: that result is returned;
 *  - the same text: it goes through `parseValue`, except when trimming is
 *    disabled and the text has leading/trailing whitespace.
 *
 * @param {string} val
 * @param {string} tagName
 * @param {string} jPath
 * @param {boolean} dontTrim skip trimming even when `options.trimValues` is on
 * @param {boolean} hasAttributes
 * @param {boolean} isLeafNode
 * @param {boolean} escapeEntities when truthy, entity replacement is skipped
 */
function parseTextData(val, tagName, jPath, dontTrim, hasAttributes, isLeafNode, escapeEntities) {
  if (val === undefined) return undefined;

  const { options } = this;
  if (options.trimValues && !dontTrim) {
    val = val.trim();
  }
  if (val.length === 0) return undefined;

  if (!escapeEntities) {
    val = this.replaceEntitiesValue(val, tagName, jPath);
  }

  const processed = options.tagValueProcessor(tagName, val, jPath, hasAttributes, isLeafNode);
  if (processed === null || processed === undefined) {
    // the processor asked to keep the text untouched
    return val;
  }
  if (processed !== val) {
    // the processor supplied its own value
    return processed;
  }

  // Untrimmed text that carries surrounding whitespace is never type-parsed.
  const eligibleForParsing = options.trimValues || val.trim() === val;
  return eligibleForParsing
    ? parseValue(val, options.parseTagValue, options.numberParseOptions)
    : val;
}
/**
 * Strip a namespace prefix from a name when `options.removeNSPrefix` is set.
 *
 * - Names whose first colon-separated part is `xmlns` resolve to `''`.
 * - Names with exactly one colon lose the prefix; a leading `/` is kept.
 * - Anything else is returned unchanged.
 *
 * @param {string} tagname
 * @returns {string}
 */
function resolveNameSpace(tagname) {
  if (!this.options.removeNSPrefix) return tagname;

  const parts = tagname.split(':');
  if (parts[0] === 'xmlns') return '';
  if (parts.length !== 2) return tagname;

  const leadingSlash = tagname.charAt(0) === '/' ? '/' : '';
  return leadingSlash + parts[1];
}
//TODO: change regex to capture NS
// Group 1 is the attribute name; group 4 is the quoted value (absent for
// value-less attributes).
const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');

/**
 * Convert the attribute part of a tag expression into an object.
 *
 * Returns `undefined` when attributes are ignored altogether
 * (`options.ignoreAttributes === true`), when `attrStr` is not a string, or
 * when no attribute survives filtering. If `options.attributesGroupName` is
 * set, the attributes are wrapped in an object under that key.
 *
 * @param {string} attrStr attribute expression of the tag
 * @param {string} jPath path of the owning tag, passed to filters/processors
 * @param {string} tagName owning tag name, forwarded to entity replacement
 * @returns {object|undefined}
 */
function buildAttributesMap(attrStr, jPath, tagName) {
  const { options } = this;
  if (options.ignoreAttributes === true || typeof attrStr !== 'string') return;

  const matches = getAllMatches(attrStr, attrsRegx);
  const attributes = {};

  for (const match of matches) {
    const attrName = this.resolveNameSpace(match[1]);
    if (this.ignoreAttributesFn(attrName, jPath)) continue;
    if (!attrName.length) continue;

    let key = options.attributeNamePrefix + attrName;
    if (options.transformAttributeName) {
      key = options.transformAttributeName(key);
    }
    // never write through the "__proto__" key
    if (key === "__proto__") key = "#__proto__";

    let rawValue = match[4];
    if (rawValue === undefined) {
      // attribute without a value
      if (options.allowBooleanAttributes) attributes[key] = true;
      continue;
    }

    if (options.trimValues) {
      rawValue = rawValue.trim();
    }
    rawValue = this.replaceEntitiesValue(rawValue, tagName, jPath);

    const processed = options.attributeValueProcessor(attrName, rawValue, jPath);
    if (processed === null || processed === undefined) {
      // keep the text untouched
      attributes[key] = rawValue;
    } else if (processed !== rawValue) {
      // processor supplied its own value
      attributes[key] = processed;
    } else {
      attributes[key] = parseValue(rawValue, options.parseAttributeValue, options.numberParseOptions);
    }
  }

  if (Object.keys(attributes).length === 0) return;

  if (options.attributesGroupName) {
    const grouped = {};
    grouped[options.attributesGroupName] = attributes;
    return grouped;
  }
  return attributes;
}
/**
 * Walk the XML text character by character and build the ordered node tree.
 *
 * Nesting is tracked with `this.tagsNodeStack` (no recursion) and with `jPath`,
 * the dot-separated path of the tag currently being processed. Text between
 * tags is accumulated in `textData` and flushed into the current node when the
 * next markup construct is met.
 *
 * @param {string} xmlData XML document text
 * @returns the `child` collection of the synthetic `!xml` root node
 * @throws {Error} on unclosed closing tags / PI tags / comments / CDATA, on a
 *   closing tag for an unpaired tag, on a reserved tag name when
 *   `strictReservedNames` is set, on an unterminated stop node, and when the
 *   nesting depth exceeds `options.maxNestedTags`
 */
const parseXml = function (xmlData) {
  xmlData = xmlData.replace(/\r\n?/g, "\n"); //TODO: remove this line
  const xmlObj = new xmlNode('!xml');
  let currentNode = xmlObj;
  let textData = "";
  let jPath = "";

  // Reset entity expansion counters for this document
  this.entityExpansionCount = 0;
  this.currentExpandedLength = 0;

  const docTypeReader = new DocTypeReader(this.options.processEntities);
  for (let i = 0; i < xmlData.length; i++) {//for each char in XML data
    const ch = xmlData[i];
    if (ch === '<') {
      if (xmlData[i + 1] === '/') {//Closing Tag
        const closeIndex = findClosingIndex(xmlData, ">", i, "Closing Tag is not closed.")
        let tagName = xmlData.substring(i + 2, closeIndex).trim();

        if (this.options.removeNSPrefix) {
          const colonIndex = tagName.indexOf(":");
          if (colonIndex !== -1) {
            tagName = tagName.substr(colonIndex + 1);
          }
        }

        if (this.options.transformTagName) {
          tagName = this.options.transformTagName(tagName);
        }

        if (currentNode) {
          textData = this.saveTextToParentTag(textData, currentNode, jPath);
        }

        //check if last tag of nested tag was unpaired tag
        const lastTagName = jPath.substring(jPath.lastIndexOf(".") + 1);
        if (tagName && this.options.unpairedTags.indexOf(tagName) !== -1) {
          throw new Error(`Unpaired tag can not be used as closing tag: </${tagName}>`);
        }
        let propIndex = 0
        if (lastTagName && this.options.unpairedTags.indexOf(lastTagName) !== -1) {
          // drop two path segments: the unpaired tag and the tag being closed
          propIndex = jPath.lastIndexOf('.', jPath.lastIndexOf('.') - 1)
          this.tagsNodeStack.pop();
        } else {
          propIndex = jPath.lastIndexOf(".");
        }
        jPath = jPath.substring(0, propIndex);

        currentNode = this.tagsNodeStack.pop();//avoid recursion, set the parent tag scope
        textData = "";
        i = closeIndex;
      } else if (xmlData[i + 1] === '?') {
        // processing instruction / XML declaration
        let tagData = readTagExp(xmlData, i, false, "?>");
        if (!tagData) throw new Error("Pi Tag is not closed.");

        textData = this.saveTextToParentTag(textData, currentNode, jPath);
        if ((this.options.ignoreDeclaration && tagData.tagName === "?xml") || this.options.ignorePiTags) {
          //do nothing
        } else {
          const childNode = new xmlNode(tagData.tagName);
          childNode.add(this.options.textNodeName, "");

          if (tagData.tagName !== tagData.tagExp && tagData.attrExpPresent) {
            childNode[":@"] = this.buildAttributesMap(tagData.tagExp, jPath, tagData.tagName);
          }
          this.addChild(currentNode, childNode, jPath, i);
        }

        // closeIndex points at '?'; step onto the following '>'
        i = tagData.closeIndex + 1;
      } else if (xmlData.substr(i + 1, 3) === '!--') {
        const endIndex = findClosingIndex(xmlData, "-->", i + 4, "Comment is not closed.")
        if (this.options.commentPropName) {
          // endIndex is the '>' of "-->"; the comment text stops before "--"
          const comment = xmlData.substring(i + 4, endIndex - 2);

          textData = this.saveTextToParentTag(textData, currentNode, jPath);

          currentNode.add(this.options.commentPropName, [{ [this.options.textNodeName]: comment }]);
        }
        i = endIndex;
      } else if (xmlData.substr(i + 1, 2) === '!D') {
        const result = docTypeReader.readDocType(xmlData, i);
        this.docTypeEntities = result.entities;
        i = result.i;
      } else if (xmlData.substr(i + 1, 2) === '![') {
        // closeIndex is the first ']' of the terminating "]]>"
        const closeIndex = findClosingIndex(xmlData, "]]>", i, "CDATA is not closed.") - 2;
        // i + 9 skips the 9 characters of "<![CDATA["
        const tagExp = xmlData.substring(i + 9, closeIndex);

        textData = this.saveTextToParentTag(textData, currentNode, jPath);

        let val = this.parseTextData(tagExp, currentNode.tagname, jPath, true, false, true, true);
        if (val == undefined) val = "";

        //cdata should be set even if it is 0 length string
        if (this.options.cdataPropName) {
          currentNode.add(this.options.cdataPropName, [{ [this.options.textNodeName]: tagExp }]);
        } else {
          currentNode.add(this.options.textNodeName, val);
        }

        i = closeIndex + 2;
      } else {//Opening tag
        let result = readTagExp(xmlData, i, this.options.removeNSPrefix);
        let tagName = result.tagName;
        const rawTagName = result.rawTagName;
        let tagExp = result.tagExp;
        let attrExpPresent = result.attrExpPresent;
        let closeIndex = result.closeIndex;

        if (this.options.transformTagName) {
          const newTagName = this.options.transformTagName(tagName);
          if (tagExp === tagName) {
            tagExp = newTagName
          }
          tagName = newTagName;
        }

        if (this.options.strictReservedNames &&
          (tagName === this.options.commentPropName
            || tagName === this.options.cdataPropName
          )) {
          throw new Error(`Invalid tag name: ${tagName}`);
        }

        //save text as child node
        if (currentNode && textData) {
          if (currentNode.tagname !== '!xml') {
            //when nested tag is found
            textData = this.saveTextToParentTag(textData, currentNode, jPath, false);
          }
        }

        //check if last tag was unpaired tag
        const lastTag = currentNode;
        if (lastTag && this.options.unpairedTags.indexOf(lastTag.tagname) !== -1) {
          currentNode = this.tagsNodeStack.pop();
          jPath = jPath.substring(0, jPath.lastIndexOf("."));
        }
        if (tagName !== xmlObj.tagname) {
          jPath += jPath ? "." + tagName : tagName;
        }
        const startIndex = i;
        if (this.isItStopNode(this.stopNodesExact, this.stopNodesWildcard, jPath, tagName)) {
          let tagContent = "";
          //self-closing tag
          if (tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1) {
            if (tagName[tagName.length - 1] === "/") { //remove trailing '/'
              tagName = tagName.substr(0, tagName.length - 1);
              jPath = jPath.substr(0, jPath.length - 1);
              tagExp = tagName;
            } else {
              tagExp = tagExp.substr(0, tagExp.length - 1);
            }
            i = result.closeIndex;
          }
          //unpaired tag
          else if (this.options.unpairedTags.indexOf(tagName) !== -1) {
            i = result.closeIndex;
          }
          //normal tag
          else {
            //read until closing tag is found
            const stopNodeData = this.readStopNodeData(xmlData, rawTagName, closeIndex + 1);
            if (!stopNodeData) throw new Error(`Unexpected end of ${rawTagName}`);
            i = stopNodeData.i;
            tagContent = stopNodeData.tagContent;
          }

          const childNode = new xmlNode(tagName);

          if (tagName !== tagExp && attrExpPresent) {
            childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
          }
          if (tagContent) {
            tagContent = this.parseTextData(tagContent, tagName, jPath, true, attrExpPresent, true, true);
          }

          jPath = jPath.substr(0, jPath.lastIndexOf("."));
          childNode.add(this.options.textNodeName, tagContent);

          this.addChild(currentNode, childNode, jPath, startIndex);
        } else {
          //selfClosing tag
          if (tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1) {
            if (tagName[tagName.length - 1] === "/") { //remove trailing '/'
              tagName = tagName.substr(0, tagName.length - 1);
              jPath = jPath.substr(0, jPath.length - 1);
              tagExp = tagName;
            } else {
              tagExp = tagExp.substr(0, tagExp.length - 1);
            }

            if (this.options.transformTagName) {
              const newTagName = this.options.transformTagName(tagName);
              if (tagExp === tagName) {
                tagExp = newTagName
              }
              tagName = newTagName;
            }

            const childNode = new xmlNode(tagName);
            if (tagName !== tagExp && attrExpPresent) {
              childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
            }
            this.addChild(currentNode, childNode, jPath, startIndex);
            jPath = jPath.substr(0, jPath.lastIndexOf("."));
          }
          else if (this.options.unpairedTags.indexOf(tagName) !== -1) {//unpaired tag
            const childNode = new xmlNode(tagName);
            if (tagName !== tagExp && attrExpPresent) {
              // tagName is forwarded like at every other call site so that
              // tag-based entity filtering also applies to these attributes
              childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
            }
            this.addChild(currentNode, childNode, jPath, startIndex);
            jPath = jPath.substr(0, jPath.lastIndexOf("."));
            i = result.closeIndex;
            // Continue to next iteration without changing currentNode
            continue;
          }
          //opening tag
          else {
            const childNode = new xmlNode(tagName);
            if (this.tagsNodeStack.length > this.options.maxNestedTags) {
              throw new Error("Maximum nested tags exceeded");
            }
            this.tagsNodeStack.push(currentNode);

            if (tagName !== tagExp && attrExpPresent) {
              childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
            }
            this.addChild(currentNode, childNode, jPath, startIndex);
            currentNode = childNode;
          }
          textData = "";
          i = closeIndex;
        }
      }
    } else {
      textData += xmlData[i];
    }
  }
  return xmlObj.child;
}
/**
 * Attach `childNode` to `currentNode`, letting `options.updateTag` veto or
 * rename it first.
 *
 * - `updateTag` returning `false` drops the child;
 * - returning a string renames the child to that string before attaching;
 * - any other result attaches the child unchanged.
 *
 * `startIndex` is forwarded only when `options.captureMetaData` is truthy;
 * otherwise `undefined` is passed.
 */
function addChild(currentNode, childNode, jPath, startIndex) {
  const { options } = this;
  const position = options.captureMetaData ? startIndex : undefined;

  const verdict = options.updateTag(childNode.tagname, jPath, childNode[":@"]);
  if (verdict === false) return;

  if (typeof verdict === "string") {
    childNode.tagname = verdict;
  }
  currentNode.addChild(childNode, position);
}
/**
 * Replace entity references in `val`.
 *
 * The value is returned untouched when it holds no `&`, when
 * `options.processEntities.enabled` is falsy, when `allowedTags` is set and
 * does not include `tagName`, or when `tagFilter` is set and rejects the tag.
 *
 * Replacement order: DOCTYPE entities (counted against `maxTotalExpansions`
 * and `maxExpandedLength`), then `this.lastEntities`, then `this.htmlEntities`
 * if `options.htmlEntities` is set, and finally the ampersand entity.
 *
 * @param {string} val
 * @param {string} tagName
 * @param {string} jPath
 * @returns {string}
 * @throws {Error} when a configured expansion limit is exceeded
 */
const replaceEntitiesValue = function (val, tagName, jPath) {
  // Without an ampersand there is nothing to replace
  if (val.indexOf('&') === -1) return val;

  const entityConfig = this.options.processEntities;
  if (!entityConfig.enabled) return val;
  // tag-specific filtering
  if (entityConfig.allowedTags && !entityConfig.allowedTags.includes(tagName)) return val;
  if (entityConfig.tagFilter && !entityConfig.tagFilter(tagName, jPath)) return val;

  // Runs every { regex, val } entry of a table over the text
  const applyTable = (text, table) => {
    for (const key in table) {
      text = text.replace(table[key].regex, table[key].val);
    }
    return text;
  };

  // DOCTYPE entities, with expansion accounting
  for (const name in this.docTypeEntities) {
    const entity = this.docTypeEntities[name];
    const hits = val.match(entity.regx);
    if (!hits) continue;

    this.entityExpansionCount += hits.length;
    if (entityConfig.maxTotalExpansions &&
      this.entityExpansionCount > entityConfig.maxTotalExpansions) {
      throw new Error(
        `Entity expansion limit exceeded: ${this.entityExpansionCount} > ${entityConfig.maxTotalExpansions}`
      );
    }

    const expanded = val.replace(entity.regx, entity.val);
    if (entityConfig.maxExpandedLength) {
      // growth (or shrinkage) caused by this replacement
      this.currentExpandedLength += (expanded.length - val.length);
      if (this.currentExpandedLength > entityConfig.maxExpandedLength) {
        throw new Error(
          `Total expanded content size exceeded: ${this.currentExpandedLength} > ${entityConfig.maxExpandedLength}`
        );
      }
    }
    val = expanded;
  }
  if (val.indexOf('&') === -1) return val;

  // standard entities
  val = applyTable(val, this.lastEntities);
  if (val.indexOf('&') === -1) return val;

  // HTML entities, only if enabled
  if (this.options.htmlEntities) {
    val = applyTable(val, this.htmlEntities);
  }

  // the ampersand entity goes last
  return val.replace(this.ampEntity.regex, this.ampEntity.val);
}
/**
 * Flush collected text into `parentNode` as a text child.
 *
 * Falsy `textData` is returned unchanged. Otherwise the text is passed through
 * `this.parseTextData`; a result other than `undefined` or `""` is added under
 * `options.textNodeName`, and `""` is returned so the caller can reset its
 * buffer.
 *
 * @param {string} textData text gathered since the last markup construct
 * @param parentNode node receiving the text
 * @param {string} jPath path of `parentNode`
 * @param {boolean} [isLeafNode] defaults to "parentNode has no children yet"
 */
function saveTextToParentTag(textData, parentNode, jPath, isLeafNode) {
  if (!textData) return textData;

  const leaf = isLeafNode === undefined ? parentNode.child.length === 0 : isLeafNode;
  const attributes = parentNode[":@"];
  const hasAttributes = attributes ? Object.keys(attributes).length !== 0 : false;

  const parsed = this.parseTextData(textData, parentNode.tagname, jPath, false, hasAttributes, leaf);
  if (parsed !== undefined && parsed !== "") {
    parentNode.add(this.options.textNodeName, parsed);
  }
  return "";
}
//TODO: use jPath to simplify the logic
/**
 * Tell whether the current tag is configured as a stop node: either its name
 * is in the wildcard set or its full path is in the exact set. A missing set
 * never matches.
 *
 * @param {Set} stopNodesExact full paths
 * @param {Set} stopNodesWildcard bare tag names
 * @param {string} jPath
 * @param {string} currentTagName
 * @returns {boolean}
 */
function isItStopNode(stopNodesExact, stopNodesWildcard, jPath, currentTagName) {
  const matchesByName = stopNodesWildcard && stopNodesWildcard.has(currentTagName);
  const matchesByPath = !matchesByName && stopNodesExact && stopNodesExact.has(jPath);
  return Boolean(matchesByName || matchesByPath);
}
/**
 * Collect a tag expression starting at `i` and locate where the tag closes.
 * Closing characters inside single- or double-quoted sections are ignored, and
 * tabs outside quotes are turned into spaces.
 *
 * @param {string} xmlData
 * @param {number} i starting index
 * @param {string} [closingChar=">"] one- or two-character terminator
 * @returns {{data: string, index: number}|undefined} the collected expression
 *   and the index of the terminator's first character, or `undefined` when the
 *   terminator is never found
 */
function tagExpWithClosingIndex(xmlData, i, closingChar = ">") {
  const firstClose = closingChar[0];
  const secondClose = closingChar[1];
  let openQuote = "";
  let collected = "";

  for (let pos = i; pos < xmlData.length; pos++) {
    const ch = xmlData[pos];

    if (openQuote) {
      // inside a quoted value: only the matching quote ends it
      if (ch === openQuote) openQuote = "";
      collected += ch;
      continue;
    }

    if (ch === '"' || ch === "'") {
      openQuote = ch;
      collected += ch;
      continue;
    }

    if (ch === firstClose) {
      const terminated = !secondClose || xmlData[pos + 1] === secondClose;
      if (terminated) {
        return { data: collected, index: pos };
      }
      // first terminator char without its partner is ordinary content
      collected += ch;
      continue;
    }

    collected += ch === '\t' ? " " : ch;
  }
}
/**
 * Find `str` in `xmlData` at or after index `i`.
 *
 * @param {string} xmlData
 * @param {string} str terminator to look for
 * @param {number} i index to start searching from
 * @param {string} errMsg message of the error thrown when `str` is absent
 * @returns {number} index of the LAST character of the found terminator
 * @throws {Error} with `errMsg` when the terminator is not found
 */
function findClosingIndex(xmlData, str, i, errMsg) {
  const foundAt = xmlData.indexOf(str, i);
  if (foundAt === -1) {
    throw new Error(errMsg);
  }
  return foundAt + str.length - 1;
}
/**
 * Read the tag that starts at `xmlData[i]` (the `<`) and split it into its
 * name and the remaining expression.
 *
 * @param {string} xmlData
 * @param {number} i index of the opening `<`
 * @param {boolean} removeNSPrefix strip everything up to the first `:` from
 *   the tag name
 * @param {string} [closingChar=">"] terminator handed to tagExpWithClosingIndex
 * @returns {{tagName: string, tagExp: string, closeIndex: number,
 *   attrExpPresent: boolean, rawTagName: string}|undefined} `undefined` when
 *   the tag is never closed. `tagExp` is the text after the first whitespace
 *   (left-trimmed), or the whole expression when there is no whitespace.
 *   `rawTagName` is the name before any prefix stripping.
 */
function readTagExp(xmlData, i, removeNSPrefix, closingChar = ">") {
  const scanned = tagExpWithClosingIndex(xmlData, i + 1, closingChar);
  if (!scanned) return;

  const wholeExp = scanned.data;
  const firstSpace = wholeExp.search(/\s/);
  const hasSeparator = firstSpace !== -1;

  // separate tag name and attributes expression
  const rawTagName = hasSeparator ? wholeExp.substring(0, firstSpace) : wholeExp;
  const tagExp = hasSeparator ? wholeExp.substring(firstSpace + 1).trimStart() : wholeExp;

  let tagName = rawTagName;
  let attrExpPresent = true;
  if (removeNSPrefix) {
    const colonAt = rawTagName.indexOf(":");
    if (colonAt !== -1) {
      tagName = rawTagName.slice(colonAt + 1);
      // nothing but the name followed the prefix => no attribute expression
      attrExpPresent = tagName !== wholeExp.slice(colonAt + 1);
    }
  }

  return {
    tagName,
    tagExp,
    closeIndex: scanned.index,
    attrExpPresent,
    rawTagName,
  };
}
/**
 * Find the closing tag paired with an already-opened stop node and return the
 * raw text in between. Nested tags with the same name are counted so that the
 * matching close tag is used; processing instructions, comments and CDATA
 * sections are skipped over.
 *
 * @param {string} xmlData
 * @param {string} tagName stop node name exactly as written in the document
 *   (including any namespace prefix)
 * @param {number} i index just after the stop node's opening tag
 * @returns {{tagContent: string, i: number}|undefined} the raw content and the
 *   index of the `>` ending the paired close tag; `undefined` when the data
 *   ends before the stop node is closed
 * @throws {Error} when a close tag, PI, comment or CDATA inside is unterminated
 */
function readStopNodeData(xmlData, tagName, i) {
  const startIndex = i;
  // Starting at 1 since we already have an open tag
  let openTagCount = 1;

  for (; i < xmlData.length; i++) {
    if (xmlData[i] !== "<") continue;

    if (xmlData[i + 1] === "/") {//close tag
      const closeIndex = findClosingIndex(xmlData, ">", i, `${tagName} is not closed`);
      const closeTagName = xmlData.substring(i + 2, closeIndex).trim();
      if (closeTagName === tagName) {
        openTagCount--;
        if (openTagCount === 0) {
          return {
            tagContent: xmlData.substring(startIndex, i),
            i: closeIndex
          }
        }
      }
      i = closeIndex;
    } else if (xmlData[i + 1] === '?') {
      i = findClosingIndex(xmlData, "?>", i + 1, "StopNode is not closed.");
    } else if (xmlData.substr(i + 1, 3) === '!--') {
      i = findClosingIndex(xmlData, "-->", i + 3, "StopNode is not closed.");
    } else if (xmlData.substr(i + 1, 2) === '![') {
      i = findClosingIndex(xmlData, "]]>", i, "StopNode is not closed.") - 2;
    } else {
      // The third argument is removeNSPrefix and must be falsy here: tagName
      // and close-tag names are compared in raw form, so a nested open tag
      // must keep its prefix to be counted. (Passing '>' in that position
      // stripped prefixes and left nested prefixed tags uncounted.)
      const tagData = readTagExp(xmlData, i, false, '>');
      if (tagData) {
        const isSelfClosing = tagData.tagExp[tagData.tagExp.length - 1] === "/";
        if (tagData.tagName === tagName && !isSelfClosing) {
          openTagCount++;
        }
        i = tagData.closeIndex;
      }
    }
  }//end for loop
}
/**
 * Convert a text value to a boolean or number when parsing is requested.
 *
 * With `shouldParse` truthy and a string `val`: the literals `true` / `false`
 * (ignoring surrounding whitespace) become booleans, anything else is handed
 * to `toNumber`. In every other case the value is returned as is when
 * `isExist(val)` holds, otherwise `''` is returned.
 *
 * @param {*} val
 * @param {boolean} shouldParse
 * @param {object} options forwarded to `toNumber`
 */
function parseValue(val, shouldParse, options) {
  if (!shouldParse || typeof val !== 'string') {
    return isExist(val) ? val : '';
  }

  switch (val.trim()) {
    case 'true':
      return true;
    case 'false':
      return false;
    default:
      return toNumber(val, options);
  }
}
/**
 * Resolve a numeric character reference.
 *
 * @param {string} str digits captured from the reference
 * @param {number} base radix of `str` (10 or 16 at the call sites)
 * @param {string} prefix text used to rebuild the reference when it is invalid
 * @returns {string} the character for the code point, or `prefix + str + ";"`
 *   when the number is not within 0..0x10FFFF
 */
function fromCodePoint(str, base, prefix) {
  const MAX_CODE_POINT = 0x10FFFF;
  const parsed = Number.parseInt(str, base);
  const inRange = parsed >= 0 && parsed <= MAX_CODE_POINT; // false for NaN as well
  if (!inRange) {
    return prefix + str + ";";
  }
  return String.fromCodePoint(parsed);
}