UNPKG

fast-xml-parser

Version:

Validate XML, Parse XML, Build XML without C/C++ based libraries

729 lines (652 loc) 25.2 kB
'use strict';
///@ts-check

import { getAllMatches, isExist } from '../util.js';
import xmlNode from './xmlNode.js';
import DocTypeReader from './DocTypeReader.js';
import toNumber from "strnum";
import getIgnoreAttributesFn from "../ignoreAttributes.js";

/**
 * Streaming-style XML parser that walks the input character by character and
 * builds an ordered tree of `xmlNode` objects.
 *
 * The parsing routines are plain functions defined further down in this module
 * and attached to each instance in the constructor, so they all run with
 * `this` bound to the parser instance.
 */
export default class OrderedObjParser {
  /**
   * @param {object} options parser options; this module reads (among others)
   *   `stopNodes`, `ignoreAttributes`, `unpairedTags`, `processEntities`,
   *   `htmlEntities`, `trimValues` and the various `*Processor` callbacks.
   */
  constructor(options) {
    this.options = options;
    this.currentNode = null;
    this.tagsNodeStack = [];
    this.docTypeEntities = {};
    // Predefined XML entities; external entities are added to this map too.
    this.lastEntities = {
      "apos": { regex: /&(apos|#39|#x27);/g, val: "'" },
      "gt": { regex: /&(gt|#62|#x3E);/g, val: ">" },
      "lt": { regex: /&(lt|#60|#x3C);/g, val: "<" },
      "quot": { regex: /&(quot|#34|#x22);/g, val: "\"" },
    };
    // `&amp;` is kept separate so it can be replaced last (see replaceEntitiesValue).
    this.ampEntity = { regex: /&(amp|#38|#x26);/g, val: "&" };
    // Only applied when `options.htmlEntities` is truthy.
    this.htmlEntities = {
      "space": { regex: /&(nbsp|#160);/g, val: " " },
      "cent": { regex: /&(cent|#162);/g, val: "¢" },
      "pound": { regex: /&(pound|#163);/g, val: "£" },
      "yen": { regex: /&(yen|#165);/g, val: "¥" },
      "euro": { regex: /&(euro|#8364);/g, val: "€" },
      "copyright": { regex: /&(copy|#169);/g, val: "©" },
      "reg": { regex: /&(reg|#174);/g, val: "®" },
      "inr": { regex: /&(inr|#8377);/g, val: "₹" },
      "num_dec": { regex: /&#([0-9]{1,7});/g, val: (_, str) => fromCodePoint(str, 10, "&#") },
      "num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val: (_, str) => fromCodePoint(str, 16, "&#x") },
    };
    this.addExternalEntities = addExternalEntities;
    this.parseXml = parseXml;
    this.parseTextData = parseTextData;
    this.resolveNameSpace = resolveNameSpace;
    this.buildAttributesMap = buildAttributesMap;
    this.isItStopNode = isItStopNode;
    this.replaceEntitiesValue = replaceEntitiesValue;
    this.readStopNodeData = readStopNodeData;
    this.saveTextToParentTag = saveTextToParentTag;
    this.addChild = addChild;
    this.ignoreAttributesFn = getIgnoreAttributesFn(this.options.ignoreAttributes)
    this.entityExpansionCount = 0;
    this.currentExpandedLength = 0;

    // Split stop-node expressions into exact jPaths and "*.tag" wildcards so
    // that isItStopNode can answer with two Set lookups.
    if (this.options.stopNodes && this.options.stopNodes.length > 0) {
      this.stopNodesExact = new Set();
      this.stopNodesWildcard = new Set();
      for (let i = 0; i < this.options.stopNodes.length; i++) {
        const stopNodeExp = this.options.stopNodes[i];
        if (typeof stopNodeExp !== 'string') continue;
        if (stopNodeExp.startsWith("*.")) {
          this.stopNodesWildcard.add(stopNodeExp.substring(2));
        } else {
          this.stopNodesExact.add(stopNodeExp);
        }
      }
    }
  }
}

/**
 * Registers caller-supplied entities so that `&name;` is replaced by the
 * given value together with the predefined XML entities.
 *
 * @param {Object<string, string>} externalEntities map of entity name to replacement value
 */
function addExternalEntities(externalEntities) {
  const entKeys = Object.keys(externalEntities);
  for (let i = 0; i < entKeys.length; i++) {
    const ent = entKeys[i];
    // Escape each regex metacharacter with itself ("\\$&") so the entity name
    // is matched literally; '-' and ':' need no escaping outside a character class.
    const escaped = ent.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    this.lastEntities[ent] = {
      regex: new RegExp("&" + escaped + ";", "g"),
      val: externalEntities[ent]
    }
  }
}

/**
 * Post-processes the text of a tag: optional trimming, entity replacement,
 * the user `tagValueProcessor`, and finally value parsing.
 *
 * Returns `undefined` when `val` is `undefined` or empty after trimming.
 *
 * @param {string} val raw text
 * @param {string} tagName name of the tag owning the text
 * @param {string} jPath dot-separated path of the owning tag
 * @param {boolean} dontTrim when truthy, `options.trimValues` is not applied
 * @param {boolean} hasAttributes forwarded to `tagValueProcessor`
 * @param {boolean} isLeafNode forwarded to `tagValueProcessor`
 * @param {boolean} escapeEntities when truthy, entity replacement is skipped
 */
function parseTextData(val, tagName, jPath, dontTrim, hasAttributes, isLeafNode, escapeEntities) {
  if (val !== undefined) {
    if (this.options.trimValues && !dontTrim) {
      val = val.trim();
    }
    if (val.length > 0) {
      if (!escapeEntities) val = this.replaceEntitiesValue(val, tagName, jPath);

      const newval = this.options.tagValueProcessor(tagName, val, jPath, hasAttributes, isLeafNode);
      if (newval === null || newval === undefined) {
        //don't parse
        return val;
      } else if (typeof newval !== typeof val || newval !== val) {
        //overwrite
        return newval;
      } else if (this.options.trimValues) {
        return parseValue(val, this.options.parseTagValue, this.options.numberParseOptions);
      } else {
        // Without trimValues, only parse values that carry no surrounding whitespace.
        const trimmedVal = val.trim();
        if (trimmedVal === val) {
          return parseValue(val, this.options.parseTagValue, this.options.numberParseOptions);
        } else {
          return val;
        }
      }
    }
  }
}

/**
 * Strips a namespace prefix from a tag or attribute name when
 * `options.removeNSPrefix` is set. Names whose prefix is `xmlns` resolve to
 * the empty string.
 *
 * @param {string} tagname
 * @returns {string}
 */
function resolveNameSpace(tagname) {
  if (this.options.removeNSPrefix) {
    const tags = tagname.split(':');
    const prefix = tagname.charAt(0) === '/' ? '/' : '';
    if (tags[0] === 'xmlns') {
      return '';
    }
    if (tags.length === 2) {
      tagname = prefix + tags[1];
    }
  }
  return tagname;
}

//TODO: change regex to capture NS
// Group 1: attribute name, group 3: quote character, group 4: attribute value.
const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm');

/**
 * Parses an attribute expression into an object of attribute name to value.
 *
 * @param {string} attrStr the attribute part of a tag expression
 * @param {string} jPath dot-separated path of the owning tag
 * @param {string} tagName name of the owning tag (used for entity filtering)
 * @returns {object|undefined} the attributes (wrapped under
 *   `options.attributesGroupName` when set), or `undefined` when attributes
 *   are ignored or none were collected
 */
function buildAttributesMap(attrStr, jPath, tagName) {
  if (this.options.ignoreAttributes !== true && typeof attrStr === 'string') {
    const matches = getAllMatches(attrStr, attrsRegx);
    const len = matches.length; //don't make it inline
    const attrs = {};
    for (let i = 0; i < len; i++) {
      const attrName = this.resolveNameSpace(matches[i][1]);
      if (this.ignoreAttributesFn(attrName, jPath)) {
        continue
      }
      let oldVal = matches[i][4];
      let aName = this.options.attributeNamePrefix + attrName;
      if (attrName.length) {
        if (this.options.transformAttributeName) {
          aName = this.options.transformAttributeName(aName);
        }
        // Never write to "__proto__" on the result object.
        if (aName === "__proto__") aName = "#__proto__";
        if (oldVal !== undefined) {
          if (this.options.trimValues) {
            oldVal = oldVal.trim();
          }
          oldVal = this.replaceEntitiesValue(oldVal, tagName, jPath);
          const newVal = this.options.attributeValueProcessor(attrName, oldVal, jPath);
          if (newVal === null || newVal === undefined) {
            //don't parse
            attrs[aName] = oldVal;
          } else if (typeof newVal !== typeof oldVal || newVal !== oldVal) {
            //overwrite
            attrs[aName] = newVal;
          } else {
            //parse
            attrs[aName] = parseValue(
              oldVal,
              this.options.parseAttributeValue,
              this.options.numberParseOptions
            );
          }
        } else if (this.options.allowBooleanAttributes) {
          attrs[aName] = true;
        }
      }
    }
    if (!Object.keys(attrs).length) {
      return;
    }
    if (this.options.attributesGroupName) {
      const attrCollection = {};
      attrCollection[this.options.attributesGroupName] = attrs;
      return attrCollection;
    }
    return attrs
  }
}

/**
 * Parses an XML string into an ordered node tree.
 *
 * @param {string} xmlData
 * @returns {Array} the children of the synthetic `!xml` root node
 * @throws {Error} when a tag, comment, CDATA section or processing
 *   instruction is not closed, when an unpaired tag is used as a closing tag,
 *   when a reserved name is used as a tag name under `strictReservedNames`,
 *   or when `maxNestedTags` is exceeded
 */
const parseXml = function (xmlData) {
  xmlData = xmlData.replace(/\r\n?/g, "\n"); //TODO: remove this line
  const xmlObj = new xmlNode('!xml');
  let currentNode = xmlObj;
  let textData = "";
  let jPath = "";

  // Reset entity expansion counters for this document
  this.entityExpansionCount = 0;
  this.currentExpandedLength = 0;

  const docTypeReader = new DocTypeReader(this.options.processEntities);
  for (let i = 0; i < xmlData.length; i++) {//for each char in XML data
    const ch = xmlData[i];
    if (ch === '<') {
      if (xmlData[i + 1] === '/') {//Closing Tag
        const closeIndex = findClosingIndex(xmlData, ">", i, "Closing Tag is not closed.")
        let tagName = xmlData.substring(i + 2, closeIndex).trim();

        if (this.options.removeNSPrefix) {
          const colonIndex = tagName.indexOf(":");
          if (colonIndex !== -1) {
            tagName = tagName.substring(colonIndex + 1);
          }
        }

        if (this.options.transformTagName) {
          tagName = this.options.transformTagName(tagName);
        }

        if (currentNode) {
          textData = this.saveTextToParentTag(textData, currentNode, jPath);
        }

        //check if last tag of nested tag was unpaired tag
        const lastTagName = jPath.substring(jPath.lastIndexOf(".") + 1);
        if (tagName && this.options.unpairedTags.includes(tagName)) {
          throw new Error(`Unpaired tag can not be used as closing tag: </${tagName}>`);
        }
        let propIndex = 0
        if (lastTagName && this.options.unpairedTags.includes(lastTagName)) {
          // Drop two path segments: the dangling unpaired tag and the tag being closed.
          propIndex = jPath.lastIndexOf('.', jPath.lastIndexOf('.') - 1)
          this.tagsNodeStack.pop();
        } else {
          propIndex = jPath.lastIndexOf(".");
        }
        jPath = jPath.substring(0, propIndex);

        currentNode = this.tagsNodeStack.pop();//avoid recursion, set the parent tag scope
        textData = "";
        i = closeIndex;
      } else if (xmlData[i + 1] === '?') {
        //Processing instruction / XML declaration
        let tagData = readTagExp(xmlData, i, false, "?>");
        if (!tagData) throw new Error("Pi Tag is not closed.");

        textData = this.saveTextToParentTag(textData, currentNode, jPath);
        if ((this.options.ignoreDeclaration && tagData.tagName === "?xml") || this.options.ignorePiTags) {
          //do nothing
        } else {
          const childNode = new xmlNode(tagData.tagName);
          childNode.add(this.options.textNodeName, "");

          if (tagData.tagName !== tagData.tagExp && tagData.attrExpPresent) {
            childNode[":@"] = this.buildAttributesMap(tagData.tagExp, jPath, tagData.tagName);
          }
          this.addChild(currentNode, childNode, jPath, i);
        }

        // closeIndex points at '?', so skip the following '>' as well.
        i = tagData.closeIndex + 1;
      } else if (xmlData.startsWith('!--', i + 1)) {
        //Comment
        const endIndex = findClosingIndex(xmlData, "-->", i + 4, "Comment is not closed.")
        if (this.options.commentPropName) {
          const comment = xmlData.substring(i + 4, endIndex - 2);

          textData = this.saveTextToParentTag(textData, currentNode, jPath);

          currentNode.add(this.options.commentPropName, [{ [this.options.textNodeName]: comment }]);
        }
        i = endIndex;
      } else if (xmlData.startsWith('!D', i + 1)) {
        //DOCTYPE
        const result = docTypeReader.readDocType(xmlData, i);
        this.docTypeEntities = result.entities;
        i = result.i;
      } else if (xmlData.startsWith('![', i + 1)) {
        //CDATA; closeIndex is the index of the first ']' of "]]>"
        const closeIndex = findClosingIndex(xmlData, "]]>", i, "CDATA is not closed.") - 2;
        const tagExp = xmlData.substring(i + 9, closeIndex);

        textData = this.saveTextToParentTag(textData, currentNode, jPath);

        let val = this.parseTextData(tagExp, currentNode.tagname, jPath, true, false, true, true);
        if (val == undefined) val = "";

        //cdata should be set even if it is 0 length string
        if (this.options.cdataPropName) {
          currentNode.add(this.options.cdataPropName, [{ [this.options.textNodeName]: tagExp }]);
        } else {
          currentNode.add(this.options.textNodeName, val);
        }

        i = closeIndex + 2;
      } else {//Opening tag
        let result = readTagExp(xmlData, i, this.options.removeNSPrefix);
        let tagName = result.tagName;
        const rawTagName = result.rawTagName;
        let tagExp = result.tagExp;
        let attrExpPresent = result.attrExpPresent;
        let closeIndex = result.closeIndex;

        if (this.options.transformTagName) {
          const newTagName = this.options.transformTagName(tagName);
          if (tagExp === tagName) {
            tagExp = newTagName
          }
          tagName = newTagName;
        }

        if (this.options.strictReservedNames &&
          (tagName === this.options.commentPropName
            || tagName === this.options.cdataPropName
          )) {
          throw new Error(`Invalid tag name: ${tagName}`);
        }

        //save text as child node
        if (currentNode && textData) {
          if (currentNode.tagname !== '!xml') {
            //when nested tag is found
            textData = this.saveTextToParentTag(textData, currentNode, jPath, false);
          }
        }

        //check if last tag was unpaired tag
        const lastTag = currentNode;
        if (lastTag && this.options.unpairedTags.includes(lastTag.tagname)) {
          currentNode = this.tagsNodeStack.pop();
          jPath = jPath.substring(0, jPath.lastIndexOf("."));
        }
        if (tagName !== xmlObj.tagname) {
          jPath += jPath ? "." + tagName : tagName;
        }
        const startIndex = i;
        if (this.isItStopNode(this.stopNodesExact, this.stopNodesWildcard, jPath, tagName)) {
          let tagContent = "";
          //self-closing tag
          if (tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1) {
            if (tagName[tagName.length - 1] === "/") { //remove trailing '/'
              tagName = tagName.substring(0, tagName.length - 1);
              jPath = jPath.substring(0, jPath.length - 1);
              tagExp = tagName;
            } else {
              tagExp = tagExp.substring(0, tagExp.length - 1);
            }
            i = result.closeIndex;
          }
          //unpaired tag
          else if (this.options.unpairedTags.includes(tagName)) {
            i = result.closeIndex;
          }
          //normal tag
          else {
            //read until closing tag is found
            const result = this.readStopNodeData(xmlData, rawTagName, closeIndex + 1);
            if (!result) throw new Error(`Unexpected end of ${rawTagName}`);
            i = result.i;
            tagContent = result.tagContent;
          }

          const childNode = new xmlNode(tagName);

          if (tagName !== tagExp && attrExpPresent) {
            childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
          }
          if (tagContent) {
            tagContent = this.parseTextData(tagContent, tagName, jPath, true, attrExpPresent, true, true);
          }

          jPath = jPath.substring(0, jPath.lastIndexOf("."));
          childNode.add(this.options.textNodeName, tagContent);

          this.addChild(currentNode, childNode, jPath, startIndex);
        } else {
          //selfClosing tag
          if (tagExp.length > 0 && tagExp.lastIndexOf("/") === tagExp.length - 1) {
            if (tagName[tagName.length - 1] === "/") { //remove trailing '/'
              tagName = tagName.substring(0, tagName.length - 1);
              jPath = jPath.substring(0, jPath.length - 1);
              tagExp = tagName;
            } else {
              tagExp = tagExp.substring(0, tagExp.length - 1);
            }

            if (this.options.transformTagName) {
              const newTagName = this.options.transformTagName(tagName);
              if (tagExp === tagName) {
                tagExp = newTagName
              }
              tagName = newTagName;
            }

            const childNode = new xmlNode(tagName);
            if (tagName !== tagExp && attrExpPresent) {
              childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
            }
            this.addChild(currentNode, childNode, jPath, startIndex);
            jPath = jPath.substring(0, jPath.lastIndexOf("."));
          } else if (this.options.unpairedTags.includes(tagName)) {//unpaired tag
            const childNode = new xmlNode(tagName);
            if (tagName !== tagExp && attrExpPresent) {
              // tagName is passed so per-tag entity filtering sees the owning tag,
              // exactly as in the other branches.
              childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
            }
            this.addChild(currentNode, childNode, jPath, startIndex);
            jPath = jPath.substring(0, jPath.lastIndexOf("."));
            i = result.closeIndex;
            // Continue to next iteration without changing currentNode
            continue;
          }
          //opening tag
          else {
            const childNode = new xmlNode(tagName);
            if (this.tagsNodeStack.length > this.options.maxNestedTags) {
              throw new Error("Maximum nested tags exceeded");
            }
            this.tagsNodeStack.push(currentNode);

            if (tagName !== tagExp && attrExpPresent) {
              childNode[":@"] = this.buildAttributesMap(tagExp, jPath, tagName);
            }
            this.addChild(currentNode, childNode, jPath, startIndex);
            currentNode = childNode;
          }
          textData = "";
          i = closeIndex;
        }
      }
    } else {
      textData += xmlData[i];
    }
  }
  return xmlObj.child;
}

/**
 * Attaches `childNode` to `currentNode`, letting `options.updateTag` veto or
 * rename it first: a `false` result drops the child, a string result becomes
 * the child's tag name, anything else leaves it unchanged.
 *
 * @param {xmlNode} currentNode parent node
 * @param {xmlNode} childNode node to attach
 * @param {string} jPath path passed to `updateTag`
 * @param {number} startIndex index of the tag in the source; only forwarded
 *   when `options.captureMetaData` is truthy
 */
function addChild(currentNode, childNode, jPath, startIndex) {
  // unset startIndex if not requested
  if (!this.options.captureMetaData) startIndex = undefined;
  const result = this.options.updateTag(childNode.tagname, jPath, childNode[":@"])
  if (result === false) {
    //do nothing
  } else if (typeof result === "string") {
    childNode.tagname = result
    currentNode.addChild(childNode, startIndex);
  } else {
    currentNode.addChild(childNode, startIndex);
  }
}

/**
 * Replaces entity references in `val`, in this order: DOCTYPE entities,
 * predefined/external entities, HTML entities (when enabled), and `&amp;` last.
 *
 * @param {string} val text or attribute value
 * @param {string} tagName owning tag, checked against `allowedTags` / `tagFilter`
 * @param {string} jPath path of the owning tag, passed to `tagFilter`
 * @returns {string}
 * @throws {Error} when `maxTotalExpansions` or `maxExpandedLength` of the
 *   entity configuration is exceeded by DOCTYPE entity expansion
 */
const replaceEntitiesValue = function (val, tagName, jPath) {
  // Performance optimization: Early return if no entities to replace
  if (val.indexOf('&') === -1) {
    return val;
  }

  const entityConfig = this.options.processEntities;

  if (!entityConfig.enabled) {
    return val;
  }

  // Check tag-specific filtering
  if (entityConfig.allowedTags) {
    if (!entityConfig.allowedTags.includes(tagName)) {
      return val; // Skip entity replacement for current tag as not set
    }
  }

  if (entityConfig.tagFilter) {
    if (!entityConfig.tagFilter(tagName, jPath)) {
      return val; // Skip based on custom filter
    }
  }

  // Replace DOCTYPE entities
  for (let entityName in this.docTypeEntities) {
    const entity = this.docTypeEntities[entityName];
    const matches = val.match(entity.regx);

    if (matches) {
      // Track expansions
      this.entityExpansionCount += matches.length;

      // Check expansion limit
      if (entityConfig.maxTotalExpansions &&
        this.entityExpansionCount > entityConfig.maxTotalExpansions) {
        throw new Error(
          `Entity expansion limit exceeded: ${this.entityExpansionCount} > ${entityConfig.maxTotalExpansions}`
        );
      }

      // Store length before replacement
      const lengthBefore = val.length;
      val = val.replace(entity.regx, entity.val);

      // Check expanded length immediately after replacement
      if (entityConfig.maxExpandedLength) {
        this.currentExpandedLength += (val.length - lengthBefore);

        if (this.currentExpandedLength > entityConfig.maxExpandedLength) {
          throw new Error(
            `Total expanded content size exceeded: ${this.currentExpandedLength} > ${entityConfig.maxExpandedLength}`
          );
        }
      }
    }
  }
  if (val.indexOf('&') === -1) return val; // Early exit

  // Replace standard entities
  for (let entityName in this.lastEntities) {
    const entity = this.lastEntities[entityName];
    val = val.replace(entity.regex, entity.val);
  }
  if (val.indexOf('&') === -1) return val; // Early exit

  // Replace HTML entities if enabled
  if (this.options.htmlEntities) {
    for (let entityName in this.htmlEntities) {
      const entity = this.htmlEntities[entityName];
      val = val.replace(entity.regex, entity.val);
    }
  }

  // Replace ampersand entity last
  val = val.replace(this.ampEntity.regex, this.ampEntity.val);

  return val;
}

/**
 * Flushes text collected so far into `parentNode` as a text node.
 *
 * @param {string} textData collected text
 * @param {xmlNode} parentNode node receiving the text
 * @param {string} jPath path of `parentNode`
 * @param {boolean} [isLeafNode] defaults to "parent has no children yet"
 * @returns {string} `""` when text was processed, otherwise `textData` unchanged
 */
function saveTextToParentTag(textData, parentNode, jPath, isLeafNode) {
  if (textData) { //store previously collected data as textNode
    if (isLeafNode === undefined) isLeafNode = parentNode.child.length === 0

    textData = this.parseTextData(textData,
      parentNode.tagname,
      jPath,
      false,
      parentNode[":@"] ? Object.keys(parentNode[":@"]).length !== 0 : false,
      isLeafNode);

    if (textData !== undefined && textData !== "")
      parentNode.add(this.options.textNodeName, textData);
    textData = "";
  }
  return textData;
}

//TODO: use jPath to simplify the logic
/**
 * Tells whether the current tag is configured as a stop node, either through
 * a wildcard entry (matched on the tag name) or an exact entry (matched on jPath).
 *
 * @param {Set} stopNodesExact
 * @param {Set} stopNodesWildcard
 * @param {string} jPath
 * @param {string} currentTagName
 * @returns {boolean}
 */
function isItStopNode(stopNodesExact, stopNodesWildcard, jPath, currentTagName) {
  if (stopNodesWildcard && stopNodesWildcard.has(currentTagName)) return true;
  if (stopNodesExact && stopNodesExact.has(jPath)) return true;
  return false;
}

/**
 * Returns the tag Expression and where it is ending handling single-double quotes situation
 *
 * Closing characters inside a quoted attribute value are ignored, and tab
 * characters are normalised to spaces in the returned expression.
 *
 * @param {string} xmlData
 * @param {number} i starting index
 * @param {string} [closingChar=">"] one- or two-character terminator
 * @returns {{data: string, index: number}|undefined} `undefined` when the
 *   terminator is never found
 */
function tagExpWithClosingIndex(xmlData, i, closingChar = ">") {
  let attrBoundary;
  let tagExp = "";
  for (let index = i; index < xmlData.length; index++) {
    let ch = xmlData[index];
    if (attrBoundary) {
      if (ch === attrBoundary) attrBoundary = "";//reset
    } else if (ch === '"' || ch === "'") {
      attrBoundary = ch;
    } else if (ch === closingChar[0]) {
      if (closingChar[1]) {
        if (xmlData[index + 1] === closingChar[1]) {
          return {
            data: tagExp,
            index: index
          }
        }
      } else {
        return {
          data: tagExp,
          index: index
        }
      }
    } else if (ch === '\t') {
      ch = " "
    }
    tagExp += ch;
  }
}

/**
 * Finds `str` in `xmlData` starting at `i`.
 *
 * @param {string} xmlData
 * @param {string} str terminator to look for
 * @param {number} i index to start searching from
 * @param {string} errMsg message of the error thrown when `str` is absent
 * @returns {number} index of the LAST character of the found terminator
 * @throws {Error} when the terminator is not found
 */
function findClosingIndex(xmlData, str, i, errMsg) {
  const closingIndex = xmlData.indexOf(str, i);
  if (closingIndex === -1) {
    throw new Error(errMsg)
  } else {
    return closingIndex + str.length - 1;
  }
}

/**
 * Reads the tag that starts at `xmlData[i]` (the `<`) and splits it into name
 * and attribute expression.
 *
 * @param {string} xmlData
 * @param {number} i index of the opening `<`
 * @param {boolean} removeNSPrefix strip everything up to the first `:` from the tag name
 * @param {string} [closingChar=">"]
 * @returns {{tagName: string, tagExp: string, closeIndex: number, attrExpPresent: boolean, rawTagName: string}|undefined}
 *   `undefined` when the tag is not terminated
 */
function readTagExp(xmlData, i, removeNSPrefix, closingChar = ">") {
  const result = tagExpWithClosingIndex(xmlData, i + 1, closingChar);
  if (!result) return;
  let tagExp = result.data;
  const closeIndex = result.index;
  const separatorIndex = tagExp.search(/\s/);
  let tagName = tagExp;
  let attrExpPresent = true;
  if (separatorIndex !== -1) {//separate tag name and attributes expression
    tagName = tagExp.substring(0, separatorIndex);
    tagExp = tagExp.substring(separatorIndex + 1).trimStart();
  }

  // Name as written in the document, namespace prefix included.
  const rawTagName = tagName;
  if (removeNSPrefix) {
    const colonIndex = tagName.indexOf(":");
    if (colonIndex !== -1) {
      tagName = tagName.substring(colonIndex + 1);
      attrExpPresent = tagName !== result.data.substring(colonIndex + 1);
    }
  }

  return {
    tagName: tagName,
    tagExp: tagExp,
    closeIndex: closeIndex,
    attrExpPresent: attrExpPresent,
    rawTagName: rawTagName,
  }
}

/**
 * find paired tag for a stop node
 *
 * Scans forward from `i`, counting nested opening tags with the same name,
 * and skips processing instructions, comments and CDATA sections.
 *
 * @param {string} xmlData
 * @param {string} tagName raw (prefix included) name of the stop node
 * @param {number} i index just after the stop node's opening tag
 * @returns {{tagContent: string, i: number}|undefined} raw inner content and
 *   the index of the closing tag's `>`; `undefined` when no matching closing
 *   tag exists
 */
function readStopNodeData(xmlData, tagName, i) {
  const startIndex = i;
  // Starting at 1 since we already have an open tag
  let openTagCount = 1;

  for (; i < xmlData.length; i++) {
    if (xmlData[i] === "<") {
      if (xmlData[i + 1] === "/") {//close tag
        const closeIndex = findClosingIndex(xmlData, ">", i, `${tagName} is not closed`);
        let closeTagName = xmlData.substring(i + 2, closeIndex).trim();
        if (closeTagName === tagName) {
          openTagCount--;
          if (openTagCount === 0) {
            return {
              tagContent: xmlData.substring(startIndex, i),
              i: closeIndex
            }
          }
        }
        i = closeIndex;
      } else if (xmlData[i + 1] === '?') {
        const closeIndex = findClosingIndex(xmlData, "?>", i + 1, "StopNode is not closed.")
        i = closeIndex;
      } else if (xmlData.startsWith('!--', i + 1)) {
        const closeIndex = findClosingIndex(xmlData, "-->", i + 3, "StopNode is not closed.")
        i = closeIndex;
      } else if (xmlData.startsWith('![', i + 1)) {
        const closeIndex = findClosingIndex(xmlData, "]]>", i, "StopNode is not closed.") - 2;
        i = closeIndex;
      } else {
        // Keep the namespace prefix (removeNSPrefix = false): `tagName` is the
        // raw name, so nested opening tags must be compared in raw form too.
        const tagData = readTagExp(xmlData, i, false, '>')

        if (tagData) {
          const openTagName = tagData.tagName;
          if (openTagName === tagName && tagData.tagExp[tagData.tagExp.length - 1] !== "/") {
            openTagCount++;
          }
          i = tagData.closeIndex;
        }
      }
    }
  }//end for loop
}

/**
 * Converts a string value to boolean or number when parsing is requested.
 *
 * @param {*} val
 * @param {boolean} shouldParse when falsy, or when `val` is not a string,
 *   `val` is returned as-is if `isExist(val)` holds and `''` otherwise
 * @param {object} options forwarded to `toNumber`
 * @returns {*}
 */
function parseValue(val, shouldParse, options) {
  if (shouldParse && typeof val === 'string') {
    const newval = val.trim();
    if (newval === 'true') return true;
    else if (newval === 'false') return false;
    else return toNumber(val, options);
  } else {
    if (isExist(val)) {
      return val;
    } else {
      return '';
    }
  }
}

/**
 * Resolves a numeric character reference.
 *
 * @param {string} str digits of the reference
 * @param {number} base radix of `str` (10 or 16 at the call sites in this module)
 * @param {string} prefix original reference prefix ("&#" or "&#x")
 * @returns {string} the character, or the original reference text when the
 *   code point is outside 0..0x10FFFF
 */
function fromCodePoint(str, base, prefix) {
  const codePoint = Number.parseInt(str, base);

  if (codePoint >= 0 && codePoint <= 0x10FFFF) {
    return String.fromCodePoint(codePoint);
  } else {
    return prefix + str + ";";
  }
}