minify-xml
Version:
Fast XML minifier / compressor / uglifier with a command-line
406 lines (357 loc) • 24.2 kB
JavaScript
// option value that selects the "strict" variant of a minification step
const strict = "strict";

/**
 * Translate a user-facing option value into the internal `strict` flag.
 *
 * @param {boolean|string} option The option value as given by the caller (true / false or 'strict')
 * @returns {{strict: true}|false} An options object with the strict flag set, or `false` if the option is not 'strict'
 */
const strictOption = (option) => (option === strict ? { strict: true } : false);
/**
* Options to minify an XML document.
*
* @typedef {object} MinifyOptions
* @property {boolean} removeComments Remove XML comments <!-- ... -->
* @property {boolean|string} removeWhitespaceBetweenTags Remove whitespace only between tags <anyTag/> <anyOtherTag/> (true / false or 'strict', strict will not consider prolog / doctype, as tags)
* @property {boolean} considerPreserveWhitespace Preserve whitespace in texts of <pre> tags and of tags with an xml:space="preserve" attribute, when trimming / collapsing whitespace in texts
* @property {boolean} collapseWhitespaceInTags Remove / collapse whitespace in tags <anyTag attributeA = "..." attributeB = "..."> ... </anyTag >
* @property {boolean} collapseEmptyElements Collapse elements with start / end tags and no content to empty element tags <anyTag anyAttribute = "..." ></anyTag >
* @property {boolean|string} trimWhitespaceFromTexts Remove / trim whitespace in texts like <anyTag> foo </anyTag> (true / false or 'strict', strict will not consider prolog / doctype, as tags)
* @property {boolean|string} collapseWhitespaceInTexts Collapse whitespace in texts like <anyTag>foo bar baz</anyTag> (true / false or 'strict', strict will not consider prolog / doctype, as tags)
* @property {boolean} collapseWhitespaceInProlog Remove / collapse whitespace in the xml prolog <?xml version = "1.0" ?>
* @property {boolean} collapseWhitespaceInDocType Remove / collapse whitespace in the xml document type declaration <!DOCTYPE DocType >
* @property {boolean} removeSchemaLocationAttributes Remove any xsi:schemaLocation / xsi:noNamespaceSchemaLocation attributes <anyTag xsi:schemaLocation="/schema/" />
* @property {boolean} removeUnnecessaryStandaloneDeclaration Remove unnecessary standalone declaration in prolog <?xml standalone = "yes" ?>
* @property {boolean} removeUnusedNamespaces Remove namespace declarations xmlns:prefix="..." of prefixes that are not used anywhere in the document
* @property {boolean} removeUnusedDefaultNamespace Remove unused default namespace declaration if no tags with no namespace declaration are present
* @property {boolean} shortenNamespaces Shorten existing (non already one character namespaces) to a shorter equivalent
* @property {boolean} ignoreCData Ignore CDATA sections <![CDATA[ ... ]]>
*/
/**
* The default options applied when minifying an XML document.
*
* The minify function spreads these options first and the options given by the caller on top,
* so callers only have to pass the options they want to change.
*
* @type {MinifyOptions}
*/
export const defaultOptions = {
// remove XML comments <!-- ... -->
removeComments: true,
// remove whitespace only between tags <anyTag/> <anyOtherTag/>
removeWhitespaceBetweenTags: true, // true / false or 'strict' (will not consider prolog / doctype, as tags)
// skip text trimming / collapsing where the preserve pattern matches (<pre> tags, xml:space="preserve")
considerPreserveWhitespace: true,
// collapse whitespace inside of tags, e.g. between attributes and around the attribute equal signs
collapseWhitespaceInTags: true,
// turn <anyTag></anyTag> into <anyTag/>
collapseEmptyElements: true,
// trim leading / trailing whitespace of texts (disabled by default)
trimWhitespaceFromTexts: false, // true / false or 'strict'
// collapse runs of whitespace inside of texts into a single space (disabled by default)
collapseWhitespaceInTexts: false, // true / false or 'strict'
// collapse whitespace in the xml prolog <?xml version = "1.0" ?>
collapseWhitespaceInProlog: true,
// collapse whitespace in the document type declaration <!DOCTYPE DocType >
collapseWhitespaceInDocType: true,
// remove xsi:schemaLocation / xsi:noNamespaceSchemaLocation attributes (disabled by default)
removeSchemaLocationAttributes: false,
// remove the standalone declaration from the prolog, if the document type declaration allows it
removeUnnecessaryStandaloneDeclaration: true,
// remove xmlns:prefix declarations of prefixes that are not used in the document
removeUnusedNamespaces: true,
// remove the default xmlns declaration if the document contains no tag without a prefix
removeUnusedDefaultNamespace: true,
// rename namespace prefixes to shorter ones
shortenNamespaces: true,
// leave the contents of CDATA sections <![CDATA[ ... ]]> untouched
ignoreCData: true
};
/**
 * Strip leading and trailing whitespace (including BOM and non-breaking spaces) from a string.
 *
 * Only `replace` is called on the argument, and exactly twice, so the function also works with
 * string-like objects that implement nothing but `replace`.
 *
 * @param {string} string The string (or string-like object) to trim
 * @returns {string} The result of the second `replace` call
 */
function trim(string) {
    const withoutLeading = string.replace(/^[\s\uFEFF\xA0]+/g, "");
    return withoutLeading.replace(/[\s\uFEFF\xA0]+$/g, "");
}
// an empty regular expression, used wherever an optional lookbehind / lookahead was not specified
const emptyRegExp = new RegExp();
// the source of the empty regular expression, for composing patterns out of strings
const emptyPattern = emptyRegExp.source;
// the flag passed to all composed regular expressions that should replace / find every occurrence
const regExpGlobal = "g";
/**
 * Escape all characters with a special meaning in regular expressions, so the string
 * can be embedded literally into the source of a regular expression.
 *
 * @param {string} string The string to escape
 * @returns {string} The string with every special character prefixed by a backslash
 */
function escapeRegExp(string) {
    const specialCharacters = /[.*+\-?^${}()|[\]\\]/g;
    return string.replace(specialCharacters, (character) => `\\${character}`);
}
/**
 * Find all matches of a regular expression in a string.
 *
 * @param {string} string The string to search. Only passed to `RegExp.prototype.exec`, so string-like objects are coerced to a string
 * @param {RegExp} regexp The regular expression to search for
 * @param {number} [group] If a number is given, collect only the (non-empty) contents of this capturing group, otherwise collect the whole match arrays
 * @returns {Array<string|RegExpExecArray>} The contents of the capturing group of every match, or all match arrays
 */
function findAllMatches(string, regexp, group) {
    // exec() advances lastIndex only for global (or sticky) expressions, any other expression would
    // return the very same first match forever, so search with a global copy of the expression instead
    const pattern = regexp.global || regexp.sticky ? regexp : new RegExp(regexp.source, `${regexp.flags}g`);
    const matches = [];
    let match;
    while ((match = pattern.exec(string))) {
        if (match[0] === "") {
            pattern.lastIndex++; // a zero-length match does not advance lastIndex, step over it to avoid an endless loop
        }
        if (typeof group !== "number") {
            matches.push(match);
        } else if (match[group]) {
            matches.push(match[group]);
        }
    }
    return matches;
}
// note: this funky looking positive lookbehind regular expression is necessary to match contents inside of tags <...>. this
// is due to the fact that literally any character except <&" is allowed to be put almost everywhere in XML. as even > is an allowed
// character, simply checking for (?<=<[^>]*) would not do the trick if e.g. > is used inside of a tag attribute.
//
// all patterns below are stored as regular expression sources (strings), so they can be composed into larger expressions:
// - tagPattern: a lookbehind asserting that the match is preceded by "<" (or "</"), a tag name and any number of attributes with
//   quoted values, i.e. that the match lies inside of a tag. the "%1" placeholder is substituted with an additional pattern that
//   has to directly precede the match, see findAllMatchesInTags / replaceInTags
// - noTagPattern: any (possibly empty) run of characters other than "<"
const tagPattern = /(?<=<\/?[^?!\s\/>]+\b(?:\s+[^=\s>]+\s*=\s*(?:"[^"]*"|'[^']*'))*%1)/.source, noTagPattern = /[^<]*/.source,
// - bracketPattern: the tagPattern, with the part after the opening "<" wrapped into a group of alternatives, that next to a tag
//   also accepts a comment (<!-- ... --), a CDATA / bracketed section (<![...[ ... ]]), a document type declaration (<!DOCTYPE ...)
//   or a prolog / processing instruction (<? ...). the "%1" placeholder is kept, directly after the closing bracket of the group
bracketPattern = tagPattern.replace(/(?<!\(\?)</, "<(?:" + /!\s*(?:--(?:[^-]|-[^-])*--\s*)|!\[(?:CDATA|.*?)\[(?:[^\]]|][^\]]|]][^>])*]]|!DOCTYPE\s+(?:[^>[]|\[[^\]]*\])*|\?[^>]*|/.source).replace("%1", ")%1"),
// - prologPattern: the tagPattern, with the tag name part (everything between the opening "<" and the \b) replaced by \?xml, so
//   it only matches inside of the xml prolog <?xml ... ?>
prologPattern = tagPattern.replace(/(?<=(?<!\(\?)<).*(?=\\b)/, "\\?xml"),
// - docTypePattern: matches a whole document type declaration, capturing (1) the name, (2) the SYSTEM / PUBLIC keyword,
//   (3) the first and (4) the second quoted literal and (5) the contents of the internal subset [ ... ]
docTypePattern = /<!DOCTYPE\s+([^\s>[]+)(?:\s+(SYSTEM|PUBLIC)\s+("[^"]*"|'[^']*')(?:\s+("[^"]*"|'[^']*'))?)?(?:\s*\[([^\]]*)\])?\s*>/.source,
// - preservePattern: a negative lookbehind, failing if the match is preceded by a <pre> tag (with or without namespace prefix)
//   or by a tag ending in an xml:space="preserve" attribute, optionally followed by further attributes
preservePattern = /(?<!<(?:[^\s\/>:]+:)?pre[^<]*?>|\s+xml:space\s*=\s*(?:"preserve"|'preserve'|preserve)(?:\s+[^=\s>]+\s*=\s*(?:"[^"]*"|'[^']*'))*\s*>)/.source;
/**
 * Find all matches of a regular expression, that are located inside of tags.
 *
 * @param {string} xml The XML document to search
 * @param {RegExp} regexp The regular expression to search for (only its source is used)
 * @param {object} [options] `tagPattern` overrides the tag lookbehind pattern, `lookbehindPattern` / `lookbehind`
 *     specify what has to directly precede the match, `group` is passed on to findAllMatches
 * @returns {Array} The result of findAllMatches for the composed regular expression
 */
function findAllMatchesInTags(xml, regexp, options = { tagPattern, lookbehind: emptyRegExp, lookbehindPattern: String(), group: 0 }) {
    // a lookbehind given as pattern (string) takes precedence over one given as regular expression
    let precededBy = options.lookbehindPattern;
    if (!precededBy) {
        precededBy = (options.lookbehind || emptyRegExp).source;
    }
    const insideTag = (options.tagPattern || tagPattern).replace("%1", precededBy);
    const expression = new RegExp(insideTag + regexp.source, regExpGlobal);
    return findAllMatches(xml, expression, options.group);
}
// include non-tags means declaration like <! comments / doctype declaration and <? prolog / processing instructions
/**
 * Replace all matches of a regular expression, that are located inside of tags.
 *
 * @param {string} xml The XML document (or a string-like object implementing `replace`)
 * @param {RegExp} regexp The regular expression to replace (only its source is used)
 * @param {string|Function} replacement The replacement, passed on to `replace`
 * @param {object} [options] `tagPattern` overrides the tag lookbehind pattern, `lookbehindPattern` / `lookbehind`
 *     specify what has to directly precede the match
 * @returns {string} The result of the `replace` call
 */
function replaceInTags(xml, regexp, replacement, options = { tagPattern, lookbehind: emptyRegExp, lookbehindPattern: String() }) {
    // a lookbehind given as pattern (string) takes precedence over one given as regular expression
    let precededBy = options.lookbehindPattern;
    if (!precededBy) {
        precededBy = (options.lookbehind || emptyRegExp).source;
    }
    const insideTag = (options.tagPattern || tagPattern).replace("%1", precededBy);
    const expression = new RegExp(insideTag + regexp.source, regExpGlobal);
    return xml.replace(expression, replacement);
}
// default options of the replaceBetween... functions: no additional lookbehind / lookahead
const defaultReplaceBetweenOptions = {
    lookbehind: emptyRegExp,
    lookbehindPattern: "",
    lookahead: emptyRegExp,
    lookaheadPattern: ""
};
/**
 * Replace all matches of a regular expression, that are located between the closing bracket of a tag
 * and the opening bracket of something other than a <? or <! declaration.
 *
 * @param {string} xml The XML document (or a string-like object implementing `replace`)
 * @param {RegExp} regexp The regular expression to replace (only its source is used)
 * @param {string|Function} replacement The replacement, passed on to `replace`
 * @param {object} [options] `lookbehindPattern` / `lookbehind` have to match between the tag and the match,
 *     `lookaheadPattern` / `lookahead` have to match between the match and the next opening bracket
 * @returns {string} The result of the `replace` call
 */
function replaceBetweenTags(xml, regexp, replacement, options = defaultReplaceBetweenOptions) {
    const behind = options.lookbehindPattern || (options.lookbehind || emptyRegExp).source;
    const ahead = options.lookaheadPattern || (options.lookahead || emptyRegExp).source;
    // the match has to follow the end of a tag > or />
    const lookbehindPattern = `\\s*/?>${behind}`;
    // and has to be followed by the start of a tag, but not by <? or <!
    const expression = new RegExp(`${regexp.source}(?=${ahead}<[^?!])`);
    return replaceInTags(xml, expression, replacement, { lookbehindPattern });
}
/**
 * Replace all matches of a regular expression, that are located between any closing and any opening bracket,
 * so unlike replaceBetweenTags, <? and <! declarations count as well (using the bracketPattern).
 *
 * @param {string} xml The XML document (or a string-like object implementing `replace`)
 * @param {RegExp} regexp The regular expression to replace (only its source is used)
 * @param {string|Function} replacement The replacement, passed on to `replace`
 * @param {object} [options] `lookbehindPattern` / `lookbehind` have to match between the closing bracket and the match,
 *     `lookaheadPattern` / `lookahead` have to match between the match and the next opening bracket
 * @returns {string} The result of the `replace` call
 */
function replaceBetweenBrackets(xml, regexp, replacement, options = defaultReplaceBetweenOptions) {
    const behind = options.lookbehindPattern || (options.lookbehind || emptyRegExp).source;
    const ahead = options.lookaheadPattern || (options.lookahead || emptyRegExp).source;
    // the match has to follow a closing bracket >, />, ?> or !>
    const lookbehindPattern = `\\s*[!?/]?>${behind}`;
    // and has to be followed by any opening bracket
    const expression = new RegExp(`${regexp.source}(?=${ahead}<)`);
    return replaceInTags(xml, expression, replacement, { tagPattern: bracketPattern, lookbehindPattern });
}
/**
 * Replace all matches of a regular expression between tags (strict) or between any kind of brackets (non-strict).
 *
 * @param {string} xml The XML document (or a string-like object implementing `replace`)
 * @param {RegExp} regexp The regular expression to replace
 * @param {string|Function} replacement The replacement
 * @param {object|false} [options] If `strict` is set, only tags are considered, all options are passed on to the replace function
 * @returns {string} The result of the replacement
 */
function replaceBetween(xml, regexp, replacement, options = { ...defaultOptions, strict: false }) {
    // if not strict also consider the prolog <?xml ... ?>, processing instructions <?pi ... ?>, the document type declaration
    // <!DOCTYPE ... >, CDATA sections <![CDATA[ ... ]]> and comments <!-- ... --> as tags here
    const replace = options.strict ? replaceBetweenTags : replaceBetweenBrackets;
    return replace(xml, regexp, replacement, options);
}
/**
 * Wrap a replacement into a replacement function, that leaves all matches inside of CDATA sections untouched.
 *
 * @param {string|Function} replacement The replacement string (supporting $1, $2, ... and $& / $0) or replacement function
 * @returns {Function} A replacement function to pass to `replace`
 */
function ignoreCData(replacement) {
    return function(match, ...rest) {
        // replacement functions are called with (match, ...captures, offset, string[, groups]). the trailing groups
        // object is only present if the regular expression contains named capturing groups
        const parameters = [match, ...rest];
        let stringIndex = parameters.length - 1;
        if (typeof parameters[stringIndex] === "object") {
            stringIndex--; // skip the groups object
        }
        const string = parameters[stringIndex];
        const offset = parameters[stringIndex - 1];
        const captures = parameters.slice(1, stringIndex - 1);

        // the match lies inside of a CData section, if a CData section was opened but not closed before the offset
        const precedingText = string.substring(0, offset);
        if (/<!\[CDATA\[(?![\s\S]*?]]>)/.test(precedingText)) {
            return match; // do not replace anything
        }

        // replacement functions are called with the very same arguments
        if (typeof replacement === "function") {
            return replacement.apply(this, parameters);
        }

        // replacement strings need their references to the match / capturing groups resolved manually
        return replacement.replace(/(?<!\$)\$(\d+|\&)/g, (reference, number) => {
            if (number === "0" || number === "&") {
                return match;
            }
            return captures[Number.parseInt(number - 1, 10)] || "";
        });
    };
}
/**
 * Minify an XML document.
 *
 * The document is only ever transformed by calling `replace` on it (plus `includes` / `match` and regular
 * expression tests for some options), so a string-like object implementing these methods may be passed
 * instead of a string, as the stream and debug variants of this function do.
 *
 * @param {string} xml The XML document to minify
 * @param {MinifyOptions} [options=defaultOptions] The options to minify the XML document
 * @returns {string} The minified XML document
 */
export function minify(xml, options) {
    // apply the default options
    options = {
        ...defaultOptions,
        ...(options || {})
    };

    // decide on whether to use the ignoreCData replacement function or not, to improve performance
    const replacer = options.ignoreCData && xml.includes("<![CDATA[") ? ignoreCData : replacement => replacement, emptyReplacer = replacer(String());

    // remove all comments <!-- ... --> from the given document
    function removeComments(xml) {
        return xml.replace(/<!\s*(?:--(?:[^-]|-[^-])*--\s*)>/g, emptyReplacer);
    }

    // collapse the whitespace inside of all tags matched by the given tag pattern
    function collapseWhitespaceInTags(xml, options = { tagPattern }) {
        xml = replaceInTags(xml, /\s+/, replacer(" "), options); // collapse whitespace between attributes
        xml = replaceInTags(xml, /\s*=\s*/, replacer("="), { ...options, lookbehind: /\s+[^=\s>]+/ }); // remove leading / tailing whitespace around attribute equal signs
        xml = replaceInTags(xml, /\s+(?=[/?]?>)/, emptyReplacer, options); // remove whitespace before closing > /> ?> of tags
        return xml;
    }

    // a simplified minify for the internal subset declaration of the document type
    function minifyInternalSubset(subset) {
        subset = removeComments(subset); // remove comments
        subset = subset.replace(/\s+/g, " "); // collapse whitespace
        subset = subset.replace(/>\s+</g, "><"); // remove any whitespace between declarations (assuming that > cannot appear in the declarations themselves)
        return subset.trim ? subset.trim() : trim(subset);
    }

    // remove XML comments <!-- ... -->
    if (options.removeComments) {
        xml = removeComments(xml);
    }

    // remove whitespace only between tags <anyTag/> <anyOtherTag/>
    if (options.removeWhitespaceBetweenTags) {
        xml = replaceBetween(xml, /\s+/, emptyReplacer, strictOption(options.removeWhitespaceBetweenTags));
    }

    // remove any xsi:schemaLocation / xsi:noNamespaceSchemaLocation attributes <anyTag xsi:schemaLocation="/schema/" />
    if (options.removeSchemaLocationAttributes) {
        xml = replaceInTags(xml, /\s+xsi:(?:noNamespaceS|s)chemaLocation\s*=\s*(?:"[^"]*"|'[^']*')/, replacer(" "));
    }

    // remove / collapse whitespace in tags <anyTag attributeA = "..." attributeB = "..."> ... </anyTag >
    if (options.collapseWhitespaceInTags) {
        xml = collapseWhitespaceInTags(xml);
    }

    // collapse elements with start / end tags and no content to empty element tags <anyTag anyAttribute = "..." ></anyTag >
    if (options.collapseEmptyElements) {
        xml = xml.replace(/<([^\s\/>]+)([^<]*?)(?<!\/)><\/\1\s*>/g, replacer("<$1$2/>"));
    }

    // remove / trim whitespace in texts like <anyTag> foo </anyTag>
    if (options.trimWhitespaceFromTexts) {
        // note, to avoid zero-length matches use two replaceBetween here (a zero-length match causes an endless loop in replacestream)
        xml = replaceBetween(xml, /\s+/, emptyReplacer, { lookbehindPattern: options.considerPreserveWhitespace ?
            preservePattern : null, lookaheadPattern: noTagPattern, ...strictOption(options.trimWhitespaceFromTexts) });
        xml = replaceBetween(xml, /\s+/, emptyReplacer, { lookbehindPattern: (options.considerPreserveWhitespace ?
            preservePattern : String()) + noTagPattern, ...strictOption(options.trimWhitespaceFromTexts) });
    }

    // collapse whitespace in texts like <anyTag>foo bar baz</anyTag>
    if (options.collapseWhitespaceInTexts) {
        xml = replaceBetween(xml, /\s+/, replacer(" "), { lookbehindPattern: (options.considerPreserveWhitespace ?
            preservePattern : emptyPattern ) + noTagPattern, lookaheadPattern: noTagPattern, ...strictOption(options.collapseWhitespaceInTexts) });
    }

    // remove unnecessary standalone declaration in prolog <?xml standalone = "yes" ?>
    // the standalone declaration has "no meaning" according to the W3C definition, in case neither the external subset of the DocType declaration
    // contains any markup declarations (<!ELEMENT, <!ATTLIST, <!ENTITY, <!NOTATION) nor a parameter entity (<!ENTITY %) is defined in any subset
    // (because we do not read the external subset definition file e.g. schema.dtd, we assume as soon as either a SYSTEM/PUBLIC subset is defined, the standalone attribute must stay)
    if (options.removeUnnecessaryStandaloneDeclaration) {
        const docType = xml.match(new RegExp(docTypePattern));
        if (!docType || (!docType[2] && !(docType[5] && /<!ENTITY\s+%/.test(docType[5])))) {
            xml = replaceInTags(xml, /\s+standalone\s*=\s*(?:"yes"|'yes'|yes|"no"|'no'|no)/, emptyReplacer, { tagPattern: prologPattern });
        }
    }

    // remove / collapse whitespace in the xml prolog <?xml version = "1.0" ?>
    if (options.collapseWhitespaceInProlog) {
        xml = collapseWhitespaceInTags(xml, { tagPattern: prologPattern });
    }

    // remove / collapse whitespace in the xml document type declaration <!DOCTYPE DocType >
    if (options.collapseWhitespaceInDocType) {
        xml = xml.replace(new RegExp(docTypePattern), replacer((match, name, type, literal1, literal2, subset) => {
            // the optional external identifier, e.g. PUBLIC "..." "...", tokens not present are skipped
            const externalId = [type, literal1, literal2].map(token => token && " " + token).join(String());
            const internalSubset = subset ? `[${ minifyInternalSubset(subset) }]` : String();
            return `<!DOCTYPE ${name}${externalId}${internalSubset}>`;
        }));
    }

    // remove unused namespaces and shorten the remaining ones to a minimum length
    if (options.removeUnusedNamespaces || options.shortenNamespaces) {
        // the search for all xml namespaces in tags could result in some "fake" namespaces if a xmlns:... string is found inside of CDATA
        // tags. this however comes with no major drawback as we then replace only inside of tags and thus it simplifies the search
        let all = [...new Set(findAllMatchesInTags(xml, /\s+xmlns:([^\s=]+)\s*=/g, { group: 1 }))];

        // remove namespace declarations which are not used anywhere in the document (limitation: the approach taken here will not consider the structure of the XML document
        // thus namespaces which might be only used in a certain sub-tree of elements might not be removed, even though they are not used in that sub-tree)
        if (options.removeUnusedNamespaces) {
            let used = [...new Set([
                ...findAllMatches(xml, /<([^\s\/>:]+):/g, 1), // look for all tags with namespaces (limitation: might also include tags inside of CData, we ignore that for now)
                ...findAllMatchesInTags(xml, /([^\s=:]+):/, { lookbehind: /\s+/, group: 1 }) // look for all attributes with namespaces
            ])].filter(ns => ns !== "xmlns"), unused = all.filter(ns => !used.includes(ns));

            if (unused.length) {
                xml = replaceInTags(xml, new RegExp(`\\s+xmlns:(?:${ unused.map(escapeRegExp).join("|") })\\s*=\\s*(?:"[^"]*"|'[^']*')`), emptyReplacer);
                // only the declared namespaces that are in use are still present in the file. do not continue with the list of used prefixes
                // here, as it also contains prefixes which have never been declared, like the reserved "xml" prefix of xml:space / xml:lang,
                // which must not be shortened below
                all = all.filter(ns => used.includes(ns));
            }
        }

        // special case: remove unused default namespace declaration if no tags with no namespace declaration are present
        // (it's impossible for attributes with namespaces to refer back to the default namespace, so we can omit searching for them)
        // the prolog / processing instructions <? ... ?> and declarations <! ... > are no tags, so they are excluded from the search
        if (options.removeUnusedDefaultNamespace && !/<([^?!\s\/>:]+)[\s\/>]/.test(xml)) {
            xml = replaceInTags(xml, /\s+xmlns\s*=\s*(?:"[^"]*"|'[^']*')/, emptyReplacer);
        }

        // shorten existing (non already one character namespaces) to a shorter equivalent
        if (options.shortenNamespaces) {
            // namespace prefixes start with a letter or underscore, any following character may also be a digit, hyphen or dot
            const startCharset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_",
                charset = startCharset.substring(0, 52) + "0123456789-_.";

            // called without arguments, find the shortest namespace not yet part of the list of all namespaces, by trying all
            // namespaces of length 1 first, then of length 2, and so on. called with arguments, find the first unused
            // namespace starting with the given prefix, followed by the given number of additional characters
            function firstUnusedNamespace(prefix, length) {
                if (!arguments.length) {
                    for (length = 1; !(prefix = firstUnusedNamespace(
                        String(), length)); length++);
                    return prefix;
                } else if (!length) {
                    return prefix;
                }

                const chars = prefix ? charset : startCharset;
                for (let char = 0; char < chars.length; char++) {
                    let ns = firstUnusedNamespace(prefix + chars[char], length - 1);
                    if (ns && !all.includes(ns)) {
                        return ns;
                    }
                }
                return false; // for this length / prefix there is no unused namespace to choose from
            }

            all.forEach((ns, idx) => {
                // never shorten the special "xsi" namespace or if already at absolute minimal length
                if (ns === "xsi" || ns.length === 1) {
                    return;
                }

                // try to shorten the existing namespace to one character first, if it is occupied already, find the first unused one by brute force
                let newNs = !all.includes(ns[0]) ? ns[0] : firstUnusedNamespace();
                if (ns.length <= newNs.length) {
                    return; // already at minimal length
                }

                // namespaces may contain characters with a special meaning in regular expressions, like the dot
                const nsPattern = escapeRegExp(ns);

                // replace all occurrences of the namespace in the document and mark it as "used"
                xml = xml.replace(new RegExp(`<(/)?${nsPattern}:`, regExpGlobal), replacer(`<$1${newNs}:`)); // tags with namespaces
                xml = replaceInTags(xml, new RegExp(`${nsPattern}:`), replacer(`${newNs}:`), { lookbehind: /\s+/ }); // attributes with namespaces
                xml = replaceInTags(xml, new RegExp(`xmlns:${nsPattern}(?=[\\s=])`), replacer(`xmlns:${newNs}`), { lookbehind: /\s+/ }); // namespace declaration
                all[idx] = newNs;
            });
        }
    }

    // strings are trimmed natively, string-like objects by the replace based polyfill
    return xml.trim ? xml.trim() : trim(xml);
}
export default minify;
import pumpify from "pumpify"; // XXX: to be replaced by node:stream compose as soon as it is stable
import replaceStream from "replacestream"; // note that replacestream does NOT support zero-length regex matches!
import { PassThrough } from "node:stream";
/**
* Options that are specific to minifying an XML document stream.
*
* @typedef {object} MinifyStreamSpecificOptions
* @property {number} streamMaxMatchLength The maximum size of matches between chunks
*/
/**
* Options to minify an XML document stream.
*
* @typedef {Omit<MinifyOptions, "removeUnnecessaryStandaloneDeclaration" | "removeUnusedNamespaces" | "removeUnusedDefaultNamespace" | "shortenNamespaces" | "ignoreCData"> & MinifyStreamSpecificOptions} MinifyStreamOptions
*/
// options that cannot be used when streaming, because they require prior knowledge about the document: e.g.
// 'removeUnnecessaryStandaloneDeclaration' has to read the DocType first and 'removeUnusedNamespaces' needs
// to scan the whole document for the namespaces in use
const unsupportedStreamOptions = [
    "removeUnnecessaryStandaloneDeclaration",
    "removeUnusedNamespaces",
    "removeUnusedDefaultNamespace",
    "shortenNamespaces",
    "ignoreCData"
];
/**
 * The default options applied when minifying an XML document stream: the default options
 * for minifying a document, with all options not supported when streaming turned off.
 *
 * @type {MinifyStreamOptions}
 */
export const defaultStreamOptions = Object.assign(
    {},
    defaultOptions,
    // 256 KiB, maximum size of matches between chunks
    { streamMaxMatchLength: 256 * 1024 },
    // all these options require prior knowledge about the stream, for instance if we are in a CData block, or what namespaces are present
    Object.fromEntries(unsupportedStreamOptions.map(option => [option, false]))
);
/**
 * Minify an XML document stream.
 *
 * @param {MinifyStreamOptions} [options=defaultStreamOptions] The options to minify the XML document stream
 * @returns {import('node:stream').Duplex} A duplex stream that minifies an XML document
 * @throws {Error} If an option is enabled, that cannot be used with streams
 */
export function minifyStream(options) {
    // apply the default options
    options = { ...defaultStreamOptions, ...(options || {}) };

    // options requiring prior knowledge about the whole document are not supported w/ streams
    for (const option of unsupportedStreamOptions) {
        if (options[option]) {
            throw new Error(`The '${option}' option cannot be used with streams, as it requires prior knowledge about the stream to minify`);
        }
    }

    // the minify function accepts strings only, however with all unsupported options disabled it does nothing but repeatedly
    // call 'replace' on its input (even for trimming, by using a polyfill 'trim' function). taking advantage of duck typing,
    // pass an object resembling a very simple string, that captures every call to 'replace' as a 'replacestream' instead
    const streams = [];
    const replaceOptions = { maxMatchLen: options.streamMaxMatchLength };
    const stringImposter = {
        replace(...args) {
            streams.push(replaceStream(...args, replaceOptions));
            return stringImposter;
        }
    };

    // called with the string-like object, minify collects one (replace)stream per minification step
    minify(stringImposter, options);

    switch (streams.length) {
        case 0:
            // nothing to replace, pass the document through unchanged
            return new PassThrough();
        case 1:
            // according to the original author replacestream returns an old transform stream that is not async. iterable,
            // which is worked around by piping it through a PassThrough stream
            // NOTE(review): pipe() returns the PassThrough, so data written to the returned stream would not pass the
            // replace stream. minify always ends with a trim (two replace calls), so this branch looks unreachable - confirm
            return streams[0].pipe(new PassThrough());
        default:
            // pumpify all streams into one, so piping data into it applies all minifications in order
            return pumpify(streams);
    }
}
import { pipeline } from "node:stream/promises";
/**
 * Minify an XML document pipeline, by piping the source through a minify stream into the destination.
 *
 * @param {import("node:stream").PipelineSource<string>} source The source of the XML document pipeline
 * @param {import("node:stream").PipelineDestination<import("node:stream").PipelineTransformSource<string>, string>} destination The destination of the XML document pipeline
 * @param {MinifyStreamOptions} [options=defaultStreamOptions] The options to minify the XML document pipeline, the `end` option is passed on to the pipeline
 * @returns {import("node:stream").PipelinePromise<import("node:stream").PipelineDestination<any, string>>} A promise that resolves into the destination of the XML document pipeline
 */
export const minifyPipeline = async (source, destination, options) => {
    const minifier = minifyStream(options);
    return await pipeline(source, minifier, destination, { end: options?.end });
};
/**
 * Debug minifying an XML document, by dumping every regular expression applied (and, if an XML document
 * is given, the document after each replacement) into the console.
 *
 * @ignore
 * @param {string} [xml] The XML document to debug minifying, if omitted only the regular expressions are dumped
 * @param {MinifyOptions} [options=defaultOptions] The options to minify the XML document
 */
export function debug(xml, options) {
    xml && console.log(`\x1b[90m${xml}\x1b[0m`);

    // the minify function accepts strings only, however it only calls a few methods on its input, so we can take advantage
    // of duck typing here and pass an object, that forwards all calls to the current state of the document
    const stringImposter = {
        includes: (...args) => xml ? xml.includes(...args) : true,
        // used by minify to read the document type declaration, without it minify throws with the default options
        match: (...args) => xml ? xml.match(...args) : null,
        // regular expressions executed on the imposter coerce it into a string, let them see the document instead of "[object Object]"
        toString: () => xml || "",
        replace(...args) {
            console.log(`\x1b[31m${args[0]}\x1b[0m`);
            xml && console.log(`\x1b[90m${xml = xml.replace(...args)}\x1b[0m`);
            return stringImposter;
        }
    };

    // called with the string-like object, to dump all regular expressions into the console
    minify(stringImposter, options);
}