xmldom-sre
Version:
A pure JavaScript W3C standard-based (XML DOM Level 2 Core) DOMParser and XMLSerializer module.
510 lines (480 loc) • 17.5 kB
JavaScript
'use strict';
var conventions = require('./conventions');
var dom = require('./dom');
var entities = require('./entities');
var sax = require('./sax');
var DOMImplementation = dom.DOMImplementation;
var MIME_TYPE = conventions.MIME_TYPE;
var NAMESPACE = conventions.NAMESPACE;
var ParseError = sax.ParseError;
var XMLReader = sax.XMLReader;
/**
* Normalizes line ending according to <https://www.w3.org/TR/xml11/#sec-line-ends>:
*
* > XML parsed entities are often stored in computer files which,
* > for editing convenience, are organized into lines.
* > These lines are typically separated by some combination
* > of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
* >
* > To simplify the tasks of applications, the XML processor must behave
* > as if it normalized all line breaks in external parsed entities (including the document entity)
* > on input, before parsing, by translating all of the following to a single #xA character:
* >
* > 1. the two-character sequence #xD #xA
* > 2. the two-character sequence #xD #x85
* > 3. the single character #x85
* > 4. the single character #x2028
* > 5. any #xD character that is not immediately followed by #xA or #x85.
*
* @param {string} input
* @returns {string}
*/
function normalizeLineEndings(input) {
return input.replace(/\r[\n\u0085]/g, '\n').replace(/[\r\u0085\u2028]/g, '\n');
}
/**
* @typedef Locator
* @property {number} [columnNumber]
* @property {number} [lineNumber]
*/
/**
* @typedef DOMParserOptions
* @property {typeof conventions.assign} [assign=Object.assign || conventions.assign]
* The method to use instead of `Object.assign` (or if not available `conventions.assign`),
* which is used to copy values from the options before they are used for parsing.
* @property {typeof DOMHandler} [domHandler]
* For internal testing: The class for creating an instance for handling events from the SAX parser.
* Warning: By configuring a faulty implementation, the specified behavior can completely be broken.
* @property {Function} [errorHandler]
* @property {boolean} [locator=true]
* Configures if the nodes created during parsing
* will have a `lineNumber` and a `columnNumber` attribute
* describing their location in the XML string.
* Default is true.
* @property {(string) => string} [normalizeLineEndings]
* used to replace line endings before parsing, defaults to `normalizeLineEndings`
* @property {object} [xmlns]
* The XML namespaces that should be assumed when parsing.
* The default namespace can be provided by the key that is the empty string.
* When the `mimeType` for HTML, XHTML or SVG are passed to `parseFromString`,
* the default namespace that will be used,
* will be overridden according to the specification.
*
* @see normalizeLineEndings
*/
/**
* The DOMParser interface provides the ability to parse XML or HTML source code
* from a string into a DOM `Document`.
*
* _xmldom is different from the spec in that it allows an `options` parameter,
* to control the behavior._
*
* @param {DOMParserOptions} [options]
* @constructor
*
* @see https://developer.mozilla.org/en-US/docs/Web/API/DOMParser
* @see https://html.spec.whatwg.org/multipage/dynamic-markup-insertion.html#dom-parsing-and-serialization
*/
function DOMParser(options) {
options = options || { locator: true };
/**
* The method to use instead of `Object.assign` (or if not available `conventions.assign`),
* which is used to copy values from the options before they are used for parsing.
*
* @type {function (target: object, source: object | null | undefined): object}
* @readonly
* @private
* @see conventions.assign
*/
this.assign = options.assign || Object.assign || conventions.assign;
/**
* For internal testing: The class for creating an instance for handling events from the SAX parser.
* __**Warning: By configuring a faulty implementation, the specified behavior can completely be broken.**__
*
* @type {typeof DOMHandler}
* @readonly
* @private
*/
this.domHandler = options.domHandler || DOMHandler;
/**
* A function that can be invoked as the errorHandler instead of the default ones.
* @type {Function | undefined}
* @readonly
*/
this.errorHandler = options.errorHandler;
/**
* used to replace line endings before parsing, defaults to `normalizeLineEndings`
*
* @type {(string) => string}
* @readonly
*/
this.normalizeLineEndings = options.normalizeLineEndings || normalizeLineEndings;
/**
* Configures if the nodes created during parsing
* will have a `lineNumber` and a `columnNumber` attribute
* describing their location in the XML string.
* Default is true.
* @type {boolean}
* @readonly
*/
this.locator = !!options.locator;
/**
* The default namespace can be provided by the key that is the empty string.
* When the `mimeType` for HTML, XHTML or SVG are passed to `parseFromString`,
* the default namespace that will be used,
* will be overridden according to the specification.
* @type {Readonly<object>}
* @readonly
*/
this.xmlns = options.xmlns || {};
}
/**
* Parses `source` using the options in the way configured by the `DOMParserOptions` of `this` `DOMParser`.
* If `mimeType` is `text/html` an HTML `Document` is created, otherwise an XML `Document` is created.
*
* __It behaves very different from the description in the living standard__:
* - Only allows the first argument to be a string (calls `error` handler otherwise.)
* - The second parameter is optional (defaults to `application/xml`) and can be any string,
* no `TypeError` will be thrown for values not listed in the spec.
* - Uses the `options` passed to the `DOMParser` constructor to modify the behavior/implementation.
* - Instead of creating a Document containing the error message,
* it triggers `errorHandler`(s) when unexpected input is found, which means it can return `undefined`.
* All error handlers can throw an `Error`, by default only the `fatalError` handler throws (a `ParserError`).
* - All errors thrown during the parsing that are not a `ParserError` are caught and reported using the `error` handler.
* - If no `ParserError` is thrown, this method returns the `DOMHandler.doc`,
* which most likely is the `Document` that has been created during parsing, or `undefined`.
* __**Warning: By configuring a faulty DOMHandler implementation,
* the specified behavior can completely be broken.**__
*
* @param {string} source Only string input is possible!
* @param {string} [mimeType='application/xml']
* the mimeType or contentType of the document to be created
* determines the `type` of document created (XML or HTML)
* @returns {Document | undefined}
* @throws ParseError for specific errors depending on the configured `errorHandler`s and/or `domBuilder`
*
* @see https://developer.mozilla.org/en-US/docs/Web/API/DOMParser/parseFromString
* @see https://html.spec.whatwg.org/#dom-domparser-parsefromstring-dev
*/
DOMParser.prototype.parseFromString = function (source, mimeType) {
var defaultNSMap = this.assign({}, this.xmlns);
var entityMap = entities.XML_ENTITIES;
var defaultNamespace = defaultNSMap[''] || null;
if (MIME_TYPE.hasDefaultHTMLNamespace(mimeType)) {
entityMap = entities.HTML_ENTITIES;
defaultNamespace = NAMESPACE.HTML;
} else if (mimeType === MIME_TYPE.XML_SVG_IMAGE) {
defaultNamespace = NAMESPACE.SVG;
}
defaultNSMap[''] = defaultNamespace;
defaultNSMap.xml = defaultNSMap.xml || NAMESPACE.XML;
var domBuilder = new this.domHandler({
mimeType: mimeType,
defaultNamespace: defaultNamespace,
});
var locator = this.locator ? {} : undefined;
if (this.locator) {
domBuilder.setDocumentLocator(locator);
}
var sax = new XMLReader();
sax.errorHandler = buildErrorHandler(this.errorHandler, domBuilder, locator);
sax.domBuilder = domBuilder;
if (source && typeof source === 'string') {
sax.parse(this.normalizeLineEndings(source), defaultNSMap, entityMap);
} else {
sax.errorHandler.error('invalid doc source');
}
return domBuilder.doc;
};
function buildErrorHandler(errorImpl, domBuilder, locator) {
if (!errorImpl) {
if (domBuilder instanceof DOMHandler) {
return domBuilder;
}
errorImpl = domBuilder;
}
var errorHandler = {};
var isCallback = errorImpl instanceof Function;
locator = locator || {};
function build(key) {
var fn = errorImpl[key];
if (!fn && isCallback) {
fn =
errorImpl.length == 2
? function (msg) {
errorImpl(key, msg);
}
: errorImpl;
}
errorHandler[key] =
(fn &&
function (msg) {
fn('[xmldom ' + key + ']\t' + msg + _locator(locator));
}) ||
function () {};
}
build('warning');
build('error');
build('fatalError');
return errorHandler;
}
/**
* @typedef DOMHandlerOptions
* @property {string} [mimeType=MIME_TYPE.XML_APPLICATION]
* @property {string|null} [defaultNamespace=null]
*/
/**
* The class that is used to handle events from the SAX parser to create the related DOM elements.
*
* Some methods are only implemented as an empty function,
* since they are (at least currently) not relevant for xmldom.
*
* @constructor
* @param {DOMHandlerOptions} [options]
* @see http://www.saxproject.org/apidoc/org/xml/sax/ext/DefaultHandler2.html
*/
function DOMHandler(options) {
var opt = options || {};
/**
* The mime type is used to determine if the DOM handler will create an XML or HTML document.
* Only if it is set to `text/html` it will create an HTML document.
* It defaults to MIME_TYPE.XML_APPLICATION.
*
* @type {string}
* @readonly
* @see MIME_TYPE
*/
this.mimeType = opt.mimeType || MIME_TYPE.XML_APPLICATION;
/**
* The namespace to use to create an XML document.
* For the following reasons this is required:
* - The SAX API for `startDocument` doesn't offer any way to pass a namespace,
* since at that point there is no way for the parser to know what the default namespace from the document will be.
* - When creating using `DOMImplementation.createDocument` it is required to pass a namespace,
* to determine the correct `Document.contentType`, which should match `this.mimeType`.
* - When parsing an XML document with the `application/xhtml+xml` mimeType,
* the HTML namespace needs to be the default namespace.
*
* @type {string|null}
* @readonly
* @private
*/
this.defaultNamespace = opt.defaultNamespace || null;
/**
* @private
* @type {boolean}
*/
this.cdata = false;
/**
* The last `Element` that was created by `startElement`.
* `endElement` sets it to the `currentElement.parentNode`.
*
* Note: The sax parser currently sets it to white space text nodes between tags.
*
* @type {Element | Node | undefined}
* @private
*/
this.currentElement = undefined;
/**
* The Document that is created as part of `startDocument`,
* and returned by `DOMParser.parseFromString`.
*
* @type {Document | undefined}
* @readonly
*/
this.doc = undefined;
/**
* The locator is stored as part of setDocumentLocator.
* It is controlled and mutated by the SAX parser
* to store the current parsing position.
* It is used by DOMHandler to set `columnNumber` and `lineNumber`
* on the DOM nodes.
*
* @type {Readonly<Locator> | undefined}
* @readonly (the sax parser currently sometimes set's it)
* @private
*/
this.locator = undefined;
}
function position(locator, node) {
node.lineNumber = locator.lineNumber;
node.columnNumber = locator.columnNumber;
}
DOMHandler.prototype = {
/**
* Either creates an XML or an HTML document and stores it under `this.doc`.
* If it is an XML document, `this.defaultNamespace` is used to create it,
* and it will not contain any `childNodes`.
* If it is an HTML document, it will be created without any `childNodes`.
*
* @see http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
*/
startDocument: function () {
var impl = new DOMImplementation();
this.doc = MIME_TYPE.isHTML(this.mimeType) ? impl.createHTMLDocument(false) : impl.createDocument(this.defaultNamespace, '');
},
startElement: function (namespaceURI, localName, qName, attrs) {
var doc = this.doc;
var el = doc.createElementNS(namespaceURI, qName || localName);
var len = attrs.length;
appendElement(this, el);
this.currentElement = el;
this.locator && position(this.locator, el);
for (var i = 0; i < len; i++) {
var namespaceURI = attrs.getURI(i);
var value = attrs.getValue(i);
var qName = attrs.getQName(i);
var attr = doc.createAttributeNS(namespaceURI, qName);
this.locator && position(attrs.getLocator(i), attr);
attr.value = attr.nodeValue = value;
el.setAttributeNode(attr);
}
},
endElement: function (namespaceURI, localName, qName) {
this.currentElement = this.currentElement.parentNode;
},
startPrefixMapping: function (prefix, uri) {},
endPrefixMapping: function (prefix) {},
processingInstruction: function (target, data) {
var ins = this.doc.createProcessingInstruction(target, data);
this.locator && position(this.locator, ins);
appendElement(this, ins);
},
ignorableWhitespace: function (ch, start, length) {},
characters: function (chars, start, length) {
chars = _toString.apply(this, arguments);
//console.log(chars)
if (chars) {
if (this.cdata) {
var charNode = this.doc.createCDATASection(chars);
} else {
var charNode = this.doc.createTextNode(chars);
}
if (this.currentElement) {
this.currentElement.appendChild(charNode);
} else if (/^\s*$/.test(chars)) {
this.doc.appendChild(charNode);
//process xml
}
this.locator && position(this.locator, charNode);
}
},
skippedEntity: function (name) {},
endDocument: function () {
this.doc.normalize();
},
/**
* Stores the locator to be able to set the `columnNumber` and `lineNumber`
* on the created DOM nodes.
*
* @param {Locator} locator
*/
setDocumentLocator: function (locator) {
if (locator) {
locator.lineNumber = 0;
}
this.locator = locator;
},
//LexicalHandler
comment: function (chars, start, length) {
chars = _toString.apply(this, arguments);
var comm = this.doc.createComment(chars);
this.locator && position(this.locator, comm);
appendElement(this, comm);
},
startCDATA: function () {
//used in characters() methods
this.cdata = true;
},
endCDATA: function () {
this.cdata = false;
},
startDTD: function (name, publicId, systemId) {
var impl = this.doc.implementation;
if (impl && impl.createDocumentType) {
var dt = impl.createDocumentType(name, publicId, systemId);
this.locator && position(this.locator, dt);
appendElement(this, dt);
this.doc.doctype = dt;
}
},
/**
* @see org.xml.sax.ErrorHandler
* @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
*/
warning: function (error) {
console.warn('[xmldom warning]\t' + error, _locator(this.locator));
},
error: function (error) {
console.error('[xmldom error]\t' + error, _locator(this.locator));
},
fatalError: function (error) {
throw new ParseError(error, this.locator);
},
};
function _locator(l) {
if (l) {
return '\n@#[line:' + l.lineNumber + ',col:' + l.columnNumber + ']';
}
}
function _toString(chars, start, length) {
if (typeof chars == 'string') {
return chars.substr(start, length);
} else {
//java sax connect width xmldom on rhino(what about: "? && !(chars instanceof String)")
if (chars.length >= start + length || start) {
return new java.lang.String(chars, start, length) + '';
}
return chars;
}
}
/*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
* used method of org.xml.sax.ext.LexicalHandler:
* #comment(chars, start, length)
* #startCDATA()
* #endCDATA()
* #startDTD(name, publicId, systemId)
*
*
* IGNORED method of org.xml.sax.ext.LexicalHandler:
* #endDTD()
* #startEntity(name)
* #endEntity(name)
*
*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
* IGNORED method of org.xml.sax.ext.DeclHandler
* #attributeDecl(eName, aName, type, mode, value)
* #elementDecl(name, model)
* #externalEntityDecl(name, publicId, systemId)
* #internalEntityDecl(name, value)
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
* IGNORED method of org.xml.sax.EntityResolver2
* #resolveEntity(String name,String publicId,String baseURI,String systemId)
* #resolveEntity(publicId, systemId)
* #getExternalSubset(name, baseURI)
* @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
* IGNORED method of org.xml.sax.DTDHandler
* #notationDecl(name, publicId, systemId) {};
* #unparsedEntityDecl(name, publicId, systemId, notationName) {};
*/
'endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl'.replace(
/\w+/g,
function (key) {
DOMHandler.prototype[key] = function () {
return null;
};
}
);
/* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
function appendElement(hander, node) {
if (!hander.currentElement) {
hander.doc.appendChild(node);
} else {
hander.currentElement.appendChild(node);
}
} //appendChild and setAttributeNS are preformance key
exports.__DOMHandler = DOMHandler;
exports.normalizeLineEndings = normalizeLineEndings;
exports.DOMParser = DOMParser;