/*
 * saxen
 *
 * A tiny, super fast, namespace-aware, SAX-style XML parser written in plain JavaScript.
 *
 * Listing metadata: JavaScript, 1,043 lines (827 loc), 23.3 kB.
 */
// cached built-ins used by the entity decoder
var fromCharCode = String.fromCharCode;
var hasOwnProperty = Object.prototype.hasOwnProperty;

// matches decimal (group 1), hexadecimal (group 2) and
// named (group 3) references; global and case-insensitive
var ENTITY_PATTERN = /&#(\d+);|&#x([0-9a-f]+);|&(\w+);/ig;

// supported special chars, keyed by lower-case name
var ENTITY_MAPPING = {
  'amp': '&',
  'apos': '\'',
  'gt': '>',
  'lt': '<',
  'quot': '"'
};

// register an UPPERCASE alias for every supported name;
// Object.keys() snapshots the names, so the aliases added
// inside the loop are not visited again
for (const entityName of Object.keys(ENTITY_MAPPING)) {
  ENTITY_MAPPING[entityName.toUpperCase()] = ENTITY_MAPPING[entityName];
}
/**
 * Replacement callback for ENTITY_PATTERN matches.
 *
 * Exactly one of `d`, `x` and `z` is set for a given match.
 *
 * @param {string} _ the complete reference as written, e.g. `&#38;`
 * @param {string} [d] digits of a decimal reference
 * @param {string} [x] digits of a hexadecimal reference
 * @param {string} [z] name of a named reference
 *
 * @return {string} the decoded character, or the reference as written
 *                  if it cannot be decoded
 */
function replaceEntities(_, d, x, z) {

  // reserved names, e.g. &amp;
  if (z) {
    if (hasOwnProperty.call(ENTITY_MAPPING, z)) {
      return ENTITY_MAPPING[z];
    }

    // unknown name: fall back to original value
    return '&' + z + ';';
  }

  // decimal or hex encoded char
  var codePoint = d ? parseInt(d, 10) : parseInt(x, 16);

  // String.fromCodePoint() throws a RangeError for anything that is
  // not a valid code point (NaN, values above U+10FFFF); keep the
  // reference as written in that case
  if (!(codePoint <= 0x10FFFF)) {
    return _;
  }

  // fromCodePoint (unlike fromCharCode) does not truncate to 16 bit,
  // so references beyond U+FFFF decode to the intended character
  return String.fromCodePoint(codePoint);
}
/**
 * A basic entity decoder.
 *
 * Decodes a minimal set of reserved names (e.g. &amp;) as well as
 * hex (e.g. &#xaaf;) and decimal (e.g. &#1231;) encoded characters
 * by running ENTITY_PATTERN over the string.
 *
 * Strings of three or fewer characters and strings without
 * an ampersand are returned as-is.
 *
 * @param {string} s
 *
 * @return {string} decoded string
 */
function decodeEntities(s) {
  var mayContainEntity = s.length > 3 && s.indexOf('&') !== -1;

  return mayContainEntity
    ? s.replace(ENTITY_PATTERN, replaceEntities)
    : s;
}
// warning message reported when text other than whitespace
// is found while no element is open
var NON_WHITESPACE_OUTSIDE_ROOT_NODE = 'non-whitespace outside of root node';
/**
 * Create an error carrying the given message.
 *
 * @param {string} msg
 *
 * @return {Error}
 */
function error(msg) {
  var instance = new Error(msg);

  return instance;
}
/**
 * Build the warning message for a prefix without namespace binding.
 *
 * @param {string} prefix
 *
 * @return {string}
 */
function missingNamespaceForPrefix(prefix) {
  var message = 'missing namespace for prefix <';

  message += prefix;
  message += '>';

  return message;
}
/**
 * Create the property descriptor of an enumerable accessor
 * that is backed by the given function.
 *
 * @param {Function} getFn
 *
 * @return {Object} property descriptor
 */
function getter(getFn) {
  var descriptor = {};

  descriptor['get'] = getFn;
  descriptor['enumerable'] = true;

  return descriptor;
}
/**
 * Create a shallow copy of a namespace matrix.
 *
 * @param {Object} nsMatrix
 *
 * @return {Object} new object with the same entries
 */
function cloneNsMatrix(nsMatrix) {
  var copy = {};

  // for-in on purpose: every enumerable key is copied,
  // including those found on the prototype chain
  for (var entry in nsMatrix) {
    copy[entry] = nsMatrix[entry];
  }

  return copy;
}
/**
 * Compute the namespace matrix key under which
 * the uri bound to a prefix is stored.
 *
 * @param {string} prefix
 *
 * @return {string}
 */
function uriPrefix(prefix) {
  var suffix = '$uri';

  return prefix + suffix;
}
/**
 * Build the initial namespace matrix from a { uri: prefix } map.
 *
 * For every configured prefix the matrix holds two entries:
 * the prefix mapped to itself and the prefix's uri key mapped
 * to the namespace uri.
 *
 * @param {Object<string, string>} nsUriToPrefix
 *
 * @return {Object<string, string>}
 */
function buildNsMatrix(nsUriToPrefix) {
  var matrix = {};

  for (var namespaceUri in nsUriToPrefix) {
    var alias = nsUriToPrefix[namespaceUri];

    matrix[alias] = alias;
    matrix[uriPrefix(alias)] = namespaceUri;
  }

  return matrix;
}
/**
 * Context getter used while no parse run is active.
 *
 * @return {Object} a fresh { line: 0, column: 0 } object
 */
function noopGetContext() {
  var context = { line: 0, column: 0 };

  return context;
}
/**
* Default error listener: re-throws the error it is given.
*
* @param {Error} err
*
* @throws {Error} always, the passed error
*/
function throwFunc(err) {
throw err;
}
/**
* Creates a new parser with the given options.
*
* May be invoked without `new` when `this` is unset; a new
* instance is created and returned in that case.
*
* Supported options:
*
* - `proxy`: if truthy, `openTag` and `closeTag` listeners receive an
* element proxy (exposing `name`, `originalName`, `attrs` and `ns`)
* instead of the plain element name.
*
* @constructor
*
* @param {!Object<string, ?>=} options
*/
function Parser(options) {
// support invocation without `new`
if (!this) {
return new Parser(options);
}
var proxy = options && options['proxy'];
// registered listeners, assigned via #on;
// errors are thrown unless an error listener is registered
var onText,
onOpenTag,
onCloseTag,
onCDATA,
onError = throwFunc,
onWarning,
onComment,
onQuestion,
onAttention;
// function handed to listeners to retrieve the parse context;
// replaced with a position aware version while parsing
var getContext = noopGetContext;
/**
* Do we need to parse the current element's attributes for namespaces?
*
* @type {boolean}
*/
var maybeNS = false;
/**
* Do we process namespaces at all?
*
* @type {boolean}
*/
var isNamespace = false;
/**
* The caught error returned on parse end
*
* @type {Error}
*/
var returnError = null;
/**
* Should we stop parsing?
*
* @type {boolean}
*/
var parseStop = false;
/**
* A map of { uri: prefix } used by the parser.
*
* This map will ensure we can normalize prefixes during processing;
* for each uri, only one prefix will be exposed to the handlers.
*
* @type {!Object<string, string>}
*/
var nsUriToPrefix;
/**
 * Handle parse error.
 *
 * Remembers the error as the result of the current parse run
 * and passes it, along with the context getter, to the error listener.
 * Plain messages are wrapped into an Error first.
 *
 * @param {string|Error} err
 */
function handleError(err) {
  var parseError = err instanceof Error ? err : error(err);

  returnError = parseError;

  onError(parseError, getContext);
}
/**
 * Handle parse warning.
 *
 * Does nothing unless a warning listener is registered. Otherwise
 * the warning is passed to the listener along with the context
 * getter; plain messages are wrapped into an Error first.
 *
 * @param {string|Error} err
 */
function handleWarning(err) {
  if (onWarning) {
    onWarning(err instanceof Error ? err : error(err), getContext);
  }
}
/**
 * Register parse listener.
 *
 * Supported events: `openTag`, `text`, `closeTag`, `error`, `warn`,
 * `cdata`, `attention`, `question` and `comment`. Registering a
 * listener replaces the one previously registered for that event.
 *
 * @param {string} name
 * @param {Function} cb
 *
 * @return {Parser}
 *
 * @throws {Error} if cb is not a function or the event is not supported
 */
this['on'] = function(name, cb) {
  if (typeof cb !== 'function') {
    throw error('required args <name, cb>');
  }

  if (name === 'openTag') {
    onOpenTag = cb;
  } else if (name === 'text') {
    onText = cb;
  } else if (name === 'closeTag') {
    onCloseTag = cb;
  } else if (name === 'error') {
    onError = cb;
  } else if (name === 'warn') {
    onWarning = cb;
  } else if (name === 'cdata') {
    onCDATA = cb;
  } else if (name === 'attention') {

    // <!XXXXX zzzz="eeee">
    onAttention = cb;
  } else if (name === 'question') {

    // <? .... ?>
    onQuestion = cb;
  } else if (name === 'comment') {
    onComment = cb;
  } else {
    throw error('unsupported event: ' + name);
  }

  return this;
};
/**
 * Set the namespace to prefix mapping and
 * turn on namespace processing.
 *
 * The given map is copied; calling without arguments
 * enables namespace processing with an empty mapping.
 *
 * @example
 *
 * parser.ns({
 *   'http://foo': 'foo',
 *   'http://bar': 'bar'
 * });
 *
 * @param {!Object<string, string>} nsMap
 *
 * @return {Parser}
 *
 * @throws {Error} if nsMap is given but not an object
 */
this['ns'] = function(nsMap) {
  var mapping = typeof nsMap === 'undefined' ? {} : nsMap;

  if (typeof mapping !== 'object') {
    throw error('required args <nsMap={}>');
  }

  var uriToPrefix = {};

  for (var uri in mapping) {
    uriToPrefix[uri] = mapping[uri];
  }

  nsUriToPrefix = uriToPrefix;
  isNamespace = true;

  return this;
};
/**
 * Parse xml string.
 *
 * Errors are passed to the registered error listener; without one
 * they are thrown. In both cases the per-run parser state is reset,
 * so the parser can be used for another run afterwards.
 *
 * @param {string} xml
 *
 * @return {Error} returnError, if not thrown
 *
 * @throws {Error} if xml is not a string
 */
this['parse'] = function(xml) {
  if (typeof xml !== 'string') {
    throw error('required args <xml=string>');
  }

  returnError = null;

  try {
    parse(xml);
  } finally {

    // reset even if a listener (or the default, throwing error
    // handler) aborts the run: a left-over stop flag would end the
    // next run after its first event, and the position aware context
    // getter would keep the parsed document reachable
    getContext = noopGetContext;
    parseStop = false;
  }

  return returnError;
};
/**
* Stop parsing.
*
* Sets a flag that the parse loop checks after invoking a listener;
* meant to be called from within a listener during a parse run.
*/
this['stop'] = function() {
parseStop = true;
};
/**
* Parse string, invoking configured listeners on element.
*
* Errors are reported via handleError, which ends the run;
* recoverable problems are reported via handleWarning.
*
* @param {string} xml
*/
function parse(xml) {
// namespace matrices of the currently open elements (namespace mode only)
var nsMatrixStack = isNamespace ? [] : null,
// prefix and uri bindings in effect for the current element
nsMatrix = isNamespace ? buildNsMatrix(nsUriToPrefix) : null,
// matrix in effect before the current tag was processed
_nsMatrix,
// names of the currently open elements, as written in the document
nodeStack = [],
// counter used to generate `ns<N>` prefixes
anonymousNsCount = 0,
// is the current tag an opening and / or a closing one?
tagStart = false,
tagEnd = false,
// i: start of the current token, j: position scanning continues from
i = 0, j = 0,
// scratch variables
x, y, q, w, v,
xmlns,
// element name exposed to listeners
elementName,
// element name as written in the document
_elementName,
elementProxy
;
// raw tag content and the offset at which its attributes start
var attrsString = '',
attrsStart = 0,
// NOTE(review): no visible code path assigns `false` to cachedAttrs
cachedAttrs // false = parsed with errors, null = needs parsing
;
/**
 * Parse attributes on demand and return the parsed attributes.
 *
 * Reads the raw tag content from `attrsString`, starting at
 * `attrsStart`; the result is cached in `cachedAttrs` so that
 * repeated calls for the same tag do not parse again.
 *
 * Malformed and re-declared attributes are reported as warnings
 * and left out of the result. Attribute values are returned as
 * written, i.e. entities are not decoded.
 *
 * In namespace mode, `xmlns` / `xmlns:*` declarations update the
 * current namespace matrix and prefixed attribute names are
 * normalized to the configured prefixes.
 *
 * @return {boolean|Object} the cached value, if any; otherwise
 *                          a hash of the extracted attrs
 */
function getAttrs() {
  if (cachedAttrs !== null) {
    return cachedAttrs;
  }

  var nsUri,
      nsUriPrefix,
      nsName,
      defaultAlias = isNamespace && nsMatrix['xmlns'],
      attrList = isNamespace && maybeNS ? [] : null,
      i = attrsStart,
      s = attrsString,
      l = s.length,
      hasNewMatrix,
      newalias,
      value,
      alias,
      name,
      attrs = {},

      // prototype-less on purpose: with a plain object the `in` check
      // below would treat attributes named like Object.prototype
      // members (constructor, toString, ...) as already defined
      seenAttrs = Object.create(null),
      skipAttr,
      w,
      j;

  parseAttr:
  for (; i < l; i++) {
    skipAttr = false;
    w = s.charCodeAt(i);

    if (w === 32 || (w < 14 && w > 8)) { // WHITESPACE={ \f\n\r\t\v}
      continue;
    }

    // wait for non whitespace character
    if (w < 65 || w > 122 || (w > 90 && w < 97)) {
      if (w !== 95 && w !== 58) { // char 95"_" 58":"
        handleWarning('illegal first char attribute name');
        skipAttr = true;
      }
    }

    // parse attribute name
    for (j = i + 1; j < l; j++) {
      w = s.charCodeAt(j);

      if (
        w > 96 && w < 123 ||
        w > 64 && w < 91 ||
        w > 47 && w < 59 ||
        w === 46 || // '.'
        w === 45 || // '-'
        w === 95 // '_'
      ) {
        continue;
      }

      // unexpected whitespace
      if (w === 32 || (w < 14 && w > 8)) { // WHITESPACE
        handleWarning('missing attribute value');
        i = j;

        continue parseAttr;
      }

      // expected "="
      if (w === 61) { // "=" == 61
        break;
      }

      handleWarning('illegal attribute name char');
      skipAttr = true;
    }

    name = s.substring(i, j);

    if (name === 'xmlns:xmlns') {
      handleWarning('illegal declaration of xmlns');
      skipAttr = true;
    }

    // find the end of the quoted value;
    // i is moved to the first char of the value
    w = s.charCodeAt(j + 1);

    if (w === 34) { // '"'
      j = s.indexOf('"', i = j + 2);

      if (j === -1) {
        j = s.indexOf('\'', i);

        if (j !== -1) {
          handleWarning('attribute value quote missmatch');
          skipAttr = true;
        }
      }

    } else if (w === 39) { // "'"
      j = s.indexOf('\'', i = j + 2);

      if (j === -1) {
        j = s.indexOf('"', i);

        if (j !== -1) {
          handleWarning('attribute value quote missmatch');
          skipAttr = true;
        }
      }

    } else {
      handleWarning('missing attribute value quotes');
      skipAttr = true;

      // skip to next space
      for (j = j + 1; j < l; j++) {
        w = s.charCodeAt(j + 1);

        if (w === 32 || (w < 14 && w > 8)) { // WHITESPACE
          break;
        }
      }
    }

    if (j === -1) {
      handleWarning('missing closing quotes');

      j = l;
      skipAttr = true;
    }

    if (!skipAttr) {
      value = s.substring(i, j);
    }

    i = j;

    // ensure SPACE follows attribute
    // skip illegal content otherwise
    // example a="b"c
    for (; j + 1 < l; j++) {
      w = s.charCodeAt(j + 1);

      if (w === 32 || (w < 14 && w > 8)) { // WHITESPACE
        break;
      }

      // FIRST ILLEGAL CHAR
      if (i === j) {
        handleWarning('illegal character after attribute end');
        skipAttr = true;
      }
    }

    // advance cursor to next attribute
    i = j + 1;

    if (skipAttr) {
      continue parseAttr;
    }

    // check attribute re-declaration
    if (name in seenAttrs) {
      handleWarning('attribute <' + name + '> already defined');
      continue;
    }

    seenAttrs[name] = true;

    if (!isNamespace) {
      attrs[name] = value;
      continue;
    }

    // try to extract namespace information
    if (maybeNS) {
      newalias = (
        name === 'xmlns'
          ? 'xmlns'
          : (name.charCodeAt(0) === 120 && name.substr(0, 6) === 'xmlns:')
            ? name.substr(6)
            : null
      );

      // handle xmlns(:alias) assignment
      if (newalias !== null) {
        nsUri = decodeEntities(value);
        nsUriPrefix = uriPrefix(newalias);

        alias = nsUriToPrefix[nsUri];

        if (!alias) {

          // no prefix defined or prefix collision
          if (
            (newalias === 'xmlns') ||
            (nsUriPrefix in nsMatrix && nsMatrix[nsUriPrefix] !== nsUri)
          ) {

            // allocate free ns prefix
            do {
              alias = 'ns' + (anonymousNsCount++);
            } while (typeof nsMatrix[alias] !== 'undefined');
          } else {
            alias = newalias;
          }

          nsUriToPrefix[nsUri] = alias;
        }

        if (nsMatrix[newalias] !== alias) {

          // copy on first change, the previous matrix
          // is still in use by the enclosing elements
          if (!hasNewMatrix) {
            nsMatrix = cloneNsMatrix(nsMatrix);
            hasNewMatrix = true;
          }

          nsMatrix[newalias] = alias;

          if (newalias === 'xmlns') {
            nsMatrix[uriPrefix(alias)] = nsUri;
            defaultAlias = alias;
          }

          nsMatrix[nsUriPrefix] = nsUri;
        }

        // expose xmlns(:asd)="..." in attributes
        attrs[name] = value;
        continue;
      }

      // collect attributes until all namespace
      // declarations are processed
      attrList.push(name, value);
      continue;

    } /** end if (maybeNs) */

    // handle attributes on element without
    // namespace declarations
    w = name.indexOf(':');

    if (w === -1) {
      attrs[name] = value;
      continue;
    }

    // normalize ns attribute name
    if (!(nsName = nsMatrix[name.substring(0, w)])) {
      handleWarning(missingNamespaceForPrefix(name.substring(0, w)));
      continue;
    }

    name = defaultAlias === nsName
      ? name.substr(w + 1)
      : nsName + name.substr(w);

    // end: normalize ns attribute name

    attrs[name] = value;
  }

  // handle deferred, possibly namespaced attributes
  if (maybeNS) {

    // normalize captured attributes;
    // attrList holds alternating names and values
    for (i = 0, l = attrList.length; i < l; i++) {
      name = attrList[i++];
      value = attrList[i];

      w = name.indexOf(':');

      if (w !== -1) {

        // normalize ns attribute name
        if (!(nsName = nsMatrix[name.substring(0, w)])) {
          handleWarning(missingNamespaceForPrefix(name.substring(0, w)));
          continue;
        }

        name = defaultAlias === nsName
          ? name.substr(w + 1)
          : nsName + name.substr(w);

        // end: normalize ns attribute name
      }

      attrs[name] = value;
    }

    // end: normalize captured attributes
  }

  return cachedAttrs = attrs;
}
/**
 * Extract the parse context { data, line, column }
 * from the current parser position.
 *
 * Line breaks (\r\n, \r or \n) up to the current position
 * are counted to determine the zero-based line and the column
 * relative to the start of that line.
 *
 * @return {Object} parse context
 */
function getParseContext() {
  var lineBreaks = /(\r\n|\r|\n)/g;

  var lineNo = 0;
  var columnNo = 0;
  var lineStart = 0;
  var lineEnd = j;
  var lineBreak;
  var snippet;

  while (i >= lineStart && (lineBreak = lineBreaks.exec(xml))) {

    // end of line = (break idx + break chars)
    lineEnd = lineBreak.index + lineBreak[0].length;

    if (lineEnd > i) {
      break;
    }

    // advance to next line
    lineNo += 1;
    lineStart = lineEnd;
  }

  if (i === -1) {

    // EOF errors
    columnNo = lineEnd;
    snippet = xml.substring(j);
  } else if (j === 0) {

    // start errors
    snippet = xml.substring(j, i);
  } else {

    // other errors
    columnNo = i - lineStart;
    snippet = j === -1 ? xml.substring(i) : xml.substring(i, j + 1);
  }

  return {
    'data': snippet,
    'line': lineNo,
    'column': columnNo
  };
}
// expose the position aware context getter to listeners
getContext = getParseContext;
// in proxy mode listeners receive a single, reused object
// whose accessors read the state of the current element
if (proxy) {
elementProxy = Object.create({}, {
'name': getter(function() {
return elementName;
}),
'originalName': getter(function() {
return _elementName;
}),
'attrs': getter(getAttrs),
'ns': getter(function() {
return nsMatrix;
})
});
}
// actual parse logic
//
// each iteration handles the text between j and the next "<" (at i)
// plus the markup that starts at i; j is then moved behind that markup
while (j !== -1) {
if (xml.charCodeAt(j) === 60) { // "<"
i = j;
} else {
i = xml.indexOf('<', j);
}
// parse end
if (i === -1) {
if (nodeStack.length) {
return handleError('unexpected end of file');
}
if (j === 0) {
return handleError('missing start tag');
}
if (j < xml.length) {
if (xml.substring(j).trim()) {
handleWarning(NON_WHITESPACE_OUTSIDE_ROOT_NODE);
}
}
return;
}
// parse text
if (j !== i) {
if (nodeStack.length) {
if (onText) {
onText(xml.substring(j, i), decodeEntities, getContext);
if (parseStop) {
return;
}
}
} else {
if (xml.substring(j, i).trim()) {
handleWarning(NON_WHITESPACE_OUTSIDE_ROOT_NODE);
if (parseStop) {
return;
}
}
}
}
// char following "<" decides on the kind of markup
w = xml.charCodeAt(i + 1);
// parse comments + CDATA
if (w === 33) { // "!"
q = xml.charCodeAt(i + 2);
// CDATA section
if (q === 91 && xml.substr(i + 3, 6) === 'CDATA[') { // 91 == "["
j = xml.indexOf(']]>', i);
if (j === -1) {
return handleError('unclosed cdata');
}
if (onCDATA) {
onCDATA(xml.substring(i + 9, j), getContext);
if (parseStop) {
return;
}
}
j += 3;
continue;
}
// comment
if (q === 45 && xml.charCodeAt(i + 3) === 45) { // 45 == "-"
j = xml.indexOf('-->', i);
if (j === -1) {
return handleError('unclosed comment');
}
if (onComment) {
onComment(xml.substring(i + 4, j), decodeEntities, getContext);
if (parseStop) {
return;
}
}
j += 3;
continue;
}
}
// parse question <? ... ?>
if (w === 63) { // "?"
j = xml.indexOf('?>', i);
if (j === -1) {
return handleError('unclosed question');
}
if (onQuestion) {
onQuestion(xml.substring(i, j + 2), getContext);
if (parseStop) {
return;
}
}
j += 2;
continue;
}
// find matching closing tag for attention or standard tags
// for that we must skip through attribute values
// (enclosed in single or double quotes)
for (x = i + 1; ; x++) {
v = xml.charCodeAt(x);
// charCodeAt yields NaN once x is past the end of the input
if (isNaN(v)) {
j = -1;
return handleError('unclosed tag');
}
// [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
// skips the quoted string
// (double quotes) does not appear in a literal enclosed by (double quotes)
// (single quote) does not appear in a literal enclosed by (single quote)
if (v === 34) { // '"'
q = xml.indexOf('"', x + 1);
x = q !== -1 ? q : x;
} else if (v === 39) { // "'"
q = xml.indexOf("'", x + 1);
x = q !== -1 ? q : x;
} else if (v === 62) { // '>'
j = x;
break;
}
}
// parse attention <! ...>
// previously comment and CDATA have already been parsed
if (w === 33) { // "!"
if (onAttention) {
onAttention(xml.substring(i, j + 1), decodeEntities, getContext);
if (parseStop) {
return;
}
}
j += 1;
continue;
}
// don't process attributes;
// there are none
cachedAttrs = {};
if (w === 47) { // </...
tagStart = false;
tagEnd = true;
if (!nodeStack.length) {
return handleError('missing open tag');
}
// verify open <-> close tag match
x = elementName = nodeStack.pop();
q = i + 2 + x.length;
if (xml.substring(i + 2, q) !== x) {
return handleError('closing tag mismatch');
}
// verify chars in close tag
for (; q < j; q++) {
w = xml.charCodeAt(q);
if (w === 32 || (w > 8 && w < 14)) { // \f\n\r\t\v space
continue;
}
return handleError('close tag');
}
} else {
// x holds the raw tag content: name plus attributes, if any
if (xml.charCodeAt(j - 1) === 47) { // .../>
x = elementName = xml.substring(i + 1, j - 1);
tagStart = true;
tagEnd = true;
} else {
x = elementName = xml.substring(i + 1, j);
tagStart = true;
tagEnd = false;
}
if (!(w > 96 && w < 123 || w > 64 && w < 91 || w === 95 || w === 58)) { // char 95"_" 58":"
return handleError('illegal first char nodeName');
}
// scan the name; q ends up at the first whitespace, if there is one
for (q = 1, y = x.length; q < y; q++) {
w = x.charCodeAt(q);
if (w > 96 && w < 123 || w > 64 && w < 91 || w > 47 && w < 59 || w === 45 || w === 95 || w == 46) {
continue;
}
if (w === 32 || (w < 14 && w > 8)) { // \f\n\r\t\v space
elementName = x.substring(0, q);
// maybe there are attributes
cachedAttrs = null;
break;
}
return handleError('invalid nodeName');
}
if (!tagEnd) {
nodeStack.push(elementName);
}
}
if (isNamespace) {
_nsMatrix = nsMatrix;
if (tagStart) {
// remember old namespace
// unless we're self-closing
if (!tagEnd) {
nsMatrixStack.push(_nsMatrix);
}
if (cachedAttrs === null) {
// quick check, whether there may be namespace
// declarations on the node; if that is the case
// we need to eagerly parse the node attributes
if ((maybeNS = x.indexOf('xmlns', q) !== -1)) {
attrsStart = q;
attrsString = x;
getAttrs();
maybeNS = false;
}
}
}
_elementName = elementName;
w = elementName.indexOf(':');
if (w !== -1) {
xmlns = nsMatrix[elementName.substring(0, w)];
// prefix given; namespace must exist
if (!xmlns) {
return handleError('missing namespace on <' + _elementName + '>');
}
elementName = elementName.substr(w + 1);
} else {
xmlns = nsMatrix['xmlns'];
// if no default namespace is defined,
// we'll import the element as anonymous.
//
// it is up to users to correct that to the document defined
// targetNamespace, or whatever their understanding of the
// XML spec mandates.
}
// adjust namespace prefixes as configured
if (xmlns) {
elementName = xmlns + ':' + elementName;
}
}
if (tagStart) {
// attributes are parsed lazily, once a listener asks for them
attrsStart = q;
attrsString = x;
if (onOpenTag) {
if (proxy) {
onOpenTag(elementProxy, decodeEntities, tagEnd, getContext);
} else {
onOpenTag(elementName, getAttrs, decodeEntities, tagEnd, getContext);
}
if (parseStop) {
return;
}
}
}
if (tagEnd) {
if (onCloseTag) {
onCloseTag(proxy ? elementProxy : elementName, decodeEntities, tagStart, getContext);
if (parseStop) {
return;
}
}
// restore old namespace
if (isNamespace) {
if (!tagStart) {
nsMatrix = nsMatrixStack.pop();
} else {
nsMatrix = _nsMatrix;
}
}
}
// continue behind the closing ">"
j += 1;
}
} /** end parse */
}
export { Parser, decodeEntities as decode };
//# sourceMappingURL=index.js.map