xmldom-sre
Version:
A pure JavaScript W3C standard-based (XML DOM Level 2 Core) DOMParser and XMLSerializer module.
678 lines (657 loc) • 20.7 kB
JavaScript
'use strict';
var conventions = require('./conventions');
var isHTMLRawTextElement = conventions.isHTMLRawTextElement;
var isHTMLEscapableRawTextElement = conventions.isHTMLEscapableRawTextElement;
var NAMESPACE = conventions.NAMESPACE;
var MIME_TYPE = conventions.MIME_TYPE;
// Name productions from the XML 1.0 specification, used to validate tag and attribute names:
//[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
//[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
//[5] Name ::= NameStartChar (NameChar)*
// Character class for the first character of a name. Compared with production [4] it
// leaves out ":" (the colon is handled separately by tagNamePattern below) and the
// supplementary range [#x10000-#xEFFFF].
var nameStartChar =
/[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; // not covered: \u10000-\uEFFFF
// Character class for the remaining characters of a name: the body of nameStartChar
// (its source with the surrounding "[" and "]" sliced off) plus "-", ".", digits and
// the extra NameChar ranges.
var nameChar = new RegExp('[\\-\\.0-9' + nameStartChar.source.slice(1, -1) + '\\u00B7\\u0300-\\u036F\\u203F-\\u2040]');
// Anchored pattern for a whole name: one name, optionally followed by a single ":"
// and a second name (prefix:localName).
var tagNamePattern = new RegExp(
'^' + nameStartChar.source + nameChar.source + '*(?::' + nameStartChar.source + nameChar.source + '*)?$'
);
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/
//var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
// States of the start-tag tokenizer (the variable `s` in the state machine further down).
var S_TAG = 0; // reading the tag name
var S_ATTR = 1; // reading an attribute name
var S_ATTR_SPACE = 2; // attribute name finished, whitespace seen after it
var S_EQ = 3; // "=" seen; whitespace may follow before the value starts
var S_ATTR_NOQUOT_VALUE = 4; // reading an attribute value that is not quoted
var S_ATTR_END = 5; // a quoted attribute value just ended and no whitespace has followed yet
var S_TAG_SPACE = 6; // whitespace seen after the tag name or after an attribute value
var S_TAG_CLOSE = 7; // "/" of a self-closing element seen, e.g. <el />
/**
 * Creates an error that will not be caught by XMLReader aka the SAX parser.
 *
 * Instances inherit from `Error`, report `ParseError` as their `name` and as their
 * `constructor`, and carry the optional `locator` alongside the message.
 *
 * @param {string} message
 * @param {any?} locator Optional, can provide details about the location in the source
 * @constructor
 */
function ParseError(message, locator) {
	this.message = message;
	this.locator = locator;
	// V8 only: start the stack trace at the caller instead of inside this constructor
	if (Error.captureStackTrace) Error.captureStackTrace(this, ParseError);
}
ParseError.prototype = new Error();
// replacing the prototype would otherwise leave `constructor` pointing at Error
ParseError.prototype.constructor = ParseError;
// a literal keeps the name stable under minification and where Function.name is unsupported
ParseError.prototype.name = 'ParseError';
/**
 * SAX-style reader. The constructor sets nothing up: `domBuilder` and `errorHandler`
 * are read from the instance when `parse` is called.
 *
 * @constructor
 */
function XMLReader() {}
XMLReader.prototype = {
	/**
	 * Parses `source`, reporting everything to `this.domBuilder` between a
	 * `startDocument()` and an `endDocument()` call.
	 *
	 * @param {string} source
	 * @param {Object} defaultNSMap initial prefix -> namespace map; a copy is handed to the parser
	 * @param {Object} entityMap entity name -> replacement text
	 */
	parse: function (source, defaultNSMap, entityMap) {
		var builder = this.domBuilder;
		var nsMapCopy = {};
		builder.startDocument();
		// work on a private copy so the caller's map is never modified
		_copy(defaultNSMap, nsMapCopy);
		parse(source, nsMapCopy, entityMap, builder, this.errorHandler);
		builder.endDocument();
	},
};
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
var isHTML = MIME_TYPE.isHTML(domBuilder.mimeType);
/**
 * Converts a code point to a string.
 * String.fromCharCode takes UTF-16 code units, so a code point above 0xFFFF
 * is split into a high/low surrogate pair first.
 *
 * @param {number} code
 * @returns {string}
 */
function fixedFromCharCode(code) {
	// written as a negation so that NaN takes the single-unit path
	if (!(code > 0xffff)) {
		return String.fromCharCode(code);
	}
	var offset = code - 0x10000;
	var highSurrogate = 0xd800 + (offset >> 10);
	var lowSurrogate = 0xdc00 + (offset & 0x3ff);
	return String.fromCharCode(highSurrogate, lowSurrogate);
}
/**
 * Replacement callback for one entity or character reference (the text matched
 * by /&#?\w+;/, including the leading "&" and trailing ";").
 *
 * - a name that is an own key of `entityMap` is replaced by its mapped text
 * - "&#NNN;" / "&#xHHH;" are replaced by the referenced character
 * - anything else is reported through `errorHandler.error` and returned unchanged
 *
 * @param {string} a the complete reference
 * @returns {string} the replacement text
 */
function entityReplacer(a) {
	var k = a.slice(1, -1);
	if (Object.prototype.hasOwnProperty.call(entityMap, k)) {
		return entityMap[k];
	}
	if (k.charAt(0) === '#') {
		// [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
		// Parse strictly with an explicit radix: a lenient parseInt would turn
		// malformed references such as "&#foo;" into U+0000 or a wrong character.
		var hex = /^#x([0-9a-fA-F]+)$/.exec(k);
		var dec = hex ? null : /^#([0-9]+)$/.exec(k);
		var code = hex ? parseInt(hex[1], 16) : dec ? parseInt(dec[1], 10) : NaN;
		// the comparison is false for NaN, so malformed input falls through to the error
		if (code <= 0x10ffff) {
			return fixedFromCharCode(code);
		}
		errorHandler.error('invalid character reference:' + a);
		return a;
	}
	errorHandler.error('entity not found:' + a);
	return a;
}
/**
 * Emits the text between the current parse offset `start` and `end` as character
 * data (with entity references decoded) and moves `start` up to `end`.
 * Does nothing when there is no text in between.
 *
 * @param {number} end offset just past the last character of the text
 */
function appendText(end) {
	if (end > start) {
		var xt = source.substring(start, end).replace(/&#?\w+;/g, entityReplacer);
		if (locator) {
			position(start);
		}
		// Pass the length of the decoded text, not of the source span: an entity may
		// expand to more characters than its reference occupies, and a builder that
		// honours the length argument would otherwise cut the text short.
		domBuilder.characters(xt, 0, xt.length);
		start = end;
	}
}
/**
 * Updates `locator.lineNumber` / `locator.columnNumber` for source offset `p`.
 * Lines are consumed one at a time with `linePattern` until the end of the
 * consumed text lies beyond `p`; the column is 1-based within the last line read.
 *
 * @param {number} p offset into `source`
 * @param {*} [m] scratch slot for the current line match; callers do not pass it
 */
function position(p, m) {
	for (;;) {
		if (!(p >= lineEnd)) {
			break;
		}
		m = linePattern.exec(source);
		if (!m) {
			break;
		}
		lineStart = m.index;
		lineEnd = lineStart + m[0].length;
		locator.lineNumber++;
	}
	locator.columnNumber = p - lineStart + 1;
}
var lineStart = 0;
var lineEnd = 0;
var linePattern = /.*(?:\r\n?|\n)|.*$/g;
var locator = domBuilder.locator;
var parseStack = [{ currentNSMap: defaultNSMapCopy }];
var closeMap = {};
var start = 0;
while (true) {
try {
var tagStart = source.indexOf('<', start);
if (tagStart < 0) {
if (!source.substr(start).match(/^\s*$/)) {
var doc = domBuilder.doc;
var text = doc.createTextNode(source.substr(start));
doc.appendChild(text);
domBuilder.currentElement = text;
}
return;
}
if (tagStart > start) {
appendText(tagStart);
}
switch (source.charAt(tagStart + 1)) {
case '/':
var config = parseStack.pop();
var end = source.indexOf('>', tagStart + 3);
var tagNameRaw = source.substring(tagStart + 2, end > 0 ? end : undefined);
var tagNameMatch = new RegExp('(' + tagNamePattern.source.slice(0, -1) + ')').exec(tagNameRaw);
// for the root level the config does not contain the tagName
var tagName =
tagNameMatch && tagNameMatch[1] ? tagNameMatch[1] : config.tagName || domBuilder.doc.documentElement.tagName;
if (end < 0) {
errorHandler.error('end tag name: ' + tagName + ' is not complete');
end = tagStart + 1 + tagName.length;
} else if (tagNameRaw.match(/</) && !isHTML) {
errorHandler.error('end tag name: ' + tagName + ' maybe not complete');
}
var localNSMap = config.localNSMap;
var endMatch = config.tagName == tagName;
var endIgnoreCaseMach = endMatch || (config.tagName && config.tagName.toLowerCase() == tagName.toLowerCase());
if (endIgnoreCaseMach) {
domBuilder.endElement(config.uri, config.localName, tagName);
if (localNSMap) {
for (var prefix in localNSMap) {
if (Object.prototype.hasOwnProperty.call(localNSMap, prefix)) {
domBuilder.endPrefixMapping(prefix);
}
}
}
if (!endMatch) {
// No known test case
errorHandler.fatalError('end tag name: ' + tagName + ' is not match the current start tagName:' + config.tagName);
}
} else {
parseStack.push(config);
}
end++;
break;
// end elment
case '?': // ...
locator && position(tagStart);
end = parseInstruction(source, tagStart, domBuilder);
break;
case '!': // ':
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));
case S_ATTR_END:
case S_TAG_SPACE:
case S_TAG_CLOSE:
break; //normal
case S_ATTR_NOQUOT_VALUE: //Compatible state
case S_ATTR:
value = source.slice(start, p);
if (value.slice(-1) === '/') {
el.closed = true;
value = value.slice(0, -1);
}
case S_ATTR_SPACE:
if (s === S_ATTR_SPACE) {
value = attrName;
}
if (s == S_ATTR_NOQUOT_VALUE) {
errorHandler.warning('attribute "' + value + '" missed quot(")!');
addAttribute(attrName, value, start);
} else {
if (!isHTML) {
errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!');
}
addAttribute(value, value, start);
}
break;
case S_EQ:
throw new Error('attribute value missed!!');
}
// console.log(tagName,tagNamePattern,tagNamePattern.test(tagName))
return p;
/*xml space '\x20' | #x9 | #xD | #xA; */
case '\u0080':
c = ' ';
default:
if (c <= ' ') {
//space
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p)); //tagName
s = S_TAG_SPACE;
break;
case S_ATTR:
attrName = source.slice(start, p);
s = S_ATTR_SPACE;
break;
case S_ATTR_NOQUOT_VALUE:
var value = source.slice(start, p);
errorHandler.warning('attribute "' + value + '" missed quot(")!!');
addAttribute(attrName, value, start);
case S_ATTR_END:
s = S_TAG_SPACE;
break;
//case S_TAG_SPACE:
//case S_EQ:
//case S_ATTR_SPACE:
// void();break;
//case S_TAG_CLOSE:
//ignore warning
}
} else {
//not space
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
switch (s) {
//case S_TAG:void();break;
//case S_ATTR:void();break;
//case S_ATTR_NOQUOT_VALUE:void();break;
case S_ATTR_SPACE:
if (!isHTML) {
errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!');
}
addAttribute(attrName, attrName, start);
start = p;
s = S_ATTR;
break;
case S_ATTR_END:
errorHandler.warning('attribute space is required"' + attrName + '"!!');
case S_TAG_SPACE:
s = S_ATTR;
start = p;
break;
case S_EQ:
s = S_ATTR_NOQUOT_VALUE;
start = p;
break;
case S_TAG_CLOSE:
throw new Error("elements closed character '/' and '>' must be connected to");
}
}
} //end outer switch
//console.log('p++',p)
p++;
}
}
/**
 * Resolves the namespaces of a parsed start tag and reports it to the builder.
 *
 * Pass 1 walks the attributes, fills in `prefix` / `localName`, and for every
 * `xmlns` / `xmlns:*` declaration records the binding (on a private copy of
 * `currentNSMap`, so the map passed in is never modified) and calls
 * `startPrefixMapping`. Pass 2 assigns namespace URIs to prefixed attributes.
 * Then the element's own prefix is resolved and `startElement` is called; a
 * self-closed element is also ended immediately.
 *
 * @param {ElementAttributes} el the start tag: tag name, `closed` flag and indexed attributes
 * @param domBuilder receives startPrefixMapping / startElement / endElement / endPrefixMapping
 * @param {Object} currentNSMap prefix -> namespace URI bindings in scope ('' is the default namespace)
 * @return {true|undefined} `true` when the element stays open; in that case
 *   `el.currentNSMap` and `el.localNSMap` are set for use when it is closed.
 *   `undefined` when the element was self-closed and has already been ended.
 */
function appendElement(el, domBuilder, currentNSMap) {
	var tagName = el.tagName;
	var localNSMap = null;
	var a, prefix, localName, nsPrefix, nsp;
	var i = el.length;
	while (i--) {
		a = el[i];
		var qName = a.qName;
		var value = a.value;
		nsp = qName.indexOf(':');
		if (nsp > 0) {
			prefix = a.prefix = qName.slice(0, nsp);
			localName = qName.slice(nsp + 1);
			// declared prefix for "xmlns:foo", false for any other attribute
			nsPrefix = prefix === 'xmlns' && localName;
		} else {
			localName = qName;
			prefix = null;
			// '' (default namespace) for a bare "xmlns", false otherwise
			nsPrefix = qName === 'xmlns' && '';
		}
		//can not set prefix,because prefix !== ''
		a.localName = localName;
		//prefix == null for no ns prefix attribute
		if (nsPrefix !== false) {
			if (localNSMap == null) {
				localNSMap = {};
				// first declaration on this element: switch to a private copy of the map
				_copy(currentNSMap, (currentNSMap = {}));
			}
			currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
			a.uri = NAMESPACE.XMLNS;
			domBuilder.startPrefixMapping(nsPrefix, value);
		}
	}
	// second pass: every declaration of this element is known by now
	i = el.length;
	while (i--) {
		a = el[i];
		if (a.prefix) {
			//no prefix attribute has no namespace
			if (a.prefix === 'xml') {
				// the xml prefix is bound to the XML namespace by definition; it must not
				// be looked up in (and possibly overwritten from) the current map
				a.uri = NAMESPACE.XML;
			} else if (a.prefix !== 'xmlns') {
				a.uri = currentNSMap[a.prefix];
			}
		}
	}
	nsp = tagName.indexOf(':');
	if (nsp > 0) {
		prefix = el.prefix = tagName.slice(0, nsp);
		localName = el.localName = tagName.slice(nsp + 1);
	} else {
		prefix = null; //important!!
		localName = el.localName = tagName;
	}
	//no prefix element has default namespace
	var ns = (el.uri = currentNSMap[prefix || '']);
	domBuilder.startElement(ns, localName, tagName, el);
	//endPrefixMapping and startPrefixMapping have not any help for dom builder
	if (el.closed) {
		domBuilder.endElement(ns, localName, tagName);
		if (localNSMap) {
			for (prefix in localNSMap) {
				if (Object.prototype.hasOwnProperty.call(localNSMap, prefix)) {
					domBuilder.endPrefixMapping(prefix);
				}
			}
		}
	} else {
		el.currentNSMap = currentNSMap;
		el.localNSMap = localNSMap;
		return true;
	}
}
/**
 * Handles the content of HTML raw text and escapable raw text elements, whose
 * content is plain text up to the matching end tag instead of markup.
 *
 * @param {string} source the complete source
 * @param {number} elStartEnd offset of the ">" that ends the start tag
 * @param {string} tagName tag name exactly as it is expected in the end tag
 * @param {function(string): string} entityReplacer decodes one entity reference
 * @param domBuilder receives the content through `characters`
 * @returns {number} offset of the "</tagName>" end tag when content was emitted,
 *   otherwise `elStartEnd + 1` (not a special element, or no end tag found)
 */
function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) {
	// https://html.spec.whatwg.org/#raw-text-elements
	// https://html.spec.whatwg.org/#escapable-raw-text-elements
	// https://html.spec.whatwg.org/#cdata-rcdata-restrictions:raw-text-elements
	// TODO: https://html.spec.whatwg.org/#cdata-rcdata-restrictions
	var isEscapableRaw = isHTMLEscapableRawTextElement(tagName);
	if (isEscapableRaw || isHTMLRawTextElement(tagName)) {
		var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
		if (elEndStart < 0) {
			// No end tag. substring() treats a negative index as 0 and swaps its
			// arguments, which would emit the text *before* the element as its
			// content, so fall back to handling it like an ordinary element.
			return elStartEnd + 1;
		}
		var text = source.substring(elStartEnd + 1, elEndStart);
		if (isEscapableRaw) {
			// entity references are only decoded in escapable raw text
			text = text.replace(/&#?\w+;/g, entityReplacer);
		}
		domBuilder.characters(text, 0, text.length);
		return elEndStart;
	}
	return elStartEnd + 1;
}
/**
 * Tells whether no end tag for `tagName` occurs at or after `elStartEnd`,
 * i.e. whether the element has to be treated as if it were self-closed.
 * The offset of the last end tag is cached per tag name in `closeMap`.
 *
 * @param {string} source the complete source
 * @param {number} elStartEnd offset at which the start tag ends
 * @param {string} tagName
 * @param {Object} closeMap cache: tag name -> offset of its last end tag (-1 if none)
 * @returns {boolean}
 */
function fixSelfClosed(source, elStartEnd, tagName, closeMap) {
	// Only own entries count: a plain-object cache would otherwise answer with
	// Object.prototype members for tag names such as "constructor".
	var pos = Object.prototype.hasOwnProperty.call(closeMap, tagName) ? closeMap[tagName] : null;
	if (pos == null) {
		pos = source.lastIndexOf('</' + tagName + '>');
		if (pos < elStartEnd) {
			// the author forgot the closing ">": accept an unterminated end tag as well
			pos = source.lastIndexOf('</' + tagName);
		}
		closeMap[tagName] = pos;
	}
	return pos < elStartEnd;
}
/**
 * Copies every own enumerable property of `source` onto `target` (shallow).
 * A `null` / `undefined` source copies nothing.
 *
 * @param {Object} source
 * @param {Object} target
 */
function _copy(source, target) {
	if (source == null) {
		return;
	}
	var ownKeys = Object.keys(source);
	for (var i = 0; i < ownKeys.length; i++) {
		var key = ownKeys[i];
		target[key] = source[key];
	}
}
/**
 * Parses a construct starting with "<!": a comment, a CDATA section or a
 * DOCTYPE declaration, and reports it to `domBuilder`.
 *
 * @param {string} source the complete source
 * @param {number} start offset of the "<" of "<!"
 * @param domBuilder receives comment / startCDATA+characters+endCDATA / startDTD+endDTD
 * @param errorHandler its `error` method is called for unclosed comments and CDATA sections
 * @returns {number} offset just past the construct, or -1 when it is not
 *   recognised or not terminated
 */
function parseDCC(source, start, domBuilder, errorHandler) {
	//sure start with '<!'
	var end;
	if (source.charAt(start + 2) === '-') {
		if (source.charAt(start + 3) !== '-') {
			// "<!-" without a second dash is not a comment
			return -1;
		}
		end = source.indexOf('-->', start + 4);
		if (end > start) {
			// "<!--" is 4 characters long: report only the comment text
			domBuilder.comment(source, start + 4, end - start - 4);
			return end + 3;
		}
		errorHandler.error('Unclosed comment');
		return -1;
	}
	if (source.substr(start + 3, 6) == 'CDATA[') {
		end = source.indexOf(']]>', start + 9);
		if (end < 0) {
			// same treatment as an unclosed comment, instead of reporting an empty
			// section and returning a meaningless offset (-1 + 3)
			errorHandler.error('Unclosed CDATA');
			return -1;
		}
		domBuilder.startCDATA();
		// "<![CDATA[" is 9 characters long
		domBuilder.characters(source, start + 9, end - start - 9);
		domBuilder.endCDATA();
		return end + 3;
	}
	//<!DOCTYPE
	//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId)
	var matchs = split(source, start);
	// split() yields undefined when the declaration is never terminated
	var len = matchs ? matchs.length : 0;
	if (len > 1 && /!doctype/i.test(matchs[0][0])) {
		var name = matchs[1][0];
		var pubid = false;
		var sysid = false;
		if (len > 3) {
			if (/^public$/i.test(matchs[2][0])) {
				pubid = matchs[3][0];
				sysid = len > 4 && matchs[4][0];
			} else if (/^system$/i.test(matchs[2][0])) {
				sysid = matchs[3][0];
			}
		}
		var lastMatch = matchs[len - 1];
		domBuilder.startDTD(name, pubid, sysid);
		domBuilder.endDTD();
		return lastMatch.index + lastMatch[0].length;
	}
	return -1;
}
/**
 * Parses a processing instruction ("<?target data?>") and reports it to
 * `domBuilder.processingInstruction(target, data)`.
 *
 * @param {string} source the complete source
 * @param {number} start offset of the "<" of "<?"
 * @param domBuilder
 * @returns {number} offset just past the closing "?>", or -1 when the
 *   instruction is not terminated or malformed
 */
function parseInstruction(source, start, domBuilder) {
	var end = source.indexOf('?>', start);
	// indexOf signals "not found" with -1, which is truthy: test for it explicitly.
	// Otherwise substring(start, -1) would hand back the text *before* `start`.
	if (end < 0) {
		return -1;
	}
	var match = source.substring(start, end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
	if (!match) {
		return -1;
	}
	domBuilder.processingInstruction(match[1], match[2]);
	return end + 2;
}
/**
 * Array-like description of one start tag: the tag name plus its attributes,
 * stored under the indices 0..length-1 as `{ qName, value, offset }` records.
 * `attributeNames` maps each attribute qName to its index.
 *
 * @constructor
 */
function ElementAttributes() {
	this.attributeNames = {};
}
ElementAttributes.prototype = {
	/**
	 * Stores the tag name.
	 * @throws {Error} when the name does not match `tagNamePattern`
	 */
	setTagName: function (tagName) {
		if (tagNamePattern.test(tagName)) {
			this.tagName = tagName;
			return;
		}
		throw new Error('invalid tagName:' + tagName);
	},
	/**
	 * Appends an attribute.
	 * @throws {Error} when the attribute name does not match `tagNamePattern`
	 */
	addValue: function (qName, value, offset) {
		if (tagNamePattern.test(qName) === false) {
			throw new Error('invalid attribute:' + qName);
		}
		var slot = this.length;
		this.attributeNames[qName] = slot;
		this[slot] = { qName: qName, value: value, offset: offset };
		this.length = slot + 1;
	},
	// number of attributes; every instance starts from this shared default
	length: 0,
	// index-based accessors for the fields of a stored attribute record
	getLocalName: function (i) {
		var attr = this[i];
		return attr.localName;
	},
	getLocator: function (i) {
		var attr = this[i];
		return attr.locator;
	},
	getQName: function (i) {
		var attr = this[i];
		return attr.qName;
	},
	getURI: function (i) {
		var attr = this[i];
		return attr.uri;
	},
	getValue: function (i) {
		var attr = this[i];
		return attr.value;
	},
};
function split(source, start) {
var match;
var buf = [];
var reg = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
reg.lastIndex = start;
reg.exec(source); //skip <
while ((match = reg.exec(source))) {
buf.push(match);
if (match[1]) return buf;
}
}
exports.XMLReader = XMLReader;
exports.ParseError = ParseError;