UNPKG

xmldom-sre

Version:

A pure JavaScript W3C standard-based (XML DOM Level 2 Core) DOMParser and XMLSerializer module.

678 lines (657 loc) 20.7 kB
'use strict';

// Helpers and constant tables provided by the sibling `conventions` module.
var conventions = require('./conventions');
var NAMESPACE = conventions.NAMESPACE;
var MIME_TYPE = conventions.MIME_TYPE;
var isHTMLRawTextElement = conventions.isHTMLRawTextElement;
var isHTMLEscapableRawTextElement = conventions.isHTMLEscapableRawTextElement;

// XML 1.0 name productions these patterns are modelled on:
//   [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
//                        | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F]
//                        | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
//                        | [#x10000-#xEFFFF]
//   [4a] NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
//   [5]  Name          ::= NameStartChar (NameChar)*
//
// Two deliberate differences from the productions above:
//   - ":" is not part of the character class; `tagNamePattern` allows it only
//     once, as the separator between a prefix and a local part.
//   - the range #x10000-#xEFFFF is not included in the class.
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;

// NameChar: everything in `nameStartChar` (its source with the surrounding
// brackets stripped) plus "-", ".", digits and the extra NameChar ranges.
var nameChar = new RegExp(
	['[\\-\\.0-9', nameStartChar.source.slice(1, -1), '\\u00B7\\u0300-\\u036F\\u203F-\\u2040]'].join('')
);

// A complete (optionally prefixed) name: Name, optionally followed by ":" Name.
var tagNamePattern = new RegExp(
	['^', nameStartChar.source, nameChar.source, '*(?::', nameStartChar.source, nameChar.source, '*)?$'].join('')
);

// States of the start-tag scanner (the remaining states follow below).
var S_TAG = 0; // reading the tag name
var S_ATTR = 1; // reading an attribute name
var S_ATTR_SPACE = 2; // attribute name finished, whitespace seen after it
var S_EQ = 3; // "=" seen, attribute value expected (whitespace may follow)
var S_ATTR_NOQUOT_VALUE = 4; // reading an unquoted attribute value
var S_ATTR_END = 5; // attribute value finished (closing quote seen), no whitespace yet
var S_TAG_SPACE = 6; // whitespace after the tag name or after an attribute value
var S_TAG_CLOSE = 7; // "/" of a self-closing element (<el />) seen

/**
 * Creates an error that will not be caught by XMLReader aka the SAX parser.
 *
 * @param {string} message
 * @param {any?} locator Optional, can provide details about the location in the source
 * @constructor
 */
function ParseError(message, locator) {
	this.message = message;
	this.locator = locator;
	if (Error.captureStackTrace) Error.captureStackTrace(this, ParseError);
}
ParseError.prototype = new Error();
ParseError.prototype.name = ParseError.name;

/**
 * SAX-style reader. `domBuilder` and `errorHandler` are read from the instance
 * when `parse` is called, so they must be assigned by the caller beforehand.
 *
 * @constructor
 */
function XMLReader() {}

XMLReader.prototype = {
	/**
	 * Parses `source`, reporting events to `this.domBuilder` and problems to
	 * `this.errorHandler`.
	 *
	 * @param {string} source The markup to parse.
	 * @param {Object} defaultNSMap Initial prefix -> namespace URI map; it is copied, not mutated.
	 * @param {Object} entityMap Entity name -> replacement text map.
	 */
	parse: function (source, defaultNSMap, entityMap) {
		var domBuilder = this.domBuilder;
		domBuilder.startDocument();
		// The first argument is evaluated before the assignment in the second one,
		// so this copies the caller's map into a fresh object.
		_copy(defaultNSMap, (defaultNSMap = {}));
		parse(source, defaultNSMap, entityMap, domBuilder, this.errorHandler);
		domBuilder.endDocument();
	},
};

/**
 * Main scanning loop: walks `source` from "<" to "<", dispatching end tags,
 * processing instructions, "<!" constructs and start tags, and emitting the
 * text in between through `domBuilder.characters`.
 *
 * Errors thrown while handling one construct are reported through
 * `errorHandler.error` and scanning continues; only `ParseError` is rethrown.
 *
 * @param {string} source
 * @param {Object} defaultNSMapCopy Namespace map used for the root level.
 * @param {Object} entityMap Entity name -> replacement text map.
 * @param {Object} domBuilder Receiver of the SAX events.
 * @param {Object} errorHandler Object with `warning`, `error` and `fatalError` methods.
 */
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
	var isHTML = MIME_TYPE.isHTML(domBuilder.mimeType);

	/** Like String.fromCharCode, but emits a surrogate pair for code points above 0xFFFF. */
	function fixedFromCharCode(code) {
		if (code > 0xffff) {
			code -= 0x10000;
			var surrogate1 = 0xd800 + (code >> 10),
				surrogate2 = 0xdc00 + (code & 0x3ff);

			return String.fromCharCode(surrogate1, surrogate2);
		} else {
			return String.fromCharCode(code);
		}
	}

	/**
	 * Replacement callback for `&name;` / `&#...;` matches: looks the name up in
	 * `entityMap`, decodes numeric references, and otherwise reports an error and
	 * leaves the reference untouched.
	 */
	function entityReplacer(a) {
		var k = a.slice(1, -1);
		if (Object.hasOwnProperty.call(entityMap, k)) {
			return entityMap[k];
		} else if (k.charAt(0) === '#') {
			// "#x41" becomes "0x41" so that parseInt reads it as hexadecimal.
			return fixedFromCharCode(parseInt(k.substr(1).replace('x', '0x')));
		} else {
			errorHandler.error('entity not found:' + a);
			return a;
		}
	}

	/** Emits source[start, end) as character data (entities replaced) and advances `start`. */
	function appendText(end) {
		if (end > start) {
			var xt = source.substring(start, end).replace(/&#?\w+;/g, entityReplacer);
			locator && position(start);
			domBuilder.characters(xt, 0, end - start);
			start = end;
		}
	}

	/**
	 * Updates `locator` for source offset `p`: consumes lines with `linePattern`
	 * until `p` lies inside the current one, counting each consumed line, then
	 * sets the 1-based column.
	 */
	function position(p, m) {
		while (p >= lineEnd && (m = linePattern.exec(source))) {
			lineStart = m.index;
			lineEnd = lineStart + m[0].length;
			locator.lineNumber++;
		}
		locator.columnNumber = p - lineStart + 1;
	}

	var lineStart = 0;
	var lineEnd = 0;
	var linePattern = /.*(?:\r\n?|\n)|.*$/g;
	var locator = domBuilder.locator;

	// Stack of open elements; the bottom entry only carries the root namespace map.
	var parseStack = [{ currentNSMap: defaultNSMapCopy }];
	// Cache used by fixSelfClosed: tag name -> index of its last closing tag.
	var closeMap = {};
	var start = 0;
	while (true) {
		try {
			var tagStart = source.indexOf('<', start);
			if (tagStart < 0) {
				// No more markup: non-whitespace leftovers become a text node on the document.
				if (!source.substr(start).match(/^\s*$/)) {
					var doc = domBuilder.doc;
					var text = doc.createTextNode(source.substr(start));
					doc.appendChild(text);
					domBuilder.currentElement = text;
				}
				return;
			}
			if (tagStart > start) {
				appendText(tagStart);
			}
			switch (source.charAt(tagStart + 1)) {
				case '/': // end tag
					var config = parseStack.pop();
					var end = source.indexOf('>', tagStart + 3);
					var tagNameRaw = source.substring(tagStart + 2, end > 0 ? end : undefined);
					// tagNamePattern without its trailing "$": match a name at the start of the raw text.
					var tagNameMatch = new RegExp('(' + tagNamePattern.source.slice(0, -1) + ')').exec(tagNameRaw);
					// for the root level the config does not contain the tagName
					var tagName =
						tagNameMatch && tagNameMatch[1]
							? tagNameMatch[1]
							: config.tagName || domBuilder.doc.documentElement.tagName;
					if (end < 0) {
						errorHandler.error('end tag name: ' + tagName + ' is not complete');
						end = tagStart + 1 + tagName.length;
					} else if (tagNameRaw.match(/</) && !isHTML) {
						errorHandler.error('end tag name: ' + tagName + ' maybe not complete');
					}
					var localNSMap = config.localNSMap;
					var endMatch = config.tagName == tagName;
					var endIgnoreCaseMach =
						endMatch || (config.tagName && config.tagName.toLowerCase() == tagName.toLowerCase());
					if (endIgnoreCaseMach) {
						domBuilder.endElement(config.uri, config.localName, tagName);
						if (localNSMap) {
							for (var prefix in localNSMap) {
								if (Object.prototype.hasOwnProperty.call(localNSMap, prefix)) {
									domBuilder.endPrefixMapping(prefix);
								}
							}
						}
						if (!endMatch) {
							// No known test case
							errorHandler.fatalError(
								'end tag name: ' + tagName + ' is not match the current start tagName:' + config.tagName
							);
						}
					} else {
						// Not the end of the current element: keep it open.
						parseStack.push(config);
					}

					end++;
					break;
				case '?': // <?...?>
					locator && position(tagStart);
					end = parseInstruction(source, tagStart, domBuilder);
					break;
				case '!': // <!doctype,<![CDATA,<!--
					locator && position(tagStart);
					end = parseDCC(source, tagStart, domBuilder, errorHandler);
					break;
				default: // start tag
					locator && position(tagStart);
					var el = new ElementAttributes();
					var currentNSMap = parseStack[parseStack.length - 1].currentNSMap;
					// index of the ">" that ends the start tag
					var end = parseElementStartPart(source, tagStart, el, currentNSMap, entityReplacer, errorHandler, isHTML);
					var len = el.length;

					if (!el.closed && fixSelfClosed(source, end, el.tagName, closeMap)) {
						el.closed = true;
						if (!isHTML) {
							errorHandler.warning('unclosed xml attribute');
						}
					}
					if (locator && len) {
						// Remember the element position, then give every attribute its own locator copy.
						var locator2 = copyLocator(locator, {});
						for (var i = 0; i < len; i++) {
							var a = el[i];
							position(a.offset);
							a.locator = copyLocator(locator, {});
						}
						domBuilder.locator = locator2;
						if (appendElement(el, domBuilder, currentNSMap)) {
							parseStack.push(el);
						}
						domBuilder.locator = locator;
					} else {
						if (appendElement(el, domBuilder, currentNSMap)) {
							parseStack.push(el);
						}
					}

					if (isHTML && !el.closed) {
						end = parseHtmlSpecialContent(source, end, el.tagName, entityReplacer, domBuilder);
					} else {
						end++;
					}
			}
		} catch (e) {
			if (e instanceof ParseError) {
				throw e;
			}
			errorHandler.error('element parse error: ' + e);
			end = -1;
		}
		if (end > start) {
			start = end;
		} else {
			// TODO: the scanner may step backwards here, so reported positions can be wrong.
			// Emit the "<" that could not be handled as text and continue after it.
			appendText(Math.max(tagStart, start) + 1);
		}
	}
}

/**
 * Copies line and column from locator `f` to `t`.
 *
 * @return `t`
 */
function copyLocator(f, t) {
	t.lineNumber = f.lineNumber;
	t.columnNumber = f.columnNumber;
	return t;
}

/**
 * Scans a start tag beginning at the "<" at `start`, storing the tag name and
 * attributes in `el` and setting `el.closed` for self-closing tags.
 *
 * Throws an Error for malformed tags; `parse` catches it and reports it.
 * (`currentNSMap` is accepted but not used inside this function.)
 *
 * @see #appendElement(source,elStartEnd,el,selfClosed,entityReplacer,domBuilder,parseStack);
 * @return end of the elementStartPart(end of elementEndPart for selfClosed el)
 */
function parseElementStartPart(source, start, el, currentNSMap, entityReplacer, errorHandler, isHTML) {
	/**
	 * @param {string} qname
	 * @param {string} value
	 * @param {number} startIndex
	 */
	function addAttribute(qname, value, startIndex) {
		// Use the prototype method: an attribute literally named "hasOwnProperty"
		// is stored in attributeNames and would otherwise shadow it.
		if (Object.prototype.hasOwnProperty.call(el.attributeNames, qname)) {
			errorHandler.fatalError('Attribute ' + qname + ' redefined');
		}
		el.addValue(
			qname,
			// @see https://www.w3.org/TR/xml/#AVNormalize
			// since the xmldom sax parser does not "interpret" DTD the following is not implemented:
			// - recursive replacement of (DTD) entity references
			// - trimming and collapsing multiple spaces into a single one for attributes that are not of type CDATA
			value.replace(/[\t\n\r]/g, ' ').replace(/&#?\w+;/g, entityReplacer),
			startIndex
		);
	}
	var attrName;
	var value;
	var p = ++start;
	var s = S_TAG; // current scanner state
	while (true) {
		var c = source.charAt(p);
		switch (c) {
			case '=':
				if (s === S_ATTR) {
					attrName = source.slice(start, p);
					s = S_EQ;
				} else if (s === S_ATTR_SPACE) {
					s = S_EQ;
				} else {
					//fatalError: equal must after attrName or space after attrName
					throw new Error('attribute equal must after attrName'); // No known test case
				}
				break;
			case "'":
			case '"':
				if (s === S_EQ || s === S_ATTR) {
					if (s === S_ATTR) {
						errorHandler.warning('attribute value must after "="');
						attrName = source.slice(start, p);
					}
					start = p + 1;
					p = source.indexOf(c, start);
					if (p > 0) {
						value = source.slice(start, p);
						addAttribute(attrName, value, start - 1);
						s = S_ATTR_END;
					} else {
						//fatalError: no end quot match
						throw new Error("attribute value no end '" + c + "' match");
					}
				} else if (s == S_ATTR_NOQUOT_VALUE) {
					// A quote terminating a value that did not start with one.
					value = source.slice(start, p);
					addAttribute(attrName, value, start);
					errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
					start = p + 1;
					s = S_ATTR_END;
				} else {
					//fatalError: no equal before
					throw new Error('attribute value must after "="'); // No known test case
				}
				break;
			case '/':
				// The cases below fall through on purpose.
				switch (s) {
					case S_TAG:
						el.setTagName(source.slice(start, p));
					case S_ATTR_END:
					case S_TAG_SPACE:
					case S_TAG_CLOSE:
						s = S_TAG_CLOSE;
						el.closed = true;
					case S_ATTR_NOQUOT_VALUE:
					case S_ATTR:
						break;
					case S_ATTR_SPACE:
						el.closed = true;
						break;
					//case S_EQ:
					default:
						throw new Error("attribute invalid close char('/')"); // No known test case
				}
				break;
			case '': // charAt past the end of the source
				errorHandler.error('unexpected end of input');
				if (s == S_TAG) {
					el.setTagName(source.slice(start, p));
				}
				return p;
			case '>':
				// The cases below fall through on purpose.
				switch (s) {
					case S_TAG:
						el.setTagName(source.slice(start, p));
					case S_ATTR_END:
					case S_TAG_SPACE:
					case S_TAG_CLOSE:
						break; //normal
					case S_ATTR_NOQUOT_VALUE: //Compatible state
					case S_ATTR:
						value = source.slice(start, p);
						if (value.slice(-1) === '/') {
							el.closed = true;
							value = value.slice(0, -1);
						}
					case S_ATTR_SPACE:
						if (s === S_ATTR_SPACE) {
							value = attrName;
						}
						if (s == S_ATTR_NOQUOT_VALUE) {
							errorHandler.warning('attribute "' + value + '" missed quot(")!');
							addAttribute(attrName, value, start);
						} else {
							// Attribute without a value: its name is used as the value.
							if (!isHTML) {
								errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!');
							}
							addAttribute(value, value, start);
						}
						break;
					case S_EQ:
						throw new Error('attribute value missed!!');
				}
				return p;
			/*xml space '\x20' | #x9 | #xD | #xA; */
			case '\u0080':
				// U+0080 is handled like a space (falls through to the default branch).
				c = ' ';
			default:
				if (c <= ' ') {
					// whitespace
					switch (s) {
						case S_TAG:
							el.setTagName(source.slice(start, p));
							s = S_TAG_SPACE;
							break;
						case S_ATTR:
							attrName = source.slice(start, p);
							s = S_ATTR_SPACE;
							break;
						case S_ATTR_NOQUOT_VALUE:
							value = source.slice(start, p);
							errorHandler.warning('attribute "' + value + '" missed quot(")!!');
							addAttribute(attrName, value, start);
						// falls through
						case S_ATTR_END:
							s = S_TAG_SPACE;
							break;
						// S_TAG_SPACE, S_EQ, S_ATTR_SPACE and S_TAG_CLOSE: nothing to do
					}
				} else {
					// any other character
					switch (s) {
						// S_TAG, S_ATTR and S_ATTR_NOQUOT_VALUE: keep reading
						case S_ATTR_SPACE:
							// A new name starts, so the previous attribute had no value.
							if (!isHTML) {
								errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!');
							}
							addAttribute(attrName, attrName, start);
							start = p;
							s = S_ATTR;
							break;
						case S_ATTR_END:
							errorHandler.warning('attribute space is required"' + attrName + '"!!');
						// falls through
						case S_TAG_SPACE:
							s = S_ATTR;
							start = p;
							break;
						case S_EQ:
							s = S_ATTR_NOQUOT_VALUE;
							start = p;
							break;
						case S_TAG_CLOSE:
							throw new Error("elements closed character '/' and '>' must be connected to");
					}
				}
		} //end outer switch
		p++;
	}
}

/**
 * Resolves namespaces for `el` and its attributes and reports the element to
 * `domBuilder` (startElement, plus endElement / endPrefixMapping when the
 * element is already closed).
 *
 * @return true if the element stays open and must be pushed on the parse stack
 */
function appendElement(el, domBuilder, currentNSMap) {
	var tagName = el.tagName;
	var localNSMap = null;
	var i = el.length;
	// First pass: split qualified names and collect namespace declarations.
	while (i--) {
		var a = el[i];
		var qName = a.qName;
		var value = a.value;
		var nsp = qName.indexOf(':');
		if (nsp > 0) {
			var prefix = (a.prefix = qName.slice(0, nsp));
			var localName = qName.slice(nsp + 1);
			var nsPrefix = prefix === 'xmlns' && localName;
		} else {
			localName = qName;
			prefix = null;
			nsPrefix = qName === 'xmlns' && '';
		}
		//can not set prefix,because prefix !== ''
		a.localName = localName;
		// nsPrefix is false for ordinary attributes and a string ('' for the
		// default namespace) for xmlns declarations.
		if (nsPrefix !== false) {
			if (localNSMap == null) {
				localNSMap = {};
				// Copy on first declaration so the parent's map is not modified.
				_copy(currentNSMap, (currentNSMap = {}));
			}
			currentNSMap[nsPrefix] = localNSMap[nsPrefix] = value;
			a.uri = NAMESPACE.XMLNS;
			domBuilder.startPrefixMapping(nsPrefix, value);
		}
	}
	// Second pass: assign namespace URIs to prefixed attributes.
	var i = el.length;
	while (i--) {
		a = el[i];
		if (a.prefix) {
			//no prefix attribute has no namespace
			if (a.prefix === 'xml') {
				a.uri = NAMESPACE.XML;
			}
			if (a.prefix !== 'xmlns') {
				a.uri = currentNSMap[a.prefix];
			}
		}
	}
	var nsp = tagName.indexOf(':');
	if (nsp > 0) {
		prefix = el.prefix = tagName.slice(0, nsp);
		localName = el.localName = tagName.slice(nsp + 1);
	} else {
		prefix = null; //important!!
		localName = el.localName = tagName;
	}
	//no prefix element has default namespace
	var ns = (el.uri = currentNSMap[prefix || '']);
	domBuilder.startElement(ns, localName, tagName, el);
	if (el.closed) {
		domBuilder.endElement(ns, localName, tagName);
		if (localNSMap) {
			for (prefix in localNSMap) {
				if (Object.prototype.hasOwnProperty.call(localNSMap, prefix)) {
					domBuilder.endPrefixMapping(prefix);
				}
			}
		}
	} else {
		el.currentNSMap = currentNSMap;
		el.localNSMap = localNSMap;
		return true;
	}
}

/**
 * For HTML raw text and escapable raw text elements, emits everything up to
 * the matching `</tagName>` as one chunk of character data (entities are
 * replaced only for escapable raw text elements).
 *
 * https://html.spec.whatwg.org/#raw-text-elements
 * https://html.spec.whatwg.org/#escapable-raw-text-elements
 * https://html.spec.whatwg.org/#cdata-rcdata-restrictions:raw-text-elements
 * TODO: https://html.spec.whatwg.org/#cdata-rcdata-restrictions
 *
 * @param {string} source
 * @param {number} elStartEnd Index of the ">" that ends the start tag.
 * @param {string} tagName
 * @param {function} entityReplacer
 * @param {Object} domBuilder
 * @return {number} Index where scanning continues: the start of the end tag
 *   when raw content was emitted, otherwise `elStartEnd + 1`.
 */
function parseHtmlSpecialContent(source, elStartEnd, tagName, entityReplacer, domBuilder) {
	var isEscapableRaw = isHTMLEscapableRawTextElement(tagName);
	if (isEscapableRaw || isHTMLRawTextElement(tagName)) {
		var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
		// Without this guard a missing end tag (-1) made substring() swap its
		// arguments and emit the document prefix as text.
		if (elEndStart > -1) {
			var text = source.substring(elStartEnd + 1, elEndStart);
			if (isEscapableRaw) {
				text = text.replace(/&#?\w+;/g, entityReplacer);
			}
			domBuilder.characters(text, 0, text.length);
			return elEndStart;
		}
	}
	return elStartEnd + 1;
}

/**
 * Decides whether an element that was not explicitly closed has to be treated
 * as closed because no closing tag for it exists after its start tag.
 *
 * @param {string} source
 * @param {number} elStartEnd Index of the end of the start tag.
 * @param {string} tagName
 * @param {Object} closeMap Cache: tag name -> index of the last closing tag in `source`.
 * @return {boolean} true when the last closing tag lies before `elStartEnd`
 */
function fixSelfClosed(source, elStartEnd, tagName, closeMap) {
	// Own-property check: tag names such as "constructor" or "toString" must
	// not pick up inherited members of the plain cache object.
	var pos = Object.prototype.hasOwnProperty.call(closeMap, tagName) ? closeMap[tagName] : null;
	if (pos == null) {
		pos = source.lastIndexOf('</' + tagName + '>');
		if (pos < elStartEnd) {
			// No exact closing tag after the element: also accept one without the final ">".
			pos = source.lastIndexOf('</' + tagName);
		}
		closeMap[tagName] = pos;
	}
	return pos < elStartEnd;
}

/** Copies the own enumerable properties of `source` onto `target`. */
function _copy(source, target) {
	for (var n in source) {
		if (Object.prototype.hasOwnProperty.call(source, n)) {
			target[n] = source[n];
		}
	}
}

/**
 * Handles the constructs starting with "<!": comments, CDATA sections and
 * the doctype declaration.
 *
 * @param {string} source
 * @param {number} start Index of the "<" of "<!".
 * @param {Object} domBuilder
 * @param {Object} errorHandler
 * @return {number} Index just past the construct, or -1 when it could not be parsed.
 */
function parseDCC(source, start, domBuilder, errorHandler) {
	var next = source.charAt(start + 2);
	switch (next) {
		case '-':
			if (source.charAt(start + 3) === '-') {
				var end = source.indexOf('-->', start + 4);
				if (end > start) {
					domBuilder.comment(source, start + 4, end - start - 4);
					return end + 3;
				} else {
					errorHandler.error('Unclosed comment');
					return -1;
				}
			} else {
				// "<!-" not followed by a second "-"
				return -1;
			}
		default:
			if (source.substr(start + 3, 6) == 'CDATA[') {
				var end = source.indexOf(']]>', start + 9);
				if (end < 0) {
					// Without a terminator the length below would be negative and
					// the returned position (2) meaningless.
					errorHandler.error('Unclosed CDATA');
					return -1;
				}
				domBuilder.startCDATA();
				domBuilder.characters(source, start + 9, end - start - 9);
				domBuilder.endCDATA();
				return end + 3;
			}
			// <!DOCTYPE name [PUBLIC pubid [sysid] | SYSTEM sysid] >
			// NOTE: split() returns undefined when the input ends before a "<" or
			// ">" is found; reading `.length` then throws and parse() reports it.
			var matchs = split(source, start);
			var len = matchs.length;
			if (len > 1 && /!doctype/i.test(matchs[0][0])) {
				var name = matchs[1][0];
				var pubid = false;
				var sysid = false;
				if (len > 3) {
					if (/^public$/i.test(matchs[2][0])) {
						pubid = matchs[3][0];
						sysid = len > 4 && matchs[4][0];
					} else if (/^system$/i.test(matchs[2][0])) {
						sysid = matchs[3][0];
					}
				}
				var lastMatch = matchs[len - 1];
				domBuilder.startDTD(name, pubid, sysid);
				domBuilder.endDTD();

				return lastMatch.index + lastMatch[0].length;
			}
	}
	return -1;
}

/**
 * Handles a processing instruction `<?target data?>`.
 *
 * @param {string} source
 * @param {number} start Index of the "<" of "<?".
 * @param {Object} domBuilder
 * @return {number} Index just past "?>", or -1 when the instruction is
 *   unterminated or malformed.
 */
function parseInstruction(source, start, domBuilder) {
	var end = source.indexOf('?>', start);
	// indexOf signals "not found" with -1, which is truthy, so compare explicitly.
	if (end > -1) {
		var match = source.substring(start, end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
		if (match) {
			domBuilder.processingInstruction(match[1], match[2]);
			return end + 2;
		} else {
			return -1;
		}
	}
	return -1;
}

/**
 * Array-like collection of the attributes of one start tag. Entries are
 * stored under numeric indexes; `attributeNames` maps each qualified name to
 * its index and is used to detect duplicates.
 *
 * @constructor
 */
function ElementAttributes() {
	this.attributeNames = {};
}
ElementAttributes.prototype = {
	/** Stores the tag name; throws an Error when it does not match `tagNamePattern`. */
	setTagName: function (tagName) {
		if (!tagNamePattern.test(tagName)) {
			throw new Error('invalid tagName:' + tagName);
		}
		this.tagName = tagName;
	},
	/** Appends an attribute; throws an Error when `qName` does not match `tagNamePattern`. */
	addValue: function (qName, value, offset) {
		if (!tagNamePattern.test(qName)) {
			throw new Error('invalid attribute:' + qName);
		}
		this.attributeNames[qName] = this.length;
		this[this.length++] = { qName: qName, value: value, offset: offset };
	},
	length: 0,
	getLocalName: function (i) {
		return this[i].localName;
	},
	getLocator: function (i) {
		return this[i].locator;
	},
	getQName: function (i) {
		return this[i].qName;
	},
	getURI: function (i) {
		return this[i].uri;
	},
	getValue: function (i) {
		return this[i].value;
	},
};

/**
 * Tokenizes a "<!...>" declaration starting at `start` into quoted strings
 * and bare words.
 *
 * @param {string} source
 * @param {number} start Index of the opening "<".
 * @return {Array|undefined} The regexp matches up to and including the first
 *   ">" (or "<") token; undefined when the input ends before such a token.
 */
function split(source, start) {
	var match;
	var buf = [];
	var reg = /'[^']+'|"[^"]+"|[^\s<>\/=]+=?|(\/?\s*>|<)/g;
	reg.lastIndex = start;
	reg.exec(source); // skip the opening "<"
	while ((match = reg.exec(source))) {
		buf.push(match);
		if (match[1]) return buf;
	}
}

exports.XMLReader = XMLReader;
exports.ParseError = ParseError;