UNPKG

@loaders.gl/xml

Version:

Framework-independent loaders for the XML (eXtensible Markup Language) format

1,454 lines (1,453 loc) 48.9 kB
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

/** No-op handlers for every event the parser can emit. */
const DEFAULT_SAX_EVENTS = {
  ontext: () => {},
  onprocessinginstruction: () => {},
  onsgmldeclaration: () => {},
  ondoctype: () => {},
  oncomment: () => {},
  onopentagstart: () => {},
  onattribute: () => {},
  onopentag: () => {},
  onclosetag: () => {},
  onopencdata: () => {},
  oncdata: () => {},
  onclosecdata: () => {},
  onerror: () => {},
  onend: () => {},
  onready: () => {},
  onscript: () => {},
  onopennamespace: () => {},
  onclosenamespace: () => {}
};

/** Default parser options; the event handlers double as options. */
const DEFAULT_SAX_PARSER_OPTIONS = {
  ...DEFAULT_SAX_EVENTS,
  strict: false,
  MAX_BUFFER_LENGTH: 64 * 1024,
  lowercase: false,
  lowercasetags: false,
  noscript: false,
  strictEntities: false,
  xmlns: undefined,
  position: undefined,
  trim: undefined,
  normalize: undefined
};

/** Event names (without the `on` prefix). */
const EVENTS = [
  'text',
  'processinginstruction',
  'sgmldeclaration',
  'doctype',
  'comment',
  'opentagstart',
  'attribute',
  'opentag',
  'closetag',
  'opencdata',
  'cdata',
  'closecdata',
  'error',
  'end',
  'ready',
  'script',
  'opennamespace',
  'closenamespace'
];

/** Names of the string buffers held on the parser instance. */
const BUFFERS = [
  'comment',
  'sgmlDecl',
  'textNode',
  'tagName',
  'doctype',
  'procInstName',
  'procInstBody',
  'entity',
  'attribName',
  'attribValue',
  'cdata',
  'script'
];

// Character classes used to validate tag, attribute and entity names.
const nameStart =
  /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
const nameBody =
  /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;
const entityStart =
  /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
const entityBody =
  /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;

/**
 * Named character entities. Numeric values are character codes; they are
 * converted to one-character strings right after this declaration.
 */
export const ENTITIES = {
  amp: '&',
  gt: '>',
  lt: '<',
  quot: '"',
  apos: "'",
  AElig: 198,
  Aacute: 193,
  Acirc: 194,
  Agrave: 192,
  Aring: 197,
  Atilde: 195,
  Auml: 196,
  Ccedil: 199,
  ETH: 208,
  Eacute: 201,
  Ecirc: 202,
  Egrave: 200,
  Euml: 203,
  Iacute: 205,
  Icirc: 206,
  Igrave: 204,
  Iuml: 207,
  Ntilde: 209,
  Oacute: 211,
  Ocirc: 212,
  Ograve: 210,
  Oslash: 216,
  Otilde: 213,
  Ouml: 214,
  THORN: 222,
  Uacute: 218,
  Ucirc: 219,
  Ugrave: 217,
  Uuml: 220,
  Yacute: 221,
  aacute: 225,
  acirc: 226,
  aelig: 230,
  agrave: 224,
  aring: 229,
  atilde: 227,
  auml: 228,
  ccedil: 231,
  eacute: 233,
  ecirc: 234,
  egrave: 232,
  eth: 240,
  euml: 235,
  iacute: 237,
  icirc: 238,
  igrave: 236,
  iuml: 239,
  ntilde: 241,
  oacute: 243,
  ocirc: 244,
  ograve: 242,
  oslash: 248,
  otilde: 245,
  ouml: 246,
  szlig: 223,
  thorn: 254,
  uacute: 250,
  ucirc: 251,
  ugrave: 249,
  uuml: 252,
  yacute: 253,
  yuml: 255,
  copy: 169,
  reg: 174,
  nbsp: 160,
  iexcl: 161,
  cent: 162,
  pound: 163,
  curren: 164,
  yen: 165,
  brvbar: 166,
  sect: 167,
  uml: 168,
  ordf: 170,
  laquo: 171,
  not: 172,
  shy: 173,
  macr: 175,
  deg: 176,
  plusmn: 177,
  sup1: 185,
  sup2: 178,
  sup3: 179,
  acute: 180,
  micro: 181,
  para: 182,
  middot: 183,
  cedil: 184,
  ordm: 186,
  raquo: 187,
  frac14: 188,
  frac12: 189,
  frac34: 190,
  iquest: 191,
  times: 215,
  divide: 247,
  OElig: 338,
  oelig: 339,
  Scaron: 352,
  scaron: 353,
  Yuml: 376,
  fnof: 402,
  circ: 710,
  tilde: 732,
  Alpha: 913,
  Beta: 914,
  Gamma: 915,
  Delta: 916,
  Epsilon: 917,
  Zeta: 918,
  Eta: 919,
  Theta: 920,
  Iota: 921,
  Kappa: 922,
  Lambda: 923,
  Mu: 924,
  Nu: 925,
  Xi: 926,
  Omicron: 927,
  Pi: 928,
  Rho: 929,
  Sigma: 931,
  Tau: 932,
  Upsilon: 933,
  Phi: 934,
  Chi: 935,
  Psi: 936,
  Omega: 937,
  alpha: 945,
  beta: 946,
  gamma: 947,
  delta: 948,
  epsilon: 949,
  zeta: 950,
  eta: 951,
  theta: 952,
  iota: 953,
  kappa: 954,
  lambda: 955,
  mu: 956,
  nu: 957,
  xi: 958,
  omicron: 959,
  pi: 960,
  rho: 961,
  sigmaf: 962,
  sigma: 963,
  tau: 964,
  upsilon: 965,
  phi: 966,
  chi: 967,
  psi: 968,
  omega: 969,
  thetasym: 977,
  upsih: 978,
  piv: 982,
  ensp: 8194,
  emsp: 8195,
  thinsp: 8201,
  zwnj: 8204,
  zwj: 8205,
  lrm: 8206,
  rlm: 8207,
  ndash: 8211,
  mdash: 8212,
  lsquo: 8216,
  rsquo: 8217,
  sbquo: 8218,
  ldquo: 8220,
  rdquo: 8221,
  bdquo: 8222,
  dagger: 8224,
  Dagger: 8225,
  bull: 8226,
  hellip: 8230,
  permil: 8240,
  prime: 8242,
  Prime: 8243,
  lsaquo: 8249,
  rsaquo: 8250,
  oline: 8254,
  frasl: 8260,
  euro: 8364,
  image: 8465,
  weierp: 8472,
  real: 8476,
  trade: 8482,
  alefsym: 8501,
  larr: 8592,
  uarr: 8593,
  rarr: 8594,
  darr: 8595,
  harr: 8596,
  crarr: 8629,
  lArr: 8656,
  uArr: 8657,
  rArr: 8658,
  dArr: 8659,
  hArr: 8660,
  forall: 8704,
  part: 8706,
  exist: 8707,
  empty: 8709,
  nabla: 8711,
  isin: 8712,
  notin: 8713,
  ni: 8715,
  prod: 8719,
  sum: 8721,
  minus: 8722,
  lowast: 8727,
  radic: 8730,
  prop: 8733,
  infin: 8734,
  ang: 8736,
  and: 8743,
  or: 8744,
  cap: 8745,
  cup: 8746,
  int: 8747,
  there4: 8756,
  sim: 8764,
  cong: 8773,
  asymp: 8776,
  ne: 8800,
  equiv: 8801,
  le: 8804,
  ge: 8805,
  sub: 8834,
  sup: 8835,
  nsub: 8836,
  sube: 8838,
  supe: 8839,
  oplus: 8853,
  otimes: 8855,
  perp: 8869,
  sdot: 8901,
  lceil: 8968,
  rceil: 8969,
  lfloor: 8970,
  rfloor: 8971,
  lang: 9001,
  rang: 9002,
  loz: 9674,
  spades: 9824,
  clubs: 9827,
  hearts: 9829,
  diams: 9830
};

// Replace numeric character codes with the corresponding one-character string.
Object.keys(ENTITIES).forEach((key) => {
  const e = ENTITIES[key];
  ENTITIES[key] = typeof e === 'number' ? String.fromCharCode(e) : e;
});

/**
 * Internal helper class: holds the parser state and the character-level
 * state machine. Options and event handlers are installed by `SAXParser`.
 */
class SAX {
  EVENTS = EVENTS;
  ENTITIES = {
    // TODO: make it readonly, needed for entity-mega test
    // amp, gt, lt, quot and apos are resolved to strings instead of numerical
    // codes, IDK why
    ...ENTITIES
  };
  XML_ENTITIES = {
    amp: '&',
    gt: '>',
    lt: '<',
    quot: '"',
    apos: "'"
  };
  // Used as a counter while STATE is initialised; replaced by STATE in the constructor.
  S = 0;
  opt;
  trackPosition = false;
  column = 0;
  line = 0;
  c = '';
  error;
  q = '';
  bufferCheckPosition;
  closed = false;
  tags = [];
  looseCase = '';
  closedRoot = false;
  sawRoot = false;
  strict = false;
  tag;
  strictEntities;
  state;
  noscript = false;
  attribList = [];
  ns;
  position = 0;
  STATE = {
    BEGIN: this.S++, // leading byte order mark or whitespace
    BEGIN_WHITESPACE: this.S++, // leading whitespace
    TEXT: this.S++, // general stuff
    TEXT_ENTITY: this.S++, // &amp and such.
    OPEN_WAKA: this.S++, // <
    SGML_DECL: this.S++, // <!BLARG
    SGML_DECL_QUOTED: this.S++, // <!BLARG foo "bar
    DOCTYPE: this.S++, // <!DOCTYPE
    DOCTYPE_QUOTED: this.S++, // <!DOCTYPE "//blah
    DOCTYPE_DTD: this.S++, // <!DOCTYPE "//blah" [ ...
    DOCTYPE_DTD_QUOTED: this.S++, // <!DOCTYPE "//blah" [ "foo
    COMMENT_STARTING: this.S++, // <!-
    COMMENT: this.S++, // <!--
    COMMENT_ENDING: this.S++, // <!-- blah -
    COMMENT_ENDED: this.S++, // <!-- blah --
    CDATA: this.S++, // <![CDATA[ something
    CDATA_ENDING: this.S++, // ]
    CDATA_ENDING_2: this.S++, // ]]
    PROC_INST: this.S++, // <?hi
    PROC_INST_BODY: this.S++, // <?hi there
    PROC_INST_ENDING: this.S++, // <?hi "there" ?
    OPEN_TAG: this.S++, // <strong
    OPEN_TAG_SLASH: this.S++, // <strong /
    ATTRIB: this.S++, // <a
    ATTRIB_NAME: this.S++, // <a foo
    ATTRIB_NAME_SAW_WHITE: this.S++, // <a foo _
    ATTRIB_VALUE: this.S++, // <a foo=
    ATTRIB_VALUE_QUOTED: this.S++, // <a foo="bar
    ATTRIB_VALUE_CLOSED: this.S++, // <a foo="bar"
    ATTRIB_VALUE_UNQUOTED: this.S++, // <a foo=bar
    ATTRIB_VALUE_ENTITY_Q: this.S++, // <foo bar="&quot;"
    ATTRIB_VALUE_ENTITY_U: this.S++, // <foo bar=&quot
    CLOSE_TAG: this.S++, // </a
    CLOSE_TAG_SAW_WHITE: this.S++, // </a >
    SCRIPT: this.S++, // <script> ...
    SCRIPT_ENDING: this.S++ // <script> ... <
  };
  BUFFERS = BUFFERS;
  // private parser: (strict: boolean, opt: any) => SAXParser;
  CDATA = '[CDATA[';
  DOCTYPE = 'DOCTYPE';
  XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace';
  XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/';
  rootNS = {
    xml: this.XML_NAMESPACE,
    xmlns: this.XMLNS_NAMESPACE
  };
  comment;
  sgmlDecl;
  textNode = '';
  tagName;
  doctype;
  procInstName;
  procInstBody;
  entity = '';
  attribName;
  attribValue;
  cdata = '';
  script = '';
  startTagPosition = 0;

  /** Adds the reverse (number -> name) mapping to STATE and aliases it as `S`. */
  constructor() {
    this.S = 0;
    // Iterate over a snapshot of the keys: the loop body adds new (numeric)
    // keys to STATE, which must not be visited themselves.
    for (const name of Object.keys(this.STATE)) {
      this.STATE[this.STATE[name]] = name;
    }
    // shorthand
    this.S = this.STATE;
  }

  /** Returns the character at index `i` of `chunk`, or '' when out of range. */
  static charAt(chunk, i) {
    let result = '';
    if (i < chunk.length) {
      result = chunk.charAt(i);
    }
    return result;
  }

  /** True for space, newline, carriage return and tab. */
  static isWhitespace(c) {
    return c === ' ' || c === '\n' || c === '\r' || c === '\t';
  }

  /** True for single and double quote characters. */
  static isQuote(c) {
    return c === '"' || c === "'";
  }

  /** True when `c` terminates an unquoted attribute value. */
  static isAttribEnd(c) {
    return c === '>' || SAX.isWhitespace(c);
  }

  static isMatch(regex, c) {
    return regex.test(c);
  }

  static notMatch(regex, c) {
    return !SAX.isMatch(regex, c);
  }

  /**
   * Splits a qualified name into prefix and local part.
   * @param {string} name - tag or attribute name
   * @param {boolean} [attribute] - true when `name` is an attribute name
   * @returns {{prefix: string, local: string}}
   */
  static qname(name, attribute) {
    const i = name.indexOf(':');
    const qualName = i < 0 ? ['', name] : name.split(':');
    let prefix = qualName[0];
    let local = qualName[1];
    // <x "xmlns"="http://foo">
    if (attribute && name === 'xmlns') {
      prefix = 'xmlns';
      local = '';
    }
    return {prefix, local};
  }

  /**
   * Feeds a chunk of text through the state machine, emitting events as
   * nodes are recognised.
   * @param {string|object|null} chunk - text to parse; objects are converted
   *   with `toString()`; `null` ends the document.
   * @returns the parser (or the result of `end()` / `errorFunction()`).
   * @throws the stored error if a previous error has not been cleared.
   */
  write(chunk) {
    if (this.error) {
      throw this.error;
    }
    if (this.closed) {
      return this.errorFunction('Cannot write after close. Assign an onready handler.');
    }
    if (chunk === null) {
      return this.end();
    }
    if (typeof chunk === 'object') {
      chunk = chunk.toString();
    }
    let i = 0;
    let c;
    while (true) {
      c = SAX.charAt(chunk, i++);
      this.c = c;
      if (!c) {
        break;
      }
      if (this.trackPosition) {
        this.position++;
        if (c === '\n') {
          this.line++;
          this.column = 0;
        } else {
          this.column++;
        }
      }
      switch (this.state) {
        case this.S.BEGIN:
          this.state = this.S.BEGIN_WHITESPACE;
          if (c === '\uFEFF') {
            continue;
          }
          this.beginWhiteSpace(c);
          continue;

        case this.S.BEGIN_WHITESPACE:
          this.beginWhiteSpace(c);
          continue;

        case this.S.TEXT:
          if (this.sawRoot && !this.closedRoot) {
            // Fast path: consume a run of plain text in one go.
            const starti = i - 1;
            while (c && c !== '<' && c !== '&') {
              c = SAX.charAt(chunk, i++);
              if (c && this.trackPosition) {
                this.position++;
                if (c === '\n') {
                  this.line++;
                  this.column = 0;
                } else {
                  this.column++;
                }
              }
            }
            this.textNode += chunk.substring(starti, i - 1);
          }
          if (c === '<' && !(this.sawRoot && this.closedRoot && !this.strict)) {
            this.state = this.S.OPEN_WAKA;
            this.startTagPosition = this.position;
          } else {
            if (!SAX.isWhitespace(c) && (!this.sawRoot || this.closedRoot)) {
              this.strictFail('Text data outside of root node.');
            }
            if (c === '&') {
              this.state = this.S.TEXT_ENTITY;
            } else {
              this.textNode += c;
            }
          }
          continue;

        case this.S.SCRIPT:
          // only non-strict
          if (c === '<') {
            this.state = this.S.SCRIPT_ENDING;
          } else {
            this.script += c;
          }
          continue;

        case this.S.SCRIPT_ENDING:
          if (c === '/') {
            this.state = this.S.CLOSE_TAG;
          } else {
            this.script += `<${c}`;
            this.state = this.S.SCRIPT;
          }
          continue;

        case this.S.OPEN_WAKA:
          // either a /, ?, !, or text is coming next.
          if (c === '!') {
            this.state = this.S.SGML_DECL;
            this.sgmlDecl = '';
          } else if (SAX.isWhitespace(c)) {
            // wait for it...
          } else if (SAX.isMatch(nameStart, c)) {
            this.state = this.S.OPEN_TAG;
            this.tagName = c;
          } else if (c === '/') {
            this.state = this.S.CLOSE_TAG;
            this.tagName = '';
          } else if (c === '?') {
            this.state = this.S.PROC_INST;
            this.procInstName = this.procInstBody = '';
          } else {
            this.strictFail('Unencoded <');
            // if there was some whitespace, then add that in.
            if (this.startTagPosition + 1 < this.position) {
              const pad = this.position - this.startTagPosition;
              c = new Array(pad).join(' ') + c;
            }
            this.textNode += `<${c}`;
            this.state = this.S.TEXT;
          }
          continue;

        case this.S.SGML_DECL:
          if ((this.sgmlDecl + c).toUpperCase() === this.CDATA) {
            this.emitNode('onopencdata');
            this.state = this.S.CDATA;
            this.sgmlDecl = '';
            this.cdata = '';
          } else if (this.sgmlDecl + c === '--') {
            this.state = this.S.COMMENT;
            this.comment = '';
            this.sgmlDecl = '';
          } else if ((this.sgmlDecl + c).toUpperCase() === this.DOCTYPE) {
            this.state = this.S.DOCTYPE;
            if (this.doctype || this.sawRoot) {
              this.strictFail('Inappropriately located doctype declaration');
            }
            this.doctype = '';
            this.sgmlDecl = '';
          } else if (c === '>') {
            this.emitNode('onsgmldeclaration', this.sgmlDecl);
            this.sgmlDecl = '';
            this.state = this.S.TEXT;
          } else if (SAX.isQuote(c)) {
            this.state = this.S.SGML_DECL_QUOTED;
            // Remember which quote opened the section so SGML_DECL_QUOTED
            // can recognise the matching closing quote.
            this.q = c;
            this.sgmlDecl += c;
          } else {
            this.sgmlDecl += c;
          }
          continue;

        case this.S.SGML_DECL_QUOTED:
          if (c === this.q) {
            this.state = this.S.SGML_DECL;
            this.q = '';
          }
          this.sgmlDecl += c;
          continue;

        case this.S.DOCTYPE:
          if (c === '>') {
            this.state = this.S.TEXT;
            this.emitNode('ondoctype', this.doctype);
            this.doctype = true; // just remember that we saw it.
          } else {
            this.doctype += c;
            if (c === '[') {
              this.state = this.S.DOCTYPE_DTD;
            } else if (SAX.isQuote(c)) {
              this.state = this.S.DOCTYPE_QUOTED;
              this.q = c;
            }
          }
          continue;

        case this.S.DOCTYPE_QUOTED:
          this.doctype += c;
          if (c === this.q) {
            this.q = '';
            this.state = this.S.DOCTYPE;
          }
          continue;

        case this.S.DOCTYPE_DTD:
          this.doctype += c;
          if (c === ']') {
            this.state = this.S.DOCTYPE;
          } else if (SAX.isQuote(c)) {
            this.state = this.S.DOCTYPE_DTD_QUOTED;
            this.q = c;
          }
          continue;

        case this.S.DOCTYPE_DTD_QUOTED:
          this.doctype += c;
          if (c === this.q) {
            this.state = this.S.DOCTYPE_DTD;
            this.q = '';
          }
          continue;

        case this.S.COMMENT:
          if (c === '-') {
            this.state = this.S.COMMENT_ENDING;
          } else {
            this.comment += c;
          }
          continue;

        case this.S.COMMENT_ENDING:
          if (c === '-') {
            this.state = this.S.COMMENT_ENDED;
            this.comment = this.textApplyOptions(this.comment);
            if (this.comment) {
              this.emitNode('oncomment', this.comment);
            }
            this.comment = '';
          } else {
            this.comment += `-${c}`;
            this.state = this.S.COMMENT;
          }
          continue;

        case this.S.COMMENT_ENDED:
          if (c !== '>') {
            this.strictFail('Malformed comment');
            // allow <!-- blah -- bloo --> in non-strict mode,
            // which is a comment of " blah -- bloo "
            this.comment += `--${c}`;
            this.state = this.S.COMMENT;
          } else {
            this.state = this.S.TEXT;
          }
          continue;

        case this.S.CDATA:
          if (c === ']') {
            this.state = this.S.CDATA_ENDING;
          } else {
            this.cdata += c;
          }
          continue;

        case this.S.CDATA_ENDING:
          if (c === ']') {
            this.state = this.S.CDATA_ENDING_2;
          } else {
            this.cdata += `]${c}`;
            this.state = this.S.CDATA;
          }
          continue;

        case this.S.CDATA_ENDING_2:
          if (c === '>') {
            if (this.cdata) {
              this.emitNode('oncdata', this.cdata);
            }
            this.emitNode('onclosecdata');
            this.cdata = '';
            this.state = this.S.TEXT;
          } else if (c === ']') {
            this.cdata += ']';
          } else {
            this.cdata += `]]${c}`;
            this.state = this.S.CDATA;
          }
          continue;

        case this.S.PROC_INST:
          if (c === '?') {
            this.state = this.S.PROC_INST_ENDING;
          } else if (SAX.isWhitespace(c)) {
            this.state = this.S.PROC_INST_BODY;
          } else {
            this.procInstName += c;
          }
          continue;

        case this.S.PROC_INST_BODY:
          if (!this.procInstBody && SAX.isWhitespace(c)) {
            continue;
          } else if (c === '?') {
            this.state = this.S.PROC_INST_ENDING;
          } else {
            this.procInstBody += c;
          }
          continue;

        case this.S.PROC_INST_ENDING:
          if (c === '>') {
            this.emitNode('onprocessinginstruction', {
              name: this.procInstName,
              body: this.procInstBody
            });
            this.procInstName = this.procInstBody = '';
            this.state = this.S.TEXT;
          } else {
            this.procInstBody += `?${c}`;
            this.state = this.S.PROC_INST_BODY;
          }
          continue;

        case this.S.OPEN_TAG:
          if (SAX.isMatch(nameBody, c)) {
            this.tagName += c;
          } else {
            this.newTag();
            if (c === '>') {
              this.openTag();
            } else if (c === '/') {
              this.state = this.S.OPEN_TAG_SLASH;
            } else {
              if (!SAX.isWhitespace(c)) {
                this.strictFail('Invalid character in tag name');
              }
              this.state = this.S.ATTRIB;
            }
          }
          continue;

        case this.S.OPEN_TAG_SLASH:
          if (c === '>') {
            this.openTag(true);
            this.closeTag();
          } else {
            this.strictFail('Forward-slash in opening tag not followed by >');
            this.state = this.S.ATTRIB;
          }
          continue;

        case this.S.ATTRIB:
          // haven't read the attribute name yet.
          if (SAX.isWhitespace(c)) {
            continue;
          } else if (c === '>') {
            this.openTag();
          } else if (c === '/') {
            this.state = this.S.OPEN_TAG_SLASH;
          } else if (SAX.isMatch(nameStart, c)) {
            this.attribName = c;
            this.attribValue = '';
            this.state = this.S.ATTRIB_NAME;
          } else {
            this.strictFail('Invalid attribute name');
          }
          continue;

        case this.S.ATTRIB_NAME:
          if (c === '=') {
            this.state = this.S.ATTRIB_VALUE;
          } else if (c === '>') {
            this.strictFail('Attribute without value');
            this.attribValue = this.attribName;
            this.attrib();
            this.openTag();
          } else if (SAX.isWhitespace(c)) {
            this.state = this.S.ATTRIB_NAME_SAW_WHITE;
          } else if (SAX.isMatch(nameBody, c)) {
            this.attribName += c;
          } else {
            this.strictFail('Invalid attribute name');
          }
          continue;

        case this.S.ATTRIB_NAME_SAW_WHITE:
          if (c === '=') {
            this.state = this.S.ATTRIB_VALUE;
          } else if (SAX.isWhitespace(c)) {
            continue;
          } else {
            this.strictFail('Attribute without value');
            this.tag.attributes[this.attribName] = '';
            this.attribValue = '';
            this.emitNode('onattribute', {
              name: this.attribName,
              value: ''
            });
            this.attribName = '';
            if (c === '>') {
              this.openTag();
            } else if (SAX.isMatch(nameStart, c)) {
              this.attribName = c;
              this.state = this.S.ATTRIB_NAME;
            } else {
              this.strictFail('Invalid attribute name');
              this.state = this.S.ATTRIB;
            }
          }
          continue;

        case this.S.ATTRIB_VALUE:
          if (SAX.isWhitespace(c)) {
            continue;
          } else if (SAX.isQuote(c)) {
            this.q = c;
            this.state = this.S.ATTRIB_VALUE_QUOTED;
          } else {
            this.strictFail('Unquoted attribute value');
            this.state = this.S.ATTRIB_VALUE_UNQUOTED;
            this.attribValue = c;
          }
          continue;

        case this.S.ATTRIB_VALUE_QUOTED:
          if (c !== this.q) {
            if (c === '&') {
              this.state = this.S.ATTRIB_VALUE_ENTITY_Q;
            } else {
              this.attribValue += c;
            }
            continue;
          }
          this.attrib();
          this.q = '';
          this.state = this.S.ATTRIB_VALUE_CLOSED;
          continue;

        case this.S.ATTRIB_VALUE_CLOSED:
          if (SAX.isWhitespace(c)) {
            this.state = this.S.ATTRIB;
          } else if (c === '>') {
            this.openTag();
          } else if (c === '/') {
            this.state = this.S.OPEN_TAG_SLASH;
          } else if (SAX.isMatch(nameStart, c)) {
            this.strictFail('No whitespace between attributes');
            this.attribName = c;
            this.attribValue = '';
            this.state = this.S.ATTRIB_NAME;
          } else {
            this.strictFail('Invalid attribute name');
          }
          continue;

        case this.S.ATTRIB_VALUE_UNQUOTED:
          if (!SAX.isAttribEnd(c)) {
            if (c === '&') {
              this.state = this.S.ATTRIB_VALUE_ENTITY_U;
            } else {
              this.attribValue += c;
            }
            continue;
          }
          this.attrib();
          if (c === '>') {
            this.openTag();
          } else {
            this.state = this.S.ATTRIB;
          }
          continue;

        case this.S.CLOSE_TAG:
          if (!this.tagName) {
            if (SAX.isWhitespace(c)) {
              continue;
            } else if (SAX.notMatch(nameStart, c)) {
              if (this.script) {
                this.script += `</${c}`;
                this.state = this.S.SCRIPT;
              } else {
                this.strictFail('Invalid tagname in closing tag.');
              }
            } else {
              this.tagName = c;
            }
          } else if (c === '>') {
            this.closeTag();
          } else if (SAX.isMatch(nameBody, c)) {
            this.tagName += c;
          } else if (this.script) {
            this.script += `</${this.tagName}`;
            this.tagName = '';
            this.state = this.S.SCRIPT;
          } else {
            if (!SAX.isWhitespace(c)) {
              this.strictFail('Invalid tagname in closing tag');
            }
            this.state = this.S.CLOSE_TAG_SAW_WHITE;
          }
          continue;

        case this.S.CLOSE_TAG_SAW_WHITE:
          if (SAX.isWhitespace(c)) {
            continue;
          }
          if (c === '>') {
            this.closeTag();
          } else {
            this.strictFail('Invalid characters in closing tag');
          }
          continue;

        case this.S.TEXT_ENTITY:
        case this.S.ATTRIB_VALUE_ENTITY_Q:
        case this.S.ATTRIB_VALUE_ENTITY_U: {
          // Work out which buffer receives the decoded entity and which
          // state to return to afterwards.
          let returnState;
          let buffer;
          switch (this.state) {
            case this.S.TEXT_ENTITY:
              returnState = this.S.TEXT;
              buffer = 'textNode';
              break;
            case this.S.ATTRIB_VALUE_ENTITY_Q:
              returnState = this.S.ATTRIB_VALUE_QUOTED;
              buffer = 'attribValue';
              break;
            case this.S.ATTRIB_VALUE_ENTITY_U:
              returnState = this.S.ATTRIB_VALUE_UNQUOTED;
              buffer = 'attribValue';
              break;
            default:
              throw new Error(`Unknown state: ${this.state}`);
          }
          if (c === ';') {
            this[buffer] += this.parseEntity();
            this.entity = '';
            this.state = returnState;
          } else if (SAX.isMatch(this.entity.length ? entityBody : entityStart, c)) {
            this.entity += c;
          } else {
            this.strictFail('Invalid character in entity name');
            this[buffer] += `&${this.entity}${c}`;
            this.entity = '';
            this.state = returnState;
          }
          continue;
        }

        default:
          throw new Error(`Unknown state: ${this.state}`);
      }
    } // while
    if (this.position >= this.bufferCheckPosition) {
      this.checkBufferLength();
    }
    return this;
  }

  /** Calls the handler registered for `event` (e.g. 'ontext'), if any. */
  emit(event, data) {
    if (Object.prototype.hasOwnProperty.call(this.events, event)) {
      const eventName = event.replace(/^on/, '');
      this.events[event](data, eventName, this);
    }
  }

  /** Resets every buffer listed in BUFFERS to the empty string. */
  clearBuffers() {
    for (let i = 0, l = this.BUFFERS.length; i < l; i++) {
      this[this.BUFFERS[i]] = '';
    }
  }

  /** Emits any pending text, cdata and script content. */
  flushBuffers() {
    this.closeText();
    if (this.cdata !== '') {
      this.emitNode('oncdata', this.cdata);
      this.cdata = '';
    }
    if (this.script !== '') {
      this.emitNode('onscript', this.script);
      this.script = '';
    }
  }

  /**
   * Finishes the document: reports an unclosed root / unexpected end,
   * flushes pending text, marks this parser closed and emits 'onend'.
   * @returns a new SAXParser created with the same options.
   */
  end() {
    if (this.sawRoot && !this.closedRoot) {
      this.strictFail('Unclosed root tag');
    }
    if (
      this.state !== this.S.BEGIN &&
      this.state !== this.S.BEGIN_WHITESPACE &&
      this.state !== this.S.TEXT
    ) {
      this.errorFunction('Unexpected end');
    }
    this.closeText();
    this.c = '';
    this.closed = true;
    this.emit('onend');
    return new SAXParser(this.opt);
  }

  /**
   * Records an error on the parser and emits 'onerror'. The next `write()`
   * throws the stored error unless it has been cleared.
   * @param {string} er - error message
   */
  errorFunction(er) {
    this.closeText();
    if (this.trackPosition) {
      er += `\nLine: ${this.line}\nColumn: ${this.column}\nChar: ${this.c}`;
    }
    const error = new Error(er);
    this.error = error;
    this.emit('onerror', error);
    return this;
  }

  /**
   * Finishes the attribute held in attribName/attribValue. Duplicate
   * attribute names on the same tag are ignored (first one wins).
   */
  attrib() {
    if (!this.strict) {
      this.attribName = this.attribName[this.looseCase]();
    }
    // attribList holds [name, value] pairs, so compare against the name.
    // hasOwnProperty is called via Object.prototype because the attributes
    // object may itself contain a key named "hasOwnProperty".
    const alreadySeen =
      this.attribList.some((pair) => pair[0] === this.attribName) ||
      Object.prototype.hasOwnProperty.call(this.tag.attributes, this.attribName);
    if (alreadySeen) {
      this.attribName = this.attribValue = '';
      return;
    }
    if (this.opt.xmlns) {
      const qn = SAX.qname(this.attribName, true);
      const prefix = qn.prefix;
      const local = qn.local;
      if (prefix === 'xmlns') {
        // namespace binding attribute. push the binding into scope
        if (local === 'xml' && this.attribValue !== this.XML_NAMESPACE) {
          this.strictFail(
            `xml: prefix must be bound to ${this.XML_NAMESPACE}\n` +
              `Actual: ${this.attribValue}`
          );
        } else if (local === 'xmlns' && this.attribValue !== this.XMLNS_NAMESPACE) {
          this.strictFail(
            `xmlns: prefix must be bound to ${this.XMLNS_NAMESPACE}\n` +
              `Actual: ${this.attribValue}`
          );
        } else {
          const tag = this.tag;
          const parent = this.tags[this.tags.length - 1] || this;
          if (tag.ns === parent.ns) {
            tag.ns = Object.create(parent.ns);
          }
          tag.ns[local] = this.attribValue;
        }
      }
      // defer onattribute events until all attributes have been seen
      // so any new bindings can take effect. preserve attribute order
      // so deferred events can be emitted in document order
      this.attribList.push([this.attribName, this.attribValue]);
    } else {
      // in non-xmlns mode, we can emit the event right away
      this.tag.attributes[this.attribName] = this.attribValue;
      this.emitNode('onattribute', {
        name: this.attribName,
        value: this.attribValue
      });
    }
    this.attribName = this.attribValue = '';
  }

  /** Creates the tag object for the name in `tagName` and emits 'onopentagstart'. */
  newTag() {
    if (!this.strict) {
      this.tagName = this.tagName[this.looseCase]();
    }
    const parent = this.tags[this.tags.length - 1] || this;
    const tag = (this.tag = {name: this.tagName, attributes: {}});
    // will be overridden if tag contains an xmlns="foo" or xmlns:foo="bar"
    if (this.opt.xmlns) {
      tag.ns = parent.ns;
    }
    this.attribList.length = 0;
    this.emitNode('onopentagstart', tag);
  }

  /**
   * Resolves the entity name accumulated in `this.entity`.
   * @returns {string} the replacement text, or the literal `&name;` when the
   *   entity is unknown or is not a valid numeric character reference.
   */
  parseEntity() {
    let entity = this.entity;
    const entityLC = entity.toLowerCase();
    let num = NaN;
    let numStr = '';
    // Only accept string values: the lookup table has a prototype chain, so
    // names such as "constructor" would otherwise resolve to inherited
    // Object.prototype members.
    const exact = this.ENTITIES[entity];
    if (typeof exact === 'string' && exact.length > 0) {
      return exact;
    }
    const lower = this.ENTITIES[entityLC];
    if (typeof lower === 'string' && lower.length > 0) {
      return lower;
    }
    entity = entityLC;
    if (entity.charAt(0) === '#') {
      if (entity.charAt(1) === 'x') {
        entity = entity.slice(2);
        num = parseInt(entity, 16);
        numStr = num.toString(16);
      } else {
        entity = entity.slice(1);
        num = parseInt(entity, 10);
        numStr = num.toString(10);
      }
    }
    entity = entity.replace(/^0+/, '');
    // String.fromCodePoint throws a RangeError outside 0..0x10FFFF, so such
    // references are treated as invalid instead of crashing the parser.
    const outOfRange = num < 0 || num > 0x10ffff;
    if (Number.isNaN(num) || numStr.toLowerCase() !== entity || outOfRange) {
      this.strictFail('Invalid character entity');
      return `&${this.entity};`;
    }
    return String.fromCodePoint(num);
  }

  /** Handles a character seen before the first tag. */
  beginWhiteSpace(c) {
    if (c === '<') {
      this.state = this.S.OPEN_WAKA;
      this.startTagPosition = this.position;
    } else if (!SAX.isWhitespace(c)) {
      // have to process this as a text node.
      // weird, but happens.
      this.strictFail('Non-whitespace before first tag.');
      this.textNode = c;
      this.state = this.S.TEXT;
    }
  }

  /**
   * Reports `message` as an error when the parser is in strict mode;
   * does nothing otherwise.
   * @throws {Error} when not invoked on a SAXParser instance.
   */
  strictFail(message) {
    if (typeof this !== 'object' || !(this instanceof SAXParser)) {
      throw new Error('bad call to strictFail');
    }
    if (this.strict) {
      this.errorFunction(message);
    }
  }

  /** Applies the `trim` and `normalize` options to `text`. */
  textApplyOptions(text) {
    if (this.opt.trim) {
      text = text.trim();
    }
    if (this.opt.normalize) {
      text = text.replace(/\s+/g, ' ');
    }
    return text;
  }

  /** Flushes pending text, then emits `nodeType` with `data`. */
  emitNode(nodeType, data) {
    if (this.textNode) {
      this.closeText();
    }
    this.emit(nodeType, data);
  }

  /** Emits the buffered text node (if non-empty) and resets the buffer. */
  closeText() {
    this.textNode = this.textApplyOptions(this.textNode);
    // TODO: figure out why this.textNode can be "" and "undefined"
    if (this.textNode !== undefined && this.textNode !== '' && this.textNode !== 'undefined') {
      this.emit('ontext', this.textNode);
    }
    this.textNode = '';
  }

  /**
   * Checks every buffer against MAX_BUFFER_LENGTH. Oversized text, cdata and
   * script buffers are emitted early; any other oversized buffer is an error.
   * Also schedules the position at which the next check happens.
   */
  checkBufferLength() {
    const maxAllowed = Math.max(this.opt.MAX_BUFFER_LENGTH, 10);
    let maxActual = 0;
    for (let i = 0, l = this.BUFFERS.length; i < l; i++) {
      const len = this[this.BUFFERS[i]]?.length || 0;
      if (len > maxAllowed) {
        // Text/cdata nodes can get big, and since they're buffered,
        // we can get here under normal conditions.
        // Avoid issues by emitting the text node now,
        // so at least it won't get any bigger.
        switch (this.BUFFERS[i]) {
          case 'textNode':
            this.closeText();
            break;
          case 'cdata':
            this.emitNode('oncdata', this.cdata);
            this.cdata = '';
            break;
          case 'script':
            this.emitNode('onscript', this.script);
            this.script = '';
            break;
          default:
            this.errorFunction(`Max buffer length exceeded: ${this.BUFFERS[i]}`);
        }
      }
      maxActual = Math.max(maxActual, len);
    }
    // schedule the next check for the earliest possible buffer overrun.
    const m = this.opt.MAX_BUFFER_LENGTH - maxActual;
    this.bufferCheckPosition = m + this.position;
  }

  /**
   * Completes the current opening tag: resolves namespaces and emits the
   * deferred attribute events (xmlns mode), pushes the tag on the stack and
   * emits 'onopentag'.
   * @param {boolean} [selfClosing] - true for `<tag/>`
   */
  openTag(selfClosing) {
    if (this.opt.xmlns) {
      // emit namespace binding events
      const tag = this.tag;
      // add namespace info to tag
      const qn = SAX.qname(this.tagName);
      tag.prefix = qn.prefix;
      tag.local = qn.local;
      tag.uri = tag.ns[qn.prefix] || '';
      if (tag.prefix && !tag.uri) {
        this.strictFail(`Unbound namespace prefix: ${JSON.stringify(this.tagName)}`);
        tag.uri = qn.prefix;
      }
      const parent = this.tags[this.tags.length - 1] || this;
      if (tag.ns && parent.ns !== tag.ns) {
        Object.keys(tag.ns).forEach((p) => {
          this.emitNode('onopennamespace', {
            prefix: p,
            uri: tag.ns[p]
          });
        });
      }
      // handle deferred onattribute events
      // Note: do not apply default ns to attributes:
      //   http://www.w3.org/TR/REC-xml-names/#defaulting
      for (let i = 0, l = this.attribList.length; i < l; i++) {
        const nv = this.attribList[i];
        const name = nv[0];
        const value = nv[1];
        const qualName = SAX.qname(name, true);
        const prefix = qualName.prefix;
        const local = qualName.local;
        const uri = prefix === '' ? '' : tag.ns[prefix] || '';
        const a = {
          name,
          value,
          prefix,
          local,
          uri
        };
        // if there's any attributes with an undefined namespace,
        // then fail on them now.
        if (prefix && prefix !== 'xmlns' && !uri) {
          this.strictFail(`Unbound namespace prefix: ${JSON.stringify(prefix)}`);
          a.uri = prefix;
        }
        this.tag.attributes[name] = a;
        this.emitNode('onattribute', a);
      }
      this.attribList.length = 0;
    }
    this.tag.isSelfClosing = Boolean(selfClosing);
    // process the tag
    this.sawRoot = true;
    this.tags.push(this.tag);
    this.emitNode('onopentag', this.tag);
    if (!selfClosing) {
      // special case for <script> in non-strict mode.
      if (!this.noscript && this.tagName.toLowerCase() === 'script') {
        this.state = this.S.SCRIPT;
      } else {
        this.state = this.S.TEXT;
      }
      this.tag = null;
      this.tagName = '';
    }
    this.attribName = this.attribValue = '';
    this.attribList.length = 0;
  }

  /**
   * Handles a closing tag for the name in `tagName`: closes every open tag
   * down to the matching one, emitting 'onclosetag' (and 'onclosenamespace'
   * in xmlns mode) for each.
   */
  closeTag() {
    if (!this.tagName) {
      this.strictFail('Weird empty close tag.');
      this.textNode += '</>';
      this.state = this.S.TEXT;
      return;
    }
    if (this.script) {
      if (this.tagName !== 'script') {
        this.script += `</${this.tagName}>`;
        this.tagName = '';
        this.state = this.S.SCRIPT;
        return;
      }
      this.emitNode('onscript', this.script);
      this.script = '';
    }
    // first make sure that the closing tag actually exists.
    // <a><b></c></b></a> will close everything, otherwise.
    let t = this.tags.length;
    let tagName = this.tagName;
    if (!this.strict) {
      tagName = tagName[this.looseCase]();
    }
    while (t--) {
      const close = this.tags[t];
      if (close.name !== tagName) {
        // fail the first time in strict mode
        this.strictFail('Unexpected close tag');
      } else {
        break;
      }
    }
    // didn't find it.  we already failed for strict, so just abort.
    if (t < 0) {
      this.strictFail(`Unmatched closing tag: ${this.tagName}`);
      this.textNode += `</${this.tagName}>`;
      this.state = this.S.TEXT;
      return;
    }
    this.tagName = tagName;
    let s = this.tags.length;
    while (s-- > t) {
      const tag = (this.tag = this.tags.pop());
      this.tagName = this.tag.name;
      this.emitNode('onclosetag', this.tagName);
      const parent = this.tags[this.tags.length - 1] || this;
      if (this.opt.xmlns && tag.ns !== parent.ns) {
        // remove namespace bindings introduced by tag
        Object.keys(tag.ns).forEach((p) => {
          const n = tag.ns[p];
          this.emitNode('onclosenamespace', {prefix: p, uri: n});
        });
      }
    }
    if (t === 0) {
      this.closedRoot = true;
    }
    this.tagName = this.attribValue = this.attribName = '';
    this.attribList.length = 0;
    this.state = this.S.TEXT;
  }
}

/**
 * Event-based (SAX style) XML parser. Event handlers (`ontext`, `onopentag`,
 * ...) are passed in the options object together with the parser options.
 *
 * @todo Weird inheritance, with some variables initialized in subclass
 */
export class SAXParser extends SAX {
  static ENTITIES = ENTITIES;
  opt = DEFAULT_SAX_PARSER_OPTIONS;
  events = DEFAULT_SAX_EVENTS;

  /**
   * @param {object} [opt] - parser options and event handlers; merged over
   *   the defaults. Emits 'onready' once initialisation is complete.
   */
  constructor(opt) {
    super();
    this.clearBuffers();
    this.opt = opt = {...this.opt, ...opt};
    this.events = {...this.events, ...opt};
    this.q = this.c = '';
    this.opt.lowercase = this.opt.lowercase || this.opt.lowercasetags;
    this.bufferCheckPosition = this.opt.MAX_BUFFER_LENGTH;
    this.looseCase = this.opt.lowercase ? 'toLowerCase' : 'toUpperCase';
    this.tags = [];
    this.closed = this.closedRoot = this.sawRoot = false;
    this.tag = this.error = null;
    this.strict = Boolean(this.opt.strict);
    this.noscript = Boolean(this.opt.strict || this.opt.noscript);
    this.state = this.S.BEGIN;
    this.strictEntities = this.opt.strictEntities;
    this.ENTITIES = this.strictEntities
      ? Object.create(this.XML_ENTITIES)
      : Object.create(this.ENTITIES);
    this.attribList = [];
    // namespaces form a prototype chain.
    // it always points at the current tag,
    // which protos to its parent tag.
    if (this.opt.xmlns) {
      this.ns = Object.create(this.rootNS);
    }
    // mostly just for error reporting
    this.trackPosition = this.opt.position !== false;
    if (this.trackPosition) {
      this.position = this.line = this.column = 0;
    }
    this.emit('onready');
  }

  /** Clears the stored error so that `write()` can be called again. */
  resume() {
    this.error = null;
    return this;
  }

  /** Ends the document; equivalent to `write(null)`. */
  close() {
    return this.write(null);
  }

  /** Emits any buffered text, cdata and script content. */
  flush() {
    this.flushBuffers();
  }
}