UNPKG

@loaders.gl/xml

Version:

Framework-independent loaders for the XML (eXtensible Markup Language) format

1,572 lines (1,447 loc) 42.6 kB
// loaders.gl // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors // This file is forked from https://github.com/Maxim-Mazurok/sax-ts under ISC license, // which in turn is forked from https://github.com/isaacs/sax-js under ISC license // Copyright (c) Isaac Z. Schlueter and Contributors // Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
/* eslint-disable */

/** Names of the events the parser can emit. The matching option key is the name prefixed with `on`. */
export type SAXEventName =
  | 'text'
  | 'processinginstruction'
  | 'sgmldeclaration'
  | 'doctype'
  | 'comment'
  | 'opentagstart'
  | 'attribute'
  | 'opentag'
  | 'closetag'
  | 'opencdata'
  | 'cdata'
  | 'closecdata'
  | 'error'
  | 'end'
  | 'ready'
  | 'script'
  | 'opennamespace'
  | 'closenamespace';

/**
 * Signature of every event handler.
 * @param data event payload (shape depends on the event)
 * @param eventName event name without the `on` prefix
 * @param parser the parser instance that emitted the event
 */
export type SAXEventCallback = (data: any, eventName: SAXEventName, parser: SAXParser) => void;

/** Optional handlers, one per event. */
export type SAXEvents = {
  ontext?: SAXEventCallback;
  onprocessinginstruction?: SAXEventCallback;
  onsgmldeclaration?: SAXEventCallback;
  ondoctype?: SAXEventCallback;
  oncomment?: SAXEventCallback;
  onopentagstart?: SAXEventCallback;
  onattribute?: SAXEventCallback;
  onopentag?: SAXEventCallback;
  onclosetag?: SAXEventCallback;
  onopencdata?: SAXEventCallback;
  oncdata?: SAXEventCallback;
  onclosecdata?: SAXEventCallback;
  onerror?: SAXEventCallback;
  onend?: SAXEventCallback;
  onready?: SAXEventCallback;
  onscript?: SAXEventCallback;
  onopennamespace?: SAXEventCallback;
  onclosenamespace?: SAXEventCallback;
};

/** Parser options: the event handlers plus behavior flags. */
export type SAXParserOptions = SAXEvents & {
  strict?: boolean;
  MAX_BUFFER_LENGTH?: number;
  lowercase?: boolean;
  lowercasetags?: boolean;
  noscript?: boolean;
  strictEntities?: boolean;
  xmlns?: any;
  position?: any;
  trim?: any;
  normalize?: any;
};

const DEFAULT_SAX_EVENTS: Required<SAXEvents> = {
  ontext: () => {},
  onprocessinginstruction: () => {},
  onsgmldeclaration: () => {},
  ondoctype: () => {},
  oncomment: () => {},
  onopentagstart: () => {},
  onattribute: () => {},
  onopentag: () => {},
  onclosetag: () => {},
  onopencdata: () => {},
  oncdata: () => {},
  onclosecdata: () => {},
  onerror: () => {},
  onend: () => {},
  onready: () => {},
  onscript: () => {},
  onopennamespace: () => {},
  onclosenamespace: () => {}
};

const DEFAULT_SAX_PARSER_OPTIONS: Required<SAXParserOptions> = {
  ...DEFAULT_SAX_EVENTS,
  strict: false,
  MAX_BUFFER_LENGTH: 64 * 1024,
  lowercase: false,
  lowercasetags: false,
  noscript: false,
  strictEntities: false,
  xmlns: undefined,
  position: undefined,
  trim: undefined,
  normalize: undefined
};

const EVENTS = [
  'text',
  'processinginstruction',
  'sgmldeclaration',
  'doctype',
  'comment',
  'opentagstart',
  'attribute',
  'opentag',
  'closetag',
  'opencdata',
  'cdata',
  'closecdata',
  'error',
  'end',
  'ready',
  'script',
  'opennamespace',
  'closenamespace'
];

/** Names of the string accumulator fields on the parser (cleared together, length-checked together). */
const BUFFERS = [
  'comment',
  'sgmlDecl',
  'textNode',
  'tagName',
  'doctype',
  'procInstName',
  'procInstBody',
  'entity',
  'attribName',
  'attribValue',
  'cdata',
  'script'
];

// Character classes for the first / subsequent characters of tag, attribute and entity names.
const nameStart =
  /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
const nameBody =
  /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;
const entityStart =
  /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/;
const entityBody =
  /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/;

/**
 * Named entity table. Numeric values are char codes; they are converted to
 * one-character strings by the loop right after this declaration.
 */
export const ENTITIES: {[key: string]: number | string} = {
  amp: '&',
  gt: '>',
  lt: '<',
  quot: '"',
  apos: "'",
  AElig: 198, Aacute: 193, Acirc: 194, Agrave: 192, Aring: 197, Atilde: 195, Auml: 196,
  Ccedil: 199, ETH: 208, Eacute: 201, Ecirc: 202, Egrave: 200, Euml: 203,
  Iacute: 205, Icirc: 206, Igrave: 204, Iuml: 207, Ntilde: 209,
  Oacute: 211, Ocirc: 212, Ograve: 210, Oslash: 216, Otilde: 213, Ouml: 214, THORN: 222,
  Uacute: 218, Ucirc: 219, Ugrave: 217, Uuml: 220, Yacute: 221,
  aacute: 225, acirc: 226, aelig: 230, agrave: 224, aring: 229, atilde: 227, auml: 228,
  ccedil: 231, eacute: 233, ecirc: 234, egrave: 232, eth: 240, euml: 235,
  iacute: 237, icirc: 238, igrave: 236, iuml: 239, ntilde: 241,
  oacute: 243, ocirc: 244, ograve: 242, oslash: 248, otilde: 245, ouml: 246,
  szlig: 223, thorn: 254, uacute: 250, ucirc: 251, ugrave: 249, uuml: 252,
  yacute: 253, yuml: 255,
  copy: 169, reg: 174, nbsp: 160, iexcl: 161, cent: 162, pound: 163, curren: 164, yen: 165,
  brvbar: 166, sect: 167, uml: 168, ordf: 170, laquo: 171, not: 172, shy: 173, macr: 175,
  deg: 176, plusmn: 177, sup1: 185, sup2: 178, sup3: 179, acute: 180, micro: 181, para: 182,
  middot: 183, cedil: 184, ordm: 186, raquo: 187, frac14: 188, frac12: 189, frac34: 190,
  iquest: 191, times: 215, divide: 247,
  OElig: 338, oelig: 339, Scaron: 352, scaron: 353, Yuml: 376, fnof: 402, circ: 710, tilde: 732,
  Alpha: 913, Beta: 914, Gamma: 915, Delta: 916, Epsilon: 917, Zeta: 918, Eta: 919, Theta: 920,
  Iota: 921, Kappa: 922, Lambda: 923, Mu: 924, Nu: 925, Xi: 926, Omicron: 927, Pi: 928,
  Rho: 929, Sigma: 931, Tau: 932, Upsilon: 933, Phi: 934, Chi: 935, Psi: 936, Omega: 937,
  alpha: 945, beta: 946, gamma: 947, delta: 948, epsilon: 949, zeta: 950, eta: 951, theta: 952,
  iota: 953, kappa: 954, lambda: 955, mu: 956, nu: 957, xi: 958, omicron: 959, pi: 960,
  rho: 961, sigmaf: 962, sigma: 963, tau: 964, upsilon: 965, phi: 966, chi: 967, psi: 968,
  omega: 969, thetasym: 977, upsih: 978, piv: 982,
  ensp: 8194, emsp: 8195, thinsp: 8201, zwnj: 8204, zwj: 8205, lrm: 8206, rlm: 8207,
  ndash: 8211, mdash: 8212, lsquo: 8216, rsquo: 8217, sbquo: 8218, ldquo: 8220, rdquo: 8221,
  bdquo: 8222, dagger: 8224, Dagger: 8225, bull: 8226, hellip: 8230, permil: 8240,
  prime: 8242, Prime: 8243, lsaquo: 8249, rsaquo: 8250, oline: 8254, frasl: 8260, euro: 8364,
  image: 8465, weierp: 8472, real: 8476, trade: 8482, alefsym: 8501,
  larr: 8592, uarr: 8593, rarr: 8594, darr: 8595, harr: 8596, crarr: 8629,
  lArr: 8656, uArr: 8657, rArr: 8658, dArr: 8659, hArr: 8660,
  forall: 8704, part: 8706, exist: 8707, empty: 8709, nabla: 8711, isin: 8712, notin: 8713,
  ni: 8715, prod: 8719, sum: 8721, minus: 8722, lowast: 8727, radic: 8730, prop: 8733,
  infin: 8734, ang: 8736, and: 8743, or: 8744, cap: 8745, cup: 8746, int: 8747, there4: 8756,
  sim: 8764, cong: 8773, asymp: 8776, ne: 8800, equiv: 8801, le: 8804, ge: 8805,
  sub: 8834, sup: 8835, nsub: 8836, sube: 8838, supe: 8839, oplus: 8853, otimes: 8855,
  perp: 8869, sdot: 8901, lceil: 8968, rceil: 8969, lfloor: 8970, rfloor: 8971,
  lang: 9001, rang: 9002, loz: 9674, spades: 9824, clubs: 9827, hearts: 9829, diams: 9830
};

// Replace every numeric char code in the table with the character itself.
Object.keys(ENTITIES).forEach((key) => {
  const e = ENTITIES[key];
  ENTITIES[key] = typeof e === 'number' ? String.fromCharCode(e) : e;
});

/**
 * Internal helper class: holds the tokenizer state machine.
 * Option-dependent fields (`opt`, `events`, `state`, ...) are initialized by the subclass.
 */
abstract class SAX {
  EVENTS: string[] = EVENTS;
  ENTITIES: {[key: string]: number | string} = {
    // TODO: make it readonly, needed for entity-mega test
    // amp, gt, lt, quot and apos are resolved to strings instead of numerical
    // codes, IDK why
    ...ENTITIES
  };

  protected abstract events: SAXEvents;

  protected XML_ENTITIES: {[key: string]: string} = {
    amp: '&',
    gt: '>',
    lt: '<',
    quot: '"',
    apos: "'"
  };
  // Used as a counter while STATE is being built, then re-pointed at STATE by the constructor.
  protected S: any = 0;
  protected opt: any;
  protected trackPosition = false;
  protected column = 0;
  protected line = 0;
  protected c = '';
  protected error: any;
  protected q = '';
  protected bufferCheckPosition: any;
  protected closed = false;
  protected tags: any[] = [];
  protected looseCase = '';
  protected closedRoot = false;
  protected sawRoot = false;
  protected strict = false;
  protected tag: any;
  protected strictEntities: any;
  protected state: any;
  protected noscript = false;
  protected attribList: any[] = [];
  protected ns: any;
  protected position = 0;

  // State name -> ordinal; the constructor also adds the reverse (ordinal -> name) entries.
  private STATE: {[index: string]: any} = {
    BEGIN: this.S++, // leading byte order mark or whitespace
    BEGIN_WHITESPACE: this.S++, // leading whitespace
    TEXT: this.S++, // general stuff
    TEXT_ENTITY: this.S++, // &amp and such.
    OPEN_WAKA: this.S++, // <
    SGML_DECL: this.S++, // <!BLARG
    SGML_DECL_QUOTED: this.S++, // <!BLARG foo "bar
    DOCTYPE: this.S++, // <!DOCTYPE
    DOCTYPE_QUOTED: this.S++, // <!DOCTYPE "//blah
    DOCTYPE_DTD: this.S++, // <!DOCTYPE "//blah" [ ...
    DOCTYPE_DTD_QUOTED: this.S++, // <!DOCTYPE "//blah" [ "foo
    COMMENT_STARTING: this.S++, // <!-
    COMMENT: this.S++, // <!--
    COMMENT_ENDING: this.S++, // <!-- blah -
    COMMENT_ENDED: this.S++, // <!-- blah --
    CDATA: this.S++, // <![CDATA[ something
    CDATA_ENDING: this.S++, // ]
    CDATA_ENDING_2: this.S++, // ]]
    PROC_INST: this.S++, // <?hi
    PROC_INST_BODY: this.S++, // <?hi there
    PROC_INST_ENDING: this.S++, // <?hi "there" ?
    OPEN_TAG: this.S++, // <strong
    OPEN_TAG_SLASH: this.S++, // <strong /
    ATTRIB: this.S++, // <a
    ATTRIB_NAME: this.S++, // <a foo
    ATTRIB_NAME_SAW_WHITE: this.S++, // <a foo _
    ATTRIB_VALUE: this.S++, // <a foo=
    ATTRIB_VALUE_QUOTED: this.S++, // <a foo="bar
    ATTRIB_VALUE_CLOSED: this.S++, // <a foo="bar"
    ATTRIB_VALUE_UNQUOTED: this.S++, // <a foo=bar
    ATTRIB_VALUE_ENTITY_Q: this.S++, // <foo bar="&quot;"
    ATTRIB_VALUE_ENTITY_U: this.S++, // <foo bar=&quot
    CLOSE_TAG: this.S++, // </a
    CLOSE_TAG_SAW_WHITE: this.S++, // </a >
    SCRIPT: this.S++, // <script> ...
    SCRIPT_ENDING: this.S++ // <script> ... <
  };

  private readonly BUFFERS: string[] = BUFFERS;
  // private parser: (strict: boolean, opt: any) => SAXParser;
  private CDATA = '[CDATA[';
  private DOCTYPE = 'DOCTYPE';
  private XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace';
  private XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/';
  protected rootNS: {} = {
    xml: this.XML_NAMESPACE,
    xmlns: this.XMLNS_NAMESPACE
  };

  private comment: any;
  private sgmlDecl: any;
  private textNode = '';
  private tagName: any;
  private doctype: any;
  private procInstName: any;
  private procInstBody: any;
  private entity = '';
  private attribName: any;
  private attribValue: any;
  private cdata = '';
  private script = '';
  private startTagPosition = 0;

  constructor() {
    this.S = 0;
    // Add reverse (ordinal -> name) entries. Iterate a snapshot of the keys so the
    // entries added here are never themselves visited.
    for (const name of Object.keys(this.STATE)) {
      this.STATE[this.STATE[name]] = name;
    }
    // shorthand
    this.S = this.STATE;
  }

  /** Returns the character at `i`, or `''` when `i` is past the end of `chunk`. */
  private static charAt(chunk: string, i: number): string {
    let result = '';
    if (i < chunk.length) {
      result = chunk.charAt(i);
    }
    return result;
  }

  private static isWhitespace(c: string): boolean {
    return c === ' ' || c === '\n' || c === '\r' || c === '\t';
  }

  private static isQuote(c: string): boolean {
    return c === '"' || c === "'";
  }

  private static isAttribEnd(c: string): boolean {
    return c === '>' || SAX.isWhitespace(c);
  }

  private static isMatch(regex: RegExp, c: string): boolean {
    return regex.test(c);
  }

  private static notMatch(regex: RegExp, c: string): boolean {
    return !SAX.isMatch(regex, c);
  }

  /**
   * Splits a qualified name into prefix and local part at the first `:` separator.
   * For attributes, the bare name `xmlns` is reported as prefix `xmlns` with an empty local part.
   */
  private static qname(
    name: string,
    attribute?: string | boolean
  ): {prefix: string; local: string} {
    const i = name.indexOf(':');
    const qualName = i < 0 ? ['', name] : name.split(':');
    let prefix = qualName[0];
    let local = qualName[1];

    // <x "xmlns"="http://foo">
    if (attribute && name === 'xmlns') {
      prefix = 'xmlns';
      local = '';
    }

    return {prefix, local};
  }

  /**
   * Feeds a chunk of input to the state machine, emitting events as tokens complete.
   * @param chunk text to parse; objects are converted with `toString()`; `null` ends the document
   * @returns the parser (or the result of `end()` when `chunk` is `null`)
   * @throws the stored error if a previous error has not been cleared
   */
  write(chunk: null | object | string): this | SAXParser {
    if (this.error) {
      throw this.error;
    }
    if (this.closed) {
      return this.errorFunction('Cannot write after close. Assign an onready handler.');
    }
    if (chunk === null) {
      return this.end();
    }
    if (typeof chunk === 'object') {
      chunk = chunk.toString();
    }

    let i = 0;
    let c: string;
    while (true) {
      c = SAX.charAt(chunk, i++);
      this.c = c;

      if (!c) {
        break;
      }

      if (this.trackPosition) {
        this.position++;
        if (c === '\n') {
          this.line++;
          this.column = 0;
        } else {
          this.column++;
        }
      }

      switch (this.state) {
        case this.S.BEGIN:
          this.state = this.S.BEGIN_WHITESPACE;
          if (c === '\uFEFF') {
            continue;
          }
          this.beginWhiteSpace(c);
          continue;

        case this.S.BEGIN_WHITESPACE:
          this.beginWhiteSpace(c);
          continue;

        case this.S.TEXT:
          if (this.sawRoot && !this.closedRoot) {
            // Fast path: consume the whole run of plain text in one go.
            const starti = i - 1;
            while (c && c !== '<' && c !== '&') {
              c = SAX.charAt(chunk, i++);
              if (c && this.trackPosition) {
                this.position++;
                if (c === '\n') {
                  this.line++;
                  this.column = 0;
                } else {
                  this.column++;
                }
              }
            }
            this.textNode += chunk.substring(starti, i - 1);
          }
          if (c === '<' && !(this.sawRoot && this.closedRoot && !this.strict)) {
            this.state = this.S.OPEN_WAKA;
            this.startTagPosition = this.position;
          } else {
            if (!SAX.isWhitespace(c) && (!this.sawRoot || this.closedRoot)) {
              this.strictFail('Text data outside of root node.');
            }
            if (c === '&') {
              this.state = this.S.TEXT_ENTITY;
            } else {
              this.textNode += c;
            }
          }
          continue;

        case this.S.SCRIPT:
          // only non-strict
          if (c === '<') {
            this.state = this.S.SCRIPT_ENDING;
          } else {
            this.script += c;
          }
          continue;

        case this.S.SCRIPT_ENDING:
          if (c === '/') {
            this.state = this.S.CLOSE_TAG;
          } else {
            this.script += `<${c}`;
            this.state = this.S.SCRIPT;
          }
          continue;

        case this.S.OPEN_WAKA:
          // either a /, ?, !, or text is coming next.
          if (c === '!') {
            this.state = this.S.SGML_DECL;
            this.sgmlDecl = '';
          } else if (SAX.isWhitespace(c)) {
            // wait for it...
          } else if (SAX.isMatch(nameStart, c)) {
            this.state = this.S.OPEN_TAG;
            this.tagName = c;
          } else if (c === '/') {
            this.state = this.S.CLOSE_TAG;
            this.tagName = '';
          } else if (c === '?') {
            this.state = this.S.PROC_INST;
            this.procInstName = this.procInstBody = '';
          } else {
            this.strictFail('Unencoded <');
            // if there was some whitespace, then add that in.
            if (this.startTagPosition + 1 < this.position) {
              const pad = this.position - this.startTagPosition;
              c = new Array(pad).join(' ') + c;
            }
            this.textNode += `<${c}`;
            this.state = this.S.TEXT;
          }
          continue;

        case this.S.SGML_DECL:
          if ((this.sgmlDecl + c).toUpperCase() === this.CDATA) {
            this.emitNode('onopencdata');
            this.state = this.S.CDATA;
            this.sgmlDecl = '';
            this.cdata = '';
          } else if (this.sgmlDecl + c === '--') {
            this.state = this.S.COMMENT;
            this.comment = '';
            this.sgmlDecl = '';
          } else if ((this.sgmlDecl + c).toUpperCase() === this.DOCTYPE) {
            this.state = this.S.DOCTYPE;
            if (this.doctype || this.sawRoot) {
              this.strictFail('Inappropriately located doctype declaration');
            }
            this.doctype = '';
            this.sgmlDecl = '';
          } else if (c === '>') {
            this.emitNode('onsgmldeclaration', this.sgmlDecl);
            this.sgmlDecl = '';
            this.state = this.S.TEXT;
          } else if (SAX.isQuote(c)) {
            this.state = this.S.SGML_DECL_QUOTED;
            // Remember which quote opened the string; SGML_DECL_QUOTED leaves only on
            // the matching quote, so without this the quoted state would never end.
            this.q = c;
            this.sgmlDecl += c;
          } else {
            this.sgmlDecl += c;
          }
          continue;

        case this.S.SGML_DECL_QUOTED:
          if (c === this.q) {
            this.state = this.S.SGML_DECL;
            this.q = '';
          }
          this.sgmlDecl += c;
          continue;

        case this.S.DOCTYPE:
          if (c === '>') {
            this.state = this.S.TEXT;
            this.emitNode('ondoctype', this.doctype);
            this.doctype = true; // just remember that we saw it.
          } else {
            this.doctype += c;
            if (c === '[') {
              this.state = this.S.DOCTYPE_DTD;
            } else if (SAX.isQuote(c)) {
              this.state = this.S.DOCTYPE_QUOTED;
              this.q = c;
            }
          }
          continue;

        case this.S.DOCTYPE_QUOTED:
          this.doctype += c;
          if (c === this.q) {
            this.q = '';
            this.state = this.S.DOCTYPE;
          }
          continue;

        case this.S.DOCTYPE_DTD:
          this.doctype += c;
          if (c === ']') {
            this.state = this.S.DOCTYPE;
          } else if (SAX.isQuote(c)) {
            this.state = this.S.DOCTYPE_DTD_QUOTED;
            this.q = c;
          }
          continue;

        case this.S.DOCTYPE_DTD_QUOTED:
          this.doctype += c;
          if (c === this.q) {
            this.state = this.S.DOCTYPE_DTD;
            this.q = '';
          }
          continue;

        case this.S.COMMENT:
          if (c === '-') {
            this.state = this.S.COMMENT_ENDING;
          } else {
            this.comment += c;
          }
          continue;

        case this.S.COMMENT_ENDING:
          if (c === '-') {
            this.state = this.S.COMMENT_ENDED;
            this.comment = this.textApplyOptions(this.comment);
            if (this.comment) {
              this.emitNode('oncomment', this.comment);
            }
            this.comment = '';
          } else {
            this.comment += `-${c}`;
            this.state = this.S.COMMENT;
          }
          continue;

        case this.S.COMMENT_ENDED:
          if (c !== '>') {
            this.strictFail('Malformed comment');
            // allow <!-- blah -- bloo --> in non-strict mode,
            // which is a comment of " blah -- bloo "
            this.comment += `--${c}`;
            this.state = this.S.COMMENT;
          } else {
            this.state = this.S.TEXT;
          }
          continue;

        case this.S.CDATA:
          if (c === ']') {
            this.state = this.S.CDATA_ENDING;
          } else {
            this.cdata += c;
          }
          continue;

        case this.S.CDATA_ENDING:
          if (c === ']') {
            this.state = this.S.CDATA_ENDING_2;
          } else {
            this.cdata += `]${c}`;
            this.state = this.S.CDATA;
          }
          continue;

        case this.S.CDATA_ENDING_2:
          if (c === '>') {
            if (this.cdata) {
              this.emitNode('oncdata', this.cdata);
            }
            this.emitNode('onclosecdata');
            this.cdata = '';
            this.state = this.S.TEXT;
          } else if (c === ']') {
            this.cdata += ']';
          } else {
            this.cdata += `]]${c}`;
            this.state = this.S.CDATA;
          }
          continue;

        case this.S.PROC_INST:
          if (c === '?') {
            this.state = this.S.PROC_INST_ENDING;
          } else if (SAX.isWhitespace(c)) {
            this.state = this.S.PROC_INST_BODY;
          } else {
            this.procInstName += c;
          }
          continue;

        case this.S.PROC_INST_BODY:
          if (!this.procInstBody && SAX.isWhitespace(c)) {
            continue;
          } else if (c === '?') {
            this.state = this.S.PROC_INST_ENDING;
          } else {
            this.procInstBody += c;
          }
          continue;

        case this.S.PROC_INST_ENDING:
          if (c === '>') {
            this.emitNode('onprocessinginstruction', {
              name: this.procInstName,
              body: this.procInstBody
            });
            this.procInstName = this.procInstBody = '';
            this.state = this.S.TEXT;
          } else {
            this.procInstBody += `?${c}`;
            this.state = this.S.PROC_INST_BODY;
          }
          continue;

        case this.S.OPEN_TAG:
          if (SAX.isMatch(nameBody, c)) {
            this.tagName += c;
          } else {
            this.newTag();
            if (c === '>') {
              this.openTag();
            } else if (c === '/') {
              this.state = this.S.OPEN_TAG_SLASH;
            } else {
              if (!SAX.isWhitespace(c)) {
                this.strictFail('Invalid character in tag name');
              }
              this.state = this.S.ATTRIB;
            }
          }
          continue;

        case this.S.OPEN_TAG_SLASH:
          if (c === '>') {
            this.openTag(true);
            this.closeTag();
          } else {
            this.strictFail('Forward-slash in opening tag not followed by >');
            this.state = this.S.ATTRIB;
          }
          continue;

        case this.S.ATTRIB:
          // haven't read the attribute name yet.
          if (SAX.isWhitespace(c)) {
            continue;
          } else if (c === '>') {
            this.openTag();
          } else if (c === '/') {
            this.state = this.S.OPEN_TAG_SLASH;
          } else if (SAX.isMatch(nameStart, c)) {
            this.attribName = c;
            this.attribValue = '';
            this.state = this.S.ATTRIB_NAME;
          } else {
            this.strictFail('Invalid attribute name');
          }
          continue;

        case this.S.ATTRIB_NAME:
          if (c === '=') {
            this.state = this.S.ATTRIB_VALUE;
          } else if (c === '>') {
            this.strictFail('Attribute without value');
            this.attribValue = this.attribName;
            this.attrib();
            this.openTag();
          } else if (SAX.isWhitespace(c)) {
            this.state = this.S.ATTRIB_NAME_SAW_WHITE;
          } else if (SAX.isMatch(nameBody, c)) {
            this.attribName += c;
          } else {
            this.strictFail('Invalid attribute name');
          }
          continue;

        case this.S.ATTRIB_NAME_SAW_WHITE:
          if (c === '=') {
            this.state = this.S.ATTRIB_VALUE;
          } else if (SAX.isWhitespace(c)) {
            continue;
          } else {
            this.strictFail('Attribute without value');
            this.tag.attributes[this.attribName] = '';
            this.attribValue = '';
            this.emitNode('onattribute', {
              name: this.attribName,
              value: ''
            });
            this.attribName = '';
            if (c === '>') {
              this.openTag();
            } else if (SAX.isMatch(nameStart, c)) {
              this.attribName = c;
              this.state = this.S.ATTRIB_NAME;
            } else {
              this.strictFail('Invalid attribute name');
              this.state = this.S.ATTRIB;
            }
          }
          continue;

        case this.S.ATTRIB_VALUE:
          if (SAX.isWhitespace(c)) {
            continue;
          } else if (SAX.isQuote(c)) {
            this.q = c;
            this.state = this.S.ATTRIB_VALUE_QUOTED;
          } else {
            this.strictFail('Unquoted attribute value');
            this.state = this.S.ATTRIB_VALUE_UNQUOTED;
            this.attribValue = c;
          }
          continue;

        case this.S.ATTRIB_VALUE_QUOTED:
          if (c !== this.q) {
            if (c === '&') {
              this.state = this.S.ATTRIB_VALUE_ENTITY_Q;
            } else {
              this.attribValue += c;
            }
            continue;
          }
          this.attrib();
          this.q = '';
          this.state = this.S.ATTRIB_VALUE_CLOSED;
          continue;

        case this.S.ATTRIB_VALUE_CLOSED:
          if (SAX.isWhitespace(c)) {
            this.state = this.S.ATTRIB;
          } else if (c === '>') {
            this.openTag();
          } else if (c === '/') {
            this.state = this.S.OPEN_TAG_SLASH;
          } else if (SAX.isMatch(nameStart, c)) {
            this.strictFail('No whitespace between attributes');
            this.attribName = c;
            this.attribValue = '';
            this.state = this.S.ATTRIB_NAME;
          } else {
            this.strictFail('Invalid attribute name');
          }
          continue;

        case this.S.ATTRIB_VALUE_UNQUOTED:
          if (!SAX.isAttribEnd(c)) {
            if (c === '&') {
              this.state = this.S.ATTRIB_VALUE_ENTITY_U;
            } else {
              this.attribValue += c;
            }
            continue;
          }
          this.attrib();
          if (c === '>') {
            this.openTag();
          } else {
            this.state = this.S.ATTRIB;
          }
          continue;

        case this.S.CLOSE_TAG:
          if (!this.tagName) {
            if (SAX.isWhitespace(c)) {
              continue;
            } else if (SAX.notMatch(nameStart, c)) {
              if (this.script) {
                this.script += `</${c}`;
                this.state = this.S.SCRIPT;
              } else {
                this.strictFail('Invalid tagname in closing tag.');
              }
            } else {
              this.tagName = c;
            }
          } else if (c === '>') {
            this.closeTag();
          } else if (SAX.isMatch(nameBody, c)) {
            this.tagName += c;
          } else if (this.script) {
            this.script += `</${this.tagName}`;
            this.tagName = '';
            this.state = this.S.SCRIPT;
          } else {
            if (!SAX.isWhitespace(c)) {
              this.strictFail('Invalid tagname in closing tag');
            }
            this.state = this.S.CLOSE_TAG_SAW_WHITE;
          }
          continue;

        case this.S.CLOSE_TAG_SAW_WHITE:
          if (SAX.isWhitespace(c)) {
            continue;
          }
          if (c === '>') {
            this.closeTag();
          } else {
            this.strictFail('Invalid characters in closing tag');
          }
          continue;

        case this.S.TEXT_ENTITY:
        case this.S.ATTRIB_VALUE_ENTITY_Q:
        case this.S.ATTRIB_VALUE_ENTITY_U: {
          // Pick the state to go back to and the buffer the decoded entity is appended to.
          let returnState;
          let buffer: 'textNode' | 'attribValue';
          switch (this.state) {
            case this.S.TEXT_ENTITY:
              returnState = this.S.TEXT;
              buffer = 'textNode';
              break;

            case this.S.ATTRIB_VALUE_ENTITY_Q:
              returnState = this.S.ATTRIB_VALUE_QUOTED;
              buffer = 'attribValue';
              break;

            case this.S.ATTRIB_VALUE_ENTITY_U:
              returnState = this.S.ATTRIB_VALUE_UNQUOTED;
              buffer = 'attribValue';
              break;

            default:
              throw new Error(`Unknown state: ${this.state}`);
          }

          if (c === ';') {
            this[buffer] += this.parseEntity();
            this.entity = '';
            this.state = returnState;
          } else if (SAX.isMatch(this.entity.length ? entityBody : entityStart, c)) {
            this.entity += c;
          } else {
            this.strictFail('Invalid character in entity name');
            this[buffer] += `&${this.entity}${c}`;
            this.entity = '';
            this.state = returnState;
          }
          continue;
        }

        default:
          throw new Error(`Unknown state: ${this.state}`);
      }
    } // while

    if (this.position >= this.bufferCheckPosition) {
      this.checkBufferLength();
    }
    return this;
  }

  /** Invokes the handler registered under `event` (an `on…` key), if the events object owns one. */
  protected emit(event: string, data?: Error | {}): void {
    if (this.events.hasOwnProperty(event)) {
      const eventName = event.replace(/^on/, '');
      this.events[event](data, eventName, this);
    }
  }

  /** Resets every accumulator named in `BUFFERS` to the empty string. */
  protected clearBuffers(): void {
    for (let i = 0, l = this.BUFFERS.length; i < l; i++) {
      this[this.BUFFERS[i]] = '';
    }
  }

  /** Emits and clears any pending text, cdata and script content. */
  protected flushBuffers(): void {
    this.closeText();
    if (this.cdata !== '') {
      this.emitNode('oncdata', this.cdata);
      this.cdata = '';
    }
    if (this.script !== '') {
      this.emitNode('onscript', this.script);
      this.script = '';
    }
  }

  /**
   * Finishes the document: reports an unclosed root or an unfinished token,
   * flushes pending text, marks the parser closed and emits `onend`.
   * @returns a new parser constructed with the same options
   */
  protected end(): SAXParser {
    if (this.sawRoot && !this.closedRoot) this.strictFail('Unclosed root tag');
    if (
      this.state !== this.S.BEGIN &&
      this.state !== this.S.BEGIN_WHITESPACE &&
      this.state !== this.S.TEXT
    ) {
      this.errorFunction('Unexpected end');
    }
    this.closeText();
    this.c = '';
    this.closed = true;
    this.emit('onend');
    return new SAXParser(this.opt);
  }

  /** Records an error (with position info when tracked), emits `onerror` and returns the parser. */
  protected errorFunction(er: string): this {
    this.closeText();
    if (this.trackPosition) {
      er += `\nLine: ${this.line}\nColumn: ${this.column}\nChar: ${this.c}`;
    }
    const error = new Error(er);
    this.error = error;
    this.emit('onerror', error);
    return this;
  }

  /** Finalizes the attribute currently held in `attribName` / `attribValue`. */
  private attrib(): void {
    if (!this.strict) {
      this.attribName = this.attribName[this.looseCase]();
    }

    // Ignore a repeated attribute.
    // NOTE(review): attribList holds [name, value] pairs, so the indexOf() test on a bare
    // name looks like it can never match; duplicates in xmlns mode are not caught here.
    if (
      this.attribList.indexOf(this.attribName) !== -1 ||
      this.tag.attributes.hasOwnProperty(this.attribName)
    ) {
      this.attribName = this.attribValue = '';
      return;
    }

    if (this.opt.xmlns) {
      const qn = SAX.qname(this.attribName, true);
      const prefix = qn.prefix;
      const local = qn.local;

      if (prefix === 'xmlns') {
        // namespace binding attribute. push the binding into scope
        if (local === 'xml' && this.attribValue !== this.XML_NAMESPACE) {
          this.strictFail(
            `xml: prefix must be bound to ${this.XML_NAMESPACE}\n` + `Actual: ${this.attribValue}`
          );
        } else if (local === 'xmlns' && this.attribValue !== this.XMLNS_NAMESPACE) {
          this.strictFail(
            `xmlns: prefix must be bound to ${this.XMLNS_NAMESPACE}\n` +
              `Actual: ${this.attribValue}`
          );
        } else {
          const tag = this.tag;
          const parent = this.tags[this.tags.length - 1] || this;
          if (tag.ns === parent.ns) {
            tag.ns = Object.create(parent.ns);
          }
          tag.ns[local] = this.attribValue;
        }
      }

      // defer onattribute events until all attributes have been seen
      // so any new bindings can take effect. preserve attribute order
      // so deferred events can be emitted in document order
      this.attribList.push([this.attribName, this.attribValue]);
    } else {
      // in non-xmlns mode, we can emit the event right away
      this.tag.attributes[this.attribName] = this.attribValue;
      this.emitNode('onattribute', {
        name: this.attribName,
        value: this.attribValue
      });
    }

    this.attribName = this.attribValue = '';
  }

  /** Starts a new tag object from `tagName` and emits `onopentagstart`. */
  private newTag(): void {
    if (!this.strict) this.tagName = this.tagName[this.looseCase]();
    const parent = this.tags[this.tags.length - 1] || this;
    const tag: any = (this.tag = {name: this.tagName, attributes: {}});

    // will be overridden if tag contains an xmlns="foo" or xmlns:foo="bar"
    if (this.opt.xmlns) {
      tag.ns = parent.ns;
    }
    this.attribList.length = 0;
    this.emitNode('onopentagstart', tag);
  }

  /**
   * Looks up a named entity. Only string/number values count, so names that merely
   * resolve to inherited `Object.prototype` members (e.g. `toString`) are not entities.
   */
  private namedEntity(name: string): string | number | undefined {
    const value = this.ENTITIES[name];
    return typeof value === 'string' || typeof value === 'number' ? value : undefined;
  }

  /**
   * Decodes the entity name accumulated in `this.entity`.
   * @returns the replacement text, or the literal `&name;` when the entity is unknown or invalid
   */
  private parseEntity(): string | number {
    let entity = this.entity;
    const entityLC = entity.toLowerCase();
    let num = NaN;
    let numStr = '';

    // Exact-case match wins over the lower-cased name.
    const named = this.namedEntity(entity) || this.namedEntity(entityLC);
    if (named) {
      return named;
    }

    entity = entityLC;
    if (entity.charAt(0) === '#') {
      if (entity.charAt(1) === 'x') {
        entity = entity.slice(2);
        num = parseInt(entity, 16);
        numStr = num.toString(16);
      } else {
        entity = entity.slice(1);
        num = parseInt(entity, 10);
        numStr = num.toString(10);
      }
    }
    entity = entity.replace(/^0+/, '');
    // The round-trip comparison rejects trailing garbage; the range check keeps
    // String.fromCodePoint from throwing a RangeError on e.g. &#x110000; or &#-5;
    if (isNaN(num) || numStr.toLowerCase() !== entity || num < 0 || num > 0x10ffff) {
      this.strictFail('Invalid character entity');
      return `&${this.entity};`;
    }

    return String.fromCodePoint(num);
  }

  /** Handles a character seen before the first tag. */
  private beginWhiteSpace(c: string): void {
    if (c === '<') {
      this.state = this.S.OPEN_WAKA;
      this.startTagPosition = this.position;
    } else if (!SAX.isWhitespace(c)) {
      // have to process this as a text node.
      // weird, but happens.
      this.strictFail('Non-whitespace before first tag.');
      this.textNode = c;
      this.state = this.S.TEXT;
    }
  }

  /** Raises `message` as an error in strict mode; does nothing otherwise. */
  private strictFail(message: string): void {
    if (typeof this !== 'object' || !(this instanceof SAXParser)) {
      throw new Error('bad call to strictFail');
    }
    if (this.strict) {
      this.errorFunction(message);
    }
  }

  /** Applies the `trim` and `normalize` options to `text`. */
  private textApplyOptions(text: string): string {
    if (this.opt.trim) text = text.trim();
    if (this.opt.normalize) text = text.replace(/\s+/g, ' ');
    return text;
  }

  /** Flushes pending text, then emits `nodeType`. */
  private emitNode(nodeType: string, data?: {}): void {
    if (this.textNode) this.closeText();
    this.emit(nodeType, data);
  }

  /** Emits the pending text node (if non-empty after options are applied) and clears it. */
  private closeText(): void {
    this.textNode = this.textApplyOptions(this.textNode);
    // textNode is always a string, so only the empty case is skipped; content that
    // is literally "undefined" is real text and must be emitted.
    if (this.textNode !== undefined && this.textNode !== '') {
      this.emit('ontext', this.textNode);
    }
    this.textNode = '';
  }

  /** Flushes or rejects oversized buffers and schedules the next check position. */
  private checkBufferLength(): void {
    const maxAllowed = Math.max(this.opt.MAX_BUFFER_LENGTH, 10);
    let maxActual = 0;
    for (let i = 0, l = this.BUFFERS.length; i < l; i++) {
      const len = this[this.BUFFERS[i]]?.length || 0;
      if (len > maxAllowed) {
        // Text/cdata nodes can get big, and since they're buffered,
        // we can get here under normal conditions.
        // Avoid issues by emitting the text node now,
        // so at least it won't get any bigger.
        switch (this.BUFFERS[i]) {
          case 'textNode':
            this.closeText();
            break;

          case 'cdata':
            this.emitNode('oncdata', this.cdata);
            this.cdata = '';
            break;

          case 'script':
            this.emitNode('onscript', this.script);
            this.script = '';
            break;

          default:
            this.errorFunction(`Max buffer length exceeded: ${this.BUFFERS[i]}`);
        }
      }
      maxActual = Math.max(maxActual, len);
    }
    // schedule the next check for the earliest possible buffer overrun.
    const m = this.opt.MAX_BUFFER_LENGTH - maxActual;
    this.bufferCheckPosition = m + this.position;
  }

  /**
   * Completes the current open tag: resolves namespaces and deferred attributes
   * (xmlns mode), pushes the tag on the stack and emits `onopentag`.
   */
  private openTag(selfClosing?: boolean): void {
    if (this.opt.xmlns) {
      // emit namespace binding events
      const tag = this.tag;

      // add namespace info to tag
      const qn = SAX.qname(this.tagName);
      tag.prefix = qn.prefix;
      tag.local = qn.local;
      tag.uri = tag.ns[qn.prefix] || '';

      if (tag.prefix && !tag.uri) {
        this.strictFail(`Unbound namespace prefix: ${JSON.stringify(this.tagName)}`);
        tag.uri = qn.prefix;
      }

      const parent = this.tags[this.tags.length - 1] || this;
      if (tag.ns && parent.ns !== tag.ns) {
        const that = this;
        Object.keys(tag.ns).forEach((p) => {
          that.emitNode('onopennamespace', {
            prefix: p,
            uri: tag.ns[p]
          });
        });
      }

      // handle deferred onattribute events
      // Note: do not apply default ns to attributes:
      //   http://www.w3.org/TR/REC-xml-names/#defaulting
      for (let i = 0, l = this.attribList.length; i < l; i++) {
        const nv = this.attribList[i];
        const name = nv[0];
        const value = nv[1];
        const qualName = SAX.qname(name, true);
        const prefix = qualName.prefix;
        const local = qualName.local;
        const uri = prefix === '' ? '' : tag.ns[prefix] || '';
        const a = {
          name,
          value,
          prefix,
          local,
          uri
        };

        // if there's any attributes with an undefined namespace,
        // then fail on them now.
        if (prefix && prefix !== 'xmlns' && !uri) {
          this.strictFail(`Unbound namespace prefix: ${JSON.stringify(prefix)}`);
          a.uri = prefix;
        }
        this.tag.attributes[name] = a;
        this.emitNode('onattribute', a);
      }
      this.attribList.length = 0;
    }

    this.tag.isSelfClosing = Boolean(selfClosing);

    // process the tag
    this.sawRoot = true;
    this.tags.push(this.tag);
    this.emitNode('onopentag', this.tag);
    if (!selfClosing) {
      // special case for <script> in non-strict mode.
      if (!this.noscript && this.tagName.toLowerCase() === 'script') {
        this.state = this.S.SCRIPT;
      } else {
        this.state = this.S.TEXT;
      }
      this.tag = null;
      this.tagName = '';
    }
    this.attribName = this.attribValue = '';
    this.attribList.length = 0;
  }

  /**
   * Handles a close tag: pops the tag stack down to the matching open tag,
   * emitting `onclosetag` (and `onclosenamespace` in xmlns mode) for each popped tag.
   */
  private closeTag(): void {
    if (!this.tagName) {
      this.strictFail('Weird empty close tag.');
      this.textNode += '</>';
      this.state = this.S.TEXT;
      return;
    }

    if (this.script) {
      if (this.tagName !== 'script') {
        this.script += `</${this.tagName}>`;
        this.tagName = '';
        this.state = this.S.SCRIPT;
        return;
      }
      this.emitNode('onscript', this.script);
      this.script = '';
    }

    // first make sure that the closing tag actually exists.
    // <a><b></c></b></a> will close everything, otherwise.
    let t = this.tags.length;
    let tagName = this.tagName;
    if (!this.strict) {
      tagName = tagName[this.looseCase]();
    }
    while (t--) {
      const close = this.tags[t];
      if (close.name !== tagName) {
        // fail the first time in strict mode
        this.strictFail('Unexpected close tag');
      } else {
        break;
      }
    }

    // didn't find it.  we already failed for strict, so just abort.
    if (t < 0) {
      this.strictFail(`Unmatched closing tag: ${this.tagName}`);
      this.textNode += `</${this.tagName}>`;
      this.state = this.S.TEXT;
      return;
    }
    this.tagName = tagName;
    let s = this.tags.length;
    while (s-- > t) {
      const tag = (this.tag = this.tags.pop());
      this.tagName = this.tag.name;
      this.emitNode('onclosetag', this.tagName);

      const parent = this.tags[this.tags.length - 1] || this;
      if (this.opt.xmlns && tag.ns !== parent.ns) {
        // remove namespace bindings introduced by tag
        const that = this;
        Object.keys(tag.ns).forEach((p) => {
          const n = tag.ns[p];
          that.emitNode('onclosenamespace', {prefix: p, uri: n});
        });
      }
    }
    if (t === 0) this.closedRoot = true;
    this.tagName = this.attribValue = this.attribName = '';
    this.attribList.length = 0;
    this.state = this.S.TEXT;
  }
}

/**
 * Streaming SAX parser. Pass handlers and flags as options, feed text with
 * `write()`, and finish with `close()`.
 *
 * @todo Weird inheritance, with some variables initialized in subclass
 */
export class SAXParser extends SAX {
  static ENTITIES = ENTITIES;

  opt: Required<SAXParserOptions> = DEFAULT_SAX_PARSER_OPTIONS;

  events: Required<SAXEvents> = DEFAULT_SAX_EVENTS;

  /** @param opt handlers and flags; anything omitted falls back to the defaults */
  constructor(opt?: SAXParserOptions) {
    super();

    this.clearBuffers();

    this.opt = opt = {...this.opt, ...opt};
    this.events = {...this.events, ...opt};

    this.q = this.c = '';
    this.opt.lowercase = this.opt.lowercase || this.opt.lowercasetags;
    this.bufferCheckPosition = this.opt.MAX_BUFFER_LENGTH;
    // Name of the String method used to normalize tag/attribute names in non-strict mode.
    this.looseCase = this.opt.lowercase ? 'toLowerCase' : 'toUpperCase';
    this.tags = [];
    this.closed = this.closedRoot = this.sawRoot = false;
    this.tag = this.error = null;
    this.strict = Boolean(this.opt.strict);
    this.noscript = Boolean(this.opt.strict || this.opt.noscript);
    this.state = this.S.BEGIN;
    this.strictEntities = this.opt.strictEntities;
    this.ENTITIES = this.strictEntities
      ? Object.create(this.XML_ENTITIES)
      : Object.create(this.ENTITIES);
    this.attribList = [];

    // namespaces form a prototype chain.
    // it always points at the current tag,
    // which protos to its parent tag.
    if (this.opt.xmlns) {
      this.ns = Object.create(this.rootNS);
    }

    // mostly just for error reporting
    this.trackPosition = this.opt.position !== false;
    if (this.trackPosition) {
      this.position = this.line = this.column = 0;
    }
    this.emit('onready');
  }

  /** Clears the stored error so that `write()` can be called again. */
  resume(): this {
    this.error = null;
    return this;
  }

  /** Ends the document; equivalent to `write(null)`. */
  close(): this | SAXParser {
    return this.write(null);
  }

  /** Emits any buffered text, cdata and script content immediately. */
  flush(): void {
    this.flushBuffers();
  }
}