// @loaders.gl/xml
// Version:
// Framework-independent loaders for the XML (eXtensible Markup Language) format
// 1,454 lines (1,453 loc) • 48.9 kB
// JavaScript
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors
/**
 * Do-nothing handler for every SAX callback.
 * Handlers supplied through the parser options replace these one by one.
 */
const DEFAULT_SAX_EVENTS = {
  ontext() {},
  onprocessinginstruction() {},
  onsgmldeclaration() {},
  ondoctype() {},
  oncomment() {},
  onopentagstart() {},
  onattribute() {},
  onopentag() {},
  onclosetag() {},
  onopencdata() {},
  oncdata() {},
  onclosecdata() {},
  onerror() {},
  onend() {},
  onready() {},
  onscript() {},
  onopennamespace() {},
  onclosenamespace() {}
};

/**
 * Default parser options: the no-op handlers above plus the behavior switches.
 */
const DEFAULT_SAX_PARSER_OPTIONS = {
  ...DEFAULT_SAX_EVENTS,

  // Strict mode turns recoverable problems into errors.
  strict: false,
  // Buffer size (in characters) after which the parser checks for overruns.
  MAX_BUFFER_LENGTH: 64 * 1024,

  // Name casing in non-strict mode (`lowercasetags` is an alias of `lowercase`).
  lowercase: false,
  lowercasetags: false,

  // When true, <script> content is not treated specially.
  noscript: false,
  // When true, only the five predefined XML entities are recognised.
  strictEntities: false,

  // Left undefined so that the parser applies its own fallbacks.
  xmlns: undefined,
  position: undefined,
  trim: undefined,
  normalize: undefined
};
/** Event names; the matching handler option is `on` + name. */
const EVENTS = [
  'text', 'processinginstruction', 'sgmldeclaration', 'doctype', 'comment',
  'opentagstart', 'attribute', 'opentag', 'closetag',
  'opencdata', 'cdata', 'closecdata',
  'error', 'end', 'ready', 'script',
  'opennamespace', 'closenamespace'
];

/** Names of the parser's string buffer fields (size-checked in checkBufferLength). */
const BUFFERS = [
  'comment', 'sgmlDecl', 'textNode', 'tagName', 'doctype',
  'procInstName', 'procInstBody', 'entity',
  'attribName', 'attribValue', 'cdata', 'script'
];
// Character-class fragments shared by the four matchers below (no enclosing brackets).
const NAME_START_RANGES = String.raw`:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD`;
const NAME_BODY_EXTRAS = String.raw`\u00B7\u0300-\u036F\u203F-\u2040.\d-`;

// First character of a tag/attribute name.
const nameStart = new RegExp(`[${NAME_START_RANGES}]`);
// Any subsequent character of a tag/attribute name.
const nameBody = new RegExp(`[${NAME_START_RANGES}${NAME_BODY_EXTRAS}]`);
// Same as the two above, but additionally accepting `#` (numeric character references).
const entityStart = new RegExp(`[#${NAME_START_RANGES}]`);
const entityBody = new RegExp(`[#${NAME_START_RANGES}${NAME_BODY_EXTRAS}]`);
/**
 * Named character entities recognised by the parser.
 * Entries are written either as the literal character or as a numeric
 * character code; the loop that follows the table converts every numeric
 * entry to its one-character string, so consumers always see strings.
 */
export const ENTITIES = {
  amp: '&',
  gt: '>',
  lt: '<',
  quot: '"',
  apos: "'",
  AElig: 198,
  Aacute: 193,
  Acirc: 194,
  Agrave: 192,
  Aring: 197,
  Atilde: 195,
  Auml: 196,
  Ccedil: 199,
  ETH: 208,
  Eacute: 201,
  Ecirc: 202,
  Egrave: 200,
  Euml: 203,
  Iacute: 205,
  Icirc: 206,
  Igrave: 204,
  Iuml: 207,
  Ntilde: 209,
  Oacute: 211,
  Ocirc: 212,
  Ograve: 210,
  Oslash: 216,
  Otilde: 213,
  Ouml: 214,
  THORN: 222,
  Uacute: 218,
  Ucirc: 219,
  Ugrave: 217,
  Uuml: 220,
  Yacute: 221,
  aacute: 225,
  acirc: 226,
  aelig: 230,
  agrave: 224,
  aring: 229,
  atilde: 227,
  auml: 228,
  ccedil: 231,
  eacute: 233,
  ecirc: 234,
  egrave: 232,
  eth: 240,
  euml: 235,
  iacute: 237,
  icirc: 238,
  igrave: 236,
  iuml: 239,
  ntilde: 241,
  oacute: 243,
  ocirc: 244,
  ograve: 242,
  oslash: 248,
  otilde: 245,
  ouml: 246,
  szlig: 223,
  thorn: 254,
  uacute: 250,
  ucirc: 251,
  ugrave: 249,
  uuml: 252,
  yacute: 253,
  yuml: 255,
  copy: 169,
  reg: 174,
  nbsp: 160,
  iexcl: 161,
  cent: 162,
  pound: 163,
  curren: 164,
  yen: 165,
  brvbar: 166,
  sect: 167,
  uml: 168,
  ordf: 170,
  laquo: 171,
  not: 172,
  shy: 173,
  macr: 175,
  deg: 176,
  plusmn: 177,
  sup1: 185,
  sup2: 178,
  sup3: 179,
  acute: 180,
  micro: 181,
  para: 182,
  middot: 183,
  cedil: 184,
  ordm: 186,
  raquo: 187,
  frac14: 188,
  frac12: 189,
  frac34: 190,
  iquest: 191,
  times: 215,
  divide: 247,
  OElig: 338,
  oelig: 339,
  Scaron: 352,
  scaron: 353,
  Yuml: 376,
  fnof: 402,
  circ: 710,
  tilde: 732,
  Alpha: 913,
  Beta: 914,
  Gamma: 915,
  Delta: 916,
  Epsilon: 917,
  Zeta: 918,
  Eta: 919,
  Theta: 920,
  Iota: 921,
  Kappa: 922,
  Lambda: 923,
  Mu: 924,
  Nu: 925,
  Xi: 926,
  Omicron: 927,
  Pi: 928,
  Rho: 929,
  Sigma: 931,
  Tau: 932,
  Upsilon: 933,
  Phi: 934,
  Chi: 935,
  Psi: 936,
  Omega: 937,
  alpha: 945,
  beta: 946,
  gamma: 947,
  delta: 948,
  epsilon: 949,
  zeta: 950,
  eta: 951,
  theta: 952,
  iota: 953,
  kappa: 954,
  lambda: 955,
  mu: 956,
  nu: 957,
  xi: 958,
  omicron: 959,
  pi: 960,
  rho: 961,
  sigmaf: 962,
  sigma: 963,
  tau: 964,
  upsilon: 965,
  phi: 966,
  chi: 967,
  psi: 968,
  omega: 969,
  thetasym: 977,
  upsih: 978,
  piv: 982,
  ensp: 8194,
  emsp: 8195,
  thinsp: 8201,
  zwnj: 8204,
  zwj: 8205,
  lrm: 8206,
  rlm: 8207,
  ndash: 8211,
  mdash: 8212,
  lsquo: 8216,
  rsquo: 8217,
  sbquo: 8218,
  ldquo: 8220,
  rdquo: 8221,
  bdquo: 8222,
  dagger: 8224,
  Dagger: 8225,
  bull: 8226,
  hellip: 8230,
  permil: 8240,
  prime: 8242,
  Prime: 8243,
  lsaquo: 8249,
  rsaquo: 8250,
  oline: 8254,
  frasl: 8260,
  euro: 8364,
  image: 8465,
  weierp: 8472,
  real: 8476,
  trade: 8482,
  alefsym: 8501,
  larr: 8592,
  uarr: 8593,
  rarr: 8594,
  darr: 8595,
  harr: 8596,
  crarr: 8629,
  lArr: 8656,
  uArr: 8657,
  rArr: 8658,
  dArr: 8659,
  hArr: 8660,
  forall: 8704,
  part: 8706,
  exist: 8707,
  empty: 8709,
  nabla: 8711,
  isin: 8712,
  notin: 8713,
  ni: 8715,
  prod: 8719,
  sum: 8721,
  minus: 8722,
  lowast: 8727,
  radic: 8730,
  prop: 8733,
  infin: 8734,
  ang: 8736,
  and: 8743,
  or: 8744,
  cap: 8745,
  cup: 8746,
  int: 8747,
  there4: 8756,
  sim: 8764,
  cong: 8773,
  asymp: 8776,
  ne: 8800,
  equiv: 8801,
  le: 8804,
  ge: 8805,
  sub: 8834,
  sup: 8835,
  nsub: 8836,
  sube: 8838,
  supe: 8839,
  oplus: 8853,
  otimes: 8855,
  perp: 8869,
  sdot: 8901,
  lceil: 8968,
  rceil: 8969,
  lfloor: 8970,
  rfloor: 8971,
  lang: 9001,
  rang: 9002,
  loz: 9674,
  spades: 9824,
  clubs: 9827,
  hearts: 9829,
  diams: 9830
};

// Replace every numeric code in the table with the character it denotes.
for (const [entityName, entityValue] of Object.entries(ENTITIES)) {
  if (typeof entityValue === 'number') {
    ENTITIES[entityName] = String.fromCharCode(entityValue);
  }
}
/**
* Internal helper class
*/
class SAX {
// Instance reference to the module-level list of event names.
EVENTS = EVENTS;
// Per-instance shallow copy of the module-level entity table.
// The SAXParser constructor later replaces this field with an object that
// inherits from either this copy or XML_ENTITIES (see `strictEntities`).
ENTITIES = {
// TODO: make it readonly, needed for entity-mega test
// amp, gt, lt, quot and apos are resolved to strings instead of numerical
// codes, IDK why
...ENTITIES
};
// The five predefined XML entities; used as the entity table when
// `opt.strictEntities` is truthy.
XML_ENTITIES = {
amp: '&',
gt: '>',
lt: '<',
quot: '"',
apos: "'"
};
// Counter consumed by the STATE initializer below (`this.S++`); the
// constructor afterwards reassigns `S` to the STATE object as a shorthand.
S = 0;
// Effective parser options (assigned by the SAXParser subclass).
opt;
// When true, write() keeps `position`, `line` and `column` up to date.
trackPosition = false;
// Column and line counters, both starting at 0; write() resets `column` on '\n'.
column = 0;
line = 0;
// The character most recently read by write().
c = '';
// Last error recorded by errorFunction(); write() throws it while it is set.
error;
// Quote character that opened the quoted section currently being read.
q = '';
// Value of `position` at which write() next calls checkBufferLength().
bufferCheckPosition;
// Set by end(); write() reports an error once the parser is closed.
closed = false;
// Stack of currently open tags (innermost last).
tags = [];
// Name of the String method ('toLowerCase' or 'toUpperCase') applied to tag
// and attribute names in non-strict mode.
looseCase = '';
// Whether the root element has been closed / has been opened.
closedRoot = false;
sawRoot = false;
// Strict mode: strictFail() reports an error only when this is true.
strict = false;
// The tag object currently being built (between newTag() and openTag()).
tag;
// Copy of `opt.strictEntities`.
strictEntities;
// Current state of the state machine (one of the numeric STATE values).
state;
// When true, <script> elements get no special treatment in openTag().
noscript = false;
// In xmlns mode: [name, value] pairs whose onattribute events are deferred
// until openTag().
attribList = [];
// Namespace bindings visible at the root (only assigned in xmlns mode).
ns;
// Number of characters consumed so far (maintained when trackPosition is true).
position = 0;
// State machine states. Each name is numbered consecutively from 0 through the
// `S` counter; the constructor later adds the reverse (number -> name) entries.
// The trailing comment on each line shows example input for that state.
STATE = {
BEGIN: this.S++, // leading byte order mark or whitespace
BEGIN_WHITESPACE: this.S++, // leading whitespace
TEXT: this.S++, // general stuff
TEXT_ENTITY: this.S++, // & and such.
OPEN_WAKA: this.S++, // <
SGML_DECL: this.S++, // <!BLARG
SGML_DECL_QUOTED: this.S++, // <!BLARG foo "bar
DOCTYPE: this.S++, // <!DOCTYPE
DOCTYPE_QUOTED: this.S++, // <!DOCTYPE "//blah
DOCTYPE_DTD: this.S++, // <!DOCTYPE "//blah" [ ...
DOCTYPE_DTD_QUOTED: this.S++, // <!DOCTYPE "//blah" [ "foo
COMMENT_STARTING: this.S++, // <!-
COMMENT: this.S++, // <!--
COMMENT_ENDING: this.S++, // <!-- blah -
COMMENT_ENDED: this.S++, // <!-- blah --
CDATA: this.S++, // <![CDATA[ something
CDATA_ENDING: this.S++, // ]
CDATA_ENDING_2: this.S++, // ]]
PROC_INST: this.S++, // <?hi
PROC_INST_BODY: this.S++, // <?hi there
PROC_INST_ENDING: this.S++, // <?hi "there" ?
OPEN_TAG: this.S++, // <strong
OPEN_TAG_SLASH: this.S++, // <strong /
ATTRIB: this.S++, // <a
ATTRIB_NAME: this.S++, // <a foo
ATTRIB_NAME_SAW_WHITE: this.S++, // <a foo _
ATTRIB_VALUE: this.S++, // <a foo=
ATTRIB_VALUE_QUOTED: this.S++, // <a foo="bar
ATTRIB_VALUE_CLOSED: this.S++, // <a foo="bar"
ATTRIB_VALUE_UNQUOTED: this.S++, // <a foo=bar
ATTRIB_VALUE_ENTITY_Q: this.S++, // <foo bar="&quot;" (entity inside a quoted value)
ATTRIB_VALUE_ENTITY_U: this.S++, // <foo bar=&quot (entity inside an unquoted value)
CLOSE_TAG: this.S++, // </a
CLOSE_TAG_SAW_WHITE: this.S++, // </a >
SCRIPT: this.S++, // <script> ...
SCRIPT_ENDING: this.S++ // <script> ... <
};
// Instance reference to the module-level list of buffer field names.
BUFFERS = BUFFERS;
// private parser: (strict: boolean, opt: any) => SAXParser;
// Markers compared (case-insensitively) against the text following "<!".
CDATA = '[CDATA[';
DOCTYPE = 'DOCTYPE';
// Namespace URIs that the `xml` and `xmlns` prefixes must be bound to.
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace';
XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/';
// Root of the namespace prototype chain used in xmlns mode. It must be
// declared after the two constants above because it reads them.
rootNS = {
xml: this.XML_NAMESPACE,
xmlns: this.XMLNS_NAMESPACE
};
// String buffers named in BUFFERS; each accumulates the text of the construct
// currently being parsed. Only some of them get an initial value here.
comment;
sgmlDecl;
textNode = '';
tagName;
doctype;
procInstName;
procInstBody;
entity = '';
attribName;
attribValue;
cdata = '';
script = '';
// Value of `position` recorded when the most recent '<' was seen.
startTagPosition = 0;
constructor() {
this.S = 0;
for (const s in this.STATE) {
if (this.STATE.hasOwnProperty(s)) {
this.STATE[this.STATE[s]] = s;
}
}
// shorthand
this.S = this.STATE;
}
static charAt(chunk, i) {
let result = '';
if (i < chunk.length) {
result = chunk.charAt(i);
}
return result;
}
static isWhitespace(c) {
return c === ' ' || c === '\n' || c === '\r' || c === '\t';
}
static isQuote(c) {
return c === '"' || c === "'";
}
static isAttribEnd(c) {
return c === '>' || SAX.isWhitespace(c);
}
static isMatch(regex, c) {
return regex.test(c);
}
static notMatch(regex, c) {
return !SAX.isMatch(regex, c);
}
static qname(name, attribute) {
const i = name.indexOf(':');
const qualName = i < 0 ? ['', name] : name.split(':');
let prefix = qualName[0];
let local = qualName[1];
// <x "xmlns"="http://foo">
if (attribute && name === 'xmlns') {
prefix = 'xmlns';
local = '';
}
return { prefix, local };
}
write(chunk) {
if (this.error) {
throw this.error;
}
if (this.closed) {
return this.errorFunction('Cannot write after close. Assign an onready handler.');
}
if (chunk === null) {
return this.end();
}
if (typeof chunk === 'object') {
chunk = chunk.toString();
}
let i = 0;
let c;
while (true) {
c = SAX.charAt(chunk, i++);
this.c = c;
if (!c) {
break;
}
if (this.trackPosition) {
this.position++;
if (c === '\n') {
this.line++;
this.column = 0;
}
else {
this.column++;
}
}
switch (this.state) {
case this.S.BEGIN:
this.state = this.S.BEGIN_WHITESPACE;
if (c === '\uFEFF') {
continue;
}
this.beginWhiteSpace(c);
continue;
case this.S.BEGIN_WHITESPACE:
this.beginWhiteSpace(c);
continue;
case this.S.TEXT:
if (this.sawRoot && !this.closedRoot) {
const starti = i - 1;
while (c && c !== '<' && c !== '&') {
c = SAX.charAt(chunk, i++);
if (c && this.trackPosition) {
this.position++;
if (c === '\n') {
this.line++;
this.column = 0;
}
else {
this.column++;
}
}
}
this.textNode += chunk.substring(starti, i - 1);
}
if (c === '<' && !(this.sawRoot && this.closedRoot && !this.strict)) {
this.state = this.S.OPEN_WAKA;
this.startTagPosition = this.position;
}
else {
if (!SAX.isWhitespace(c) && (!this.sawRoot || this.closedRoot)) {
this.strictFail('Text data outside of root node.');
}
if (c === '&') {
this.state = this.S.TEXT_ENTITY;
}
else {
this.textNode += c;
}
}
continue;
case this.S.SCRIPT:
// only non-strict
if (c === '<') {
this.state = this.S.SCRIPT_ENDING;
}
else {
this.script += c;
}
continue;
case this.S.SCRIPT_ENDING:
if (c === '/') {
this.state = this.S.CLOSE_TAG;
}
else {
this.script += `<${c}`;
this.state = this.S.SCRIPT;
}
continue;
case this.S.OPEN_WAKA:
// either a /, ?, !, or text is coming next.
if (c === '!') {
this.state = this.S.SGML_DECL;
this.sgmlDecl = '';
}
else if (SAX.isWhitespace(c)) {
// wait for it...
}
else if (SAX.isMatch(nameStart, c)) {
this.state = this.S.OPEN_TAG;
this.tagName = c;
}
else if (c === '/') {
this.state = this.S.CLOSE_TAG;
this.tagName = '';
}
else if (c === '?') {
this.state = this.S.PROC_INST;
this.procInstName = this.procInstBody = '';
}
else {
this.strictFail('Unencoded <');
// if there was some whitespace, then add that in.
if (this.startTagPosition + 1 < this.position) {
const pad = this.position - this.startTagPosition;
c = new Array(pad).join(' ') + c;
}
this.textNode += `<${c}`;
this.state = this.S.TEXT;
}
continue;
case this.S.SGML_DECL:
if ((this.sgmlDecl + c).toUpperCase() === this.CDATA) {
this.emitNode('onopencdata');
this.state = this.S.CDATA;
this.sgmlDecl = '';
this.cdata = '';
}
else if (this.sgmlDecl + c === '--') {
this.state = this.S.COMMENT;
this.comment = '';
this.sgmlDecl = '';
}
else if ((this.sgmlDecl + c).toUpperCase() === this.DOCTYPE) {
this.state = this.S.DOCTYPE;
if (this.doctype || this.sawRoot) {
this.strictFail('Inappropriately located doctype declaration');
}
this.doctype = '';
this.sgmlDecl = '';
}
else if (c === '>') {
this.emitNode('onsgmldeclaration', this.sgmlDecl);
this.sgmlDecl = '';
this.state = this.S.TEXT;
}
else if (SAX.isQuote(c)) {
this.state = this.S.SGML_DECL_QUOTED;
this.sgmlDecl += c;
}
else {
this.sgmlDecl += c;
}
continue;
case this.S.SGML_DECL_QUOTED:
if (c === this.q) {
this.state = this.S.SGML_DECL;
this.q = '';
}
this.sgmlDecl += c;
continue;
case this.S.DOCTYPE:
if (c === '>') {
this.state = this.S.TEXT;
this.emitNode('ondoctype', this.doctype);
this.doctype = true; // just remember that we saw it.
}
else {
this.doctype += c;
if (c === '[') {
this.state = this.S.DOCTYPE_DTD;
}
else if (SAX.isQuote(c)) {
this.state = this.S.DOCTYPE_QUOTED;
this.q = c;
}
}
continue;
case this.S.DOCTYPE_QUOTED:
this.doctype += c;
if (c === this.q) {
this.q = '';
this.state = this.S.DOCTYPE;
}
continue;
case this.S.DOCTYPE_DTD:
this.doctype += c;
if (c === ']') {
this.state = this.S.DOCTYPE;
}
else if (SAX.isQuote(c)) {
this.state = this.S.DOCTYPE_DTD_QUOTED;
this.q = c;
}
continue;
case this.S.DOCTYPE_DTD_QUOTED:
this.doctype += c;
if (c === this.q) {
this.state = this.S.DOCTYPE_DTD;
this.q = '';
}
continue;
case this.S.COMMENT:
if (c === '-') {
this.state = this.S.COMMENT_ENDING;
}
else {
this.comment += c;
}
continue;
case this.S.COMMENT_ENDING:
if (c === '-') {
this.state = this.S.COMMENT_ENDED;
this.comment = this.textApplyOptions(this.comment);
if (this.comment) {
this.emitNode('oncomment', this.comment);
}
this.comment = '';
}
else {
this.comment += `-${c}`;
this.state = this.S.COMMENT;
}
continue;
case this.S.COMMENT_ENDED:
if (c !== '>') {
this.strictFail('Malformed comment');
// allow <!-- blah -- bloo --> in non-strict mode,
// which is a comment of " blah -- bloo "
this.comment += `--${c}`;
this.state = this.S.COMMENT;
}
else {
this.state = this.S.TEXT;
}
continue;
case this.S.CDATA:
if (c === ']') {
this.state = this.S.CDATA_ENDING;
}
else {
this.cdata += c;
}
continue;
case this.S.CDATA_ENDING:
if (c === ']') {
this.state = this.S.CDATA_ENDING_2;
}
else {
this.cdata += `]${c}`;
this.state = this.S.CDATA;
}
continue;
case this.S.CDATA_ENDING_2:
if (c === '>') {
if (this.cdata) {
this.emitNode('oncdata', this.cdata);
}
this.emitNode('onclosecdata');
this.cdata = '';
this.state = this.S.TEXT;
}
else if (c === ']') {
this.cdata += ']';
}
else {
this.cdata += `]]${c}`;
this.state = this.S.CDATA;
}
continue;
case this.S.PROC_INST:
if (c === '?') {
this.state = this.S.PROC_INST_ENDING;
}
else if (SAX.isWhitespace(c)) {
this.state = this.S.PROC_INST_BODY;
}
else {
this.procInstName += c;
}
continue;
case this.S.PROC_INST_BODY:
if (!this.procInstBody && SAX.isWhitespace(c)) {
continue;
}
else if (c === '?') {
this.state = this.S.PROC_INST_ENDING;
}
else {
this.procInstBody += c;
}
continue;
case this.S.PROC_INST_ENDING:
if (c === '>') {
this.emitNode('onprocessinginstruction', {
name: this.procInstName,
body: this.procInstBody
});
this.procInstName = this.procInstBody = '';
this.state = this.S.TEXT;
}
else {
this.procInstBody += `?${c}`;
this.state = this.S.PROC_INST_BODY;
}
continue;
case this.S.OPEN_TAG:
if (SAX.isMatch(nameBody, c)) {
this.tagName += c;
}
else {
this.newTag();
if (c === '>') {
this.openTag();
}
else if (c === '/') {
this.state = this.S.OPEN_TAG_SLASH;
}
else {
if (!SAX.isWhitespace(c)) {
this.strictFail('Invalid character in tag name');
}
this.state = this.S.ATTRIB;
}
}
continue;
case this.S.OPEN_TAG_SLASH:
if (c === '>') {
this.openTag(true);
this.closeTag();
}
else {
this.strictFail('Forward-slash in opening tag not followed by >');
this.state = this.S.ATTRIB;
}
continue;
case this.S.ATTRIB:
// haven't read the attribute name yet.
if (SAX.isWhitespace(c)) {
continue;
}
else if (c === '>') {
this.openTag();
}
else if (c === '/') {
this.state = this.S.OPEN_TAG_SLASH;
}
else if (SAX.isMatch(nameStart, c)) {
this.attribName = c;
this.attribValue = '';
this.state = this.S.ATTRIB_NAME;
}
else {
this.strictFail('Invalid attribute name');
}
continue;
case this.S.ATTRIB_NAME:
if (c === '=') {
this.state = this.S.ATTRIB_VALUE;
}
else if (c === '>') {
this.strictFail('Attribute without value');
this.attribValue = this.attribName;
this.attrib();
this.openTag();
}
else if (SAX.isWhitespace(c)) {
this.state = this.S.ATTRIB_NAME_SAW_WHITE;
}
else if (SAX.isMatch(nameBody, c)) {
this.attribName += c;
}
else {
this.strictFail('Invalid attribute name');
}
continue;
case this.S.ATTRIB_NAME_SAW_WHITE:
if (c === '=') {
this.state = this.S.ATTRIB_VALUE;
}
else if (SAX.isWhitespace(c)) {
continue;
}
else {
this.strictFail('Attribute without value');
this.tag.attributes[this.attribName] = '';
this.attribValue = '';
this.emitNode('onattribute', {
name: this.attribName,
value: ''
});
this.attribName = '';
if (c === '>') {
this.openTag();
}
else if (SAX.isMatch(nameStart, c)) {
this.attribName = c;
this.state = this.S.ATTRIB_NAME;
}
else {
this.strictFail('Invalid attribute name');
this.state = this.S.ATTRIB;
}
}
continue;
case this.S.ATTRIB_VALUE:
if (SAX.isWhitespace(c)) {
continue;
}
else if (SAX.isQuote(c)) {
this.q = c;
this.state = this.S.ATTRIB_VALUE_QUOTED;
}
else {
this.strictFail('Unquoted attribute value');
this.state = this.S.ATTRIB_VALUE_UNQUOTED;
this.attribValue = c;
}
continue;
case this.S.ATTRIB_VALUE_QUOTED:
if (c !== this.q) {
if (c === '&') {
this.state = this.S.ATTRIB_VALUE_ENTITY_Q;
}
else {
this.attribValue += c;
}
continue;
}
this.attrib();
this.q = '';
this.state = this.S.ATTRIB_VALUE_CLOSED;
continue;
case this.S.ATTRIB_VALUE_CLOSED:
if (SAX.isWhitespace(c)) {
this.state = this.S.ATTRIB;
}
else if (c === '>') {
this.openTag();
}
else if (c === '/') {
this.state = this.S.OPEN_TAG_SLASH;
}
else if (SAX.isMatch(nameStart, c)) {
this.strictFail('No whitespace between attributes');
this.attribName = c;
this.attribValue = '';
this.state = this.S.ATTRIB_NAME;
}
else {
this.strictFail('Invalid attribute name');
}
continue;
case this.S.ATTRIB_VALUE_UNQUOTED:
if (!SAX.isAttribEnd(c)) {
if (c === '&') {
this.state = this.S.ATTRIB_VALUE_ENTITY_U;
}
else {
this.attribValue += c;
}
continue;
}
this.attrib();
if (c === '>') {
this.openTag();
}
else {
this.state = this.S.ATTRIB;
}
continue;
case this.S.CLOSE_TAG:
if (!this.tagName) {
if (SAX.isWhitespace(c)) {
continue;
}
else if (SAX.notMatch(nameStart, c)) {
if (this.script) {
this.script += `</${c}`;
this.state = this.S.SCRIPT;
}
else {
this.strictFail('Invalid tagname in closing tag.');
}
}
else {
this.tagName = c;
}
}
else if (c === '>') {
this.closeTag();
}
else if (SAX.isMatch(nameBody, c)) {
this.tagName += c;
}
else if (this.script) {
this.script += `</${this.tagName}`;
this.tagName = '';
this.state = this.S.SCRIPT;
}
else {
if (!SAX.isWhitespace(c)) {
this.strictFail('Invalid tagname in closing tag');
}
this.state = this.S.CLOSE_TAG_SAW_WHITE;
}
continue;
case this.S.CLOSE_TAG_SAW_WHITE:
if (SAX.isWhitespace(c)) {
continue;
}
if (c === '>') {
this.closeTag();
}
else {
this.strictFail('Invalid characters in closing tag');
}
continue;
case this.S.TEXT_ENTITY:
case this.S.ATTRIB_VALUE_ENTITY_Q:
case this.S.ATTRIB_VALUE_ENTITY_U:
let returnState;
let buffer;
switch (this.state) {
case this.S.TEXT_ENTITY:
returnState = this.S.TEXT;
buffer = 'textNode';
break;
case this.S.ATTRIB_VALUE_ENTITY_Q:
returnState = this.S.ATTRIB_VALUE_QUOTED;
buffer = 'attribValue';
break;
case this.S.ATTRIB_VALUE_ENTITY_U:
returnState = this.S.ATTRIB_VALUE_UNQUOTED;
buffer = 'attribValue';
break;
default:
throw new Error(`Unknown state: ${this.state}`);
}
if (c === ';') {
this[buffer] += this.parseEntity();
this.entity = '';
this.state = returnState;
}
else if (SAX.isMatch(this.entity.length ? entityBody : entityStart, c)) {
this.entity += c;
}
else {
this.strictFail('Invalid character in entity name');
this[buffer] += `&${this.entity}${c}`;
this.entity = '';
this.state = returnState;
}
continue;
default:
throw new Error(`Unknown state: ${this.state}`);
}
} // while
if (this.position >= this.bufferCheckPosition) {
this.checkBufferLength();
}
return this;
}
emit(event, data) {
if (this.events.hasOwnProperty(event)) {
const eventName = event.replace(/^on/, '');
this.events[event](data, eventName, this);
}
}
clearBuffers() {
for (let i = 0, l = this.BUFFERS.length; i < l; i++) {
this[this[i]] = '';
}
}
flushBuffers() {
this.closeText();
if (this.cdata !== '') {
this.emitNode('oncdata', this.cdata);
this.cdata = '';
}
if (this.script !== '') {
this.emitNode('onscript', this.script);
this.script = '';
}
}
end() {
if (this.sawRoot && !this.closedRoot)
this.strictFail('Unclosed root tag');
if (this.state !== this.S.BEGIN &&
this.state !== this.S.BEGIN_WHITESPACE &&
this.state !== this.S.TEXT) {
this.errorFunction('Unexpected end');
}
this.closeText();
this.c = '';
this.closed = true;
this.emit('onend');
return new SAXParser(this.opt);
}
errorFunction(er) {
this.closeText();
if (this.trackPosition) {
er += `\nLine: ${this.line}\nColumn: ${this.column}\nChar: ${this.c}`;
}
const error = new Error(er);
this.error = error;
this.emit('onerror', error);
return this;
}
attrib() {
if (!this.strict) {
this.attribName = this.attribName[this.looseCase]();
}
if (this.attribList.indexOf(this.attribName) !== -1 ||
this.tag.attributes.hasOwnProperty(this.attribName)) {
this.attribName = this.attribValue = '';
return;
}
if (this.opt.xmlns) {
const qn = SAX.qname(this.attribName, true);
const prefix = qn.prefix;
const local = qn.local;
if (prefix === 'xmlns') {
// namespace binding attribute. push the binding into scope
if (local === 'xml' && this.attribValue !== this.XML_NAMESPACE) {
this.strictFail(`xml: prefix must be bound to ${this.XML_NAMESPACE}\n` + `Actual: ${this.attribValue}`);
}
else if (local === 'xmlns' && this.attribValue !== this.XMLNS_NAMESPACE) {
this.strictFail(`xmlns: prefix must be bound to ${this.XMLNS_NAMESPACE}\n` +
`Actual: ${this.attribValue}`);
}
else {
const tag = this.tag;
const parent = this.tags[this.tags.length - 1] || this;
if (tag.ns === parent.ns) {
tag.ns = Object.create(parent.ns);
}
tag.ns[local] = this.attribValue;
}
}
// defer onattribute events until all attributes have been seen
// so any new bindings can take effect. preserve attribute order
// so deferred events can be emitted in document order
this.attribList.push([this.attribName, this.attribValue]);
}
else {
// in non-xmlns mode, we can emit the event right away
this.tag.attributes[this.attribName] = this.attribValue;
this.emitNode('onattribute', {
name: this.attribName,
value: this.attribValue
});
}
this.attribName = this.attribValue = '';
}
newTag() {
if (!this.strict)
this.tagName = this.tagName[this.looseCase]();
const parent = this.tags[this.tags.length - 1] || this;
const tag = (this.tag = { name: this.tagName, attributes: {} });
// will be overridden if tag contains an xmlns="foo" or xmlns:foo="bar"
if (this.opt.xmlns) {
tag.ns = parent.ns;
}
this.attribList.length = 0;
this.emitNode('onopentagstart', tag);
}
parseEntity() {
let entity = this.entity;
const entityLC = entity.toLowerCase();
let num = NaN;
let numStr = '';
if (this.ENTITIES[entity]) {
return this.ENTITIES[entity];
}
if (this.ENTITIES[entityLC]) {
return this.ENTITIES[entityLC];
}
entity = entityLC;
if (entity.charAt(0) === '#') {
if (entity.charAt(1) === 'x') {
entity = entity.slice(2);
// TODO: remove tslint:disable
// tslint:disable-next-line
num = parseInt(entity, 16);
numStr = num.toString(16);
}
else {
entity = entity.slice(1);
// TODO: remove tslint:disable
// tslint:disable-next-line
num = parseInt(entity, 10);
numStr = num.toString(10);
}
}
entity = entity.replace(/^0+/, '');
if (isNaN(num) || numStr.toLowerCase() !== entity) {
this.strictFail('Invalid character entity');
return `&${this.entity};`;
}
return String.fromCodePoint(num);
}
beginWhiteSpace(c) {
if (c === '<') {
this.state = this.S.OPEN_WAKA;
this.startTagPosition = this.position;
}
else if (!SAX.isWhitespace(c)) {
// have to process this as a text node.
// weird, but happens.
this.strictFail('Non-whitespace before first tag.');
this.textNode = c;
this.state = this.S.TEXT;
}
else {
}
}
strictFail(message) {
if (typeof this !== 'object' || !(this instanceof SAXParser)) {
throw new Error('bad call to strictFail');
}
if (this.strict) {
this.errorFunction(message);
}
}
textApplyOptions(text) {
if (this.opt.trim)
text = text.trim();
if (this.opt.normalize)
text = text.replace(/\s+/g, ' ');
return text;
}
emitNode(nodeType, data) {
if (this.textNode)
this.closeText();
this.emit(nodeType, data);
}
closeText() {
this.textNode = this.textApplyOptions(this.textNode);
// TODO: figure out why this.textNode can be "" and "undefined"
if (this.textNode !== undefined && this.textNode !== '' && this.textNode !== 'undefined') {
this.emit('ontext', this.textNode);
}
this.textNode = '';
}
checkBufferLength() {
const maxAllowed = Math.max(this.opt.MAX_BUFFER_LENGTH, 10);
let maxActual = 0;
for (let i = 0, l = this.BUFFERS.length; i < l; i++) {
const len = this[this.BUFFERS[i]]?.length || 0;
if (len > maxAllowed) {
// Text/cdata nodes can get big, and since they're buffered,
// we can get here under normal conditions.
// Avoid issues by emitting the text node now,
// so at least it won't get any bigger.
switch (this.BUFFERS[i]) {
case 'textNode':
this.closeText();
break;
case 'cdata':
this.emitNode('oncdata', this.cdata);
this.cdata = '';
break;
case 'script':
this.emitNode('onscript', this.script);
this.script = '';
break;
default:
this.errorFunction(`Max buffer length exceeded: ${this.BUFFERS[i]}`);
}
}
maxActual = Math.max(maxActual, len);
}
// schedule the next check for the earliest possible buffer overrun.
const m = this.opt.MAX_BUFFER_LENGTH - maxActual;
this.bufferCheckPosition = m + this.position;
}
openTag(selfClosing) {
if (this.opt.xmlns) {
// emit namespace binding events
const tag = this.tag;
// add namespace info to tag
const qn = SAX.qname(this.tagName);
tag.prefix = qn.prefix;
tag.local = qn.local;
tag.uri = tag.ns[qn.prefix] || '';
if (tag.prefix && !tag.uri) {
this.strictFail(`Unbound namespace prefix: ${JSON.stringify(this.tagName)}`);
tag.uri = qn.prefix;
}
const parent = this.tags[this.tags.length - 1] || this;
if (tag.ns && parent.ns !== tag.ns) {
const that = this;
Object.keys(tag.ns).forEach((p) => {
that.emitNode('onopennamespace', {
prefix: p,
uri: tag.ns[p]
});
});
}
// handle deferred onattribute events
// Note: do not apply default ns to attributes:
// http://www.w3.org/TR/REC-xml-names/#defaulting
for (let i = 0, l = this.attribList.length; i < l; i++) {
const nv = this.attribList[i];
const name = nv[0];
const value = nv[1];
const qualName = SAX.qname(name, true);
const prefix = qualName.prefix;
const local = qualName.local;
const uri = prefix === '' ? '' : tag.ns[prefix] || '';
const a = {
name,
value,
prefix,
local,
uri
};
// if there's any attributes with an undefined namespace,
// then fail on them now.
if (prefix && prefix !== 'xmlns' && !uri) {
this.strictFail(`Unbound namespace prefix: ${JSON.stringify(prefix)}`);
a.uri = prefix;
}
this.tag.attributes[name] = a;
this.emitNode('onattribute', a);
}
this.attribList.length = 0;
}
this.tag.isSelfClosing = Boolean(selfClosing);
// process the tag
this.sawRoot = true;
this.tags.push(this.tag);
this.emitNode('onopentag', this.tag);
if (!selfClosing) {
// special case for <script> in non-strict mode.
if (!this.noscript && this.tagName.toLowerCase() === 'script') {
this.state = this.S.SCRIPT;
}
else {
this.state = this.S.TEXT;
}
this.tag = null;
this.tagName = '';
}
this.attribName = this.attribValue = '';
this.attribList.length = 0;
}
closeTag() {
if (!this.tagName) {
this.strictFail('Weird empty close tag.');
this.textNode += '</>';
this.state = this.S.TEXT;
return;
}
if (this.script) {
if (this.tagName !== 'script') {
this.script += `</${this.tagName}>`;
this.tagName = '';
this.state = this.S.SCRIPT;
return;
}
this.emitNode('onscript', this.script);
this.script = '';
}
// first make sure that the closing tag actually exists.
// <a><b></c></b></a> will close everything, otherwise.
let t = this.tags.length;
let tagName = this.tagName;
if (!this.strict) {
tagName = tagName[this.looseCase]();
}
while (t--) {
const close = this.tags[t];
if (close.name !== tagName) {
// fail the first time in strict mode
this.strictFail('Unexpected close tag');
}
else {
break;
}
}
// didn't find it. we already failed for strict, so just abort.
if (t < 0) {
this.strictFail(`Unmatched closing tag: ${this.tagName}`);
this.textNode += `</${this.tagName}>`;
this.state = this.S.TEXT;
return;
}
this.tagName = tagName;
let s = this.tags.length;
while (s-- > t) {
const tag = (this.tag = this.tags.pop());
this.tagName = this.tag.name;
this.emitNode('onclosetag', this.tagName);
const x = {};
for (const i in tag.ns) {
if (tag.ns.hasOwnProperty(i)) {
x[i] = tag.ns[i];
}
}
const parent = this.tags[this.tags.length - 1] || this;
if (this.opt.xmlns && tag.ns !== parent.ns) {
// remove namespace bindings introduced by tag
const that = this;
Object.keys(tag.ns).forEach((p) => {
const n = tag.ns[p];
that.emitNode('onclosenamespace', { prefix: p, uri: n });
});
}
}
if (t === 0)
this.closedRoot = true;
this.tagName = this.attribValue = this.attribName = '';
this.attribList.length = 0;
this.state = this.S.TEXT;
}
}
/**
 * Public SAX parser. Merges the caller's options and event handlers with the
 * defaults and initialises the state machine inherited from SAX.
 *
 * @todo Weird inheritance, with some variables initialized in subclass
 */
export class SAXParser extends SAX {
  static ENTITIES = ENTITIES;
  opt = DEFAULT_SAX_PARSER_OPTIONS;
  events = DEFAULT_SAX_EVENTS;

  /**
   * @param opt Options and `on*` handlers overriding the defaults.
   */
  constructor(opt) {
    super();
    this.clearBuffers();

    // Options double as the handler table: `on*` entries are the callbacks.
    const options = { ...this.opt, ...opt };
    this.opt = options;
    this.events = { ...this.events, ...options };

    this.c = '';
    this.q = '';
    options.lowercase = options.lowercase || options.lowercasetags;
    this.bufferCheckPosition = options.MAX_BUFFER_LENGTH;
    this.looseCase = options.lowercase ? 'toLowerCase' : 'toUpperCase';

    this.tags = [];
    this.sawRoot = false;
    this.closedRoot = false;
    this.closed = false;
    this.error = null;
    this.tag = null;

    this.strict = Boolean(options.strict);
    this.noscript = Boolean(options.strict || options.noscript);
    this.state = this.S.BEGIN;

    this.strictEntities = options.strictEntities;
    const entityBase = this.strictEntities ? this.XML_ENTITIES : this.ENTITIES;
    this.ENTITIES = Object.create(entityBase);
    this.attribList = [];

    // namespaces form a prototype chain.
    // it always points at the current tag,
    // which protos to its parent tag.
    if (options.xmlns) {
      this.ns = Object.create(this.rootNS);
    }

    // mostly just for error reporting
    this.trackPosition = options.position !== false;
    if (this.trackPosition) {
      this.column = 0;
      this.line = 0;
      this.position = 0;
    }
    this.emit('onready');
  }

  /** Clears the stored error so that writing can continue. */
  resume() {
    this.error = null;
    return this;
  }

  /** Ends the document; equivalent to `write(null)`. */
  close() {
    return this.write(null);
  }

  /** Emits any buffered text, CDATA and script content. */
  flush() {
    this.flushBuffers();
  }
}