UNPKG

@langchain/core

Version:
1,435 lines (1,434 loc) 55.2 kB
"use strict"; // @ts-nocheck Object.defineProperty(exports, "__esModule", { value: true }); exports.sax = void 0; // Inlined to deal with portability issues // Originally from: https://github.com/isaacs/sax-js const initializeSax = function () { const sax = {}; sax.parser = function (strict, opt) { return new SAXParser(strict, opt); }; sax.SAXParser = SAXParser; sax.SAXStream = SAXStream; sax.createStream = createStream; // When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. // When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)), // since that's the earliest that a buffer overrun could occur. This way, checks are // as rare as required, but as often as necessary to ensure never crossing this bound. // Furthermore, buffers are only tested at most once per write(), so passing a very // large string into write() might have undesirable effects, but this is manageable by // the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme // edge case, result in creating at most one complete copy of the string passed in. // Set to Infinity to have unlimited buffers. sax.MAX_BUFFER_LENGTH = 64 * 1024; const buffers = [ "comment", "sgmlDecl", "textNode", "tagName", "doctype", "procInstName", "procInstBody", "entity", "attribName", "attribValue", "cdata", "script", ]; sax.EVENTS = [ "text", "processinginstruction", "sgmldeclaration", "doctype", "comment", "opentagstart", "attribute", "opentag", "closetag", "opencdata", "cdata", "closecdata", "error", "end", "ready", "script", "opennamespace", "closenamespace", ]; function SAXParser(strict, opt) { if (!(this instanceof SAXParser)) { return new SAXParser(strict, opt); } var parser = this; clearBuffers(parser); parser.q = parser.c = ""; parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH; parser.opt = opt || {}; parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags; parser.looseCase = parser.opt.lowercase ? "toLowerCase" : "toUpperCase"; parser.tags = []; parser.closed = parser.closedRoot = parser.sawRoot = false; parser.tag = parser.error = null; parser.strict = !!strict; parser.noscript = !!(strict || parser.opt.noscript); parser.state = S.BEGIN; parser.strictEntities = parser.opt.strictEntities; parser.ENTITIES = parser.strictEntities ? Object.create(sax.XML_ENTITIES) : Object.create(sax.ENTITIES); parser.attribList = []; // namespaces form a prototype chain. // it always points at the current tag, // which protos to its parent tag. if (parser.opt.xmlns) { parser.ns = Object.create(rootNS); } // mostly just for error reporting parser.trackPosition = parser.opt.position !== false; if (parser.trackPosition) { parser.position = parser.line = parser.column = 0; } emit(parser, "onready"); } if (!Object.create) { Object.create = function (o) { function F() { } F.prototype = o; var newf = new F(); return newf; }; } if (!Object.keys) { Object.keys = function (o) { var a = []; for (var i in o) if (o.hasOwnProperty(i)) a.push(i); return a; }; } function checkBufferLength(parser) { var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10); var maxActual = 0; for (var i = 0, l = buffers.length; i < l; i++) { var len = parser[buffers[i]].length; if (len > maxAllowed) { // Text/cdata nodes can get big, and since they're buffered, // we can get here under normal conditions. // Avoid issues by emitting the text node now, // so at least it won't get any bigger. switch (buffers[i]) { case "textNode": closeText(parser); break; case "cdata": emitNode(parser, "oncdata", parser.cdata); parser.cdata = ""; break; case "script": emitNode(parser, "onscript", parser.script); parser.script = ""; break; default: error(parser, "Max buffer length exceeded: " + buffers[i]); } } maxActual = Math.max(maxActual, len); } // schedule the next check for the earliest possible buffer overrun. var m = sax.MAX_BUFFER_LENGTH - maxActual; parser.bufferCheckPosition = m + parser.position; } function clearBuffers(parser) { for (var i = 0, l = buffers.length; i < l; i++) { parser[buffers[i]] = ""; } } function flushBuffers(parser) { closeText(parser); if (parser.cdata !== "") { emitNode(parser, "oncdata", parser.cdata); parser.cdata = ""; } if (parser.script !== "") { emitNode(parser, "onscript", parser.script); parser.script = ""; } } SAXParser.prototype = { end: function () { end(this); }, write: write, resume: function () { this.error = null; return this; }, close: function () { return this.write(null); }, flush: function () { flushBuffers(this); }, }; var Stream = ReadableStream; if (!Stream) Stream = function () { }; var streamWraps = sax.EVENTS.filter(function (ev) { return ev !== "error" && ev !== "end"; }); function createStream(strict, opt) { return new SAXStream(strict, opt); } function SAXStream(strict, opt) { if (!(this instanceof SAXStream)) { return new SAXStream(strict, opt); } Stream.apply(this); this._parser = new SAXParser(strict, opt); this.writable = true; this.readable = true; var me = this; this._parser.onend = function () { me.emit("end"); }; this._parser.onerror = function (er) { me.emit("error", er); // if didn't throw, then means error was handled. // go ahead and clear error, so we can write again. me._parser.error = null; }; this._decoder = null; streamWraps.forEach(function (ev) { Object.defineProperty(me, "on" + ev, { get: function () { return me._parser["on" + ev]; }, set: function (h) { if (!h) { me.removeAllListeners(ev); me._parser["on" + ev] = h; return h; } me.on(ev, h); }, enumerable: true, configurable: false, }); }); } SAXStream.prototype = Object.create(Stream.prototype, { constructor: { value: SAXStream, }, }); SAXStream.prototype.write = function (data) { this._parser.write(data.toString()); this.emit("data", data); return true; }; SAXStream.prototype.end = function (chunk) { if (chunk && chunk.length) { this.write(chunk); } this._parser.end(); return true; }; SAXStream.prototype.on = function (ev, handler) { var me = this; if (!me._parser["on" + ev] && streamWraps.indexOf(ev) !== -1) { me._parser["on" + ev] = function () { var args = arguments.length === 1 ? [arguments[0]] : Array.apply(null, arguments); args.splice(0, 0, ev); me.emit.apply(me, args); }; } return Stream.prototype.on.call(me, ev, handler); }; // this really needs to be replaced with character classes. // XML allows all manner of ridiculous numbers and digits. var CDATA = "[CDATA["; var DOCTYPE = "DOCTYPE"; var XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"; var XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"; var rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE }; // http://www.w3.org/TR/REC-xml/#NT-NameStartChar // This implementation works on strings, a single character at a time // as such, it cannot ever support astral-plane characters (10000-EFFFF) // without a significant breaking change to either this parser, or the // JavaScript language. Implementation of an emoji-capable xml parser // is left as an exercise for the reader. var nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; var nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/; var entityStart = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/; var entityBody = /[#:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040.\d-]/; function isWhitespace(c) { return c === " " || c === "\n" || c === "\r" || c === "\t"; } function isQuote(c) { return c === '"' || c === "'"; } function isAttribEnd(c) { return c === ">" || isWhitespace(c); } function isMatch(regex, c) { return regex.test(c); } function notMatch(regex, c) { return !isMatch(regex, c); } var S = 0; sax.STATE = { BEGIN: S++, BEGIN_WHITESPACE: S++, TEXT: S++, TEXT_ENTITY: S++, OPEN_WAKA: S++, SGML_DECL: S++, SGML_DECL_QUOTED: S++, DOCTYPE: S++, DOCTYPE_QUOTED: S++, DOCTYPE_DTD: S++, DOCTYPE_DTD_QUOTED: S++, COMMENT_STARTING: S++, COMMENT: S++, COMMENT_ENDING: S++, COMMENT_ENDED: S++, CDATA: S++, CDATA_ENDING: S++, CDATA_ENDING_2: S++, PROC_INST: S++, PROC_INST_BODY: S++, PROC_INST_ENDING: S++, OPEN_TAG: S++, OPEN_TAG_SLASH: S++, ATTRIB: S++, ATTRIB_NAME: S++, ATTRIB_NAME_SAW_WHITE: S++, ATTRIB_VALUE: S++, ATTRIB_VALUE_QUOTED: S++, ATTRIB_VALUE_CLOSED: S++, ATTRIB_VALUE_UNQUOTED: S++, ATTRIB_VALUE_ENTITY_Q: S++, ATTRIB_VALUE_ENTITY_U: S++, CLOSE_TAG: S++, CLOSE_TAG_SAW_WHITE: S++, SCRIPT: S++, SCRIPT_ENDING: S++, // <script> ... < }; sax.XML_ENTITIES = { amp: "&", gt: ">", lt: "<", quot: '"', apos: "'", }; sax.ENTITIES = { amp: "&", gt: ">", lt: "<", quot: '"', apos: "'", AElig: 198, Aacute: 193, Acirc: 194, Agrave: 192, Aring: 197, Atilde: 195, Auml: 196, Ccedil: 199, ETH: 208, Eacute: 201, Ecirc: 202, Egrave: 200, Euml: 203, Iacute: 205, Icirc: 206, Igrave: 204, Iuml: 207, Ntilde: 209, Oacute: 211, Ocirc: 212, Ograve: 210, Oslash: 216, Otilde: 213, Ouml: 214, THORN: 222, Uacute: 218, Ucirc: 219, Ugrave: 217, Uuml: 220, Yacute: 221, aacute: 225, acirc: 226, aelig: 230, agrave: 224, aring: 229, atilde: 227, auml: 228, ccedil: 231, eacute: 233, ecirc: 234, egrave: 232, eth: 240, euml: 235, iacute: 237, icirc: 238, igrave: 236, iuml: 239, ntilde: 241, oacute: 243, ocirc: 244, ograve: 242, oslash: 248, otilde: 245, ouml: 246, szlig: 223, thorn: 254, uacute: 250, ucirc: 251, ugrave: 249, uuml: 252, yacute: 253, yuml: 255, copy: 169, reg: 174, nbsp: 160, iexcl: 161, cent: 162, pound: 163, curren: 164, yen: 165, brvbar: 166, sect: 167, uml: 168, ordf: 170, laquo: 171, not: 172, shy: 173, macr: 175, deg: 176, plusmn: 177, sup1: 185, sup2: 178, sup3: 179, acute: 180, micro: 181, para: 182, middot: 183, cedil: 184, ordm: 186, raquo: 187, frac14: 188, frac12: 189, frac34: 190, iquest: 191, times: 215, divide: 247, OElig: 338, oelig: 339, Scaron: 352, scaron: 353, Yuml: 376, fnof: 402, circ: 710, tilde: 732, Alpha: 913, Beta: 914, Gamma: 915, Delta: 916, Epsilon: 917, Zeta: 918, Eta: 919, Theta: 920, Iota: 921, Kappa: 922, Lambda: 923, Mu: 924, Nu: 925, Xi: 926, Omicron: 927, Pi: 928, Rho: 929, Sigma: 931, Tau: 932, Upsilon: 933, Phi: 934, Chi: 935, Psi: 936, Omega: 937, alpha: 945, beta: 946, gamma: 947, delta: 948, epsilon: 949, zeta: 950, eta: 951, theta: 952, iota: 953, kappa: 954, lambda: 955, mu: 956, nu: 957, xi: 958, omicron: 959, pi: 960, rho: 961, sigmaf: 962, sigma: 963, tau: 964, upsilon: 965, phi: 966, chi: 967, psi: 968, omega: 969, thetasym: 977, upsih: 978, piv: 982, ensp: 8194, emsp: 8195, thinsp: 8201, zwnj: 8204, zwj: 8205, lrm: 8206, rlm: 8207, ndash: 8211, mdash: 8212, lsquo: 8216, rsquo: 8217, sbquo: 8218, ldquo: 8220, rdquo: 8221, bdquo: 8222, dagger: 8224, Dagger: 8225, bull: 8226, hellip: 8230, permil: 8240, prime: 8242, Prime: 8243, lsaquo: 8249, rsaquo: 8250, oline: 8254, frasl: 8260, euro: 8364, image: 8465, weierp: 8472, real: 8476, trade: 8482, alefsym: 8501, larr: 8592, uarr: 8593, rarr: 8594, darr: 8595, harr: 8596, crarr: 8629, lArr: 8656, uArr: 8657, rArr: 8658, dArr: 8659, hArr: 8660, forall: 8704, part: 8706, exist: 8707, empty: 8709, nabla: 8711, isin: 8712, notin: 8713, ni: 8715, prod: 8719, sum: 8721, minus: 8722, lowast: 8727, radic: 8730, prop: 8733, infin: 8734, ang: 8736, and: 8743, or: 8744, cap: 8745, cup: 8746, int: 8747, there4: 8756, sim: 8764, cong: 8773, asymp: 8776, ne: 8800, equiv: 8801, le: 8804, ge: 8805, sub: 8834, sup: 8835, nsub: 8836, sube: 8838, supe: 8839, oplus: 8853, otimes: 8855, perp: 8869, sdot: 8901, lceil: 8968, rceil: 8969, lfloor: 8970, rfloor: 8971, lang: 9001, rang: 9002, loz: 9674, spades: 9824, clubs: 9827, hearts: 9829, diams: 9830, }; Object.keys(sax.ENTITIES).forEach(function (key) { var e = sax.ENTITIES[key]; var s = typeof e === "number" ? String.fromCharCode(e) : e; sax.ENTITIES[key] = s; }); for (var s in sax.STATE) { sax.STATE[sax.STATE[s]] = s; } // shorthand S = sax.STATE; function emit(parser, event, data) { parser[event] && parser[event](data); } function emitNode(parser, nodeType, data) { if (parser.textNode) closeText(parser); emit(parser, nodeType, data); } function closeText(parser) { parser.textNode = textopts(parser.opt, parser.textNode); if (parser.textNode) emit(parser, "ontext", parser.textNode); parser.textNode = ""; } function textopts(opt, text) { if (opt.trim) text = text.trim(); if (opt.normalize) text = text.replace(/\s+/g, " "); return text; } function error(parser, er) { closeText(parser); if (parser.trackPosition) { er += "\nLine: " + parser.line + "\nColumn: " + parser.column + "\nChar: " + parser.c; } er = new Error(er); parser.error = er; emit(parser, "onerror", er); return parser; } function end(parser) { if (parser.sawRoot && !parser.closedRoot) strictFail(parser, "Unclosed root tag"); if (parser.state !== S.BEGIN && parser.state !== S.BEGIN_WHITESPACE && parser.state !== S.TEXT) { error(parser, "Unexpected end"); } closeText(parser); parser.c = ""; parser.closed = true; emit(parser, "onend"); SAXParser.call(parser, parser.strict, parser.opt); return parser; } function strictFail(parser, message) { if (typeof parser !== "object" || !(parser instanceof SAXParser)) { throw new Error("bad call to strictFail"); } if (parser.strict) { error(parser, message); } } function newTag(parser) { if (!parser.strict) parser.tagName = parser.tagName[parser.looseCase](); var parent = parser.tags[parser.tags.length - 1] || parser; var tag = (parser.tag = { name: parser.tagName, attributes: {} }); // will be overridden if tag contails an xmlns="foo" or xmlns:foo="bar" if (parser.opt.xmlns) { tag.ns = parent.ns; } parser.attribList.length = 0; emitNode(parser, "onopentagstart", tag); } function qname(name, attribute) { var i = name.indexOf(":"); var qualName = i < 0 ? ["", name] : name.split(":"); var prefix = qualName[0]; var local = qualName[1]; // <x "xmlns"="http://foo"> if (attribute && name === "xmlns") { prefix = "xmlns"; local = ""; } return { prefix: prefix, local: local }; } function attrib(parser) { if (!parser.strict) { parser.attribName = parser.attribName[parser.looseCase](); } if (parser.attribList.indexOf(parser.attribName) !== -1 || parser.tag.attributes.hasOwnProperty(parser.attribName)) { parser.attribName = parser.attribValue = ""; return; } if (parser.opt.xmlns) { var qn = qname(parser.attribName, true); var prefix = qn.prefix; var local = qn.local; if (prefix === "xmlns") { // namespace binding attribute. push the binding into scope if (local === "xml" && parser.attribValue !== XML_NAMESPACE) { strictFail(parser, "xml: prefix must be bound to " + XML_NAMESPACE + "\n" + "Actual: " + parser.attribValue); } else if (local === "xmlns" && parser.attribValue !== XMLNS_NAMESPACE) { strictFail(parser, "xmlns: prefix must be bound to " + XMLNS_NAMESPACE + "\n" + "Actual: " + parser.attribValue); } else { var tag = parser.tag; var parent = parser.tags[parser.tags.length - 1] || parser; if (tag.ns === parent.ns) { tag.ns = Object.create(parent.ns); } tag.ns[local] = parser.attribValue; } } // defer onattribute events until all attributes have been seen // so any new bindings can take effect. preserve attribute order // so deferred events can be emitted in document order parser.attribList.push([parser.attribName, parser.attribValue]); } else { // in non-xmlns mode, we can emit the event right away parser.tag.attributes[parser.attribName] = parser.attribValue; emitNode(parser, "onattribute", { name: parser.attribName, value: parser.attribValue, }); } parser.attribName = parser.attribValue = ""; } function openTag(parser, selfClosing) { if (parser.opt.xmlns) { // emit namespace binding events var tag = parser.tag; // add namespace info to tag var qn = qname(parser.tagName); tag.prefix = qn.prefix; tag.local = qn.local; tag.uri = tag.ns[qn.prefix] || ""; if (tag.prefix && !tag.uri) { strictFail(parser, "Unbound namespace prefix: " + JSON.stringify(parser.tagName)); tag.uri = qn.prefix; } var parent = parser.tags[parser.tags.length - 1] || parser; if (tag.ns && parent.ns !== tag.ns) { Object.keys(tag.ns).forEach(function (p) { emitNode(parser, "onopennamespace", { prefix: p, uri: tag.ns[p], }); }); } // handle deferred onattribute events // Note: do not apply default ns to attributes: // http://www.w3.org/TR/REC-xml-names/#defaulting for (var i = 0, l = parser.attribList.length; i < l; i++) { var nv = parser.attribList[i]; var name = nv[0]; var value = nv[1]; var qualName = qname(name, true); var prefix = qualName.prefix; var local = qualName.local; var uri = prefix === "" ? "" : tag.ns[prefix] || ""; var a = { name: name, value: value, prefix: prefix, local: local, uri: uri, }; // if there's any attributes with an undefined namespace, // then fail on them now. if (prefix && prefix !== "xmlns" && !uri) { strictFail(parser, "Unbound namespace prefix: " + JSON.stringify(prefix)); a.uri = prefix; } parser.tag.attributes[name] = a; emitNode(parser, "onattribute", a); } parser.attribList.length = 0; } parser.tag.isSelfClosing = !!selfClosing; // process the tag parser.sawRoot = true; parser.tags.push(parser.tag); emitNode(parser, "onopentag", parser.tag); if (!selfClosing) { // special case for <script> in non-strict mode. if (!parser.noscript && parser.tagName.toLowerCase() === "script") { parser.state = S.SCRIPT; } else { parser.state = S.TEXT; } parser.tag = null; parser.tagName = ""; } parser.attribName = parser.attribValue = ""; parser.attribList.length = 0; } function closeTag(parser) { if (!parser.tagName) { strictFail(parser, "Weird empty close tag."); parser.textNode += "</>"; parser.state = S.TEXT; return; } if (parser.script) { if (parser.tagName !== "script") { parser.script += "</" + parser.tagName + ">"; parser.tagName = ""; parser.state = S.SCRIPT; return; } emitNode(parser, "onscript", parser.script); parser.script = ""; } // first make sure that the closing tag actually exists. // <a><b></c></b></a> will close everything, otherwise. var t = parser.tags.length; var tagName = parser.tagName; if (!parser.strict) { tagName = tagName[parser.looseCase](); } var closeTo = tagName; while (t--) { var close = parser.tags[t]; if (close.name !== closeTo) { // fail the first time in strict mode strictFail(parser, "Unexpected close tag"); } else { break; } } // didn't find it. we already failed for strict, so just abort. if (t < 0) { strictFail(parser, "Unmatched closing tag: " + parser.tagName); parser.textNode += "</" + parser.tagName + ">"; parser.state = S.TEXT; return; } parser.tagName = tagName; var s = parser.tags.length; while (s-- > t) { var tag = (parser.tag = parser.tags.pop()); parser.tagName = parser.tag.name; emitNode(parser, "onclosetag", parser.tagName); var x = {}; for (var i in tag.ns) { x[i] = tag.ns[i]; } var parent = parser.tags[parser.tags.length - 1] || parser; if (parser.opt.xmlns && tag.ns !== parent.ns) { // remove namespace bindings introduced by tag Object.keys(tag.ns).forEach(function (p) { var n = tag.ns[p]; emitNode(parser, "onclosenamespace", { prefix: p, uri: n }); }); } } if (t === 0) parser.closedRoot = true; parser.tagName = parser.attribValue = parser.attribName = ""; parser.attribList.length = 0; parser.state = S.TEXT; } function parseEntity(parser) { var entity = parser.entity; var entityLC = entity.toLowerCase(); var num; var numStr = ""; if (parser.ENTITIES[entity]) { return parser.ENTITIES[entity]; } if (parser.ENTITIES[entityLC]) { return parser.ENTITIES[entityLC]; } entity = entityLC; if (entity.charAt(0) === "#") { if (entity.charAt(1) === "x") { entity = entity.slice(2); num = parseInt(entity, 16); numStr = num.toString(16); } else { entity = entity.slice(1); num = parseInt(entity, 10); numStr = num.toString(10); } } entity = entity.replace(/^0+/, ""); if (isNaN(num) || numStr.toLowerCase() !== entity) { strictFail(parser, "Invalid character entity"); return "&" + parser.entity + ";"; } return String.fromCodePoint(num); } function beginWhiteSpace(parser, c) { if (c === "<") { parser.state = S.OPEN_WAKA; parser.startTagPosition = parser.position; } else if (!isWhitespace(c)) { // have to process this as a text node. // weird, but happens. strictFail(parser, "Non-whitespace before first tag."); parser.textNode = c; parser.state = S.TEXT; } } function charAt(chunk, i) { var result = ""; if (i < chunk.length) { result = chunk.charAt(i); } return result; } function write(chunk) { var parser = this; if (this.error) { throw this.error; } if (parser.closed) { return error(parser, "Cannot write after close. Assign an onready handler."); } if (chunk === null) { return end(parser); } if (typeof chunk === "object") { chunk = chunk.toString(); } var i = 0; var c = ""; while (true) { c = charAt(chunk, i++); parser.c = c; if (!c) { break; } if (parser.trackPosition) { parser.position++; if (c === "\n") { parser.line++; parser.column = 0; } else { parser.column++; } } switch (parser.state) { case S.BEGIN: parser.state = S.BEGIN_WHITESPACE; if (c === "\uFEFF") { continue; } beginWhiteSpace(parser, c); continue; case S.BEGIN_WHITESPACE: beginWhiteSpace(parser, c); continue; case S.TEXT: if (parser.sawRoot && !parser.closedRoot) { var starti = i - 1; while (c && c !== "<" && c !== "&") { c = charAt(chunk, i++); if (c && parser.trackPosition) { parser.position++; if (c === "\n") { parser.line++; parser.column = 0; } else { parser.column++; } } } parser.textNode += chunk.substring(starti, i - 1); } if (c === "<" && !(parser.sawRoot && parser.closedRoot && !parser.strict)) { parser.state = S.OPEN_WAKA; parser.startTagPosition = parser.position; } else { if (!isWhitespace(c) && (!parser.sawRoot || parser.closedRoot)) { strictFail(parser, "Text data outside of root node."); } if (c === "&") { parser.state = S.TEXT_ENTITY; } else { parser.textNode += c; } } continue; case S.SCRIPT: // only non-strict if (c === "<") { parser.state = S.SCRIPT_ENDING; } else { parser.script += c; } continue; case S.SCRIPT_ENDING: if (c === "/") { parser.state = S.CLOSE_TAG; } else { parser.script += "<" + c; parser.state = S.SCRIPT; } continue; case S.OPEN_WAKA: // either a /, ?, !, or text is coming next. if (c === "!") { parser.state = S.SGML_DECL; parser.sgmlDecl = ""; } else if (isWhitespace(c)) { // wait for it... } else if (isMatch(nameStart, c)) { parser.state = S.OPEN_TAG; parser.tagName = c; } else if (c === "/") { parser.state = S.CLOSE_TAG; parser.tagName = ""; } else if (c === "?") { parser.state = S.PROC_INST; parser.procInstName = parser.procInstBody = ""; } else { strictFail(parser, "Unencoded <"); // if there was some whitespace, then add that in. if (parser.startTagPosition + 1 < parser.position) { var pad = parser.position - parser.startTagPosition; c = new Array(pad).join(" ") + c; } parser.textNode += "<" + c; parser.state = S.TEXT; } continue; case S.SGML_DECL: if ((parser.sgmlDecl + c).toUpperCase() === CDATA) { emitNode(parser, "onopencdata"); parser.state = S.CDATA; parser.sgmlDecl = ""; parser.cdata = ""; } else if (parser.sgmlDecl + c === "--") { parser.state = S.COMMENT; parser.comment = ""; parser.sgmlDecl = ""; } else if ((parser.sgmlDecl + c).toUpperCase() === DOCTYPE) { parser.state = S.DOCTYPE; if (parser.doctype || parser.sawRoot) { strictFail(parser, "Inappropriately located doctype declaration"); } parser.doctype = ""; parser.sgmlDecl = ""; } else if (c === ">") { emitNode(parser, "onsgmldeclaration", parser.sgmlDecl); parser.sgmlDecl = ""; parser.state = S.TEXT; } else if (isQuote(c)) { parser.state = S.SGML_DECL_QUOTED; parser.sgmlDecl += c; } else { parser.sgmlDecl += c; } continue; case S.SGML_DECL_QUOTED: if (c === parser.q) { parser.state = S.SGML_DECL; parser.q = ""; } parser.sgmlDecl += c; continue; case S.DOCTYPE: if (c === ">") { parser.state = S.TEXT; emitNode(parser, "ondoctype", parser.doctype); parser.doctype = true; // just remember that we saw it. } else { parser.doctype += c; if (c === "[") { parser.state = S.DOCTYPE_DTD; } else if (isQuote(c)) { parser.state = S.DOCTYPE_QUOTED; parser.q = c; } } continue; case S.DOCTYPE_QUOTED: parser.doctype += c; if (c === parser.q) { parser.q = ""; parser.state = S.DOCTYPE; } continue; case S.DOCTYPE_DTD: parser.doctype += c; if (c === "]") { parser.state = S.DOCTYPE; } else if (isQuote(c)) { parser.state = S.DOCTYPE_DTD_QUOTED; parser.q = c; } continue; case S.DOCTYPE_DTD_QUOTED: parser.doctype += c; if (c === parser.q) { parser.state = S.DOCTYPE_DTD; parser.q = ""; } continue; case S.COMMENT: if (c === "-") { parser.state = S.COMMENT_ENDING; } else { parser.comment += c; } continue; case S.COMMENT_ENDING: if (c === "-") { parser.state = S.COMMENT_ENDED; parser.comment = textopts(parser.opt, parser.comment); if (parser.comment) { emitNode(parser, "oncomment", parser.comment); } parser.comment = ""; } else { parser.comment += "-" + c; parser.state = S.COMMENT; } continue; case S.COMMENT_ENDED: if (c !== ">") { strictFail(parser, "Malformed comment"); // allow <!-- blah -- bloo --> in non-strict mode, // which is a comment of " blah -- bloo " parser.comment += "--" + c; parser.state = S.COMMENT; } else { parser.state = S.TEXT; } continue; case S.CDATA: if (c === "]") { parser.state = S.CDATA_ENDING; } else { parser.cdata += c; } continue; case S.CDATA_ENDING: if (c === "]") { parser.state = S.CDATA_ENDING_2; } else { parser.cdata += "]" + c; parser.state = S.CDATA; } continue; case S.CDATA_ENDING_2: if (c === ">") { if (parser.cdata) { emitNode(parser, "oncdata", parser.cdata); } emitNode(parser, "onclosecdata"); parser.cdata = ""; parser.state = S.TEXT; } else if (c === "]") { parser.cdata += "]"; } else { parser.cdata += "]]" + c; parser.state = S.CDATA; } continue; case S.PROC_INST: if (c === "?") { parser.state = S.PROC_INST_ENDING; } else if (isWhitespace(c)) { parser.state = S.PROC_INST_BODY; } else { parser.procInstName += c; } continue; case S.PROC_INST_BODY: if (!parser.procInstBody && isWhitespace(c)) { continue; } else if (c === "?") { parser.state = S.PROC_INST_ENDING; } else { parser.procInstBody += c; } continue; case S.PROC_INST_ENDING: if (c === ">") { emitNode(parser, "onprocessinginstruction", { name: parser.procInstName, body: parser.procInstBody, }); parser.procInstName = parser.procInstBody = ""; parser.state = S.TEXT; } else { parser.procInstBody += "?" + c; parser.state = S.PROC_INST_BODY; } continue; case S.OPEN_TAG: if (isMatch(nameBody, c)) { parser.tagName += c; } else { newTag(parser); if (c === ">") { openTag(parser); } else if (c === "/") { parser.state = S.OPEN_TAG_SLASH; } else { if (!isWhitespace(c)) { strictFail(parser, "Invalid character in tag name"); } parser.state = S.ATTRIB; } } continue; case S.OPEN_TAG_SLASH: if (c === ">") { openTag(parser, true); closeTag(parser); } else { strictFail(parser, "Forward-slash in opening tag not followed by >"); parser.state = S.ATTRIB; } continue; case S.ATTRIB: // haven't read the attribute name yet. if (isWhitespace(c)) { continue; } else if (c === ">") { openTag(parser); } else if (c === "/") { parser.state = S.OPEN_TAG_SLASH; } else if (isMatch(nameStart, c)) { parser.attribName = c; parser.attribValue = ""; parser.state = S.ATTRIB_NAME; } else { strictFail(parser, "Invalid attribute name"); } continue; case S.ATTRIB_NAME: if (c === "=") { parser.state = S.ATTRIB_VALUE; } else if (c === ">") { strictFail(parser, "Attribute without value"); parser.attribValue = parser.attribName; attrib(parser); openTag(parser); } else if (isWhitespace(c)) { parser.state = S.ATTRIB_NAME_SAW_WHITE; } else if (isMatch(nameBody, c)) { parser.attribName += c; } else { strictFail(parser, "Invalid attribute name"); } continue; case S.ATTRIB_NAME_SAW_WHITE: if (c === "=") { parser.state = S.ATTRIB_VALUE; } else if (isWhitespace(c)) { continue; } else { strictFail(parser, "Attribute without value"); parser.tag.attributes[parser.attribName] = ""; parser.attribValue = ""; emitNode(parser, "onattribute", { name: parser.attribName, value: "", }); parser.attribName = ""; if (c === ">") { openTag(parser); } else if (isMatch(nameStart, c)) { parser.attribName = c; parser.state = S.ATTRIB_NAME; } else { strictFail(parser, "Invalid attribute name"); parser.state = S.ATTRIB; } } continue; case S.ATTRIB_VALUE: if (isWhitespace(c)) { continue; } else if (isQuote(c)) { parser.q = c; parser.state = S.ATTRIB_VALUE_QUOTED; } else { strictFail(parser, "Unquoted attribute value"); parser.state = S.ATTRIB_VALUE_UNQUOTED; parser.attribValue = c; } continue; case S.ATTRIB_VALUE_QUOTED: if (c !== parser.q) { if (c === "&") { parser.state = S.ATTRIB_VALUE_ENTITY_Q; } else { parser.attribValue += c; } continue; } attrib(parser); parser.q = ""; parser.state = S.ATTRIB_VALUE_CLOSED; continue; case S.ATTRIB_VALUE_CLOSED: if (isWhitespace(c)) { parser.state = S.ATTRIB; } else if (c === ">") { openTag(parser); } else if (c === "/") { parser.state = S.OPEN_TAG_SLASH; } else if (isMatch(nameStart, c)) { strictFail(parser, "No whitespace between attributes"); parser.attribName = c; parser.attribValue = ""; parser.state = S.ATTRIB_NAME; } else { strictFail(parser, "Invalid attribute name"); } continue; case S.ATTRIB_VALUE_UNQUOTED: if (!isAttribEnd(c)) { if (c === "&") { parser.state = S.ATTRIB_VALUE_ENTITY_U; } else { parser.attribValue += c; } continue; } attrib(parser); if (c === ">") { openTag(parser); } else { parser.state = S.ATTRIB; } continue; case S.CLOSE_TAG: if (!parser.tagName) { if (isWhitespace(c)) { continue; } else if (notMatch(nameStart, c)) { if (parser.script) { parser.script += "</" + c; parser.state = S.SCRIPT; } else { strictFail(parser, "Invalid tagname in closing tag."); } } else { parser.tagName = c; } } else if (c === ">") { closeTag(parser); } else if (isMatch(nameBody, c)) { parser.tagName += c; } else if (parser.script) { parser.script += "</" + parser.tagName; parser.tagName = ""; parser.state = S.SCRIPT; } else { if (!isWhitespace(c)) { strictFail(parser, "Invalid tagname in closing tag"); } parser.state = S.CLOSE_TAG_SAW_WHITE; } continue; case S.CLOSE_TAG_SAW_WHITE: