UNPKG

htmlparser2

Version:

Fast & forgiving HTML/XML parser

github.com/fb55/htmlparser2

fb55/htmlparser2

1,056 lines • 40.6 kB

JavaScript

import { DecodingMode, EntityDecoder, htmlDecodeTree, xmlDecodeTree, } from "entities/decode"; var CharCodes; (function (CharCodes) { CharCodes[CharCodes["Tab"] = 9] = "Tab"; CharCodes[CharCodes["NewLine"] = 10] = "NewLine"; CharCodes[CharCodes["FormFeed"] = 12] = "FormFeed"; CharCodes[CharCodes["CarriageReturn"] = 13] = "CarriageReturn"; CharCodes[CharCodes["Space"] = 32] = "Space"; CharCodes[CharCodes["ExclamationMark"] = 33] = "ExclamationMark"; CharCodes[CharCodes["Number"] = 35] = "Number"; CharCodes[CharCodes["Amp"] = 38] = "Amp"; CharCodes[CharCodes["SingleQuote"] = 39] = "SingleQuote"; CharCodes[CharCodes["DoubleQuote"] = 34] = "DoubleQuote"; CharCodes[CharCodes["Dash"] = 45] = "Dash"; CharCodes[CharCodes["Slash"] = 47] = "Slash"; CharCodes[CharCodes["Zero"] = 48] = "Zero"; CharCodes[CharCodes["Nine"] = 57] = "Nine"; CharCodes[CharCodes["Semi"] = 59] = "Semi"; CharCodes[CharCodes["Lt"] = 60] = "Lt"; CharCodes[CharCodes["Eq"] = 61] = "Eq"; CharCodes[CharCodes["Gt"] = 62] = "Gt"; CharCodes[CharCodes["Questionmark"] = 63] = "Questionmark"; CharCodes[CharCodes["UpperA"] = 65] = "UpperA"; CharCodes[CharCodes["LowerA"] = 97] = "LowerA"; CharCodes[CharCodes["UpperF"] = 70] = "UpperF"; CharCodes[CharCodes["LowerF"] = 102] = "LowerF"; CharCodes[CharCodes["UpperZ"] = 90] = "UpperZ"; CharCodes[CharCodes["LowerZ"] = 122] = "LowerZ"; CharCodes[CharCodes["LowerX"] = 120] = "LowerX"; CharCodes[CharCodes["OpeningSquareBracket"] = 91] = "OpeningSquareBracket"; })(CharCodes || (CharCodes = {})); /** All the states the tokenizer can be in. */ var State; (function (State) { State[State["Text"] = 1] = "Text"; State[State["BeforeTagName"] = 2] = "BeforeTagName"; State[State["InTagName"] = 3] = "InTagName"; State[State["InSelfClosingTag"] = 4] = "InSelfClosingTag"; State[State["BeforeClosingTagName"] = 5] = "BeforeClosingTagName"; State[State["InClosingTagName"] = 6] = "InClosingTagName"; State[State["AfterClosingTagName"] = 7] = "AfterClosingTagName"; // Attributes State[State["BeforeAttributeName"] = 8] = "BeforeAttributeName"; State[State["InAttributeName"] = 9] = "InAttributeName"; State[State["AfterAttributeName"] = 10] = "AfterAttributeName"; State[State["BeforeAttributeValue"] = 11] = "BeforeAttributeValue"; State[State["InAttributeValueDq"] = 12] = "InAttributeValueDq"; State[State["InAttributeValueSq"] = 13] = "InAttributeValueSq"; State[State["InAttributeValueNq"] = 14] = "InAttributeValueNq"; // Declarations State[State["BeforeDeclaration"] = 15] = "BeforeDeclaration"; State[State["InDeclaration"] = 16] = "InDeclaration"; // Processing instructions State[State["InProcessingInstruction"] = 17] = "InProcessingInstruction"; // Comments & CDATA State[State["BeforeComment"] = 18] = "BeforeComment"; State[State["CDATASequence"] = 19] = "CDATASequence"; State[State["DeclarationSequence"] = 20] = "DeclarationSequence"; State[State["InSpecialComment"] = 21] = "InSpecialComment"; State[State["InCommentLike"] = 22] = "InCommentLike"; // Special tags State[State["SpecialStartSequence"] = 23] = "SpecialStartSequence"; State[State["InSpecialTag"] = 24] = "InSpecialTag"; State[State["InPlainText"] = 25] = "InPlainText"; State[State["InEntity"] = 26] = "InEntity"; })(State || (State = {})); function isWhitespace(c) { return (c === CharCodes.Space || c === CharCodes.NewLine || c === CharCodes.Tab || c === CharCodes.FormFeed || c === CharCodes.CarriageReturn); } function isEndOfTagSection(c) { return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c); } function isASCIIAlpha(c) { return ((c >= CharCodes.LowerA && c <= CharCodes.LowerZ) || (c >= CharCodes.UpperA && c <= CharCodes.UpperZ)); } /** * Quote style used for parsed attributes. */ export var QuoteType; (function (QuoteType) { QuoteType[QuoteType["NoValue"] = 0] = "NoValue"; QuoteType[QuoteType["Unquoted"] = 1] = "Unquoted"; QuoteType[QuoteType["Single"] = 2] = "Single"; QuoteType[QuoteType["Double"] = 3] = "Double"; })(QuoteType || (QuoteType = {})); /** * Sequences used to match longer strings. * * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End * sequences with an increased offset. */ const Sequences = { Empty: new Uint8Array(0), Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[ CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]> CommentEnd: new Uint8Array([0x2d, 0x2d, 0x21, 0x3e]), // `--!>` Doctype: new Uint8Array([0x64, 0x6f, 0x63, 0x74, 0x79, 0x70, 0x65]), // `doctype` IframeEnd: new Uint8Array([0x3c, 0x2f, 0x69, 0x66, 0x72, 0x61, 0x6d, 0x65]), // `</iframe` NoembedEnd: new Uint8Array([ 0x3c, 0x2f, 0x6e, 0x6f, 0x65, 0x6d, 0x62, 0x65, 0x64, ]), // `</noembed` NoframesEnd: new Uint8Array([ 0x3c, 0x2f, 0x6e, 0x6f, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x73, ]), // `</noframes` Plaintext: new Uint8Array([ 0x3c, 0x2f, 0x70, 0x6c, 0x61, 0x69, 0x6e, 0x74, 0x65, 0x78, 0x74, ]), // `</plaintext` ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script` StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style` TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title` TextareaEnd: new Uint8Array([ 0x3c, 0x2f, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, ]), // `</textarea` XmpEnd: new Uint8Array([0x3c, 0x2f, 0x78, 0x6d, 0x70]), // `</xmp` }; /** * Maps the first lowercase character of an HTML tag name to the sequence * used for special-tag detection. All sequences share a common layout * where index 2 is the first tag-name character, so matching always * continues from offset 3. */ const specialStartSequences = new Map([ [Sequences.IframeEnd[2], Sequences.IframeEnd], [Sequences.NoembedEnd[2], Sequences.NoembedEnd], [Sequences.Plaintext[2], Sequences.Plaintext], [Sequences.ScriptEnd[2], Sequences.ScriptEnd], [Sequences.TitleEnd[2], Sequences.TitleEnd], [Sequences.XmpEnd[2], Sequences.XmpEnd], ]); /** * Tokenizer implementation used by `Parser`. */ export default class Tokenizer { cbs; /** The current state the tokenizer is in. */ state = State.Text; /** The read buffer. */ buffer = ""; /** The beginning of the section that is currently being read. */ sectionStart = 0; /** The index within the buffer that we are currently looking at. */ index = 0; /** The start of the last entity. */ entityStart = 0; /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ baseState = State.Text; /** For special parsing behavior inside of script and style tags. */ isSpecial = false; /** Indicates whether the tokenizer has been paused. */ running = true; /** The offset of the current buffer. */ offset = 0; xmlMode; decodeEntities; recognizeSelfClosing; entityDecoder; constructor({ xmlMode = false, decodeEntities = true, recognizeSelfClosing = xmlMode, }, cbs) { this.cbs = cbs; this.xmlMode = xmlMode; this.decodeEntities = decodeEntities; this.recognizeSelfClosing = recognizeSelfClosing; this.entityDecoder = new EntityDecoder(xmlMode ? xmlDecodeTree : htmlDecodeTree, (cp, consumed) => this.emitCodePoint(cp, consumed)); } reset() { this.state = State.Text; this.buffer = ""; this.sectionStart = 0; this.index = 0; this.baseState = State.Text; this.isSpecial = false; this.currentSequence = Sequences.Empty; this.sequenceIndex = 0; this.running = true; this.offset = 0; } write(chunk) { this.offset += this.buffer.length; this.buffer = chunk; this.parse(); } end() { if (this.running) this.finish(); } pause() { this.running = false; } resume() { this.running = true; if (this.index < this.buffer.length + this.offset) { this.parse(); } } stateText(c) { if (c === CharCodes.Lt || (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))) { if (this.index > this.sectionStart) { this.cbs.ontext(this.sectionStart, this.index); } this.state = State.BeforeTagName; this.sectionStart = this.index; } else if (this.decodeEntities && c === CharCodes.Amp) { this.startEntity(); } } currentSequence = Sequences.Empty; sequenceIndex = 0; enterTagBody() { if (this.currentSequence === Sequences.Plaintext) { this.currentSequence = Sequences.Empty; this.state = State.InPlainText; } else if (this.isSpecial) { this.state = State.InSpecialTag; this.sequenceIndex = 0; } else { this.state = State.Text; } } /** * Match the opening tag name against an HTML text-only tag sequence. * * Some tags share an initial prefix (`script`/`style`, `title`/`textarea`, * `noembed`/`noframes`), so we may switch to an alternate sequence at the * first distinguishing byte. On a successful full match we fall back to * the normal tag-name state; a later `>` will enter raw-text, RCDATA, or * plaintext mode based on `currentSequence` / `isSpecial`. * @param c Current character code point. */ stateSpecialStartSequence(c) { const lower = c | 0x20; // Still matching — check for an alternate sequence at branch points. if (this.sequenceIndex < this.currentSequence.length) { if (lower === this.currentSequence[this.sequenceIndex]) { this.sequenceIndex++; return; } if (this.sequenceIndex === 3) { if (this.currentSequence === Sequences.ScriptEnd && lower === Sequences.StyleEnd[3]) { this.currentSequence = Sequences.StyleEnd; this.sequenceIndex = 4; return; } if (this.currentSequence === Sequences.TitleEnd && lower === Sequences.TextareaEnd[3]) { this.currentSequence = Sequences.TextareaEnd; this.sequenceIndex = 4; return; } } else if (this.sequenceIndex === 4 && this.currentSequence === Sequences.NoembedEnd && lower === Sequences.NoframesEnd[4]) { this.currentSequence = Sequences.NoframesEnd; this.sequenceIndex = 5; return; } } else if (isEndOfTagSection(c)) { // Full match on a valid tag boundary — keep the sequence. this.sequenceIndex = 0; this.state = State.InTagName; this.stateInTagName(c); return; } // No match — abandon special-tag detection. this.isSpecial = false; this.currentSequence = Sequences.Empty; this.sequenceIndex = 0; this.state = State.InTagName; this.stateInTagName(c); } stateCDATASequence(c) { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { this.state = State.InCommentLike; this.currentSequence = Sequences.CdataEnd; this.sequenceIndex = 0; this.sectionStart = this.index + 1; } } else { this.sequenceIndex = 0; if (this.xmlMode) { this.state = State.InDeclaration; this.stateInDeclaration(c); // Reconsume the character } else { this.state = State.InSpecialComment; this.stateInSpecialComment(c); // Reconsume the character } } } /** * When we wait for one specific character, we can speed things up * by skipping through the buffer until we find it. * @param c Current character code point. * @returns Whether the character was found. */ fastForwardTo(c) { while (++this.index < this.buffer.length + this.offset) { if (this.buffer.charCodeAt(this.index - this.offset) === c) { return true; } } /* * We increment the index at the end of the `parse` loop, * so set it to `buffer.length - 1` here. * * TODO: Refactor `parse` to increment index before calling states. */ this.index = this.buffer.length + this.offset - 1; return false; } /** * Emit a comment token and return to the text state. * @param offset Number of characters in the end sequence that have already been matched. */ emitComment(offset) { this.cbs.oncomment(this.sectionStart, this.index, offset); this.sequenceIndex = 0; this.sectionStart = this.index + 1; this.state = State.Text; } /** * Comments and CDATA end with `-->` and `]]>`. * * Their common qualities are: * - Their end sequences have a distinct character they start with. * - That character is then repeated, so we have to check multiple repeats. * - All characters but the start character of the sequence can be skipped. * @param c Current character code point. */ stateInCommentLike(c) { if (!this.xmlMode && this.currentSequence === Sequences.CommentEnd && this.sequenceIndex <= 1 && /* * We're still at the very start of the comment: the only * characters consumed since ``, 1 for ``. this.emitComment(2); } else if (this.currentSequence === Sequences.CommentEnd && this.sequenceIndex === this.currentSequence.length - 1 && c !== CharCodes.Gt) { this.sequenceIndex = Number(c === CharCodes.Dash); } else if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, this.index, 2); } else { this.cbs.oncomment(this.sectionStart, this.index, 3); } this.sequenceIndex = 0; this.sectionStart = this.index + 1; this.state = State.Text; } } else if (this.sequenceIndex === 0) { // Fast-forward to the first character of the sequence if (this.fastForwardTo(this.currentSequence[0])) { this.sequenceIndex = 1; } } else if (c !== this.currentSequence[this.sequenceIndex - 1]) { // Allow long sequences, eg. --->, ]]]> this.sequenceIndex = 0; } } /** * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. * * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar). * We allow anything that wouldn't end the tag. * @param c Current character code point. */ isTagStartChar(c) { return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c); } /** * Scan raw-text / RCDATA content for the matching end tag. * * For RCDATA tags (`<title>`, `<textarea>`) entities are decoded inline. * For raw-text tags (`<script>`, `<style>`, etc.) we fast-forward to `<`. * @param c Current character code point. */ stateInSpecialTag(c) { if (this.sequenceIndex === this.currentSequence.length) { if (isEndOfTagSection(c)) { const endOfText = this.index - this.currentSequence.length; if (this.sectionStart < endOfText) { // Spoof the index so that reported locations match up. const actualIndex = this.index; this.index = endOfText; this.cbs.ontext(this.sectionStart, endOfText); this.index = actualIndex; } this.isSpecial = false; this.sectionStart = endOfText + 2; // Skip over the `</` this.stateInClosingTagName(c); return; // We are done; skip the rest of the function. } this.sequenceIndex = 0; } if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) { this.sequenceIndex += 1; } else if (this.sequenceIndex === 0) { if (this.currentSequence === Sequences.TitleEnd || this.currentSequence === Sequences.TextareaEnd) { // RCDATA tags have to parse entities while still looking for their end tag. if (this.decodeEntities && c === CharCodes.Amp) { this.startEntity(); } } else if (this.fastForwardTo(CharCodes.Lt)) { // Outside of RCDATA tags, we can fast-forward. this.sequenceIndex = 1; } } else { // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`. this.sequenceIndex = Number(c === CharCodes.Lt); } } stateBeforeTagName(c) { if (c === CharCodes.ExclamationMark) { this.state = State.BeforeDeclaration; this.sectionStart = this.index + 1; } else if (c === CharCodes.Questionmark) { if (this.xmlMode) { this.state = State.InProcessingInstruction; this.sequenceIndex = 0; this.sectionStart = this.index + 1; } else { this.state = State.InSpecialComment; this.sectionStart = this.index; } } else if (this.isTagStartChar(c)) { this.sectionStart = this.index; const special = this.xmlMode || this.cbs.isInForeignContext?.() ? undefined : specialStartSequences.get(c | 0x20); if (special === undefined) { this.state = State.InTagName; } else { this.isSpecial = true; this.currentSequence = special; this.sequenceIndex = 3; this.state = State.SpecialStartSequence; } } else if (c === CharCodes.Slash) { this.state = State.BeforeClosingTagName; } else { this.state = State.Text; this.stateText(c); } } stateInTagName(c) { if (isEndOfTagSection(c)) { this.cbs.onopentagname(this.sectionStart, this.index); this.sectionStart = -1; this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } } stateBeforeClosingTagName(c) { if (isWhitespace(c)) { if (this.xmlMode) { // Ignore } else { this.state = State.InSpecialComment; this.sectionStart = this.index; } } else if (c === CharCodes.Gt) { this.state = State.Text; if (!this.xmlMode) { this.sectionStart = this.index + 1; } } else { this.state = this.isTagStartChar(c) ? State.InClosingTagName : State.InSpecialComment; this.sectionStart = this.index; } } stateInClosingTagName(c) { if (isEndOfTagSection(c)) { this.cbs.onclosetag(this.sectionStart, this.index); this.sectionStart = -1; this.state = State.AfterClosingTagName; this.stateAfterClosingTagName(c); } } stateAfterClosingTagName(c) { // Skip everything until ">" if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.state = State.Text; this.sectionStart = this.index + 1; } } stateBeforeAttributeName(c) { if (c === CharCodes.Gt) { this.cbs.onopentagend(this.index); this.enterTagBody(); this.sectionStart = this.index + 1; } else if (c === CharCodes.Slash) { this.state = State.InSelfClosingTag; } else if (!isWhitespace(c)) { this.state = State.InAttributeName; this.sectionStart = this.index; } } /** * Handle `/` before `>` in an opening tag. * * In HTML mode, text-only tags ignore the self-closing flag and still enter * their raw-text/RCDATA/plaintext state unless self-closing tags are being * recognized. In XML mode, or for ordinary tags, the tokenizer returns to * regular text parsing after emitting the self-closing callback. * @param c Current character code point. */ stateInSelfClosingTag(c) { if (c === CharCodes.Gt) { this.cbs.onselfclosingtag(this.index); this.sectionStart = this.index + 1; if (!this.recognizeSelfClosing) { this.enterTagBody(); return; } this.state = State.Text; this.isSpecial = false; // Reset special state, in case of self-closing special tags this.currentSequence = Sequences.Empty; } else if (!isWhitespace(c)) { this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } } stateInAttributeName(c) { if (c === CharCodes.Eq || isEndOfTagSection(c)) { this.cbs.onattribname(this.sectionStart, this.index); this.sectionStart = this.index; this.state = State.AfterAttributeName; this.stateAfterAttributeName(c); } } stateAfterAttributeName(c) { if (c === CharCodes.Eq) { this.state = State.BeforeAttributeValue; } else if (c === CharCodes.Slash || c === CharCodes.Gt) { this.cbs.onattribend(QuoteType.NoValue, this.sectionStart); this.sectionStart = -1; this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (!isWhitespace(c)) { this.cbs.onattribend(QuoteType.NoValue, this.sectionStart); this.state = State.InAttributeName; this.sectionStart = this.index; } } stateBeforeAttributeValue(c) { if (c === CharCodes.DoubleQuote) { this.state = State.InAttributeValueDq; this.sectionStart = this.index + 1; } else if (c === CharCodes.SingleQuote) { this.state = State.InAttributeValueSq; this.sectionStart = this.index + 1; } else if (!isWhitespace(c)) { this.sectionStart = this.index; this.state = State.InAttributeValueNq; this.stateInAttributeValueNoQuotes(c); // Reconsume token } } handleInAttributeValue(c, quote) { if (c === quote || (!this.decodeEntities && this.fastForwardTo(quote))) { this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = -1; this.cbs.onattribend(quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single, this.index + 1); this.state = State.BeforeAttributeName; } else if (this.decodeEntities && c === CharCodes.Amp) { this.startEntity(); } } stateInAttributeValueDoubleQuotes(c) { this.handleInAttributeValue(c, CharCodes.DoubleQuote); } stateInAttributeValueSingleQuotes(c) { this.handleInAttributeValue(c, CharCodes.SingleQuote); } stateInAttributeValueNoQuotes(c) { if (isWhitespace(c) || c === CharCodes.Gt) { this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = -1; this.cbs.onattribend(QuoteType.Unquoted, this.index); this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (this.decodeEntities && c === CharCodes.Amp) { this.startEntity(); } } /** * Distinguish between CDATA, declarations, HTML comments, and HTML bogus * comments after `<!`. * * In HTML mode, only real comments and doctypes stay on declaration paths; * everything else becomes a bogus comment terminated by the next `>`. * @param c Current character code point. */ stateBeforeDeclaration(c) { if (c === CharCodes.OpeningSquareBracket) { this.state = State.CDATASequence; this.sequenceIndex = 0; } else if (this.xmlMode) { this.state = c === CharCodes.Dash ? State.BeforeComment : State.InDeclaration; } else if ((c | 0x20) === Sequences.Doctype[0]) { this.state = State.DeclarationSequence; this.currentSequence = Sequences.Doctype; this.sequenceIndex = 1; } else if (c === CharCodes.Gt) { this.cbs.oncomment(this.sectionStart, this.index, 0); this.state = State.Text; this.sectionStart = this.index + 1; } else if (c === CharCodes.Dash) { this.state = State.BeforeComment; } else { this.state = State.InSpecialComment; } } /** * Continue matching `doctype` after `<!d`. * * A full `doctype` match stays on the declaration path; any other name falls * back to an HTML bogus comment, which matches browser behavior for * non-doctype `<!...>` constructs. * @param c Current character code point. */ stateDeclarationSequence(c) { if (this.sequenceIndex === this.currentSequence.length) { this.state = State.InDeclaration; this.stateInDeclaration(c); } else if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) { this.sequenceIndex += 1; } else if (c === CharCodes.Gt) { this.cbs.oncomment(this.sectionStart, this.index, 0); this.state = State.Text; this.sectionStart = this.index + 1; } else { this.state = State.InSpecialComment; } } stateInDeclaration(c) { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.ondeclaration(this.sectionStart, this.index); this.state = State.Text; this.sectionStart = this.index + 1; } } /** * XML processing instructions (`<?...?>`). * * In HTML mode `<?` is routed to `InSpecialComment` instead, so this * state is only reachable in XML mode. * @param c Current character code point. */ stateInProcessingInstruction(c) { if (c === CharCodes.Questionmark) { // Remember that we just consumed `?`, so the next `>` closes the PI. this.sequenceIndex = 1; } else if (c === CharCodes.Gt && this.sequenceIndex === 1) { this.cbs.onprocessinginstruction(this.sectionStart, this.index - 1); this.sequenceIndex = 0; this.state = State.Text; this.sectionStart = this.index + 1; } else { // Keep scanning for the next `?`, which can start a closing `?>`. this.sequenceIndex = Number(this.fastForwardTo(CharCodes.Questionmark)); } } stateBeforeComment(c) { if (c === CharCodes.Dash) { this.state = State.InCommentLike; this.currentSequence = Sequences.CommentEnd; this.sequenceIndex = 0; this.sectionStart = this.index + 1; } else if (this.xmlMode) { this.state = State.InDeclaration; } else if (c === CharCodes.Gt) { this.cbs.oncomment(this.sectionStart, this.index, 0); this.state = State.Text; this.sectionStart = this.index + 1; } else { this.state = State.InSpecialComment; } } stateInSpecialComment(c) { if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.cbs.oncomment(this.sectionStart, this.index, 0); this.state = State.Text; this.sectionStart = this.index + 1; } } startEntity() { this.baseState = this.state; this.state = State.InEntity; this.entityStart = this.index; this.entityDecoder.startEntity(this.xmlMode ? DecodingMode.Strict : this.baseState === State.Text || this.baseState === State.InSpecialTag ? DecodingMode.Legacy : DecodingMode.Attribute); } stateInEntity() { const indexInBuffer = this.index - this.offset; const length = this.entityDecoder.write(this.buffer, indexInBuffer); // If `length` is positive, we are done with the entity. if (length >= 0) { this.state = this.baseState; if (length === 0) { this.index -= 1; } } else { if (indexInBuffer < this.buffer.length && this.buffer.charCodeAt(indexInBuffer) === CharCodes.Amp) { this.state = this.baseState; this.index -= 1; return; } // Mark buffer as consumed. this.index = this.offset + this.buffer.length - 1; } } /** * Remove data that has already been consumed from the buffer. */ cleanup() { // If we are inside of text or attributes, emit what we already have. if (this.running && this.sectionStart !== this.index) { if (this.state === State.Text || this.state === State.InPlainText || (this.state === State.InSpecialTag && this.sequenceIndex === 0)) { this.cbs.ontext(this.sectionStart, this.index); this.sectionStart = this.index; } else if (this.state === State.InAttributeValueDq || this.state === State.InAttributeValueSq || this.state === State.InAttributeValueNq) { this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = this.index; } } } shouldContinue() { return this.index < this.buffer.length + this.offset && this.running; } /** * Iterates through the buffer, calling the function corresponding to the current state. * * States that are more likely to be hit are higher up, as a performance improvement. */ parse() { while (this.shouldContinue()) { const c = this.buffer.charCodeAt(this.index - this.offset); switch (this.state) { case State.Text: { this.stateText(c); break; } case State.InPlainText: { // Skip to end of buffer; cleanup() emits the text. this.index = this.buffer.length + this.offset - 1; break; } case State.SpecialStartSequence: { this.stateSpecialStartSequence(c); break; } case State.InSpecialTag: { this.stateInSpecialTag(c); break; } case State.CDATASequence: { this.stateCDATASequence(c); break; } case State.DeclarationSequence: { this.stateDeclarationSequence(c); break; } case State.InAttributeValueDq: { this.stateInAttributeValueDoubleQuotes(c); break; } case State.InAttributeName: { this.stateInAttributeName(c); break; } case State.InCommentLike: { this.stateInCommentLike(c); break; } case State.InSpecialComment: { this.stateInSpecialComment(c); break; } case State.BeforeAttributeName: { this.stateBeforeAttributeName(c); break; } case State.InTagName: { this.stateInTagName(c); break; } case State.InClosingTagName: { this.stateInClosingTagName(c); break; } case State.BeforeTagName: { this.stateBeforeTagName(c); break; } case State.AfterAttributeName: { this.stateAfterAttributeName(c); break; } case State.InAttributeValueSq: { this.stateInAttributeValueSingleQuotes(c); break; } case State.BeforeAttributeValue: { this.stateBeforeAttributeValue(c); break; } case State.BeforeClosingTagName: { this.stateBeforeClosingTagName(c); break; } case State.AfterClosingTagName: { this.stateAfterClosingTagName(c); break; } case State.InAttributeValueNq: { this.stateInAttributeValueNoQuotes(c); break; } case State.InSelfClosingTag: { this.stateInSelfClosingTag(c); break; } case State.InDeclaration: { this.stateInDeclaration(c); break; } case State.BeforeDeclaration: { this.stateBeforeDeclaration(c); break; } case State.BeforeComment: { this.stateBeforeComment(c); break; } case State.InProcessingInstruction: { this.stateInProcessingInstruction(c); break; } case State.InEntity: { this.stateInEntity(); break; } } this.index++; } this.cleanup(); } finish() { if (this.state === State.InEntity) { this.entityDecoder.end(); this.state = this.baseState; } this.handleTrailingData(); this.cbs.onend(); } handleTrailingCommentLikeData(endIndex) { if (this.state !== State.InCommentLike) { return false; } if (this.currentSequence === Sequences.CdataEnd) { if (this.xmlMode) { if (this.sectionStart < endIndex) { this.cbs.oncdata(this.sectionStart, endIndex, 0); } } else { /* In HTML mode, unclosed CDATA is a bogus comment. */ const cdataStart = this.sectionStart - Sequences.Cdata.length - 1; this.cbs.oncomment(cdataStart, endIndex, 0); } } else { const offset = this.xmlMode ? 0 : Math.min(this.sequenceIndex, Sequences.CommentEnd.length - 1); this.cbs.oncomment(this.sectionStart, endIndex, offset); } return true; } handleTrailingMarkupDeclaration(endIndex) { if (this.xmlMode) { switch (this.state) { case State.InSpecialComment: case State.BeforeComment: case State.CDATASequence: case State.DeclarationSequence: case State.InDeclaration: { this.cbs.ontext(this.sectionStart, endIndex); return true; } default: { return false; } } } switch (this.state) { case State.BeforeDeclaration: case State.InSpecialComment: case State.BeforeComment: case State.CDATASequence: { this.cbs.oncomment(this.sectionStart, endIndex, 0); return true; } case State.DeclarationSequence: { if (this.sequenceIndex !== Sequences.Doctype.length) { this.cbs.oncomment(this.sectionStart, endIndex, 0); } return true; } case State.InDeclaration: { return true; } default: { return false; } } } /** Handle any trailing data. */ handleTrailingData() { const endIndex = this.buffer.length + this.offset; if (this.handleTrailingCommentLikeData(endIndex) || this.handleTrailingMarkupDeclaration(endIndex)) { return; } // If there is no remaining data, we are done. if (this.sectionStart >= endIndex) { return; } switch (this.state) { case State.InTagName: case State.BeforeAttributeName: case State.BeforeAttributeValue: case State.AfterAttributeName: case State.InAttributeName: case State.InAttributeValueSq: case State.InAttributeValueDq: case State.InAttributeValueNq: case State.InClosingTagName: { /* * If we are currently in an opening or closing tag, us not calling the * respective callback signals that the tag should be ignored. */ break; } default: { this.cbs.ontext(this.sectionStart, endIndex); } } } emitCodePoint(cp, consumed) { if (this.baseState !== State.Text && this.baseState !== State.InSpecialTag) { if (this.sectionStart < this.entityStart) { this.cbs.onattribdata(this.sectionStart, this.entityStart); } this.sectionStart = this.entityStart + consumed; this.index = this.sectionStart - 1; this.cbs.onattribentity(cp); } else { if (this.sectionStart < this.entityStart) { this.cbs.ontext(this.sectionStart, this.entityStart); } this.sectionStart = this.entityStart + consumed; this.index = this.sectionStart - 1; this.cbs.ontextentity(cp, this.sectionStart); } } } //# sourceMappingURL=Tokenizer.js.map