UNPKG

dropflow

Version:

A small CSS2 document renderer built from specifications

1,331 lines (1,330 loc) • 47.3 kB

JavaScript

// fb55/htmlparser2 by Felix Böhm // // Parser.ts and Tokenizer.ts were inlined into this file with no modifications // other than style changes and imports/exports (at time of writing) // // Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved. // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. import { determineBranch, BinTrieFlags } from './string-trie.js'; import entityTrie from '../gen/entity-trie.js'; const decodeMap = new Map([ [0, 65533], // C1 Unicode control character reference replacements [128, 8364], [130, 8218], [131, 402], [132, 8222], [133, 8230], [134, 8224], [135, 8225], [136, 710], [137, 8240], [138, 352], [139, 8249], [140, 338], [142, 381], [145, 8216], [146, 8217], [147, 8220], [148, 8221], [149, 8226], [150, 8211], [151, 8212], [152, 732], [153, 8482], [154, 353], [155, 8250], [156, 339], [158, 382], [159, 376] ]); /** * Replace the given code point with a replacement character if it is a * surrogate or is outside the valid range. Otherwise return the code * point unchanged. */ function replaceCodePoint(codePoint) { if ((codePoint >= 0xd800 && codePoint <= 0xdfff) || codePoint > 0x10ffff) { return 0xfffd; } return decodeMap.get(codePoint) ?? codePoint; } export function isWhitespace(c) { return (c === 32 /* CharCodes.Space */ || c === 10 /* CharCodes.NewLine */ || c === 9 /* CharCodes.Tab */ || c === 12 /* CharCodes.FormFeed */ || c === 13 /* CharCodes.CarriageReturn */); } function isEndOfTagSection(c) { return c === 47 /* CharCodes.Slash */ || c === 62 /* CharCodes.Gt */ || isWhitespace(c); } function isNumber(c) { return c >= 48 /* CharCodes.Zero */ && c <= 57 /* CharCodes.Nine */; } function isASCIIAlpha(c) { return ((c >= 97 /* CharCodes.LowerA */ && c <= 122 /* CharCodes.LowerZ */) || (c >= 65 /* CharCodes.UpperA */ && c <= 90 /* CharCodes.UpperZ */)); } function isHexDigit(c) { return ((c >= 65 /* CharCodes.UpperA */ && c <= 70 /* CharCodes.UpperF */) || (c >= 97 /* CharCodes.LowerA */ && c <= 102 /* CharCodes.LowerF */)); } var QuoteType; (function (QuoteType) { QuoteType[QuoteType["NoValue"] = 0] = "NoValue"; QuoteType[QuoteType["Unquoted"] = 1] = "Unquoted"; QuoteType[QuoteType["Single"] = 2] = "Single"; QuoteType[QuoteType["Double"] = 3] = "Double"; })(QuoteType || (QuoteType = {})); /** * Sequences used to match longer strings. * * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End * sequences with an increased offset. */ const Sequences = { Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[ CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]> CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), // `-->` ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script` StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style` TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title` }; class Tokenizer { cbs; /** The current state the tokenizer is in. */ state = 1 /* State.Text */; /** The read buffer. */ buffer = ''; /** The beginning of the section that is currently being read. */ sectionStart = 0; /** The index within the buffer that we are currently looking at. */ index = 0; /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ baseState = 1 /* State.Text */; /** For special parsing behavior inside of script and style tags. */ isSpecial = false; /** Indicates whether the tokenizer has been paused. */ running = true; /** The offset of the current buffer. */ offset = 0; constructor(cbs) { this.cbs = cbs; } reset() { this.state = 1 /* State.Text */; this.buffer = ''; this.sectionStart = 0; this.index = 0; this.baseState = 1 /* State.Text */; this.currentSequence = undefined; this.running = true; this.offset = 0; } write(chunk) { this.offset += this.buffer.length; this.buffer = chunk; this.parse(); } end() { if (this.running) this.finish(); } pause() { this.running = false; } resume() { this.running = true; if (this.index < this.buffer.length + this.offset) { this.parse(); } } /** * The current index within all of the written data. */ getIndex() { return this.index; } /** * The start of the current section. */ getSectionStart() { return this.sectionStart; } stateText(c) { if (c === 60 /* CharCodes.Lt */) { if (this.index > this.sectionStart) { this.cbs.ontext(this.sectionStart, this.index); } this.state = 2 /* State.BeforeTagName */; this.sectionStart = this.index; } else if (c === 38 /* CharCodes.Amp */) { this.state = 25 /* State.BeforeEntity */; } } currentSequence; sequenceIndex = 0; stateSpecialStartSequence(c) { const isEnd = this.sequenceIndex === this.currentSequence.length; const isMatch = isEnd ? // If we are at the end of the sequence, make sure the tag name has ended isEndOfTagSection(c) : // Otherwise, do a case-insensitive comparison (c | 0x20) === this.currentSequence[this.sequenceIndex]; if (!isMatch) { this.isSpecial = false; } else if (!isEnd) { this.sequenceIndex++; return; } this.sequenceIndex = 0; this.state = 3 /* State.InTagName */; this.stateInTagName(c); } /** Look for an end tag. For <title> tags, also decode entities. */ stateInSpecialTag(c) { if (this.sequenceIndex === this.currentSequence.length) { if (c === 62 /* CharCodes.Gt */ || isWhitespace(c)) { const endOfText = this.index - this.currentSequence.length; if (this.sectionStart < endOfText) { // Spoof the index so that reported locations match up. const actualIndex = this.index; this.index = endOfText; this.cbs.ontext(this.sectionStart, endOfText); this.index = actualIndex; } this.isSpecial = false; this.sectionStart = endOfText + 2; // Skip over the `</` this.stateInClosingTagName(c); return; // We are done; skip the rest of the function. } this.sequenceIndex = 0; } if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) { this.sequenceIndex += 1; } else if (this.sequenceIndex === 0) { if (this.currentSequence === Sequences.TitleEnd) { // We have to parse entities in <title> tags. if (c === 38 /* CharCodes.Amp */) { this.state = 25 /* State.BeforeEntity */; } } else if (this.fastForwardTo(60 /* CharCodes.Lt */)) { // Outside of <title> tags, we can fast-forward. this.sequenceIndex = 1; } } else { // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`. this.sequenceIndex = Number(c === 60 /* CharCodes.Lt */); } } stateCDATASequence(c) { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { this.state = 21 /* State.InCommentLike */; this.currentSequence = Sequences.CdataEnd; this.sequenceIndex = 0; this.sectionStart = this.index + 1; } } else { this.sequenceIndex = 0; this.state = 16 /* State.InDeclaration */; this.stateInDeclaration(c); // Reconsume the character } } /** * When we wait for one specific character, we can speed things up * by skipping through the buffer until we find it. * * @returns Whether the character was found. */ fastForwardTo(c) { while (++this.index < this.buffer.length + this.offset) { if (this.buffer.charCodeAt(this.index - this.offset) === c) { return true; } } /* * We increment the index at the end of the `parse` loop, * so set it to `buffer.length - 1` here. * * TODO: Refactor `parse` to increment index before calling states. */ this.index = this.buffer.length + this.offset - 1; return false; } /** * Comments and CDATA end with `-->` and `]]>`. * * Their common qualities are: * - Their end sequences have a distinct character they start with. * - That character is then repeated, so we have to check multiple repeats. * - All characters but the start character of the sequence can be skipped. */ stateInCommentLike(c) { if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, this.index, 2); } else { this.cbs.oncomment(this.sectionStart, this.index, 2); } this.sequenceIndex = 0; this.sectionStart = this.index + 1; this.state = 1 /* State.Text */; } } else if (this.sequenceIndex === 0) { // Fast-forward to the first character of the sequence if (this.fastForwardTo(this.currentSequence[0])) { this.sequenceIndex = 1; } } else if (c !== this.currentSequence[this.sequenceIndex - 1]) { // Allow long sequences, eg. --->, ]]]> this.sequenceIndex = 0; } } /** * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. */ isTagStartChar(c) { return isASCIIAlpha(c); } startSpecial(sequence, offset) { this.isSpecial = true; this.currentSequence = sequence; this.sequenceIndex = offset; this.state = 23 /* State.SpecialStartSequence */; } stateBeforeTagName(c) { if (c === 33 /* CharCodes.ExclamationMark */) { this.state = 15 /* State.BeforeDeclaration */; this.sectionStart = this.index + 1; } else if (c === 63 /* CharCodes.Questionmark */) { this.state = 17 /* State.InProcessingInstruction */; this.sectionStart = this.index + 1; } else if (this.isTagStartChar(c)) { const lower = c | 0x20; this.sectionStart = this.index; if (lower === Sequences.TitleEnd[2]) { this.startSpecial(Sequences.TitleEnd, 3); } else { this.state = lower === Sequences.ScriptEnd[2] ? 22 /* State.BeforeSpecialS */ : 3 /* State.InTagName */; } } else if (c === 47 /* CharCodes.Slash */) { this.state = 5 /* State.BeforeClosingTagName */; } else { this.state = 1 /* State.Text */; this.stateText(c); } } stateInTagName(c) { if (isEndOfTagSection(c)) { this.cbs.onopentagname(this.sectionStart, this.index); this.sectionStart = -1; this.state = 8 /* State.BeforeAttributeName */; this.stateBeforeAttributeName(c); } } stateBeforeClosingTagName(c) { if (isWhitespace(c)) { // Ignore } else if (c === 62 /* CharCodes.Gt */) { this.state = 1 /* State.Text */; } else { this.state = this.isTagStartChar(c) ? 6 /* State.InClosingTagName */ : 20 /* State.InSpecialComment */; this.sectionStart = this.index; } } stateInClosingTagName(c) { if (c === 62 /* CharCodes.Gt */ || isWhitespace(c)) { this.cbs.onclosetag(this.sectionStart, this.index); this.sectionStart = -1; this.state = 7 /* State.AfterClosingTagName */; this.stateAfterClosingTagName(c); } } stateAfterClosingTagName(c) { // Skip everything until ">" if (c === 62 /* CharCodes.Gt */ || this.fastForwardTo(62 /* CharCodes.Gt */)) { this.state = 1 /* State.Text */; this.sectionStart = this.index + 1; } } stateBeforeAttributeName(c) { if (c === 62 /* CharCodes.Gt */) { this.cbs.onopentagend(this.index); if (this.isSpecial) { this.state = 24 /* State.InSpecialTag */; this.sequenceIndex = 0; } else { this.state = 1 /* State.Text */; } this.baseState = this.state; this.sectionStart = this.index + 1; } else if (c === 47 /* CharCodes.Slash */) { this.state = 4 /* State.InSelfClosingTag */; } else if (!isWhitespace(c)) { this.state = 9 /* State.InAttributeName */; this.sectionStart = this.index; } } stateInSelfClosingTag(c) { if (c === 62 /* CharCodes.Gt */) { this.cbs.onselfclosingtag(this.index); this.state = 1 /* State.Text */; this.baseState = 1 /* State.Text */; this.sectionStart = this.index + 1; this.isSpecial = false; // Reset special state, in case of self-closing special tags } else if (!isWhitespace(c)) { this.state = 8 /* State.BeforeAttributeName */; this.stateBeforeAttributeName(c); } } stateInAttributeName(c) { if (c === 61 /* CharCodes.Eq */ || isEndOfTagSection(c)) { this.cbs.onattribname(this.sectionStart, this.index); this.sectionStart = -1; this.state = 10 /* State.AfterAttributeName */; this.stateAfterAttributeName(c); } } stateAfterAttributeName(c) { if (c === 61 /* CharCodes.Eq */) { this.state = 11 /* State.BeforeAttributeValue */; } else if (c === 47 /* CharCodes.Slash */ || c === 62 /* CharCodes.Gt */) { this.cbs.onattribend(QuoteType.NoValue, this.index); this.state = 8 /* State.BeforeAttributeName */; this.stateBeforeAttributeName(c); } else if (!isWhitespace(c)) { this.cbs.onattribend(QuoteType.NoValue, this.index); this.state = 9 /* State.InAttributeName */; this.sectionStart = this.index; } } stateBeforeAttributeValue(c) { if (c === 34 /* CharCodes.DoubleQuote */) { this.state = 12 /* State.InAttributeValueDq */; this.sectionStart = this.index + 1; } else if (c === 39 /* CharCodes.SingleQuote */) { this.state = 13 /* State.InAttributeValueSq */; this.sectionStart = this.index + 1; } else if (!isWhitespace(c)) { this.sectionStart = this.index; this.state = 14 /* State.InAttributeValueNq */; this.stateInAttributeValueNoQuotes(c); // Reconsume token } } handleInAttributeValue(c, quote) { if (c === quote) { this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = -1; this.cbs.onattribend(quote === 34 /* CharCodes.DoubleQuote */ ? QuoteType.Double : QuoteType.Single, this.index); this.state = 8 /* State.BeforeAttributeName */; } else if (c === 38 /* CharCodes.Amp */) { this.baseState = this.state; this.state = 25 /* State.BeforeEntity */; } } stateInAttributeValueDoubleQuotes(c) { this.handleInAttributeValue(c, 34 /* CharCodes.DoubleQuote */); } stateInAttributeValueSingleQuotes(c) { this.handleInAttributeValue(c, 39 /* CharCodes.SingleQuote */); } stateInAttributeValueNoQuotes(c) { if (isWhitespace(c) || c === 62 /* CharCodes.Gt */) { this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = -1; this.cbs.onattribend(QuoteType.Unquoted, this.index); this.state = 8 /* State.BeforeAttributeName */; this.stateBeforeAttributeName(c); } else if (c === 38 /* CharCodes.Amp */) { this.baseState = this.state; this.state = 25 /* State.BeforeEntity */; } } stateBeforeDeclaration(c) { if (c === 91 /* CharCodes.OpeningSquareBracket */) { this.state = 19 /* State.CDATASequence */; this.sequenceIndex = 0; } else { this.state = c === 45 /* CharCodes.Dash */ ? 18 /* State.BeforeComment */ : 16 /* State.InDeclaration */; } } stateInDeclaration(c) { if (c === 62 /* CharCodes.Gt */ || this.fastForwardTo(62 /* CharCodes.Gt */)) { this.cbs.ondeclaration(this.sectionStart, this.index); this.state = 1 /* State.Text */; this.sectionStart = this.index + 1; } } stateInProcessingInstruction(c) { if (c === 62 /* CharCodes.Gt */ || this.fastForwardTo(62 /* CharCodes.Gt */)) { this.cbs.onprocessinginstruction(this.sectionStart, this.index); this.state = 1 /* State.Text */; this.sectionStart = this.index + 1; } } stateBeforeComment(c) { if (c === 45 /* CharCodes.Dash */) { this.state = 21 /* State.InCommentLike */; this.currentSequence = Sequences.CommentEnd; // Allow short comments (eg. <!-->) this.sequenceIndex = 2; this.sectionStart = this.index + 1; } else { this.state = 16 /* State.InDeclaration */; } } stateInSpecialComment(c) { if (c === 62 /* CharCodes.Gt */ || this.fastForwardTo(62 /* CharCodes.Gt */)) { this.cbs.oncomment(this.sectionStart, this.index, 0); this.state = 1 /* State.Text */; this.sectionStart = this.index + 1; } } stateBeforeSpecialS(c) { const lower = c | 0x20; if (lower === Sequences.ScriptEnd[3]) { this.startSpecial(Sequences.ScriptEnd, 4); } else if (lower === Sequences.StyleEnd[3]) { this.startSpecial(Sequences.StyleEnd, 4); } else { this.state = 3 /* State.InTagName */; this.stateInTagName(c); // Consume the token again } } trieIndex = 0; trieCurrent = 0; /** For named entities, the index of the value. For numeric entities, the code point. */ entityResult = 0; entityExcess = 0; stateBeforeEntity(c) { // Start excess with 1 to include the '&' this.entityExcess = 1; this.entityResult = 0; if (c === 35 /* CharCodes.Num */) { this.state = 26 /* State.BeforeNumericEntity */; } else if (c === 38 /* CharCodes.Amp */) { // We have two `&` characters in a row. Stay in the current state. } else { this.trieIndex = 0; this.trieCurrent = entityTrie[0]; this.state = 27 /* State.InNamedEntity */; this.stateInNamedEntity(c); } } stateInNamedEntity(c) { this.entityExcess += 1; this.trieIndex = determineBranch(entityTrie, this.trieCurrent, this.trieIndex + 1, c); if (this.trieIndex < 0) { this.emitNamedEntity(); this.index--; return; } this.trieCurrent = entityTrie[this.trieIndex]; const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH; // If the branch is a value, store it and continue if (masked) { // The mask is the number of bytes of the value, including the current byte. const valueLength = (masked >> 14) - 1; // If we have a legacy entity while parsing strictly, just skip the number of bytes if (!this.allowLegacyEntity() && c !== 59 /* CharCodes.Semi */) { this.trieIndex += valueLength; } else { // Add 1 as we have already incremented the excess const entityStart = this.index - this.entityExcess + 1; if (entityStart > this.sectionStart) { this.emitPartial(this.sectionStart, entityStart); } // If this is a surrogate pair, consume the next two bytes this.entityResult = this.trieIndex; this.trieIndex += valueLength; this.entityExcess = 0; this.sectionStart = this.index + 1; if (valueLength === 0) { this.emitNamedEntity(); } } } } emitNamedEntity() { this.state = this.baseState; if (this.entityResult === 0) { return; } const valueLength = (entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >> 14; switch (valueLength) { case 1: this.emitCodePoint(entityTrie[this.entityResult] & ~BinTrieFlags.VALUE_LENGTH); break; case 2: this.emitCodePoint(entityTrie[this.entityResult + 1]); break; case 3: { const first = entityTrie[this.entityResult + 1]; const second = entityTrie[this.entityResult + 2]; // If this is a surrogate pair, combine the code points. if (first >= 0xd8_00 && first <= 0xdf_ff) { this.emitCodePoint( // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae (first - 0xd8_00) * 0x4_00 + second + 0x24_00); } else { this.emitCodePoint(first); this.emitCodePoint(second); } } } } stateBeforeNumericEntity(c) { if ((c | 0x20) === 120 /* CharCodes.LowerX */) { this.entityExcess++; this.state = 29 /* State.InHexEntity */; } else { this.state = 28 /* State.InNumericEntity */; this.stateInNumericEntity(c); } } emitNumericEntity(strict) { const entityStart = this.index - this.entityExcess - 1; const numberStart = entityStart + 2 + Number(this.state === 29 /* State.InHexEntity */); if (numberStart !== this.index) { // Emit leading data if any if (entityStart > this.sectionStart) { this.emitPartial(this.sectionStart, entityStart); } this.sectionStart = this.index + Number(strict); this.emitCodePoint(this.entityResult); } this.state = this.baseState; } stateInNumericEntity(c) { if (c === 59 /* CharCodes.Semi */) { this.emitNumericEntity(true); } else if (isNumber(c)) { this.entityResult = this.entityResult * 10 + (c - 48 /* CharCodes.Zero */); this.entityExcess++; } else { if (this.allowLegacyEntity()) { this.emitNumericEntity(false); } else { this.state = this.baseState; } this.index--; } } stateInHexEntity(c) { if (c === 59 /* CharCodes.Semi */) { this.emitNumericEntity(true); } else if (isNumber(c)) { this.entityResult = this.entityResult * 16 + (c - 48 /* CharCodes.Zero */); this.entityExcess++; } else if (isHexDigit(c)) { this.entityResult = this.entityResult * 16 + ((c | 0x20) - 97 /* CharCodes.LowerA */ + 10); this.entityExcess++; } else { if (this.allowLegacyEntity()) { this.emitNumericEntity(false); } else { this.state = this.baseState; } this.index--; } } allowLegacyEntity() { return this.baseState === 1 /* State.Text */ || this.baseState === 24 /* State.InSpecialTag */; } /** * Remove data that has already been consumed from the buffer. */ cleanup() { // If we are inside of text or attributes, emit what we already have. if (this.running && this.sectionStart !== this.index) { if (this.state === 1 /* State.Text */ || (this.state === 24 /* State.InSpecialTag */ && this.sequenceIndex === 0)) { this.cbs.ontext(this.sectionStart, this.index); this.sectionStart = this.index; } else if (this.state === 12 /* State.InAttributeValueDq */ || this.state === 13 /* State.InAttributeValueSq */ || this.state === 14 /* State.InAttributeValueNq */) { this.cbs.onattribdata(this.sectionStart, this.index); this.sectionStart = this.index; } } } shouldContinue() { return this.index < this.buffer.length + this.offset && this.running; } /** * Iterates through the buffer, calling the function corresponding to the current state. * * States that are more likely to be hit are higher up, as a performance improvement. */ parse() { while (this.shouldContinue()) { const c = this.buffer.charCodeAt(this.index - this.offset); if (this.state === 1 /* State.Text */) { this.stateText(c); } else if (this.state === 23 /* State.SpecialStartSequence */) { this.stateSpecialStartSequence(c); } else if (this.state === 24 /* State.InSpecialTag */) { this.stateInSpecialTag(c); } else if (this.state === 19 /* State.CDATASequence */) { this.stateCDATASequence(c); } else if (this.state === 12 /* State.InAttributeValueDq */) { this.stateInAttributeValueDoubleQuotes(c); } else if (this.state === 9 /* State.InAttributeName */) { this.stateInAttributeName(c); } else if (this.state === 21 /* State.InCommentLike */) { this.stateInCommentLike(c); } else if (this.state === 20 /* State.InSpecialComment */) { this.stateInSpecialComment(c); } else if (this.state === 8 /* State.BeforeAttributeName */) { this.stateBeforeAttributeName(c); } else if (this.state === 3 /* State.InTagName */) { this.stateInTagName(c); } else if (this.state === 6 /* State.InClosingTagName */) { this.stateInClosingTagName(c); } else if (this.state === 2 /* State.BeforeTagName */) { this.stateBeforeTagName(c); } else if (this.state === 10 /* State.AfterAttributeName */) { this.stateAfterAttributeName(c); } else if (this.state === 13 /* State.InAttributeValueSq */) { this.stateInAttributeValueSingleQuotes(c); } else if (this.state === 11 /* State.BeforeAttributeValue */) { this.stateBeforeAttributeValue(c); } else if (this.state === 5 /* State.BeforeClosingTagName */) { this.stateBeforeClosingTagName(c); } else if (this.state === 7 /* State.AfterClosingTagName */) { this.stateAfterClosingTagName(c); } else if (this.state === 22 /* State.BeforeSpecialS */) { this.stateBeforeSpecialS(c); } else if (this.state === 14 /* State.InAttributeValueNq */) { this.stateInAttributeValueNoQuotes(c); } else if (this.state === 4 /* State.InSelfClosingTag */) { this.stateInSelfClosingTag(c); } else if (this.state === 16 /* State.InDeclaration */) { this.stateInDeclaration(c); } else if (this.state === 15 /* State.BeforeDeclaration */) { this.stateBeforeDeclaration(c); } else if (this.state === 18 /* State.BeforeComment */) { this.stateBeforeComment(c); } else if (this.state === 17 /* State.InProcessingInstruction */) { this.stateInProcessingInstruction(c); } else if (this.state === 27 /* State.InNamedEntity */) { this.stateInNamedEntity(c); } else if (this.state === 25 /* State.BeforeEntity */) { this.stateBeforeEntity(c); } else if (this.state === 29 /* State.InHexEntity */) { this.stateInHexEntity(c); } else if (this.state === 28 /* State.InNumericEntity */) { this.stateInNumericEntity(c); } else { // `this._state === State.BeforeNumericEntity` this.stateBeforeNumericEntity(c); } this.index++; } this.cleanup(); } finish() { if (this.state === 27 /* State.InNamedEntity */) { this.emitNamedEntity(); } // If there is remaining data, emit it in a reasonable way if (this.sectionStart < this.index) { this.handleTrailingData(); } this.cbs.onend(); } /** Handle any trailing data. */ handleTrailingData() { const endIndex = this.buffer.length + this.offset; if (this.state === 21 /* State.InCommentLike */) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, endIndex, 0); } else { this.cbs.oncomment(this.sectionStart, endIndex, 0); } } else if (this.state === 28 /* State.InNumericEntity */ && this.allowLegacyEntity()) { this.emitNumericEntity(false); // All trailing data will have been consumed } else if (this.state === 29 /* State.InHexEntity */ && this.allowLegacyEntity()) { this.emitNumericEntity(false); // All trailing data will have been consumed } else if (this.state === 3 /* State.InTagName */ || this.state === 8 /* State.BeforeAttributeName */ || this.state === 11 /* State.BeforeAttributeValue */ || this.state === 10 /* State.AfterAttributeName */ || this.state === 9 /* State.InAttributeName */ || this.state === 13 /* State.InAttributeValueSq */ || this.state === 12 /* State.InAttributeValueDq */ || this.state === 14 /* State.InAttributeValueNq */ || this.state === 6 /* State.InClosingTagName */) { /* * If we are currently in an opening or closing tag, us not calling the * respective callback signals that the tag should be ignored. */ } else { this.cbs.ontext(this.sectionStart, endIndex); } } emitPartial(start, endIndex) { if (this.baseState !== 1 /* State.Text */ && this.baseState !== 24 /* State.InSpecialTag */) { this.cbs.onattribdata(start, endIndex); } else { this.cbs.ontext(start, endIndex); } } emitCodePoint(cp) { if (this.baseState !== 1 /* State.Text */ && this.baseState !== 24 /* State.InSpecialTag */) { this.cbs.onattribentity(cp); } else { this.cbs.ontextentity(cp); } } } const formTags = new Set([ 'input', 'option', 'optgroup', 'select', 'button', 'datalist', 'textarea', ]); const pTag = new Set(['p']); const tableSectionTags = new Set(['thead', 'tbody']); const ddtTags = new Set(['dd', 'dt']); const rtpTags = new Set(['rt', 'rp']); const openImpliesClose = new Map([ ['tr', new Set(['tr', 'th', 'td'])], ['th', new Set(['th'])], ['td', new Set(['thead', 'th', 'td'])], ['body', new Set(['head', 'link', 'script'])], ['li', new Set(['li'])], ['p', pTag], ['h1', pTag], ['h2', pTag], ['h3', pTag], ['h4', pTag], ['h5', pTag], ['h6', pTag], ['select', formTags], ['input', formTags], ['output', formTags], ['button', formTags], ['datalist', formTags], ['textarea', formTags], ['option', new Set(['option'])], ['optgroup', new Set(['optgroup', 'option'])], ['dd', ddtTags], ['dt', ddtTags], ['address', pTag], ['article', pTag], ['aside', pTag], ['blockquote', pTag], ['details', pTag], ['div', pTag], ['dl', pTag], ['fieldset', pTag], ['figcaption', pTag], ['figure', pTag], ['footer', pTag], ['form', pTag], ['header', pTag], ['hr', pTag], ['main', pTag], ['nav', pTag], ['ol', pTag], ['pre', pTag], ['section', pTag], ['table', pTag], ['ul', pTag], ['rt', rtpTags], ['rp', rtpTags], ['tbody', tableSectionTags], ['tfoot', tableSectionTags], ]); const voidElements = new Set([ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', ]); const foreignContextElements = new Set(['math', 'svg']); const htmlIntegrationElements = new Set([ 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml', 'foreignobject', 'desc', 'title', ]); const reNameEnd = /\s|\//; export class Parser { /** The start index of the last event. */ startIndex = 0; /** The end index of the last event. */ endIndex = 0; /** * Store the start index of the current open tag, * so we can update the start index for attributes. */ openTagStart = 0; tagname = ''; attribname = ''; attribvalue = ''; attribs = null; stack = []; foreignContext = []; cbs; tokenizer; buffers = []; bufferOffset = 0; /** The index of the last written buffer. Used when resuming after a `pause()`. */ writeIndex = 0; /** Indicates whether the parser has finished running / `.end` has been called. */ ended = false; constructor(cbs, options = {}) { this.cbs = cbs ?? {}; this.tokenizer = new (options.Tokenizer ?? Tokenizer)(this); this.cbs.onparserinit?.(this); } // Tokenizer event handlers /** @internal */ ontext(start, endIndex) { const data = this.getSlice(start, endIndex); this.endIndex = endIndex - 1; this.cbs.ontext?.(data); this.startIndex = endIndex; } /** @internal */ ontextentity(cp) { /* * Entities can be emitted on the character, or directly after. * We use the section start here to get accurate indices. */ const idx = this.tokenizer.getSectionStart(); this.endIndex = idx - 1; this.cbs.ontext?.(String.fromCodePoint(replaceCodePoint(cp))); this.startIndex = idx; } isVoidElement(name) { return voidElements.has(name); } /** @internal */ onopentagname(start, endIndex) { this.endIndex = endIndex; const name = this.getSlice(start, endIndex).toLowerCase(); this.emitOpenTag(name); } emitOpenTag(name) { this.openTagStart = this.startIndex; this.tagname = name; const impliesClose = openImpliesClose.get(name); if (impliesClose) { while (this.stack.length > 0 && impliesClose.has(this.stack[this.stack.length - 1])) { const el = this.stack.pop(); this.cbs.onclosetag?.(el, true); } } if (!this.isVoidElement(name)) { this.stack.push(name); if (foreignContextElements.has(name)) { this.foreignContext.push(true); } else if (htmlIntegrationElements.has(name)) { this.foreignContext.push(false); } } this.cbs.onopentagname?.(name); if (this.cbs.onopentag) this.attribs = {}; } endOpenTag(isImplied) { this.startIndex = this.openTagStart; if (this.attribs) { this.cbs.onopentag?.(this.tagname, this.attribs, isImplied); this.attribs = null; } if (this.cbs.onclosetag && this.isVoidElement(this.tagname)) { this.cbs.onclosetag(this.tagname, true); } this.tagname = ''; } /** @internal */ onopentagend(endIndex) { this.endIndex = endIndex; this.endOpenTag(false); // Set `startIndex` for next node this.startIndex = endIndex + 1; } /** @internal */ onclosetag(start, endIndex) { this.endIndex = endIndex; const name = this.getSlice(start, endIndex).toLowerCase(); if (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) { this.foreignContext.pop(); } if (!this.isVoidElement(name)) { const pos = this.stack.lastIndexOf(name); if (pos !== -1) { if (this.cbs.onclosetag) { let count = this.stack.length - pos; while (count--) { // We know the stack has sufficient elements. this.cbs.onclosetag(this.stack.pop(), count !== 0); } } else this.stack.length = pos; } else if (name === 'p') { // Implicit open before close this.emitOpenTag('p'); this.closeCurrentTag(true); } } else if (name === 'br') { // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed. this.cbs.onopentagname?.('br'); this.cbs.onopentag?.('br', {}, true); this.cbs.onclosetag?.('br', false); } // Set `startIndex` for next node this.startIndex = endIndex + 1; } /** @internal */ onselfclosingtag(endIndex) { this.endIndex = endIndex; if (this.foreignContext[this.foreignContext.length - 1]) { this.closeCurrentTag(false); // Set `startIndex` for next node this.startIndex = endIndex + 1; } else { // Ignore the fact that the tag is self-closing. this.onopentagend(endIndex); } } closeCurrentTag(isOpenImplied) { const name = this.tagname; this.endOpenTag(isOpenImplied); // Self-closing tags will be on the top of the stack if (this.stack[this.stack.length - 1] === name) { // If the opening tag isn't implied, the closing tag has to be implied. this.cbs.onclosetag?.(name, !isOpenImplied); this.stack.pop(); } } /** @internal */ onattribname(start, endIndex) { this.startIndex = start; const name = this.getSlice(start, endIndex); this.attribname = name.toLowerCase(); } /** @internal */ onattribdata(start, endIndex) { this.attribvalue += this.getSlice(start, endIndex); } /** @internal */ onattribentity(cp) { this.attribvalue += String.fromCodePoint(replaceCodePoint(cp)); } /** @internal */ onattribend(quote, endIndex) { this.endIndex = endIndex; this.cbs.onattribute?.(this.attribname, this.attribvalue, quote === QuoteType.Double ? '"' : quote === QuoteType.Single ? '\'' : quote === QuoteType.NoValue ? undefined : null); if (this.attribs && !Object.prototype.hasOwnProperty.call(this.attribs, this.attribname)) { this.attribs[this.attribname] = this.attribvalue; } this.attribname = ''; this.attribvalue = ''; } getInstructionName(value) { const idx = value.search(reNameEnd); let name = idx < 0 ? value : value.substr(0, idx); return name.toLowerCase(); } /** @internal */ ondeclaration(start, endIndex) { this.endIndex = endIndex; const value = this.getSlice(start, endIndex); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); this.cbs.onprocessinginstruction(`!${name}`, `!${value}`); } // Set `startIndex` for next node this.startIndex = endIndex + 1; } /** @internal */ onprocessinginstruction(start, endIndex) { this.endIndex = endIndex; const value = this.getSlice(start, endIndex); if (this.cbs.onprocessinginstruction) { const name = this.getInstructionName(value); this.cbs.onprocessinginstruction(`?${name}`, `?${value}`); } // Set `startIndex` for next node this.startIndex = endIndex + 1; } /** @internal */ oncomment(start, endIndex, offset) { this.endIndex = endIndex; this.cbs.oncomment?.(this.getSlice(start, endIndex - offset)); this.cbs.oncommentend?.(); // Set `startIndex` for next node this.startIndex = endIndex + 1; } /** @internal */ oncdata(start, endIndex, offset) { this.endIndex = endIndex; const value = this.getSlice(start, endIndex - offset); this.cbs.oncomment?.(`[CDATA[${value}]]`); this.cbs.oncommentend?.(); // Set `startIndex` for next node this.startIndex = endIndex + 1; } /** @internal */ onend() { if (this.cbs.onclosetag) { // Set the end index for all remaining tags this.endIndex = this.startIndex; for (let i = this.stack.length; i > 0; this.cbs.onclosetag(this.stack[--i], true)) ; } this.cbs.onend?.(); } /** * Resets the parser to a blank state, ready to parse a new HTML document */ reset() { this.cbs.onreset?.(); this.tokenizer.reset(); this.tagname = ''; this.attribname = ''; this.attribs = null; this.stack.length = 0; this.startIndex = 0; this.endIndex = 0; this.cbs.onparserinit?.(this); this.buffers.length = 0; this.bufferOffset = 0; this.writeIndex = 0; this.ended = false; } /** * Resets the parser, then parses a complete document and * pushes it to the handler. * * @param data Document to parse. */ parseComplete(data) { this.reset(); this.end(data); } getSlice(start, end) { while (start - this.bufferOffset >= this.buffers[0].length) { this.shiftBuffer(); } let str = this.buffers[0].slice(start - this.bufferOffset, end - this.bufferOffset); while (end - this.bufferOffset > this.buffers[0].length) { this.shiftBuffer(); str += this.buffers[0].slice(0, end - this.bufferOffset); } return str; } shiftBuffer() { this.bufferOffset += this.buffers[0].length; this.writeIndex--; this.buffers.shift(); } /** * Parses a chunk of data and calls the corresponding callbacks. * * @param chunk Chunk to parse. */ write(chunk) { if (this.ended) { this.cbs.onerror?.(new Error('.write() after done!')); return; } this.buffers.push(chunk); if (this.tokenizer.running) { this.tokenizer.write(chunk); this.writeIndex++; } } /** * Parses the end of the buffer and clears the stack, calls onend. * * @param chunk Optional final chunk to parse. */ end(chunk) { if (this.ended) { this.cbs.onerror?.(Error('.end() after done!')); return; } if (chunk) this.write(chunk); this.ended = true; this.tokenizer.end(); } /** * Pauses parsing. The parser won't emit events until `resume` is called. */ pause() { this.tokenizer.pause(); } /** * Resumes parsing after `pause` was called. */ resume() { this.tokenizer.resume(); while (this.tokenizer.running && this.writeIndex < this.buffers.length) { this.tokenizer.write(this.buffers[this.writeIndex++]); } if (this.ended) this.tokenizer.end(); } }