// UNPKG
//
// parse5
//
// Version:
// 1,273 lines (1,272 loc) 116 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Tokenizer = exports.TokenizerMode = void 0; const preprocessor_js_1 = require("./preprocessor.js"); const unicode_js_1 = require("../common/unicode.js"); const token_js_1 = require("../common/token.js"); const decode_js_1 = require("entities/lib/decode.js"); const error_codes_js_1 = require("../common/error-codes.js"); const html_js_1 = require("../common/html.js"); //C1 Unicode control character reference replacements const C1_CONTROLS_REFERENCE_REPLACEMENTS = new Map([ [0x80, 8364], [0x82, 8218], [0x83, 402], [0x84, 8222], [0x85, 8230], [0x86, 8224], [0x87, 8225], [0x88, 710], [0x89, 8240], [0x8a, 352], [0x8b, 8249], [0x8c, 338], [0x8e, 381], [0x91, 8216], [0x92, 8217], [0x93, 8220], [0x94, 8221], [0x95, 8226], [0x96, 8211], [0x97, 8212], [0x98, 732], [0x99, 8482], [0x9a, 353], [0x9b, 8250], [0x9c, 339], [0x9e, 382], [0x9f, 376], ]); //States var State; (function (State) { State[State["DATA"] = 0] = "DATA"; State[State["RCDATA"] = 1] = "RCDATA"; State[State["RAWTEXT"] = 2] = "RAWTEXT"; State[State["SCRIPT_DATA"] = 3] = "SCRIPT_DATA"; State[State["PLAINTEXT"] = 4] = "PLAINTEXT"; State[State["TAG_OPEN"] = 5] = "TAG_OPEN"; State[State["END_TAG_OPEN"] = 6] = "END_TAG_OPEN"; State[State["TAG_NAME"] = 7] = "TAG_NAME"; State[State["RCDATA_LESS_THAN_SIGN"] = 8] = "RCDATA_LESS_THAN_SIGN"; State[State["RCDATA_END_TAG_OPEN"] = 9] = "RCDATA_END_TAG_OPEN"; State[State["RCDATA_END_TAG_NAME"] = 10] = "RCDATA_END_TAG_NAME"; State[State["RAWTEXT_LESS_THAN_SIGN"] = 11] = "RAWTEXT_LESS_THAN_SIGN"; State[State["RAWTEXT_END_TAG_OPEN"] = 12] = "RAWTEXT_END_TAG_OPEN"; State[State["RAWTEXT_END_TAG_NAME"] = 13] = "RAWTEXT_END_TAG_NAME"; State[State["SCRIPT_DATA_LESS_THAN_SIGN"] = 14] = "SCRIPT_DATA_LESS_THAN_SIGN"; State[State["SCRIPT_DATA_END_TAG_OPEN"] = 15] = "SCRIPT_DATA_END_TAG_OPEN"; State[State["SCRIPT_DATA_END_TAG_NAME"] = 16] = "SCRIPT_DATA_END_TAG_NAME"; State[State["SCRIPT_DATA_ESCAPE_START"] 
= 17] = "SCRIPT_DATA_ESCAPE_START"; State[State["SCRIPT_DATA_ESCAPE_START_DASH"] = 18] = "SCRIPT_DATA_ESCAPE_START_DASH"; State[State["SCRIPT_DATA_ESCAPED"] = 19] = "SCRIPT_DATA_ESCAPED"; State[State["SCRIPT_DATA_ESCAPED_DASH"] = 20] = "SCRIPT_DATA_ESCAPED_DASH"; State[State["SCRIPT_DATA_ESCAPED_DASH_DASH"] = 21] = "SCRIPT_DATA_ESCAPED_DASH_DASH"; State[State["SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN"] = 22] = "SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN"; State[State["SCRIPT_DATA_ESCAPED_END_TAG_OPEN"] = 23] = "SCRIPT_DATA_ESCAPED_END_TAG_OPEN"; State[State["SCRIPT_DATA_ESCAPED_END_TAG_NAME"] = 24] = "SCRIPT_DATA_ESCAPED_END_TAG_NAME"; State[State["SCRIPT_DATA_DOUBLE_ESCAPE_START"] = 25] = "SCRIPT_DATA_DOUBLE_ESCAPE_START"; State[State["SCRIPT_DATA_DOUBLE_ESCAPED"] = 26] = "SCRIPT_DATA_DOUBLE_ESCAPED"; State[State["SCRIPT_DATA_DOUBLE_ESCAPED_DASH"] = 27] = "SCRIPT_DATA_DOUBLE_ESCAPED_DASH"; State[State["SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH"] = 28] = "SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH"; State[State["SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN"] = 29] = "SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN"; State[State["SCRIPT_DATA_DOUBLE_ESCAPE_END"] = 30] = "SCRIPT_DATA_DOUBLE_ESCAPE_END"; State[State["BEFORE_ATTRIBUTE_NAME"] = 31] = "BEFORE_ATTRIBUTE_NAME"; State[State["ATTRIBUTE_NAME"] = 32] = "ATTRIBUTE_NAME"; State[State["AFTER_ATTRIBUTE_NAME"] = 33] = "AFTER_ATTRIBUTE_NAME"; State[State["BEFORE_ATTRIBUTE_VALUE"] = 34] = "BEFORE_ATTRIBUTE_VALUE"; State[State["ATTRIBUTE_VALUE_DOUBLE_QUOTED"] = 35] = "ATTRIBUTE_VALUE_DOUBLE_QUOTED"; State[State["ATTRIBUTE_VALUE_SINGLE_QUOTED"] = 36] = "ATTRIBUTE_VALUE_SINGLE_QUOTED"; State[State["ATTRIBUTE_VALUE_UNQUOTED"] = 37] = "ATTRIBUTE_VALUE_UNQUOTED"; State[State["AFTER_ATTRIBUTE_VALUE_QUOTED"] = 38] = "AFTER_ATTRIBUTE_VALUE_QUOTED"; State[State["SELF_CLOSING_START_TAG"] = 39] = "SELF_CLOSING_START_TAG"; State[State["BOGUS_COMMENT"] = 40] = "BOGUS_COMMENT"; State[State["MARKUP_DECLARATION_OPEN"] = 41] = "MARKUP_DECLARATION_OPEN"; 
State[State["COMMENT_START"] = 42] = "COMMENT_START"; State[State["COMMENT_START_DASH"] = 43] = "COMMENT_START_DASH"; State[State["COMMENT"] = 44] = "COMMENT"; State[State["COMMENT_LESS_THAN_SIGN"] = 45] = "COMMENT_LESS_THAN_SIGN"; State[State["COMMENT_LESS_THAN_SIGN_BANG"] = 46] = "COMMENT_LESS_THAN_SIGN_BANG"; State[State["COMMENT_LESS_THAN_SIGN_BANG_DASH"] = 47] = "COMMENT_LESS_THAN_SIGN_BANG_DASH"; State[State["COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH"] = 48] = "COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH"; State[State["COMMENT_END_DASH"] = 49] = "COMMENT_END_DASH"; State[State["COMMENT_END"] = 50] = "COMMENT_END"; State[State["COMMENT_END_BANG"] = 51] = "COMMENT_END_BANG"; State[State["DOCTYPE"] = 52] = "DOCTYPE"; State[State["BEFORE_DOCTYPE_NAME"] = 53] = "BEFORE_DOCTYPE_NAME"; State[State["DOCTYPE_NAME"] = 54] = "DOCTYPE_NAME"; State[State["AFTER_DOCTYPE_NAME"] = 55] = "AFTER_DOCTYPE_NAME"; State[State["AFTER_DOCTYPE_PUBLIC_KEYWORD"] = 56] = "AFTER_DOCTYPE_PUBLIC_KEYWORD"; State[State["BEFORE_DOCTYPE_PUBLIC_IDENTIFIER"] = 57] = "BEFORE_DOCTYPE_PUBLIC_IDENTIFIER"; State[State["DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED"] = 58] = "DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED"; State[State["DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED"] = 59] = "DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED"; State[State["AFTER_DOCTYPE_PUBLIC_IDENTIFIER"] = 60] = "AFTER_DOCTYPE_PUBLIC_IDENTIFIER"; State[State["BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS"] = 61] = "BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS"; State[State["AFTER_DOCTYPE_SYSTEM_KEYWORD"] = 62] = "AFTER_DOCTYPE_SYSTEM_KEYWORD"; State[State["BEFORE_DOCTYPE_SYSTEM_IDENTIFIER"] = 63] = "BEFORE_DOCTYPE_SYSTEM_IDENTIFIER"; State[State["DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED"] = 64] = "DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED"; State[State["DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED"] = 65] = "DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED"; State[State["AFTER_DOCTYPE_SYSTEM_IDENTIFIER"] = 66] = "AFTER_DOCTYPE_SYSTEM_IDENTIFIER"; 
State[State["BOGUS_DOCTYPE"] = 67] = "BOGUS_DOCTYPE"; State[State["CDATA_SECTION"] = 68] = "CDATA_SECTION"; State[State["CDATA_SECTION_BRACKET"] = 69] = "CDATA_SECTION_BRACKET"; State[State["CDATA_SECTION_END"] = 70] = "CDATA_SECTION_END"; State[State["CHARACTER_REFERENCE"] = 71] = "CHARACTER_REFERENCE"; State[State["NAMED_CHARACTER_REFERENCE"] = 72] = "NAMED_CHARACTER_REFERENCE"; State[State["AMBIGUOUS_AMPERSAND"] = 73] = "AMBIGUOUS_AMPERSAND"; State[State["NUMERIC_CHARACTER_REFERENCE"] = 74] = "NUMERIC_CHARACTER_REFERENCE"; State[State["HEXADEMICAL_CHARACTER_REFERENCE_START"] = 75] = "HEXADEMICAL_CHARACTER_REFERENCE_START"; State[State["HEXADEMICAL_CHARACTER_REFERENCE"] = 76] = "HEXADEMICAL_CHARACTER_REFERENCE"; State[State["DECIMAL_CHARACTER_REFERENCE"] = 77] = "DECIMAL_CHARACTER_REFERENCE"; State[State["NUMERIC_CHARACTER_REFERENCE_END"] = 78] = "NUMERIC_CHARACTER_REFERENCE_END"; })(State || (State = {})); //Tokenizer initial states for different modes exports.TokenizerMode = { DATA: State.DATA, RCDATA: State.RCDATA, RAWTEXT: State.RAWTEXT, SCRIPT_DATA: State.SCRIPT_DATA, PLAINTEXT: State.PLAINTEXT, CDATA_SECTION: State.CDATA_SECTION, }; //Utils //OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline //this functions if they will be situated in another module due to context switch. //Always perform inlining check before modifying this functions ('node --trace-inlining'). 
function isAsciiDigit(cp) { return cp >= unicode_js_1.CODE_POINTS.DIGIT_0 && cp <= unicode_js_1.CODE_POINTS.DIGIT_9; } function isAsciiUpper(cp) { return cp >= unicode_js_1.CODE_POINTS.LATIN_CAPITAL_A && cp <= unicode_js_1.CODE_POINTS.LATIN_CAPITAL_Z; } function isAsciiLower(cp) { return cp >= unicode_js_1.CODE_POINTS.LATIN_SMALL_A && cp <= unicode_js_1.CODE_POINTS.LATIN_SMALL_Z; } function isAsciiLetter(cp) { return isAsciiLower(cp) || isAsciiUpper(cp); } function isAsciiAlphaNumeric(cp) { return isAsciiLetter(cp) || isAsciiDigit(cp); } function isAsciiUpperHexDigit(cp) { return cp >= unicode_js_1.CODE_POINTS.LATIN_CAPITAL_A && cp <= unicode_js_1.CODE_POINTS.LATIN_CAPITAL_F; } function isAsciiLowerHexDigit(cp) { return cp >= unicode_js_1.CODE_POINTS.LATIN_SMALL_A && cp <= unicode_js_1.CODE_POINTS.LATIN_SMALL_F; } function isAsciiHexDigit(cp) { return isAsciiDigit(cp) || isAsciiUpperHexDigit(cp) || isAsciiLowerHexDigit(cp); } function toAsciiLower(cp) { return cp + 32; } function isWhitespace(cp) { return cp === unicode_js_1.CODE_POINTS.SPACE || cp === unicode_js_1.CODE_POINTS.LINE_FEED || cp === unicode_js_1.CODE_POINTS.TABULATION || cp === unicode_js_1.CODE_POINTS.FORM_FEED; } function isEntityInAttributeInvalidEnd(nextCp) { return nextCp === unicode_js_1.CODE_POINTS.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp); } function isScriptDataDoubleEscapeSequenceEnd(cp) { return isWhitespace(cp) || cp === unicode_js_1.CODE_POINTS.SOLIDUS || cp === unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN; } //Tokenizer class Tokenizer { constructor(options, handler) { this.options = options; this.handler = handler; this.paused = false; /** Ensures that the parsing loop isn't run multiple times at once. */ this.inLoop = false; /** * Indicates that the current adjusted node exists, is not an element in the HTML namespace, * and that it is not an integration point for either MathML or HTML. 
* * @see {@link https://html.spec.whatwg.org/multipage/parsing.html#tree-construction} */ this.inForeignNode = false; this.lastStartTagName = ''; this.active = false; this.state = State.DATA; this.returnState = State.DATA; this.charRefCode = -1; this.consumedAfterSnapshot = -1; this.currentCharacterToken = null; this.currentToken = null; this.currentAttr = { name: '', value: '' }; this.preprocessor = new preprocessor_js_1.Preprocessor(handler); this.currentLocation = this.getCurrentLocation(-1); } //Errors _err(code) { var _a, _b; (_b = (_a = this.handler).onParseError) === null || _b === void 0 ? void 0 : _b.call(_a, this.preprocessor.getError(code)); } // NOTE: `offset` may never run across line boundaries. getCurrentLocation(offset) { if (!this.options.sourceCodeLocationInfo) { return null; } return { startLine: this.preprocessor.line, startCol: this.preprocessor.col - offset, startOffset: this.preprocessor.offset - offset, endLine: -1, endCol: -1, endOffset: -1, }; } _runParsingLoop() { if (this.inLoop) return; this.inLoop = true; while (this.active && !this.paused) { this.consumedAfterSnapshot = 0; const cp = this._consume(); if (!this._ensureHibernation()) { this._callState(cp); } } this.inLoop = false; } //API pause() { this.paused = true; } resume(writeCallback) { if (!this.paused) { throw new Error('Parser was already resumed'); } this.paused = false; // Necessary for synchronous resume. if (this.inLoop) return; this._runParsingLoop(); if (!this.paused) { writeCallback === null || writeCallback === void 0 ? void 0 : writeCallback(); } } write(chunk, isLastChunk, writeCallback) { this.active = true; this.preprocessor.write(chunk, isLastChunk); this._runParsingLoop(); if (!this.paused) { writeCallback === null || writeCallback === void 0 ? 
void 0 : writeCallback(); } } insertHtmlAtCurrentPos(chunk) { this.active = true; this.preprocessor.insertHtmlAtCurrentPos(chunk); this._runParsingLoop(); } //Hibernation _ensureHibernation() { if (this.preprocessor.endOfChunkHit) { this._unconsume(this.consumedAfterSnapshot); this.active = false; return true; } return false; } //Consumption _consume() { this.consumedAfterSnapshot++; return this.preprocessor.advance(); } _unconsume(count) { this.consumedAfterSnapshot -= count; this.preprocessor.retreat(count); } _reconsumeInState(state, cp) { this.state = state; this._callState(cp); } _advanceBy(count) { this.consumedAfterSnapshot += count; for (let i = 0; i < count; i++) { this.preprocessor.advance(); } } _consumeSequenceIfMatch(pattern, caseSensitive) { if (this.preprocessor.startsWith(pattern, caseSensitive)) { // We will already have consumed one character before calling this method. this._advanceBy(pattern.length - 1); return true; } return false; } //Token creation _createStartTagToken() { this.currentToken = { type: token_js_1.TokenType.START_TAG, tagName: '', tagID: html_js_1.TAG_ID.UNKNOWN, selfClosing: false, ackSelfClosing: false, attrs: [], location: this.getCurrentLocation(1), }; } _createEndTagToken() { this.currentToken = { type: token_js_1.TokenType.END_TAG, tagName: '', tagID: html_js_1.TAG_ID.UNKNOWN, selfClosing: false, ackSelfClosing: false, attrs: [], location: this.getCurrentLocation(2), }; } _createCommentToken(offset) { this.currentToken = { type: token_js_1.TokenType.COMMENT, data: '', location: this.getCurrentLocation(offset), }; } _createDoctypeToken(initialName) { this.currentToken = { type: token_js_1.TokenType.DOCTYPE, name: initialName, forceQuirks: false, publicId: null, systemId: null, location: this.currentLocation, }; } _createCharacterToken(type, chars) { this.currentCharacterToken = { type, chars, location: this.currentLocation, }; } //Tag attributes _createAttr(attrNameFirstCh) { this.currentAttr = { name: attrNameFirstCh, 
value: '', }; this.currentLocation = this.getCurrentLocation(0); } _leaveAttrName() { var _a; var _b; const token = this.currentToken; if ((0, token_js_1.getTokenAttr)(token, this.currentAttr.name) === null) { token.attrs.push(this.currentAttr); if (token.location && this.currentLocation) { const attrLocations = ((_a = (_b = token.location).attrs) !== null && _a !== void 0 ? _a : (_b.attrs = Object.create(null))); attrLocations[this.currentAttr.name] = this.currentLocation; // Set end location this._leaveAttrValue(); } } else { this._err(error_codes_js_1.ERR.duplicateAttribute); } } _leaveAttrValue() { if (this.currentLocation) { this.currentLocation.endLine = this.preprocessor.line; this.currentLocation.endCol = this.preprocessor.col; this.currentLocation.endOffset = this.preprocessor.offset; } } //Token emission prepareToken(ct) { this._emitCurrentCharacterToken(ct.location); this.currentToken = null; if (ct.location) { ct.location.endLine = this.preprocessor.line; ct.location.endCol = this.preprocessor.col + 1; ct.location.endOffset = this.preprocessor.offset + 1; } this.currentLocation = this.getCurrentLocation(-1); } emitCurrentTagToken() { const ct = this.currentToken; this.prepareToken(ct); ct.tagID = (0, html_js_1.getTagID)(ct.tagName); if (ct.type === token_js_1.TokenType.START_TAG) { this.lastStartTagName = ct.tagName; this.handler.onStartTag(ct); } else { if (ct.attrs.length > 0) { this._err(error_codes_js_1.ERR.endTagWithAttributes); } if (ct.selfClosing) { this._err(error_codes_js_1.ERR.endTagWithTrailingSolidus); } this.handler.onEndTag(ct); } this.preprocessor.dropParsedChunk(); } emitCurrentComment(ct) { this.prepareToken(ct); this.handler.onComment(ct); this.preprocessor.dropParsedChunk(); } emitCurrentDoctype(ct) { this.prepareToken(ct); this.handler.onDoctype(ct); this.preprocessor.dropParsedChunk(); } _emitCurrentCharacterToken(nextLocation) { if (this.currentCharacterToken) { //NOTE: if we have a pending character token, make it's end location 
equal to the //current token's start location. if (nextLocation && this.currentCharacterToken.location) { this.currentCharacterToken.location.endLine = nextLocation.startLine; this.currentCharacterToken.location.endCol = nextLocation.startCol; this.currentCharacterToken.location.endOffset = nextLocation.startOffset; } switch (this.currentCharacterToken.type) { case token_js_1.TokenType.CHARACTER: { this.handler.onCharacter(this.currentCharacterToken); break; } case token_js_1.TokenType.NULL_CHARACTER: { this.handler.onNullCharacter(this.currentCharacterToken); break; } case token_js_1.TokenType.WHITESPACE_CHARACTER: { this.handler.onWhitespaceCharacter(this.currentCharacterToken); break; } } this.currentCharacterToken = null; } } _emitEOFToken() { const location = this.getCurrentLocation(0); if (location) { location.endLine = location.startLine; location.endCol = location.startCol; location.endOffset = location.startOffset; } this._emitCurrentCharacterToken(location); this.handler.onEof({ type: token_js_1.TokenType.EOF, location }); this.active = false; } //Characters emission //OPTIMIZATION: specification uses only one type of character tokens (one token per character). //This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters. //If we have a sequence of characters that belong to the same group, the parser can process it //as a single solid character token. //So, there are 3 types of character tokens in parse5: //1)TokenType.NULL_CHARACTER - \u0000-character sequences (e.g. '\u0000\u0000\u0000') //2)TokenType.WHITESPACE_CHARACTER - any whitespace/new-line character sequences (e.g. '\n \r\t \f') //3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 
'abcdef1234@@#$%^') _appendCharToCurrentCharacterToken(type, ch) { if (this.currentCharacterToken) { if (this.currentCharacterToken.type !== type) { this.currentLocation = this.getCurrentLocation(0); this._emitCurrentCharacterToken(this.currentLocation); this.preprocessor.dropParsedChunk(); } else { this.currentCharacterToken.chars += ch; return; } } this._createCharacterToken(type, ch); } _emitCodePoint(cp) { const type = isWhitespace(cp) ? token_js_1.TokenType.WHITESPACE_CHARACTER : cp === unicode_js_1.CODE_POINTS.NULL ? token_js_1.TokenType.NULL_CHARACTER : token_js_1.TokenType.CHARACTER; this._appendCharToCurrentCharacterToken(type, String.fromCodePoint(cp)); } //NOTE: used when we emit characters explicitly. //This is always for non-whitespace and non-null characters, which allows us to avoid additional checks. _emitChars(ch) { this._appendCharToCurrentCharacterToken(token_js_1.TokenType.CHARACTER, ch); } // Character reference helpers _matchNamedCharacterReference(cp) { let result = null; let excess = 0; let withoutSemicolon = false; for (let i = 0, current = decode_js_1.htmlDecodeTree[0]; i >= 0; cp = this._consume()) { i = (0, decode_js_1.determineBranch)(decode_js_1.htmlDecodeTree, current, i + 1, cp); if (i < 0) break; excess += 1; current = decode_js_1.htmlDecodeTree[i]; const masked = current & decode_js_1.BinTrieFlags.VALUE_LENGTH; // If the branch is a value, store it and continue if (masked) { // The mask is the number of bytes of the value, including the current byte. const valueLength = (masked >> 14) - 1; // Attribute values that aren't terminated properly aren't parsed, and shouldn't lead to a parser error. 
// See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state if (cp !== unicode_js_1.CODE_POINTS.SEMICOLON && this._isCharacterReferenceInAttribute() && isEntityInAttributeInvalidEnd(this.preprocessor.peek(1))) { //NOTE: we don't flush all consumed code points here, and instead switch back to the original state after //emitting an ampersand. This is fine, as alphanumeric characters won't be parsed differently in attributes. result = [unicode_js_1.CODE_POINTS.AMPERSAND]; // Skip over the value. i += valueLength; } else { // If this is a surrogate pair, consume the next two bytes. result = valueLength === 0 ? [decode_js_1.htmlDecodeTree[i] & ~decode_js_1.BinTrieFlags.VALUE_LENGTH] : valueLength === 1 ? [decode_js_1.htmlDecodeTree[++i]] : [decode_js_1.htmlDecodeTree[++i], decode_js_1.htmlDecodeTree[++i]]; excess = 0; withoutSemicolon = cp !== unicode_js_1.CODE_POINTS.SEMICOLON; } if (valueLength === 0) { // If the value is zero-length, we're done. this._consume(); break; } } } this._unconsume(excess); if (withoutSemicolon && !this.preprocessor.endOfChunkHit) { this._err(error_codes_js_1.ERR.missingSemicolonAfterCharacterReference); } // We want to emit the error above on the code point after the entity. // We always consume one code point too many in the loop, and we wait to // unconsume it until after the error is emitted. this._unconsume(1); return result; } _isCharacterReferenceInAttribute() { return (this.returnState === State.ATTRIBUTE_VALUE_DOUBLE_QUOTED || this.returnState === State.ATTRIBUTE_VALUE_SINGLE_QUOTED || this.returnState === State.ATTRIBUTE_VALUE_UNQUOTED); } _flushCodePointConsumedAsCharacterReference(cp) { if (this._isCharacterReferenceInAttribute()) { this.currentAttr.value += String.fromCodePoint(cp); } else { this._emitCodePoint(cp); } } // Calling states this way turns out to be much faster than any other approach. 
_callState(cp) { switch (this.state) { case State.DATA: { this._stateData(cp); break; } case State.RCDATA: { this._stateRcdata(cp); break; } case State.RAWTEXT: { this._stateRawtext(cp); break; } case State.SCRIPT_DATA: { this._stateScriptData(cp); break; } case State.PLAINTEXT: { this._statePlaintext(cp); break; } case State.TAG_OPEN: { this._stateTagOpen(cp); break; } case State.END_TAG_OPEN: { this._stateEndTagOpen(cp); break; } case State.TAG_NAME: { this._stateTagName(cp); break; } case State.RCDATA_LESS_THAN_SIGN: { this._stateRcdataLessThanSign(cp); break; } case State.RCDATA_END_TAG_OPEN: { this._stateRcdataEndTagOpen(cp); break; } case State.RCDATA_END_TAG_NAME: { this._stateRcdataEndTagName(cp); break; } case State.RAWTEXT_LESS_THAN_SIGN: { this._stateRawtextLessThanSign(cp); break; } case State.RAWTEXT_END_TAG_OPEN: { this._stateRawtextEndTagOpen(cp); break; } case State.RAWTEXT_END_TAG_NAME: { this._stateRawtextEndTagName(cp); break; } case State.SCRIPT_DATA_LESS_THAN_SIGN: { this._stateScriptDataLessThanSign(cp); break; } case State.SCRIPT_DATA_END_TAG_OPEN: { this._stateScriptDataEndTagOpen(cp); break; } case State.SCRIPT_DATA_END_TAG_NAME: { this._stateScriptDataEndTagName(cp); break; } case State.SCRIPT_DATA_ESCAPE_START: { this._stateScriptDataEscapeStart(cp); break; } case State.SCRIPT_DATA_ESCAPE_START_DASH: { this._stateScriptDataEscapeStartDash(cp); break; } case State.SCRIPT_DATA_ESCAPED: { this._stateScriptDataEscaped(cp); break; } case State.SCRIPT_DATA_ESCAPED_DASH: { this._stateScriptDataEscapedDash(cp); break; } case State.SCRIPT_DATA_ESCAPED_DASH_DASH: { this._stateScriptDataEscapedDashDash(cp); break; } case State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: { this._stateScriptDataEscapedLessThanSign(cp); break; } case State.SCRIPT_DATA_ESCAPED_END_TAG_OPEN: { this._stateScriptDataEscapedEndTagOpen(cp); break; } case State.SCRIPT_DATA_ESCAPED_END_TAG_NAME: { this._stateScriptDataEscapedEndTagName(cp); break; } case 
State.SCRIPT_DATA_DOUBLE_ESCAPE_START: { this._stateScriptDataDoubleEscapeStart(cp); break; } case State.SCRIPT_DATA_DOUBLE_ESCAPED: { this._stateScriptDataDoubleEscaped(cp); break; } case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH: { this._stateScriptDataDoubleEscapedDash(cp); break; } case State.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: { this._stateScriptDataDoubleEscapedDashDash(cp); break; } case State.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: { this._stateScriptDataDoubleEscapedLessThanSign(cp); break; } case State.SCRIPT_DATA_DOUBLE_ESCAPE_END: { this._stateScriptDataDoubleEscapeEnd(cp); break; } case State.BEFORE_ATTRIBUTE_NAME: { this._stateBeforeAttributeName(cp); break; } case State.ATTRIBUTE_NAME: { this._stateAttributeName(cp); break; } case State.AFTER_ATTRIBUTE_NAME: { this._stateAfterAttributeName(cp); break; } case State.BEFORE_ATTRIBUTE_VALUE: { this._stateBeforeAttributeValue(cp); break; } case State.ATTRIBUTE_VALUE_DOUBLE_QUOTED: { this._stateAttributeValueDoubleQuoted(cp); break; } case State.ATTRIBUTE_VALUE_SINGLE_QUOTED: { this._stateAttributeValueSingleQuoted(cp); break; } case State.ATTRIBUTE_VALUE_UNQUOTED: { this._stateAttributeValueUnquoted(cp); break; } case State.AFTER_ATTRIBUTE_VALUE_QUOTED: { this._stateAfterAttributeValueQuoted(cp); break; } case State.SELF_CLOSING_START_TAG: { this._stateSelfClosingStartTag(cp); break; } case State.BOGUS_COMMENT: { this._stateBogusComment(cp); break; } case State.MARKUP_DECLARATION_OPEN: { this._stateMarkupDeclarationOpen(cp); break; } case State.COMMENT_START: { this._stateCommentStart(cp); break; } case State.COMMENT_START_DASH: { this._stateCommentStartDash(cp); break; } case State.COMMENT: { this._stateComment(cp); break; } case State.COMMENT_LESS_THAN_SIGN: { this._stateCommentLessThanSign(cp); break; } case State.COMMENT_LESS_THAN_SIGN_BANG: { this._stateCommentLessThanSignBang(cp); break; } case State.COMMENT_LESS_THAN_SIGN_BANG_DASH: { this._stateCommentLessThanSignBangDash(cp); break; } case 
State.COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH: { this._stateCommentLessThanSignBangDashDash(cp); break; } case State.COMMENT_END_DASH: { this._stateCommentEndDash(cp); break; } case State.COMMENT_END: { this._stateCommentEnd(cp); break; } case State.COMMENT_END_BANG: { this._stateCommentEndBang(cp); break; } case State.DOCTYPE: { this._stateDoctype(cp); break; } case State.BEFORE_DOCTYPE_NAME: { this._stateBeforeDoctypeName(cp); break; } case State.DOCTYPE_NAME: { this._stateDoctypeName(cp); break; } case State.AFTER_DOCTYPE_NAME: { this._stateAfterDoctypeName(cp); break; } case State.AFTER_DOCTYPE_PUBLIC_KEYWORD: { this._stateAfterDoctypePublicKeyword(cp); break; } case State.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: { this._stateBeforeDoctypePublicIdentifier(cp); break; } case State.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: { this._stateDoctypePublicIdentifierDoubleQuoted(cp); break; } case State.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: { this._stateDoctypePublicIdentifierSingleQuoted(cp); break; } case State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER: { this._stateAfterDoctypePublicIdentifier(cp); break; } case State.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: { this._stateBetweenDoctypePublicAndSystemIdentifiers(cp); break; } case State.AFTER_DOCTYPE_SYSTEM_KEYWORD: { this._stateAfterDoctypeSystemKeyword(cp); break; } case State.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: { this._stateBeforeDoctypeSystemIdentifier(cp); break; } case State.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: { this._stateDoctypeSystemIdentifierDoubleQuoted(cp); break; } case State.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: { this._stateDoctypeSystemIdentifierSingleQuoted(cp); break; } case State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER: { this._stateAfterDoctypeSystemIdentifier(cp); break; } case State.BOGUS_DOCTYPE: { this._stateBogusDoctype(cp); break; } case State.CDATA_SECTION: { this._stateCdataSection(cp); break; } case State.CDATA_SECTION_BRACKET: { this._stateCdataSectionBracket(cp); break; } case State.CDATA_SECTION_END: 
{ this._stateCdataSectionEnd(cp); break; } case State.CHARACTER_REFERENCE: { this._stateCharacterReference(cp); break; } case State.NAMED_CHARACTER_REFERENCE: { this._stateNamedCharacterReference(cp); break; } case State.AMBIGUOUS_AMPERSAND: { this._stateAmbiguousAmpersand(cp); break; } case State.NUMERIC_CHARACTER_REFERENCE: { this._stateNumericCharacterReference(cp); break; } case State.HEXADEMICAL_CHARACTER_REFERENCE_START: { this._stateHexademicalCharacterReferenceStart(cp); break; } case State.HEXADEMICAL_CHARACTER_REFERENCE: { this._stateHexademicalCharacterReference(cp); break; } case State.DECIMAL_CHARACTER_REFERENCE: { this._stateDecimalCharacterReference(cp); break; } case State.NUMERIC_CHARACTER_REFERENCE_END: { this._stateNumericCharacterReferenceEnd(cp); break; } default: { throw new Error('Unknown state'); } } } // State machine // Data state //------------------------------------------------------------------ _stateData(cp) { switch (cp) { case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: { this.state = State.TAG_OPEN; break; } case unicode_js_1.CODE_POINTS.AMPERSAND: { this.returnState = State.DATA; this.state = State.CHARACTER_REFERENCE; break; } case unicode_js_1.CODE_POINTS.NULL: { this._err(error_codes_js_1.ERR.unexpectedNullCharacter); this._emitCodePoint(cp); break; } case unicode_js_1.CODE_POINTS.EOF: { this._emitEOFToken(); break; } default: { this._emitCodePoint(cp); } } } // RCDATA state //------------------------------------------------------------------ _stateRcdata(cp) { switch (cp) { case unicode_js_1.CODE_POINTS.AMPERSAND: { this.returnState = State.RCDATA; this.state = State.CHARACTER_REFERENCE; break; } case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: { this.state = State.RCDATA_LESS_THAN_SIGN; break; } case unicode_js_1.CODE_POINTS.NULL: { this._err(error_codes_js_1.ERR.unexpectedNullCharacter); this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER); break; } case unicode_js_1.CODE_POINTS.EOF: { this._emitEOFToken(); break; } default: { 
this._emitCodePoint(cp); } } } // RAWTEXT state //------------------------------------------------------------------ _stateRawtext(cp) { switch (cp) { case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: { this.state = State.RAWTEXT_LESS_THAN_SIGN; break; } case unicode_js_1.CODE_POINTS.NULL: { this._err(error_codes_js_1.ERR.unexpectedNullCharacter); this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER); break; } case unicode_js_1.CODE_POINTS.EOF: { this._emitEOFToken(); break; } default: { this._emitCodePoint(cp); } } } // Script data state //------------------------------------------------------------------ _stateScriptData(cp) { switch (cp) { case unicode_js_1.CODE_POINTS.LESS_THAN_SIGN: { this.state = State.SCRIPT_DATA_LESS_THAN_SIGN; break; } case unicode_js_1.CODE_POINTS.NULL: { this._err(error_codes_js_1.ERR.unexpectedNullCharacter); this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER); break; } case unicode_js_1.CODE_POINTS.EOF: { this._emitEOFToken(); break; } default: { this._emitCodePoint(cp); } } } // PLAINTEXT state //------------------------------------------------------------------ _statePlaintext(cp) { switch (cp) { case unicode_js_1.CODE_POINTS.NULL: { this._err(error_codes_js_1.ERR.unexpectedNullCharacter); this._emitChars(unicode_js_1.REPLACEMENT_CHARACTER); break; } case unicode_js_1.CODE_POINTS.EOF: { this._emitEOFToken(); break; } default: { this._emitCodePoint(cp); } } } // Tag open state //------------------------------------------------------------------ _stateTagOpen(cp) { if (isAsciiLetter(cp)) { this._createStartTagToken(); this.state = State.TAG_NAME; this._stateTagName(cp); } else switch (cp) { case unicode_js_1.CODE_POINTS.EXCLAMATION_MARK: { this.state = State.MARKUP_DECLARATION_OPEN; break; } case unicode_js_1.CODE_POINTS.SOLIDUS: { this.state = State.END_TAG_OPEN; break; } case unicode_js_1.CODE_POINTS.QUESTION_MARK: { this._err(error_codes_js_1.ERR.unexpectedQuestionMarkInsteadOfTagName); this._createCommentToken(1); this.state = 
State.BOGUS_COMMENT; this._stateBogusComment(cp); break; } case unicode_js_1.CODE_POINTS.EOF: { this._err(error_codes_js_1.ERR.eofBeforeTagName); this._emitChars('<'); this._emitEOFToken(); break; } default: { this._err(error_codes_js_1.ERR.invalidFirstCharacterOfTagName); this._emitChars('<'); this.state = State.DATA; this._stateData(cp); } } } // End tag open state //------------------------------------------------------------------ _stateEndTagOpen(cp) { if (isAsciiLetter(cp)) { this._createEndTagToken(); this.state = State.TAG_NAME; this._stateTagName(cp); } else switch (cp) { case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: { this._err(error_codes_js_1.ERR.missingEndTagName); this.state = State.DATA; break; } case unicode_js_1.CODE_POINTS.EOF: { this._err(error_codes_js_1.ERR.eofBeforeTagName); this._emitChars('</'); this._emitEOFToken(); break; } default: { this._err(error_codes_js_1.ERR.invalidFirstCharacterOfTagName); this._createCommentToken(2); this.state = State.BOGUS_COMMENT; this._stateBogusComment(cp); } } } // Tag name state //------------------------------------------------------------------ _stateTagName(cp) { const token = this.currentToken; switch (cp) { case unicode_js_1.CODE_POINTS.SPACE: case unicode_js_1.CODE_POINTS.LINE_FEED: case unicode_js_1.CODE_POINTS.TABULATION: case unicode_js_1.CODE_POINTS.FORM_FEED: { this.state = State.BEFORE_ATTRIBUTE_NAME; break; } case unicode_js_1.CODE_POINTS.SOLIDUS: { this.state = State.SELF_CLOSING_START_TAG; break; } case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: { this.state = State.DATA; this.emitCurrentTagToken(); break; } case unicode_js_1.CODE_POINTS.NULL: { this._err(error_codes_js_1.ERR.unexpectedNullCharacter); token.tagName += unicode_js_1.REPLACEMENT_CHARACTER; break; } case unicode_js_1.CODE_POINTS.EOF: { this._err(error_codes_js_1.ERR.eofInTag); this._emitEOFToken(); break; } default: { token.tagName += String.fromCodePoint(isAsciiUpper(cp) ? 
toAsciiLower(cp) : cp); } } } // RCDATA less-than sign state //------------------------------------------------------------------ _stateRcdataLessThanSign(cp) { if (cp === unicode_js_1.CODE_POINTS.SOLIDUS) { this.state = State.RCDATA_END_TAG_OPEN; } else { this._emitChars('<'); this.state = State.RCDATA; this._stateRcdata(cp); } } // RCDATA end tag open state //------------------------------------------------------------------ _stateRcdataEndTagOpen(cp) { if (isAsciiLetter(cp)) { this.state = State.RCDATA_END_TAG_NAME; this._stateRcdataEndTagName(cp); } else { this._emitChars('</'); this.state = State.RCDATA; this._stateRcdata(cp); } } handleSpecialEndTag(_cp) { if (!this.preprocessor.startsWith(this.lastStartTagName, false)) { return !this._ensureHibernation(); } this._createEndTagToken(); const token = this.currentToken; token.tagName = this.lastStartTagName; const cp = this.preprocessor.peek(this.lastStartTagName.length); switch (cp) { case unicode_js_1.CODE_POINTS.SPACE: case unicode_js_1.CODE_POINTS.LINE_FEED: case unicode_js_1.CODE_POINTS.TABULATION: case unicode_js_1.CODE_POINTS.FORM_FEED: { this._advanceBy(this.lastStartTagName.length); this.state = State.BEFORE_ATTRIBUTE_NAME; return false; } case unicode_js_1.CODE_POINTS.SOLIDUS: { this._advanceBy(this.lastStartTagName.length); this.state = State.SELF_CLOSING_START_TAG; return false; } case unicode_js_1.CODE_POINTS.GREATER_THAN_SIGN: { this._advanceBy(this.lastStartTagName.length); this.emitCurrentTagToken(); this.state = State.DATA; return false; } default: { return !this._ensureHibernation(); } } } // RCDATA end tag name state //------------------------------------------------------------------ _stateRcdataEndTagName(cp) { if (this.handleSpecialEndTag(cp)) { this._emitChars('</'); this.state = State.RCDATA; this._stateRcdata(cp); } } // RAWTEXT less-than sign state //------------------------------------------------------------------ _stateRawtextLessThanSign(cp) { if (cp === 
unicode_js_1.CODE_POINTS.SOLIDUS) { this.state = State.RAWTEXT_END_TAG_OPEN; } else { this._emitChars('<'); this.state = State.RAWTEXT; this._stateRawtext(cp); } } // RAWTEXT end tag open state //------------------------------------------------------------------ _stateRawtextEndTagOpen(cp) { if (isAsciiLetter(cp)) { this.state = State.RAWTEXT_END_TAG_NAME; this._stateRawtextEndTagName(cp); } else { this._emitChars('</'); this.state = State.RAWTEXT; this._stateRawtext(cp); } } // RAWTEXT end tag name state //------------------------------------------------------------------ _stateRawtextEndTagName(cp) { if (this.handleSpecialEndTag(cp)) { this._emitChars('</'); this.state = State.RAWTEXT; this._stateRawtext(cp); } } // Script data less-than sign state //------------------------------------------------------------------ _stateScriptDataLessThanSign(cp) { switch (cp) { case unicode_js_1.CODE_POINTS.SOLIDUS: { this.state = State.SCRIPT_DATA_END_TAG_OPEN; break; } case unicode_js_1.CODE_POINTS.EXCLAMATION_MARK: { this.state = State.SCRIPT_DATA_ESCAPE_START; this._emitChars('<!'); break; } default: { this._emitChars('<'); this.state = State.SCRIPT_DATA; this._stateScriptData(cp); } } } // Script data end tag open state //------------------------------------------------------------------ _stateScriptDataEndTagOpen(cp) { if (isAsciiLetter(cp)) { this.state = State.SCRIPT_DATA_END_TAG_NAME; this._stateScriptDataEndTagName(cp); } else { this._emitChars('</'); this.state = State.SCRIPT_DATA; this._stateScriptData(cp); } } // Script data end tag name state //------------------------------------------------------------------ _stateScriptDataEndTagName(cp) { if (this.handleSpecialEndTag(cp)) { this._emitChars('</'); this.state = State.SCRIPT_DATA; this._stateScriptData(cp); } } // Script data escape start state //------------------------------------------------------------------ _stateScriptDataEscapeStart(cp) { if (cp === unicode_js_1.CODE_POINTS.HYPHEN_MINUS) { this.state = 
State.SCRIPT_DATA_ESCAPE_START_DASH; this._emitChars('-'); } else { this.state = State.SCRIPT_DATA; this._stateScriptData(cp); } } // Script data escape start dash state //------------------------------------------------------------------ _stateScriptDataEscapeStartDash(cp) { if (cp === unicode_js_1.CODE_POINTS.HYPHEN_MINUS) { this.state = State.SCRIPT_DATA_ESCAPED_DASH_DASH; this._emitChars('-'); } else { this.state = State.SCRIPT_DATA; this._stateScriptData(cp); } } // Script data escaped state //------------------------------------------------------------------ _stateScriptDataEscaped(cp) { switch (cp) {