// tenko
// Version:
// A "pixel perfect" 100% spec compliant ES2021 JavaScript parser written in JS.
// 1,466 lines (1,318 loc) • 228 kB
// JavaScript
// Char codes aren't really used in the parser (only in the lexer), except for asserts. Inlined for builds.
import {
$$A_61,
$$A_UC_41,
$$B_62,
$$B_UC_42,
$$C_63,
$$C_UC_43,
$$D_64,
$$D_UC_44,
$$E_65,
$$E_UC_45,
$$F_66,
$$F_UC_46,
$$G_67,
$$G_UC_47,
$$H_68,
$$H_UC_48,
$$I_69,
$$I_UC_49,
$$J_6A,
$$J_UC_4A,
$$K_6B,
$$K_UC_4B,
$$L_6C,
$$L_UC_4C,
$$M_6D,
$$M_UC_4D,
$$N_6E,
$$N_UC_4E,
$$O_6F,
$$O_UC_4F,
$$P_70,
$$P_UC_50,
$$Q_71,
$$Q_UC_51,
$$R_72,
$$R_UC_52,
$$S_73,
$$S_UC_53,
$$T_74,
$$T_UC_54,
$$U_75,
$$U_UC_55,
$$V_76,
$$V_UC_56,
$$W_77,
$$W_UC_57,
$$X_78,
$$X_UC_58,
$$Y_79,
$$Y_UC_59,
$$Z_7A,
$$Z_UC_5A,
$$0_30,
$$1_31,
$$2_32,
$$3_33,
$$4_34,
$$5_35,
$$6_36,
$$7_37,
$$8_38,
$$9_39,
$$NULL_00,
$$BACKSPACE_08,
$$TAB_09,
$$LF_0A,
$$VTAB_0B,
$$FF_0C,
$$CR_0D,
$$SPACE_20,
$$EXCL_21,
$$DQUOTE_22,
$$HASH_23,
$$$_24,
$$PERCENT_25,
$$AND_26,
$$SQUOTE_27,
$$PAREN_L_28,
$$PAREN_R_29,
$$STAR_2A,
$$PLUS_2B,
$$COMMA_2C,
$$DASH_2D,
$$DOT_2E,
$$FWDSLASH_2F,
$$COLON_3A,
$$SEMI_3B,
$$LT_3C,
$$IS_3D,
$$GT_3E,
$$QMARK_3F,
$$SQUARE_L_5B,
$$BACKSLASH_5C,
$$SQUARE_R_5D,
$$XOR_5E,
$$LODASH_5F,
$$TICK_60,
$$CURLY_L_7B,
$$CURLY_R_7D,
$$TILDE_7E,
$$OR_7C,
$$NBSP_A0,
$$ZWS_200B,
$$ZWNJ_200C,
$$ZWJ_200D,
$$LS_2029,
$$PS_2028,
$$BOM_FEFF,
} from './charcodes.mjs';
// Utils are only used in dev
import {
ASSERT,
} from './utils.mjs';
// Lexerflags are used to signal state from parser to lexer. And organically grew into a wider signaling system.
import {
INITIAL_LEXER_FLAGS,
LF_NO_FLAGS,
LF_CAN_NEW_DOT_TARGET,
LF_FOR_REGEX,
LF_IN_ASYNC,
LF_IN_CONSTRUCTOR,
LF_IN_FOR_LHS,
LF_IN_FUNC_ARGS,
LF_IN_GENERATOR,
LF_IN_GLOBAL,
LF_IN_ITERATION,
LF_IN_SWITCH,
LF_IN_TEMPLATE,
LF_NO_ASI,
LF_STRICT_MODE,
LF_SUPER_CALL,
LF_SUPER_PROP,
LF_NOT_KEYWORD,
LF_CHAINING,
L,
} from './lexerflags.mjs';
// Token type stuff is put in their own file
import {
getIdentPart,
getStringPart,
getTokenStart,
isWhiteToken,
isNewlineToken,
isCommentToken,
isIdentToken,
isNumberToken,
isBigintToken,
isStringToken,
isPunctuatorToken,
isRegexToken,
isTickToken,
isBadTickToken,
isNumberStringToken,
isNumberStringRegex,
getHexValue,
toktypeToString,
T,
KEYWORD_TRIE_OBJLIT,
MAX_START_VALUE,
$UNTYPED, // 0
$SPACE,
$TAB,
$NL_SOLO,
$NL_CRLF,
$COMMENT_SINGLE,
$COMMENT_MULTI,
$COMMENT_HTML,
$IDENT,
$ID_arguments,
$ID_as,
$ID_async,
$ID_await,
$ID_break,
$ID_case,
$ID_catch,
$ID_class,
$ID_const,
$ID_continue,
$ID_debugger,
$ID_default,
$ID_delete,
$ID_do,
$ID_else,
$ID_enum,
$ID_eval,
$ID_export,
$ID_extends,
$ID_false,
$ID_finally,
$ID_for,
$ID_from,
$ID_function,
$ID_get,
$ID_if,
$ID_implements,
$ID_import,
$ID_in,
$ID_instanceof,
$ID_interface,
$ID_let,
$ID_new,
$ID_null,
$ID_of,
$ID_package,
$ID_private,
$ID_protected,
$ID_public,
$ID_return,
$ID_set,
$ID_static,
$ID_super,
$ID_switch,
$ID_target,
$ID_this,
$ID_throw,
$ID_true,
$ID_try,
$ID_typeof,
$ID_var,
$ID_void,
$ID_while,
$ID_with,
$ID_yield,
$NUMBER_HEX,
$NUMBER_DEC,
$NUMBER_BIN,
$NUMBER_OCT,
$NUMBER_OLD,
$NUMBER_BIG_HEX,
$NUMBER_BIG_DEC,
$NUMBER_BIG_BIN,
$NUMBER_BIG_OCT,
$PUNC_EXCL,
$PUNC_EXCL_EQ,
$PUNC_EXCL_EQ_EQ,
$PUNC_PERCENT,
$PUNC_PERCENT_EQ,
$PUNC_AND,
$PUNC_AND_AND,
$PUNC_AND_AND_EQ,
$PUNC_AND_EQ,
$PUNC_PAREN_OPEN,
$PUNC_PAREN_CLOSE,
$PUNC_STAR,
$PUNC_STAR_STAR,
$PUNC_STAR_EQ,
$PUNC_STAR_STAR_EQ,
$PUNC_PLUS,
$PUNC_PLUS_PLUS,
$PUNC_PLUS_EQ,
$PUNC_COMMA,
$PUNC_MIN,
$PUNC_MIN_MIN,
$PUNC_MIN_EQ,
$PUNC_MIN_MIN_GT,
$PUNC_DOT,
$PUNC_DOT_DOT_DOT,
$PUNC_DIV,
$PUNC_DIV_EQ,
$PUNC_COLON,
$PUNC_SEMI,
$PUNC_LT,
$PUNC_LT_LT,
$PUNC_LT_EQ,
$PUNC_LT_LT_EQ,
$PUNC_LT_EXCL_MIN_MIN,
$PUNC_EQ,
$PUNC_EQ_EQ,
$PUNC_EQ_EQ_EQ,
$PUNC_EQ_GT,
$PUNC_GT,
$PUNC_GT_GT,
$PUNC_GT_GT_GT,
$PUNC_GT_EQ,
$PUNC_GT_GT_EQ,
$PUNC_GT_GT_GT_EQ,
$PUNC_QMARK,
$PUNC_QMARK_DOT,
$PUNC_QMARK_QMARK,
$PUNC_QMARK_QMARK_EQ,
$PUNC_BRACKET_OPEN,
$PUNC_BRACKET_CLOSE,
$PUNC_CARET,
$PUNC_CARET_EQ,
$PUNC_CURLY_OPEN,
$PUNC_OR,
$PUNC_OR_OR,
$PUNC_OR_OR_EQ,
$PUNC_OR_EQ,
$PUNC_CURLY_CLOSE,
$PUNC_TILDE,
$REGEXN,
$REGEXU,
$STRING_SINGLE,
$STRING_DOUBLE,
$TICK_HEAD,
$TICK_BODY,
$TICK_TAIL,
$TICK_PURE,
$TICK_BAD_HEAD,
$TICK_BAD_BODY,
$TICK_BAD_TAIL,
$TICK_BAD_PURE,
$EOF,
$ASI,
$ERROR,
START_SPACE,
START_ID,
START_KEY,
START_NL_SOLO,
START_CR,
START_STRING,
START_DECIMAL,
START_DOT,
START_CURLY_CLOSE,
START_EQ,
START_DIV,
START_PLUS,
START_MIN,
START_ZERO,
START_TEMPLATE,
START_EXCL,
START_PERCENT,
START_AND,
START_STAR,
START_CARET,
START_LT,
START_GT,
START_OR,
START_BSLASH,
START_QMARK,
START_ERROR,
STRING_PART,
STRING_QUOTE,
STRING_BS,
STRING_UNICODE,
STRING_NL,
IDENT_PART,
IDENT_END,
IDENT_BS,
IDENT_UNICODE,
regexAtomEscapeStartJumpTable,
REGATOM_ESC_NONU,
REGATOM_ESC_OK,
REGATOM_ESC_u,
REGATOM_ESC_x,
REGATOM_ESC_UNICODE,
REGATOM_ESC_c,
REGATOM_ESC_pP,
REGATOM_ESC_0,
REGATOM_ESC_123456789,
REGATOM_ESC_k,
REGATOM_ESC_NL,
REGATOM_ESC_WC,
regexClassEscapeStartJumpTable,
REGCLS_ESC_NSC,
REGCLS_ESC_UNICODE,
REGCLS_ESC_u,
REGCLS_ESC_x,
REGCLS_ESC_c,
REGCLS_ESC_k,
REGCLS_ESC_b,
REGCLS_ESC_B,
REGCLS_ESC_f,
REGCLS_ESC_n,
REGCLS_ESC_r,
REGCLS_ESC_t,
REGCLS_ESC_v,
REGCLS_ESC_ERR,
REGCLS_ESC_pP,
REGCLS_ESC_0,
REGCLS_ESC_1234567,
REGCLS_ESC_89,
REGCLS_ESC_SYNTAX,
REGCLS_ESC_DASH,
REGCLS_ESC_NL,
regexAtomJumpTable,
REGEX_ATOM_OTHER,
REGEX_ATOM_DOT,
REGEX_ATOM_QUANT,
REGEX_ATOM_PARENL,
REGEX_ATOM_PARENR,
REGEX_ATOM_SQUAREL,
REGEX_ATOM_SQUARER,
REGEX_ATOM_BSLASH,
REGEX_ATOM_FSLASH,
REGEX_ATOM_XOR,
REGEX_ATOM_DOLLAR,
REGEX_ATOM_UNICODE,
REGEX_ATOM_CURLYL,
REGEX_ATOM_CURLYR,
REGEX_ATOM_OR,
REGEX_ATOM_NL,
stringEscapeStartJumpTable,
STRING_ESC_OK,
STRING_ESC_N,
STRING_ESC_SQ,
STRING_ESC_DQ,
STRING_ESC_U,
STRING_ESC_UNICODE,
STRING_ESC_LF,
STRING_ESC_CR,
STRING_ESC_0,
STRING_ESC_123456789,
STRING_ESC_B,
STRING_ESC_F,
STRING_ESC_R,
STRING_ESC_T,
STRING_ESC_V,
STRING_ESC_X,
HEX_OOB,
// <SCRUB ASSERTS TO COMMENT>
ALL_START_TYPES,
ALL_GEES,
ALL_TOKEN_GROUPS,
ALL_TOKEN_TYPES,
// </SCRUB ASSERTS TO COMMENT>
} from './tokentype.mjs';
// https://tc39.es/ecma262/#table-nonbinary-unicode-properties
// (Manually copied from the spec. Note that the table numbers in the spec are not "fixed", so don't refer to them by number.)
// Each table below is one comma-delimited string that both starts and ends with a comma.
// NOTE(review): presumably membership is tested by searching for `,name,` -- confirm against the lookup code.
// Non-binary unicode property names and their aliases.
const TABLE_NONBIN_UNI_PROPS = ',General_Category,gc,Script,sc,Script_Extensions,scx,';
// Binary unicode property names and their aliases.
const TABLE_BIN_UNI_PROPS = ',ASCII,ASCII_Hex_Digit,AHex,Alphabetic,Alpha,Any,Assigned,Bidi_Control,Bidi_C,Bidi_Mirrored,Bidi_M,Case_Ignorable,CI,Cased,Changes_When_Casefolded,CWCF,Changes_When_Casemapped,CWCM,Changes_When_Lowercased,CWL,Changes_When_NFKC_Casefolded,CWKCF,Changes_When_Titlecased,CWT,Changes_When_Uppercased,CWU,Dash,Default_Ignorable_Code_Point,DI,Deprecated,Dep,Diacritic,Dia,Emoji,Emoji_Component,Emoji_Modifier,Emoji_Modifier_Base,Emoji_Presentation,Extended_Pictographic,Extender,Ext,Grapheme_Base,Gr_Base,Grapheme_Extend,Gr_Ext,Hex_Digit,Hex,IDS_Binary_Operator,IDSB,IDS_Trinary_Operator,IDST,ID_Continue,IDC,ID_Start,IDS,Ideographic,Ideo,Join_Control,Join_C,Logical_Order_Exception,LOE,Lowercase,Lower,Math,Noncharacter_Code_Point,NChar,Pattern_Syntax,Pat_Syn,Pattern_White_Space,Pat_WS,Quotation_Mark,QMark,Radical,Regional_Indicator,RI,Sentence_Terminal,STerm,Soft_Dotted,SD,Terminal_Punctuation,Term,Unified_Ideograph,UIdeo,Uppercase,Upper,Variation_Selector,VS,White_Space,space,XID_Continue,XIDC,XID_Start,XIDS,';
// Value names (and aliases) for the General_Category property.
const TABLE_GEN_CAT_VALUES = ',Cased_Letter,LC,Close_Punctuation,Pe,Connector_Punctuation,Pc,Control,Cc,cntrl,Currency_Symbol,Sc,Dash_Punctuation,Pd,Decimal_Number,Nd,digit,Enclosing_Mark,Me,Final_Punctuation,Pf,Format,Cf,Initial_Punctuation,Pi,Letter,L,Letter_Number,Nl,Line_Separator,Zl,Lowercase_Letter,Ll,Mark,M,Combining_Mark,Math_Symbol,Sm,Modifier_Letter,Lm,Modifier_Symbol,Sk,Nonspacing_Mark,Mn,Number,N,Open_Punctuation,Ps,Other,C,Other_Letter,Lo,Other_Number,No,Other_Punctuation,Po,Other_Symbol,So,Paragraph_Separator,Zp,Private_Use,Co,Punctuation,P,punct,Separator,Z,Space_Separator,Zs,Spacing_Mark,Mc,Surrogate,Cs,Symbol,S,Titlecase_Letter,Lt,Unassigned,Cn,Uppercase_Letter,Lu,';
// Value names (and aliases) for the Script / Script_Extensions properties, as one comma-delimited string that
// starts and ends with a comma. Must stay on a single line: a raw line break inside the quoted literal is a
// syntax error (and would split the `Tai_Tham` entry).
const TABLE_SCRIPT_VALUES = ',Adlam,Adlm,Ahom,Anatolian_Hieroglyphs,Hluw,Arabic,Arab,Armenian,Armn,Avestan,Avst,Balinese,Bali,Bamum,Bamu,Bassa_Vah,Bass,Batak,Batk,Bengali,Beng,Bhaiksuki,Bhks,Bopomofo,Bopo,Brahmi,Brah,Braille,Brai,Buginese,Bugi,Buhid,Buhd,Canadian_Aboriginal,Cans,Carian,Cari,Caucasian_Albanian,Aghb,Chakma,Cakm,Cham,Cherokee,Cher,Common,Zyyy,Coptic,Copt,Qaac,Cuneiform,Xsux,Cypriot,Cprt,Cyrillic,Cyrl,Deseret,Dsrt,Devanagari,Deva,Dogra,Dogr,Duployan,Dupl,Egyptian_Hieroglyphs,Egyp,Elbasan,Elba,Elymaic,Elym,Ethiopic,Ethi,Georgian,Geor,Glagolitic,Glag,Gothic,Goth,Grantha,Gran,Greek,Grek,Gujarati,Gujr,Gunjala_Gondi,Gong,Gurmukhi,Guru,Han,Hani,Hangul,Hang,Hanifi_Rohingya,Rohg,Hanunoo,Hano,Hatran,Hatr,Hebrew,Hebr,Hiragana,Hira,Imperial_Aramaic,Armi,Inherited,Zinh,Qaai,Inscriptional_Pahlavi,Phli,Inscriptional_Parthian,Prti,Javanese,Java,Kaithi,Kthi,Kannada,Knda,Katakana,Kana,Kayah_Li,Kali,Kharoshthi,Khar,Khmer,Khmr,Khojki,Khoj,Khudawadi,Sind,Lao,Laoo,Latin,Latn,Lepcha,Lepc,Limbu,Limb,Linear_A,Lina,Linear_B,Linb,Lisu,Lycian,Lyci,Lydian,Lydi,Mahajani,Mahj,Makasar,Maka,Malayalam,Mlym,Mandaic,Mand,Manichaean,Mani,Marchen,Marc,Medefaidrin,Medf,Masaram_Gondi,Gonm,Meetei_Mayek,Mtei,Mende_Kikakui,Mend,Meroitic_Cursive,Merc,Meroitic_Hieroglyphs,Mero,Miao,Plrd,Modi,Mongolian,Mong,Mro,Mroo,Multani,Mult,Myanmar,Mymr,Nabataean,Nbat,Nandinagari,Nand,New_Tai_Lue,Talu,Newa,Nko,Nkoo,Nushu,Nshu,Nyiakeng_Puachue_Hmong,Hmnp,Ogham,Ogam,Ol_Chiki,Olck,Old_Hungarian,Hung,Old_Italic,Ital,Old_North_Arabian,Narb,Old_Permic,Perm,Old_Persian,Xpeo,Old_Sogdian,Sogo,Old_South_Arabian,Sarb,Old_Turkic,Orkh,Oriya,Orya,Osage,Osge,Osmanya,Osma,Pahawh_Hmong,Hmng,Palmyrene,Palm,Pau_Cin_Hau,Pauc,Phags_Pa,Phag,Phoenician,Phnx,Psalter_Pahlavi,Phlp,Rejang,Rjng,Runic,Runr,Samaritan,Samr,Saurashtra,Saur,Sharada,Shrd,Shavian,Shaw,Siddham,Sidd,SignWriting,Sgnw,Sinhala,Sinh,Sogdian,Sogd,Sora_Sompeng,Sora,Soyombo,Soyo,Sundanese,Sund,Syloti_Nagri,Sylo,Syriac,Syrc,Tagalog,Tglg,Tagbanwa,Tagb,Tai_Le,Tale,Tai_Tham,Lana,Tai_Viet,Tavt,Takri,Takr,Tamil,Taml,Tangut,Tang,Telugu,Telu,Thaana,Thaa,Thai,Tibetan,Tibt,Tifinagh,Tfng,Tirhuta,Tirh,Ugaritic,Ugar,Vai,Vaii,Wancho,Wcho,Warang_Citi,Wara,Yi,Yiii,Zanabazar_Square,Zanb,';
import {
BAD_ESCAPE,
GOOD_ESCAPE,
FOR_NAMED_GROUP,
FOR_K_ESCAPE,
GOAL_MODULE,
GOAL_SCRIPT,
MAX_VALID_UNICODE_VALUE,
REGEX_ALWAYS_GOOD,
REGEX_GOOD_WITH_U_FLAG,
REGEX_GOOD_SANS_U_FLAG,
REGEX_ALWAYS_BAD,
REGEX_GOOD_RUBY_EDGE_CASE,
FIRST_CHAR,
ILLEGAL_UNICODE_ESCAPE,
NON_START,
REGEX_CHARCLASS_BAD,
REGEX_CHARCLASS_ESCAPED_UC_B,
REGEX_CHARCLASS_ESCAPED_C,
REGEX_CHARCLASS_BAD_SANS_U_FLAG,
REGEX_CHARCLASS_BAD_WITH_U_FLAG,
REGEX_CHARCLASS_CLASS_ESCAPE,
REGEX_CHARCLASS_WAS_RUBY,
COLLECT_TOKENS_NONE,
COLLECT_TOKENS_SOLID,
COLLECT_TOKENS_ALL,
COLLECT_TOKENS_TYPES,
WEB_COMPAT_OFF,
WEB_COMPAT_ON,
RETURN_ANY_TOKENS,
RETURN_COMMENT_TOKENS,
RETURN_SOLID_TOKENS,
WHITESPACE_TOKEN,
SOLID_TOKEN,
PARSING_FROM_TICK,
PARSING_SANS_TICK,
FAIL_GRACEFULLY,
FAIL_HARD,
FOR_TEMPLATE,
NOT_TEMPLATE,
CODEPOINT_FROM_ESCAPE,
INVALID_IDENT_CHAR,
VALID_SINGLE_CHAR,
VALID_DOUBLE_CHAR,
REGEX_VALID_CURLY_QUANTIFIER,
REGEX_INVALID_CURLY_QUANTIFIER,
REGEX_PARTIAL_CURLY_QUANTIFIER,
} from './enum_lexer.mjs';
// Memoized `\p{ID_Start}` matcher; stays undefined until first requested.
let ID_START_REGEX = undefined;
// Returns the regex that matches a single ID_Start character, building it through
// createUnicodeRegex() on the first call and reusing that same object afterwards.
function getIdStartRegexSuperSlow() {
  if (!ID_START_REGEX) {
    ID_START_REGEX = createUnicodeRegex('^\\p{ID_Start}$');
  }
  return ID_START_REGEX;
}
// Memoized `\p{ID_Continue}` matcher; stays undefined until first requested.
let ID_CONTINUE_REGEX = undefined;
// Returns the regex that matches a single ID_Continue character, building it through
// createUnicodeRegex() on the first call and reusing that same object afterwards.
function getIdRestRegexSuperSlow() {
  if (!ID_CONTINUE_REGEX) {
    ID_CONTINUE_REGEX = createUnicodeRegex('^\\p{ID_Continue}$');
  }
  return ID_CONTINUE_REGEX;
}
// Compiles `pattern` with the `u` flag.
// If the RegExp constructor throws, a warning is printed and the permissive fallback `/|/` is returned instead
// of propagating the error.
function createUnicodeRegex(pattern) {
  let compiled;
  try {
    compiled = new RegExp(pattern, 'u');
  } catch (e) {
    console.warn('Tenko: Current nodejs version does not suppport unicode regexes or regex property escapes; Input contains unicode that requires it so Tenko is unable to properly parse input (' + e.message + ')');
    compiled = /|/;
  }
  return compiled;
}
function Lexer(
input,
options
) {
const {
targetEsVersion = Infinity,
parsingGoal = GOAL_MODULE,
collectTokens = COLLECT_TOKENS_NONE, // Collect token objects in an array? (Enabling this may slow down parsing!)
returnTokens = RETURN_SOLID_TOKENS, // What to emit and not to emit while lexing
webCompat = WEB_COMPAT_ON,
gracefulErrors = FAIL_HARD,
tokenStorageExternal,
babelTokenCompat = false,
errorCodeFrame = true, // Print a code frame of input with context with errors?
truncCodeFrame = false, // Trunc large input codes to just a few lines around the point of error?
// You can override the logging functions
$log = console.log,
$warn = console.warn,
$error = console.error,
} = options;
const tokenStorage = tokenStorageExternal || (collectTokens !== COLLECT_TOKENS_NONE ? [] : undefined);
ASSERT(typeof input === 'string', 'input string should be string; ' + typeof input);
ASSERT(targetEsVersion !== undefined, 'undefined should become default', targetEsVersion);
ASSERT(typeof targetEsVersion === 'number', 'targetEsVersion should be a number', typeof targetEsVersion);
ASSERT((targetEsVersion >= 6 && targetEsVersion <= 12) || targetEsVersion === Infinity, 'only support v6~12 (ES2015-ES2021) right now [' + targetEsVersion + ','+(typeof targetEsVersion)+']');
// Feature gates derived from `targetEsVersion`. ECMAScript editions are cumulative, so a feature is enabled for
// the edition that introduced it and for every later one. `Infinity` (the default) satisfies every `>=` check.
// v9 (ES2018): regex additions
const supportRegexPropertyEscapes = targetEsVersion >= 9;
const supportRegexLookbehinds = targetEsVersion >= 9;
const supportRegexDotallFlag = targetEsVersion >= 9;
const supportRegexNamedGroups = targetEsVersion >= 9;
// v11 (ES2020): must use `>=`, not `===`, otherwise targeting v12 would switch these off again
const supportBigInt = targetEsVersion >= 11;
const supportNullishCoalescing = targetEsVersion >= 11;
const supportOptionalChaining = targetEsVersion >= 11;
// v12 (ES2021)
const supportLogicCompound = targetEsVersion >= 12;
let pointer = 0;
let len = input.length;
let consumedNewlinesBeforeSolid = false; // whitespace newline token or string token that contained newline or multiline comment
let nlwas = false; // basically the state of consumedNewlinesBeforeSolid before starting the current token
let finished = false; // generated an $EOF?
let lastOffset = pointer; // Value of `pointer` before starting to parse one token
let startForError = 0;
let lastType = 0;
let lastStart = 0;
let lastStop = 0;
let lastLine = 0;
let lastColumn = 0;
let lastCanonizedInput = ''; // updated when parsing ident or string. Contains _unescaped_ input. Used for keyword checks and .value in ast for strings
let lastCanonizedInputLen = 0; // work around an inline cache bug (lastCanonizedInput.length would cause megamorphic deopt for some reason)
let lastPotentialRegexError = ''; // If regex scanner is an error then this is the message. Many errors require flag validation at the end.
let lastReportableLexerError = ''; // Set whenever an $error is or will be returned
let currentLine = 1; // the number of newlines, crlf sensitive (the pair is considered 1 line)
let currentColOffset = 0; // position in the input code of the first character after the last newline
let prevTokenEndColumn = 0;
let prevTokenEndLine = 0;
let prevTokenEndPointer = 0;
let prevTokenSolid = true;
let stale = false; // do NOT read from `cache` when `stale` is true. This is a dev-only assertion based safeguard...
let cache = input.charCodeAt(0);
let anyTokenCount = 0;
let solidTokenCount = 0;
// Returns the char code for the current `pointer`, as held in `cache`.
// Dev asserts check the pointer is inside the input and that `cache` matches the input at `pointer`.
function peek() {
  ASSERT(neof(), 'pointer not oob');
  ASSERT(!arguments.length, 'peek is not expecting args');
  ASSERT(cache === input.charCodeAt(pointer), 'cache should be up to date');
  const current = _readCache();
  return current;
}
// Reads the char code at `pointer` straight from `input`, bypassing `cache`.
// Because it never touches the cache it may be used even while stale=true.
function ASSERT_peekUncached() {
  ASSERT(pointer < len, 'never read oob');
  const code = input.charCodeAt(pointer);
  return code;
}
// Single access point for reading `cache`: funnelling every read through here lets dev mode assert that
// the cache is never read while it is marked stale.
function _readCache() {
  ASSERT(stale === false, 'do NOT read from cache while it is stale ... (meaning the pointer got changed without updating the cache)');
  const cached = cache;
  return cached;
}
// Returns the char code `delta` positions away from `pointer`, read directly from `input` (not from `cache`).
// - delta: non-zero offset; dev asserts require pointer + delta to be inside the input
function peekd(delta) {
  ASSERT(delta, 'jump should be at least something otehrwise use peek()');
  ASSERT(pointer + delta >= 0 && pointer + delta < len, 'pointer not oob');
  ASSERT(arguments.length === 1, 'one args');
  const target = pointer + delta;
  return input.charCodeAt(target);
}
// Tells whether the current char code (as returned by peek()) equals `ord`.
function peeky(ord) {
  ASSERT(neof(), 'pointer not oob');
  ASSERT(arguments.length === 1, 'one args');
  const current = peek();
  return current === ord;
}
// Returns the part of `input` between index `from` (inclusive) and `to` (exclusive).
// Dev asserts require exactly two numeric arguments, each within 0..len.
function slice(from, to) {
  ASSERT(slice.length === arguments.length, 'arg count');
  ASSERT(typeof from === 'number', '`from` should be valid number', from, to);
  ASSERT(typeof to === 'number', '`to` should be valid number', from, to);
  ASSERT(from >= 0 && from <= len, '`from` should be valid index', from, to);
  ASSERT( to >= 0 && to <= len, '`to` should be valid index', from, to);
  const text = input.slice(from, to);
  return text;
}
// Dev-mode wrapper around skipPeek(): first asserts that the char about to be skipped is `c`
// (reading uncached when the cache is stale), then returns whatever skipPeek() returns.
function ASSERT_skipPeek(c) {
  ASSERT(ASSERT_skipPeek.length === arguments.length, 'arg count');
  ASSERT(stale ? ASSERT_peekUncached() === c : peek() === c, 'expecting to skip a particular char', c, stale ? ASSERT_peekUncached() : peek());
  const next = skipPeek();
  return next;
}
// Advances `pointer` by one, refreshes `cache` with the char code at the new position and returns it.
function skipPeek() {
  ASSERT(!arguments.length, 'no args');
  ASSERT(neofd(1), 'pointer should not read oob');
  ASSERT(!(stale = false), '(marking cache fresh so in devmode it wont throw when read)');
  pointer += 1;
  // TODO: not unicode aware... should confirm this with unicode strings. and what about unicode identifiers?
  cache = input.charCodeAt(pointer);
  return cache;
}
// Advances `pointer` by one and refreshes `cache`.
// When the new position is past the end of the input, `cache` is set to 0 and (in dev mode) marked stale,
// so callers must check for eof before reading again.
function skip() {
  ASSERT(!arguments.length, 'no args');
  ASSERT(pointer < len, 'the pointer should not be oob yet, thats a bad smell');
  pointer += 1;
  if (pointer < len) {
    ASSERT(!(stale = false), '(marking cache fresh so in devmode it wont throw when read)');
    cache = input.charCodeAt(pointer);
    return;
  }
  ASSERT(stale = true, '(the cache is stale because we reached the end of the input. any code should check eof before reading the input)');
  cache = 0;
}
// Advances `pointer` by one WITHOUT refreshing `cache`; dev mode marks the cache stale so that any read of it throws.
// Use ASSERT_peekUncached() for peeking in dev assertions while the cache is stale.
function skipFastWithoutUpdatingCache() {
  ASSERT(stale = true, '(marking the cache unsafe, any reads should throw in dev mode while stale)');
  pointer += 1;
}
// True once `pointer` has reached (or passed) the end of the input.
function eof() {
  return len <= pointer;
}
// True when the position `d` chars ahead of `pointer` is at or past the end of the input.
function eofd(d) {
  return len - d <= pointer;
}
// True while `pointer` is still inside the input (the inverse of eof()).
function neof() {
  return len > pointer;
}
// True when `pointer + d` does not exceed the input length.
// NOTE(review): this is not the exact inverse of eofd(d) -- both return true when pointer === len - d.
// Confirm against the callers whether that overlap is intended before changing it.
function neofd(d) {
  return len - d >= pointer;
}
// <SCRUB ASSERTS TO COMMENT>
// Dev-mode variant of skip() that additionally asserts which char is being skipped.
// - chr: the char code expected at the current position
// The surrounding SCRUB markers are build directives; leave them and the one-line ASSERT statements as they are.
function ASSERT_skip(chr) { // these calls are replaced with skip() in a build step
// note: consider this `skip()` in prod
ASSERT(neof(), 'should not be oob before the skip');
ASSERT(arguments.length === 1, 'require explicit char');
ASSERT(peeky(chr), 'skip expecting different char', chr, peek());
// The actual work: advance one char and refresh the cache
skip();
}
// </SCRUB ASSERTS TO COMMENT>
// Lexes forward until a token that has to be handed back has been consumed. The token is recorded through
// createToken() (which updates the last* state) and, depending on `collectTokens`, stored in `tokenStorage`.
// - lexerFlags: bit flags, passed on to jumpTableLexer()
// Whitespace and comments are looped over unless `returnTokens` asks for them.
// Every return path yields undefined: the helper functions whose result is returned do not return a value.
function nextToken(lexerFlags) {
ASSERT(nextToken.length === arguments.length, 'arg count');
ASSERT(!finished, 'should not next() after eof token');
if (prevTokenSolid) {
// Do this at the start because otherwise something like `a \n b` would reset this when forward parsing `b` and
// would cause `a` to be set to the wrong column data.
prevTokenEndColumn = pointer - currentColOffset;
prevTokenEndLine = currentLine;
prevTokenEndPointer = pointer;
prevTokenSolid = false;
}
lastPotentialRegexError = ''; // reset at start of a new token
lastReportableLexerError = ''; // reset at start of a new token
// These vars are relevant for between anything that gets location data in the AST
do {
++anyTokenCount;
// Location of the first char of the token that is about to be lexed
let startCol = pointer - currentColOffset;
let startRow = currentLine;
lastCanonizedInput = '';
lastCanonizedInputLen = 0;
nlwas = consumedNewlinesBeforeSolid; // Do not include the newlines for the token itself unless whitespace (ex: `` throw `\n` ``)
if (eof()) {
// End of input: emit a zero-width $EOF and remember that lexing is done
createToken($EOF, pointer, pointer, startCol, startRow);
finished = true;
return returnSolidToken($EOF, pointer, pointer, startCol, startRow);
}
let start = startForError = pointer; // TODO: see if startForError makes a dent at all
let consumedTokenType = jumpTableLexer(lexerFlags);
ASSERT(consumedTokenType !== undefined, 'should not return undefined');
ASSERT((consumedTokenType>>>0) > 0, 'enum does not have zero', consumedTokenType);
// Non-whitespace tokens always get returned
if (!isWhiteToken(consumedTokenType)) {
createToken(consumedTokenType, start, pointer, startCol, startRow);
return returnSolidToken(consumedTokenType, start, pointer, startCol, startRow);
}
// Babel parity demands comments to be returned... Not sure whether the complexity (over checking $white) is worth
if (isCommentToken(consumedTokenType)) {
if (returnTokens === RETURN_COMMENT_TOKENS) {
createToken(consumedTokenType, start, pointer, startCol, startRow);
return returnCommentToken(consumedTokenType, start, pointer, startCol, startRow);
}
}
// This is a whitespace token (which may be a comment) that is not yet collected.
if (collectTokens === COLLECT_TOKENS_ALL || collectTokens === COLLECT_TOKENS_TYPES) {
createToken(consumedTokenType, start, pointer, startCol, startRow);
tokenStorage.push(collectTokens === COLLECT_TOKENS_TYPES ? consumedTokenType : createBaseToken(consumedTokenType, start, pointer, startCol, startRow, false));
}
if (returnTokens === RETURN_ANY_TOKENS) {
return createToken(consumedTokenType, start, pointer, startCol, startRow);
}
// At this point it has to be some form of whitespace and we're clearly not returning it so we can
// safely skip any number of whitespaces, which are very likely to appear in sequence
// (Both fast paths are only taken when whitespace tokens are not being collected, since they consume
// input without creating tokens.)
if (consumedTokenType === $COMMENT_SINGLE) {
// Either this is EOF or the next token must be a newline
if (collectTokens !== COLLECT_TOKENS_ALL && collectTokens !== COLLECT_TOKENS_TYPES) skipNewlinesWithoutTokens();
} // do not `else`
if (nlwas === true) {
if (collectTokens !== COLLECT_TOKENS_ALL && collectTokens !== COLLECT_TOKENS_TYPES) skipSpacesWithoutTokens();
}
} while (true);
ASSERT(false, 'unreachable');
}
// Bookkeeping for a comment token that is being returned to the caller: stores it in `tokenStorage` when
// whitespace tokens are collected (the bare type for COLLECT_TOKENS_TYPES, a token object for COLLECT_TOKENS_ALL).
// All five arguments are numbers; `pointer` here is the stop offset of the token. Returns nothing.
function returnCommentToken(consumedTokenType, start, pointer, startCol, startRow) {
  ASSERT(returnCommentToken.length === arguments.length, 'arg count');
  ASSERT(typeof consumedTokenType === 'number', 'our types are nums');
  ASSERT(typeof start === 'number', 'our locs are nums');
  ASSERT(typeof pointer === 'number', 'our locs are nums');
  ASSERT(typeof startCol === 'number', 'our locs are nums');
  ASSERT(typeof startRow === 'number', 'our locs are nums');
  if (collectTokens === COLLECT_TOKENS_TYPES) {
    tokenStorage.push(consumedTokenType);
  } else if (collectTokens === COLLECT_TOKENS_ALL) {
    tokenStorage.push(createBaseToken(consumedTokenType, start, pointer, startCol, startRow, false));
  }
}
// Bookkeeping for a non-whitespace token that is being returned to the caller: counts it, stores it in
// `tokenStorage` unless collecting is off (the bare type for COLLECT_TOKENS_TYPES, otherwise a token object
// carrying the "newline before this token" state), then resets that newline state and marks the previous
// token as solid. All five arguments are numbers; `pointer` here is the stop offset. Returns nothing.
function returnSolidToken(consumedTokenType, start, pointer, startCol, startRow) {
  ASSERT(returnSolidToken.length === arguments.length, 'arg count');
  ASSERT(typeof consumedTokenType === 'number', 'our types are nums');
  ASSERT(typeof start === 'number', 'our locs are nums');
  ASSERT(typeof pointer === 'number', 'our locs are nums');
  ASSERT(typeof startCol === 'number', 'our locs are nums');
  ASSERT(typeof startRow === 'number', 'our locs are nums');
  solidTokenCount += 1;
  if (collectTokens === COLLECT_TOKENS_TYPES) {
    tokenStorage.push(consumedTokenType);
  } else if (collectTokens !== COLLECT_TOKENS_NONE) {
    // The token object must capture the newline state before it is reset below
    tokenStorage.push(createBaseToken(consumedTokenType, start, pointer, startCol, startRow, consumedNewlinesBeforeSolid));
  }
  prevTokenSolid = true;
  consumedNewlinesBeforeSolid = false;
}
// Consumes a run of spaces and tabs without creating tokens for them.
// Stops at the first other char or at the end of the input.
function skipSpacesWithoutTokens() {
  while (neof()) {
    const current = peek();
    const isBlank = current === $$SPACE_20 || current === $$TAB_09;
    if (!isBlank) break;
    skip();
  }
}
// Consumes a run of LF / CR / CRLF newlines without creating tokens, keeping the line bookkeeping up to date.
// Stops at the first char that is neither LF nor CR, or at the end of the input.
function skipNewlinesWithoutTokens() {
  while (neof()) {
    switch (peek()) {
      case $$LF_0A:
        skip();
        incrementLine();
        break;
      case $$CR_0D:
        skip();
        parseCR(); // crlf is relevant so skip carefully
        break;
      default:
        return;
    }
  }
}
// Consumes exactly one token (of any kind) starting at `pointer` and returns its token type.
// - lexerFlags: bit flags; LF_NOT_KEYWORD and LF_IN_TEMPLATE are checked here, the rest is passed on to the
//   specific token parsers.
// The first char is always consumed here, so every parse* function called below starts after that char.
// Calls THROW('Unknown input', ...) if the start value of the char matches none of the cases.
function jumpTableLexer(lexerFlags) {
ASSERT(jumpTableLexer.length === arguments.length, 'arg count');
ASSERT(typeof lexerFlags === 'number', 'lexerFlags bit flags', lexerFlags);
ASSERT(pointer < len, 'pointer should not be oob here');
// This creates one token of any kind that is valid in JS.
// Take the first char, look it up in an array of 126 entries (aka jump table) and it tells you either the type
// of the token that it must be, or a hint for the type of token that it may become.
// Then either return the token type, or use the hint in a switch (the hint will be zero to n) and properly slice it
let c = peek();
skip();
// Anything above `~` (0x7e) is not looked up through getTokenStart() but handled by the unicode path
if (c > 0x7e) {
return parseOtherUnicode(c);
}
let s = getTokenStart(c);
if (s > MAX_START_VALUE) {
// This means c must be a single char token, like `(` or `:`
return s;
}
// It is important to note that each case is incremental from zero to n. This should lead to the switch being
// optimized to a jump table with O(1) lookup. (TODO: verify this is the case. Add a test to prevent regressions.)
switch (s) {
case START_SPACE:
return parseSpace();
case START_ID:
return parseIdentifierRest(String.fromCharCode(c), 1);
case START_KEY:
// With LF_NOT_KEYWORD set the keyword lookup is skipped and the input is lexed as a regular identifier
if ((lexerFlags & LF_NOT_KEYWORD) === LF_NOT_KEYWORD) return parseIdentifierRest(String.fromCharCode(c), 1);
return parsePotentialKeywordTrieMap(c);
case START_NL_SOLO:
return parseNewlineSolo();
case START_CR:
return parseCR(); // cr crlf
case START_STRING:
return parseAnyString(c, lexerFlags);
case START_DECIMAL:
return parseDecimal();
case START_DOT:
return parseLeadingDot();
case START_CURLY_CLOSE:
// With LF_IN_TEMPLATE set, a `}` continues a template string instead of being a punctuator
if ((lexerFlags & LF_IN_TEMPLATE) === LF_IN_TEMPLATE) return parseTemplateString(lexerFlags, PARSING_SANS_TICK);
return $PUNC_CURLY_CLOSE;
case START_EQ:
return parseEqual(); // = == === =>
case START_DIV:
return parseFwdSlash(lexerFlags); // / /= //.. /*..*/
case START_PLUS:
return parseSameOrCompound($$PLUS_2B); // + ++ +=
case START_MIN:
return parseDash(); // - -- -= -->
case START_ZERO:
return parseLeadingZero(lexerFlags);
case START_TEMPLATE:
return parseTemplateString(lexerFlags, PARSING_FROM_TICK);
case START_EXCL:
return parseExcl(); // != !==
case START_PERCENT:
return parseCompoundAssignment($$PERCENT_25); // % %=
case START_AND:
return parseSameOrCompound($$AND_26); // & && &=
case START_STAR:
return parseStar(); // * *= ** **=
case START_CARET:
return parseCompoundAssignment($$XOR_5E); // ^ ^=
case START_LT:
return parseLt(); // < << <= <<= <!--
case START_GT:
return parseGtPunctuator(); // > >> >>> >= >>= >>>=
case START_OR:
return parseSameOrCompound($$OR_7C); // | || |=
case START_BSLASH:
return parseBackslash(); // An ident that starts with a unicode escape can be valid
case START_QMARK:
return parseQmark();
}
// The reported range covers the one char that was consumed at the top of this function
THROW('Unknown input', pointer - 1, pointer);
}
// Line bookkeeping for a newline that was just consumed: bumps `currentLine`, makes the current `pointer` the
// start offset of the new line and flags that a newline was seen before the next solid token.
// Call this function AFTER consuming the newline(s) that triggered it.
function incrementLine() {
  ASSERT(incrementLine.length === arguments.length, 'arg count');
  ASSERT(pointer > 0 && input.charCodeAt(pointer-1) === $$CR_0D || isLfPsLs(input.charCodeAt(pointer-1)), 'should have just consumed a newline');
  currentColOffset = pointer;
  currentLine += 1;
  consumedNewlinesBeforeSolid = true;
}
// Registers an $ASI token. When tokens are collected, the entry is inserted _before_ the most recently stored
// token (that should be the "offending" token). The ASI is counted as both an "any" and a "solid" token.
// Note: createToken() is not called here, so the last* token state is left untouched.
// (are asi's whitespace? i dunno. they're kinda special so maybe.)
function addAsi() {
  if (collectTokens !== COLLECT_TOKENS_NONE) {
    const asiEntry = collectTokens === COLLECT_TOKENS_TYPES
      ? $ASI
      : createBaseToken($ASI, pointer, pointer, pointer - currentColOffset, currentLine, false);
    const offendingEntry = tokenStorage.pop();
    tokenStorage.push(asiEntry, offendingEntry);
  }
  anyTokenCount += 1;
  solidTokenCount += 1; // eh... i guess.
  prevTokenSolid = true;
}
// Records the token that was just lexed in the last* variables; no token object is created here.
// - type: token type enum value (dev mode asserts it is one of ALL_TOKEN_TYPES)
// - start, stop: input offsets of the token
// - column, line: location of the first char of the token
function createToken(type, start, stop, column, line) {
  ASSERT(createToken.length === arguments.length, 'arg count');
  ASSERT(ALL_TOKEN_TYPES.includes(type) || console.log('####\n' + getErrorContext(start, stop, 'bad type')), 'the set of generated token types is fixed. New ones combinations should be part of this set');
  ASSERT(Number.isFinite(start), 'start finite');
  ASSERT(Number.isFinite(stop), 'stop finite');
  ASSERT(Number.isFinite(column), 'col finite');
  ASSERT(Number.isFinite(line), 'line finite');
  ASSERT(typeof type === 'number', 'type is enum');
  ASSERT(typeof lastCanonizedInput !== 'string' || lastCanonizedInput.length === lastCanonizedInputLen, 'the len cache should be equal to the canonized string len itself (thats the point)');
  // Location first, then the span, then the type; the assignments are independent of each other
  lastColumn = column;
  lastLine = line;
  lastStop = stop;
  lastStart = start;
  lastType = type;
}
// Creates the token object that is stored in `tokenStorage`.
// - type: token type enum value
// - start: input offset of the first char of the token
// - stop: input offset right after the token (start of the next token)
// - column, line: location of the first char of the token
// - nl: whether a newline was seen between this token and the previous solid token
// With `babelTokenCompat` enabled the object gets a `loc` with start/end positions instead of `nl`.
function createBaseToken(type, start, stop, column, line, nl) {
  ASSERT(createBaseToken.length === arguments.length, 'arg count');
  ASSERT(typeof type === 'number', 'our types are nums');
  ASSERT(typeof start === 'number', 'our locs are nums');
  ASSERT(typeof stop === 'number', 'our locs are nums');
  ASSERT(typeof column === 'number', 'our locs are nums');
  ASSERT(typeof line === 'number', 'our locs are nums');
  ASSERT(typeof nl === 'boolean', 'nl is or is not');
  if (babelTokenCompat) {
    return {
      type,
      start,
      stop, // start of next token
      loc: { // Tenko does not use this
        start: {
          line: line,
          column: column,
        },
        end: {
          line: currentLine,
          // `currentColOffset` is the input offset where the current line starts, not a column, so the
          // end column is the distance from that offset (same formula as every start column in this file)
          column: stop - currentColOffset,
        },
      },
      column, // of first char of token (we still have to set this as Tenko uses this)
      line, // of first char of token (we still have to set this as Tenko uses this)
    };
  }
  return {
    type,
    start,
    stop, // start of next token
    column, // of first char of token
    line, // of first char of token
    nl, // If true, this was a non-whitespace token, and there was a newline between this and the previous one
  };
}
// Lexes a token that started with a dot (the dot itself was already consumed).
// Hands off to parseTripleDot() when another dot follows, to parseNumberFromDot() when an ascii digit follows,
// and otherwise returns $PUNC_DOT.
function parseLeadingDot() {
  if (eof()) return $PUNC_DOT; // will lead to an error in the parser
  const next = peek();
  if (next === $$DOT_2E) return parseTripleDot();
  return isAsciiNumber(next) ? parseNumberFromDot(next) : $PUNC_DOT;
}
// Called after one dot was consumed while `pointer` is at a second dot (not yet consumed).
// Returns $PUNC_DOT_DOT_DOT after consuming both remaining dots when a third dot follows. Otherwise nothing
// more is consumed and the single dot that was already consumed is returned as $PUNC_DOT, leaving the
// second dot for the next token.
function parseTripleDot() {
  // The explicit bounds check keeps `..` at the very end of the input from peeking past the input
  if (pointer + 1 < len && peekd(1) === $$DOT_2E) {
    ASSERT_skip($$DOT_2E);
    ASSERT_skip($$DOT_2E);
    return $PUNC_DOT_DOT_DOT;
  }
  // Just two dots: this will ultimately lead to an error in the parser
  return $PUNC_DOT;
}
// Lexes a decimal number that starts with a dot, like `.5`. The dot was already consumed.
// - c: char code of the first digit after the dot, which is still to be consumed
// Always returns $NUMBER_DEC.
function parseNumberFromDot(c) {
  ASSERT_skip(c);
  if (neof()) {
    // Remaining digits first, then an optional exponent (which receives the result of the digit skipper)
    const afterDigits = skipDigitsWithSeparators(true);
    parseExponentMaybe(afterDigits);
  }
  verifyCharAfterNumber();
  return $NUMBER_DEC;
}
// Returns the token type for a space. Nothing is consumed here; jumpTableLexer() already consumed the
// space before dispatching to this function.
function parseSpace() {
// For non-minified code it is very likely that a space is followed by another space
// (Note: no such run is skipped here; only the one token type is returned.)
return $SPACE;
}
// Lexes a newline that started with a CR (the CR was already consumed).
// A directly following LF is consumed too, so a CRLF pair counts as a single line.
// Returns $NL_CRLF for the pair and $NL_SOLO for a lone CR.
function parseCR() {
  const followedByLf = neof() && peeky($$LF_0A);
  if (followedByLf) {
    ASSERT_skip($$LF_0A);
  }
  // incrementLine() has to run after the whole newline was consumed
  incrementLine();
  return followedByLf ? $NL_CRLF : $NL_SOLO;
}
// Lexes the remainder of a string literal. The opening quote was already consumed by jumpTableLexer().
// - marker: char code of the opening quote; the string ends at the next unescaped occurrence of it
// - lexerFlags: bit flags, passed on to the escape parser
// Returns $STRING_DOUBLE or $STRING_SINGLE for a valid string. Returns $ERROR (with lastReportableLexerError
// set, unless it already was) for an illegal escape, a newline inside the string, or an unclosed string at EOF.
// The unescaped string content is appended to lastCanonizedInput / lastCanonizedInputLen along the way.
function parseAnyString(marker, lexerFlags) {
ASSERT(parseAnyString.length === arguments.length, 'need 3 args');
ASSERT(typeof lexerFlags === 'number', 'lexerFlags number');
// Start of the pending chunk of raw input that still has to be appended to the canonized value
let pointerOffset = pointer;
let badEscape = false;
let hadNewline = false;
while (neof()) {
// Peek: while we will want to consume at least one more byte for proper strings,
// there could be a malformed string and we wouldnt want to consume the newline
let c = peek();
let s = getStringPart(c);
if (s <= MAX_START_VALUE) {
// NOTE(review): nothing is consumed when this condition is false, so the loop relies on getStringPart()
// only returning values that pass this check -- confirm.
// Note: these cases should be ordered 0, 1, 2 ...
// TODO: we can skip() before the loop if we update the backslash consumer
switch (s) {
case STRING_PART:
ASSERT_skip(c);
break;
case STRING_QUOTE:
ASSERT_skip(c);
// Only the same quote as the opening one closes the string; the other kind is regular content
if (c === marker) {
if (badEscape) {
if (!lastReportableLexerError) lastReportableLexerError = 'String had an illegal escape';
return $ERROR;
}
// Note: LS and PS are newlines that are _explicitly_ allowed in a string (see STRING_UNICODE below),
// so only the newlines flagged by the STRING_NL case are an error here
if (hadNewline) {
if (!lastReportableLexerError) lastReportableLexerError = 'Encountered newline in string which is not allowed';
return $ERROR;
}
// Append the last pending chunk, excluding the closing quote that was just consumed
lastCanonizedInput += slice(pointerOffset, pointer - 1);
lastCanonizedInputLen += (pointer - 1) - pointerOffset;
return marker === $$DQUOTE_22 ? $STRING_DOUBLE : $STRING_SINGLE;
}
break;
case STRING_BS:
// Flush the pending chunk up to (not including) the backslash, then let the escape parser consume the escape
lastCanonizedInput += slice(pointerOffset, pointer);
lastCanonizedInputLen += pointer - pointerOffset;
// The canonized value will be updated too
badEscape = parseStringOrTemplateEscape(lexerFlags, NOT_TEMPLATE) === BAD_ESCAPE || badEscape;
pointerOffset = pointer;
break;
case STRING_UNICODE:
ASSERT_skip(c);
if (c <= $$LS_2029 && c >= $$PS_2028) {
// (Increment after consumption as that's what incrementLine expects and asserts)
// Note: this is not an error but it does increase the line counter
incrementLine();
}
break;
case STRING_NL:
// Keep consuming so the whole malformed string becomes one token; the error is reported at the closing quote
ASSERT_skip(c);
hadNewline = true;
break;
// <SCRUB ASSERTS>
default:
ASSERT(false, 'unreachable', c);
// </SCRUB ASSERTS>
}
}
}
ASSERT(eof(), 'this is only reachable in the early EOF case');
if (!lastReportableLexerError) lastReportableLexerError = 'Unclosed string at EOF';
return $ERROR;
}
// Lex one backslash escape; `pointer` must be at the backslash when this is called.
// - lexerFlags: forwarded to the digit escape parser (which reads the strict mode bit from it)
// - forTemplate: forwarded to the digit escape parser
// Returns `GOOD_ESCAPE` or `BAD_ESCAPE`; the `\x` and digit escapes return whatever their own parser returns.
// Side effects: consumes the escape, appends its value to `lastCanonizedInput` / `lastCanonizedInputLen`
// (line continuations append nothing), and bumps the line counter for escaped newlines.
function parseStringOrTemplateEscape(lexerFlags, forTemplate) {
  ASSERT(arguments.length === parseStringOrTemplateEscape.length, 'need args');
  ASSERT(typeof lexerFlags === 'number', 'lexerFlags number');

  ASSERT_skip($$BACKSLASH_5C);
  if (eof()) {
    // There is nothing left to escape
    if (!lastReportableLexerError) lastReportableLexerError = 'Backslash at end of input';
    return BAD_ESCAPE;
  }

  // At least one char is always consumed after the backslash
  const c = peek();
  skip();

  // The jump table is only consulted for chars up to 0x7e, everything above is treated as "unicode"
  let kind;
  if (c > 0x7e) kind = STRING_ESC_UNICODE;
  else kind = stringEscapeStartJumpTable[c];

  // Value of a single-character escape. Cases that set it `break` and share the append below the switch,
  // all other cases return directly.
  // Note: only `\u`, `\x`, digits, and newlines need real work here
  let unescaped;
  switch (kind) {
    case STRING_ESC_OK:
      // Not a special escape, the char stands for itself
      unescaped = String.fromCharCode(c);
      break;
    case STRING_ESC_N:
      unescaped = '\n';
      break;
    case STRING_ESC_SQ:
      unescaped = '\'';
      break;
    case STRING_ESC_DQ:
      unescaped = '"';
      break;
    case STRING_ESC_U: {
      if (eof()) return BAD_ESCAPE;
      const codePoint = parseUnicodeEscapeForNonRegex();
      if (codePoint === ILLEGAL_UNICODE_ESCAPE) return BAD_ESCAPE;
      if (codePoint > 0xffff) {
        // Takes two code units in the canonized value
        lastCanonizedInput += String.fromCodePoint(codePoint);
        lastCanonizedInputLen += 2;
      } else {
        lastCanonizedInput += String.fromCharCode(codePoint);
        lastCanonizedInputLen += 1;
      }
      return GOOD_ESCAPE;
    }
    case STRING_ESC_X:
      return parseStringEscapeHex();
    case STRING_ESC_UNICODE:
      if (c === $$PS_2028 || c === $$LS_2029) {
        // Counted as a line and, like the CR and LF continuations, adds nothing to the canonized value
        incrementLine();
        return GOOD_ESCAPE;
      }
      unescaped = String.fromCharCode(c);
      break;
    case STRING_ESC_T:
      unescaped = '\t';
      break;
    case STRING_ESC_R:
      unescaped = '\r';
      break;
    case STRING_ESC_CR:
      // Line continuation. Does not add anything to `lastCanonizedInput`
      // Edge case: a backslash followed by CRLF is a single line continuation
      if (neof() && peeky($$LF_0A)) ASSERT_skip($$LF_0A);
      incrementLine();
      return GOOD_ESCAPE;
    case STRING_ESC_LF:
      // Line continuation. Does not add anything to `lastCanonizedInput`
      incrementLine();
      return GOOD_ESCAPE;
    case STRING_ESC_0:
    case STRING_ESC_123456789:
      return parseStringEscapeOctalOrDigit(c, forTemplate, lexerFlags);
    case STRING_ESC_B:
      unescaped = '\b';
      break;
    case STRING_ESC_F:
      unescaped = '\f';
      break;
    case STRING_ESC_V:
      unescaped = '\v';
      break;
    // <SCRUB ASSERTS>
    default:
      return ASSERT(false, 'unreachable', c);
    // </SCRUB ASSERTS>
  }

  // Only the single-character escapes get here
  lastCanonizedInput += unescaped;
  ++lastCanonizedInputLen; // Always 1 char
  return GOOD_ESCAPE;
}
// Consume a run of `0` chars starting at the current position; the caller must have ruled out EOF.
// Returns the char code of the first char that is not a `0` (that char is not consumed), or 0 if the
// input ended right after the zeroes.
function skipZeroes() {
  ASSERT(neof(), 'should already been checked');

  for (let ord = peek(); ; ord = peek()) {
    if (ord !== $$0_30) return ord;
    ASSERT_skip($$0_30);
    // Nothing left to peek at
    if (eof()) return 0;
  }
}
// Lex the two hex digits of a `\x` escape (presumably the `\x` itself was consumed by the caller; confirm).
// Returns `GOOD_ESCAPE` when two hex digits were consumed, and also when already at EOF (the error is
// left to be reported elsewhere). Returns `BAD_ESCAPE` when only one char is left or when either of the
// next two chars is not a hex digit; in those cases nothing is consumed.
function parseStringEscapeHex() {
  if (eofd(1)) {
    if (!eof()) {
      // Exactly one char left, that can never be a complete hex escape
      if (!lastReportableLexerError) lastReportableLexerError = 'Not enough of input left to create valid hex escape';
      return BAD_ESCAPE;
    }
    return GOOD_ESCAPE; // Let it error somewhere else
  }

  const hiOrd = peek();
  const loOrd = peekd(1);
  const hi = getHexValue(hiOrd);
  const lo = getHexValue(loOrd);

  // Both digits are checked in one go by or-ing their values together
  if ((hi | lo) < HEX_OOB) {
    // Only now that the escape is known to be valid are the digits consumed
    ASSERT_skip(hiOrd);
    ASSERT_skip(loOrd);
    lastCanonizedInput += String.fromCharCode((hi << 4) | lo);
    ++lastCanonizedInputLen; // Always 1 char
    return GOOD_ESCAPE;
  }

  // '\xz' should have 'xz' as canonized value (the `z` is not consumed here)
  lastCanonizedInput += 'x';
  ++lastCanonizedInputLen;
  if (!lastReportableLexerError) lastReportableLexerError = 'At least one of the two hex characters were not hex character (0-9a-f)';
  return BAD_ESCAPE;
}
// Lex the rest of an escape that started with a decimal digit (presumably that digit was already consumed
// by the caller; confirm).
// - a: char code of the digit that followed the backslash
// - forTemplate: truthy when lexing a template, where octal escapes are never allowed
// - lexerFlags: only the LF_STRICT_MODE bit is inspected
// Returns `GOOD_ESCAPE` or `BAD_ESCAPE`. A good escape appends its value to `lastCanonizedInput`, except
// when EOF is hit halfway (that is left to be reported elsewhere).
//
// Rules implemented here:
// - `\8` and `\9` are always rejected here. The grammar excludes them:
//   > SingleStringCharacter -> `\` EscapeSequence -> CharacterEscapeSequence -> NonEscapeCharacter ->
//   > SourceCharacter but not one of EscapeCharacter or LineTerminator -> (EscapeCharacter:) DecimalDigit
//   Tagged templates may contain them but the caller sorts that out based on the `BAD_ESCAPE` result.
// - Legacy octal escapes (`\1` ~ `\7`, and `\0` followed by an octal digit) are only accepted in sloppy
//   mode strings with web compat enabled. Everywhere else they are a bad escape.
// - Nul edge case: with octals enabled `\08` and `\09` are a valid nul escape followed by a digit. With
//   octals disabled a nul escape may not be followed by any decimal digit.
// - It is not known here whether a template is tagged, so a bad escape in a template is simply reported
//   as `BAD_ESCAPE`.
function parseStringEscapeOctalOrDigit(a, forTemplate, lexerFlags) {
  ASSERT(arguments.length === parseStringEscapeOctalOrDigit.length, 'need args');
  ASSERT(typeof a === 'number', 'first digit ord');
  ASSERT(typeof lexerFlags === 'number', 'lexerFlags number');

  if (a === $$8_38 || a === $$9_39) {
    if (!lastReportableLexerError) lastReportableLexerError = 'The grammar does not allow to escape the 8 or the 9 character';
    return BAD_ESCAPE;
  }

  if (eof()) return GOOD_ESCAPE; // Will error somewhere else

  const b = peek();
  const isStrict = (lexerFlags & LF_STRICT_MODE) === LF_STRICT_MODE;

  if (webCompat === WEB_COMPAT_OFF || forTemplate || isStrict) {
    // Octal escapes are not available, so the only valid outcome is a nul that is not followed by a digit
    // [v]: `"\0"`
    // [v]: `"\0x"`
    if (a === $$0_30 && (b < $$0_30 || b > $$9_39)) {
      lastCanonizedInput += '\0';
      ++lastCanonizedInputLen;
      return GOOD_ESCAPE;
    }

    // [x]: `"\07"`
    // [x]: `"\08"`
    // [x]: `"\1"`
    let reason;
    if (forTemplate) {
      reason = 'Illegal legacy octal escape in template, where octal escapes are never allowed';
    } else if (isStrict) {
      reason = 'Illegal legacy octal escape in strict mode';
    } else {
      ASSERT(webCompat === WEB_COMPAT_OFF);
      reason = 'Octal escapes are only allowed in sloppy mode with web compat enabled';
    }
    if (!lastReportableLexerError) lastReportableLexerError = reason;
    return BAD_ESCAPE;
  }

  // From here on octal escapes are allowed. The escape is one to three octal digits with a max of `\377`.

  if (b < $$0_30 || b > $$7_37) {
    // Single digit escape. This includes the nul escape, which in this mode may be followed by an 8 or 9
    // [v]: `"\0"`
    // [v]: `"\0x"`
    // [v]: `"\08"`
    // [v]: `"\7x"`
    lastCanonizedInput += String.fromCharCode(parseInt(String.fromCharCode(a), 8));
    ++lastCanonizedInputLen; // Always 1 char
    return GOOD_ESCAPE;
  }

  ASSERT_skip(b);
  if (eof()) return GOOD_ESCAPE; // Will error somewhere else

  let octalDigits = String.fromCharCode(a, b);
  // A third digit would push the value past 0377 when the first digit is 4 or higher
  if (a <= $$3_33) {
    const c = peek();
    if (c >= $$0_30 && c <= $$7_37) {
      ASSERT_skip(c);
      octalDigits += String.fromCharCode(c);
    }
  }

  lastCanonizedInput += String.fromCharCode(parseInt(octalDigits, 8));
  ++lastCanonizedInputLen; // Always 1 char
  return GOOD_ESCAPE;
}
// Lex a token that starts with a dash; the first dash is already consumed. The result is one of
// - minus (either op or unary)
// - update (`--`)
// - compound assignment (`-=`)
// - html comment close (`-->`)
//
// https://tc39.github.io/ecma262/#sec-html-like-comments
// The html comment close is a web compat extension that is not allowed for the Module goal.
// There are two valid ways of closing an html comment;
// - <a multi-line comment that contains at least one newline> <space>* <html close>
// - <newline> <space>* <html close>
// TODO: and properly parse this, not like the duplicate hack it is now
function parseDash() {
  const atHtmlClose =
    parsingGoal === GOAL_SCRIPT &&
    webCompat === WEB_COMPAT_ON &&
    !eofd(1) &&
    peeky($$DASH_2D) &&
    peekd(1) === $$GT_3E;

  if (atHtmlClose && consumedNewlinesBeforeSolid === true) {
    return parseCommentHtmlClose();
  }

  // Note that a `-->` without a preceding newline is not picked up as a comment; it is lexed as `--`
  // followed by `>`.
  // TODO: do we report this anywhere? This isn't an error but most likely end up being one
  return parseSameOrCompound($$DASH_2D); // - -- -=
}
// Lex the rest of a punctuator that starts with `+`, `-`, `&`, or `|`; that first char is already consumed.
// For the dash the caller has already confirmed that this is not `-->`.
// - c: char code of the consumed char, one of $$PLUS_2B, $$DASH_2D, $$AND_26, $$OR_7C
// Returns the `$PUNC_*` token type for one of
// - the plain operator:           `+`  `-`  `&`  `|`
// - the doubled operator:         `++` `--` `&&` `||`
// - the compound assignment:      `+=` `-=` `&=` `|=`
// - the logical compound (`&&=`, `||=`), which calls THROW instead when `supportLogicCompound` is off
// NOTE(review): `||=` is reported as `$PUNC_OR_OR_EQ`. Confirm that this name is imported from the token
// type module next to `$PUNC_AND_AND_EQ`.
function parseSameOrCompound(c) {
  ASSERT(parseSameOrCompound.length === arguments.length, 'arg count');
  ASSERT(c === $$PLUS_2B || c === $$DASH_2D || c === $$AND_26 || c === $$OR_7C, 'parseSameOrCompound c is enum');

  if (neof()) {
    const d = peek();

    if (d === c) {
      ASSERT_skip(c); // @@
      switch (c) {
        case $$PLUS_2B:
          return $PUNC_PLUS_PLUS;
        case $$DASH_2D:
          return $PUNC_MIN_MIN;
        case $$AND_26:
          if (neof() && peeky($$IS_3D)) {
            ASSERT_skip($$IS_3D);
            if (supportLogicCompound) return $PUNC_AND_AND_EQ;
            // The token is three chars long and `pointer` is now right behind it
            return THROW('The logical compound operator (`&&=`) is only supported since ES2021, currently targeting a lower version', pointer - 3, pointer);
          }
          return $PUNC_AND_AND;
        case $$OR_7C:
          if (neof() && peeky($$IS_3D)) {
            ASSERT_skip($$IS_3D);
            // This must be the `||=` type, not the `&&=` type
            if (supportLogicCompound) return $PUNC_OR_OR_EQ;
            // The token is three chars long and `pointer` is now right behind it
            return THROW('The logical compound operator (`||=`) is only supported since ES2021, currently targeting a lower version', pointer - 3, pointer);
          }
          return $PUNC_OR_OR;
        // <SCRUB ASSERTS>
        default:
          return ASSERT(false, 'unreachable, c is one of four enum', c);
        // </SCRUB ASSERTS>
      }
    }

    if (d === $$IS_3D) {
      ASSERT_skip($$IS_3D); // @=
      switch (c) {
        case $$PLUS_2B:
          return $PUNC_PLUS_EQ;
        case $$DASH_2D:
          return $PUNC_MIN_EQ;
        case $$AND_26:
          return $PUNC_AND_EQ;
        case $$OR_7C:
          return $PUNC_OR_EQ;
        // <SCRUB ASSERTS>
        default:
          return ASSERT(false, 'unreachable, c is one of four enum', c);
        // </SCRUB ASSERTS>
      }
    }
  }

  // At EOF, or followed by something unrelated: the operator stands on its own and nothing more is consumed
  switch (c) {
    case $$PLUS_2B:
      return $PUNC_PLUS;
    case $$DASH_2D:
      return $PUNC_MIN;
    case $$AND_26:
      return $PUNC_AND;
    case $$OR_7C:
      return $PUNC_OR;
    // <SCRUB ASSERTS>
    default:
      return ASSERT(false, 'unreachable, c is one of four enum', c);
    // </SCRUB ASSERTS>
  }
}
function parseTemplateString(lexerFlags, fromTick) {
// parseTick
ASSERT(arguments.length === 2, 'need 2 args');
ASSERT(typeof lexerFlags === 'number', 'lexerFlags number');
// https://tc39.github.io/ecma262/#prod-CodePoint
// "A conforming implementation must not use the extended definition of EscapeSequence described in B.1.2 when parsing a TemplateCharacter."
// Since ES9 a _tagged_ tick literal can contain illegal escapes. Regular template strings must still conform.
// The $G_TICK_BAD_ESCAPE type bit is set for template tokens that have such a bad escape (`isBadTickToken(type)`)
// - `...` // "pure", no expression components
// - `...${expr}...` // tick_head and tick_tail, no body
// - `...${expr}...${expr}...` // tick_head, tick_body (the middle part), and tick_tail
lastOffset = pointer;
let badEscapes = false;
while (neof()) {
// while we will want to consume at least one more byte for proper strings,
// there could be a malformed string and we wouldnt want to consume the newline
let c = peek();
// do ${ first, that way we can just use the peeked char in case it's a dud, without revisiting
while (c === $$$_24) {
ASS