UNPKG

chevrotain

Version:

Chevrotain is a high-performance, fault-tolerant JavaScript parsing DSL for building recursive descent parsers

chevrotain.io/docs/

Chevrotain/chevrotain

868 lines • 36.3 kB

JavaScript

import { BaseRegExpVisitor } from "@chevrotain/regexp-to-ast";
import { Lexer, LexerDefinitionErrorType } from "./lexer_public.js";
import { PRINT_ERROR } from "@chevrotain/utils";
import {
  canMatchCharCode,
  failedOptimizationPrefixMsg,
  getOptimizedStartCodesIndices,
} from "./reg_exp.js";
import { getRegExpAst } from "./reg_exp_parser.js";

const PATTERN = "PATTERN";
export const DEFAULT_MODE = "defaultMode";
export const MODES = "modes";

/**
 * Characters which, when preceded by a backslash in a two character regExp
 * source, do NOT denote the literal character itself: character classes,
 * control escapes, assertions, and the digits (`\0` is NUL, `\1`-`\7` are
 * legacy octal escapes when no capturing group exists). Digits `8` and `9`
 * are listed too; sending them down the generic RegExp path is always correct.
 */
const NON_LITERAL_ESCAPE_CHARS = [
  "d",
  "D",
  "s",
  "S",
  "t",
  "r",
  "n",
  "0",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "c",
  "b",
  "B",
  "f",
  "v",
  "w",
  "W",
];

/** RegExp flags that change WHAT a pattern matches and so must survive a rebuild. */
const MATCH_SEMANTICS_FLAGS = "isuv";

/**
 * Null-safe check for an own `exec` property (the ICustomPattern shape).
 * `Object.hasOwn` throws a TypeError on `null`/`undefined`, so guard first.
 */
function hasOwnExec(pattern) {
  return (
    pattern !== null && pattern !== undefined && Object.hasOwn(pattern, "exec")
  );
}

/** True when the character has distinct lower/upper case forms. */
function hasCaseVariants(char) {
  return char.toLowerCase() !== char.toUpperCase();
}

/**
 * Extracts the flags of `pattern` that affect matching semantics.
 * `g`, `m`, `y` and `d` are deliberately left out; callers append what they need.
 */
function matchSemanticsFlags(pattern) {
  return [...pattern.flags]
    .filter((flag) => MATCH_SEMANTICS_FLAGS.includes(flag))
    .join("");
}

/**
 * Pre-computes, for a list of token types, the per-pattern configuration
 * objects and (unless `options.safeMode` is set) the first-char lookup table.
 *
 * @param tokenTypes token types; those whose PATTERN is `Lexer.NA` are ignored.
 * @param options optional overrides for: `safeMode`, `positionTracking`,
 *   `lineTerminatorCharacters`, `tracer`; `ensureOptimizations` is also read.
 * @returns `{ emptyGroups, patternIdxToConfig, charCodeToPatternIdxToConfig, hasCustom, canBeOptimized }`
 */
export function analyzeTokenTypes(tokenTypes, options) {
  options = Object.assign(
    {
      safeMode: false,
      positionTracking: "full",
      lineTerminatorCharacters: ["\r", "\n"],
      tracer: (msg, action) => action(),
    },
    options,
  );

  const tracer = options.tracer;

  tracer("initCharCodeToOptimizedIndexMap", () => {
    initCharCodeToOptimizedIndexMap();
  });

  let onlyRelevantTypes;
  tracer("Reject Lexer.NA", () => {
    onlyRelevantTypes = tokenTypes.filter((currType) => {
      return currType[PATTERN] !== Lexer.NA;
    });
  });

  let hasCustom = false;
  let allTransformedPatterns;
  tracer("Transform Patterns", () => {
    hasCustom = false;
    allTransformedPatterns = onlyRelevantTypes.map((currType) => {
      const currPattern = currType[PATTERN];

      /* istanbul ignore else */
      if (currPattern instanceof RegExp) {
        const regExpSource = currPattern.source;
        if (
          regExpSource.length === 1 &&
          // only these regExp meta characters which can appear in a length one regExp
          regExpSource !== "^" &&
          regExpSource !== "$" &&
          regExpSource !== "." &&
          !currPattern.ignoreCase
        ) {
          return regExpSource;
        } else if (
          regExpSource.length === 2 &&
          regExpSource[0] === "\\" &&
          // not a meta character
          !NON_LITERAL_ESCAPE_CHARS.includes(regExpSource[1]) &&
          // a plain string cannot express case insensitive matching of a letter
          !(currPattern.ignoreCase && hasCaseVariants(regExpSource[1]))
        ) {
          // escaped meta Characters: /\+/ /\[/
          // or redundant escaping: /\a/
          // without the escaping "\"
          return regExpSource[1];
        } else {
          return addStickyFlag(currPattern);
        }
      } else if (typeof currPattern === "function") {
        hasCustom = true;
        // CustomPatternMatcherFunc - custom patterns do not require any transformations, only wrapping in a RegExp Like object
        return { exec: currPattern };
      } else if (typeof currPattern === "object") {
        hasCustom = true;
        // ICustomPattern
        return currPattern;
      } else if (typeof currPattern === "string") {
        if (currPattern.length === 1) {
          return currPattern;
        } else {
          const escapedRegExpString = currPattern.replace(
            /[\\^$.*+?()[\]{}|]/g,
            "\\$&",
          );
          const wrappedRegExp = new RegExp(escapedRegExpString);
          return addStickyFlag(wrappedRegExp);
        }
      } else {
        throw new Error("non exhaustive match");
      }
    });
  });

  let patternIdxToType;
  let patternIdxToGroup;
  let patternIdxToLongerAltIdxArr;
  let patternIdxToPushMode;
  let patternIdxToPopMode;
  tracer("misc mapping", () => {
    patternIdxToType = onlyRelevantTypes.map(
      (currType) => currType.tokenTypeIdx,
    );

    patternIdxToGroup = onlyRelevantTypes.map((clazz) => {
      const groupName = clazz.GROUP;
      /* istanbul ignore next */
      if (groupName === Lexer.SKIPPED) {
        return undefined;
      } else if (typeof groupName === "string") {
        return groupName;
      } else if (groupName === undefined) {
        return false;
      } else {
        throw new Error("non exhaustive match");
      }
    });

    // entries stay `undefined` for token types without a LONGER_ALT
    patternIdxToLongerAltIdxArr = onlyRelevantTypes.map((clazz) => {
      const longerAltType = clazz.LONGER_ALT;

      if (longerAltType) {
        const longerAltIdxArr = Array.isArray(longerAltType)
          ? longerAltType.map((type) => onlyRelevantTypes.indexOf(type))
          : [onlyRelevantTypes.indexOf(longerAltType)];
        return longerAltIdxArr;
      }
    });

    patternIdxToPushMode = onlyRelevantTypes.map((clazz) => clazz.PUSH_MODE);

    patternIdxToPopMode = onlyRelevantTypes.map((clazz) =>
      Object.hasOwn(clazz, "POP_MODE"),
    );
  });

  let patternIdxToCanLineTerminator;
  tracer("Line Terminator Handling", () => {
    const lineTerminatorCharCodes = getCharCodes(
      options.lineTerminatorCharacters,
    );
    patternIdxToCanLineTerminator = onlyRelevantTypes.map((tokType) => false);
    if (options.positionTracking !== "onlyOffset") {
      patternIdxToCanLineTerminator = onlyRelevantTypes.map((tokType) => {
        if (Object.hasOwn(tokType, "LINE_BREAKS")) {
          return !!tokType.LINE_BREAKS;
        } else {
          return (
            checkLineBreaksIssues(tokType, lineTerminatorCharCodes) === false &&
            canMatchCharCode(lineTerminatorCharCodes, tokType.PATTERN)
          );
        }
      });
    }
  });

  let patternIdxToIsCustom;
  let patternIdxToShort;
  let emptyGroups;
  let patternIdxToConfig;
  tracer("Misc Mapping #2", () => {
    patternIdxToIsCustom = onlyRelevantTypes.map(isCustomPattern);
    patternIdxToShort = allTransformedPatterns.map(isShortPattern);

    emptyGroups = onlyRelevantTypes.reduce((acc, clazz) => {
      const groupName = clazz.GROUP;
      if (typeof groupName === "string" && !(groupName === Lexer.SKIPPED)) {
        acc[groupName] = [];
      }
      return acc;
    }, {});

    patternIdxToConfig = allTransformedPatterns.map((x, idx) => {
      return {
        pattern: allTransformedPatterns[idx],
        longerAlt: patternIdxToLongerAltIdxArr[idx],
        canLineTerminator: patternIdxToCanLineTerminator[idx],
        isCustom: patternIdxToIsCustom[idx],
        short: patternIdxToShort[idx],
        group: patternIdxToGroup[idx],
        push: patternIdxToPushMode[idx],
        pop: patternIdxToPopMode[idx],
        tokenTypeIdx: patternIdxToType[idx],
        tokenType: onlyRelevantTypes[idx],
      };
    });
  });

  let canBeOptimized = true;
  let charCodeToPatternIdxToConfig = [];

  if (!options.safeMode) {
    tracer("First Char Optimization", () => {
      charCodeToPatternIdxToConfig = onlyRelevantTypes.reduce(
        (result, currTokType, idx) => {
          if (typeof currTokType.PATTERN === "string") {
            const charCode = currTokType.PATTERN.charCodeAt(0);
            const optimizedIdx = charCodeToOptimizedIndex(charCode);
            addToMapOfArrays(result, optimizedIdx, patternIdxToConfig[idx]);
          } else if (Array.isArray(currTokType.START_CHARS_HINT)) {
            let lastOptimizedIdx;
            currTokType.START_CHARS_HINT.forEach((charOrInt) => {
              const charCode =
                typeof charOrInt === "string"
                  ? charOrInt.charCodeAt(0)
                  : charOrInt;
              const currOptimizedIdx = charCodeToOptimizedIndex(charCode);
              // Avoid adding the config multiple times
              /* istanbul ignore else */
              // - Difficult to check this scenario effects as it is only a performance
              //   optimization that does not change correctness
              if (lastOptimizedIdx !== currOptimizedIdx) {
                lastOptimizedIdx = currOptimizedIdx;
                addToMapOfArrays(
                  result,
                  currOptimizedIdx,
                  patternIdxToConfig[idx],
                );
              }
            });
          } else if (currTokType.PATTERN instanceof RegExp) {
            if (currTokType.PATTERN.unicode) {
              canBeOptimized = false;
              if (options.ensureOptimizations) {
                PRINT_ERROR(
                  `${failedOptimizationPrefixMsg}` +
                    `\tUnable to analyze < ${currTokType.PATTERN.toString()} > pattern.\n` +
                    "\tThe regexp unicode flag is not currently supported by the regexp-to-ast library.\n" +
                    "\tThis will disable the lexer's first char optimizations.\n" +
                    "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNICODE_OPTIMIZE",
                );
              }
            } else {
              const optimizedCodes = getOptimizedStartCodesIndices(
                currTokType.PATTERN,
                options.ensureOptimizations,
              );
              /* istanbul ignore if */
              // start code will only be empty given an empty regExp or failure of regexp-to-ast library
              // the first should be a different validation and the second cannot be tested.
              if (optimizedCodes.length === 0) {
                // we cannot understand what codes may start possible matches
                // The optimization correctness requires knowing start codes for ALL patterns.
                // Not actually sure this is an error, no debug message
                canBeOptimized = false;
              }
              optimizedCodes.forEach((code) => {
                addToMapOfArrays(result, code, patternIdxToConfig[idx]);
              });
            }
          } else {
            if (options.ensureOptimizations) {
              PRINT_ERROR(
                `${failedOptimizationPrefixMsg}` +
                  `\tTokenType: <${currTokType.name}> is using a custom token pattern without providing <start_chars_hint> parameter.\n` +
                  "\tThis will disable the lexer's first char optimizations.\n" +
                  "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_OPTIMIZE",
              );
            }
            canBeOptimized = false;
          }

          return result;
        },
        [],
      );
    });
  }

  return {
    emptyGroups: emptyGroups,
    patternIdxToConfig: patternIdxToConfig,
    charCodeToPatternIdxToConfig: charCodeToPatternIdxToConfig,
    hasCustom: hasCustom,
    canBeOptimized: canBeOptimized,
  };
}

/**
 * Runs all pattern related validations and returns the collected error
 * descriptors. Token types with a missing or invalid PATTERN are reported
 * once and excluded from the later checks.
 *
 * @param tokenTypes token types to validate.
 * @param validModesNames lexer mode names that PUSH_MODE may refer to.
 * @returns array of `{ message, type, tokenTypes }` descriptors.
 */
export function validatePatterns(tokenTypes, validModesNames) {
  let errors = [];

  const missingResult = findMissingPatterns(tokenTypes);
  errors = errors.concat(missingResult.errors);

  const invalidResult = findInvalidPatterns(missingResult.valid);
  const validTokenTypes = invalidResult.valid;
  errors = errors.concat(invalidResult.errors);

  errors = errors.concat(validateRegExpPattern(validTokenTypes));
  errors = errors.concat(findInvalidGroupType(validTokenTypes));
  errors = errors.concat(
    findModesThatDoNotExist(validTokenTypes, validModesNames),
  );
  errors = errors.concat(findUnreachablePatterns(validTokenTypes));

  return errors;
}

/** Runs the validations that only apply to token types whose PATTERN is a RegExp. */
function validateRegExpPattern(tokenTypes) {
  let errors = [];
  const withRegExpPatterns = tokenTypes.filter(
    (currTokType) => currTokType[PATTERN] instanceof RegExp,
  );

  errors = errors.concat(findEndOfInputAnchor(withRegExpPatterns));
  errors = errors.concat(findStartOfInputAnchor(withRegExpPatterns));
  errors = errors.concat(findUnsupportedFlags(withRegExpPatterns));
  errors = errors.concat(findDuplicatePatterns(withRegExpPatterns));
  errors = errors.concat(findEmptyMatchRegExps(withRegExpPatterns));

  return errors;
}

/**
 * Splits token types by whether they own a `PATTERN` property.
 * @returns `{ errors, valid }` - one MISSING_PATTERN error per offender.
 */
export function findMissingPatterns(tokenTypes) {
  const tokenTypesWithMissingPattern = tokenTypes.filter((currType) => {
    return !Object.hasOwn(currType, PATTERN);
  });

  const errors = tokenTypesWithMissingPattern.map((currType) => {
    return {
      message:
        "Token Type: ->" +
        currType.name +
        "<- missing static 'PATTERN' property",
      type: LexerDefinitionErrorType.MISSING_PATTERN,
      tokenTypes: [currType],
    };
  });

  const valid = tokenTypes.filter(
    (x) => !tokenTypesWithMissingPattern.includes(x),
  );
  return { errors, valid };
}

/**
 * Splits token types by whether PATTERN is one of the supported kinds:
 * RegExp, function, object owning `exec`, or string.
 * A `null`/`undefined` PATTERN is reported as invalid rather than throwing.
 * @returns `{ errors, valid }` - one INVALID_PATTERN error per offender.
 */
export function findInvalidPatterns(tokenTypes) {
  const tokenTypesWithInvalidPattern = tokenTypes.filter((currType) => {
    const pattern = currType[PATTERN];
    return (
      !(pattern instanceof RegExp) &&
      !(typeof pattern === "function") &&
      !hasOwnExec(pattern) &&
      !(typeof pattern === "string")
    );
  });

  const errors = tokenTypesWithInvalidPattern.map((currType) => {
    return {
      message:
        "Token Type: ->" +
        currType.name +
        "<- static 'PATTERN' can only be a RegExp, a" +
        " Function matching the {CustomPatternMatcherFunc} type or an Object matching the {ICustomPattern} interface.",
      type: LexerDefinitionErrorType.INVALID_PATTERN,
      tokenTypes: [currType],
    };
  });

  const valid = tokenTypes.filter(
    (x) => !tokenTypesWithInvalidPattern.includes(x),
  );
  return { errors, valid };
}

// fallback heuristic: an unescaped '$' preceded by any character
const end_of_input = /[^\\][$]/;

/**
 * Reports RegExp patterns containing an end of input anchor (`$`).
 * Uses the regexp AST; falls back to a textual heuristic if AST parsing throws.
 */
export function findEndOfInputAnchor(tokenTypes) {
  class EndAnchorFinder extends BaseRegExpVisitor {
    constructor(...args) {
      super(...args);
      this.found = false;
    }

    visitEndAnchor(node) {
      this.found = true;
    }
  }

  const invalidRegex = tokenTypes.filter((currType) => {
    const pattern = currType.PATTERN;

    try {
      const regexpAst = getRegExpAst(pattern);
      const endAnchorVisitor = new EndAnchorFinder();
      endAnchorVisitor.visit(regexpAst);

      return endAnchorVisitor.found;
    } catch (e) {
      // old behavior in case of runtime exceptions with regexp-to-ast.
      /* istanbul ignore next - cannot ensure an error in regexp-to-ast*/
      return end_of_input.test(pattern.source);
    }
  });

  const errors = invalidRegex.map((currType) => {
    return {
      message:
        "Unexpected RegExp Anchor Error:\n" +
        "\tToken Type: ->" +
        currType.name +
        "<- static 'PATTERN' cannot contain end of input anchor '$'\n" +
        "\tSee chevrotain.io/docs/guide/resolving_lexer_errors.html#ANCHORS" +
        "\tfor details.",
      type: LexerDefinitionErrorType.EOI_ANCHOR_FOUND,
      tokenTypes: [currType],
    };
  });

  return errors;
}

/** Reports RegExp patterns that successfully match the empty string. */
export function findEmptyMatchRegExps(tokenTypes) {
  const matchesEmptyString = tokenTypes.filter((currType) => {
    const pattern = currType.PATTERN;
    return pattern.test("");
  });

  const errors = matchesEmptyString.map((currType) => {
    return {
      message:
        "Token Type: ->" +
        currType.name +
        "<- static 'PATTERN' must not match an empty string",
      type: LexerDefinitionErrorType.EMPTY_MATCH_PATTERN,
      tokenTypes: [currType],
    };
  });

  return errors;
}

// fallback heuristic: '^' at the very start, or a '^' not preceded by '\' or '['
const start_of_input = /[^\\[][\^]|^\^/;

/**
 * Reports RegExp patterns containing a start of input anchor (`^`).
 * Uses the regexp AST; falls back to a textual heuristic if AST parsing throws.
 */
export function findStartOfInputAnchor(tokenTypes) {
  class StartAnchorFinder extends BaseRegExpVisitor {
    constructor(...args) {
      super(...args);
      this.found = false;
    }

    visitStartAnchor(node) {
      this.found = true;
    }
  }

  const invalidRegex = tokenTypes.filter((currType) => {
    const pattern = currType.PATTERN;
    try {
      const regexpAst = getRegExpAst(pattern);
      const startAnchorVisitor = new StartAnchorFinder();
      startAnchorVisitor.visit(regexpAst);

      return startAnchorVisitor.found;
    } catch (e) {
      // old behavior in case of runtime exceptions with regexp-to-ast.
      /* istanbul ignore next - cannot ensure an error in regexp-to-ast*/
      return start_of_input.test(pattern.source);
    }
  });

  const errors = invalidRegex.map((currType) => {
    return {
      message:
        "Unexpected RegExp Anchor Error:\n" +
        "\tToken Type: ->" +
        currType.name +
        "<- static 'PATTERN' cannot contain start of input anchor '^'\n" +
        "\tSee https://chevrotain.io/docs/guide/resolving_lexer_errors.html#ANCHORS" +
        "\tfor details.",
      type: LexerDefinitionErrorType.SOI_ANCHOR_FOUND,
      tokenTypes: [currType],
    };
  });

  return errors;
}

/** Reports RegExp patterns using the global ('g') or multiline ('m') flags. */
export function findUnsupportedFlags(tokenTypes) {
  const invalidFlags = tokenTypes.filter((currType) => {
    const pattern = currType[PATTERN];
    return pattern instanceof RegExp && (pattern.multiline || pattern.global);
  });

  const errors = invalidFlags.map((currType) => {
    return {
      message:
        "Token Type: ->" +
        currType.name +
        "<- static 'PATTERN' may NOT contain global('g') or multiline('m')",
      type: LexerDefinitionErrorType.UNSUPPORTED_FLAGS_FOUND,
      tokenTypes: [currType],
    };
  });

  return errors;
}

/**
 * Reports groups of token types sharing an identical RegExp source.
 * This can only test for identical duplicate RegExps, not semantically equivalent ones.
 * Note that only `.source` is compared, flags are not taken into account.
 */
export function findDuplicatePatterns(tokenTypes) {
  // Equivalence classes keyed by pattern source, in order of first appearance.
  // A Set per class ensures a token type listed twice is only counted once.
  const typesBySource = new Map();
  tokenTypes.forEach((currType) => {
    if (currType.PATTERN === Lexer.NA) {
      return;
    }
    const source = currType.PATTERN.source;
    if (!typesBySource.has(source)) {
      typesBySource.set(source, new Set());
    }
    typesBySource.get(source).add(currType);
  });

  const duplicatePatterns = [...typesBySource.values()]
    .map((identicalSet) => [...identicalSet])
    .filter((currIdenticalSet) => currIdenticalSet.length > 1);

  const errors = duplicatePatterns.map((setOfIdentical) => {
    const tokenTypeNames = setOfIdentical.map((currType) => {
      return currType.name;
    });

    const dupPatternSrc = setOfIdentical[0].PATTERN;
    return {
      message:
        `The same RegExp pattern ->${dupPatternSrc}<-` +
        `has been used in all of the following Token Types: ${tokenTypeNames.join(", ")} <-`,
      type: LexerDefinitionErrorType.DUPLICATE_PATTERNS_FOUND,
      tokenTypes: setOfIdentical,
    };
  });

  return errors;
}

/** Reports token types whose own GROUP is not Lexer.SKIPPED, Lexer.NA or a string. */
export function findInvalidGroupType(tokenTypes) {
  const invalidTypes = tokenTypes.filter((clazz) => {
    if (!Object.hasOwn(clazz, "GROUP")) {
      return false;
    }
    const group = clazz.GROUP;

    return (
      group !== Lexer.SKIPPED &&
      group !== Lexer.NA &&
      !(typeof group === "string")
    );
  });

  const errors = invalidTypes.map((currType) => {
    return {
      message:
        "Token Type: ->" +
        currType.name +
        "<- static 'GROUP' can only be Lexer.SKIPPED/Lexer.NA/A String",
      type: LexerDefinitionErrorType.INVALID_GROUP_TYPE_FOUND,
      tokenTypes: [currType],
    };
  });

  return errors;
}

/** Reports token types whose PUSH_MODE is defined but not included in `validModes`. */
export function findModesThatDoNotExist(tokenTypes, validModes) {
  const invalidModes = tokenTypes.filter((clazz) => {
    return (
      clazz.PUSH_MODE !== undefined && !validModes.includes(clazz.PUSH_MODE)
    );
  });

  const errors = invalidModes.map((tokType) => {
    const msg =
      `Token Type: ->${tokType.name}<- static 'PUSH_MODE' value cannot refer to a Lexer Mode ->${tokType.PUSH_MODE}<-` +
      `which does not exist`;
    return {
      message: msg,
      type: LexerDefinitionErrorType.PUSH_MODE_DOES_NOT_EXIST,
      tokenTypes: [tokType],
    };
  });

  return errors;
}

/**
 * Reports token types that can never match because an EARLIER token type in
 * the list already matches their whole (literal) text. Only string patterns
 * and RegExps without meta characters are tested as candidates.
 */
export function findUnreachablePatterns(tokenTypes) {
  const errors = [];

  const canBeTested = tokenTypes.reduce((result, tokType, idx) => {
    const pattern = tokType.PATTERN;

    if (pattern === Lexer.NA) {
      return result;
    }

    // a more comprehensive validation for all forms of regExps would require
    // deeper regExp analysis capabilities
    if (typeof pattern === "string") {
      result.push({ str: pattern, idx, tokenType: tokType });
    } else if (pattern instanceof RegExp && noMetaChar(pattern)) {
      result.push({ str: pattern.source, idx, tokenType: tokType });
    }
    return result;
  }, []);

  tokenTypes.forEach((aTokType, aIdx) => {
    canBeTested.forEach(({ str: bStr, idx: bIdx, tokenType: bTokType }) => {
      if (aIdx < bIdx && tryToMatchStrToPattern(bStr, aTokType.PATTERN)) {
        const msg =
          `Token: ->${bTokType.name}<- can never be matched.\n` +
          `Because it appears AFTER the Token Type ->${aTokType.name}<-` +
          `in the lexer's definition.\n` +
          `See https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNREACHABLE`;
        errors.push({
          message: msg,
          type: LexerDefinitionErrorType.UNREACHABLE_PATTERN,
          tokenTypes: [aTokType, bTokType],
        });
      }
    });
  });

  return errors;
}

/**
 * Tests whether `pattern` (of any supported kind) matches `str` from offset 0.
 * @throws Error("non exhaustive match") for an unsupported pattern kind.
 */
function tryToMatchStrToPattern(str, pattern) {
  if (pattern instanceof RegExp) {
    if (usesLookAheadOrBehind(pattern)) {
      // if lookahead or lookbehind assertions are used
      // we assume they would be responsible for disambiguating the match
      // The alternative is to risk false positive unreachable pattern errors.
      // e.g.: /(?<!a)b/ and /b/ tokens would cause such false positives.
      return false;
    }
    const regExpArray = pattern.exec(str);
    return regExpArray !== null && regExpArray.index === 0;
  } else if (typeof pattern === "function") {
    // maintain the API of custom patterns
    return pattern(str, 0, [], {});
  } else if (hasOwnExec(pattern)) {
    // maintain the API of custom patterns
    return pattern.exec(str, 0, [], {});
  } else if (typeof pattern === "string") {
    return pattern === str;
  } else {
    throw new Error("non exhaustive match");
  }
}

/** True when the regExp source contains none of the listed meta characters. */
function noMetaChar(regExp) {
  //https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
  const metaChars = [
    ".",
    "\\",
    "[",
    "]",
    "|",
    "^",
    "$",
    "(",
    ")",
    "?",
    "*",
    "+",
    "{",
  ];
  return !metaChars.some((char) => regExp.source.includes(char));
}

/** True when the regExp source contains a lookahead or lookbehind group opener. */
function usesLookAheadOrBehind(regExp) {
  return /(\(\?=)|(\(\?!)|(\(\?<=)|(\(\?<!)/.test(regExp.source);
}

/**
 * Returns a new RegExp that can only match at the start of the input.
 * Flags affecting match semantics (i, s, u, v) are carried over.
 */
export function addStartOfInput(pattern) {
  const flags = matchSemanticsFlags(pattern);
  // always wrapping in a none capturing group preceded by '^' to make sure matching can only work on start of input.
  // duplicate/redundant start of input markers have no meaning (/^^^^A/ === /^A/)
  return new RegExp(`^(?:${pattern.source})`, flags);
}

/**
 * Returns a new RegExp with the same source and the sticky ('y') flag set.
 * Flags affecting match semantics (i, s, u, v) are carried over; dropping
 * them would silently change what the user's pattern matches.
 */
export function addStickyFlag(pattern) {
  const flags = `${matchSemanticsFlags(pattern)}y`;
  return new RegExp(`${pattern.source}`, flags);
}

/**
 * Structural sanity checks of a multi mode lexer definition.
 *
 * @param lexerDefinition object expected to own `defaultMode` and `modes`.
 * @param trackLines unused here; kept for signature compatibility.
 * @param lineTerminatorCharacters unused here; kept for signature compatibility.
 * @returns array of `{ message, type }` error descriptors.
 */
export function performRuntimeChecks(
  lexerDefinition,
  trackLines,
  lineTerminatorCharacters,
) {
  const errors = [];

  // some run time checks to help the end users.
  if (!Object.hasOwn(lexerDefinition, DEFAULT_MODE)) {
    errors.push({
      message:
        "A MultiMode Lexer cannot be initialized without a <" +
        DEFAULT_MODE +
        "> property in its definition\n",
      type: LexerDefinitionErrorType.MULTI_MODE_LEXER_WITHOUT_DEFAULT_MODE,
    });
  }
  if (!Object.hasOwn(lexerDefinition, MODES)) {
    errors.push({
      message:
        "A MultiMode Lexer cannot be initialized without a <" +
        MODES +
        "> property in its definition\n",
      type: LexerDefinitionErrorType.MULTI_MODE_LEXER_WITHOUT_MODES_PROPERTY,
    });
  }

  if (
    Object.hasOwn(lexerDefinition, MODES) &&
    Object.hasOwn(lexerDefinition, DEFAULT_MODE) &&
    !Object.hasOwn(lexerDefinition.modes, lexerDefinition.defaultMode)
  ) {
    errors.push({
      message:
        `A MultiMode Lexer cannot be initialized with a ${DEFAULT_MODE}: <${lexerDefinition.defaultMode}>` +
        `which does not exist\n`,
      type: LexerDefinitionErrorType.MULTI_MODE_LEXER_DEFAULT_MODE_VALUE_DOES_NOT_EXIST,
    });
  }

  if (Object.hasOwn(lexerDefinition, MODES)) {
    Object.keys(lexerDefinition.modes).forEach((currModeName) => {
      const currModeValue = lexerDefinition.modes[currModeName];
      currModeValue.forEach((currTokType, currIdx) => {
        if (currTokType === undefined) {
          errors.push({
            message:
              `A Lexer cannot be initialized using an undefined Token Type. Mode:` +
              `<${currModeName}> at index: <${currIdx}>\n`,
            type: LexerDefinitionErrorType.LEXER_DEFINITION_CANNOT_CONTAIN_UNDEFINED,
          });
        } else if (Object.hasOwn(currTokType, "LONGER_ALT")) {
          const longerAlt = Array.isArray(currTokType.LONGER_ALT)
            ? currTokType.LONGER_ALT
            : [currTokType.LONGER_ALT];
          longerAlt.forEach((currLongerAlt) => {
            if (
              currLongerAlt !== undefined &&
              !currModeValue.includes(currLongerAlt)
            ) {
              errors.push({
                message: `A MultiMode Lexer cannot be initialized with a longer_alt <${currLongerAlt.name}> on token <${currTokType.name}> outside of mode <${currModeName}>\n`,
                type: LexerDefinitionErrorType.MULTI_MODE_LEXER_LONGER_ALT_NOT_IN_CURRENT_MODE,
              });
            }
          });
        }
      });
    });
  }

  return errors;
}

/**
 * Collects non fatal warnings about line terminator detection.
 * Nothing is reported unless `trackLines` is truthy.
 *
 * @param lexerDefinition object whose `modes` values are arrays of token types.
 * @param trackLines whether the lexer tracks line/column information.
 * @param lineTerminatorCharacters characters (or char codes) treated as line terminators.
 * @returns array of warning descriptors.
 */
export function performWarningRuntimeChecks(
  lexerDefinition,
  trackLines,
  lineTerminatorCharacters,
) {
  const warnings = [];
  let hasAnyLineBreak = false;
  const allTokenTypes = Object.values(lexerDefinition.modes || {})
    .flat()
    .filter(Boolean);

  const concreteTokenTypes = allTokenTypes.filter(
    (currType) => currType[PATTERN] !== Lexer.NA,
  );
  const terminatorCharCodes = getCharCodes(lineTerminatorCharacters);
  if (trackLines) {
    concreteTokenTypes.forEach((tokType) => {
      const currIssue = checkLineBreaksIssues(tokType, terminatorCharCodes);
      if (currIssue !== false) {
        const message = buildLineBreakIssueMessage(tokType, currIssue);
        const warningDescriptor = {
          message,
          type: currIssue.issue,
          tokenType: tokType,
        };
        warnings.push(warningDescriptor);
      } else {
        // we don't want to attempt to scan if the user explicitly specified the line_breaks option.
        if (Object.hasOwn(tokType, "LINE_BREAKS")) {
          if (tokType.LINE_BREAKS === true) {
            hasAnyLineBreak = true;
          }
        } else {
          if (canMatchCharCode(terminatorCharCodes, tokType.PATTERN)) {
            hasAnyLineBreak = true;
          }
        }
      }
    });
  }

  if (trackLines && !hasAnyLineBreak) {
    warnings.push({
      message:
        "Warning: No LINE_BREAKS Found.\n" +
        "\tThis Lexer has been defined to track line and column information,\n" +
        "\tBut none of the Token Types can be identified as matching a line terminator.\n" +
        "\tSee https://chevrotain.io/docs/guide/resolving_lexer_errors.html#LINE_BREAKS \n" +
        "\tfor details.",
      type: LexerDefinitionErrorType.NO_LINE_BREAKS_FLAGS,
    });
  }
  return warnings;
}

/**
 * Creates a fresh object with the same keys as `emptyGroups`, each mapped to a new empty array.
 * @throws Error("non exhaustive match") if any value is not an array.
 */
export function cloneEmptyGroups(emptyGroups) {
  const clonedResult = {};
  const groupKeys = Object.keys(emptyGroups);

  groupKeys.forEach((currKey) => {
    const currGroupValue = emptyGroups[currKey];

    /* istanbul ignore else */
    if (Array.isArray(currGroupValue)) {
      clonedResult[currKey] = [];
    } else {
      throw new Error("non exhaustive match");
    }
  });

  return clonedResult;
}

// TODO: refactor to avoid duplication
/**
 * True for function patterns and objects owning `exec`; false for RegExp and string patterns.
 * @throws Error("non exhaustive match") for any other PATTERN value (including null/undefined).
 */
export function isCustomPattern(tokenType) {
  const pattern = tokenType.PATTERN;
  /* istanbul ignore else */
  if (pattern instanceof RegExp) {
    return false;
  } else if (typeof pattern === "function") {
    // CustomPatternMatcherFunc - custom patterns do not require any transformations, only wrapping in a RegExp Like object
    return true;
  } else if (hasOwnExec(pattern)) {
    // ICustomPattern
    return true;
  } else if (typeof pattern === "string") {
    return false;
  } else {
    throw new Error("non exhaustive match");
  }
}

/** Returns the char code of a single character string pattern, otherwise `false`. */
export function isShortPattern(pattern) {
  if (typeof pattern === "string" && pattern.length === 1) {
    return pattern.charCodeAt(0);
  } else {
    return false;
  }
}

/**
 * Faster than using a RegExp for default newline detection during lexing.
 * Stateful like a global RegExp: scanning starts at `lastIndex`, which is
 * advanced past each terminator found (callers must reset it as needed).
 */
export const LineTerminatorOptimizedTester = {
  // implements /\n|\r\n?/g.test
  test: function (text) {
    const len = text.length;
    for (let i = this.lastIndex; i < len; i++) {
      const c = text.charCodeAt(i);
      if (c === 10) {
        this.lastIndex = i + 1;
        return true;
      } else if (c === 13) {
        if (text.charCodeAt(i + 1) === 10) {
          this.lastIndex = i + 2;
        } else {
          this.lastIndex = i + 1;
        }
        return true;
      }
    }
    return false;
  },

  lastIndex: 0,
};

/**
 * Checks whether line terminator usage of a token type can be determined.
 * @returns `false` when there is no issue, otherwise `{ issue, errMsg? }`.
 */
function checkLineBreaksIssues(tokType, lineTerminatorCharCodes) {
  if (Object.hasOwn(tokType, "LINE_BREAKS")) {
    // if the user explicitly declared the line_breaks option we will respect their choice
    // and assume it is correct.
    return false;
  } else {
    /* istanbul ignore else */
    if (tokType.PATTERN instanceof RegExp) {
      try {
        // TODO: why is the casting suddenly needed?
        canMatchCharCode(lineTerminatorCharCodes, tokType.PATTERN);
      } catch (e) {
        /* istanbul ignore next - to test this we would have to mock <canMatchCharCode> to throw an error */
        return {
          issue: LexerDefinitionErrorType.IDENTIFY_TERMINATOR,
          errMsg: e.message,
        };
      }
      return false;
    } else if (typeof tokType.PATTERN === "string") {
      // string literal patterns can always be analyzed to detect line terminator usage
      return false;
    } else if (isCustomPattern(tokType)) {
      // custom token types
      return { issue: LexerDefinitionErrorType.CUSTOM_LINE_BREAK };
    } else {
      throw new Error("non exhaustive match");
    }
  }
}

/**
 * Builds the warning text for an issue object produced by the line break analysis.
 * @throws Error("non exhaustive match") for an unknown `details.issue`.
 */
export function buildLineBreakIssueMessage(tokType, details) {
  /* istanbul ignore else */
  if (details.issue === LexerDefinitionErrorType.IDENTIFY_TERMINATOR) {
    return (
      "Warning: unable to identify line terminator usage in pattern.\n" +
      `\tThe problem is in the <${tokType.name}> Token Type\n` +
      `\t Root cause: ${details.errMsg}.\n` +
      "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#IDENTIFY_TERMINATOR"
    );
  } else if (details.issue === LexerDefinitionErrorType.CUSTOM_LINE_BREAK) {
    return (
      "Warning: A Custom Token Pattern should specify the <line_breaks> option.\n" +
      `\tThe problem is in the <${tokType.name}> Token Type\n` +
      "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_LINE_BREAK"
    );
  } else {
    throw new Error("non exhaustive match");
  }
}

/** Maps strings to the char code of their first character; other values pass through unchanged. */
function getCharCodes(charsOrCodes) {
  const charCodes = charsOrCodes.map((numOrString) => {
    if (typeof numOrString === "string") {
      return numOrString.charCodeAt(0);
    } else {
      return numOrString;
    }
  });

  return charCodes;
}

/** Appends `value` to the array stored at `map[key]`, creating the array on first use. */
function addToMapOfArrays(map, key, value) {
  if (map[key] === undefined) {
    map[key] = [value];
  } else {
    map[key].push(value);
  }
}

export const minOptimizationVal = 256;

/**
 * Char codes below 256 each get their own slot (index === charCode), as these
 * are the most common start chars and each deserves its own vector of
 * possible token configs.
 *
 * Char codes of 256 and above are uncommon start chars, so they are grouped
 * into shared buckets. This avoids an optimization table
 * ('charCodeToPatternIdxToConfig') holding 10,000+ small arrays
 * (e.g. the unicode Identifiers scenario).
 *
 * As implemented the bucket index is `255 + trunc(charCode / 255)`, so the
 * largest index is 255 + trunc(65535 / 255) === 512.
 *
 * note the hack for fast division integer part extraction
 * See: https://stackoverflow.com/a/4228528
 */
let charCodeToOptimizedIdxMap = [];

/**
 * Maps a char code to its index in the first-char optimization table.
 * Char codes >= 256 are looked up in a table that is only filled once
 * `analyzeTokenTypes` has run (until then the lookup yields `undefined`).
 */
export function charCodeToOptimizedIndex(charCode) {
  return charCode < minOptimizationVal
    ? charCode
    : charCodeToOptimizedIdxMap[charCode];
}

/**
 * This is a compromise between cold start / hot running performance
 * Creating this array takes ~3ms on a modern machine,
 * But if we perform the computation at runtime as needed the CSS Lexer benchmark
 * performance degrades by ~10%
 *
 * TODO: Perhaps it should be lazy initialized only if a charCode > 255 is used.
 */
function initCharCodeToOptimizedIndexMap() {
  if (charCodeToOptimizedIdxMap.length === 0) {
    charCodeToOptimizedIdxMap = new Array(65536);
    for (let i = 0; i < 65536; i++) {
      charCodeToOptimizedIdxMap[i] = i > 255 ? 255 + ~~(i / 255) : i;
    }
  }
}
//# sourceMappingURL=lexer.js.map