rawsql-ts

High-performance SQL parser and AST analyzer written in TypeScript. Provides fast parsing and advanced transformation capabilities.

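For orientation, here is a minimal usage sketch. It assumes SqlTokenizer is re-exported from the package root; the import path and the sample SQL are illustrative, not taken from this file.

import { SqlTokenizer } from "rawsql-ts"; // assumed root export

// Tokenize a statement; each resulting lexeme carries its value, type flags,
// any attached comments, and start/end line-column position metadata.
const tokenizer = new SqlTokenizer("SELECT id, name FROM users");
const lexemes = tokenizer.tokenize();
console.log(lexemes.map((lexeme) => lexeme.value));

The compiled source of SqlTokenizer.js follows.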
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.SqlTokenizer = void 0; const Lexeme_1 = require("../models/Lexeme"); const CommandTokenReader_1 = require("../tokenReaders/CommandTokenReader"); const EscapedIdentifierTokenReader_1 = require("../tokenReaders/EscapedIdentifierTokenReader"); const FunctionTokenReader_1 = require("../tokenReaders/FunctionTokenReader"); const IdentifierTokenReader_1 = require("../tokenReaders/IdentifierTokenReader"); const LiteralTokenReader_1 = require("../tokenReaders/LiteralTokenReader"); const OperatorTokenReader_1 = require("../tokenReaders/OperatorTokenReader"); const ParameterTokenReader_1 = require("../tokenReaders/ParameterTokenReader"); const SymbolTokenReader_1 = require("../tokenReaders/SymbolTokenReader"); const StringSpecifierTokenReader_1 = require("../tokenReaders/StringSpecifierTokenReader"); const TokenReaderManager_1 = require("../tokenReaders/TokenReaderManager"); const TypeTokenReader_1 = require("../tokenReaders/TypeTokenReader"); const stringUtils_1 = require("../utils/stringUtils"); /** * Class responsible for tokenizing SQL input. */ class SqlTokenizer { /** * Initializes a new instance of the SqlTokenizer. */ constructor(input) { /** * Cached start offsets for each line in the input string. */ this.lineStartPositions = null; this.input = input; this.position = 0; // Initialize the token reader manager and register all readers this.readerManager = new TokenReaderManager_1.TokenReaderManager(input) .register(new EscapedIdentifierTokenReader_1.EscapedIdentifierTokenReader(input)) .register(new ParameterTokenReader_1.ParameterTokenReader(input)) .register(new StringSpecifierTokenReader_1.StringSpecifierTokenReader(input)) // LiteralTokenReader should be registered before SpecialSymbolTokenReader and OperatorTokenReader // Reason: To prevent numeric literals starting with a dot or sign from being misrecognized as operators // e.g. `1.0` is a literal, not an operator .register(new LiteralTokenReader_1.LiteralTokenReader(input)) .register(new SymbolTokenReader_1.SpecialSymbolTokenReader(input)) .register(new CommandTokenReader_1.CommandTokenReader(input)) .register(new OperatorTokenReader_1.OperatorTokenReader(input)) // TypeTokenReader should be registered before FunctionTokenReader // Reason: To prevent types containing parentheses from being misrecognized as functions // e.g. `numeric(10, 2)` is a type, not a function .register(new TypeTokenReader_1.TypeTokenReader(input)) .register(new FunctionTokenReader_1.FunctionTokenReader(input)) .register(new IdentifierTokenReader_1.IdentifierTokenReader(input)) // IdentifierTokenReader should be registered last ; } /** * Checks if the end of input is reached. * * @param shift - The shift to consider beyond the current position. * @returns True if the end of input is reached; otherwise, false. */ isEndOfInput(shift = 0) { return this.position + shift >= this.input.length; } /** * Checks if more input can be read. * * @param shift - The shift to consider beyond the current position. * @returns True if more input can be read; otherwise, false. */ canRead(shift = 0) { return !this.isEndOfInput(shift); } tokenize(options) { if (options === null || options === void 0 ? void 0 : options.preserveFormatting) { return this.tokenizeWithFormatting(); } // Create a fresh tokenizer instance for clean state const freshTokenizer = new SqlTokenizer(this.input); return freshTokenizer.readLexemes(); } /** * @deprecated Use {@link readLexemes} (correct spelling) instead. 
     * This legacy alias remains for backwards compatibility and delegates to the new method.
     */
    readLexmes() {
        return this.readLexemes();
    }
    /**
     * Reads the lexemes from the input string.
     *
     * @returns An array of lexemes extracted from the input string.
     * @throws Error if an unexpected character is encountered.
     */
    readLexemes() {
        const segment = this.readNextStatement(0);
        return segment ? segment.lexemes : [];
    }
    /**
     * Tokenizes the input SQL without formatting preservation (internal method)
     */
    tokenizeBasic() {
        const segment = this.readNextStatement(0);
        return segment ? segment.lexemes : [];
    }
    readNextStatement(startPosition = 0, carryComments = null) {
        const length = this.input.length;
        // Abort when the cursor already moved past the input.
        if (startPosition >= length) {
            return null;
        }
        // Adopt a working cursor so the original tokenizer state is untouched.
        this.position = startPosition;
        const statementStart = startPosition;
        let pendingLeading = carryComments ? [...carryComments] : null;
        const tokenData = [];
        let previous = null;
        while (this.canRead()) {
            // Fold whitespace and comments into the token stream and advance to the next significant character.
            const prefixComment = this.readComment();
            this.position = prefixComment.position;
            if (!this.canRead()) {
                // No more characters, so keep any trailing comments for the next statement.
                pendingLeading = this.mergeComments(pendingLeading, prefixComment.lines);
                break;
            }
            if (this.input[this.position] === ';') {
                // Statement terminated before any token appeared.
                pendingLeading = this.mergeComments(pendingLeading, prefixComment.lines);
                break;
            }
            // Read the next lexeme at the current position.
            const lexeme = this.readerManager.tryRead(this.position, previous);
            if (lexeme === null) {
                throw new Error(`Unexpected character. actual: ${this.input[this.position]}, position: ${this.position}\n${this.getDebugPositionInfo(this.position)}`);
            }
            const tokenStartPos = this.position;
            const tokenEndPos = this.position = this.readerManager.getMaxPosition();
            // Capture trailing whitespace and comments after the token.
            const suffixComment = this.readComment();
            this.position = suffixComment.position;
            let prefixComments = this.mergeComments(pendingLeading, prefixComment.lines);
            pendingLeading = null;
            tokenData.push({
                lexeme,
                startPos: tokenStartPos,
                endPos: tokenEndPos,
                prefixComments,
                suffixComments: suffixComment.lines
            });
            previous = lexeme;
        }
        const statementEnd = this.position;
        const lexemes = this.buildLexemesFromTokenData(tokenData);
        const nextPosition = this.skipPastTerminator(statementEnd);
        return {
            lexemes,
            statementStart,
            statementEnd,
            nextPosition,
            rawText: this.input.slice(statementStart, statementEnd),
            leadingComments: pendingLeading
        };
    }
    buildLexemesFromTokenData(tokenData) {
        const lexemes = new Array(tokenData.length);
        for (let i = 0; i < tokenData.length; i++) {
            const current = tokenData[i];
            const lexeme = current.lexeme;
            // Redirect SELECT suffix comments to the first meaningful select item.
            if (lexeme.value.toLowerCase() === 'select' && current.suffixComments && current.suffixComments.length > 0) {
                const suffixComments = current.suffixComments;
                let targetIndex = i + 1;
                while (targetIndex < tokenData.length) {
                    const target = tokenData[targetIndex];
                    // Allow SELECT-prefix comments to bind to '*' tokens so they stay with the select list.
                    const isStarOperator = (target.lexeme.type & Lexeme_1.TokenType.Operator) && target.lexeme.value === '*';
                    if ((target.lexeme.type & Lexeme_1.TokenType.Identifier) ||
                        (target.lexeme.type & Lexeme_1.TokenType.Literal) ||
                        isStarOperator ||
                        (!(target.lexeme.type & Lexeme_1.TokenType.Command) &&
                            !(target.lexeme.type & Lexeme_1.TokenType.Comma) &&
                            !(target.lexeme.type & Lexeme_1.TokenType.Operator))) {
                        if (!target.prefixComments) {
                            target.prefixComments = [];
                        }
                        target.prefixComments.unshift(...suffixComments);
                        current.suffixComments = null;
                        break;
                    }
                    targetIndex++;
                }
            }
            if (lexeme.value.toLowerCase() === 'from' && current.suffixComments && current.suffixComments.length > 0) {
                const suffixComments = current.suffixComments;
                let targetIndex = i + 1;
                while (targetIndex < tokenData.length) {
                    const target = tokenData[targetIndex];
                    // Attach FROM suffix comments to the immediately following source token.
                    const isCommand = (target.lexeme.type & Lexeme_1.TokenType.Command) !== 0;
                    if (!isCommand) {
                        if (!target.prefixComments) {
                            target.prefixComments = [];
                        }
                        target.prefixComments.unshift(...suffixComments);
                        current.suffixComments = null;
                        break;
                    }
                    targetIndex++;
                }
            }
            // Ensure commas push trailing comments onto the following token.
            if ((lexeme.type & Lexeme_1.TokenType.Comma) && current.suffixComments && current.suffixComments.length > 0) {
                const suffixComments = current.suffixComments;
                let targetIndex = i + 1;
                if (targetIndex < tokenData.length) {
                    const target = tokenData[targetIndex];
                    if (!target.prefixComments) {
                        target.prefixComments = [];
                    }
                    target.prefixComments.unshift(...suffixComments);
                    current.suffixComments = null;
                }
            }
            // Bridge set-operator suffix comments to the subsequent SELECT clause.
            if ((lexeme.value.toLowerCase() === 'union' || lexeme.value.toLowerCase() === 'intersect' || lexeme.value.toLowerCase() === 'except') &&
                current.suffixComments && current.suffixComments.length > 0) {
                const suffixComments = current.suffixComments;
                let targetIndex = i + 1;
                while (targetIndex < tokenData.length) {
                    const target = tokenData[targetIndex];
                    if (target.lexeme.value.toLowerCase() === 'select') {
                        if (!target.prefixComments) {
                            target.prefixComments = [];
                        }
                        target.prefixComments.unshift(...suffixComments);
                        current.suffixComments = null;
                        break;
                    }
                    targetIndex++;
                }
            }
            this.attachCommentsToLexeme(lexeme, current);
            // Attach source position metadata so downstream parsers can report precise locations.
            lexeme.position = {
                startPosition: current.startPos,
                endPosition: current.endPos,
                ...this.getLineColumnInfo(current.startPos, current.endPos)
            };
            lexemes[i] = lexeme;
        }
        return lexemes;
    }
    skipPastTerminator(position) {
        let next = position;
        if (next < this.input.length && this.input[next] === ';') {
            next++;
        }
        return this.skipWhitespaceAndComments(next);
    }
    mergeComments(base, addition) {
        if (addition && addition.length > 0) {
            if (!base || base.length === 0) {
                return [...addition];
            }
            return [...base, ...addition];
        }
        return base ?
            [...base] : null;
    }
    // Attach comments to lexeme directly (no collection then assignment anti-pattern)
    attachCommentsToLexeme(lexeme, tokenData) {
        const newPositionedComments = [];
        const allLegacyComments = [];
        // Preserve existing positioned comments from token readers (e.g., CommandTokenReader)
        if (lexeme.positionedComments && lexeme.positionedComments.length > 0) {
            newPositionedComments.push(...lexeme.positionedComments);
        }
        // Add prefix comments as "before" positioned comments directly
        if (tokenData.prefixComments && tokenData.prefixComments.length > 0) {
            allLegacyComments.push(...tokenData.prefixComments);
            newPositionedComments.push({ position: 'before', comments: [...tokenData.prefixComments] });
        }
        // Add suffix comments as "after" positioned comments directly
        if (tokenData.suffixComments && tokenData.suffixComments.length > 0) {
            allLegacyComments.push(...tokenData.suffixComments);
            newPositionedComments.push({ position: 'after', comments: [...tokenData.suffixComments] });
        }
        // Apply comments directly to lexeme (positioned comments take priority)
        if (newPositionedComments.length > 0) {
            lexeme.positionedComments = newPositionedComments;
            // Clear legacy comments when positioned comments exist to avoid duplication
            lexeme.comments = null;
        }
        else if (allLegacyComments.length > 0) {
            // Only set legacy comments if no positioned comments exist
            lexeme.comments = allLegacyComments;
            lexeme.positionedComments = undefined;
        }
        else {
            // Clear both if no comments exist
            lexeme.comments = null;
            lexeme.positionedComments = undefined;
        }
    }
    /**
     * Reads whitespace characters and SQL comments starting at the current position.
     *
     * @remarks Does not advance the tokenizer itself; it returns the end position
     * of the skipped span along with any comment lines found, and callers update
     * the position pointer from the result.
     */
    readComment() {
        return stringUtils_1.StringUtils.readWhiteSpaceAndComment(this.input, this.position);
    }
    /**
     * Gets debug information for error reporting.
     *
     * @param errPosition - The position where the error occurred.
     * @returns A string containing the debug position information.
     */
    getDebugPositionInfo(errPosition) {
        return stringUtils_1.StringUtils.getDebugPositionInfo(this.input, errPosition);
    }
    /**
     * Tokenizes the input SQL while preserving formatting information
     */
    tokenizeWithFormatting() {
        // Get regular lexemes first
        const regularLexemes = this.tokenizeBasic();
        // Map regular lexemes to formatting lexemes with whitespace info
        return this.mapToFormattingLexemes(regularLexemes);
    }
    mapToFormattingLexemes(regularLexemes) {
        if (regularLexemes.length === 0) {
            return [];
        }
        // First pass: find all lexeme positions in the input
        const lexemePositions = [];
        let searchPos = 0;
        for (const lexeme of regularLexemes) {
            // Skip whitespace and comments
            searchPos = this.skipWhitespaceAndComments(searchPos);
            // Find lexeme at current position
            const lexemeInfo = this.findLexemeAtPosition(lexeme, searchPos);
            if (lexemeInfo) {
                lexemePositions.push(lexemeInfo);
                searchPos = lexemeInfo.endPosition;
            }
            else {
                // Fallback: assume lexeme length and continue
                const fallbackInfo = {
                    startPosition: searchPos,
                    endPosition: searchPos + lexeme.value.length
                };
                lexemePositions.push(fallbackInfo);
                searchPos = fallbackInfo.endPosition;
            }
        }
        // Second pass: build formatting lexemes with proper whitespace segments
        const formattingLexemes = [];
        for (let i = 0; i < regularLexemes.length; i++) {
            const lexeme = regularLexemes[i];
            const lexemeInfo = lexemePositions[i];
            // Determine the end position of the whitespace segment
            const nextLexemeStartPos = i < regularLexemes.length - 1 ?
                lexemePositions[i + 1].startPosition : this.input.length;
            // Extract whitespace between this lexeme and the next
            const whitespaceSegment = this.input.slice(lexemeInfo.endPosition, nextLexemeStartPos);
            const inlineComments = this.extractCommentsFromWhitespace(whitespaceSegment);
            const formattingLexeme = {
                ...lexeme,
                followingWhitespace: whitespaceSegment,
                inlineComments,
                position: {
                    startPosition: lexemeInfo.startPosition,
                    endPosition: lexemeInfo.endPosition,
                    ...this.getLineColumnInfo(lexemeInfo.startPosition, lexemeInfo.endPosition)
                }
            };
            formattingLexemes.push(formattingLexeme);
        }
        return formattingLexemes;
    }
    /**
     * Find lexeme at a specific position, handling case variations
     */
    findLexemeAtPosition(lexeme, expectedPos) {
        if (expectedPos >= this.input.length) {
            return null;
        }
        // For command tokens (keywords), the lexeme.value might be lowercase but appear uppercase in input
        const valuesToTry = [lexeme.value, lexeme.value.toUpperCase(), lexeme.value.toLowerCase()];
        for (const valueToTry of valuesToTry) {
            // Check if the input at expected position matches this value
            if (expectedPos + valueToTry.length <= this.input.length &&
                this.input.substring(expectedPos, expectedPos + valueToTry.length) === valueToTry &&
                this.isValidLexemeMatch(valueToTry, expectedPos)) {
                return {
                    startPosition: expectedPos,
                    endPosition: expectedPos + valueToTry.length
                };
            }
        }
        return null;
    }
    isValidLexemeMatch(value, position) {
        // Check character before
        if (position > 0) {
            const charBefore = this.input[position - 1];
            if (this.isAlphanumericUnderscore(charBefore)) {
                return false; // Part of another identifier
            }
        }
        // Check character after
        const endPosition = position + value.length;
        if (endPosition < this.input.length) {
            const charAfter = this.input[endPosition];
            if (this.isAlphanumericUnderscore(charAfter)) {
                return false; // Part of another identifier
            }
        }
        return true;
    }
    /**
     * Check if character is alphanumeric or underscore (faster than regex)
     */
    isAlphanumericUnderscore(char) {
        const code = char.charCodeAt(0);
        return (code >= 48 && code <= 57) || // 0-9
            (code >= 65 && code <= 90) || // A-Z
            (code >= 97 && code <= 122) || // a-z
            code === 95; // _
    }
    /**
     * Check if character is whitespace (faster than regex)
     */
    isWhitespace(char) {
        const code = char.charCodeAt(0);
        return code === 32 || // space
            code === 9 || // tab
            code === 10 || // \n
            code === 13; // \r
    }
    extractCommentsFromWhitespace(whitespaceSegment) {
        const inlineComments = [];
        let pos = 0;
        while (pos < whitespaceSegment.length) {
            const oldPos = pos;
            // Try to extract comments using StringUtils
            const result = stringUtils_1.StringUtils.readWhiteSpaceAndComment(whitespaceSegment, pos);
            // Add any comments found
            const lines = result.lines;
            if (lines && lines.length > 0) {
                inlineComments.push(...lines);
            }
            // Move position forward
            pos = result.position;
            // Prevent infinite loop - if position didn't advance, manually skip one character
            if (pos === oldPos) {
                pos++;
            }
        }
        return inlineComments;
    }
    /**
     * Skip whitespace and comments from the given position
     */
    skipWhitespaceAndComments(pos) {
        return stringUtils_1.StringUtils.readWhiteSpaceAndComment(this.input, pos).position;
    }
    getLineColumnInfo(startPos, endPos) {
        const startInfo = this.getLineColumn(startPos);
        const endInfo = this.getLineColumn(endPos);
        return {
            startLine: startInfo.line,
            startColumn: startInfo.column,
            endLine: endInfo.line,
            endColumn: endInfo.column
        };
    }
    getLineColumn(position) {
        const starts = this.ensureLineStartPositions();
        // Use binary search to locate the greatest line start that does not exceed position.
        let low = 0;
        let high = starts.length - 1;
        while (low <= high) {
            const mid = (low + high) >>> 1;
            if (starts[mid] <= position) {
                low = mid + 1;
            }
            else {
                high = mid - 1;
            }
        }
        const lineIndex = high >= 0 ? high : 0;
        const lineStart = starts[lineIndex];
        return { line: lineIndex + 1, column: position - lineStart + 1 };
    }
    ensureLineStartPositions() {
        if (this.lineStartPositions) {
            return this.lineStartPositions;
        }
        const starts = [0];
        // Precompute the start index of each line so callers can map positions in O(log n).
        for (let i = 0; i < this.input.length; i++) {
            if (this.input.charCodeAt(i) === 10) { // '\n'
                starts.push(i + 1);
            }
        }
        this.lineStartPositions = starts;
        return starts;
    }
}
exports.SqlTokenizer = SqlTokenizer;
//# sourceMappingURL=SqlTokenizer.js.map
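To keep the original whitespace and comments alongside each token, tokenize accepts a preserveFormatting option, as implemented above in tokenizeWithFormatting and mapToFormattingLexemes. A minimal sketch, under the same assumed root export as the earlier example (the import path and sample SQL are illustrative):

import { SqlTokenizer } from "rawsql-ts"; // assumed root export

const sql = "SELECT id, -- primary key\n       name\nFROM users";
const tokenizer = new SqlTokenizer(sql);

// With preserveFormatting, each lexeme additionally carries the whitespace
// segment that follows it and any comments found inside that segment.
const formatted = tokenizer.tokenize({ preserveFormatting: true });
for (const lexeme of formatted) {
    console.log(lexeme.value, JSON.stringify(lexeme.followingWhitespace), lexeme.inlineComments);
}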