rawsql-ts
[beta] High-performance SQL parser and AST analyzer written in TypeScript. Provides fast parsing and advanced transformation capabilities.
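A minimal usage sketch (illustrative; the require path is an assumption, but the `SqlTokenizer` export matches the `exports.SqlTokenizer` assignment at the end of the file):

    const { SqlTokenizer } = require("rawsql-ts");
    const tokenizer = new SqlTokenizer("SELECT id FROM users");
    const lexemes = tokenizer.tokenize(); // or tokenize({ preserveFormatting: true })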
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SqlTokenizer = void 0;
const Lexeme_1 = require("../models/Lexeme");
const CommandTokenReader_1 = require("../tokenReaders/CommandTokenReader");
const EscapedIdentifierTokenReader_1 = require("../tokenReaders/EscapedIdentifierTokenReader");
const FunctionTokenReader_1 = require("../tokenReaders/FunctionTokenReader");
const IdentifierTokenReader_1 = require("../tokenReaders/IdentifierTokenReader");
const LiteralTokenReader_1 = require("../tokenReaders/LiteralTokenReader");
const OperatorTokenReader_1 = require("../tokenReaders/OperatorTokenReader");
const ParameterTokenReader_1 = require("../tokenReaders/ParameterTokenReader");
const SymbolTokenReader_1 = require("../tokenReaders/SymbolTokenReader");
const StringSpecifierTokenReader_1 = require("../tokenReaders/StringSpecifierTokenReader");
const TokenReaderManager_1 = require("../tokenReaders/TokenReaderManager");
const TypeTokenReader_1 = require("../tokenReaders/TypeTokenReader");
const stringUtils_1 = require("../utils/stringUtils");
/**
* Class responsible for tokenizing SQL input.
*/
class SqlTokenizer {
/**
* Initializes a new instance of the SqlTokenizer.
*/
constructor(input) {
/**
* Cached start offsets for each line in the input string.
*/
this.lineStartPositions = null;
this.input = input;
this.position = 0;
// Initialize the token reader manager and register all readers
this.readerManager = new TokenReaderManager_1.TokenReaderManager(input)
.register(new EscapedIdentifierTokenReader_1.EscapedIdentifierTokenReader(input))
.register(new ParameterTokenReader_1.ParameterTokenReader(input))
.register(new StringSpecifierTokenReader_1.StringSpecifierTokenReader(input))
// LiteralTokenReader should be registered before SpecialSymbolTokenReader and OperatorTokenReader
// Reason: To prevent numeric literals starting with a dot or sign from being misrecognized as operators
// e.g. `1.0` is a literal, not an operator
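// Likewise `.5` or `+1` would otherwise be split at their leading dot or sign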
.register(new LiteralTokenReader_1.LiteralTokenReader(input))
.register(new SymbolTokenReader_1.SpecialSymbolTokenReader(input))
.register(new CommandTokenReader_1.CommandTokenReader(input))
.register(new OperatorTokenReader_1.OperatorTokenReader(input))
// TypeTokenReader should be registered before FunctionTokenReader
// Reason: To prevent types containing parentheses from being misrecognized as functions
// e.g. `numeric(10, 2)` is a type, not a function
.register(new TypeTokenReader_1.TypeTokenReader(input))
.register(new FunctionTokenReader_1.FunctionTokenReader(input))
.register(new IdentifierTokenReader_1.IdentifierTokenReader(input)); // IdentifierTokenReader should be registered last (catch-all)
}
/**
* Checks if the end of input is reached.
*
* @param shift - The shift to consider beyond the current position.
* @returns True if the end of input is reached; otherwise, false.
*/
isEndOfInput(shift = 0) {
return this.position + shift >= this.input.length;
}
/**
* Checks if more input can be read.
*
* @param shift - The shift to consider beyond the current position.
* @returns True if more input can be read; otherwise, false.
*/
canRead(shift = 0) {
return !this.isEndOfInput(shift);
}
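/**
 * Tokenizes the input SQL.
 *
 * @param options - Optional settings; when `preserveFormatting` is true, the
 * returned lexemes also carry whitespace and inline-comment information.
 * @returns An array of lexemes (or formatting lexemes) extracted from the input.
 *
 * @example
 * // A minimal sketch based on the option check below.
 * const tokenizer = new SqlTokenizer('SELECT id FROM users');
 * const plain = tokenizer.tokenize();
 * const formatted = tokenizer.tokenize({ preserveFormatting: true });
 */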
tokenize(options) {
if (options === null || options === void 0 ? void 0 : options.preserveFormatting) {
return this.tokenizeWithFormatting();
}
// Create a fresh tokenizer instance for clean state
const freshTokenizer = new SqlTokenizer(this.input);
return freshTokenizer.readLexemes();
}
/**
* @deprecated Use {@link readLexemes} (correct spelling) instead.
* This legacy alias remains for backwards compatibility and delegates to the new method.
*/
readLexmes() {
return this.readLexemes();
}
/**
* Reads the lexemes from the input string.
*
* @returns An array of lexemes extracted from the input string.
* @throws Error if an unexpected character is encountered.
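 *
 * @example
 * // Illustrative; the exact casing of keyword values depends on the token readers.
 * new SqlTokenizer('SELECT 1').readLexemes().map(l => l.value);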
*/
readLexemes() {
const segment = this.readNextStatement(0);
return segment ? segment.lexemes : [];
}
/**
* Tokenizes the input SQL without formatting preservation (internal method)
*/
tokenizeBasic() {
const segment = this.readNextStatement(0);
return segment ? segment.lexemes : [];
}
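/**
 * Reads a single statement starting at the given offset.
 *
 * @param startPosition - Offset at which scanning begins.
 * @param carryComments - Comments carried over from a previous statement, if any.
 * @returns A segment describing the statement (its lexemes, boundaries, raw text,
 * next scan position, and any unattached leading comments), or null when the
 * input is exhausted.
 */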
readNextStatement(startPosition = 0, carryComments = null) {
const length = this.input.length;
// Abort when the cursor already moved past the input.
if (startPosition >= length) {
return null;
}
// Move the working cursor to the requested start position.
this.position = startPosition;
const statementStart = startPosition;
let pendingLeading = carryComments ? [...carryComments] : null;
const tokenData = [];
let previous = null;
while (this.canRead()) {
// Capture any comments before the next token and skip intervening whitespace.
const prefixComment = this.readComment();
this.position = prefixComment.position;
if (!this.canRead()) {
// No more characters, so keep any trailing comments for the next statement.
pendingLeading = this.mergeComments(pendingLeading, prefixComment.lines);
break;
}
if (this.input[this.position] === ';') {
// Statement terminated before any token appeared.
pendingLeading = this.mergeComments(pendingLeading, prefixComment.lines);
break;
}
// Read the next lexeme at the current position.
const lexeme = this.readerManager.tryRead(this.position, previous);
if (lexeme === null) {
throw new Error(`Unexpected character. actual: ${this.input[this.position]}, position: ${this.position}\n${this.getDebugPositionInfo(this.position)}`);
}
const tokenStartPos = this.position;
const tokenEndPos = this.position = this.readerManager.getMaxPosition();
// Capture trailing whitespace and comments after the token.
const suffixComment = this.readComment();
this.position = suffixComment.position;
let prefixComments = this.mergeComments(pendingLeading, prefixComment.lines);
pendingLeading = null;
tokenData.push({
lexeme,
startPos: tokenStartPos,
endPos: tokenEndPos,
prefixComments,
suffixComments: suffixComment.lines
});
previous = lexeme;
}
const statementEnd = this.position;
const lexemes = this.buildLexemesFromTokenData(tokenData);
const nextPosition = this.skipPastTerminator(statementEnd);
return {
lexemes,
statementStart,
statementEnd,
nextPosition,
rawText: this.input.slice(statementStart, statementEnd),
leadingComments: pendingLeading
};
}
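/**
 * Converts collected token data into lexemes, re-homing comments so they attach
 * to the token they describe rather than the keyword they happen to follow.
 * For example, in "SELECT -- note\n  id" the comment is read as a suffix of
 * SELECT but ends up as a prefix comment of the select item `id`.
 */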
buildLexemesFromTokenData(tokenData) {
const lexemes = new Array(tokenData.length);
for (let i = 0; i < tokenData.length; i++) {
const current = tokenData[i];
const lexeme = current.lexeme;
// Redirect SELECT suffix comments to the first meaningful select item.
if (lexeme.value.toLowerCase() === 'select' && current.suffixComments && current.suffixComments.length > 0) {
const suffixComments = current.suffixComments;
let targetIndex = i + 1;
while (targetIndex < tokenData.length) {
const target = tokenData[targetIndex];
// Allow SELECT-prefix comments to bind to '*' tokens so they stay with the select list.
const isStarOperator = (target.lexeme.type & Lexeme_1.TokenType.Operator) && target.lexeme.value === '*';
if ((target.lexeme.type & Lexeme_1.TokenType.Identifier) ||
(target.lexeme.type & Lexeme_1.TokenType.Literal) ||
isStarOperator ||
(!(target.lexeme.type & Lexeme_1.TokenType.Command) &&
!(target.lexeme.type & Lexeme_1.TokenType.Comma) &&
!(target.lexeme.type & Lexeme_1.TokenType.Operator))) {
if (!target.prefixComments) {
target.prefixComments = [];
}
target.prefixComments.unshift(...suffixComments);
current.suffixComments = null;
break;
}
targetIndex++;
}
}
if (lexeme.value.toLowerCase() === 'from' && current.suffixComments && current.suffixComments.length > 0) {
const suffixComments = current.suffixComments;
let targetIndex = i + 1;
while (targetIndex < tokenData.length) {
const target = tokenData[targetIndex];
// Attach FROM suffix comments to the immediately following source token.
const isCommand = (target.lexeme.type & Lexeme_1.TokenType.Command) !== 0;
if (!isCommand) {
if (!target.prefixComments) {
target.prefixComments = [];
}
target.prefixComments.unshift(...suffixComments);
current.suffixComments = null;
break;
}
targetIndex++;
}
}
// Ensure commas push trailing comments onto the following token.
if ((lexeme.type & Lexeme_1.TokenType.Comma) && current.suffixComments && current.suffixComments.length > 0) {
const suffixComments = current.suffixComments;
let targetIndex = i + 1;
if (targetIndex < tokenData.length) {
const target = tokenData[targetIndex];
if (!target.prefixComments) {
target.prefixComments = [];
}
target.prefixComments.unshift(...suffixComments);
current.suffixComments = null;
}
}
// Bridge set-operator suffix comments to the subsequent SELECT clause.
if ((lexeme.value.toLowerCase() === 'union' ||
lexeme.value.toLowerCase() === 'intersect' ||
lexeme.value.toLowerCase() === 'except') &&
current.suffixComments && current.suffixComments.length > 0) {
const suffixComments = current.suffixComments;
let targetIndex = i + 1;
while (targetIndex < tokenData.length) {
const target = tokenData[targetIndex];
if (target.lexeme.value.toLowerCase() === 'select') {
if (!target.prefixComments) {
target.prefixComments = [];
}
target.prefixComments.unshift(...suffixComments);
current.suffixComments = null;
break;
}
targetIndex++;
}
}
this.attachCommentsToLexeme(lexeme, current);
// Attach source position metadata so downstream parsers can report precise locations.
lexeme.position = {
startPosition: current.startPos,
endPosition: current.endPos,
...this.getLineColumnInfo(current.startPos, current.endPos)
};
lexemes[i] = lexeme;
}
return lexemes;
}
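/**
 * Advances past a single statement terminator (';') and any whitespace or
 * comments that follow it, returning the next scan position.
 */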
skipPastTerminator(position) {
let next = position;
if (next < this.input.length && this.input[next] === ';') {
next++;
}
return this.skipWhitespaceAndComments(next);
}
mergeComments(base, addition) {
if (addition && addition.length > 0) {
if (!base || base.length === 0) {
return [...addition];
}
return [...base, ...addition];
}
return base ? [...base] : null;
}
// Attach comments directly to the lexeme, avoiding a separate collect-then-assign pass
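// Example (illustrative): when both prefix and suffix comment lines are present,
// the lexeme ends up with positionedComments =
//   [{ position: 'before', comments: [...] }, { position: 'after', comments: [...] }]
// and the legacy `comments` field is cleared to avoid duplication.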
attachCommentsToLexeme(lexeme, tokenData) {
const newPositionedComments = [];
const allLegacyComments = [];
// Preserve existing positioned comments from token readers (e.g., CommandTokenReader)
if (lexeme.positionedComments && lexeme.positionedComments.length > 0) {
newPositionedComments.push(...lexeme.positionedComments);
}
// Add prefix comments as "before" positioned comments directly
if (tokenData.prefixComments && tokenData.prefixComments.length > 0) {
allLegacyComments.push(...tokenData.prefixComments);
newPositionedComments.push({
position: 'before',
comments: [...tokenData.prefixComments]
});
}
// Add suffix comments as "after" positioned comments directly
if (tokenData.suffixComments && tokenData.suffixComments.length > 0) {
allLegacyComments.push(...tokenData.suffixComments);
newPositionedComments.push({
position: 'after',
comments: [...tokenData.suffixComments]
});
}
// Apply comments directly to lexeme (positioned comments take priority)
if (newPositionedComments.length > 0) {
lexeme.positionedComments = newPositionedComments;
// Clear legacy comments when positioned comments exist to avoid duplication
lexeme.comments = null;
}
else if (allLegacyComments.length > 0) {
// Only set legacy comments if no positioned comments exist
lexeme.comments = allLegacyComments;
lexeme.positionedComments = undefined;
}
else {
// Clear both if no comments exist
lexeme.comments = null;
lexeme.positionedComments = undefined;
}
}
/**
 * Reads whitespace characters and SQL comments starting at the current position.
 *
 * @remarks This method does not advance the cursor itself; callers apply the
 * returned offset to `this.position`.
 */
readComment() {
return stringUtils_1.StringUtils.readWhiteSpaceAndComment(this.input, this.position);
}
/**
* Gets debug information for error reporting.
*
* @param errPosition - The position where the error occurred.
* @returns A string containing the debug position information.
*/
getDebugPositionInfo(errPosition) {
return stringUtils_1.StringUtils.getDebugPositionInfo(this.input, errPosition);
}
/**
* Tokenizes the input SQL while preserving formatting information
*/
tokenizeWithFormatting() {
// Get regular lexemes first
const regularLexemes = this.tokenizeBasic();
// Map regular lexemes to formatting lexemes with whitespace info
return this.mapToFormattingLexemes(regularLexemes);
}
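/**
 * Re-locates each lexeme in the original input and attaches the raw whitespace
 * and comment text that follows it, so formatters can round-trip spacing.
 * For example, given 'SELECT  id', the SELECT lexeme carries
 * followingWhitespace === '  '.
 */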
mapToFormattingLexemes(regularLexemes) {
if (regularLexemes.length === 0) {
return [];
}
// First pass: find all lexeme positions in the input
const lexemePositions = [];
let searchPos = 0;
for (const lexeme of regularLexemes) {
// Skip whitespace and comments
searchPos = this.skipWhitespaceAndComments(searchPos);
// Find lexeme at current position
const lexemeInfo = this.findLexemeAtPosition(lexeme, searchPos);
if (lexemeInfo) {
lexemePositions.push(lexemeInfo);
searchPos = lexemeInfo.endPosition;
}
else {
// Fallback: assume lexeme length and continue
const fallbackInfo = {
startPosition: searchPos,
endPosition: searchPos + lexeme.value.length
};
lexemePositions.push(fallbackInfo);
searchPos = fallbackInfo.endPosition;
}
}
// Second pass: build formatting lexemes with proper whitespace segments
const formattingLexemes = [];
for (let i = 0; i < regularLexemes.length; i++) {
const lexeme = regularLexemes[i];
const lexemeInfo = lexemePositions[i];
// Determine the end position of the whitespace segment
const nextLexemeStartPos = i < regularLexemes.length - 1
? lexemePositions[i + 1].startPosition
: this.input.length;
// Extract whitespace between this lexeme and the next
const whitespaceSegment = this.input.slice(lexemeInfo.endPosition, nextLexemeStartPos);
const inlineComments = this.extractCommentsFromWhitespace(whitespaceSegment);
const formattingLexeme = {
...lexeme,
followingWhitespace: whitespaceSegment,
inlineComments,
position: {
startPosition: lexemeInfo.startPosition,
endPosition: lexemeInfo.endPosition,
...this.getLineColumnInfo(lexemeInfo.startPosition, lexemeInfo.endPosition)
}
};
formattingLexemes.push(formattingLexeme);
}
return formattingLexemes;
}
/**
* Find lexeme at a specific position, handling case variations
*/
findLexemeAtPosition(lexeme, expectedPos) {
if (expectedPos >= this.input.length) {
return null;
}
// For command tokens (keywords), the lexeme.value might be lowercase but appear uppercase in input
const valuesToTry = [lexeme.value, lexeme.value.toUpperCase(), lexeme.value.toLowerCase()];
for (const valueToTry of valuesToTry) {
// Check if the input at expected position matches this value
if (expectedPos + valueToTry.length <= this.input.length &&
this.input.substring(expectedPos, expectedPos + valueToTry.length) === valueToTry &&
this.isValidLexemeMatch(valueToTry, expectedPos)) {
return {
startPosition: expectedPos,
endPosition: expectedPos + valueToTry.length
};
}
}
return null;
}
isValidLexemeMatch(value, position) {
// Check character before
if (position > 0) {
const charBefore = this.input[position - 1];
if (this.isAlphanumericUnderscore(charBefore)) {
return false; // Part of another identifier
}
}
// Check character after
const endPosition = position + value.length;
if (endPosition < this.input.length) {
const charAfter = this.input[endPosition];
if (this.isAlphanumericUnderscore(charAfter)) {
return false; // Part of another identifier
}
}
return true;
}
/**
* Check if character is alphanumeric or underscore (faster than regex)
*/
isAlphanumericUnderscore(char) {
const code = char.charCodeAt(0);
return (code >= 48 && code <= 57) || // 0-9
(code >= 65 && code <= 90) || // A-Z
(code >= 97 && code <= 122) || // a-z
code === 95; // _
}
/**
* Check if character is whitespace (faster than regex)
*/
isWhitespace(char) {
const code = char.charCodeAt(0);
return code === 32 || // space
code === 9 || // tab
code === 10 || // \n
code === 13; // \r
}
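/**
 * Collects every comment found inside a whitespace segment, such as the text
 * between two lexemes (e.g. " -- note\n  ").
 */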
extractCommentsFromWhitespace(whitespaceSegment) {
const inlineComments = [];
let pos = 0;
while (pos < whitespaceSegment.length) {
const oldPos = pos;
// Try to extract comments using StringUtils
const result = stringUtils_1.StringUtils.readWhiteSpaceAndComment(whitespaceSegment, pos);
// Add any comments found
const lines = result.lines;
if (lines && lines.length > 0) {
inlineComments.push(...lines);
}
// Move position forward
pos = result.position;
// Prevent infinite loop - if position didn't advance, manually skip one character
if (pos === oldPos) {
pos++;
}
}
return inlineComments;
}
/**
* Skip whitespace and comments from the given position
*/
skipWhitespaceAndComments(pos) {
return stringUtils_1.StringUtils.readWhiteSpaceAndComment(this.input, pos).position;
}
getLineColumnInfo(startPos, endPos) {
const startInfo = this.getLineColumn(startPos);
const endInfo = this.getLineColumn(endPos);
return {
startLine: startInfo.line,
startColumn: startInfo.column,
endLine: endInfo.line,
endColumn: endInfo.column
};
}
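/**
 * Maps a character offset to a 1-based line/column pair using binary search
 * over the cached line-start offsets.
 * For example, for input 'a\nbc', offset 2 (the 'b') maps to line 2, column 1.
 */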
getLineColumn(position) {
const starts = this.ensureLineStartPositions();
// Use binary search to locate the greatest line start that does not exceed position.
let low = 0;
let high = starts.length - 1;
while (low <= high) {
const mid = (low + high) >>> 1;
if (starts[mid] <= position) {
low = mid + 1;
}
else {
high = mid - 1;
}
}
const lineIndex = high >= 0 ? high : 0;
const lineStart = starts[lineIndex];
return {
line: lineIndex + 1,
column: position - lineStart + 1
};
}
ensureLineStartPositions() {
if (this.lineStartPositions) {
return this.lineStartPositions;
}
const starts = [0];
// Precompute the start index of each line so callers can map positions in O(log n).
for (let i = 0; i < this.input.length; i++) {
if (this.input.charCodeAt(i) === 10) { // '\n'
starts.push(i + 1);
}
}
this.lineStartPositions = starts;
return starts;
}
}
exports.SqlTokenizer = SqlTokenizer;
//# sourceMappingURL=SqlTokenizer.js.map