rawsql-ts
Version:
[beta]High-performance SQL parser and AST analyzer written in TypeScript. Provides fast parsing and advanced transformation capabilities.
172 lines • 7.61 kB
JavaScript
import { TokenType } from '../models/Lexeme';
import { IdentifierTokenReader } from '../tokenReaders/IdentifierTokenReader';
import { LiteralTokenReader } from '../tokenReaders/LiteralTokenReader';
import { ParameterTokenReader } from '../tokenReaders/ParameterTokenReader';
import { SpecialSymbolTokenReader } from '../tokenReaders/SymbolTokenReader';
import { TokenReaderManager } from '../tokenReaders/TokenReaderManager';
import { OperatorTokenReader } from '../tokenReaders/OperatorTokenReader';
import { StringUtils } from '../utils/stringUtils';
import { CommandTokenReader } from '../tokenReaders/CommandTokenReader';
import { StringSpecifierTokenReader } from '../tokenReaders/StringSpecifierTokenReader';
import { FunctionTokenReader } from '../tokenReaders/FunctionTokenReader';
import { TypeTokenReader } from '../tokenReaders/TypeTokenReader';
import { EscapedIdentifierTokenReader } from '../tokenReaders/EscapedIdentifierTokenReader';
/**
* Class responsible for tokenizing SQL input.
*/
export class SqlTokenizer {
/**
* Initializes a new instance of the SqlTokenizer.
*/
constructor(input) {
this.input = input;
this.position = 0;
// Initialize the token reader manager and register all readers
this.readerManager = new TokenReaderManager(input)
.register(new EscapedIdentifierTokenReader(input))
.register(new ParameterTokenReader(input))
.register(new StringSpecifierTokenReader(input))
// LiteralTokenReader should be registered before SpecialSymbolTokenReader and OperatorTokenReader
// Reason: To prevent numeric literals starting with a dot or sign from being misrecognized as operators
// e.g. `1.0` is a literal, not an operator
.register(new LiteralTokenReader(input))
.register(new SpecialSymbolTokenReader(input))
.register(new CommandTokenReader(input))
.register(new OperatorTokenReader(input))
// TypeTokenReader should be registered before FunctionTokenReader
// Reason: To prevent types containing parentheses from being misrecognized as functions
// e.g. `numeric(10, 2)` is a type, not a function
.register(new TypeTokenReader(input))
.register(new FunctionTokenReader(input))
.register(new IdentifierTokenReader(input)) // IdentifierTokenReader should be registered last
;
}
/**
* Checks if the end of input is reached.
*
* @param shift - The shift to consider beyond the current position.
* @returns True if the end of input is reached; otherwise, false.
*/
isEndOfInput(shift = 0) {
return this.position + shift >= this.input.length;
}
/**
* Checks if more input can be read.
*
* @param shift - The shift to consider beyond the current position.
* @returns True if more input can be read; otherwise, false.
*/
canRead(shift = 0) {
return !this.isEndOfInput(shift);
}
/**
* Reads the lexemes from the input string.
*
* @returns An array of lexemes extracted from the input string.
* @throws Error if an unexpected character is encountered.
*/
readLexmes() {
// Pre-allocate array with estimated capacity for better performance
const estimatedTokens = Math.ceil(this.input.length / 8); // Assuming average token length of 8 chars
const lexemes = new Array(estimatedTokens);
let lexemeCount = 0;
// Read initial prefix comments
const comment = this.readComment();
let pendingComments = comment.lines;
this.position = comment.position;
// Track the previous token
let previous = null;
// Read tokens until the end of input is reached
while (this.canRead()) {
// Semicolon is a delimiter, so stop reading
if (this.input[this.position] === ';') {
break;
}
// Read using the token reader manager
const lexeme = this.readerManager.tryRead(this.position, previous);
if (lexeme === null) {
throw new Error(`Unexpected character. actual: ${this.input[this.position]}, position: ${this.position}\n${this.getDebugPositionInfo(this.position)}`);
}
// Update position
this.position = this.readerManager.getMaxPosition();
// Read suffix comments
const currentComment = this.readComment();
this.position = currentComment.position;
if ((lexeme.type & TokenType.Comma) || (lexeme.type & TokenType.Operator)) {
// Carry over comments after commas or operators
if (currentComment.lines.length > 0) {
pendingComments.push(...currentComment.lines);
}
}
else {
// Add comments to the current token if any
const hasComments = pendingComments.length > 0 || currentComment.lines.length > 0;
if (hasComments) {
this.addCommentsToToken(lexeme, pendingComments, currentComment.lines);
}
pendingComments = []; // Clear as they are processed
}
lexemes[lexemeCount++] = lexeme;
previous = lexeme;
}
// Add any pending comments to the last token
if (pendingComments.length > 0 && lexemeCount > 0) {
const lastToken = lexemes[lexemeCount - 1];
if (lastToken.comments === null) {
lastToken.comments = [];
}
lastToken.comments.push(...pendingComments);
}
// Trim the array to actual size used
return lexemeCount === estimatedTokens ? lexemes : lexemes.slice(0, lexemeCount);
}
/**
* Adds pending comments to the last token.
*/
addPendingCommentsToLastToken(lexemes, pendingComments) {
if (pendingComments.length > 0 && lexemes.length > 0) {
const lastToken = lexemes[lexemes.length - 1];
if (lastToken.comments === null) {
lastToken.comments = [];
}
lastToken.comments.push(...pendingComments);
}
}
/**
* Adds comments to the token.
*/
addCommentsToToken(lexeme, prefixComments, suffixComments) {
const hasComments = prefixComments.length > 0 || suffixComments.length > 0;
if (hasComments) {
if (lexeme.comments === null) {
lexeme.comments = [];
}
// Add prefix comments to the beginning.
if (prefixComments.length > 0) {
lexeme.comments.unshift(...prefixComments);
}
// Add suffix comments to the end.
if (suffixComments.length > 0) {
lexeme.comments.push(...suffixComments);
}
}
}
/**
* Skips whitespace characters and SQL comments in the input.
*
* @remarks This method updates the position pointer.
*/
readComment() {
return StringUtils.readWhiteSpaceAndComment(this.input, this.position);
}
/**
* Gets debug information for error reporting.
*
* @param errPosition - The position where the error occurred.
* @returns A string containing the debug position information.
*/
getDebugPositionInfo(errPosition) {
return StringUtils.getDebugPositionInfo(this.input, errPosition);
}
}
//# sourceMappingURL=SqlTokenizer.js.map