rawsql-ts
Version:
High-performance SQL parser and AST analyzer written in TypeScript. Provides fast parsing and advanced transformation capabilities.
226 lines • 9.88 kB
JavaScript
import { CharLookupTable } from "./charLookupTable";
/**
* Utilities for string operations during tokenization
*/
export class StringUtils {
    /**
     * Creates a visual representation of an error position in text.
     *
     * The result has two lines: an excerpt of up to 5 characters on either side
     * of the error position, and a caret (`^`) placed under the error position.
     * Positions outside the input are clamped into `[0, input.length]` so that
     * building a diagnostic message can never throw by itself.
     *
     * @param {string} input The input text
     * @param {number} errPosition The error position (0-based index into `input`)
     * @returns {string} A string with a caret pointing to the error position
     */
    static getDebugPositionInfo(input, errPosition) {
        // A negative offset would make String.prototype.repeat throw a RangeError,
        // hiding the error this message is supposed to describe.
        const safePosition = Math.min(Math.max(0, errPosition), input.length);
        // Keep the excerpt window inside the string range.
        const start = Math.max(0, safePosition - 5);
        const end = Math.min(input.length, safePosition + 5);
        const debugInfo = input.slice(start, end);
        // Display ^ at the error position on the next line.
        const caret = ' '.repeat(safePosition - start) + '^';
        return `${debugInfo}\n${caret}`;
    }
    /**
     * Skip white space characters (space, tab, line feed, carriage return).
     *
     * @param {string} input The input text
     * @param {number} position The index to start skipping from
     * @returns {number} The index of the first non-whitespace character at or
     *   after `position`, or `input.length` if only whitespace remains
     */
    static skipWhiteSpace(input, position) {
        const length = input.length;
        /*
         * Optimization: consume runs of 4 spaces at once (typical 4-space indents).
         * The check uses character codes rather than a substring comparison so that
         * no temporary string is allocated per step. Any other indentation
         * (2 spaces, tabs, mixed) simply falls through to the loop below.
         */
        // ' '=32
        while (position + 4 <= length &&
            input.charCodeAt(position) === 32 &&
            input.charCodeAt(position + 1) === 32 &&
            input.charCodeAt(position + 2) === 32 &&
            input.charCodeAt(position + 3) === 32) {
            position += 4;
        }
        // Then skip remaining whitespace one by one.
        while (position < length) {
            const charCode = input.charCodeAt(position);
            // ' '=32, '\t'=9, '\n'=10, '\r'=13
            if (charCode !== 32 && charCode !== 9 && charCode !== 10 && charCode !== 13) {
                break;
            }
            position++;
        }
        return position;
    }
    /**
     * Read a line comment (`-- ...`) starting at `position`.
     *
     * @param {string} input The input text
     * @param {number} position The index where the comment is expected to start
     * @returns {{newPosition: number, comment: (string|null)}} When a comment starts
     *   at `position`: `newPosition` is the index of the terminating line feed (or
     *   `input.length`) and `comment` is the trimmed text after `--`. Otherwise
     *   `newPosition` equals `position` and `comment` is `null`.
     */
    static readLineComment(input, position) {
        // At least two characters are needed for the `--` marker.
        if (position + 1 >= input.length) {
            return { newPosition: position, comment: null };
        }
        // '-'=45
        if (input.charCodeAt(position) !== 45 || input.charCodeAt(position + 1) !== 45) {
            return { newPosition: position, comment: null };
        }
        const contentStart = position + 2;
        // The comment runs up to (not including) the next line feed, or to end of input.
        const lineFeedIndex = input.indexOf('\n', contentStart);
        const end = lineFeedIndex === -1 ? input.length : lineFeedIndex;
        // Trimming also drops a trailing '\r' from CRLF line endings.
        return { newPosition: end, comment: input.slice(contentStart, end).trim() };
    }
    /**
     * Read a block comment starting at `position`.
     *
     * Oracle style hints (a block comment opening with `/*+`) are deliberately not
     * consumed so that other readers can process them.
     *
     * @param {string} input The input text
     * @param {number} position The index where the comment is expected to start
     * @returns {{newPosition: number, comments: (string[]|null)}} When a block comment
     *   starts at `position`: `newPosition` is the index just past the closing marker
     *   (or `input.length` if the comment is unterminated) and `comments` holds the
     *   processed comment lines. Otherwise `newPosition` equals `position` and
     *   `comments` is `null`.
     */
    static readBlockComment(input, position) {
        if (position + 1 >= input.length) {
            return { newPosition: position, comments: null };
        }
        // Fast check for the opening marker ('/'=47, '*'=42)
        if (input.charCodeAt(position) !== 47 || input.charCodeAt(position + 1) !== 42) {
            return { newPosition: position, comments: null };
        }
        // Hint marker ('+'=43). charCodeAt yields NaN past the end, so no bounds check is needed.
        if (input.charCodeAt(position + 2) === 43) {
            return { newPosition: position, comments: null };
        }
        const contentStart = position + 2;
        // Search for the closing marker only after the opening one, so that "/*/" stays unterminated.
        const closeIndex = input.indexOf('*/', contentStart);
        if (closeIndex === -1) {
            // Unterminated comment: consume rest of input and return collected lines.
            return {
                newPosition: input.length,
                comments: StringUtils.processBlockCommentContent(input.slice(contentStart))
            };
        }
        return {
            newPosition: closeIndex + 2,
            comments: StringUtils.processBlockCommentContent(input.slice(contentStart, closeIndex))
        };
    }
    /**
     * Normalize the raw text found between the block comment markers.
     *
     * Carriage returns are removed, the text is split on line feeds, every line is
     * trimmed, and blank lines at the beginning and end are dropped. Blank lines
     * between non-blank lines are kept as empty strings.
     *
     * @param {string} rawContent The comment text without its opening/closing markers
     * @returns {string[]} The processed lines (possibly empty)
     */
    static processBlockCommentContent(rawContent) {
        const trimmedLines = rawContent
            .replace(/\r/g, '')
            .split('\n')
            .map((line) => line.trim());
        // Locate the first and last non-blank lines instead of shifting/popping repeatedly.
        let first = 0;
        let last = trimmedLines.length;
        while (first < last && trimmedLines[first] === '') {
            first++;
        }
        while (last > first && trimmedLines[last - 1] === '') {
            last--;
        }
        return trimmedLines.slice(first, last);
    }
    /**
     * Skip white space characters and SQL comments.
     *
     * @param {string} input The input text
     * @param {number} position The index to start from
     * @returns {{position: number, lines: (string[]|null)}} The new position and the
     *   collected comment lines. `lines` is `null` when no comment contributed
     *   anything; an empty block comment still yields an (empty) array.
     */
    static readWhiteSpaceAndComment(input, position) {
        let lines = null;
        const length = input.length;
        while (position < length) {
            // Skip whitespace first; if anything was skipped, re-evaluate from the new position.
            const afterWhiteSpace = StringUtils.skipWhiteSpace(input, position);
            if (afterWhiteSpace !== position) {
                position = afterWhiteSpace;
                continue;
            }
            const charCode = input.charCodeAt(position);
            // '-'=45 (Line comment)
            if (charCode === 45) {
                const lineCommentResult = StringUtils.readLineComment(input, position);
                if (lineCommentResult.newPosition === position) {
                    // A lone '-' is not a comment.
                    break;
                }
                position = lineCommentResult.newPosition;
                // Empty comments ("--" with no text) are not recorded.
                if (lineCommentResult.comment) {
                    if (lines === null) {
                        lines = [];
                    }
                    lines.push(lineCommentResult.comment);
                }
                continue;
            }
            // '/'=47 (Block comment)
            if (charCode === 47) {
                const blockCommentResult = StringUtils.readBlockComment(input, position);
                if (blockCommentResult.newPosition === position) {
                    // A lone '/' or an optimizer hint is not consumed here.
                    break;
                }
                position = blockCommentResult.newPosition;
                if (blockCommentResult.comments) {
                    if (lines === null) {
                        lines = [];
                    }
                    // Push one by one: spreading a very large array into push() can
                    // exceed the engine's argument count limit.
                    for (const commentLine of blockCommentResult.comments) {
                        lines.push(commentLine);
                    }
                }
                continue;
            }
            // No more whitespace or comments found
            break;
        }
        return { position, lines };
    }
    /**
     * Read a regular identifier, throwing when none is present.
     *
     * @param {string} input The input text
     * @param {number} position The index where the identifier is expected to start
     * @returns {{identifier: string, newPosition: number}} The identifier text and the
     *   index just past it
     * @throws {Error} If the character at `position` cannot start an identifier
     */
    static readRegularIdentifier(input, position) {
        const result = StringUtils.tryReadRegularIdentifier(input, position);
        if (!result) {
            throw new Error(`Unexpected character. position: ${position}\n${StringUtils.getDebugPositionInfo(input, position)}`);
        }
        return result;
    }
    /**
     * Try to read a regular identifier.
     *
     * Characters are consumed until `CharLookupTable.isDelimiter` reports a
     * delimiter. Directly following `[]` pairs are appended to the identifier
     * (array type suffix) unless the text before the identifier suggests an
     * array access context.
     *
     * @param {string} input The input text
     * @param {number} position The index where the identifier is expected to start
     * @returns {({identifier: string, newPosition: number}|null)} The identifier text
     *   and the index just past it, or `null` when no identifier character was read
     */
    static tryReadRegularIdentifier(input, position) {
        const start = position;
        while (position < input.length && !CharLookupTable.isDelimiter(input[position])) {
            position++;
        }
        if (start === position) {
            return null;
        }
        const hasEmptyBracketsAt = (index) => index + 1 < input.length &&
            input[index] === '[' &&
            input[index + 1] === ']';
        if (hasEmptyBracketsAt(position)) {
            // The preceding context does not change while `[]` pairs are consumed,
            // so it is inspected once (and only when `[]` actually follows).
            const beforeIdentifier = input.slice(0, start).trim();
            // Simple heuristic: treat `[]` as array access rather than a type suffix if
            // 1. the identifier is at the start of input (standalone identifier), or
            // 2. the previous context suggests a variable/column reference.
            const isArrayAccessContext = beforeIdentifier === '' ||
                /[)]$/.test(beforeIdentifier) || // After closing paren
                /\b(select|from|where|and|or|set|values|insert|update|delete)\s*$/i.test(beforeIdentifier);
            if (!isArrayAccessContext) {
                // Type context: include every consecutive `[]` in the identifier.
                while (hasEmptyBracketsAt(position)) {
                    position += 2;
                }
            }
        }
        return {
            identifier: input.slice(start, position),
            newPosition: position
        };
    }
}
//# sourceMappingURL=stringUtils.js.map