UNPKG

dt-sql-parser

Version:

SQL Parsers for BigData, built with antlr4

411 lines (410 loc) 17.1 kB
import { CharStreams, CommonTokenStream, ParseTreeWalker, PredictionMode, } from 'antlr4ng';
import { CodeCompletionCore } from 'antlr4-c3';
import { findCaretTokenIndex } from './findCaretTokenIndex';
import { ctxToText, tokenToWord } from './textAndWord';
import { ErrorStrategy } from './errorStrategy';

/** Statement separator used when narrowing the completion range. */
export const SQL_SPLIT_SYMBOL_TEXT = ';';

/**
 * Basic SQL class, every sql needs extends it.
 *
 * Subclasses are expected to provide (not visible in this file):
 * `createLexerFromCharStream`, `createParserFromTokenStream`,
 * `createErrorListener`, `splitListener`, `preferredRules`,
 * `processCandidates`, `createEntityCollector`,
 * `createSemanticContextCollector`.
 */
export class BasicSQL {
    constructor() {
        // Errors gathered by the internal listener during cached parses.
        this._parseErrors = [];
        /** members for cache end */
        // Funnels every lexer/parser error into `_parseErrors`.
        this._errorListener = (error) => {
            this._parseErrors.push(error);
        };
        this.locale = 'en_US';
    }

    /**
     * Create an antlr4 lexer from input.
     * @param input string
     * @param errorListener optional listener for lexing errors
     */
    createLexer(input, errorListener) {
        const stream = CharStreams.fromString(input);
        const lexer = this.createLexerFromCharStream(stream);
        if (errorListener) {
            lexer.removeErrorListeners();
            lexer.addErrorListener(this.createErrorListener(errorListener));
        }
        return lexer;
    }

    /**
     * Create an antlr4 parser from input.
     * @param input string
     * @param errorListener optional listener for lexing and parsing errors
     */
    createParser(input, errorListener) {
        const lexer = this.createLexer(input, errorListener);
        const tokens = new CommonTokenStream(lexer);
        const parser = this.createParserFromTokenStream(tokens);
        // SLL is the fast prediction mode; ErrorStrategy handles recovery.
        parser.interpreter.predictionMode = PredictionMode.SLL;
        if (errorListener) {
            parser.removeErrorListeners();
            parser.addErrorListener(this.createErrorListener(errorListener));
        }
        return parser;
    }

    /**
     * Parse input string and return parseTree.
     * @param input string
     * @param errorListener listen parse errors and lexer errors.
     * @returns parseTree
     */
    parse(input, errorListener) {
        const parser = this.createParser(input, errorListener);
        parser.buildParseTrees = true;
        parser.errorHandler = new ErrorStrategy();
        return parser.program();
    }

    /**
     * Create an antlr4 parser from input and cache the lexer, token stream
     * and parser instances on `this` for reuse.
     * @param input string
     */
    createParserWithCache(input) {
        this._parseTree = null;
        this._charStreams = CharStreams.fromString(input);
        this._lexer = this.createLexerFromCharStream(this._charStreams);
        this._lexer.removeErrorListeners();
        this._lexer.addErrorListener(this.createErrorListener(this._errorListener));
        this._tokenStream = new CommonTokenStream(this._lexer);
        /**
         * All tokens are generated in advance.
         * This can cause performance degradation, but it seems necessary for now.
         * Because the tokens will be used multiple times.
         */
        this._tokenStream.fill();
        this._parser = this.createParserFromTokenStream(this._tokenStream);
        this._parser.interpreter.predictionMode = PredictionMode.SLL;
        this._parser.buildParseTrees = true;
        this._parser.errorHandler = new ErrorStrategy();
        return this._parser;
    }

    /**
     * If it is invoked multiple times in a row and the input parameters is the same,
     * this method returns the parsing result directly for the first time
     * unless the errorListener parameter is passed.
     * NOTE(review): `errorListener` only bypasses the cache here; it is never
     * registered on the parser — errors are still collected via `_parseErrors`.
     * @param input source string
     * @param errorListener listen errors
     * @returns parseTree
     */
    parseWithCache(input, errorListener) {
        // Avoid parsing the same input repeatedly.
        const cacheHit = this._parsedInput === input && !errorListener && this._parseTree;
        if (cacheHit) {
            return this._parseTree;
        }
        this._parseErrors = [];
        const parser = this.createParserWithCache(input);
        this._parsedInput = input;
        parser.removeErrorListeners();
        parser.addErrorListener(this.createErrorListener(this._errorListener));
        this._parseTree = parser.program();
        return this._parseTree;
    }

    /**
     * Validate input string and return syntax errors if exists.
     * @param input source string
     * @returns syntax errors
     */
    validate(input) {
        this.parseWithCache(input);
        return this._parseErrors;
    }

    /**
     * Get the input string that has been parsed.
     */
    getParsedInput() {
        return this._parsedInput;
    }

    /**
     * Get all Tokens of input string,'<EOF>' is not included.
     * @param input source string
     * @returns Token[]
     */
    getAllTokens(input) {
        this.parseWithCache(input);
        let allTokens = this._tokenStream.getTokens();
        // Strip the trailing EOF marker produced by the token stream.
        if (allTokens[allTokens.length - 1].text === '<EOF>') {
            allTokens = allTokens.slice(0, -1);
        }
        return allTokens;
    }

    /**
     * Walk the given parse tree with the given listener.
     * @param listener Listener instance extends ParserListener
     * @param parseTree parser Tree
     */
    listen(listener, parseTree) {
        ParseTreeWalker.DEFAULT.walk(listener, parseTree);
    }

    /**
     * Split input into statements.
     * If exist syntax error it will return null.
     * @param input source string
     */
    splitSQLByStatement(input) {
        const errors = this.validate(input);
        if (errors.length || !this._parseTree) {
            return null;
        }
        const splitListener = this.splitListener;
        this.listen(splitListener, this._parseTree);
        return splitListener.statementsContext
            .map((ctx) => ctxToText(ctx, this._parsedInput))
            .filter(Boolean);
    }

    /**
     * Get the smaller range of input
     * @param input string
     * @param allTokens all tokens from input
     * @param tokenIndexOffset offset of the tokenIndex in the range of input
     * @param caretTokenIndex tokenIndex of caretPosition
     * @returns inputSlice: string, caretTokenIndex: number
     */
    splitInputBySymbolText(input, allTokens, tokenIndexOffset, caretTokenIndex) {
        const tokens = allTokens.slice(tokenIndexOffset);
        const caretLocalIndex = caretTokenIndex - tokenIndexOffset;

        // Scan backwards from the caret for the previous split symbol;
        // the statement starts right after it.
        let startToken = null;
        for (let cursor = caretLocalIndex; cursor >= 0; cursor--) {
            if (tokens[cursor]?.text === SQL_SPLIT_SYMBOL_TEXT) {
                startToken = tokens[cursor + 1];
                break;
            }
        }
        if (startToken === null) {
            startToken = tokens[0];
        }

        // Scan forwards from the caret for the next split symbol;
        // the statement ends on it.
        let stopToken = null;
        for (let cursor = caretLocalIndex; cursor < tokens.length; cursor++) {
            if (tokens[cursor]?.text === SQL_SPLIT_SYMBOL_TEXT) {
                stopToken = tokens[cursor];
                break;
            }
        }
        if (stopToken === null) {
            stopToken = tokens[tokens.length - 1];
        }

        const indexOffset = tokens[0].start;
        const startIndex = startToken.start - indexOffset;
        const stopIndex = stopToken.stop + 1 - indexOffset;
        /**
         * Save offset of the tokenIndex in the range of input
         * compared to the tokenIndex in the whole input
         */
        const newTokenIndexOffset = startToken.tokenIndex;
        return {
            inputSlice: input.slice(startIndex, stopIndex),
            allTokens: allTokens.slice(newTokenIndexOffset),
            caretTokenIndex: caretTokenIndex - newTokenIndexOffset,
        };
    }

    /**
     * Get the minimum input string that can be parsed successfully by c3.
     * @param input source string
     * @param caretTokenIndex tokenIndex of caretPosition
     * @param originParseTree origin parseTree
     * @returns MinimumInputInfo
     */
    getMinimumInputInfo(input, caretTokenIndex, originParseTree) {
        if (!originParseTree || !input?.length)
            return null;
        let inputSlice = input;
        /**
         * Split sql by statement.
         * Try to collect candidates in as small a range as possible.
         */
        const splitListener = this.splitListener;
        this.listen(splitListener, originParseTree);
        const statementsContext = splitListener.statementsContext;
        const statementCount = splitListener.statementsContext?.length;
        let tokenIndexOffset = 0;
        // If there are multiple statements.
        if (statementCount > 1) {
            /**
             * Find a minimum valid range, reparse the fragment, and provide a new parse tree to C3.
             * The boundaries of this range must be statements with no syntax errors.
             * This can ensure the stable performance of the C3.
             */
            let startStatement = null;
            let stopStatement = null;
            for (let index = 0; index < statementCount; index++) {
                const ctx = statementsContext[index];
                if (ctx.exception)
                    continue;
                /**
                 * Ensure that the statementContext before the left boundary
                 * and the last statementContext on the right boundary are qualified SQL statements.
                 */
                const isPrevCtxValid = index === 0 || !statementsContext[index - 1]?.exception;
                const isNextCtxValid = index === statementCount - 1 || !statementsContext[index + 1]?.exception;
                if (ctx.stop && ctx.stop.tokenIndex < caretTokenIndex && isPrevCtxValid) {
                    startStatement = ctx;
                }
                if (ctx.start && !stopStatement && ctx.start.tokenIndex > caretTokenIndex && isNextCtxValid) {
                    stopStatement = ctx;
                    break;
                }
            }
            // A boundary consisting of the index of the input.
            const startIndex = startStatement?.start?.start ?? 0;
            const stopIndex = stopStatement?.stop?.stop ?? inputSlice.length - 1;
            /**
             * Save offset of the tokenIndex in the range of input
             * compared to the tokenIndex in the whole input
             */
            tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
            inputSlice = inputSlice.slice(startIndex, stopIndex);
        }
        return {
            input: inputSlice,
            tokenIndexOffset,
            statementCount,
        };
    }

    /**
     * Get a minimum boundary parser near caretTokenIndex.
     * @param input source string.
     * @param caretTokenIndex start from which index to minimize the boundary.
     * @param originParseTree the parse tree need to be minimized, default value is the result of parsing `input`.
     * @returns minimum parser info
     */
    getMinimumParserInfo(input, caretTokenIndex, originParseTree) {
        if (!originParseTree || !input?.length)
            return null;
        const inputInfo = this.getMinimumInputInfo(input, caretTokenIndex, originParseTree);
        if (!inputInfo)
            return null;
        const { input: inputSlice, tokenIndexOffset } = inputInfo;
        caretTokenIndex = caretTokenIndex - tokenIndexOffset;
        let sqlParserIns = this._parser;
        let parseTree = originParseTree;
        /**
         * Reparse the input fragment,
         * and c3 will collect candidates in the newly generated parseTree when input changed.
         */
        if (inputSlice !== input) {
            sqlParserIns = this.createParser(inputSlice);
            parseTree = sqlParserIns.program();
        }
        return {
            parser: sqlParserIns,
            parseTree,
            tokenIndexOffset,
            newTokenIndex: caretTokenIndex,
        };
    }

    /**
     * Get suggestions of syntax and token at caretPosition
     * @param input source string
     * @param caretPosition caret position, such as cursor position
     * @returns suggestion
     */
    getSuggestionAtCaretPosition(input, caretPosition) {
        this.parseWithCache(input);
        if (!this._parseTree)
            return null;
        let allTokens = this.getAllTokens(input);
        let caretTokenIndex = findCaretTokenIndex(caretPosition, allTokens);
        // 0 is a valid index; bail out only when no caret token was found.
        if (!caretTokenIndex && caretTokenIndex !== 0)
            return null;
        const inputInfo = this.getMinimumInputInfo(input, caretTokenIndex, this._parseTree);
        if (!inputInfo)
            return null;
        const { input: _input, tokenIndexOffset, statementCount } = inputInfo;
        let inputSlice = _input;
        /**
         * Split the inputSlice by separator to get the smaller range of inputSlice.
         */
        if (inputSlice.includes(SQL_SPLIT_SYMBOL_TEXT)) {
            const splitResult = this.splitInputBySymbolText(inputSlice, allTokens, tokenIndexOffset, caretTokenIndex);
            allTokens = splitResult.allTokens;
            caretTokenIndex = splitResult.caretTokenIndex;
            inputSlice = splitResult.inputSlice;
        }
        else if (statementCount > 1) {
            caretTokenIndex = caretTokenIndex - tokenIndexOffset;
        }
        let sqlParserIns = this._parser;
        let parseTree = this._parseTree;
        /**
         * Reparse the input fragment,
         * and c3 will collect candidates in the newly generated parseTree when input changed.
         */
        if (inputSlice !== input) {
            sqlParserIns = this.createParser(inputSlice);
            parseTree = sqlParserIns.program();
        }
        const core = new CodeCompletionCore(sqlParserIns);
        core.preferredRules = this.preferredRules;
        const candidates = core.collectCandidates(caretTokenIndex, parseTree);
        const originalSuggestions = this.processCandidates(candidates, allTokens, caretTokenIndex);
        const syntaxSuggestions = originalSuggestions.syntax.map((syntaxCtx) => ({
            syntaxContextType: syntaxCtx.syntaxContextType,
            wordRanges: syntaxCtx.wordRanges.map((token) => tokenToWord(token, this._parsedInput)),
        }));
        return {
            syntax: syntaxSuggestions,
            keywords: originalSuggestions.keywords,
        };
    }

    /**
     * Collect all entities (tables, columns, …) found in the input.
     * @param input source string
     * @param caretPosition optional caret position used by the collector
     */
    getAllEntities(input, caretPosition) {
        const allTokens = this.getAllTokens(input);
        const caretTokenIndex = caretPosition
            ? findCaretTokenIndex(caretPosition, allTokens)
            : undefined;
        const collectListener = this.createEntityCollector(input, allTokens, caretTokenIndex);
        const parseTree = this.parseWithCache(input);
        this.listen(collectListener, parseTree);
        return collectListener.getEntities();
    }

    /**
     * Get semantic context infos
     * @param input source string
     * @param caretPosition caret position, such as cursor position
     * @param options semantic context options
     * @returns analyzed semantic context
     */
    getSemanticContextAtCaretPosition(input, caretPosition, options) {
        const allTokens = this.getAllTokens(input);
        const parseTree = this.parseWithCache(input);
        const statementContextListener = this.createSemanticContextCollector(input, caretPosition, allTokens, options);
        this.listen(statementContextListener, parseTree);
        return statementContextListener.semanticContext;
    }
}