UNPKG

dt-sql-parser

Version:

SQL parsers for big-data SQL dialects, built with ANTLR4

304 lines (303 loc) 12.8 kB
import { CharStreams, CommonTokenStream, ParseTreeWalker, PredictionMode, } from 'antlr4ng';
import { CodeCompletionCore } from 'antlr4-c3';
import { findCaretTokenIndex } from './findCaretTokenIndex';
import { ctxToText, tokenToWord } from './textAndWord';
import { ErrorStrategy } from './errorStrategy';

/**
 * Base SQL class; every concrete SQL dialect implementation extends it.
 * Subclasses are expected to provide the dialect hooks referenced here:
 * `createLexerFromCharStream`, `createParserFromTokenStream`, `splitListener`,
 * `preferredRules`, `processCandidates`, and `createEntityCollector`.
 */
export class BasicSQL {
    constructor() {
        // Accumulates lexer/parser errors collected by the cached-parse listener.
        this._parseErrors = [];
        /** members for cache end */
        this._errorListener = (error) => {
            this._parseErrors.push(error);
        };
        this.locale = 'en_US';
    }

    /**
     * Create an antlr4 lexer for the given text.
     * @param input source string
     * @param errorListener optional listener for lexer errors
     */
    createLexer(input, errorListener) {
        const stream = CharStreams.fromString(input);
        const lexer = this.createLexerFromCharStream(stream);
        if (errorListener) {
            lexer.removeErrorListeners();
            lexer.addErrorListener(this.createErrorListener(errorListener));
        }
        return lexer;
    }

    /**
     * Create an antlr4 parser for the given text.
     * @param input source string
     * @param errorListener optional listener for parse errors
     */
    createParser(input, errorListener) {
        const lexer = this.createLexer(input, errorListener);
        const tokens = new CommonTokenStream(lexer);
        const parser = this.createParserFromTokenStream(tokens);
        // SLL prediction is faster and sufficient for completion/validation use.
        parser.interpreter.predictionMode = PredictionMode.SLL;
        if (errorListener) {
            parser.removeErrorListeners();
            parser.addErrorListener(this.createErrorListener(errorListener));
        }
        return parser;
    }

    /**
     * Parse the input and return its parse tree.
     * @param input source string
     * @param errorListener listens for parse errors and lexer errors
     * @returns parse tree of the whole program
     */
    parse(input, errorListener) {
        const parser = this.createParser(input, errorListener);
        parser.buildParseTrees = true;
        parser.errorHandler = new ErrorStrategy();
        return parser.program();
    }

    /**
     * Create an antlr4 parser for the given text and cache the
     * intermediate instances (char stream, lexer, token stream, parser)
     * on `this` for reuse by other methods.
     * @param input source string
     */
    createParserWithCache(input) {
        this._parseTree = null;
        this._charStreams = CharStreams.fromString(input);
        this._lexer = this.createLexerFromCharStream(this._charStreams);
        this._lexer.removeErrorListeners();
        this._lexer.addErrorListener(this.createErrorListener(this._errorListener));
        this._tokenStream = new CommonTokenStream(this._lexer);
        /**
         * All tokens are generated in advance.
         * This can cause performance degradation, but it seems necessary for now.
         * Because the tokens will be used multiple times.
         */
        this._tokenStream.fill();
        this._parser = this.createParserFromTokenStream(this._tokenStream);
        this._parser.interpreter.predictionMode = PredictionMode.SLL;
        this._parser.buildParseTrees = true;
        this._parser.errorHandler = new ErrorStrategy();
        return this._parser;
    }

    /**
     * Parse with memoization: repeated calls with the same input return the
     * cached parse tree directly, unless an errorListener argument is passed.
     * @param input source string
     * @param errorListener listens for errors (forces a fresh parse)
     * @returns parse tree
     */
    parseWithCache(input, errorListener) {
        // Serve the cached tree when the input is unchanged and no listener was given.
        if (this._parsedInput === input && !errorListener && this._parseTree) {
            return this._parseTree;
        }
        this._parseErrors = [];
        const parser = this.createParserWithCache(input);
        this._parsedInput = input;
        parser.removeErrorListeners();
        parser.addErrorListener(this.createErrorListener(this._errorListener));
        this._parseTree = parser.program();
        return this._parseTree;
    }

    /**
     * Validate the input and return any syntax errors found.
     * @param input source string
     * @returns collected syntax errors
     */
    validate(input) {
        this.parseWithCache(input);
        return this._parseErrors;
    }

    /** Return the most recently parsed input string. */
    getParsedInput() {
        return this._parsedInput;
    }

    /**
     * Return all tokens of the input; the trailing '<EOF>' token is excluded.
     * @param input source string
     * @returns Token[]
     */
    getAllTokens(input) {
        this.parseWithCache(input);
        let tokens = this._tokenStream.getTokens();
        if (tokens[tokens.length - 1].text === '<EOF>') {
            tokens = tokens.slice(0, -1);
        }
        return tokens;
    }

    /**
     * Walk a parse tree with the given listener.
     * @param listener listener instance extending ParserListener
     * @param parseTree tree to walk
     */
    listen(listener, parseTree) {
        ParseTreeWalker.DEFAULT.walk(listener, parseTree);
    }

    /**
     * Split the input into individual statements.
     * Returns null when the input contains syntax errors.
     * @param input source string
     */
    splitSQLByStatement(input) {
        const errors = this.validate(input);
        if (errors.length || !this._parseTree) {
            return null;
        }
        const splitListener = this.splitListener;
        this.listen(splitListener, this._parseTree);
        return splitListener.statementsContext
            .map((context) => ctxToText(context, this._parsedInput))
            .filter(Boolean);
    }

    /**
     * Get a minimum-boundary parser near tokenIndex.
     * @param input source string
     * @param tokenIndex token index around which to minimize the boundary
     * @param originParseTree tree to minimize; defaults to the parse of `input`
     * @returns minimum parser info, or null when there is nothing to work with
     */
    getMinimumParserInfo(input, tokenIndex, originParseTree) {
        // Overload detection: only reparse when the caller omitted the third argument.
        if (arguments.length <= 2) {
            this.parseWithCache(input);
            originParseTree = this._parseTree;
        }
        if (!originParseTree || !input?.length)
            return null;
        const splitListener = this.splitListener;
        /**
         * Split sql by statement.
         * Try to collect candidates in as small a range as possible.
         */
        this.listen(splitListener, originParseTree);
        const statementsContext = splitListener.statementsContext;
        const statementCount = splitListener.statementsContext?.length;
        let tokenIndexOffset = 0;
        let activeParser = this._parser;
        let parseTree = originParseTree;
        // Only worth shrinking when the input holds more than one statement.
        if (statementCount > 1) {
            /**
             * Find a minimum valid range, reparse the fragment, and provide a new
             * parse tree to C3. The boundaries of this range must be statements
             * with no syntax errors, which keeps C3's performance stable.
             */
            let startStatement = null;
            let stopStatement = null;
            for (let index = 0; index < statementCount; index++) {
                const ctx = statementsContext[index];
                if (ctx.exception)
                    continue;
                /**
                 * Ensure that the statement before the left boundary and the
                 * statement after the right boundary are themselves valid SQL.
                 */
                const prevValid = index === 0 || !statementsContext[index - 1]?.exception;
                const nextValid = index === statementCount - 1 || !statementsContext[index + 1]?.exception;
                if (ctx.stop && ctx.stop.tokenIndex < tokenIndex && prevValid) {
                    startStatement = ctx;
                }
                if (ctx.start && !stopStatement && ctx.start.tokenIndex > tokenIndex && nextValid) {
                    stopStatement = ctx;
                    break;
                }
            }
            // Character-index boundary of the fragment within the input.
            const startIndex = startStatement?.start?.start ?? 0;
            const stopIndex = stopStatement?.stop?.stop ?? input.length - 1;
            /**
             * Record how far the fragment's token indices are shifted
             * relative to token indices in the whole input.
             */
            tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
            tokenIndex = tokenIndex - tokenIndexOffset;
            /**
             * Reparse the fragment; C3 will collect candidates
             * in the newly generated parse tree.
             */
            const fragment = input.slice(startIndex, stopIndex);
            const lexer = this.createLexer(fragment);
            lexer.removeErrorListeners();
            const tokenStream = new CommonTokenStream(lexer);
            tokenStream.fill();
            const parser = this.createParserFromTokenStream(tokenStream);
            parser.interpreter.predictionMode = PredictionMode.SLL;
            parser.removeErrorListeners();
            parser.buildParseTrees = true;
            parser.errorHandler = new ErrorStrategy();
            activeParser = parser;
            parseTree = parser.program();
        }
        return {
            parser: activeParser,
            parseTree,
            tokenIndexOffset,
            newTokenIndex: tokenIndex,
        };
    }

    /**
     * Get syntax and keyword suggestions at the caret position.
     * @param input source string
     * @param caretPosition caret position, e.g. the cursor position
     * @returns suggestion object, or null when no suggestion is possible
     */
    getSuggestionAtCaretPosition(input, caretPosition) {
        this.parseWithCache(input);
        if (!this._parseTree)
            return null;
        const allTokens = this.getAllTokens(input);
        let caretTokenIndex = findCaretTokenIndex(caretPosition, allTokens);
        // Index 0 is valid; reject only genuinely missing indices.
        if (!caretTokenIndex && caretTokenIndex !== 0)
            return null;
        const minimumParser = this.getMinimumParserInfo(input, caretTokenIndex);
        if (!minimumParser)
            return null;
        const { parser: activeParser, tokenIndexOffset, newTokenIndex, parseTree: c3Context, } = minimumParser;
        const core = new CodeCompletionCore(activeParser);
        core.preferredRules = this.preferredRules;
        const candidates = core.collectCandidates(newTokenIndex, c3Context);
        const originalSuggestions = this.processCandidates(candidates, allTokens, newTokenIndex, tokenIndexOffset);
        const syntax = originalSuggestions.syntax.map((syntaxCtx) => ({
            syntaxContextType: syntaxCtx.syntaxContextType,
            wordRanges: syntaxCtx.wordRanges.map((token) => tokenToWord(token, this._parsedInput)),
        }));
        return {
            syntax,
            keywords: originalSuggestions.keywords,
        };
    }

    /**
     * Collect all entities (tables, columns, …) referenced in the input.
     * @param input source string
     * @param caretPosition optional caret position used to mark the entity under the caret
     */
    getAllEntities(input, caretPosition) {
        const allTokens = this.getAllTokens(input);
        const caretTokenIndex = caretPosition
            ? findCaretTokenIndex(caretPosition, allTokens)
            : undefined;
        const collectListener = this.createEntityCollector(input, allTokens, caretTokenIndex);
        const parseTree = this.parseWithCache(input);
        this.listen(collectListener, parseTree);
        return collectListener.getEntities();
    }
}