UNPKG

dt-python-parser

Version:

A collection of Python parsers built with ANTLR4, aimed mainly at the **BigData** domain.

github.com/DTStack/dt-python-parser

DTStack/dt-python-parser

143 lines (142 loc) • 4.64 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.lexer = exports.cleanPython = void 0; const token_1 = require("./token"); /** * 获取注释以及分隔符等词法信息 * @param {String} python */ function lexer(input) { // 记录当前字符的位置 let current = 0; let line = 1; // 最终的 TokenTypes 结果 const tokens = []; /** * 提取 TokenType */ // eslint-disable-next-line const extract = (currentChar, validator, TokenType) => { let value = ''; const start = current; while (validator.test(currentChar)) { value += currentChar; currentChar = input[++current]; } return { type: TokenType, start: start, end: current, lineNumber: line, value: value, }; }; /** * 过滤（提取）引号中的内容 */ // eslint-disable-next-line const matchQuotation = (currentChar, validator, TokenType) => { do { if (currentChar === '\n') { line++; } currentChar = input[++current]; } while (!validator.test(currentChar)); ++current; }; while (current < input.length) { let char = input[current]; // 按顺序处理换行符反引号单引号双引号注释分号 // 引号内可能包含注释包含的符号以及分号所以优先处理引号里面的内容去除干扰信息 if (char === '\n') { line++; current++; continue; } if (token_1.TokenReg.BackQuotation.test(char)) { // eslint-disable-next-line matchQuotation(char, token_1.TokenReg.BackQuotation, token_1.TokenType.BackQuotation); continue; } if (token_1.TokenReg.SingleQuotation.test(char)) { // eslint-disable-next-line matchQuotation(char, token_1.TokenReg.SingleQuotation, token_1.TokenType.SingleQuotation); continue; } if (token_1.TokenReg.DoubleQuotation.test(char) && input[current + 1] !== `"`) { // eslint-disable-next-line matchQuotation(char, token_1.TokenReg.DoubleQuotation, token_1.TokenType.DoubleQuotation); continue; } // 处理单行注释，以 # 开始，\n 结束 if (char === '#') { let value = ''; const start = current; while (char !== '\n') { value += char; char = input[++current]; } tokens.push({ type: token_1.TokenType.Comment, value, start: start, lineNumber: line, end: current, }); continue; } // 处理多行注释，以 """ 开始， """结束 if (char === `"` && input[current + 1] === `"` && input[current + 2] 
=== `"`) { let value = '"""'; const start = current; const startLine = line; current += 3; char = input[current]; while (!(char === `"` && input[current - 1] === `"` && input[current - 2] === `"`)) { if (char === '\n') { line++; } value += char; char = input[++current]; } value += char; ++current; tokens.push({ type: token_1.TokenType.Comment, value, start: start, lineNumber: startLine, end: current, }); continue; } // 处理结束符 ; if (token_1.TokenReg.StatementTerminator.test(char)) { const newToken = extract(char, token_1.TokenReg.StatementTerminator, token_1.TokenType.StatementTerminator); tokens.push(newToken); continue; } current++; } return tokens; } exports.lexer = lexer; /** * 清除注释和前后空格 * @param {String} python */ function cleanPython(python) { python.trim(); // 删除前后空格 const tokens = lexer(python); let resultPython = ''; let startIndex = 0; tokens.forEach((ele) => { if (ele.type === token_1.TokenType.Comment) { resultPython += python.slice(startIndex, ele.start); startIndex = ele.end + 1; } }); resultPython += python.slice(startIndex); return resultPython; } exports.cleanPython = cleanPython;