UNPKG

dt-python-parser

Version:

A collection of Python parsers built with ANTLR4, aimed mainly at the **BigData** domain.

143 lines (142 loc) 4.64 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.lexer = exports.cleanPython = void 0; const token_1 = require("./token"); /** * 获取 注释 以及 分隔符 等词法信息 * @param {String} python */ function lexer(input) { // 记录当前字符的位置 let current = 0; let line = 1; // 最终的 TokenTypes 结果 const tokens = []; /** * 提取 TokenType */ // eslint-disable-next-line const extract = (currentChar, validator, TokenType) => { let value = ''; const start = current; while (validator.test(currentChar)) { value += currentChar; currentChar = input[++current]; } return { type: TokenType, start: start, end: current, lineNumber: line, value: value, }; }; /** * 过滤(提取) 引号中的内容 */ // eslint-disable-next-line const matchQuotation = (currentChar, validator, TokenType) => { do { if (currentChar === '\n') { line++; } currentChar = input[++current]; } while (!validator.test(currentChar)); ++current; }; while (current < input.length) { let char = input[current]; // 按顺序处理 换行符 反引号 单引号 双引号 注释 分号 // 引号内 可能包含注释包含的符号以及分号 所以优先处理引号里面的内容 去除干扰信息 if (char === '\n') { line++; current++; continue; } if (token_1.TokenReg.BackQuotation.test(char)) { // eslint-disable-next-line matchQuotation(char, token_1.TokenReg.BackQuotation, token_1.TokenType.BackQuotation); continue; } if (token_1.TokenReg.SingleQuotation.test(char)) { // eslint-disable-next-line matchQuotation(char, token_1.TokenReg.SingleQuotation, token_1.TokenType.SingleQuotation); continue; } if (token_1.TokenReg.DoubleQuotation.test(char) && input[current + 1] !== `"`) { // eslint-disable-next-line matchQuotation(char, token_1.TokenReg.DoubleQuotation, token_1.TokenType.DoubleQuotation); continue; } // 处理单行注释,以 # 开始,\n 结束 if (char === '#') { let value = ''; const start = current; while (char !== '\n') { value += char; char = input[++current]; } tokens.push({ type: token_1.TokenType.Comment, value, start: start, lineNumber: line, end: current, }); continue; } // 处理多行注释,以 """ 开始, """结束 if (char === `"` && input[current + 1] === `"` && 
input[current + 2] === `"`) { let value = '"""'; const start = current; const startLine = line; current += 3; char = input[current]; while (!(char === `"` && input[current - 1] === `"` && input[current - 2] === `"`)) { if (char === '\n') { line++; } value += char; char = input[++current]; } value += char; ++current; tokens.push({ type: token_1.TokenType.Comment, value, start: start, lineNumber: startLine, end: current, }); continue; } // 处理结束符 ; if (token_1.TokenReg.StatementTerminator.test(char)) { const newToken = extract(char, token_1.TokenReg.StatementTerminator, token_1.TokenType.StatementTerminator); tokens.push(newToken); continue; } current++; } return tokens; } exports.lexer = lexer; /** * 清除注释和前后空格 * @param {String} python */ function cleanPython(python) { python.trim(); // 删除前后空格 const tokens = lexer(python); let resultPython = ''; let startIndex = 0; tokens.forEach((ele) => { if (ele.type === token_1.TokenType.Comment) { resultPython += python.slice(startIndex, ele.start); startIndex = ele.end + 1; } }); resultPython += python.slice(startIndex); return resultPython; } exports.cleanPython = cleanPython;