UNPKG

py-ast

Version:

A TypeScript library for parsing and walking Python Abstract Syntax Trees

github.com/kriss-u/py-ast

1,377 lines (1,376 loc) • 207 kB

JavaScript

/** * Python Lexical Analyzer (Tokenizer) * Converts Python source code into a stream of tokens */ var TokenType; (function (TokenType) { // Literals TokenType["NUMBER"] = "NUMBER"; TokenType["STRING"] = "STRING"; TokenType["NAME"] = "NAME"; // Keywords TokenType["AND"] = "AND"; TokenType["AS"] = "AS"; TokenType["ASSERT"] = "ASSERT"; TokenType["ASYNC"] = "ASYNC"; TokenType["AWAIT"] = "AWAIT"; TokenType["BREAK"] = "BREAK"; TokenType["CLASS"] = "CLASS"; TokenType["CONTINUE"] = "CONTINUE"; TokenType["DEF"] = "DEF"; TokenType["DEL"] = "DEL"; TokenType["ELIF"] = "ELIF"; TokenType["ELSE"] = "ELSE"; TokenType["EXCEPT"] = "EXCEPT"; TokenType["FALSE"] = "FALSE"; TokenType["FINALLY"] = "FINALLY"; TokenType["FOR"] = "FOR"; TokenType["FROM"] = "FROM"; TokenType["GLOBAL"] = "GLOBAL"; TokenType["IF"] = "IF"; TokenType["IMPORT"] = "IMPORT"; TokenType["IN"] = "IN"; TokenType["IS"] = "IS"; TokenType["LAMBDA"] = "LAMBDA"; TokenType["MATCH"] = "MATCH"; TokenType["CASE"] = "CASE"; TokenType["NONE"] = "NONE"; TokenType["NONLOCAL"] = "NONLOCAL"; TokenType["NOT"] = "NOT"; TokenType["OR"] = "OR"; TokenType["PASS"] = "PASS"; TokenType["RAISE"] = "RAISE"; TokenType["RETURN"] = "RETURN"; TokenType["TRUE"] = "TRUE"; TokenType["TRY"] = "TRY"; TokenType["WHILE"] = "WHILE"; TokenType["WITH"] = "WITH"; TokenType["YIELD"] = "YIELD"; // Operators TokenType["PLUS"] = "PLUS"; TokenType["MINUS"] = "MINUS"; TokenType["STAR"] = "STAR"; TokenType["DOUBLESTAR"] = "DOUBLESTAR"; TokenType["SLASH"] = "SLASH"; TokenType["DOUBLESLASH"] = "DOUBLESLASH"; TokenType["PERCENT"] = "PERCENT"; TokenType["AT"] = "AT"; TokenType["VBAR"] = "VBAR"; TokenType["AMPER"] = "AMPER"; TokenType["CIRCUMFLEX"] = "CIRCUMFLEX"; TokenType["TILDE"] = "TILDE"; TokenType["LEFTSHIFT"] = "LEFTSHIFT"; TokenType["RIGHTSHIFT"] = "RIGHTSHIFT"; // Delimiters TokenType["LPAR"] = "LPAR"; TokenType["RPAR"] = "RPAR"; TokenType["LSQB"] = "LSQB"; TokenType["RSQB"] = "RSQB"; TokenType["LBRACE"] = "LBRACE"; TokenType["RBRACE"] = "RBRACE"; TokenType["COMMA"] = "COMMA"; TokenType["COLON"] = "COLON"; TokenType["DOT"] = "DOT"; TokenType["SEMI"] = "SEMI"; TokenType["EQUAL"] = "EQUAL"; TokenType["RARROW"] = "RARROW"; // Comparison operators TokenType["EQEQUAL"] = "EQEQUAL"; TokenType["NOTEQUAL"] = "NOTEQUAL"; TokenType["LESS"] = "LESS"; TokenType["GREATER"] = "GREATER"; TokenType["LESSEQUAL"] = "LESSEQUAL"; TokenType["GREATEREQUAL"] = "GREATEREQUAL"; // Assignment operators TokenType["PLUSEQUAL"] = "PLUSEQUAL"; TokenType["MINEQUAL"] = "MINEQUAL"; TokenType["STAREQUAL"] = "STAREQUAL"; TokenType["SLASHEQUAL"] = "SLASHEQUAL"; TokenType["PERCENTEQUAL"] = "PERCENTEQUAL"; TokenType["AMPEREQUAL"] = "AMPEREQUAL"; TokenType["VBAREQUAL"] = "VBAREQUAL"; TokenType["CIRCUMFLEXEQUAL"] = "CIRCUMFLEXEQUAL"; TokenType["LEFTSHIFTEQUAL"] = "LEFTSHIFTEQUAL"; TokenType["RIGHTSHIFTEQUAL"] = "RIGHTSHIFTEQUAL"; TokenType["DOUBLESTAREQUAL"] = "DOUBLESTAREQUAL"; TokenType["DOUBLESLASHEQUAL"] = "DOUBLESLASHEQUAL"; TokenType["ATEQUAL"] = "ATEQUAL"; TokenType["COLONEQUAL"] = "COLONEQUAL"; // Special tokens TokenType["NEWLINE"] = "NEWLINE"; TokenType["INDENT"] = "INDENT"; TokenType["DEDENT"] = "DEDENT"; TokenType["COMMENT"] = "COMMENT"; TokenType["EOF"] = "EOF"; TokenType["ELLIPSIS"] = "ELLIPSIS"; // String formatting TokenType["FSTRING_START"] = "FSTRING_START"; TokenType["FSTRING_MIDDLE"] = "FSTRING_MIDDLE"; TokenType["FSTRING_END"] = "FSTRING_END"; })(TokenType || (TokenType = {})); const KEYWORDS = new Map([ ["and", TokenType.AND], ["as", TokenType.AS], ["assert", TokenType.ASSERT], ["async", TokenType.ASYNC], ["await", TokenType.AWAIT], ["break", TokenType.BREAK], ["class", TokenType.CLASS], ["continue", TokenType.CONTINUE], ["def", TokenType.DEF], ["del", TokenType.DEL], ["elif", TokenType.ELIF], ["else", TokenType.ELSE], ["except", TokenType.EXCEPT], ["False", TokenType.FALSE], ["finally", TokenType.FINALLY], ["for", TokenType.FOR], ["from", TokenType.FROM], ["global", TokenType.GLOBAL], ["if", TokenType.IF], ["import", TokenType.IMPORT], ["in", TokenType.IN], ["is", TokenType.IS], ["lambda", TokenType.LAMBDA], ["match", TokenType.MATCH], ["case", TokenType.CASE], ["None", TokenType.NONE], ["nonlocal", TokenType.NONLOCAL], ["not", TokenType.NOT], ["or", TokenType.OR], ["pass", TokenType.PASS], ["raise", TokenType.RAISE], ["return", TokenType.RETURN], ["True", TokenType.TRUE], ["try", TokenType.TRY], ["while", TokenType.WHILE], ["with", TokenType.WITH], ["yield", TokenType.YIELD], ]); class Lexer { constructor(source) { this.tokens = []; this.indentStack = [0]; this.atLineStart = true; this.parenLevel = 0; this.bracketLevel = 0; this.braceLevel = 0; this.source = source; this.position = { line: 1, column: 0, index: 0 }; } tokenize() { this.tokens = []; this.position = { line: 1, column: 0, index: 0 }; this.indentStack = [0]; this.atLineStart = true; this.parenLevel = 0; this.bracketLevel = 0; this.braceLevel = 0; while (this.position.index < this.source.length) { this.scanToken(); } // Add final dedents while (this.indentStack.length > 1) { this.indentStack.pop(); this.addToken(TokenType.DEDENT, ""); } this.addToken(TokenType.EOF, ""); return this.tokens; } scanToken() { const c = this.peek(); if (c === "\n") { this.scanNewline(); return; } if (this.atLineStart) { this.scanIndentation(); this.atLineStart = false; // After scanning indentation, we need to scan the token at the current position // So we recursively call scanToken to handle the actual token if (this.position.index < this.source.length) { this.scanToken(); } return; } // Skip whitespace (except newlines) if (c === " " || c === "\t" || c === "\r") { this.advance(); return; } // Comments if (c === "#") { this.scanComment(); return; } // String literals if (c === '"' || c === "'") { this.scanString(); return; } // Numbers if (this.isDigit(c)) { this.scanNumber(); return; } // Identifiers and keywords - check for f-strings first if (this.isAlpha(c) || c === "_") { // Check for f-string if (c.toLowerCase() === "f" && this.position.index + 1 < this.source.length) { const nextChar = this.peekNext(); if (nextChar === '"' || nextChar === "'") { this.scanFString(); return; } } this.scanIdentifier(); return; } // Three-character operators (check before two-character to avoid conflicts) const threeChar = this.source.slice(this.position.index, this.position.index + 3); if (this.scanThreeCharOperator(threeChar)) { return; } // Two-character operators const twoChar = this.source.slice(this.position.index, this.position.index + 2); if (this.scanTwoCharOperator(twoChar)) { return; } // Single-character operators and delimiters this.scanSingleCharOperator(c); } scanNewline() { const start = { ...this.position }; // Create a copy this.advance(); // consume '\n' // Only emit NEWLINE if we're not inside parentheses/brackets/braces if (this.parenLevel === 0 && this.bracketLevel === 0 && this.braceLevel === 0) { this.addTokenAt(TokenType.NEWLINE, "\n", start); } this.atLineStart = true; } scanIndentation() { let indent = 0; while (this.position.index < this.source.length) { const c = this.peek(); if (c === " ") { indent++; this.advance(); } else if (c === "\t") { indent += 8; // Tab counts as 8 spaces this.advance(); } else { break; } } // Skip empty lines and comment-only lines const c = this.peek(); if (c === "\n" || c === "#" || this.position.index >= this.source.length) { return; } // Skip indentation tracking when inside parentheses, brackets, or braces if (this.parenLevel > 0 || this.bracketLevel > 0 || this.braceLevel > 0) { return; } const currentIndent = this.indentStack[this.indentStack.length - 1]; if (indent > currentIndent) { this.indentStack.push(indent); this.addToken(TokenType.INDENT, ""); } else if (indent < currentIndent) { while (this.indentStack.length > 1 && this.indentStack[this.indentStack.length - 1] > indent) { this.indentStack.pop(); this.addToken(TokenType.DEDENT, ""); } if (this.indentStack[this.indentStack.length - 1] !== indent) { throw new Error(`Indentation error at line ${this.position.line}`); } } } scanComment() { const start = { ...this.position }; // Create a copy this.advance(); // consume '#' let value = "#"; while (this.position.index < this.source.length && this.peek() !== "\n") { value += this.peek(); this.advance(); } this.addTokenAt(TokenType.COMMENT, value, start); } scanString() { const start = { ...this.position }; // Create a copy const quote = this.peek(); this.advance(); // consume opening quote // Check for triple quotes const isTripleQuote = this.peek() === quote && this.peekNext() === quote; if (isTripleQuote) { this.advance(); // consume second quote this.advance(); // consume third quote } let value = quote; if (isTripleQuote) { value += quote + quote; } let stringClosed = false; while (this.position.index < this.source.length) { const c = this.peek(); if (c === "\\") { value += c; this.advance(); if (this.position.index < this.source.length) { value += this.peek(); this.advance(); } continue; } if (isTripleQuote) { if (c === quote && this.peekNext() === quote && this.peek(2) === quote) { value += quote + quote + quote; this.advance(); // consume first quote this.advance(); // consume second quote this.advance(); // consume third quote stringClosed = true; break; } } else { if (c === quote) { value += quote; this.advance(); stringClosed = true; break; } if (c === "\n") { throw new Error(`Unterminated string literal at line ${this.position.line}`); } } value += c; this.advance(); } // If we reached end of source without closing the string, it's an error if (!stringClosed) { if (isTripleQuote) { throw new Error(`Unterminated triple-quoted string literal at line ${start.line}`); } else { throw new Error(`Unterminated string literal at line ${start.line}`); } } this.addTokenAt(TokenType.STRING, value, start); } scanFString() { const start = { ...this.position }; // Create a copy // Consume 'f' let value = this.peek(); this.advance(); // Get the quote character const quote = this.peek(); value += quote; this.advance(); // Check for triple quotes const isTripleQuote = this.peek() === quote && this.peekNext() === quote; if (isTripleQuote) { value += quote + quote; this.advance(); // consume second quote this.advance(); // consume third quote } let braceLevel = 0; let stringClosed = false; while (this.position.index < this.source.length) { const c = this.peek(); // Handle escape sequences if (c === "\\") { value += c; this.advance(); if (this.position.index < this.source.length) { value += this.peek(); this.advance(); } continue; } // Track braces to handle nested expressions if (c === "{") { braceLevel++; value += c; this.advance(); continue; } if (c === "}") { if (braceLevel > 0) { braceLevel--; } value += c; this.advance(); continue; } // Check for closing quote only when not inside braces if (braceLevel === 0) { if (isTripleQuote) { if (c === quote && this.peekNext() === quote && this.peek(2) === quote) { value += quote + quote + quote; this.advance(); // consume first quote this.advance(); // consume second quote this.advance(); // consume third quote stringClosed = true; break; } } else { if (c === quote) { value += quote; this.advance(); stringClosed = true; break; } if (c === "\n") { throw new Error(`Unterminated f-string literal at line ${this.position.line}`); } } } value += c; this.advance(); } // If we reached end of source without closing the f-string, it's an error if (!stringClosed) { if (isTripleQuote) { throw new Error(`Unterminated triple-quoted f-string literal at line ${start.line}`); } else { throw new Error(`Unterminated f-string literal at line ${start.line}`); } } this.addTokenAt(TokenType.STRING, value, start); } scanNumber() { const start = { ...this.position }; // Create a copy let value = ""; // Handle different number formats (decimal, hex, octal, binary) if (this.peek() === "0" && this.position.index + 1 < this.source.length) { const next = this.peekNext().toLowerCase(); if (next === "x" || next === "o" || next === "b") { value += this.peek(); // '0' this.advance(); value += this.peek(); // 'x', 'o', or 'b' this.advance(); const isHex = next === "x"; const isOctal = next === "o"; const isBinary = next === "b"; while (this.position.index < this.source.length) { const c = this.peek().toLowerCase(); if ((isHex && this.isHexDigit(c)) || (isOctal && this.isOctalDigit(c)) || (isBinary && this.isBinaryDigit(c))) { value += this.peek(); this.advance(); } else if (c === "_") { // Skip underscores in numbers this.advance(); } else { break; } } this.addTokenAt(TokenType.NUMBER, value, start); return; } } // Regular decimal number while (this.position.index < this.source.length && (this.isDigit(this.peek()) || this.peek() === "_")) { if (this.peek() !== "_") { value += this.peek(); } this.advance(); } // Handle decimal point if (this.peek() === "." && this.position.index + 1 < this.source.length && this.isDigit(this.peekNext())) { value += this.peek(); this.advance(); while (this.position.index < this.source.length && (this.isDigit(this.peek()) || this.peek() === "_")) { if (this.peek() !== "_") { value += this.peek(); } this.advance(); } } // Handle scientific notation if (this.peek().toLowerCase() === "e") { value += this.peek(); this.advance(); if (this.peek() === "+" || this.peek() === "-") { value += this.peek(); this.advance(); } while (this.position.index < this.source.length && (this.isDigit(this.peek()) || this.peek() === "_")) { if (this.peek() !== "_") { value += this.peek(); } this.advance(); } } // Handle complex numbers if (this.peek().toLowerCase() === "j") { value += this.peek(); this.advance(); } this.addTokenAt(TokenType.NUMBER, value, start); } scanIdentifier() { const start = { ...this.position }; // Create a copy let value = ""; while (this.position.index < this.source.length && (this.isAlphaNumeric(this.peek()) || this.peek() === "_")) { value += this.peek(); this.advance(); } // Check if this is a string prefix (f, r, b, u, fr, rf, br, rb) if (this.isStringPrefix(value) && (this.peek() === '"' || this.peek() === "'")) { // This is a prefixed string, scan the string part this.scanPrefixedString(value, start); return; } const tokenType = KEYWORDS.get(value) || TokenType.NAME; this.addTokenAt(tokenType, value, start); } isStringPrefix(value) { const lowerValue = value.toLowerCase(); return ["f", "r", "b", "u", "fr", "rf", "br", "rb"].includes(lowerValue); } scanPrefixedString(prefix, start) { const quote = this.peek(); this.advance(); // consume opening quote // Check for triple quotes const isTripleQuote = this.peek() === quote && this.peekNext() === quote; if (isTripleQuote) { this.advance(); // consume second quote this.advance(); // consume third quote } let value = prefix + quote; if (isTripleQuote) { value += quote + quote; } while (this.position.index < this.source.length) { const c = this.peek(); if (c === "\\") { value += c; this.advance(); if (this.position.index < this.source.length) { value += this.peek(); this.advance(); } continue; } if (isTripleQuote) { if (c === quote && this.peekNext() === quote && this.peek(2) === quote) { value += quote + quote + quote; this.advance(); // consume first quote this.advance(); // consume second quote this.advance(); // consume third quote break; } } else { if (c === quote) { value += quote; this.advance(); break; } if (c === "\n") { throw new Error(`Unterminated string literal at line ${this.position.line}`); } } value += c; this.advance(); } this.addTokenAt(TokenType.STRING, value, start); } scanTwoCharOperator(twoChar) { const start = { ...this.position }; // Create a copy let tokenType = null; switch (twoChar) { case "**": tokenType = TokenType.DOUBLESTAR; break; case "//": tokenType = TokenType.DOUBLESLASH; break; case "<<": tokenType = TokenType.LEFTSHIFT; break; case ">>": tokenType = TokenType.RIGHTSHIFT; break; case "==": tokenType = TokenType.EQEQUAL; break; case "!=": tokenType = TokenType.NOTEQUAL; break; case "<=": tokenType = TokenType.LESSEQUAL; break; case ">=": tokenType = TokenType.GREATEREQUAL; break; case "+=": tokenType = TokenType.PLUSEQUAL; break; case "-=": tokenType = TokenType.MINEQUAL; break; case "*=": tokenType = TokenType.STAREQUAL; break; case "/=": tokenType = TokenType.SLASHEQUAL; break; case "%=": tokenType = TokenType.PERCENTEQUAL; break; case "&=": tokenType = TokenType.AMPEREQUAL; break; case "|=": tokenType = TokenType.VBAREQUAL; break; case "^=": tokenType = TokenType.CIRCUMFLEXEQUAL; break; case "@=": tokenType = TokenType.ATEQUAL; break; case ":=": tokenType = TokenType.COLONEQUAL; break; case "->": tokenType = TokenType.RARROW; break; } if (tokenType) { this.advance(); this.advance(); this.addTokenAt(tokenType, twoChar, start); return true; } return false; } scanThreeCharOperator(threeChar) { const start = { ...this.position }; // Create a copy let tokenType = null; switch (threeChar) { case "...": tokenType = TokenType.ELLIPSIS; break; case "<<=": tokenType = TokenType.LEFTSHIFTEQUAL; break; case ">>=": tokenType = TokenType.RIGHTSHIFTEQUAL; break; case "**=": tokenType = TokenType.DOUBLESTAREQUAL; break; case "//=": tokenType = TokenType.DOUBLESLASHEQUAL; break; case "^=": tokenType = TokenType.CIRCUMFLEXEQUAL; break; } if (tokenType) { this.advance(); this.advance(); this.advance(); this.addTokenAt(tokenType, threeChar, start); return true; } return false; } scanSingleCharOperator(c) { const start = { ...this.position }; // Create a copy let tokenType; switch (c) { case "+": tokenType = TokenType.PLUS; break; case "-": tokenType = TokenType.MINUS; break; case "*": tokenType = TokenType.STAR; break; case "/": tokenType = TokenType.SLASH; break; case "%": tokenType = TokenType.PERCENT; break; case "@": tokenType = TokenType.AT; break; case "|": tokenType = TokenType.VBAR; break; case "&": tokenType = TokenType.AMPER; break; case "^": tokenType = TokenType.CIRCUMFLEX; break; case "~": tokenType = TokenType.TILDE; break; case "(": tokenType = TokenType.LPAR; this.parenLevel++; break; case ")": tokenType = TokenType.RPAR; this.parenLevel--; break; case "[": tokenType = TokenType.LSQB; this.bracketLevel++; break; case "]": tokenType = TokenType.RSQB; this.bracketLevel--; break; case "{": tokenType = TokenType.LBRACE; this.braceLevel++; break; case "}": tokenType = TokenType.RBRACE; this.braceLevel--; break; case ",": tokenType = TokenType.COMMA; break; case ":": tokenType = TokenType.COLON; break; case ".": tokenType = TokenType.DOT; break; case ";": tokenType = TokenType.SEMI; break; case "=": tokenType = TokenType.EQUAL; break; case "<": tokenType = TokenType.LESS; break; case ">": tokenType = TokenType.GREATER; break; case "\\": // Handle line continuation if (this.peek(1) === "\n") { this.advance(); // consume '\\' this.advance(); // consume '\n' this.position.line++; this.position.column = 0; return; // Don't emit a token, just continue } else { throw new Error(`Unexpected character '${c}' at line ${this.position.line}, column ${this.position.column}`); } default: throw new Error(`Unexpected character '${c}' at line ${this.position.line}, column ${this.position.column}`); } this.advance(); this.addTokenAt(tokenType, c, start); } peek(offset = 0) { const index = this.position.index + offset; return index < this.source.length ? this.source[index] : ""; } peekNext() { return this.peek(1); } advance() { const c = this.peek(); if (c === "\n") { this.position.line++; this.position.column = 0; } else { this.position.column++; } this.position.index++; return c; } addToken(type, value) { this.addTokenAt(type, value, this.position); } addTokenAt(type, value, start) { this.tokens.push({ type, value, lineno: start.line, col_offset: start.column, end_lineno: this.position.line, end_col_offset: this.position.column, }); } isDigit(c) { return c >= "0" && c <= "9"; } isHexDigit(c) { return this.isDigit(c) || (c >= "a" && c <= "f") || (c >= "A" && c <= "F"); } isOctalDigit(c) { return c >= "0" && c <= "7"; } isBinaryDigit(c) { return c === "0" || c === "1"; } isAlpha(c) { // Support Unicode letters using regex return /^[\p{L}]$/u.test(c); } isAlphaNumeric(c) { return this.isAlpha(c) || this.isDigit(c); } } /** * Python Parser - Recursive Descent Parser for Python Source Code * Based on the Python ASDL grammar specification */ class Parser { constructor(source, options = {}) { this.current = 0; this.lastNonCommentTokenLine = 0; // Track the line of the last non-comment, non-newline token this.pendingComments = []; // Temporary storage for comments during expression parsing const lexer = new Lexer(source); this.tokens = lexer.tokenize(); this.includeComments = options.comments ?? false; // Filter out comments unless needed if (!this.includeComments) { this.tokens = this.tokens.filter((token) => token.type !== TokenType.COMMENT); } } parse() { this.current = 0; return this.parseFileInput(); } // ==== Top level parser ==== parseFileInput() { const body = []; // Skip leading newlines while (this.match(TokenType.NEWLINE)) { // Skip } while (!this.isAtEnd()) { if (this.match(TokenType.NEWLINE)) { continue; } // Handle comments that were collected during token peeking if (this.includeComments && this.pendingComments.length > 0) { for (const comment of this.pendingComments) { // If this is an inline comment and we have a previous statement, attach it if (comment.inline && body.length > 0) { const lastStmt = body[body.length - 1]; // Add the comment as metadata to the last statement if (!lastStmt.inlineComment) { lastStmt.inlineComment = comment; } } else { // For standalone comments, add as separate statement body.push(comment); } } // Clear pending comments after processing this.pendingComments = []; } // Parse comments as proper statement nodes when includeComments is enabled if (this.includeComments && this.check(TokenType.COMMENT)) { const comment = this.parseCommentStatement(); // If this is an inline comment and we have a previous statement, attach it if (comment.inline && body.length > 0) { const lastStmt = body[body.length - 1]; // Add the comment as metadata to the last statement if (!lastStmt.inlineComment) { lastStmt.inlineComment = comment; } } else { // For standalone comments, add as separate statement body.push(comment); } continue; } const stmt = this.parseStatement(); if (stmt) { body.push(stmt); // Process any comments that were collected during statement parsing if (this.includeComments && this.pendingComments.length > 0) { for (const comment of this.pendingComments) { if (comment.inline) { // Attach inline comment to the statement we just parsed if (!stmt.inlineComment) { stmt.inlineComment = comment; } } else { // Add standalone comment as separate statement body.push(comment); } } // Clear pending comments after processing this.pendingComments = []; } } } // Handle any remaining pending comments after the main parsing loop if (this.includeComments && this.pendingComments.length > 0) { for (const comment of this.pendingComments) { if (comment.inline && body.length > 0) { // Attach inline comment to the last statement const lastStmt = body[body.length - 1]; if (!lastStmt.inlineComment) { lastStmt.inlineComment = comment; } } else { // Add standalone comment as separate statement body.push(comment); } } // Clear pending comments after processing this.pendingComments = []; } const result = { nodeType: "Module", body, lineno: 1, col_offset: 0, }; // If comments are enabled, collect all comments and add them to the module if (this.includeComments) { result.comments = this.collectAllComments(result); } return result; } // Parse a comment as a statement node parseCommentStatement() { const token = this.consume(TokenType.COMMENT, "Expected comment"); // Check if this is an inline comment (on the same line as previous content) const isInline = token.lineno === this.lastNonCommentTokenLine; return { nodeType: "Comment", value: token.value, lineno: token.lineno, col_offset: token.col_offset, end_lineno: token.end_lineno, end_col_offset: token.end_col_offset, inline: isInline, }; } // Collect all comments from the AST (both standalone and inline) collectAllComments(module) { const comments = []; const collectFromBody = (body) => { for (const stmt of body) { if (stmt.nodeType === "Comment") { comments.push(stmt); } else { // Check for inline comments attached to this statement if (stmt.inlineComment) { comments.push(stmt.inlineComment); } // Recursively collect from nested bodies this.collectFromStatement(stmt, comments); } } }; collectFromBody(module.body); // Also include any pending comments from expression parsing comments.push(...this.pendingComments); return comments; } // Helper to collect comments from nested statement bodies collectFromStatement(stmt, comments) { switch (stmt.nodeType) { case "FunctionDef": case "AsyncFunctionDef": this.collectFromBody(stmt.body, comments); break; case "ClassDef": this.collectFromBody(stmt.body, comments); break; case "If": this.collectFromBody(stmt.body, comments); this.collectFromBody(stmt.orelse, comments); break; case "For": case "AsyncFor": this.collectFromBody(stmt.body, comments); this.collectFromBody(stmt.orelse, comments); break; case "While": this.collectFromBody(stmt.body, comments); this.collectFromBody(stmt.orelse, comments); break; case "With": case "AsyncWith": this.collectFromBody(stmt.body, comments); break; case "Try": this.collectFromBody(stmt.body, comments); if (stmt.handlers) { for (const handler of stmt.handlers) { this.collectFromBody(handler.body, comments); } } this.collectFromBody(stmt.orelse, comments); this.collectFromBody(stmt.finalbody, comments); break; case "Match": if (stmt.cases) { for (const case_ of stmt.cases) { this.collectFromBody(case_.body, comments); } } break; } } // Helper to collect comments from a statement body collectFromBody(body, comments) { for (const stmt of body) { if (stmt.nodeType === "Comment") { comments.push(stmt); } else { if (stmt.inlineComment) { comments.push(stmt.inlineComment); } this.collectFromStatement(stmt, comments); } } } // ==== Statement parsers ==== parseStatement() { // Handle indentation if (this.check(TokenType.INDENT)) { // INDENT tokens should only appear after compound statements throw this.error("unexpected indent"); } if (this.match(TokenType.DEDENT)) { return null; } // Check for decorators first if (this.check(TokenType.AT)) { return this.parseDecorated(); } return this.parseSimpleStmt() || this.parseCompoundStmt(); } parseSimpleStmt() { const stmt = this.parseSmallStmt(); // Handle multiple statements on one line while (this.match(TokenType.SEMI)) { if (!this.check(TokenType.NEWLINE) && !this.isAtEnd()) { // Additional statements on the same line would go here // For simplicity, we'll just parse the first one break; } } this.match(TokenType.NEWLINE); // Optional newline return stmt; } parseSmallStmt() { const start = this.peek(); // Check if this is a compound statement keyword - let parseCompoundStmt handle it if (this.check(TokenType.DEF) || this.check(TokenType.CLASS) || this.check(TokenType.IF) || this.check(TokenType.WHILE) || this.check(TokenType.FOR) || this.check(TokenType.TRY) || this.check(TokenType.WITH) || this.check(TokenType.ASYNC) || this.check(TokenType.MATCH)) { return null; } // Handle pass statement if (this.match(TokenType.PASS)) { return { nodeType: "Pass", lineno: start.lineno, col_offset: start.col_offset, }; } // Handle break statement if (this.match(TokenType.BREAK)) { return { nodeType: "Break", lineno: start.lineno, col_offset: start.col_offset, }; } // Handle continue statement if (this.match(TokenType.CONTINUE)) { return { nodeType: "Continue", lineno: start.lineno, col_offset: start.col_offset, }; } // Handle return statement if (this.match(TokenType.RETURN)) { let value; if (!this.check(TokenType.NEWLINE) && !this.check(TokenType.SEMI) && !this.isAtEnd()) { value = this.parseTestList(); } return { nodeType: "Return", value, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle delete statement if (this.match(TokenType.DEL)) { const targets = []; targets.push(this.parseExpr()); while (this.match(TokenType.COMMA)) { targets.push(this.parseExpr()); } return { nodeType: "Delete", targets, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle global statement if (this.match(TokenType.GLOBAL)) { const names = []; names.push(this.consume(TokenType.NAME, "Expected name after 'global'").value); while (this.match(TokenType.COMMA)) { names.push(this.consume(TokenType.NAME, "Expected name after ','").value); } return { nodeType: "Global", names, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle nonlocal statement if (this.match(TokenType.NONLOCAL)) { const names = []; names.push(this.consume(TokenType.NAME, "Expected name after 'nonlocal'").value); while (this.match(TokenType.COMMA)) { names.push(this.consume(TokenType.NAME, "Expected name after ','").value); } return { nodeType: "Nonlocal", names, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle import statement if (this.match(TokenType.IMPORT)) { const names = []; do { let name = this.consume(TokenType.NAME, "Expected module name").value; // Handle dotted names like 'os.path' while (this.match(TokenType.DOT)) { name += `.${this.consume(TokenType.NAME, "Expected name after '.'").value}`; } let asname; if (this.match(TokenType.AS)) { asname = this.consume(TokenType.NAME, "Expected name after 'as'").value; } names.push({ name, asname }); } while (this.match(TokenType.COMMA)); return { nodeType: "Import", names: names.map((n) => ({ nodeType: "Alias", name: n.name, asname: n.asname, lineno: start.lineno, col_offset: start.col_offset, })), lineno: start.lineno, col_offset: start.col_offset, }; } // Handle from import statement if (this.match(TokenType.FROM)) { let level = 0; // Handle relative imports (.., ., ..., etc.) while (this.match(TokenType.DOT)) { level++; } // Handle ellipsis (...) as three dots if (this.match(TokenType.ELLIPSIS)) { level += 3; } let module; if (this.check(TokenType.NAME)) { module = this.advance().value; // Handle dotted module names while (this.match(TokenType.DOT)) { module += `.${this.consume(TokenType.NAME, "Expected name after '.'").value}`; } } this.consume(TokenType.IMPORT, "Expected 'import' after module name"); const names = []; // Handle parenthesized import lists const hasParens = this.match(TokenType.LPAR); if (this.match(TokenType.STAR)) { names.push({ name: "*" }); } else { // Parse the first name const firstName = this.consume(TokenType.NAME, "Expected name").value; let firstAsname; if (this.match(TokenType.AS)) { firstAsname = this.consume(TokenType.NAME, "Expected name after 'as'").value; } names.push({ name: firstName, asname: firstAsname }); // Parse additional names if there are commas while (this.match(TokenType.COMMA)) { // Skip any newlines after comma (for multiline imports) while (this.match(TokenType.NEWLINE)) { // Skip newlines } // Check if we've reached the end (trailing comma case) if (hasParens && this.check(TokenType.RPAR)) break; if (!hasParens && (this.check(TokenType.NEWLINE) || this.isAtEnd())) break; const name = this.consume(TokenType.NAME, "Expected name").value; let asname; if (this.match(TokenType.AS)) { asname = this.consume(TokenType.NAME, "Expected name after 'as'").value; } names.push({ name, asname }); } } if (hasParens) { this.consume(TokenType.RPAR, "Expected ')' after import list"); } return { nodeType: "ImportFrom", module, names: names.map((n) => ({ nodeType: "Alias", name: n.name, asname: n.asname, lineno: start.lineno, col_offset: start.col_offset, })), level, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle raise statement if (this.match(TokenType.RAISE)) { let exc; let cause; if (!this.check(TokenType.NEWLINE) && !this.check(TokenType.SEMI) && !this.check(TokenType.DEDENT) && !this.check(TokenType.COMMENT) && !this.isAtEnd()) { exc = this.parseTest(); if (this.match(TokenType.FROM)) { cause = this.parseTest(); } } return { nodeType: "Raise", exc, cause, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle assert statement if (this.match(TokenType.ASSERT)) { const test = this.parseTest(); let msg; if (this.match(TokenType.COMMA)) { msg = this.parseTest(); } return { nodeType: "Assert", test, msg, lineno: start.lineno, col_offset: start.col_offset, }; } // Handle type alias statement (Python 3.12+) if (this.check(TokenType.NAME) && this.peek().value === "type") { const start = this.peek(); this.advance(); // consume 'type' const nameToken = this.consume(TokenType.NAME, "Expected type alias name").value; // Type parameters (optional) const type_params = this.parseTypeParams(); this.consume(TokenType.EQUAL, "Expected '=' in type alias"); const value = this.parseTest(); return { nodeType: "TypeAlias", name: { nodeType: "Name", id: nameToken, ctx: { nodeType: "Store" }, lineno: start.lineno, col_offset: start.col_offset, }, type_params, value, lineno: start.lineno, col_offset: start.col_offset, }; } // Expression statement (including assignments) const expr = this.parseTestListWithStar(); // Check for assignment operators if (this.match(TokenType.EQUAL)) { // Regular assignment - handle multiple assignment const targets = [expr]; this.validateAssignmentTarget(expr); let value = this.parseTestList(); // Collect any comments that were gathered during value parsing const expressionComments = []; if (this.includeComments && this.pendingComments.length > 0) { expressionComments.push(...this.pendingComments); this.pendingComments = []; } // Check for chained assignments like x = y = z while (this.match(TokenType.EQUAL)) { this.validateAssignmentTarget(value); targets.push(value); value = this.parseTestList(); // Collect any additional comments from chained assignment parsing if (this.includeComments && this.pendin