py-ast
Version:
A TypeScript library for parsing and walking Python Abstract Syntax Trees
1,377 lines (1,376 loc) • 207 kB
JavaScript
/**
* Python Lexical Analyzer (Tokenizer)
* Converts Python source code into a stream of tokens
*/
/**
 * String enum of every token kind the lexer can emit.
 * Each member's value is its own name (e.g. TokenType.NUMBER === "NUMBER").
 */
var TokenType;
(function (TokenType) {
    // Member names in declaration order; the order is preserved on the object.
    const memberNames = [
        // Literals
        "NUMBER", "STRING", "NAME",
        // Keywords
        "AND", "AS", "ASSERT", "ASYNC", "AWAIT", "BREAK", "CLASS", "CONTINUE",
        "DEF", "DEL", "ELIF", "ELSE", "EXCEPT", "FALSE", "FINALLY", "FOR",
        "FROM", "GLOBAL", "IF", "IMPORT", "IN", "IS", "LAMBDA", "MATCH",
        "CASE", "NONE", "NONLOCAL", "NOT", "OR", "PASS", "RAISE", "RETURN",
        "TRUE", "TRY", "WHILE", "WITH", "YIELD",
        // Operators
        "PLUS", "MINUS", "STAR", "DOUBLESTAR", "SLASH", "DOUBLESLASH",
        "PERCENT", "AT", "VBAR", "AMPER", "CIRCUMFLEX", "TILDE",
        "LEFTSHIFT", "RIGHTSHIFT",
        // Delimiters
        "LPAR", "RPAR", "LSQB", "RSQB", "LBRACE", "RBRACE", "COMMA", "COLON",
        "DOT", "SEMI", "EQUAL", "RARROW",
        // Comparison operators
        "EQEQUAL", "NOTEQUAL", "LESS", "GREATER", "LESSEQUAL", "GREATEREQUAL",
        // Assignment operators
        "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
        "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL", "LEFTSHIFTEQUAL",
        "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL", "DOUBLESLASHEQUAL", "ATEQUAL",
        "COLONEQUAL",
        // Special tokens
        "NEWLINE", "INDENT", "DEDENT", "COMMENT", "EOF", "ELLIPSIS",
        // String formatting
        "FSTRING_START", "FSTRING_MIDDLE", "FSTRING_END",
    ];
    for (const memberName of memberNames) {
        TokenType[memberName] = memberName;
    }
})(TokenType || (TokenType = {}));
/**
 * Maps each reserved word, spelled as it appears in Python source, to its token type.
 * Every keyword's TokenType member is the upper-cased spelling of the word
 * (e.g. "None" -> TokenType.NONE), so the table is derived from the word list.
 */
const KEYWORDS = new Map([
    "and", "as", "assert", "async", "await", "break", "class", "continue",
    "def", "del", "elif", "else", "except", "False", "finally", "for",
    "from", "global", "if", "import", "in", "is", "lambda", "match",
    "case", "None", "nonlocal", "not", "or", "pass", "raise", "return",
    "True", "try", "while", "with", "yield",
].map((word) => [word, TokenType[word.toUpperCase()]]));
/**
 * Python lexer: turns source text into a flat token array.
 *
 * Each token is `{ type, value, lineno, col_offset, end_lineno, end_col_offset }`.
 * Lines are 1-based and columns 0-based. Block structure is reported through
 * INDENT/DEDENT tokens; NEWLINE and indentation tracking are suppressed while
 * inside (), [] or {}.
 */
class Lexer {
    /**
     * @param {string} source - Python source text to tokenize.
     */
    constructor(source) {
        this.tokens = [];
        this.indentStack = [0];
        this.atLineStart = true;
        this.parenLevel = 0;
        this.bracketLevel = 0;
        this.braceLevel = 0;
        this.source = source;
        this.position = { line: 1, column: 0, index: 0 };
    }
    /**
     * Tokenizes the whole source from scratch (all scanner state is reset).
     * @returns {Array<object>} Tokens, ending with any closing DEDENTs and one EOF.
     * @throws {Error} On inconsistent dedent, unterminated strings or unexpected characters.
     */
    tokenize() {
        this.tokens = [];
        this.position = { line: 1, column: 0, index: 0 };
        this.indentStack = [0];
        this.atLineStart = true;
        this.parenLevel = 0;
        this.bracketLevel = 0;
        this.braceLevel = 0;
        while (this.position.index < this.source.length) {
            this.scanToken();
        }
        // Close every block that is still open at end of input.
        while (this.indentStack.length > 1) {
            this.indentStack.pop();
            this.addToken(TokenType.DEDENT, "");
        }
        this.addToken(TokenType.EOF, "");
        return this.tokens;
    }
    /** Scans one token (or skips one piece of whitespace) at the current position. */
    scanToken() {
        const c = this.peek();
        if (c === "\n") {
            this.scanNewline();
            return;
        }
        if (this.atLineStart) {
            this.scanIndentation();
            this.atLineStart = false;
            // Indentation is consumed; now scan the token that follows it.
            if (this.position.index < this.source.length) {
                this.scanToken();
            }
            return;
        }
        // Whitespace other than "\n" carries no meaning past the line start.
        if (c === " " || c === "\t" || c === "\r") {
            this.advance();
            return;
        }
        if (c === "#") {
            this.scanComment();
            return;
        }
        if (c === '"' || c === "'") {
            this.scanString();
            return;
        }
        if (this.isDigit(c)) {
            this.scanNumber();
            return;
        }
        if (this.isAlpha(c) || c === "_") {
            // A lone f/F directly followed by a quote starts an f-string.
            if (c.toLowerCase() === "f" &&
                this.position.index + 1 < this.source.length) {
                const nextChar = this.peekNext();
                if (nextChar === '"' || nextChar === "'") {
                    this.scanFString();
                    return;
                }
            }
            this.scanIdentifier();
            return;
        }
        // Longest operator wins: try three characters, then two, then one.
        const threeChar = this.source.slice(this.position.index, this.position.index + 3);
        if (this.scanThreeCharOperator(threeChar)) {
            return;
        }
        const twoChar = this.source.slice(this.position.index, this.position.index + 2);
        if (this.scanTwoCharOperator(twoChar)) {
            return;
        }
        this.scanSingleCharOperator(c);
    }
    /** Consumes "\n", emitting NEWLINE unless inside brackets, and flags the next line start. */
    scanNewline() {
        const start = { ...this.position };
        this.advance(); // consume '\n'
        if (this.parenLevel === 0 &&
            this.bracketLevel === 0 &&
            this.braceLevel === 0) {
            this.addTokenAt(TokenType.NEWLINE, "\n", start);
        }
        this.atLineStart = true;
    }
    /**
     * Measures leading whitespace (space = 1, tab = 8) and emits INDENT/DEDENT
     * tokens when the level changes. Blank lines, comment-only lines and lines
     * inside brackets do not affect the indentation stack.
     * @throws {Error} When a dedent does not match any enclosing level.
     */
    scanIndentation() {
        let indent = 0;
        while (this.position.index < this.source.length) {
            const ws = this.peek();
            if (ws === " ") {
                indent++;
            }
            else if (ws === "\t") {
                indent += 8; // Tab counts as 8 spaces
            }
            else {
                break;
            }
            this.advance();
        }
        const c = this.peek();
        // A line holding only "\r\n" is blank too; without this check CRLF
        // files produced spurious DEDENT/INDENT pairs on every blank line.
        const isBlankCrlf = c === "\r" && this.peekNext() === "\n";
        if (c === "\n" ||
            c === "#" ||
            isBlankCrlf ||
            this.position.index >= this.source.length) {
            return;
        }
        if (this.parenLevel > 0 || this.bracketLevel > 0 || this.braceLevel > 0) {
            return;
        }
        const currentIndent = this.indentStack[this.indentStack.length - 1];
        if (indent > currentIndent) {
            this.indentStack.push(indent);
            this.addToken(TokenType.INDENT, "");
        }
        else if (indent < currentIndent) {
            while (this.indentStack.length > 1 &&
                this.indentStack[this.indentStack.length - 1] > indent) {
                this.indentStack.pop();
                this.addToken(TokenType.DEDENT, "");
            }
            if (this.indentStack[this.indentStack.length - 1] !== indent) {
                throw new Error(`Indentation error at line ${this.position.line}`);
            }
        }
    }
    /** Emits a COMMENT token covering "#" up to (not including) the end of the line. */
    scanComment() {
        const start = { ...this.position };
        let value = this.advance(); // '#'
        while (this.position.index < this.source.length && this.peek() !== "\n") {
            value += this.advance();
        }
        this.addTokenAt(TokenType.COMMENT, value, start);
    }
    /**
     * Scans an unprefixed string literal starting at its opening quote.
     * @throws {Error} If the literal is not terminated.
     */
    scanString() {
        const start = { ...this.position };
        this.scanStringBody("", start);
    }
    /**
     * Scans an f-string starting at its f/F prefix and emits it as one STRING
     * token. Quotes inside `{...}` replacement fields do not close the literal.
     * @throws {Error} If the literal is not terminated.
     */
    scanFString() {
        const start = { ...this.position };
        let value = this.advance(); // 'f' or 'F'
        const quote = this.peek();
        value += quote;
        this.advance();
        const isTripleQuote = this.peek() === quote && this.peekNext() === quote;
        if (isTripleQuote) {
            value += quote + quote;
            this.advance(); // second quote
            this.advance(); // third quote
        }
        let braceLevel = 0;
        let stringClosed = false;
        while (this.position.index < this.source.length) {
            const c = this.peek();
            // A backslash always takes the following character with it.
            if (c === "\\") {
                value += c;
                this.advance();
                if (this.position.index < this.source.length) {
                    value += this.advance();
                }
                continue;
            }
            if (c === "{") {
                braceLevel++;
                value += c;
                this.advance();
                continue;
            }
            if (c === "}") {
                if (braceLevel > 0) {
                    braceLevel--;
                }
                value += c;
                this.advance();
                continue;
            }
            // Closing quotes and line ends only count outside replacement fields.
            if (braceLevel === 0) {
                if (isTripleQuote) {
                    if (c === quote &&
                        this.peekNext() === quote &&
                        this.peek(2) === quote) {
                        value += quote + quote + quote;
                        this.advance();
                        this.advance();
                        this.advance();
                        stringClosed = true;
                        break;
                    }
                }
                else {
                    if (c === quote) {
                        value += quote;
                        this.advance();
                        stringClosed = true;
                        break;
                    }
                    if (c === "\n") {
                        throw new Error(`Unterminated f-string literal at line ${this.position.line}`);
                    }
                }
            }
            value += c;
            this.advance();
        }
        if (!stringClosed) {
            if (isTripleQuote) {
                throw new Error(`Unterminated triple-quoted f-string literal at line ${start.line}`);
            }
            throw new Error(`Unterminated f-string literal at line ${start.line}`);
        }
        this.addTokenAt(TokenType.STRING, value, start);
    }
    /**
     * Scans an integer, float or imaginary literal. Digit-separator underscores
     * are dropped from the token value; radix prefixes keep their original case.
     */
    scanNumber() {
        const start = { ...this.position };
        let value = "";
        // Prefixed integers: 0x..., 0o..., 0b...
        if (this.peek() === "0" && this.position.index + 1 < this.source.length) {
            const radix = this.peekNext().toLowerCase();
            if (radix === "x" || radix === "o" || radix === "b") {
                value += this.advance(); // '0'
                value += this.advance(); // 'x', 'o' or 'b'
                while (this.position.index < this.source.length) {
                    const c = this.peek().toLowerCase();
                    const isRadixDigit = (radix === "x" && this.isHexDigit(c)) ||
                        (radix === "o" && this.isOctalDigit(c)) ||
                        (radix === "b" && this.isBinaryDigit(c));
                    if (isRadixDigit) {
                        value += this.advance();
                    }
                    else if (c === "_") {
                        this.advance(); // separator, not part of the value
                    }
                    else {
                        break;
                    }
                }
                this.addTokenAt(TokenType.NUMBER, value, start);
                return;
            }
        }
        value += this.scanDigitRun();
        // Fraction: only when a digit follows the dot, so "1.real" stays NUMBER DOT NAME.
        if (this.peek() === "." &&
            this.position.index + 1 < this.source.length &&
            this.isDigit(this.peekNext())) {
            value += this.advance();
            value += this.scanDigitRun();
        }
        // Exponent with optional sign.
        if (this.peek().toLowerCase() === "e") {
            value += this.advance();
            if (this.peek() === "+" || this.peek() === "-") {
                value += this.advance();
            }
            value += this.scanDigitRun();
        }
        // Imaginary suffix.
        if (this.peek().toLowerCase() === "j") {
            value += this.advance();
        }
        this.addTokenAt(TokenType.NUMBER, value, start);
    }
    /**
     * Consumes a run of decimal digits and underscores.
     * @returns {string} The digits consumed, with underscores removed.
     */
    scanDigitRun() {
        let digits = "";
        while (this.position.index < this.source.length &&
            (this.isDigit(this.peek()) || this.peek() === "_")) {
            const c = this.advance();
            if (c !== "_") {
                digits += c;
            }
        }
        return digits;
    }
    /**
     * Scans a name and emits a keyword token or NAME. If the name is a string
     * prefix immediately followed by a quote, the prefixed string is scanned instead.
     */
    scanIdentifier() {
        const start = { ...this.position };
        let value = "";
        while (this.position.index < this.source.length &&
            (this.isAlphaNumeric(this.peek()) || this.peek() === "_")) {
            value += this.advance();
        }
        if (this.isStringPrefix(value) &&
            (this.peek() === '"' || this.peek() === "'")) {
            this.scanPrefixedString(value, start);
            return;
        }
        const tokenType = KEYWORDS.get(value) || TokenType.NAME;
        this.addTokenAt(tokenType, value, start);
    }
    /**
     * @param {string} value - Candidate prefix text.
     * @returns {boolean} True for f, r, b, u, fr, rf, br, rb in any letter case.
     */
    isStringPrefix(value) {
        const lowerValue = value.toLowerCase();
        return ["f", "r", "b", "u", "fr", "rf", "br", "rb"].includes(lowerValue);
    }
    /**
     * Scans the quoted part of a prefixed string; the prefix is already consumed.
     * @param {string} prefix - Prefix text, prepended to the token value.
     * @param {{line: number, column: number, index: number}} start - Position of the prefix.
     * @throws {Error} If the literal is not terminated.
     */
    scanPrefixedString(prefix, start) {
        this.scanStringBody(prefix, start);
    }
    /**
     * Shared scanner for plain and prefixed strings, positioned at the opening
     * quote. Emits one STRING token whose value is prefix + quotes + raw body.
     * @param {string} prefix - Already-consumed prefix ("" for a plain string).
     * @param {{line: number, column: number, index: number}} start - Token start position.
     * @throws {Error} If a single-quoted literal hits a newline, or any literal hits end of input.
     */
    scanStringBody(prefix, start) {
        const quote = this.peek();
        this.advance(); // opening quote
        const isTripleQuote = this.peek() === quote && this.peekNext() === quote;
        let value = prefix + quote;
        if (isTripleQuote) {
            this.advance(); // second quote
            this.advance(); // third quote
            value += quote + quote;
        }
        let stringClosed = false;
        while (this.position.index < this.source.length) {
            const c = this.peek();
            // A backslash always takes the following character with it.
            if (c === "\\") {
                value += c;
                this.advance();
                if (this.position.index < this.source.length) {
                    value += this.advance();
                }
                continue;
            }
            if (isTripleQuote) {
                if (c === quote &&
                    this.peekNext() === quote &&
                    this.peek(2) === quote) {
                    value += quote + quote + quote;
                    this.advance();
                    this.advance();
                    this.advance();
                    stringClosed = true;
                    break;
                }
            }
            else {
                if (c === quote) {
                    value += quote;
                    this.advance();
                    stringClosed = true;
                    break;
                }
                if (c === "\n") {
                    throw new Error(`Unterminated string literal at line ${this.position.line}`);
                }
            }
            value += c;
            this.advance();
        }
        // Running off the end of the input is an error for every string kind.
        if (!stringClosed) {
            if (isTripleQuote) {
                throw new Error(`Unterminated triple-quoted string literal at line ${start.line}`);
            }
            throw new Error(`Unterminated string literal at line ${start.line}`);
        }
        this.addTokenAt(TokenType.STRING, value, start);
    }
    /**
     * Emits a token if `twoChar` is a two-character operator.
     * @param {string} twoChar - Up to two characters at the current position.
     * @returns {boolean} True when a token was consumed and emitted.
     */
    scanTwoCharOperator(twoChar) {
        const start = { ...this.position };
        let tokenType = null;
        switch (twoChar) {
            case "**": tokenType = TokenType.DOUBLESTAR; break;
            case "//": tokenType = TokenType.DOUBLESLASH; break;
            case "<<": tokenType = TokenType.LEFTSHIFT; break;
            case ">>": tokenType = TokenType.RIGHTSHIFT; break;
            case "==": tokenType = TokenType.EQEQUAL; break;
            case "!=": tokenType = TokenType.NOTEQUAL; break;
            case "<=": tokenType = TokenType.LESSEQUAL; break;
            case ">=": tokenType = TokenType.GREATEREQUAL; break;
            case "+=": tokenType = TokenType.PLUSEQUAL; break;
            case "-=": tokenType = TokenType.MINEQUAL; break;
            case "*=": tokenType = TokenType.STAREQUAL; break;
            case "/=": tokenType = TokenType.SLASHEQUAL; break;
            case "%=": tokenType = TokenType.PERCENTEQUAL; break;
            case "&=": tokenType = TokenType.AMPEREQUAL; break;
            case "|=": tokenType = TokenType.VBAREQUAL; break;
            case "^=": tokenType = TokenType.CIRCUMFLEXEQUAL; break;
            case "@=": tokenType = TokenType.ATEQUAL; break;
            case ":=": tokenType = TokenType.COLONEQUAL; break;
            case "->": tokenType = TokenType.RARROW; break;
        }
        if (!tokenType) {
            return false;
        }
        this.advance();
        this.advance();
        this.addTokenAt(tokenType, twoChar, start);
        return true;
    }
    /**
     * Emits a token if `threeChar` is a three-character operator.
     * @param {string} threeChar - Up to three characters at the current position.
     * @returns {boolean} True when a token was consumed and emitted.
     */
    scanThreeCharOperator(threeChar) {
        const start = { ...this.position };
        let tokenType = null;
        // "^=" is deliberately absent: it has two characters and is handled by
        // scanTwoCharOperator. Listing it here over-advanced at end of input.
        switch (threeChar) {
            case "...": tokenType = TokenType.ELLIPSIS; break;
            case "<<=": tokenType = TokenType.LEFTSHIFTEQUAL; break;
            case ">>=": tokenType = TokenType.RIGHTSHIFTEQUAL; break;
            case "**=": tokenType = TokenType.DOUBLESTAREQUAL; break;
            case "//=": tokenType = TokenType.DOUBLESLASHEQUAL; break;
        }
        if (!tokenType) {
            return false;
        }
        this.advance();
        this.advance();
        this.advance();
        this.addTokenAt(tokenType, threeChar, start);
        return true;
    }
    /**
     * Emits a single-character operator/delimiter token, tracks bracket depth,
     * and handles backslash line continuation (which emits nothing).
     * @param {string} c - Character at the current position.
     * @throws {Error} For any character that cannot start a token.
     */
    scanSingleCharOperator(c) {
        const start = { ...this.position };
        let tokenType;
        switch (c) {
            case "+": tokenType = TokenType.PLUS; break;
            case "-": tokenType = TokenType.MINUS; break;
            case "*": tokenType = TokenType.STAR; break;
            case "/": tokenType = TokenType.SLASH; break;
            case "%": tokenType = TokenType.PERCENT; break;
            case "@": tokenType = TokenType.AT; break;
            case "|": tokenType = TokenType.VBAR; break;
            case "&": tokenType = TokenType.AMPER; break;
            case "^": tokenType = TokenType.CIRCUMFLEX; break;
            case "~": tokenType = TokenType.TILDE; break;
            case "(":
                tokenType = TokenType.LPAR;
                this.parenLevel++;
                break;
            case ")":
                tokenType = TokenType.RPAR;
                this.parenLevel--;
                break;
            case "[":
                tokenType = TokenType.LSQB;
                this.bracketLevel++;
                break;
            case "]":
                tokenType = TokenType.RSQB;
                this.bracketLevel--;
                break;
            case "{":
                tokenType = TokenType.LBRACE;
                this.braceLevel++;
                break;
            case "}":
                tokenType = TokenType.RBRACE;
                this.braceLevel--;
                break;
            case ",": tokenType = TokenType.COMMA; break;
            case ":": tokenType = TokenType.COLON; break;
            case ".": tokenType = TokenType.DOT; break;
            case ";": tokenType = TokenType.SEMI; break;
            case "=": tokenType = TokenType.EQUAL; break;
            case "<": tokenType = TokenType.LESS; break;
            case ">": tokenType = TokenType.GREATER; break;
            case "\\": {
                // Line continuation: backslash directly before LF or CRLF.
                const next = this.peek(1);
                const isCrlf = next === "\r" && this.peek(2) === "\n";
                if (next === "\n" || isCrlf) {
                    this.advance(); // backslash
                    if (isCrlf) {
                        this.advance(); // '\r'
                    }
                    // advance() already moves to the next line and resets the
                    // column; bumping the line again here skewed every later lineno.
                    this.advance(); // '\n'
                    return;
                }
                throw new Error(`Unexpected character '${c}' at line ${this.position.line}, column ${this.position.column}`);
            }
            default:
                throw new Error(`Unexpected character '${c}' at line ${this.position.line}, column ${this.position.column}`);
        }
        this.advance();
        this.addTokenAt(tokenType, c, start);
    }
    /**
     * @param {number} [offset=0] - Distance ahead of the current index.
     * @returns {string} The character at that position, or "" past the end.
     */
    peek(offset = 0) {
        const index = this.position.index + offset;
        return index < this.source.length ? this.source[index] : "";
    }
    /** @returns {string} The character after the current one, or "" past the end. */
    peekNext() {
        return this.peek(1);
    }
    /**
     * Moves one character forward, updating line/column bookkeeping.
     * @returns {string} The character that was consumed ("" at end of input).
     */
    advance() {
        const c = this.peek();
        if (c === "\n") {
            this.position.line++;
            this.position.column = 0;
        }
        else {
            this.position.column++;
        }
        this.position.index++;
        return c;
    }
    /** Emits a zero-width token located at the current position. */
    addToken(type, value) {
        this.addTokenAt(type, value, this.position);
    }
    /**
     * Appends a token spanning from `start` to the current position.
     * @param {string} type - TokenType member.
     * @param {string} value - Token text.
     * @param {{line: number, column: number}} start - Where the token begins.
     */
    addTokenAt(type, value, start) {
        this.tokens.push({
            type,
            value,
            lineno: start.line,
            col_offset: start.column,
            end_lineno: this.position.line,
            end_col_offset: this.position.column,
        });
    }
    isDigit(c) {
        return c >= "0" && c <= "9";
    }
    isHexDigit(c) {
        return this.isDigit(c) || (c >= "a" && c <= "f") || (c >= "A" && c <= "F");
    }
    isOctalDigit(c) {
        return c >= "0" && c <= "7";
    }
    isBinaryDigit(c) {
        return c === "0" || c === "1";
    }
    /** @returns {boolean} True when `c` is a single Unicode letter. */
    isAlpha(c) {
        return /^[\p{L}]$/u.test(c);
    }
    isAlphaNumeric(c) {
        return this.isAlpha(c) || this.isDigit(c);
    }
}
/**
* Python Parser - Recursive Descent Parser for Python Source Code
* Based on the Python ASDL grammar specification
*/
class Parser {
constructor(source, options = {}) {
this.current = 0;
this.lastNonCommentTokenLine = 0; // Track the line of the last non-comment, non-newline token
this.pendingComments = []; // Temporary storage for comments during expression parsing
const lexer = new Lexer(source);
this.tokens = lexer.tokenize();
this.includeComments = options.comments ?? false;
// Filter out comments unless needed
if (!this.includeComments) {
this.tokens = this.tokens.filter((token) => token.type !== TokenType.COMMENT);
}
}
parse() {
this.current = 0;
return this.parseFileInput();
}
// ==== Top level parser ====
parseFileInput() {
const body = [];
// Skip leading newlines
while (this.match(TokenType.NEWLINE)) {
// Skip
}
while (!this.isAtEnd()) {
if (this.match(TokenType.NEWLINE)) {
continue;
}
// Handle comments that were collected during token peeking
if (this.includeComments && this.pendingComments.length > 0) {
for (const comment of this.pendingComments) {
// If this is an inline comment and we have a previous statement, attach it
if (comment.inline && body.length > 0) {
const lastStmt = body[body.length - 1];
// Add the comment as metadata to the last statement
if (!lastStmt.inlineComment) {
lastStmt.inlineComment = comment;
}
}
else {
// For standalone comments, add as separate statement
body.push(comment);
}
}
// Clear pending comments after processing
this.pendingComments = [];
}
// Parse comments as proper statement nodes when includeComments is enabled
if (this.includeComments && this.check(TokenType.COMMENT)) {
const comment = this.parseCommentStatement();
// If this is an inline comment and we have a previous statement, attach it
if (comment.inline && body.length > 0) {
const lastStmt = body[body.length - 1];
// Add the comment as metadata to the last statement
if (!lastStmt.inlineComment) {
lastStmt.inlineComment = comment;
}
}
else {
// For standalone comments, add as separate statement
body.push(comment);
}
continue;
}
const stmt = this.parseStatement();
if (stmt) {
body.push(stmt);
// Process any comments that were collected during statement parsing
if (this.includeComments && this.pendingComments.length > 0) {
for (const comment of this.pendingComments) {
if (comment.inline) {
// Attach inline comment to the statement we just parsed
if (!stmt.inlineComment) {
stmt.inlineComment = comment;
}
}
else {
// Add standalone comment as separate statement
body.push(comment);
}
}
// Clear pending comments after processing
this.pendingComments = [];
}
}
}
// Handle any remaining pending comments after the main parsing loop
if (this.includeComments && this.pendingComments.length > 0) {
for (const comment of this.pendingComments) {
if (comment.inline && body.length > 0) {
// Attach inline comment to the last statement
const lastStmt = body[body.length - 1];
if (!lastStmt.inlineComment) {
lastStmt.inlineComment = comment;
}
}
else {
// Add standalone comment as separate statement
body.push(comment);
}
}
// Clear pending comments after processing
this.pendingComments = [];
}
const result = {
nodeType: "Module",
body,
lineno: 1,
col_offset: 0,
};
// If comments are enabled, collect all comments and add them to the module
if (this.includeComments) {
result.comments = this.collectAllComments(result);
}
return result;
}
// Parse a comment as a statement node
parseCommentStatement() {
const token = this.consume(TokenType.COMMENT, "Expected comment");
// Check if this is an inline comment (on the same line as previous content)
const isInline = token.lineno === this.lastNonCommentTokenLine;
return {
nodeType: "Comment",
value: token.value,
lineno: token.lineno,
col_offset: token.col_offset,
end_lineno: token.end_lineno,
end_col_offset: token.end_col_offset,
inline: isInline,
};
}
// Collect all comments from the AST (both standalone and inline)
collectAllComments(module) {
const comments = [];
const collectFromBody = (body) => {
for (const stmt of body) {
if (stmt.nodeType === "Comment") {
comments.push(stmt);
}
else {
// Check for inline comments attached to this statement
if (stmt.inlineComment) {
comments.push(stmt.inlineComment);
}
// Recursively collect from nested bodies
this.collectFromStatement(stmt, comments);
}
}
};
collectFromBody(module.body);
// Also include any pending comments from expression parsing
comments.push(...this.pendingComments);
return comments;
}
// Helper to collect comments from nested statement bodies
collectFromStatement(stmt, comments) {
switch (stmt.nodeType) {
case "FunctionDef":
case "AsyncFunctionDef":
this.collectFromBody(stmt.body, comments);
break;
case "ClassDef":
this.collectFromBody(stmt.body, comments);
break;
case "If":
this.collectFromBody(stmt.body, comments);
this.collectFromBody(stmt.orelse, comments);
break;
case "For":
case "AsyncFor":
this.collectFromBody(stmt.body, comments);
this.collectFromBody(stmt.orelse, comments);
break;
case "While":
this.collectFromBody(stmt.body, comments);
this.collectFromBody(stmt.orelse, comments);
break;
case "With":
case "AsyncWith":
this.collectFromBody(stmt.body, comments);
break;
case "Try":
this.collectFromBody(stmt.body, comments);
if (stmt.handlers) {
for (const handler of stmt.handlers) {
this.collectFromBody(handler.body, comments);
}
}
this.collectFromBody(stmt.orelse, comments);
this.collectFromBody(stmt.finalbody, comments);
break;
case "Match":
if (stmt.cases) {
for (const case_ of stmt.cases) {
this.collectFromBody(case_.body, comments);
}
}
break;
}
}
// Helper to collect comments from a statement body
collectFromBody(body, comments) {
for (const stmt of body) {
if (stmt.nodeType === "Comment") {
comments.push(stmt);
}
else {
if (stmt.inlineComment) {
comments.push(stmt.inlineComment);
}
this.collectFromStatement(stmt, comments);
}
}
} // ==== Statement parsers ====
parseStatement() {
// Handle indentation
if (this.check(TokenType.INDENT)) {
// INDENT tokens should only appear after compound statements
throw this.error("unexpected indent");
}
if (this.match(TokenType.DEDENT)) {
return null;
}
// Check for decorators first
if (this.check(TokenType.AT)) {
return this.parseDecorated();
}
return this.parseSimpleStmt() || this.parseCompoundStmt();
}
parseSimpleStmt() {
const stmt = this.parseSmallStmt();
// Handle multiple statements on one line
while (this.match(TokenType.SEMI)) {
if (!this.check(TokenType.NEWLINE) && !this.isAtEnd()) {
// Additional statements on the same line would go here
// For simplicity, we'll just parse the first one
break;
}
}
this.match(TokenType.NEWLINE); // Optional newline
return stmt;
}
parseSmallStmt() {
const start = this.peek();
// Check if this is a compound statement keyword - let parseCompoundStmt handle it
if (this.check(TokenType.DEF) ||
this.check(TokenType.CLASS) ||
this.check(TokenType.IF) ||
this.check(TokenType.WHILE) ||
this.check(TokenType.FOR) ||
this.check(TokenType.TRY) ||
this.check(TokenType.WITH) ||
this.check(TokenType.ASYNC) ||
this.check(TokenType.MATCH)) {
return null;
}
// Handle pass statement
if (this.match(TokenType.PASS)) {
return {
nodeType: "Pass",
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle break statement
if (this.match(TokenType.BREAK)) {
return {
nodeType: "Break",
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle continue statement
if (this.match(TokenType.CONTINUE)) {
return {
nodeType: "Continue",
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle return statement
if (this.match(TokenType.RETURN)) {
let value;
if (!this.check(TokenType.NEWLINE) &&
!this.check(TokenType.SEMI) &&
!this.isAtEnd()) {
value = this.parseTestList();
}
return {
nodeType: "Return",
value,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle delete statement
if (this.match(TokenType.DEL)) {
const targets = [];
targets.push(this.parseExpr());
while (this.match(TokenType.COMMA)) {
targets.push(this.parseExpr());
}
return {
nodeType: "Delete",
targets,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle global statement
if (this.match(TokenType.GLOBAL)) {
const names = [];
names.push(this.consume(TokenType.NAME, "Expected name after 'global'").value);
while (this.match(TokenType.COMMA)) {
names.push(this.consume(TokenType.NAME, "Expected name after ','").value);
}
return {
nodeType: "Global",
names,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle nonlocal statement
if (this.match(TokenType.NONLOCAL)) {
const names = [];
names.push(this.consume(TokenType.NAME, "Expected name after 'nonlocal'").value);
while (this.match(TokenType.COMMA)) {
names.push(this.consume(TokenType.NAME, "Expected name after ','").value);
}
return {
nodeType: "Nonlocal",
names,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle import statement
if (this.match(TokenType.IMPORT)) {
const names = [];
do {
let name = this.consume(TokenType.NAME, "Expected module name").value;
// Handle dotted names like 'os.path'
while (this.match(TokenType.DOT)) {
name += `.${this.consume(TokenType.NAME, "Expected name after '.'").value}`;
}
let asname;
if (this.match(TokenType.AS)) {
asname = this.consume(TokenType.NAME, "Expected name after 'as'").value;
}
names.push({ name, asname });
} while (this.match(TokenType.COMMA));
return {
nodeType: "Import",
names: names.map((n) => ({
nodeType: "Alias",
name: n.name,
asname: n.asname,
lineno: start.lineno,
col_offset: start.col_offset,
})),
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle from import statement
if (this.match(TokenType.FROM)) {
let level = 0;
// Handle relative imports (.., ., ..., etc.)
while (this.match(TokenType.DOT)) {
level++;
}
// Handle ellipsis (...) as three dots
if (this.match(TokenType.ELLIPSIS)) {
level += 3;
}
let module;
if (this.check(TokenType.NAME)) {
module = this.advance().value;
// Handle dotted module names
while (this.match(TokenType.DOT)) {
module += `.${this.consume(TokenType.NAME, "Expected name after '.'").value}`;
}
}
this.consume(TokenType.IMPORT, "Expected 'import' after module name");
const names = [];
// Handle parenthesized import lists
const hasParens = this.match(TokenType.LPAR);
if (this.match(TokenType.STAR)) {
names.push({ name: "*" });
}
else {
// Parse the first name
const firstName = this.consume(TokenType.NAME, "Expected name").value;
let firstAsname;
if (this.match(TokenType.AS)) {
firstAsname = this.consume(TokenType.NAME, "Expected name after 'as'").value;
}
names.push({ name: firstName, asname: firstAsname });
// Parse additional names if there are commas
while (this.match(TokenType.COMMA)) {
// Skip any newlines after comma (for multiline imports)
while (this.match(TokenType.NEWLINE)) {
// Skip newlines
}
// Check if we've reached the end (trailing comma case)
if (hasParens && this.check(TokenType.RPAR))
break;
if (!hasParens && (this.check(TokenType.NEWLINE) || this.isAtEnd()))
break;
const name = this.consume(TokenType.NAME, "Expected name").value;
let asname;
if (this.match(TokenType.AS)) {
asname = this.consume(TokenType.NAME, "Expected name after 'as'").value;
}
names.push({ name, asname });
}
}
if (hasParens) {
this.consume(TokenType.RPAR, "Expected ')' after import list");
}
return {
nodeType: "ImportFrom",
module,
names: names.map((n) => ({
nodeType: "Alias",
name: n.name,
asname: n.asname,
lineno: start.lineno,
col_offset: start.col_offset,
})),
level,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle raise statement
if (this.match(TokenType.RAISE)) {
let exc;
let cause;
if (!this.check(TokenType.NEWLINE) &&
!this.check(TokenType.SEMI) &&
!this.check(TokenType.DEDENT) &&
!this.check(TokenType.COMMENT) &&
!this.isAtEnd()) {
exc = this.parseTest();
if (this.match(TokenType.FROM)) {
cause = this.parseTest();
}
}
return {
nodeType: "Raise",
exc,
cause,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle assert statement
if (this.match(TokenType.ASSERT)) {
const test = this.parseTest();
let msg;
if (this.match(TokenType.COMMA)) {
msg = this.parseTest();
}
return {
nodeType: "Assert",
test,
msg,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Handle type alias statement (Python 3.12+)
if (this.check(TokenType.NAME) && this.peek().value === "type") {
const start = this.peek();
this.advance(); // consume 'type'
const nameToken = this.consume(TokenType.NAME, "Expected type alias name").value;
// Type parameters (optional)
const type_params = this.parseTypeParams();
this.consume(TokenType.EQUAL, "Expected '=' in type alias");
const value = this.parseTest();
return {
nodeType: "TypeAlias",
name: {
nodeType: "Name",
id: nameToken,
ctx: { nodeType: "Store" },
lineno: start.lineno,
col_offset: start.col_offset,
},
type_params,
value,
lineno: start.lineno,
col_offset: start.col_offset,
};
}
// Expression statement (including assignments)
const expr = this.parseTestListWithStar();
// Check for assignment operators
if (this.match(TokenType.EQUAL)) {
// Regular assignment - handle multiple assignment
const targets = [expr];
this.validateAssignmentTarget(expr);
let value = this.parseTestList();
// Collect any comments that were gathered during value parsing
const expressionComments = [];
if (this.includeComments && this.pendingComments.length > 0) {
expressionComments.push(...this.pendingComments);
this.pendingComments = [];
}
// Check for chained assignments like x = y = z
while (this.match(TokenType.EQUAL)) {
this.validateAssignmentTarget(value);
targets.push(value);
value = this.parseTestList();
// Collect any additional comments from chained assignment parsing
if (this.includeComments && this.pendin