UNPKG

@eagleoutice/flowr

Version:

Static Dataflow Analyzer and Program Slicer for the R Programming Language

725 lines 31 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.normalizeTreeSitterTreeToAst = normalizeTreeSitterTreeToAst; exports.makeTreeSitterLax = makeTreeSitterLax; exports.makeTreeSitterStrict = makeTreeSitterStrict; const normalizer_data_1 = require("../ast/parser/main/normalizer-data"); const tree_sitter_types_1 = require("./tree-sitter-types"); const type_1 = require("../ast/model/type"); const range_1 = require("../../../util/range"); const retriever_1 = require("../../retriever"); const convert_values_1 = require("../convert-values"); const normalize_meta_1 = require("../ast/parser/main/normalize-meta"); const arrays_1 = require("../../../util/collections/arrays"); const r_function_call_1 = require("../ast/model/nodes/r-function-call"); const strings_1 = require("../../../util/text/strings"); const log_1 = require("../../../util/log"); const parser_1 = require("../ast/parser/json/parser"); const identifier_1 = require("../../../dataflow/environments/identifier"); /** * @param tree - The tree to normalize * @param lax - Whether to use lax parsing (i.e., ignore errors) or strict parsing (i.e., fail on errors) */ function normalizeTreeSitterTreeToAst(tree, lax) { if (lax) { makeTreeSitterLax(); } else { makeTreeSitterStrict(); } const files = []; for (const t of tree) { const root = convertTreeNode(t.parsed.rootNode); if (root.type !== type_1.RType.ExpressionList) { throw new normalizer_data_1.ParseError(`expected root to resolve to an expression list, got a ${root.type}`); } files.push({ filePath: t.filePath, root: root }); } return { type: type_1.RType.Project, files }; } function nonErrorChildrenStrict(node) { return node.hasError ? [] : node.children; } function nonErrorChildrenLax(node) { return node.hasError ? node.children.filter(n => n.type !== tree_sitter_types_1.TreeSitterType.Error) : node.children; } let nonErrorChildren = nonErrorChildrenStrict; /** * Globally switch to lax parsing mode for tree-sitter normalization * @see {@link makeTreeSitterStrict} */ function makeTreeSitterLax() { log_1.log.info('[Tree-Sitter] Lax parsing active'); nonErrorChildren = nonErrorChildrenLax; } /** * Globally switch to strict parsing mode for tree-sitter normalization * @see {@link makeTreeSitterLax} */ function makeTreeSitterStrict() { log_1.log.info('[Tree-Sitter] Strict parsing active'); nonErrorChildren = nonErrorChildrenStrict; } function convertTreeNode(node) { if (!node) { return { type: type_1.RType.ExpressionList, location: undefined, lexeme: undefined, children: [], grouping: undefined, info: { fullRange: undefined, adToks: [], tsId: -1, } }; } try { // generally, the grammar source file dictates what children a node has in what order: // https://github.com/r-lib/tree-sitter-r/blob/main/grammar.js const range = makeSourceRange(node); const defaultInfo = { info: { fullRange: range, adToks: [], fullLexeme: node.text, tsId: node.id } }; switch (node.type) { case tree_sitter_types_1.TreeSitterType.Program: { const [comments, children] = splitComments(nonErrorChildren(node)); const body = children.map(n => [n, convertTreeNode(n)]); const remainingComments = linkCommentsToNextNodes(body, comments); return { type: type_1.RType.ExpressionList, children: body.map(n => n[1]), grouping: undefined, lexeme: undefined, info: { adToks: remainingComments.map(c => c[1]), tsId: node.id } }; } case tree_sitter_types_1.TreeSitterType.BracedExpression: case tree_sitter_types_1.TreeSitterType.ParenthesizedExpression: { const [comments, children] = splitComments(nonErrorChildren(node)); const opening = children[0]; const body = children.slice(1, -1).map(n => [n, convertTreeNode(n)]); const remainingComments = linkCommentsToNextNodes(body, comments); const closing = children[children.length - 1]; return { type: type_1.RType.ExpressionList, location: undefined, lexeme: undefined, children: body.map(n => n[1]), grouping: [ { type: type_1.RType.Symbol, location: makeSourceRange(opening), content: (0, retriever_1.removeRQuotes)(opening.text), lexeme: opening.text, ...defaultInfo }, { type: type_1.RType.Symbol, location: makeSourceRange(closing), content: (0, retriever_1.removeRQuotes)(closing.text), lexeme: closing.text, ...defaultInfo } ], info: { adToks: remainingComments.map(c => c[1]), tsId: node.id } }; } case tree_sitter_types_1.TreeSitterType.BinaryOperator: { const children = nonErrorChildren(node); const lhs = convertTreeNode(children[0]); const rhs = convertTreeNode(children[children.length - 1]); const [commentsBoth, [op]] = splitComments(children.slice(1, -1)); const comments = commentsBoth.map(c => c[1]); const opSource = makeSourceRange(op); const lhsAsArg = { type: type_1.RType.Argument, location: lhs.location, value: lhs, name: undefined, lexeme: lhs.lexeme, info: { tsId: lhs.info.tsId } }; if (op.type === 'special') { return { type: type_1.RType.FunctionCall, location: opSource, lexeme: node.text, functionName: { type: type_1.RType.Symbol, location: opSource, lexeme: op.text, content: op.text, info: { tsId: op.id } }, arguments: [lhsAsArg, { type: type_1.RType.Argument, location: rhs.location, value: rhs, name: undefined, lexeme: rhs.lexeme, info: { tsId: rhs.info.tsId } }], named: true, infixSpecial: true, info: { adToks: comments, tsId: node.id } }; } else if (op.text === '|>') { return { type: type_1.RType.Pipe, location: opSource, lhs: lhsAsArg, rhs, lexeme: op.text, ...defaultInfo, info: { fullRange: range, adToks: comments, fullLexeme: node.text, tsId: node.id } }; } else { return { type: type_1.RType.BinaryOp, location: opSource, lhs, rhs, operator: op.text, lexeme: op.text, info: { fullRange: range, adToks: comments, fullLexeme: node.text, tsId: node.id } }; } } case tree_sitter_types_1.TreeSitterType.UnaryOperator: { const [comments, children] = splitComments(nonErrorChildren(node)); const [op, operand] = children; return { type: type_1.RType.UnaryOp, operand: convertTreeNode(operand), location: makeSourceRange(op), operator: op.text, lexeme: op.text, info: { ...defaultInfo.info, adToks: comments.map(c => c[1]), } }; } case tree_sitter_types_1.TreeSitterType.NamespaceOperator: { const [lhs, int, rhs] = nonErrorChildren(node); return { type: type_1.RType.Symbol, location: makeSourceRange(rhs), content: identifier_1.Identifier.make(rhs.text, lhs.text, int.text === ':::'), lexeme: rhs.text, ...defaultInfo }; } case '(': case ')': case tree_sitter_types_1.TreeSitterType.Na: case tree_sitter_types_1.TreeSitterType.Null: case tree_sitter_types_1.TreeSitterType.Dots: case tree_sitter_types_1.TreeSitterType.DotDotI: case tree_sitter_types_1.TreeSitterType.Identifier: case tree_sitter_types_1.TreeSitterType.Return: return { type: type_1.RType.Symbol, location: range, content: (0, strings_1.startAndEndsWith)(node.text, '`') ? node.text.slice(1, -1) : (0, retriever_1.removeRQuotes)(node.text), lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.IfStatement: { const [comments, children] = splitComments(nonErrorChildren(node)); const [ifNode, /* ( */ , condition, /* ) */ , then, /* else */ , ...otherwise] = children; const filteredOtherwise = otherwise.filter(n => n.type !== tree_sitter_types_1.TreeSitterType.ElseStatement); return { type: type_1.RType.IfThenElse, condition: convertTreeNode(condition), then: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(then)), otherwise: filteredOtherwise.length > 0 ? (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(filteredOtherwise[0])) : undefined, location: makeSourceRange(ifNode), lexeme: ifNode.text, info: { ...defaultInfo.info, adToks: comments.map(c => c[1]), } }; } case tree_sitter_types_1.TreeSitterType.ForStatement: { const [comments, children] = splitComments(nonErrorChildren(node)); const forNode = children[0]; // we follow with a ( const variable = getNodesUntil(children, 'in', 2); // we follow with the "in" const sequence = getNodesUntil(children, ')', 2 + variable.length + 1); // we follow with a ( const body = children[2 + variable.length + 1 + sequence.length + 1]; const [variableComments, [variableNode]] = splitComments(variable); const [sequenceComments, [sequenceNode]] = splitComments(sequence); return { type: type_1.RType.ForLoop, variable: { type: type_1.RType.Symbol, location: makeSourceRange(variableNode), content: (0, retriever_1.removeRQuotes)(variableNode.text), lexeme: variableNode.text, info: { fullRange: undefined, adToks: [], fullLexeme: undefined, tsId: variableNode.id } }, vector: convertTreeNode(sequenceNode), body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)), location: makeSourceRange(forNode), lexeme: forNode.text, info: { fullRange: range, adToks: variableComments.concat(comments, sequenceComments).map(c => c[1]), fullLexeme: node.text, tsId: node.id } }; } case tree_sitter_types_1.TreeSitterType.WhileStatement: { const [comments, children] = splitComments(nonErrorChildren(node)); const [whileNode, /* ( */ , condition, /* ) */ , body] = children; return { type: type_1.RType.WhileLoop, condition: convertTreeNode(condition), body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)), location: makeSourceRange(whileNode), lexeme: whileNode.text, info: { ...defaultInfo.info, adToks: comments.map(c => c[1]), } }; } case tree_sitter_types_1.TreeSitterType.RepeatStatement: { const [comments, [repeatNode, body]] = splitComments(nonErrorChildren(node)); return { type: type_1.RType.RepeatLoop, body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)), location: makeSourceRange(repeatNode), lexeme: repeatNode.text, info: { ...defaultInfo.info, adToks: comments.map(c => c[1]), } }; } case tree_sitter_types_1.TreeSitterType.Call: { const [func, argsParentheses] = nonErrorChildren(node); // tree-sitter wraps next and break in a function call, but we don't, so unwrap if (func.type === tree_sitter_types_1.TreeSitterType.Next || func.type === tree_sitter_types_1.TreeSitterType.Break) { return { ...convertTreeNode(func), ...defaultInfo }; } const rawArgs = nonErrorChildren(argsParentheses); const [comments, noCommentrawArgs] = splitComments(rawArgs); const args = (0, arrays_1.splitArrayOn)(noCommentrawArgs.slice(1, -1), x => x.type === 'comma'); const funcRange = makeSourceRange(func); const mappedArgs = args.map(n => n.length === 0 ? r_function_call_1.EmptyArgument : convertTreeNode(n[0])); const call = { arguments: mappedArgs, location: funcRange, lexeme: func.text, ...defaultInfo, info: { ...defaultInfo.info, adToks: comments.map(c => c[1]), } }; if (func.type === tree_sitter_types_1.TreeSitterType.Identifier || func.type === tree_sitter_types_1.TreeSitterType.String || func.type === tree_sitter_types_1.TreeSitterType.NamespaceOperator || func.type === tree_sitter_types_1.TreeSitterType.Return) { let funcNode = convertTreeNode(func); if (funcNode.type === type_1.RType.String) { funcNode = { ...funcNode, type: type_1.RType.Symbol, content: (0, retriever_1.removeRQuotes)(func.text) }; } return { ...call, type: type_1.RType.FunctionCall, functionName: { ...funcNode, info: { fullRange: range, adToks: [], fullLexeme: node.text, tsId: node.id } }, named: true }; } else { return { ...call, type: type_1.RType.FunctionCall, calledFunction: convertTreeNode(func), named: undefined }; } } case tree_sitter_types_1.TreeSitterType.FunctionDefinition: { const [name, paramsParens, body] = nonErrorChildren(node); const [comments, noCommentRawParams] = splitComments(paramsParens.children.slice(1, -1)); const params = (0, arrays_1.splitArrayOn)(noCommentRawParams, x => x.type === 'comma'); return { type: type_1.RType.FunctionDefinition, parameters: params.map(n => convertTreeNode(n[0])), body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)), location: makeSourceRange(name), lexeme: name.text, info: { ...defaultInfo.info, adToks: comments.map(c => c[1]), } }; } case tree_sitter_types_1.TreeSitterType.String: return { type: type_1.RType.String, location: range, content: convert_values_1.RStringValue.fromRLexeme(node.text), lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.Float: case tree_sitter_types_1.TreeSitterType.Integer: case tree_sitter_types_1.TreeSitterType.Complex: case tree_sitter_types_1.TreeSitterType.Inf: case tree_sitter_types_1.TreeSitterType.Nan: return { type: type_1.RType.Number, location: range, content: convert_values_1.RNumberValue.fromRLexeme(node.text), lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.True: case tree_sitter_types_1.TreeSitterType.False: return { type: type_1.RType.Logical, location: range, content: node.text === convert_values_1.RTrue, lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.Break: return { type: type_1.RType.Break, location: range, lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.Next: return { type: type_1.RType.Next, location: range, lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.Subset: case tree_sitter_types_1.TreeSitterType.Subset2: { // subset has children like a and [x] const [func, content] = nonErrorChildren(node); // bracket is now [ or [[ and argsClosing is x] or x]] const [bracket, ...argsClosing] = nonErrorChildren(content); const [argsComments, argsNoComments] = splitComments(argsClosing.slice(0, -1)); const args = (0, arrays_1.splitArrayOn)(argsNoComments, x => x.type === 'comma'); return { type: type_1.RType.Access, operator: bracket.text, accessed: convertTreeNode(func), access: args.map(n => n.length === 0 ? r_function_call_1.EmptyArgument : convertTreeNode(n[0])), location: makeSourceRange(bracket), lexeme: bracket.text, info: { ...defaultInfo.info, adToks: argsComments.map(c => c[1]), } }; } case tree_sitter_types_1.TreeSitterType.ExtractOperator: { const [lhs, operator, rhs] = nonErrorChildren(node); const rhsRange = makeSourceRange(rhs); return { type: type_1.RType.Access, operator: operator.text, accessed: convertTreeNode(lhs), access: [{ type: type_1.RType.Argument, name: undefined, value: { ...convertTreeNode(rhs), ...defaultInfo }, location: rhsRange, lexeme: rhs?.text, info: { fullRange: rhsRange, adToks: [], fullLexeme: rhs?.text, tsId: rhs?.id } }], location: makeSourceRange(operator), lexeme: operator.text, ...defaultInfo }; } case tree_sitter_types_1.TreeSitterType.Parameter: { const children = nonErrorChildren(node); const name = children[0]; const nameRange = makeSourceRange(name); let defaultValue = undefined; if (children.length === 3) { defaultValue = convertTreeNode(children[2]); } return { type: type_1.RType.Parameter, name: { type: type_1.RType.Symbol, location: nameRange, content: name.text, lexeme: name.text, info: { fullRange: range, adToks: [], fullLexeme: name.text, tsId: name.id } }, special: name.text === '...', defaultValue, location: nameRange, lexeme: name.text, info: { fullRange: range, adToks: [], fullLexeme: name.text, tsId: name.id } }; } case tree_sitter_types_1.TreeSitterType.Argument: { const [commentChildren, children] = splitComments(nonErrorChildren(node)); if (children.length === 1) { const [arg] = children; return { type: type_1.RType.Argument, name: undefined, value: convertTreeNode(arg), location: range, lexeme: node.text, info: { ...defaultInfo.info, adToks: commentChildren.map(c => c[1]), } }; } else { const [nameNode, /* = */ , valueNode] = children; let name = convertTreeNode(nameNode); // unescape argument names if (name.type === type_1.RType.String) { name = { ...name, type: type_1.RType.Symbol, content: name.content.str }; } else if ((0, strings_1.startAndEndsWith)(name.content, '`')) { name.content = name.content.slice(1, -1); } const nameRange = makeSourceRange(nameNode); return { type: type_1.RType.Argument, name, value: valueNode ? convertTreeNode(valueNode) : undefined, location: nameRange, lexeme: nameNode.text, info: { fullRange: nameRange, adToks: commentChildren.map(c => c[1]), fullLexeme: nameNode.text, tsId: nameNode.id } }; } } case tree_sitter_types_1.TreeSitterType.Comment: return { type: type_1.RType.Comment, location: range, lexeme: node.text, ...defaultInfo }; case tree_sitter_types_1.TreeSitterType.Error: return { type: type_1.RType.ExpressionList, location: undefined, lexeme: undefined, children: [], grouping: undefined, ...defaultInfo }; } } catch { parser_1.parseLog.error(`[Tree-Sitter] Failed to convert node of type ${node.type} at ${JSON.stringify(makeSourceRange(node))}`); } return { type: type_1.RType.ExpressionList, location: undefined, lexeme: undefined, children: [], grouping: undefined, info: { fullRange: undefined, adToks: [], tsId: -1, } }; } function makeSourceRange(node) { if (!node) { return range_1.SourceRange.invalid(); } const s = node.startPosition; const e = node.endPosition; return [ // tree-sitter is 0-based but we want 1-based (s?.row ?? -2) + 1, (s?.column ?? -2) + 1, // tree-sitter's end position is one off from ours, so we don't add 1 here (e?.row ?? -2) + 1, e?.column ?? -1 ]; } function splitComments(nodes) { const comments = []; const others = []; for (const node of nodes) { if (node.type === tree_sitter_types_1.TreeSitterType.Comment) { comments.push([node, { type: type_1.RType.Comment, location: makeSourceRange(node), lexeme: node.text, info: { adToks: [], fullLexeme: node.text, tsId: node.id } }]); } else { others.push(node); } } return [comments, others]; } /** * Find the first sibling of the given node that is not a comment, starting from the given node and going to the right. * @param snode - the node for which to find the first non-comment sibling * @param knownNexts - cache map from node id to the id of the first non-comment sibling */ function findFirstNonCommentSibling(snode, knownNexts) { const cache = knownNexts.get(snode.id); if (cache !== undefined) { return cache; } const cursor = snode.parent?.walk(); if (!cursor) { return null; } const linkCaches = [snode.id]; cursor.gotoFirstChild(); while (cursor.nodeId !== snode.id && cursor.gotoNextSibling()) { /* skip */ } cursor.gotoNextSibling(); while (cursor.nodeType === tree_sitter_types_1.TreeSitterType.Comment && cursor.gotoNextSibling()) { /* skip */ linkCaches.push(cursor.nodeId); } const cur = cursor.currentNode; for (const id of linkCaches) { knownNexts.set(id, cur); } cursor.delete(); return cur; } function linkCommentsToNextNodes(nodes, comments) { const remain = []; const cacheMap = new Map(); for (const [commentSyntaxNode, commentNode] of comments) { let sibling; const prev = commentSyntaxNode.previousSibling; if (prev?.endIndex === commentSyntaxNode.startIndex) { // if there is a sibling on the same line, we link the comment to that node sibling = prev; } else { sibling = findFirstNonCommentSibling(commentSyntaxNode, cacheMap); } // if there is no valid sibling, we just link the comment to the first node (see normalize-expressions.ts) const [, node] = (sibling ? nodes.find(([s]) => s.id === sibling.id) : undefined) ?? nodes[0] ?? []; if (node) { node.info.adToks ??= []; node.info.adToks.push(commentNode); } else { remain.push([commentSyntaxNode, commentNode]); } } return remain; } function getNodesUntil(nodes, type, startIndex = 0) { const ret = []; for (let i = startIndex; i < nodes.length; i++) { if (nodes[i].type === type) { break; } ret.push(nodes[i]); } return ret; } //# sourceMappingURL=tree-sitter-normalize.js.map