@eagleoutice/flowr
Version:
Static Dataflow Analyzer and Program Slicer for the R Programming Language
725 lines • 31 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.normalizeTreeSitterTreeToAst = normalizeTreeSitterTreeToAst;
exports.makeTreeSitterLax = makeTreeSitterLax;
exports.makeTreeSitterStrict = makeTreeSitterStrict;
const normalizer_data_1 = require("../ast/parser/main/normalizer-data");
const tree_sitter_types_1 = require("./tree-sitter-types");
const type_1 = require("../ast/model/type");
const range_1 = require("../../../util/range");
const retriever_1 = require("../../retriever");
const convert_values_1 = require("../convert-values");
const normalize_meta_1 = require("../ast/parser/main/normalize-meta");
const arrays_1 = require("../../../util/collections/arrays");
const r_function_call_1 = require("../ast/model/nodes/r-function-call");
const strings_1 = require("../../../util/text/strings");
const log_1 = require("../../../util/log");
const parser_1 = require("../ast/parser/json/parser");
const identifier_1 = require("../../../dataflow/environments/identifier");
/**
 * Normalizes all parsed files of a tree-sitter parse result into a single project node.
 * The parsing mode selected by `lax` is switched globally before any file is converted
 * (see {@link makeTreeSitterLax} and {@link makeTreeSitterStrict}).
 * @param tree - The parsed files to normalize; every entry provides `filePath` and `parsed.rootNode`
 * @param lax  - Whether to use lax parsing (i.e., ignore errors) or strict parsing (i.e., fail on errors)
 * @returns a node of type `RType.Project` with one `{ filePath, root }` entry per input file
 * @throws ParseError if the root of a file does not convert to an expression list
 */
function normalizeTreeSitterTreeToAst(tree, lax) {
    const activateMode = lax ? makeTreeSitterLax : makeTreeSitterStrict;
    activateMode();
    const files = Array.from(tree, (entry) => {
        const root = convertTreeNode(entry.parsed.rootNode);
        const isExpressionList = root.type === type_1.RType.ExpressionList;
        if (!isExpressionList) {
            throw new normalizer_data_1.ParseError(`expected root to resolve to an expression list, got a ${root.type}`);
        }
        return { filePath: entry.filePath, root };
    });
    return { type: type_1.RType.Project, files };
}
/**
 * Strict way of obtaining the children of a node:
 * as soon as the node reports an error, none of its children are handed out.
 * @param node - the tree-sitter node to read `hasError` and `children` from
 * @returns the node's own `children` array, or a fresh empty array if `hasError` is set
 */
function nonErrorChildrenStrict(node) {
    if (node.hasError) {
        return [];
    }
    return node.children;
}
/**
 * Lax way of obtaining the children of a node:
 * if the node reports an error, only the children of type `TreeSitterType.Error` are dropped.
 * @param node - the tree-sitter node to read `hasError` and `children` from
 * @returns the node's own `children` array if `hasError` is not set, otherwise a filtered copy
 */
function nonErrorChildrenLax(node) {
    if (!node.hasError) {
        return node.children;
    }
    const isNoError = (child) => child.type !== tree_sitter_types_1.TreeSitterType.Error;
    return node.children.filter(isNoError);
}
// The currently active strategy for obtaining the children of a node.
// It starts out strict and is reassigned by makeTreeSitterLax / makeTreeSitterStrict,
// so the choice is shared by every conversion in this module.
let nonErrorChildren = nonErrorChildrenStrict;
/**
 * Globally switch to lax parsing mode for tree-sitter normalization.
 * The switch is logged and afterwards `nonErrorChildren` refers to `nonErrorChildrenLax`.
 * @see {@link makeTreeSitterStrict}
 */
function makeTreeSitterLax() {
    const logger = log_1.log;
    logger.info('[Tree-Sitter] Lax parsing active');
    nonErrorChildren = nonErrorChildrenLax;
}
/**
 * Globally switch to strict parsing mode for tree-sitter normalization.
 * The switch is logged and afterwards `nonErrorChildren` refers to `nonErrorChildrenStrict`.
 * @see {@link makeTreeSitterLax}
 */
function makeTreeSitterStrict() {
    const logger = log_1.log;
    logger.info('[Tree-Sitter] Strict parsing active');
    nonErrorChildren = nonErrorChildrenStrict;
}
/**
 * Converts a single tree-sitter syntax node (and, recursively, its children) into a normalized AST node.
 * The conversion is selected by `node.type` in the switch below; children are obtained through the
 * currently active `nonErrorChildren` strategy.
 * A falsy `node`, a node type without a matching case, and any exception raised during the conversion
 * (which is reported via `parseLog.error`) all result in an empty expression list with `tsId: -1`.
 * @param node - the tree-sitter node to convert (may be null or undefined)
 * @returns the normalized node created for `node`
 */
function convertTreeNode(node) {
// nothing to convert: answer with an empty expression list
if (!node) {
return {
type: type_1.RType.ExpressionList,
location: undefined,
lexeme: undefined,
children: [],
grouping: undefined,
info: {
fullRange: undefined,
adToks: [],
tsId: -1,
}
};
}
try {
// generally, the grammar source file dictates what children a node has in what order:
// https://github.com/r-lib/tree-sitter-r/blob/main/grammar.js
const range = makeSourceRange(node);
// info shared by many of the results below: range and text of the whole node plus its tree-sitter id
const defaultInfo = {
info: {
fullRange: range,
adToks: [],
fullLexeme: node.text,
tsId: node.id
}
};
switch (node.type) {
case tree_sitter_types_1.TreeSitterType.Program: {
// comments are separated from the other children and handed to linkCommentsToNextNodes,
// which attaches them to the converted children; whatever it returns stays on this list
const [comments, children] = splitComments(nonErrorChildren(node));
const body = children.map(n => [n, convertTreeNode(n)]);
const remainingComments = linkCommentsToNextNodes(body, comments);
return {
type: type_1.RType.ExpressionList,
children: body.map(n => n[1]),
grouping: undefined,
lexeme: undefined,
info: {
adToks: remainingComments.map(c => c[1]),
tsId: node.id
}
};
}
case tree_sitter_types_1.TreeSitterType.BracedExpression:
case tree_sitter_types_1.TreeSitterType.ParenthesizedExpression: {
// the first and the last non-comment child are taken as the opening and closing token,
// everything in between forms the body of the expression list
const [comments, children] = splitComments(nonErrorChildren(node));
const opening = children[0];
const body = children.slice(1, -1).map(n => [n, convertTreeNode(n)]);
const remainingComments = linkCommentsToNextNodes(body, comments);
const closing = children[children.length - 1];
return {
type: type_1.RType.ExpressionList,
location: undefined,
lexeme: undefined,
children: body.map(n => n[1]),
// both grouping symbols carry the info of the whole expression (defaultInfo), not of the single token
grouping: [
{
type: type_1.RType.Symbol,
location: makeSourceRange(opening),
content: (0, retriever_1.removeRQuotes)(opening.text),
lexeme: opening.text,
...defaultInfo
}, {
type: type_1.RType.Symbol,
location: makeSourceRange(closing),
content: (0, retriever_1.removeRQuotes)(closing.text),
lexeme: closing.text,
...defaultInfo
}
],
info: {
adToks: remainingComments.map(c => c[1]),
tsId: node.id
}
};
}
case tree_sitter_types_1.TreeSitterType.BinaryOperator: {
// the first child is the lhs and the last child the rhs;
// the operator (and any comments) are taken from the children in between
const children = nonErrorChildren(node);
const lhs = convertTreeNode(children[0]);
const rhs = convertTreeNode(children[children.length - 1]);
const [commentsBoth, [op]] = splitComments(children.slice(1, -1));
const comments = commentsBoth.map(c => c[1]);
const opSource = makeSourceRange(op);
// the lhs wrapped as an unnamed argument, used by the function-call and the pipe form below
const lhsAsArg = {
type: type_1.RType.Argument,
location: lhs.location,
value: lhs,
name: undefined,
lexeme: lhs.lexeme,
info: {
tsId: lhs.info.tsId
}
};
// operators of tree-sitter type 'special' become a named function call with lhs and rhs as arguments
if (op.type === 'special') {
return {
type: type_1.RType.FunctionCall,
location: opSource,
lexeme: node.text,
functionName: {
type: type_1.RType.Symbol,
location: opSource,
lexeme: op.text,
content: op.text,
info: {
tsId: op.id
}
},
arguments: [lhsAsArg, {
type: type_1.RType.Argument,
location: rhs.location,
value: rhs,
name: undefined,
lexeme: rhs.lexeme,
info: {
tsId: rhs.info.tsId
}
}],
named: true,
infixSpecial: true,
info: {
adToks: comments,
tsId: node.id
}
};
}
// the pipe keeps the lhs as an argument and the rhs as converted
else if (op.text === '|>') {
return {
type: type_1.RType.Pipe,
location: opSource,
lhs: lhsAsArg,
rhs,
lexeme: op.text,
// the explicit info below replaces the one spread in from defaultInfo
...defaultInfo,
info: {
fullRange: range,
adToks: comments,
fullLexeme: node.text,
tsId: node.id
}
};
}
// every other operator is a plain binary operation
else {
return {
type: type_1.RType.BinaryOp,
location: opSource,
lhs, rhs,
operator: op.text,
lexeme: op.text,
info: {
fullRange: range,
adToks: comments,
fullLexeme: node.text,
tsId: node.id
}
};
}
}
case tree_sitter_types_1.TreeSitterType.UnaryOperator: {
// after removing comments, the children are the operator followed by its operand
const [comments, children] = splitComments(nonErrorChildren(node));
const [op, operand] = children;
return {
type: type_1.RType.UnaryOp,
operand: convertTreeNode(operand),
location: makeSourceRange(op),
operator: op.text,
lexeme: op.text,
info: {
...defaultInfo.info,
adToks: comments.map(c => c[1]),
}
};
}
case tree_sitter_types_1.TreeSitterType.NamespaceOperator: {
// children are read as: left side, the operator token, right side;
// the result is a symbol located at the right side, and Identifier.make receives
// the right text, the left text, and whether the operator is ':::'
const [lhs, int, rhs] = nonErrorChildren(node);
return {
type: type_1.RType.Symbol,
location: makeSourceRange(rhs),
content: identifier_1.Identifier.make(rhs.text, lhs.text, int.text === ':::'),
lexeme: rhs.text,
...defaultInfo
};
}
// all of the following node types are represented as symbols
case '(':
case ')':
case tree_sitter_types_1.TreeSitterType.Na:
case tree_sitter_types_1.TreeSitterType.Null:
case tree_sitter_types_1.TreeSitterType.Dots:
case tree_sitter_types_1.TreeSitterType.DotDotI:
case tree_sitter_types_1.TreeSitterType.Identifier:
case tree_sitter_types_1.TreeSitterType.Return:
return {
type: type_1.RType.Symbol,
location: range,
// text wrapped in backticks loses them, everything else goes through removeRQuotes
content: (0, strings_1.startAndEndsWith)(node.text, '`') ? node.text.slice(1, -1) : (0, retriever_1.removeRQuotes)(node.text),
lexeme: node.text,
...defaultInfo
};
case tree_sitter_types_1.TreeSitterType.IfStatement: {
const [comments, children] = splitComments(nonErrorChildren(node));
// positions 1, 3 and 5 (the tokens named in the inline comments) are skipped
const [ifNode, /* ( */ , condition, /* ) */ , then, /* else */ , ...otherwise] = children;
// of the remaining children, the first one that is not an else-statement node is used as the else branch
const filteredOtherwise = otherwise.filter(n => n.type !== tree_sitter_types_1.TreeSitterType.ElseStatement);
return {
type: type_1.RType.IfThenElse,
condition: convertTreeNode(condition),
then: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(then)),
otherwise: filteredOtherwise.length > 0 ? (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(filteredOtherwise[0])) : undefined,
location: makeSourceRange(ifNode),
lexeme: ifNode.text,
info: {
...defaultInfo.info,
adToks: comments.map(c => c[1]),
}
};
}
case tree_sitter_types_1.TreeSitterType.ForStatement: {
// the positions below are computed on the comment-free children:
// index 0 is the `for`, index 1 is skipped, the variable nodes start at index 2
const [comments, children] = splitComments(nonErrorChildren(node));
const forNode = children[0]; // we follow with a (
const variable = getNodesUntil(children, 'in', 2); // we follow with the "in"
const sequence = getNodesUntil(children, ')', 2 + variable.length + 1); // we follow with a )
// the body is the child right after the token that ended the sequence
const body = children[2 + variable.length + 1 + sequence.length + 1];
const [variableComments, [variableNode]] = splitComments(variable);
const [sequenceComments, [sequenceNode]] = splitComments(sequence);
return {
type: type_1.RType.ForLoop,
variable: {
type: type_1.RType.Symbol,
location: makeSourceRange(variableNode),
content: (0, retriever_1.removeRQuotes)(variableNode.text),
lexeme: variableNode.text,
info: {
fullRange: undefined,
adToks: [],
fullLexeme: undefined,
tsId: variableNode.id
}
},
vector: convertTreeNode(sequenceNode),
body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)),
location: makeSourceRange(forNode),
lexeme: forNode.text,
info: {
fullRange: range,
adToks: variableComments.concat(comments, sequenceComments).map(c => c[1]),
fullLexeme: node.text,
tsId: node.id
}
};
}
case tree_sitter_types_1.TreeSitterType.WhileStatement: {
const [comments, children] = splitComments(nonErrorChildren(node));
// positions 1 and 3 (the tokens named in the inline comments) are skipped
const [whileNode, /* ( */ , condition, /* ) */ , body] = children;
return {
type: type_1.RType.WhileLoop,
condition: convertTreeNode(condition),
body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)),
location: makeSourceRange(whileNode),
lexeme: whileNode.text,
info: {
...defaultInfo.info,
adToks: comments.map(c => c[1]),
}
};
}
case tree_sitter_types_1.TreeSitterType.RepeatStatement: {
// after removing comments, the children are the `repeat` token followed by the body
const [comments, [repeatNode, body]] = splitComments(nonErrorChildren(node));
return {
type: type_1.RType.RepeatLoop,
body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)),
location: makeSourceRange(repeatNode),
lexeme: repeatNode.text,
info: {
...defaultInfo.info,
adToks: comments.map(c => c[1]),
}
};
}
case tree_sitter_types_1.TreeSitterType.Call: {
const [func, argsParentheses] = nonErrorChildren(node);
// tree-sitter wraps next and break in a function call, but we don't, so unwrap
if (func.type === tree_sitter_types_1.TreeSitterType.Next || func.type === tree_sitter_types_1.TreeSitterType.Break) {
return {
...convertTreeNode(func),
...defaultInfo
};
}
const rawArgs = nonErrorChildren(argsParentheses);
const [comments, noCommentrawArgs] = splitComments(rawArgs);
// drop the first and last entry (the surrounding tokens) and split the rest on commas;
// a slot without any node becomes an EmptyArgument
const args = (0, arrays_1.splitArrayOn)(noCommentrawArgs.slice(1, -1), x => x.type === 'comma');
const funcRange = makeSourceRange(func);
const mappedArgs = args.map(n => n.length === 0 ? r_function_call_1.EmptyArgument : convertTreeNode(n[0]));
// the parts shared by the named and the unnamed call form below
const call = {
arguments: mappedArgs,
location: funcRange,
lexeme: func.text,
// the explicit info below replaces the one spread in from defaultInfo
...defaultInfo,
info: {
...defaultInfo.info,
adToks: comments.map(c => c[1]),
}
};
// calls through an identifier, a string, a namespaced symbol, or `return` are named calls
if (func.type === tree_sitter_types_1.TreeSitterType.Identifier || func.type === tree_sitter_types_1.TreeSitterType.String || func.type === tree_sitter_types_1.TreeSitterType.NamespaceOperator || func.type === tree_sitter_types_1.TreeSitterType.Return) {
let funcNode = convertTreeNode(func);
// a string in function position is turned into a symbol with the quotes removed
if (funcNode.type === type_1.RType.String) {
funcNode = {
...funcNode,
type: type_1.RType.Symbol,
content: (0, retriever_1.removeRQuotes)(func.text)
};
}
return {
...call,
type: type_1.RType.FunctionCall,
functionName: {
...funcNode,
// the function name carries the range, text and id of the whole call node
info: {
fullRange: range,
adToks: [],
fullLexeme: node.text,
tsId: node.id
}
},
named: true
};
}
// everything else is kept as the converted expression that is being called
else {
return {
...call,
type: type_1.RType.FunctionCall,
calledFunction: convertTreeNode(func),
named: undefined
};
}
}
case tree_sitter_types_1.TreeSitterType.FunctionDefinition: {
// children are read as: the leading token (used for location and lexeme), the parameter list, the body
const [name, paramsParens, body] = nonErrorChildren(node);
// note: the parameter list is read via `.children` directly and not via nonErrorChildren;
// its first and last child are dropped and the rest is split on commas
const [comments, noCommentRawParams] = splitComments(paramsParens.children.slice(1, -1));
const params = (0, arrays_1.splitArrayOn)(noCommentRawParams, x => x.type === 'comma');
return {
type: type_1.RType.FunctionDefinition,
parameters: params.map(n => convertTreeNode(n[0])),
body: (0, normalize_meta_1.ensureExpressionList)(convertTreeNode(body)),
location: makeSourceRange(name),
lexeme: name.text,
info: {
...defaultInfo.info,
adToks: comments.map(c => c[1]),
}
};
}
case tree_sitter_types_1.TreeSitterType.String:
return {
type: type_1.RType.String,
location: range,
content: convert_values_1.RStringValue.fromRLexeme(node.text),
lexeme: node.text,
...defaultInfo
};
// all of the following node types are represented as numbers
case tree_sitter_types_1.TreeSitterType.Float:
case tree_sitter_types_1.TreeSitterType.Integer:
case tree_sitter_types_1.TreeSitterType.Complex:
case tree_sitter_types_1.TreeSitterType.Inf:
case tree_sitter_types_1.TreeSitterType.Nan:
return {
type: type_1.RType.Number,
location: range,
content: convert_values_1.RNumberValue.fromRLexeme(node.text),
lexeme: node.text,
...defaultInfo
};
case tree_sitter_types_1.TreeSitterType.True:
case tree_sitter_types_1.TreeSitterType.False:
return {
type: type_1.RType.Logical,
location: range,
// true exactly if the text equals the RTrue constant
content: node.text === convert_values_1.RTrue,
lexeme: node.text,
...defaultInfo
};
case tree_sitter_types_1.TreeSitterType.Break:
return {
type: type_1.RType.Break,
location: range,
lexeme: node.text,
...defaultInfo
};
case tree_sitter_types_1.TreeSitterType.Next:
return {
type: type_1.RType.Next,
location: range,
lexeme: node.text,
...defaultInfo
};
case tree_sitter_types_1.TreeSitterType.Subset:
case tree_sitter_types_1.TreeSitterType.Subset2: {
// subset has children like a and [x]
const [func, content] = nonErrorChildren(node);
// bracket is now [ or [[ and argsClosing is x] or x]]
const [bracket, ...argsClosing] = nonErrorChildren(content);
// the last entry (the closing token) is dropped, the rest is split on commas;
// a slot without any node becomes an EmptyArgument
const [argsComments, argsNoComments] = splitComments(argsClosing.slice(0, -1));
const args = (0, arrays_1.splitArrayOn)(argsNoComments, x => x.type === 'comma');
return {
type: type_1.RType.Access,
operator: bracket.text,
accessed: convertTreeNode(func),
access: args.map(n => n.length === 0 ? r_function_call_1.EmptyArgument : convertTreeNode(n[0])),
location: makeSourceRange(bracket),
lexeme: bracket.text,
info: {
...defaultInfo.info,
adToks: argsComments.map(c => c[1]),
}
};
}
case tree_sitter_types_1.TreeSitterType.ExtractOperator: {
// children are read as: the accessed expression, the operator token, the accessed name;
// the right side may be missing, which is why it is only accessed with `?.` below
const [lhs, operator, rhs] = nonErrorChildren(node);
const rhsRange = makeSourceRange(rhs);
return {
type: type_1.RType.Access,
operator: operator.text,
accessed: convertTreeNode(lhs),
access: [{
type: type_1.RType.Argument,
name: undefined,
// the converted right side, but with the info of the whole access expression
value: {
...convertTreeNode(rhs),
...defaultInfo
},
location: rhsRange,
lexeme: rhs?.text,
info: {
fullRange: rhsRange,
adToks: [],
fullLexeme: rhs?.text,
tsId: rhs?.id
}
}],
location: makeSourceRange(operator),
lexeme: operator.text,
...defaultInfo
};
}
case tree_sitter_types_1.TreeSitterType.Parameter: {
const children = nonErrorChildren(node);
const name = children[0];
const nameRange = makeSourceRange(name);
let defaultValue = undefined;
// with exactly three children, the third one is converted as the default value
// (presumably name, '=', value; verify against the grammar)
if (children.length === 3) {
defaultValue = convertTreeNode(children[2]);
}
return {
type: type_1.RType.Parameter,
name: {
type: type_1.RType.Symbol,
location: nameRange,
content: name.text,
lexeme: name.text,
info: {
fullRange: range,
adToks: [],
fullLexeme: name.text,
tsId: name.id
}
},
special: name.text === '...',
defaultValue,
location: nameRange,
lexeme: name.text,
info: {
fullRange: range,
adToks: [],
fullLexeme: name.text,
tsId: name.id
}
};
}
case tree_sitter_types_1.TreeSitterType.Argument: {
const [commentChildren, children] = splitComments(nonErrorChildren(node));
// a single child is an unnamed argument
if (children.length === 1) {
const [arg] = children;
return {
type: type_1.RType.Argument,
name: undefined,
value: convertTreeNode(arg),
location: range,
lexeme: node.text,
info: {
...defaultInfo.info,
adToks: commentChildren.map(c => c[1]),
}
};
}
// otherwise the children are read as: name, a skipped token, and an optional value
else {
const [nameNode, /* = */ , valueNode] = children;
let name = convertTreeNode(nameNode);
// unescape argument names
if (name.type === type_1.RType.String) {
name = {
...name,
type: type_1.RType.Symbol,
content: name.content.str
};
}
else if ((0, strings_1.startAndEndsWith)(name.content, '`')) {
name.content = name.content.slice(1, -1);
}
const nameRange = makeSourceRange(nameNode);
return {
type: type_1.RType.Argument,
name,
// a named argument without a value keeps `undefined` as its value
value: valueNode ? convertTreeNode(valueNode) : undefined,
location: nameRange,
lexeme: nameNode.text,
info: {
fullRange: nameRange,
adToks: commentChildren.map(c => c[1]),
fullLexeme: nameNode.text,
tsId: nameNode.id
}
};
}
}
case tree_sitter_types_1.TreeSitterType.Comment:
return {
type: type_1.RType.Comment,
location: range,
lexeme: node.text,
...defaultInfo
};
// an error node is represented by an empty expression list that still carries the node's info
case tree_sitter_types_1.TreeSitterType.Error:
return {
type: type_1.RType.ExpressionList,
location: undefined,
lexeme: undefined,
children: [],
grouping: undefined,
...defaultInfo
};
}
}
catch {
// the failure is logged (without the caught error itself) and answered with the empty fallback below
parser_1.parseLog.error(`[Tree-Sitter] Failed to convert node of type ${node.type} at ${JSON.stringify(makeSourceRange(node))}`);
}
// reached for node types without a matching case and after a logged failure
return {
type: type_1.RType.ExpressionList,
location: undefined,
lexeme: undefined,
children: [],
grouping: undefined,
info: {
fullRange: undefined,
adToks: [],
tsId: -1,
}
};
}
/**
 * Builds the source range `[startLine, startColumn, endLine, endColumn]` for a tree-sitter node.
 * Missing position information results in `-1` for the affected entries.
 * @param node - the node to read `startPosition` and `endPosition` from (may be null or undefined)
 * @returns the range of the node, or `SourceRange.invalid()` if there is no node
 */
function makeSourceRange(node) {
    if (!node) {
        return range_1.SourceRange.invalid();
    }
    const { startPosition: start, endPosition: end } = node;
    // tree-sitter is 0-based but we want 1-based
    const startLine = (start?.row ?? -2) + 1;
    const startColumn = (start?.column ?? -2) + 1;
    const endLine = (end?.row ?? -2) + 1;
    // tree-sitter's end position is one off from ours, so we don't add 1 here
    const endColumn = end?.column ?? -1;
    return [startLine, startColumn, endLine, endColumn];
}
/**
 * Partitions the given tree-sitter nodes into comments and everything else, keeping the order within both groups.
 * @param nodes - the nodes to partition
 * @returns a pair `[comments, others]`: `comments` holds `[syntaxNode, commentNode]` pairs in which
 *          `commentNode` is the normalized comment built for the syntax node; `others` holds the untouched non-comment nodes
 */
function splitComments(nodes) {
    const commentPairs = [];
    const remaining = [];
    for (const current of nodes) {
        const isComment = current.type === tree_sitter_types_1.TreeSitterType.Comment;
        if (!isComment) {
            remaining.push(current);
            continue;
        }
        const normalized = {
            type: type_1.RType.Comment,
            location: makeSourceRange(current),
            lexeme: current.text,
            info: {
                adToks: [],
                fullLexeme: current.text,
                tsId: current.id
            }
        };
        commentPairs.push([current, normalized]);
    }
    return [commentPairs, remaining];
}
/**
 * Find the first sibling to the right of the given node that is not a comment.
 * If only comments follow, the last of these comments is returned; if nothing follows at all, the node itself is returned.
 * The result is stored in `knownNexts` for the given node and for every comment that was skipped on the way,
 * so that asking for any of those comments afterwards is answered from the cache.
 * @param snode      - the node for which to find the first non-comment sibling
 * @param knownNexts - cache map from node id to the sibling node that was found for it (updated by this function)
 * @returns the sibling node as described above, or `null` if the node has no parent to walk
 */
function findFirstNonCommentSibling(snode, knownNexts) {
    const cached = knownNexts.get(snode.id);
    if (cached !== undefined) {
        return cached;
    }
    const cursor = snode.parent?.walk();
    if (!cursor) {
        return null;
    }
    try {
        // ids of all nodes that resolve to the same target: the start node and every comment we step over
        const resolvedIds = [snode.id];
        cursor.gotoFirstChild();
        // move the cursor onto the start node
        while (cursor.nodeId !== snode.id && cursor.gotoNextSibling()) {
            /* skip */
        }
        cursor.gotoNextSibling();
        while (cursor.nodeType === tree_sitter_types_1.TreeSitterType.Comment) {
            // remember the id *before* moving on, so that the skipped comment itself is cached (and not its successor)
            const skippedId = cursor.nodeId;
            if (!cursor.gotoNextSibling()) {
                break;
            }
            resolvedIds.push(skippedId);
        }
        const target = cursor.currentNode;
        for (const id of resolvedIds) {
            knownNexts.set(id, target);
        }
        return target;
    }
    finally {
        // release the cursor even if walking the tree fails
        cursor.delete();
    }
}
/**
 * Attaches every comment to one of the converted nodes by appending it to that node's `info.adToks`
 * (which is created if missing).
 * A comment is linked to its previous sibling if that sibling ends exactly where the comment starts;
 * otherwise it is linked to the sibling found by `findFirstNonCommentSibling`.
 * If that sibling is not part of `nodes`, the comment goes to the first entry of `nodes`.
 * @param nodes    - pairs of `[syntaxNode, convertedNode]` that comments may be attached to
 * @param comments - pairs of `[commentSyntaxNode, commentNode]` to distribute
 * @returns the comment pairs that could not be attached to any node (which happens when `nodes` is empty)
 */
function linkCommentsToNextNodes(nodes, comments) {
    const remain = [];
    if (comments.length === 0) {
        return remain;
    }
    // index the candidates once, so each comment needs a single lookup instead of a scan over all nodes
    const pairBySyntaxId = new Map();
    for (const pair of nodes) {
        const syntaxId = pair[0].id;
        // keep the first pair per id, which is what a linear search would have found
        if (!pairBySyntaxId.has(syntaxId)) {
            pairBySyntaxId.set(syntaxId, pair);
        }
    }
    const cacheMap = new Map();
    for (const [commentSyntaxNode, commentNode] of comments) {
        const prev = commentSyntaxNode.previousSibling;
        // a previous sibling that ends exactly where the comment starts takes the comment,
        // otherwise we look for the first non-comment sibling to the right
        const sibling = prev?.endIndex === commentSyntaxNode.startIndex
            ? prev
            : findFirstNonCommentSibling(commentSyntaxNode, cacheMap);
        // if there is no valid sibling, we just link the comment to the first node (see normalize-expressions.ts)
        const [, node] = (sibling ? pairBySyntaxId.get(sibling.id) : undefined) ?? nodes[0] ?? [];
        if (node) {
            node.info.adToks ??= [];
            node.info.adToks.push(commentNode);
        }
        else {
            remain.push([commentSyntaxNode, commentNode]);
        }
    }
    return remain;
}
/**
 * Collects the nodes from `startIndex` up to (but not including) the first node of the given type.
 * If no such node follows, all nodes from `startIndex` to the end are returned.
 * @param nodes      - the nodes to search in
 * @param type       - the node type at which to stop
 * @param startIndex - the index of the first node to consider (defaults to 0)
 * @returns a new array with the collected nodes
 */
function getNodesUntil(nodes, type, startIndex = 0) {
    let stop = startIndex;
    while (stop < nodes.length && nodes[stop].type !== type) {
        stop++;
    }
    return nodes.slice(startIndex, stop);
}
//# sourceMappingURL=tree-sitter-normalize.js.map