UNPKG

@eagleoutice/flowr

Version:

Static Dataflow Analyzer and Program Slicer for the R Programming Language

761 lines (634 loc) 71.5 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const graph_1 = require("../dataflow/graph/graph"); const shell_1 = require("../r-bridge/shell"); const vertex_1 = require("../dataflow/graph/vertex"); const edge_1 = require("../dataflow/graph/edge"); const dataflowgraph_builder_1 = require("../dataflow/graph/dataflowgraph-builder"); const assert_1 = require("../util/assert"); const doc_dfg_1 = require("./doc-util/doc-dfg"); const doc_files_1 = require("./doc-util/doc-files"); const pipeline_executor_1 = require("../core/pipeline-executor"); const retriever_1 = require("../r-bridge/retriever"); const json_1 = require("../util/json"); const doc_env_1 = require("./doc-util/doc-env"); const doc_data_dfg_util_1 = require("./data/dfg/doc-data-dfg-util"); const doc_cli_option_1 = require("./doc-util/doc-cli-option"); const doc_types_1 = require("./doc-util/doc-types"); const doc_structure_1 = require("./doc-util/doc-structure"); const doc_code_1 = require("./doc-util/doc-code"); const path_1 = __importDefault(require("path")); const doc_general_1 = require("./doc-util/doc-general"); const node_id_1 = require("../r-bridge/lang-4.x/ast/model/processing/node-id"); const identifier_1 = require("../dataflow/environments/identifier"); const r_function_call_1 = require("../r-bridge/lang-4.x/ast/model/nodes/r-function-call"); const resolve_by_name_1 = require("../dataflow/environments/resolve-by-name"); const default_pipelines_1 = require("../core/steps/pipeline/default-pipelines"); const doc_auto_gen_1 = require("./doc-util/doc-auto-gen"); const text_1 = require("../util/text/text"); const log_1 = require("../../test/functionality/_helper/log"); const linker_1 = require("../dataflow/internal/linker"); const doc_normalized_ast_1 = require("./doc-util/doc-normalized-ast"); const dfg_get_origin_1 = 
require("../dataflow/origin/dfg-get-origin"); const identify_link_to_last_call_relation_1 = require("../queries/catalog/call-context-query/identify-link-to-last-call-relation"); const alias_tracking_1 = require("../dataflow/eval/resolve/alias-tracking"); const doc_issue_1 = require("./doc-util/doc-issue"); const unnamed_call_handling_1 = require("../dataflow/internal/process/functions/call/unnamed-call-handling"); const environment_builder_1 = require("../../test/functionality/_helper/dataflow/environment-builder"); const config_1 = require("../config"); async function subExplanation(shell, { description, code, expectedSubgraph }) { expectedSubgraph = await (0, doc_dfg_1.verifyExpectedSubgraph)(shell, code, expectedSubgraph); const marks = []; for (const [id] of expectedSubgraph.vertices(true)) { marks.push(id); } for (const [from, targets] of expectedSubgraph.edges()) { for (const [to] of targets) { marks.push(`${from}->${to}`); } } return ` ${await (0, doc_dfg_1.printDfGraphForCode)(shell, code, { mark: new Set(marks) })} ${description}`; } async function printAllSubExplanations(shell, expls) { let result = ` <details> <summary>Additional Case${expls.length > 1 ? 's' : ''}</summary> `; for (const sub of expls) { result += `#### ${sub.name}\n`; result += await subExplanation(shell, sub) + '\n'; } return result + '\n\n</details>'; } async function explanation({ shell, name, type, description, code, expectedSubgraph }, index, ...subExplanations) { await (0, doc_dfg_1.verifyExpectedSubgraph)(shell, code, expectedSubgraph); return ` <a id='${name.toLowerCase().replaceAll(' ', '-')}'> </a> ### ${index}) ${name} Type: \`${type}\` (this is the bit-flag value, e.g., when looking at the serialization) ${await subExplanation(shell, { name, description, code, expectedSubgraph })} ${subExplanations.length > 0 ? 
await printAllSubExplanations(shell, subExplanations) : ''} `; } function edgeTypeToId(edgeType) { return (0, edge_1.edgeTypeToName)(edgeType).toLowerCase().replaceAll(' ', '-'); } function linkEdgeName(edgeType, page = '') { return `[\`${(0, edge_1.edgeTypeToName)(edgeType)}\`](${page}#${edgeTypeToId(edgeType)})`; } async function getVertexExplanations(shell, vertexType) { /* we use the map to ensure order easily :D */ const vertexExplanations = new Map(); vertexExplanations.set(vertex_1.VertexType.Value, [{ shell, name: 'Value Vertex', type: vertex_1.VertexType.Value, description: ` Describes a constant value (numbers, booleans/logicals, strings, ...). In general, the respective vertex is more or less a dummy vertex as you can see from its implementation. ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'DataflowGraphVertexValue' })} ${(0, doc_structure_1.block)({ type: 'NOTE', content: ` The value is not stored in the vertex itself, but in the normalized AST. To access the value, you can use the \`id\` of the vertex to access the respective node in the [normalized AST](${doc_files_1.FlowrWikiBaseRef}/Normalized%20AST) and ask for the value associated with it. ` })} Please be aware that such nodes may be the result from language semantics as well, and not just from constants directly in the source. For example, an access operation like \`df$column\` will treat the column name as a constant value. ${(0, doc_structure_1.details)('Example: Semantics Create a Value', `In the following graph, the original type printed by mermaid is still \`RSymbol\` (from the [normalized AST](${doc_files_1.FlowrWikiBaseRef}/Normalized%20AST)), however, the shape of the vertex signals to you that the symbol is in-fact treated as a constant! 
If you do not know what \`df$column\` even means, please refer to the [R topic](https://rdrr.io/r/base/Extract.html).\n` + await (0, doc_dfg_1.printDfGraphForCode)(shell, 'df$column', { mark: new Set([1]) }))} `, code: '42', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().constant('0') }, []]); vertexExplanations.set(vertex_1.VertexType.Use, [{ shell, name: 'Use Vertex', type: vertex_1.VertexType.Use, description: ` Describes symbol/variable references which are read (or potentially read at a given position). Similar to the [value vertex](#value-vertex) described above, this is more a marker vertex as you can see from the implementation. ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'DataflowGraphVertexUse' })} ${(0, doc_structure_1.block)({ type: 'NOTE', content: ` The name of the symbol is not actually part of what we store in the dataflow graph, as we have it within the normalized AST. To access the name, you can use the \`id\` of the vertex: ${(0, doc_code_1.codeBlock)('ts', `const name = ${node_id_1.recoverName.name}(id, graph.idMap);`)} ` })} Most often, you will see the _use_ vertex whenever a variable is read. However, similar to the [value vertex](#value-vertex), the _use_ vertex can also be the result of language semantics. Consider a case, in which we refer to a variable with a string, as in \`get("x")\`. ${(0, doc_structure_1.details)('Example: Semantics Create a Symbol', `In the following graph, the original type printed by mermaid is still \`RString\` (from the [normalized AST](${doc_files_1.FlowrWikiBaseRef}/Normalized%20AST)), however, the shape of the vertex signals to you that the symbol is in-fact treated as a variable use! ` + 'If you are unsure what `get` does, refer to the [documentation](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/get). 
' + 'Please note, that the lexeme being printed as `"x"` may be misleading (after all it is recovered from the AST), the quotes are not part of the reference.\n' + await (0, doc_dfg_1.printDfGraphForCode)(shell, 'get("x")', { mark: new Set([1]) }))} But now to the interesting stuff: how do we actually know which values are read by the respective variable use? This usually involves a [variable definition](#variable-definition-vertex) and a [reads edge](#reads-edge) linking the two. ${(0, doc_structure_1.details)('Example: Reads Edge Identifying a Single Definition', 'In the following graph, the `x` is read from the definition `x <- 1`.\n' + await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 1\nprint(x)', { mark: new Set([3, '0->3']), codeOpen: true }))} In general, there may be many such edges, identifying every possible definition of the variable. ${(0, doc_structure_1.details)('Example: Reads Edge Identifying Multiple Definitions (conditional)', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 1\nif(u) x <- 2\nprint(x)', { mark: new Set([10, '10->0', '10->4']), codeOpen: true }))} ${(0, doc_structure_1.details)('Example: Reads Edge Identifying Multiple Definitions (loop)', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 1\nfor(i in v) x <- 2\nprint(x)', { mark: new Set([11, '11->0', '11->5']), codeOpen: true }))} ${(0, doc_structure_1.details)('Example: Reads Edge Identifying Multiple Definitions (side-effect)', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'f <- function() x <<- 2\nx <- 2\nif(u) f()\nprint(x)', { mark: new Set([16, '16->1', '16->7']), codeOpen: true }))} ${(0, doc_structure_1.block)({ type: 'IMPORTANT', content: ` If you want to obtain the locations where a variable is defined, or read, or re-defined, refrain from tracking these details manually in the dataflow graph as there are some edge-cases that require special attention. 
In general, the ${(0, doc_types_1.shortLink)(dfg_get_origin_1.getOriginInDfg.name, vertexType.info)} function explained below in [working with the dataflow graph](${doc_files_1.FlowrWikiBaseRef}/Working%20with%20the%20Dataflow%20Graph) will help you to get the information you need. ` })} `, code: 'x', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().use('1@x', 'x') }, []]); vertexExplanations.set(vertex_1.VertexType.FunctionCall, [{ shell, name: 'Function Call Vertex', type: vertex_1.VertexType.FunctionCall, description: ` Describes any kind of function call, including unnamed calls and those that happen implicitly! In general the vertex provides you with information about the _name_ of the called function, the passed _arguments_, and the _environment_ in which the call happens (if it is of importance). However, the implementation reveals that it may hold an additional \`onlyBuiltin\` flag to indicate that the call is only calling builtin functions &mdash; however, this is only a flag to improve performance, and it should not be relied on as it may under-approximate the actual calling targets (e.g., being \`false\` even though all calls resolve to builtins). ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'DataflowGraphVertexFunctionCall' })} The related function argument references are defined like this: ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'FunctionArgument' })} There is another element of potential interest to you, the \`origin\` property which records how flowR created the respective function call. These origins may hold the name of any processor that is part of the ${(0, doc_types_1.shortLink)('BuiltInProcessorMapper', vertexType.info)} to signal that the respective processor was responsible for creating the vertex. 
The entry \`function\` signals that flowR used a processor for a user-defined function defined within the source code, \`unnamed\` signals that the function as an anonymous function definition. However, in general, flowR may use any fitting handler as an origin. For example, within a access definition, flowR will correspondingly redefine the meaning of \`:=\` to that of the \`table:assign\`. ${(0, doc_structure_1.details)('Example: Simple Function Call (unresolved)', await (async () => { const code = 'foo(x,3,y=3,)'; const [text, info] = await (0, doc_dfg_1.printDfGraphForCode)(shell, code, { mark: new Set([8]), exposeResult: true }); const callInfo = [...info.dataflow.graph.vertices(true)].find(([, vertex]) => vertex.tag === vertex_1.VertexType.FunctionCall && vertex.name === 'foo'); (0, assert_1.guard)(callInfo !== undefined, () => `Could not find call vertex for ${code}`); const [callId, callVert] = callInfo; const inverseMapReferenceTypes = Object.fromEntries(Object.entries(identifier_1.ReferenceType).map(([k, v]) => [v, k])); const identifierType = (0, doc_types_1.getTypesFromFolder)({ files: [path_1.default.resolve('./src/dataflow/environments/identifier.ts')], inlineTypes: ['ControlDependency'] }); return ` To get a better understanding, let's look at a simple function call without any known call target, like \`${code}\`: ${text} In this case, we have a function call vertex with id \`${callId}\` and the following arguments: ${(0, doc_code_1.codeBlock)('json', JSON.stringify(callVert.args, json_1.jsonReplacer, 2))} Of course now, this is hard to read in this form (although the ids of the arguments can be mapped pretty easily to the visualization), as the \`type\` of these references is a bit-mask, encoding one of the following reference types: | Value | Reference Type | |------:|----------------| ${Object.values(identifier_1.ReferenceType).filter(k => typeof k === 'string').map(k => `| ${identifier_1.ReferenceType[k]} | ${k} |`).join('\n')} In other words, we 
classify the references as ${(0, doc_general_1.lastJoin)(callVert.args.map(a => { if (a === r_function_call_1.EmptyArgument) { return `the (special) empty argument type (\`${r_function_call_1.EmptyArgument}\`)`; } else { return inverseMapReferenceTypes[a.type]; } }), ', ', ', and ')}. For more information on the types of references, please consult the implementation. ${(0, doc_types_1.printHierarchy)({ program: identifierType.program, info: identifierType.info, root: 'ReferenceType' })} `; })())} ${(0, doc_structure_1.block)({ type: 'NOTE', content: ` But how do you know which definitions are actually called by the function? So first of all, some frontends of _flowR_ (like the ${(0, doc_cli_option_1.getReplCommand)('slicer')} and ${(0, doc_cli_option_1.getReplCommand)('query')} with the [Query API](${doc_files_1.FlowrWikiBaseRef}/Query%20API)) already provide you with this information. In general there are three scenarios you may be interested in: ${(0, doc_structure_1.details)('1) the function resolves only to builtin definitions (like <code><-</code>)', ` Let's have a look at a simple assignment: ${await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 2')} In this case, the call does not have a single ${linkEdgeName(edge_1.EdgeType.Calls)} edge, which in general means (i.e., if the analysis is done and you are not looking at an intermediate result) it is bound to anything global beyond the scope of the given script. _flowR_ generally (theoretically at least) does not know if the call really refers to a built-in variable or function, as any code that is not part of the analysis could cause the semantics to change. However, it is (in most cases) safe to assume we call a builtin if there is a builtin function with the given name and if there is no ${linkEdgeName(edge_1.EdgeType.Calls)} edge attached to a call. If you want to check the resolve targets, refer to ${(0, doc_types_1.shortLink)(resolve_by_name_1.resolveByName.name, vertexType.info)}. 
`)} ${(0, doc_structure_1.details)('2) the function only resolves to definitions that are present in the program', ` Let's have a look at a call to a function named \`foo\` which is defined in the same script: ${await (async () => { const code = 'foo <- function() 3\nfoo()'; const [text, info] = await (0, doc_dfg_1.printDfGraphForCode)(shell, code, { exposeResult: true, mark: new Set([6, '6->0', '6->1', '6->3']) }); const numberOfEdges = [...info.dataflow.graph.edges()].flatMap(e => [...e[1].keys()]).length; const callVertex = info.dataflow.graph.vertices(true).find(([, vertex]) => vertex.tag === vertex_1.VertexType.FunctionCall && vertex.name === 'foo'); (0, assert_1.guard)(callVertex !== undefined, () => `Could not find call vertex for ${code}`); const [callId] = callVertex; return ` ${text} Now, there are several edges, ${numberOfEdges} to be precise, although we are primarily interested in the ${info.dataflow.graph.outgoingEdges(callId)?.size ?? 0} edges going out from the call vertex \`${callId}\`. The ${linkEdgeName(edge_1.EdgeType.Reads)} edge signals all definitions which are read by the \`foo\` identifier (similar to a [use vertex](#use-vertex)). While it seems to be somewhat redundant given the ${linkEdgeName(edge_1.EdgeType.Calls)} edge that identifies the called [function definition](#function-definition-vertex), you have to consider cases in which aliases are involved in the call resolution (e.g., with higher order functions). ${(0, doc_structure_1.details)('Example: Alias in Call Resolution', `In the following example, \`g\` ${linkEdgeName(edge_1.EdgeType.Reads)} the previous definition, but ${linkEdgeName(edge_1.EdgeType.Calls)} the function assigned to \`f\`.\n` + await (0, doc_dfg_1.printDfGraphForCode)(shell, 'f <- function() 3\ng <- f\ng()', { mark: new Set(['9', '9->5', '9->3']) }))} Lastly, the ${linkEdgeName(edge_1.EdgeType.Returns)} edge links the call to the return vertices(s) of the function. 
Please be aware, that these multiple exit points may be counter intuitive as they often appear with a nested call (usually a call to the built-in \`{\` function). ${(0, doc_structure_1.details)('(Advanced) Example: Multiple Exit Points May Still Reflect As One', await (0, doc_dfg_1.printDfGraphForCode)(shell, ` f <- function() { if(u) return(3) if(v) return(2) 1 } f()`.trim(), { mark: new Set([22, '22->18']) }) + ` In this case the call of \`f\` still only has one ${linkEdgeName(edge_1.EdgeType.Returns)} edge, although the function _looks_ as if it would have multiple exit points! But you have to beware that \`{\` is a function call as well (see below) and it may be redefined, or at least affect the actual returns of the function. In this scenario we show two types of such returns (or exit points): _explicit_ returns with the \`return\` function and _implicit_ returns (the result of the last evaluated expression). However, they are actually linked with the call of the built-in function \`{\` (and, in fact, they are highlighted in the mermaid graph). `)} `; })()} `)} ${(0, doc_structure_1.details)('3) the function resolves to a mix of both', ` Users may write… interesting pieces of code - for reasons we should not be interested in! 
Consider a case in which you have a built-in function (like the assignment operator \`<-\`) and a user that wants to redefine the meaning of the function call _sometimes_: ${await (async () => { const [text, info] = await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 2\nif(u) `<-` <- `*`\nx <- 3', { switchCodeAndGraph: true, mark: new Set([9, '9->0', '9->10']), exposeResult: true }); const interestingUseOfAssignment = [...info.dataflow.graph.vertices(true)].find(([, vertex]) => vertex.id === 11); (0, assert_1.guard)(interestingUseOfAssignment !== undefined, () => 'Could not find interesting assignment vertex for the code'); const [id, interestingVertex] = interestingUseOfAssignment; const env = interestingVertex.environment; (0, assert_1.guard)(env !== undefined, () => 'Could not find environment for interesting assignment vertex'); const name = interestingVertex.name; (0, assert_1.guard)(name !== undefined, () => 'Could not find name for interesting assignment vertex'); return ` ${text} Interesting program, right? Running this with \`u <- TRUE\` will cause the last line to evaluate to \`6\` because we redefined the assignment operator to mean multiplication, while with \`u <- FALSE\` causes \`x\` to be assigned to \`3\`. In short: the last line may either refer to a definition or to a use of \`x\`, and we are not fully equipped to visualize this (this causes a warning). First of all how can you spot that something weird is happening? Well, this definition has a ${linkEdgeName(edge_1.EdgeType.Reads)} and a ${linkEdgeName(edge_1.EdgeType.DefinedBy)} edge, but this of course does not apply to the general case. For starters, let's have a look at the environment of the call to \`<-\` in the last line: ${(0, doc_env_1.printEnvironmentToMarkdown)(env.current)} Great, you should see a definition of \`<-\` which is constraint by the [control dependency](#control-dependencies) to the \`if\`. 
Hence, trying to re-resolve the call using \`${linker_1.getAllFunctionCallTargets.name}\` (defined in ${(0, doc_files_1.getFilePathMd)('../dataflow/internal/linker.ts')}) with the id \`${id}\` of the call as starting point will present you with the following target ids: { \`${[...(0, linker_1.getAllFunctionCallTargets)(id, info.dataflow.graph)].join('`, `')}\` }. This way we know that the call may refer to the built-in assignment operator or to the multiplication. Similarly, trying to resolve the name with \`${resolve_by_name_1.resolveByName.name}\` using the environment attached to the call vertex (filtering for any reference type) returns (in a similar fashion): { \`${(0, resolve_by_name_1.resolveByName)(name, env)?.map(d => d.nodeId).join('`, `')}\` } (however, the latter will not trace aliases). `; })()} `)} Similar to finding the definitions read by a variable use, please use the ${(0, doc_types_1.shortLink)(linker_1.getAllFunctionCallTargets.name, vertexType.info)} function to find all possible definitions of a function call, as explained in the [working with the dataflow graph](${doc_files_1.FlowrWikiBaseRef}/Working%20with%20the%20Dataflow%20Graph) section.` })} Function calls are the most complicated mechanism in R as essentially everything is a function call. Even **control structures** like \`if(p) a else b\` are desugared into function calls (e.g., as \`\` \`if\`(p, a, b) \`\`). ${(0, doc_structure_1.details)('Example: <code>if</code> as a Function Call', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'if(p) a else b'))} Similarly, you should be aware of calls to **anonymous functions**, which may appear given directly (e.g. as \`(function() 1)()\`) or indirectly, with code directly calling the return of another function call: \`foo()()\`. 
${(0, doc_structure_1.details)('Example: Anonymous Function Call (given directly)', await (0, doc_dfg_1.printDfGraphForCode)(shell, '(function() 1)()', { mark: new Set([6, '6->4']) }))} ${(0, doc_structure_1.details)('Example: Anonymous Function Call (given indirectly)', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'foo <- function() return(function() 3)\nfoo()()', { mark: new Set([12, '12->4']) }))} ${(0, doc_structure_1.block)({ type: 'NOTE', content: `Now you might be asking yourself how to differentiate anonymous and named functions and what you have to keep in mind when working with them? Unnamed functions have an array of signatures which you can use to identify them. But in short: the \`origin\` attribute of the ${(0, doc_types_1.shortLink)('DataflowGraphVertexFunctionCall', vertexType.info)} is \`${unnamed_call_handling_1.UnnamedFunctionCallOrigin}\`. Please be aware that unnamed functions still have a \`name\` property to give it a unique identifier that can be used for debugging and reference. This name _always_ starts with \`${unnamed_call_handling_1.UnnamedFunctionCallPrefix}\`. To identify these calls please do not rely on the [normalized AST](${doc_files_1.FlowrWikiBaseRef}/Normalized-AST). An expression like \`1 + 1\` will be correctly identified as a syntactical binary operation. Yet, from a dataflow/semantic perspective this is equivalent to \`\` \`+\`(1, 1) \`\` (which is a named function call and marked as such in the dataflow graph). To know which function is called, please rely on the ${linkEdgeName(edge_1.EdgeType.Calls)} edge. ` })} Another interesting case is a function with **side effects**, most prominently with the super-assignment \`<<-\`. In this case, you may encounter the ${linkEdgeName(edge_1.EdgeType.SideEffectOnCall)} as exemplified below. 
${(0, doc_structure_1.details)('Example: Function Call with a Side-Effect', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'f <- function() x <<- 3\n f()', { mark: new Set([8, '1->8']) }))} `, code: 'foo()', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().call('1@foo', 'foo', []) }, []]); vertexExplanations.set(vertex_1.VertexType.VariableDefinition, [{ shell, name: 'Variable Definition Vertex', type: vertex_1.VertexType.VariableDefinition, description: ` Defined variables most commonly occur in the context of an assignment, for example, with the \`<-\` operator as shown above. ${(0, doc_structure_1.details)('Example: Super Definition (<code><<-</code>)', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <<- 1', { mark: new Set([0]) }))} The implementation is relatively sparse and similar to the other marker vertices: ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'DataflowGraphVertexVariableDefinition' })} Of course, there are not just operators that define variables, but also functions, like \`assign\`. ${(0, doc_structure_1.details)('Example: Using <code>assign</code>', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'assign("x", 1)\nx', { mark: new Set([1]) }) + `\nThe example may be misleading as the visualization uses \`${node_id_1.recoverName.name}\` to print the lexeme of the variable. 
However, this actually defines the variable \`x\` (without the quotes) as you can see with the ${linkEdgeName(edge_1.EdgeType.Reads)} edge.`)} Please be aware, that the name of the symbol defined may differ from what you read in the program as R allows the assignments to strings, escaped names, and more: ${(0, doc_structure_1.details)('Example: Assigning with an Escaped Name', await (0, doc_dfg_1.printDfGraphForCode)(shell, '`x` <- 1\nx', { mark: new Set([0]) }))} ${(0, doc_structure_1.details)('Example: Assigning with a String', await (0, doc_dfg_1.printDfGraphForCode)(shell, '"x" <- 1\nx', { mark: new Set([0]) }))} Definitions may be constrained by conditionals (_flowR_ takes care of calculating the dominating front for you). ${(0, doc_structure_1.details)('Conditional Assignments', await (async () => { const constrainedDefinitions = await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 0\nif(u) x <- 1 else x <- 2\nx', { exposeResult: true }); const [text, info] = constrainedDefinitions; const finalEnvironment = (0, doc_env_1.printEnvironmentToMarkdown)(info.dataflow.environment.current); return ` ${text} In this case, the definition of \`x\` is constrained by the conditional, which is reflected in the environment at the end of the analysis: ${finalEnvironment} As you can see, _flowR_ is able to recognize that the initial definition of \`x\` has no influence on the final value of the variable. `; })())} `, code: 'x <- 1', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().defineVariable('1@x', 'x') }, []]); vertexExplanations.set(vertex_1.VertexType.FunctionDefinition, [{ shell, name: 'Function Definition Vertex', type: vertex_1.VertexType.FunctionDefinition, description: ` Defining a function does do a lot of things: 1) it creates a new scope, 2) it may introduce parameters which act as promises and which are only evaluated if they are actually required in the body, 3) it may access the enclosing environments and the callstack. 
The vertex object in the dataflow graph stores multiple things, including all exit points, the enclosing environment if necessary, and the information of the subflow (the "body" of the function). ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'DataflowGraphVertexFunctionDefinition' })} The subflow is defined like this: ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'DataflowFunctionFlowInformation' })} And if you are interested in the exit points, they are defined like this: ${(0, doc_types_1.printHierarchy)({ program: vertexType.program, info: vertexType.info, root: 'ExitPoint' })} Whenever we visualize a function definition, we use a dedicated node to represent the anonymous function object, and a subgraph (usually with the name \`"function <id>"\`) to encompass the body of the function (they are linked with a dotted line). ${(0, doc_structure_1.block)({ type: 'NOTE', content: ` You may ask yourself: How can I know which vertices are part of the function body? how do i know the parameters? All vertices that are part of the graph are present in the \`graph\` property of the function definition &mdash; it contains a set of all ids of the contained vertices: the actual dataflow graph is flat, and you can query all root vertices (i.e., those not part of any function definition) using \`${new graph_1.DataflowGraph(undefined).rootIds.name}\`. 
Additionally, most functions that you can call on the dataflow graph offer a flag whether you want to include vertices of function definitions or not (e.g., \`${new graph_1.DataflowGraph(undefined).vertices.name}\`) ${(0, doc_structure_1.details)('Example: Nested Function Definitions', await (async () => { const [text, info] = await (0, doc_dfg_1.printDfGraphForCode)(shell, 'f <- function() { g <- function() 3 }', { mark: new Set([9, 6]), exposeResult: true }); const definitions = info.dataflow.graph.vertices(true) .filter(([, vertex]) => vertex.tag === vertex_1.VertexType.FunctionDefinition) .map(([id, vertex]) => `| \`${id}\` | { \`${[...vertex.subflow.graph].join('`, `')}\` } |`) .toArray(); return ` ${text} As you can see, the vertex ids of the subflow do not contain those of nested function definitions but again only those which are part of the respective scope (creating a tree-like structure): | Id | Vertex Ids in Subflow | |---:|-----------------------| ${definitions.join('\n')} `; })())} But now there is still an open question: how do you know which vertices are the parameters? In short: there is no direct way to infer this from the dataflow graph (as parameters are handled as open references which are promises). However, you can use the [normalized AST](${doc_files_1.FlowrWikiBaseRef}/Normalized%20AST) to get the parameters used. 
${(0, doc_structure_1.details)('Example: Parameters of a Function', await (async () => { const code = 'f <- function(x, y = 3) x + y'; const [text, info] = await (0, doc_dfg_1.printDfGraphForCode)(shell, code, { mark: new Set([10, 1, 3]), exposeResult: true }); const ast = await (0, doc_normalized_ast_1.printNormalizedAstForCode)(shell, code, { prefix: 'flowchart LR\n', showCode: false }); const functionDefinition = [...info.dataflow.graph.vertices(true)].find(([, vertex]) => vertex.tag === vertex_1.VertexType.FunctionDefinition); (0, assert_1.guard)(functionDefinition !== undefined, () => `Could not find function definition for ${code}`); const [id] = functionDefinition; const normalized = info.normalize.idMap.get(id); return ` Let's first consider the following dataflow graph (of \`${code}\`): ${text} The function definition we are interested in has the id \`${id}\`. Looking at the [normalized AST](${doc_files_1.FlowrWikiBaseRef}/Normalized%20AST) of the code, we can get the parameters simply be requesting the \`parameters\` property of the function definition (yielding the names: [${normalized.parameters.map(p => `\`${p.name.content}\``).join(', ')}]): ${ast} `; })())} ` })} Last but not least, please keep in mind that R offers another way of writing anonymous functions (using the backslash): ${await (0, doc_dfg_1.printDfGraphForCode)(shell, '\\(x) x + 1', { switchCodeAndGraph: true })} Besides this being a theoretically "shorter" way of defining a function, this behaves similarly to the use of \`function\`. 
`, code: 'function() 1', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().defineFunction('1@function', [0], { graph: new Set('0'), in: [], out: [], unknownReferences: [], entryPoint: 0, environment: (0, environment_builder_1.defaultEnv)() }) }, []]); const results = []; let i = 0; for (const [, vertex] of (0, doc_data_dfg_util_1.getAllVertices)()) { const get = vertexExplanations.get(vertex); (0, assert_1.guard)(get !== undefined, () => `No explanation for vertex type ${vertex}`); const [expl, subExplanations] = get; results.push(await explanation(expl, ++i, ...subExplanations)); } return results.join('\n'); }
/**
 * Builds the documentation text for the edge types of the dataflow graph.
 *
 * A Map is filled with one entry per `EdgeType` (Reads, DefinedBy, Calls, Returns,
 * DefinesOnCall, DefinedByOnCall, Argument, SideEffectOnCall, NonStandardEvaluation).
 * Every entry is a pair `[mainExplanation, subExplanations]`: the main explanation carries
 * `shell`, `name`, `type`, `description`, `code` and `expectedSubgraph`, the sub explanations
 * carry `name`, `description`, `code` and `expectedSubgraph`.
 * Afterwards, every edge yielded by `getAllEdges()` is looked up in that Map (guarded against
 * a missing entry), passed to `explanation(...)` together with a running counter, prefixed with
 * an `<a id='...'></a>` anchor derived from `edgeTypeToId(edge)`, and all parts are joined with '\n'.
 *
 * NOTE(review): `explanation`, `linkEdgeName` and `edgeTypeToId` are defined elsewhere in this file;
 * what exactly they render is not visible from here.
 *
 * @param shell      - passed to `printDfGraphForCode` for the inline examples and stored in every main explanation
 * @param vertexType - only its `info` property is read; it is handed to `shortLink` to link type/function names
 * @returns a promise of the joined explanation text
 */
async function getEdgesExplanations(shell, vertexType) { const edgeExplanations = new Map();
/* Reads: main example plus sub-examples for calls and parameters */
edgeExplanations.set(edge_1.EdgeType.Reads, [{ shell, name: 'Reads Edge', type: edge_1.EdgeType.Reads, description: ` Reads edges mark that the source vertex (usually a [use vertex](#use-vertex)) reads whatever is defined by the target vertex (usually a [variable definition](#variable-definition-vertex)). ${(0, doc_structure_1.block)({ type: 'NOTE', content: ` A ${linkEdgeName(edge_1.EdgeType.Reads)} edge is not a transitive closure and only links the "directly read" definition(s). Our abstract domains resolving transitive ${linkEdgeName(edge_1.EdgeType.Reads)} edges (and for that matter, following ${linkEdgeName(edge_1.EdgeType.Returns)} as well) are currently tailored to what we need in _flowR_. Hence, we offer a function like ${(0, doc_types_1.shortLink)(linker_1.getAllFunctionCallTargets.name, vertexType.info)}, as well as ${(0, doc_types_1.shortLink)(resolve_by_name_1.resolvesToBuiltInConstant.name, vertexType.info)} which do this for specific cases. Refer to ${(0, doc_types_1.shortLink)(dfg_get_origin_1.getOriginInDfg.name, vertexType.info)} for a more general solution, as explained in [working with the dataflow graph](${doc_files_1.FlowrWikiBaseRef}/Working%20with%20the%20Dataflow%20Graph). 
${(0, doc_structure_1.details)('Example: Multi-Level Reads', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'x <- 3\ny <- x\nprint(y)', { mark: new Set(['9->7', '7->3', '4->0']) }))} Similarly, ${linkEdgeName(edge_1.EdgeType.Reads)} can be cyclic, for example in the context of loops: ${(0, doc_structure_1.details)('Example: Cyclic Reads', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'for(i in v) x <- x + 1', { mark: new Set(['3->2']) }))} ` })} Reads edges may point to built-in definitions as well, to signal that something relates to a built-in element of flowR. Their targets are not part of the ${(0, doc_types_1.shortLink)(graph_1.DataflowGraph.name, vertexType.info)} but only markers to signal that the respective definition is a built-in. Please refer to the explanation of the respective vertices for more information. `, code: 'x <- 2\nprint(x)', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().reads('2@x', '1@x') }, [{ name: 'Reads Edge (Call)', description: 'Named calls are resolved too, linking to the symbol that holds the anonymous function definition (indirectly or directly)', code: 'foo <- function() {}\nfoo()', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().reads('2@foo', '1@foo') }, { name: 'Reads Edge (Parameter)', description: 'Parameters can read from each other as well.', code: 'f <- function(x, y=x) {}', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().reads('1:20', '1@x') }]]);
/* DefinedBy: main example plus sub-examples for nested definitions and expressions */
edgeExplanations.set(edge_1.EdgeType.DefinedBy, [{ shell, name: 'DefinedBy Edge', /* concat for link generation */ type: edge_1.EdgeType.DefinedBy, description: ` The source vertex is usually a [\`variable definition\`](#variable-definition-vertex) linking the defined symbol to the entry point of the resulting side. 
${(0, doc_structure_1.details)('In general, this does not have to be the right hand side of the operator.', await (0, doc_dfg_1.printDfGraphForCode)(shell, '3 -> x', { mark: new Set([0]) }))} However, nested definitions can carry it (in the nested case, \`x\` is defined by the return value of <code>\\\`<-\\\`(y, z)</code>). Additionally, we link the assignment function. `, code: 'x <- y', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().definedBy('1@x', '1@y').definedBy('1@x', '1:3') }, [{ name: 'DefinedBy Edge (Nested)', description: `Nested definitions can carry the ${linkEdgeName(edge_1.EdgeType.DefinedBy)} edge as well.`, code: 'x <- y <- z', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().definedBy('1@x', '1:3').definedBy('1@x', '1:8').definedBy('1@y', '1:8') }, { name: 'DefinedBy Edge (Expression)', description: 'Here, we define by the result of the `+` expression.', code: 'x <- y + z', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().definedBy('1@x', '1:8') }]]);
/* Calls: single example, no sub-examples */
edgeExplanations.set(edge_1.EdgeType.Calls, [{ shell, name: 'Calls Edge', type: edge_1.EdgeType.Calls, description: `Link the [function call](#function-call-vertex) to the [function definition](#function-definition-vertex) that is called. To find all called definitions, please use the ${(0, doc_types_1.shortLink)(dfg_get_origin_1.getOriginInDfg.name, vertexType.info)} function, as explained in [working with the dataflow graph](${doc_files_1.FlowrWikiBaseRef}/Working%20with%20the%20Dataflow%20Graph).`, code: 'foo <- function() {}\nfoo()', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().calls('2@foo', '1@function') }, []]);
/* Returns: single example; the description embeds two additional rendered graphs */
edgeExplanations.set(edge_1.EdgeType.Returns, [{ shell, name: 'Returns Edge', type: edge_1.EdgeType.Returns, description: `Link the [function call](#function-call-vertex) to the exit points of the target definition (this may incorporate the call-context). 
As you can see in the example, this happens for user-defined functions (like \`foo\`) as well as for built-in functions (like \`<-\`). However, these edges are specific to scenarios in which flowR knows that a specific element is returned. For contrast, compare this to a use of, for example, \`+\`: ${(0, doc_structure_1.details)('Example: No returns edge for +', await (0, doc_dfg_1.printDfGraphForCode)(shell, '1 + 1'))} Here, we do not get a ${linkEdgeName(edge_1.EdgeType.Returns)} edge as this function call creates a new value based on its arguments. In these scenarios you should rely on the \`args\` property of the ${(0, doc_types_1.shortLink)('DataflowGraphVertexFunctionCall', vertexType.info)} and use the arguments to calculate what you need to know. Alternatively, you can track the ${linkEdgeName(edge_1.EdgeType.Argument)} edges. In general, the ${linkEdgeName(edge_1.EdgeType.Returns)} edge already does most of the heavy lifting for you, by respecting control flow influences and (as long as flowR is able to detect it) dead code. ${(0, doc_structure_1.details)('Example: Tricky Returns', `We show the _simplified_ DFG for simplicity and highlight all ${linkEdgeName(edge_1.EdgeType.Returns)} edges involved in tracking the return of a call to \`f\` (as ${linkEdgeName(edge_1.EdgeType.Returns)} are never transitive and must hence be followed):\n` + await (0, doc_dfg_1.printDfGraphForCode)(shell, 'f <- function() { if(u) { return(3); 2 } else 42 }\nf()', { simplified: true, mark: new Set(['19->15', '15->14', '14->12', '14->11', '11->9', '9->7']) }) + '\n\n Note, that the `2` should be completely absent of the dataflow graph (recognized as dead code).')} <br/> ${(0, doc_structure_1.block)({ type: 'NOTE', content: `You might find it an inconvenience that there is no ${linkEdgeName(edge_1.EdgeType.Returns)} edge for _every_ function call. 
If there is particular function for which you think flowR should be able to detect the return, please open a [new issue](${doc_issue_1.NewIssueUrl}). Yet the problem of flowR not tracking returns for functions that create new/transform existing values is a fundamental design decision &mdash; if this irritates you ~~you may be eligible for compensation~~, you may be interested in an alternative with the [Control Flow Graph](${doc_files_1.FlowrWikiBaseRef}/Control%20Flow%20Graph#cfg-exit-points) which not just tracks all possible execution orders of the program, but also the exit points of _all_ function calls. ` })} `, code: 'foo <- function() x\nfoo()', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().returns('2@foo', '1@x').returns('1@<-', '1@foo').argument('1@<-', '1@foo') }, []]);
/* the late-binding example is rendered once up front; the result (dfInfo) is embedded in the DefinesOnCall description */
const lateBindingExample = ` f <- function() x x <- 3 f() `.trim(); const dfInfo = await (0, doc_dfg_1.printDfGraphForCode)(shell, lateBindingExample, { switchCodeAndGraph: true, codeOpen: true, mark: new Set([1, '1->5', '9->5']) });
/* DefinesOnCall: shares code and expected subgraph with the DefinedByOnCall entry */
edgeExplanations.set(edge_1.EdgeType.DefinesOnCall, [{ shell, name: 'DefinesOnCall Edge', type: edge_1.EdgeType.DefinesOnCall, description: `*This edge is usually joined with ${linkEdgeName(edge_1.EdgeType.DefinedByOnCall)}!* Links an argument to whichever parameter they cause to be defined if the related function call is invoked. In the context of functions which access their closure environment these edges play another tricky role as there are many cases made more difficult by R's way of allowing closure environments to later receive variables. Consider the following scenario in which we first define a function which returns the value of a variable named \`x\` and then define \`x\` only after we defined the function: ${dfInfo} The final call evaluates to \`3\` (similar to if we defined \`x\` before the function definition). Within a dataflow graph you can see this with two edges. 
The \`x\` within the function body will have a ${linkEdgeName(edge_1.EdgeType.DefinedByOnCall)} to every definition it _may_ refer to. In turn, each call vertex calling the function which encloses the use of \`x\` will have a ${linkEdgeName(edge_1.EdgeType.DefinesOnCall)} edge to the definition(s) it causes to be active within the function body. `, code: 'f <- function(x) {}\nf(x=1)', // here we use the ids as the argument wrappers are not easily selected with slicing criteria expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().definesOnCall('$11', '$1').definedByOnCall('$1', '$11') }, []]); edgeExplanations.set(edge_1.EdgeType.DefinedByOnCall, [{ shell, name: 'DefinedByOnCall Edge', type: edge_1.EdgeType.DefinedByOnCall, description: `*This edge is usually joined with ${linkEdgeName(edge_1.EdgeType.DefinesOnCall)}!* This represents the other part of the ${linkEdgeName(edge_1.EdgeType.DefinesOnCall)} edge (e.g., links the parameter to the argument). Please look there for further documentation.`, code: 'f <- function(x) {}\nf(x=1)', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().definesOnCall('$11', '$1').definedByOnCall('$1', '$11') }, []]); edgeExplanations.set(edge_1.EdgeType.Argument, [{ shell, name: 'Argument Edge', type: edge_1.EdgeType.Argument, description: `Links a [function call](#function-call-vertex) to the entry point of its arguments. If we do not know the target of such a call, we automatically assume that all arguments are read by the call as well! The exception to this is the [function definition](#function-definition-vertex) which does no longer hold these argument relationships (as they are not implicit in the structure). 
`, code: 'f(x,y)', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().argument('1@f', '1@x').reads('1@f', '1@x').argument('1@f', '1@y').reads('1@f', '1@y') }, []]);
/* SideEffectOnCall: single example, plain string description */
edgeExplanations.set(edge_1.EdgeType.SideEffectOnCall, [{ shell, name: 'SideEffectOnCall Edge', type: edge_1.EdgeType.SideEffectOnCall, description: 'Links a global side effect to an affected function call (e.g., a super definition within the function body)', code: 'f <- function() { x <<- 2 }\nf()', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().sideEffectOnCall('1@x', '2@f') }, []]);
/* NonStandardEvaluation: main example based on quote(x) plus one sub-example with a larger quoted expression */
edgeExplanations.set(edge_1.EdgeType.NonStandardEvaluation, [{ shell, name: 'NonStandardEvaluation Edge', type: edge_1.EdgeType.NonStandardEvaluation, description: ` Marks cases in which R's non-standard evaluation mechanisms cause the default semantics to deviate (see the case below for multiple vertices) ${(0, doc_structure_1.block)({ type: 'NOTE', content: ` What to do if you encounter a vertex marked with this edge? This depends on your analysis. To handle many real-world sources correctly you are probably fine with just ignoring it. Yet, you may choose to follow these references for other queries. For now, _flowR's_ support for non-standard evaluation is limited. Besides the obvious quotation there are other cases in which _flowR_ may choose to create a ${linkEdgeName(edge_1.EdgeType.NonStandardEvaluation)} edge, there are some that may appear to be counter-intuitive. For example, a for-loop body, as in the following example. 
${(0, doc_structure_1.details)('Example: For-Loop Body', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'for(i in v) b', { mark: new Set([2, '4->2']) }))} ${(0, doc_structure_1.details)('Example: While-Loop Body', await (0, doc_dfg_1.printDfGraphForCode)(shell, 'while(TRUE) b', { mark: new Set([1, '3->1']) }))} ` })} `, code: 'quote(x)', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)().argument('1@quote', '1@x').nse('1@quote', '1@x') }, [{ name: 'Complete Expressions', description: 'This works, even if we have a larger expression in `quote`.', code: 'quote(x + y)', expectedSubgraph: (0, dataflowgraph_builder_1.emptyGraph)() .argument('1@quote', '1@+').nse('1@quote', '1@+') .nse('1@quote', '1@x') .nse('1@quote', '1@y') }]]); const results = []; let i = 0; for (const [, edge] of (0, doc_data_dfg_util_1.getAllEdges)()) { const get = edgeExplanations.get(edge); (0, assert_1.guard)(get !== undefined, () => `No explanation for edge type ${edge}`); const [expl, subExplanations] = get; results.push(`<a id='${edgeTypeToId(edge)}'></a>` + await explanation(expl, ++i, ...subExplanations)); } return results.join('\n'); } async function dummyDataflow() { const shell = new shell_1.RShell(); const result = await new pipeline_executor_1.PipelineExecutor(default_pipelines_1.DEFAULT_DATAFLOW_PIPELINE, { parser: shell, request: (0, retriever_1.requestFromInput)('x <- 1\nx + 1') }, config_1.defaultConfigOptions).allRemainingSteps(); shell.close(); return result; } async function getText(shell) { const rversion = (await shell.usedRVersion())?.format() ?? 'unknown'; /* we collect type information on the graph */ const vertexType = (0, doc_types_1.getTypesFromFolder)({ rootFolder: path_1.default.resolve('./src/'), typeNameForMermaid: 'DataflowGraphVertexInfo', inlineTypes: ['MergeableRecord'] }); const edgeType = (