/*
 * UNPKG page header (package-viewer chrome, not part of the module):
 * @eagleoutice/flowr — Static Dataflow Analyzer and Program Slicer for the
 * R Programming Language. (Version unspecified in capture; 481 lines, 24.1 kB.)
 * Wrapped in a comment so the file remains valid JavaScript.
 */
"use strict"; /** * Provides a top-level slicer that can be used to slice code *and* retrieve stats. * @module */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.BenchmarkSlicer = exports.benchmarkLogger = void 0; const stopwatch_1 = require("./stopwatch"); const seedrandom_1 = __importDefault(require("seedrandom")); const log_1 = require("../util/log"); const assert_1 = require("../util/assert"); const strings_1 = require("../util/text/strings"); const parse_1 = require("../slicing/criterion/parse"); const default_pipelines_1 = require("../core/steps/pipeline/default-pipelines"); const retriever_1 = require("../r-bridge/retriever"); const collect_all_1 = require("../slicing/criterion/collect-all"); const size_of_1 = require("./stats/size-of"); const shell_1 = require("../r-bridge/shell"); const tree_sitter_types_1 = require("../r-bridge/lang-4.x/tree-sitter/tree-sitter-types"); const tree_sitter_executor_1 = require("../r-bridge/lang-4.x/tree-sitter/tree-sitter-executor"); const vertex_1 = require("../dataflow/graph/vertex"); const arrays_1 = require("../util/collections/arrays"); const config_1 = require("../config"); const extract_cfg_1 = require("../control-flow/extract-cfg"); const shape_inference_1 = require("../abstract-interpretation/data-frame/shape-inference"); const lattice_1 = require("../abstract-interpretation/domains/lattice"); const set_range_domain_1 = require("../abstract-interpretation/domains/set-range-domain"); const fs_1 = __importDefault(require("fs")); const flowr_analyzer_context_1 = require("../project/context/flowr-analyzer-context"); const r_project_1 = require("../r-bridge/lang-4.x/ast/model/nodes/r-project"); const r_comment_1 = require("../r-bridge/lang-4.x/ast/model/nodes/r-comment"); const call_graph_1 = require("../dataflow/graph/call-graph"); /** * The logger to be used for 
benchmarking as a global object. */ exports.benchmarkLogger = log_1.log.getSubLogger({ name: 'benchmark' }); class BenchmarkSlicer { /** Measures all data recorded *once* per slicer (complete setup up to the dataflow graph creation) */ commonMeasurements = new stopwatch_1.Measurements(); perSliceMeasurements = new Map(); deltas = new Map(); parserName; context; stats; loadedXml; dataflow; normalizedAst; controlFlow; callGraph; totalStopwatch; finished = false; // Yes, this is unclean, but we know that we assign the executor during the initialization and this saves us from having to check for nullability every time executor = null; parser = null; constructor(parserName) { this.totalStopwatch = this.commonMeasurements.start('total'); this.parserName = parserName; } /** * Initialize the slicer on the given request. * Can only be called once for each instance. */ async init(request, config, autoSelectIf, threshold) { (0, assert_1.guard)(this.stats === undefined, 'cannot initialize the slicer twice'); // we know these are in sync so we just cast to one of them this.parser = await this.commonMeasurements.measure('initialize R session', async () => { if (this.parserName === 'r-shell') { return new shell_1.RShell(config_1.FlowrConfig.getForEngine(config, 'r-shell')); } else { await tree_sitter_executor_1.TreeSitterExecutor.initTreeSitter(config_1.FlowrConfig.getForEngine(config, 'tree-sitter')); return new tree_sitter_executor_1.TreeSitterExecutor(); } }); this.context = (0, flowr_analyzer_context_1.contextFromInput)({ ...request }, config); this.executor = (0, default_pipelines_1.createSlicePipeline)(this.parser, { context: this.context, criterion: [], autoSelectIf, threshold, }); this.loadedXml = (await this.measureCommonStep('parse', 'retrieve AST from R code')).files.map(p => p.parsed); this.normalizedAst = await this.measureCommonStep('normalize', 'normalize R AST'); this.dataflow = await this.measureCommonStep('dataflow', 'produce dataflow information'); 
this.executor.switchToRequestStage(); await this.calculateStatsAfterInit(request); } async calculateStatsAfterInit(request) { const loadedContent = request.request === 'text' ? request.content : fs_1.default.readFileSync(request.content, 'utf-8'); let numberOfRTokens; let numberOfRTokensNoComments; if (this.parser.name === 'r-shell') { // retrieve number of R tokens - flowr_parsed should still contain the last parsed code numberOfRTokens = await (0, retriever_1.retrieveNumberOfRTokensOfLastParse)(this.parser); numberOfRTokensNoComments = await (0, retriever_1.retrieveNumberOfRTokensOfLastParse)(this.parser, true); } else { const countChildren = function (node, ignoreComments = false) { let ret = node.type === tree_sitter_types_1.TreeSitterType.Comment && ignoreComments ? 0 : 1; for (const child of node.children) { ret += countChildren(child, ignoreComments); } return ret; }; const root = this.loadedXml.map(t => t.rootNode); numberOfRTokens = root.map(r => countChildren(r)).reduce((a, b) => a + b, 0); numberOfRTokensNoComments = root.map(r => countChildren(r, true)).reduce((a, b) => a + b, 0); } (0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined after initialization'); (0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined after initialization'); // collect dataflow graph size const vertices = this.dataflow.graph.vertices(true); let numberOfEdges = 0; let numberOfCalls = 0; let numberOfDefinitions = 0; for (const [n, info] of vertices) { const outgoingEdges = this.dataflow.graph.outgoingEdges(n); numberOfEdges += outgoingEdges?.size ?? 
0; if (info.tag === vertex_1.VertexType.FunctionCall) { numberOfCalls++; } else if (info.tag === vertex_1.VertexType.FunctionDefinition) { numberOfDefinitions++; } } let nodes = 0; let nodesNoComments = 0; let commentChars = 0; let commentCharsNoWhitespace = 0; r_project_1.RProject.visitAst(this.normalizedAst.ast, t => { nodes++; const comments = t.info.adToks?.filter(r_comment_1.RComment.is); if (comments && comments.length > 0) { const content = comments.map(c => c.lexeme ?? '').join(''); commentChars += content.length; commentCharsNoWhitespace += (0, strings_1.withoutWhitespace)(content).length; } else { nodesNoComments++; } return false; }); const split = loadedContent.split('\n'); const nonWhitespace = (0, strings_1.withoutWhitespace)(loadedContent).length; this.stats = { perSliceMeasurements: this.perSliceMeasurements, memory: this.deltas, request, input: { numberOfLines: split.length, numberOfNonEmptyLines: split.filter(l => l.trim().length > 0).length, numberOfCharacters: loadedContent.length, numberOfCharactersNoComments: loadedContent.length - commentChars, numberOfNonWhitespaceCharacters: nonWhitespace, numberOfNonWhitespaceCharactersNoComments: nonWhitespace - commentCharsNoWhitespace, numberOfRTokens: numberOfRTokens, numberOfRTokensNoComments: numberOfRTokensNoComments, numberOfNormalizedTokens: nodes, numberOfNormalizedTokensNoComments: nodesNoComments }, dataflow: { numberOfNodes: this.dataflow.graph.vertices(true).toArray().length, numberOfEdges: numberOfEdges, numberOfCalls: numberOfCalls, numberOfFunctionDefinitions: numberOfDefinitions, sizeOfObject: (0, size_of_1.getSizeOfDfGraph)(this.dataflow.graph), }, // these are all properly initialized in finish() commonMeasurements: new Map(), retrieveTimePerToken: { raw: 0, normalized: 0 }, normalizeTimePerToken: { raw: 0, normalized: 0 }, dataflowTimePerToken: { raw: 0, normalized: 0 }, totalCommonTimePerToken: { raw: 0, normalized: 0 } }; } /** * Slice for the given {@link SlicingCriteria}. 
* @see SingleSlicingCriterion * @returns The per slice stats retrieved for this slicing criteria */ async slice(...slicingCriteria) { exports.benchmarkLogger.trace(`try to slice for criteria ${JSON.stringify(slicingCriteria)}`); this.guardActive(); (0, assert_1.guard)(!this.perSliceMeasurements.has(slicingCriteria), 'do not slice the same criteria combination twice'); const measurements = new stopwatch_1.Measurements(); const stats = { measurements: undefined, slicingCriteria: [], numberOfDataflowNodesSliced: 0, timesHitThreshold: 0, reconstructedCode: { code: '', linesWithAutoSelected: 0 } }; this.perSliceMeasurements.set(slicingCriteria, stats); this.executor.updateRequest({ criterion: slicingCriteria }); const totalStopwatch = measurements.start('total'); const slicedOutput = await this.measureSliceStep('slice', measurements, 'static slicing'); const decodedCriteria = parse_1.SlicingCriteria.decodeAll(slicingCriteria, this.normalizedAst.idMap); stats.slicingCriteria = Array.from(decodedCriteria); stats.reconstructedCode = await this.measureSliceStep('reconstruct', measurements, 'reconstruct code'); totalStopwatch.stop(); exports.benchmarkLogger.debug(`Produced code for ${JSON.stringify(slicingCriteria)}: ${stats.reconstructedCode.code}`); const results = this.executor.getResults(false); if (exports.benchmarkLogger.settings.minLevel >= 3 /* LogLevel.Info */) { exports.benchmarkLogger.info(`mapped slicing criteria: ${slicedOutput.slicedFor.map(id => { const node = results.normalize.idMap.get(id); return `\n- id: ${id}, location: ${JSON.stringify(node?.location)}, lexeme: ${JSON.stringify(node?.lexeme)}`; }).join('')}`); } // if it is not in the dataflow graph it was kept to be safe and should not count to the included nodes stats.numberOfDataflowNodesSliced = Array.from(slicedOutput.result).filter(id => results.dataflow.graph.hasVertex(id, false)).length; stats.timesHitThreshold = slicedOutput.timesHitThreshold; stats.measurements = measurements.get(); return { 
stats, slice: slicedOutput, code: stats.reconstructedCode }; } /** * Extract the control flow graph using {@link extractCFG} */ extractCFG() { exports.benchmarkLogger.trace('try to extract the control flow graph'); this.guardActive(); (0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined for control flow extraction'); const ast = this.normalizedAst; this.controlFlow = this.measureSimpleStep('extract control flow graph', () => (0, extract_cfg_1.extractCfg)(ast, this.context, undefined, undefined, true)); } extractCG() { exports.benchmarkLogger.trace('try to extract the call graph'); this.guardActive(); const g = this.dataflow?.graph; (0, assert_1.guard)(g !== undefined, 'dataflow should be defined for call graph extraction'); this.callGraph = this.measureSimpleStep('extract call graph', () => call_graph_1.CallGraph.compute(g)); } /** * Infer the shape of data frames using abstract interpretation with {@link inferDataFrameShapes} * @returns The statistics of the data frame shape inference */ inferDataFrameShapes() { exports.benchmarkLogger.trace('try to infer shapes for data frames'); (0, assert_1.guard)(this.stats !== undefined && !this.finished, 'need to call init before, and can not do after finish!'); (0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined for data frame shape inference'); (0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined for data frame shape inference'); (0, assert_1.guard)(this.controlFlow !== undefined, 'controlFlow should be defined for data frame shape inference'); (0, assert_1.guard)(this.context !== undefined, 'context should be defined for data frame shape inference'); const ast = this.normalizedAst; const dfg = this.dataflow.graph; const cfinfo = this.controlFlow; const stats = { numberOfDataFrameFiles: 0, numberOfNonDataFrameFiles: 0, numberOfResultConstraints: 0, numberOfResultingValues: 0, numberOfResultingBottom: 0, numberOfResultingTop: 0, 
numberOfEmptyNodes: 0, numberOfOperationNodes: 0, numberOfValueNodes: 0, sizeOfInfo: 0, perNodeStats: new Map() }; const inference = new shape_inference_1.DataFrameShapeInferenceVisitor({ controlFlow: cfinfo, dfg, normalizedAst: ast, ctx: this.context }); this.measureSimpleStep('infer data frame shapes', () => inference.start()); const result = inference.getEndState(); stats.numberOfResultConstraints = result.value.size; stats.sizeOfInfo = (0, size_of_1.safeSizeOf)([inference.getAbstractTrace()]); for (const value of result.value.values()) { if (value.isTop()) { stats.numberOfResultingTop++; } else if (value.isBottom()) { stats.numberOfResultingBottom++; } else { stats.numberOfResultingValues++; } } r_project_1.RProject.visitAst(this.normalizedAst.ast, node => { const operations = inference.getAbstractOperations(node.info.id); const value = inference.getAbstractValue(node.info.id); // Only store per-node information for nodes representing expressions or nodes with abstract values if (operations === undefined && value === undefined) { stats.numberOfEmptyNodes++; return; } const nodeStats = { numberOfEntries: inference.getAbstractState(node.info.id)?.value.size ?? 
0 }; if (operations !== undefined) { nodeStats.mappedOperations = operations.map(op => op.operation); stats.numberOfOperationNodes++; if (value !== undefined) { nodeStats.inferredColNames = this.getInferredNumber(value.colnames); nodeStats.inferredColCount = this.getInferredNumber(value.cols); nodeStats.inferredRowCount = this.getInferredNumber(value.rows); nodeStats.approxRangeColNames = this.getInferredRange(value.colnames); nodeStats.approxRangeColCount = this.getInferredRange(value.cols); nodeStats.approxRangeRowCount = this.getInferredRange(value.rows); } } if (value !== undefined) { stats.numberOfValueNodes++; } stats.perNodeStats.set(node.info.id, nodeStats); }); if (stats.numberOfOperationNodes > 0) { stats.numberOfDataFrameFiles = 1; } else { stats.numberOfNonDataFrameFiles = 1; } this.stats.dataFrameShape = stats; return stats; } getInferredRange(value) { if (value.isValue()) { if (value instanceof set_range_domain_1.SetRangeDomain) { return value.value.range === lattice_1.Top ? 
Infinity : value.value.range.size; } else { return value.value[1] - value.value[0]; } } return 0; } getInferredNumber(value) { if (value.isTop()) { return 'top'; } else if (value.isValue()) { if (value instanceof set_range_domain_1.SetRangeDomain) { if (value.value.range === lattice_1.Top) { return 'infinite'; } return Math.floor(value.value.min.size + (value.value.range.size / 2)); } else { if (!isFinite(value.value[1])) { return 'infinite'; } return Math.floor((value.value[0] + value.value[1]) / 2); } } return 'bottom'; } /** Bridging the gap between the new internal and the old names for the benchmarking */ async measureCommonStep(expectedStep, keyToMeasure) { const memoryInit = process.memoryUsage(); const { result } = await this.commonMeasurements.measureAsync(keyToMeasure, () => this.executor.nextStep(expectedStep)); const memoryEnd = process.memoryUsage(); this.deltas.set(keyToMeasure, { heap: memoryEnd.heapUsed - memoryInit.heapUsed, rss: memoryEnd.rss - memoryInit.rss, external: memoryEnd.external - memoryInit.external, buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers }); return result; } measureSimpleStep(keyToMeasure, measurement) { const memoryInit = process.memoryUsage(); const result = this.commonMeasurements.measure(keyToMeasure, measurement); const memoryEnd = process.memoryUsage(); this.deltas.set(keyToMeasure, { heap: memoryEnd.heapUsed - memoryInit.heapUsed, rss: memoryEnd.rss - memoryInit.rss, external: memoryEnd.external - memoryInit.external, buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers }); return result; } async measureSliceStep(expectedStep, measure, keyToMeasure) { const { result } = await measure.measureAsync(keyToMeasure, () => this.executor.nextStep(expectedStep)); return result; } guardActive() { (0, assert_1.guard)(this.stats !== undefined && !this.finished, 'need to call init before, and can not do after finish!'); } /** * Call {@link slice} for all slicing criteria that match the given filter. 
* See {@link collectAllSlicingCriteria} for details. * <p> * the `report` function will be called *before* each *individual* slice is performed. * @returns The number of slices that were produced * @see collectAllSlicingCriteria * @see SlicingCriteriaFilter */ async sliceForAll(filter, report = () => { }, options = {}) { const { sampleCount, maxSliceCount, sampleStrategy } = { sampleCount: -1, maxSliceCount: -1, sampleStrategy: 'random', ...options }; this.guardActive(); let count = 0; let allCriteria = [...(0, collect_all_1.collectAllSlicingCriteria)(this.normalizedAst.ast, filter)]; // Cancel slicing if the number of slices exceeds the limit if (maxSliceCount > 0 && allCriteria.length > maxSliceCount) { return -allCriteria.length; } if (sampleCount > 0) { if (sampleStrategy === 'equidistant') { allCriteria = (0, arrays_1.equidistantSampling)(allCriteria, sampleCount, 'ceil'); } else { const random = options.seed ? (0, seedrandom_1.default)(options.seed) : Math.random; allCriteria.sort(() => random() - 0.5); allCriteria.length = Math.min(allCriteria.length, sampleCount); } } for (const slicingCriteria of allCriteria) { report(count, allCriteria.length, allCriteria); await this.slice(...slicingCriteria); count++; } return count; } /** * Retrieves the final stats and closes the shell session. * Can be called multiple times to retrieve the stored stats, but will only close the session once (the first time). 
*/ finish() { (0, assert_1.guard)(this.stats !== undefined, 'need to call init before finish'); if (!this.finished) { this.commonMeasurements.measure('close R session', () => this.parser.close()); this.totalStopwatch.stop(); this.finished = true; } this.stats.commonMeasurements = this.commonMeasurements.get(); const retrieveTime = Number(this.stats.commonMeasurements.get('retrieve AST from R code')); const normalizeTime = Number(this.stats.commonMeasurements.get('normalize R AST')); const dataflowTime = Number(this.stats.commonMeasurements.get('produce dataflow information')); const controlFlowTime = Number(this.stats.commonMeasurements.get('extract control flow graph')); const callGraphTime = Number(this.stats.commonMeasurements.get('extract call graph')); const dataFrameShapeTime = Number(this.stats.commonMeasurements.get('infer data frame shapes')); this.stats.retrieveTimePerToken = { raw: retrieveTime / this.stats.input.numberOfRTokens, normalized: retrieveTime / this.stats.input.numberOfNormalizedTokens }; this.stats.normalizeTimePerToken = { raw: normalizeTime / this.stats.input.numberOfRTokens, normalized: normalizeTime / this.stats.input.numberOfNormalizedTokens }; this.stats.dataflowTimePerToken = { raw: dataflowTime / this.stats.input.numberOfRTokens, normalized: dataflowTime / this.stats.input.numberOfNormalizedTokens }; this.stats.totalCommonTimePerToken = { raw: (retrieveTime + normalizeTime + dataflowTime) / this.stats.input.numberOfRTokens, normalized: (retrieveTime + normalizeTime + dataflowTime) / this.stats.input.numberOfNormalizedTokens }; this.stats.controlFlowTimePerToken = Number.isNaN(controlFlowTime) ? undefined : { raw: controlFlowTime / this.stats.input.numberOfRTokens, normalized: controlFlowTime / this.stats.input.numberOfNormalizedTokens, }; this.stats.callGraphTimePerToken = Number.isNaN(callGraphTime) ? 
undefined : { raw: callGraphTime / this.stats.input.numberOfRTokens, normalized: callGraphTime / this.stats.input.numberOfNormalizedTokens, }; this.stats.dataFrameShapeTimePerToken = Number.isNaN(dataFrameShapeTime) ? undefined : { raw: dataFrameShapeTime / this.stats.input.numberOfRTokens, normalized: dataFrameShapeTime / this.stats.input.numberOfNormalizedTokens, }; return { stats: this.stats, parse: typeof this.loadedXml === 'string' ? this.loadedXml : JSON.stringify(this.loadedXml), dataflow: this.dataflow, normalize: this.normalizedAst }; } /** * Only call in case of an error - if the session must be closed and the benchmark itself is to be considered failed/dead. */ ensureSessionClosed() { this.parser?.close(); } } exports.BenchmarkSlicer = BenchmarkSlicer; //# sourceMappingURL=slicer.js.map