UNPKG

@eagleoutice/flowr

Version:

Static Dataflow Analyzer and Program Slicer for the R Programming Language

github.com/flowr-analysis/flowr

flowr-analysis/flowr

501 lines • 24.5 kB

JavaScript

"use strict"; /** * Provides a top-level slicer that can be used to slice code *and* retrieve stats. * @module */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.BenchmarkSlicer = exports.benchmarkLogger = void 0; const stopwatch_1 = require("./stopwatch"); const fs_1 = __importDefault(require("fs")); const seedrandom_1 = __importDefault(require("seedrandom")); const log_1 = require("../util/log"); const assert_1 = require("../util/assert"); const strings_1 = require("../util/text/strings"); const default_pipelines_1 = require("../core/steps/pipeline/default-pipelines"); const retriever_1 = require("../r-bridge/retriever"); const collect_all_1 = require("../slicing/criterion/collect-all"); const type_1 = require("../r-bridge/lang-4.x/ast/model/type"); const visitor_1 = require("../r-bridge/lang-4.x/ast/model/processing/visitor"); const size_of_1 = require("./stats/size-of"); const shell_1 = require("../r-bridge/shell"); const tree_sitter_types_1 = require("../r-bridge/lang-4.x/tree-sitter/tree-sitter-types"); const tree_sitter_executor_1 = require("../r-bridge/lang-4.x/tree-sitter/tree-sitter-executor"); const vertex_1 = require("../dataflow/graph/vertex"); const arrays_1 = require("../util/collections/arrays"); const config_1 = require("../config"); const extract_cfg_1 = require("../control-flow/extract-cfg"); const absint_info_1 = require("../abstract-interpretation/data-frame/absint-info"); const domain_1 = require("../abstract-interpretation/data-frame/domain"); const shape_inference_1 = require("../abstract-interpretation/data-frame/shape-inference"); /** * The logger to be used for benchmarking as a global object. */ exports.benchmarkLogger = log_1.log.getSubLogger({ name: 'benchmark' }); class BenchmarkSlicer { /** Measures all data recorded *once* per slicer (complete setup up to the dataflow graph creation) */ commonMeasurements = new stopwatch_1.Measurements(); perSliceMeasurements = new Map(); deltas = new Map(); parserName; config; stats; loadedXml; dataflow; normalizedAst; controlFlow; totalStopwatch; finished = false; // Yes, this is unclean, but we know that we assign the executor during the initialization and this saves us from having to check for nullability every time executor = null; parser = null; constructor(parserName) { this.totalStopwatch = this.commonMeasurements.start('total'); this.parserName = parserName; } /** * Initialize the slicer on the given request. * Can only be called once for each instance. */ async init(request, config, autoSelectIf, threshold) { (0, assert_1.guard)(this.stats === undefined, 'cannot initialize the slicer twice'); this.config = config; // we know these are in sync so we just cast to one of them this.parser = await this.commonMeasurements.measure('initialize R session', async () => { if (this.parserName === 'r-shell') { return new shell_1.RShell((0, config_1.getEngineConfig)(config, 'r-shell')); } else { await tree_sitter_executor_1.TreeSitterExecutor.initTreeSitter((0, config_1.getEngineConfig)(config, 'tree-sitter')); return new tree_sitter_executor_1.TreeSitterExecutor(); } }); this.executor = (0, default_pipelines_1.createSlicePipeline)(this.parser, { request: { ...request }, criterion: [], autoSelectIf, threshold, }, config); this.loadedXml = (await this.measureCommonStep('parse', 'retrieve AST from R code')).parsed; this.normalizedAst = await this.measureCommonStep('normalize', 'normalize R AST'); this.dataflow = await this.measureCommonStep('dataflow', 'produce dataflow information'); this.executor.switchToRequestStage(); await this.calculateStatsAfterInit(request); } async calculateStatsAfterInit(request) { const loadedContent = request.request === 'text' ? request.content : fs_1.default.readFileSync(request.content, 'utf-8'); let numberOfRTokens; let numberOfRTokensNoComments; if (this.parser.name === 'r-shell') { // retrieve number of R tokens - flowr_parsed should still contain the last parsed code numberOfRTokens = await (0, retriever_1.retrieveNumberOfRTokensOfLastParse)(this.parser); numberOfRTokensNoComments = await (0, retriever_1.retrieveNumberOfRTokensOfLastParse)(this.parser, true); } else { const countChildren = function (node, ignoreComments = false) { let ret = node.type === tree_sitter_types_1.TreeSitterType.Comment && ignoreComments ? 0 : 1; for (const child of node.children) { ret += countChildren(child, ignoreComments); } return ret; }; const root = this.loadedXml.rootNode; numberOfRTokens = countChildren(root); numberOfRTokensNoComments = countChildren(root, true); } (0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined after initialization'); (0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined after initialization'); // collect dataflow graph size const vertices = [...this.dataflow.graph.vertices(true)]; let numberOfEdges = 0; let numberOfCalls = 0; let numberOfDefinitions = 0; for (const [n, info] of vertices) { const outgoingEdges = this.dataflow.graph.outgoingEdges(n); numberOfEdges += outgoingEdges?.size ?? 0; if (info.tag === 'function-call') { numberOfCalls++; } else if (info.tag === 'function-definition') { numberOfDefinitions++; } } let nodes = 0; let nodesNoComments = 0; let commentChars = 0; let commentCharsNoWhitespace = 0; (0, visitor_1.visitAst)(this.normalizedAst.ast, t => { nodes++; const comments = t.info.additionalTokens?.filter(t => t.type === type_1.RType.Comment); if (comments && comments.length > 0) { const content = comments.map(c => c.lexeme ?? '').join(''); commentChars += content.length; commentCharsNoWhitespace += (0, strings_1.withoutWhitespace)(content).length; } else { nodesNoComments++; } return false; }); const storedVertexIndices = this.countStoredVertexIndices(); const storedEnvIndices = this.countStoredEnvIndices(); const overwrittenIndices = storedVertexIndices - storedEnvIndices; const split = loadedContent.split('\n'); const nonWhitespace = (0, strings_1.withoutWhitespace)(loadedContent).length; this.stats = { perSliceMeasurements: this.perSliceMeasurements, memory: this.deltas, request, input: { numberOfLines: split.length, numberOfNonEmptyLines: split.filter(l => l.trim().length > 0).length, numberOfCharacters: loadedContent.length, numberOfCharactersNoComments: loadedContent.length - commentChars, numberOfNonWhitespaceCharacters: nonWhitespace, numberOfNonWhitespaceCharactersNoComments: nonWhitespace - commentCharsNoWhitespace, numberOfRTokens: numberOfRTokens, numberOfRTokensNoComments: numberOfRTokensNoComments, numberOfNormalizedTokens: nodes, numberOfNormalizedTokensNoComments: nodesNoComments }, dataflow: { numberOfNodes: [...this.dataflow.graph.vertices(true)].length, numberOfEdges: numberOfEdges, numberOfCalls: numberOfCalls, numberOfFunctionDefinitions: numberOfDefinitions, sizeOfObject: (0, size_of_1.getSizeOfDfGraph)(this.dataflow.graph), storedVertexIndices: storedVertexIndices, storedEnvIndices: storedEnvIndices, overwrittenIndices: overwrittenIndices, }, // these are all properly initialized in finish() commonMeasurements: new Map(), retrieveTimePerToken: { raw: 0, normalized: 0 }, normalizeTimePerToken: { raw: 0, normalized: 0 }, dataflowTimePerToken: { raw: 0, normalized: 0 }, totalCommonTimePerToken: { raw: 0, normalized: 0 } }; } /** * Counts the number of stored indices in the dataflow graph created by the pointer analysis. */ countStoredVertexIndices() { return this.countStoredIndices(this.dataflow?.out.map(ref => ref) ?? []); } /** * Counts the number of stored indices in the dataflow graph created by the pointer analysis. */ countStoredEnvIndices() { return this.countStoredIndices(this.dataflow?.environment.current.memory.values() ?.flatMap(def => def) .map(def => def) ?? []); } /** * Counts the number of stored indices in the passed definitions. */ countStoredIndices(definitions) { let numberOfIndices = 0; for (const reference of definitions) { if (reference.indicesCollection) { numberOfIndices += this.countIndices(reference.indicesCollection); } } return numberOfIndices; } /** * Recursively counts the number of indices and sub-indices in the given collection. */ countIndices(collection) { let numberOfIndices = 0; for (const indices of collection ?? []) { for (const index of indices.indices) { numberOfIndices++; if ((0, vertex_1.isParentContainerIndex)(index)) { numberOfIndices += this.countIndices(index.subIndices); } } } return numberOfIndices; } /** * Slice for the given {@link SlicingCriteria}. * @see SingleSlicingCriterion * * @returns The per slice stats retrieved for this slicing criteria */ async slice(...slicingCriteria) { exports.benchmarkLogger.trace(`try to slice for criteria ${JSON.stringify(slicingCriteria)}`); this.guardActive(); (0, assert_1.guard)(!this.perSliceMeasurements.has(slicingCriteria), 'do not slice the same criteria combination twice'); const measurements = new stopwatch_1.Measurements(); const stats = { measurements: undefined, slicingCriteria: [], numberOfDataflowNodesSliced: 0, timesHitThreshold: 0, reconstructedCode: { code: '', linesWithAutoSelected: 0 } }; this.perSliceMeasurements.set(slicingCriteria, stats); this.executor.updateRequest({ criterion: slicingCriteria }); const totalStopwatch = measurements.start('total'); const slicedOutput = await this.measureSliceStep('slice', measurements, 'static slicing'); stats.slicingCriteria = [...slicedOutput.decodedCriteria]; stats.reconstructedCode = await this.measureSliceStep('reconstruct', measurements, 'reconstruct code'); totalStopwatch.stop(); exports.benchmarkLogger.debug(`Produced code for ${JSON.stringify(slicingCriteria)}: ${stats.reconstructedCode.code}`); const results = this.executor.getResults(false); if (exports.benchmarkLogger.settings.minLevel >= 3 /* LogLevel.Info */) { exports.benchmarkLogger.info(`mapped slicing criteria: ${slicedOutput.decodedCriteria.map(c => { const node = results.normalize.idMap.get(c.id); return `\n- id: ${c.id}, location: ${JSON.stringify(node?.location)}, lexeme: ${JSON.stringify(node?.lexeme)}`; }).join('')}`); } // if it is not in the dataflow graph it was kept to be safe and should not count to the included nodes stats.numberOfDataflowNodesSliced = [...slicedOutput.result].filter(id => results.dataflow.graph.hasVertex(id, false)).length; stats.timesHitThreshold = slicedOutput.timesHitThreshold; stats.measurements = measurements.get(); return { stats, slice: slicedOutput, code: stats.reconstructedCode }; } /** * Extract the control flow graph using {@link extractCFG} */ extractCFG() { exports.benchmarkLogger.trace('try to extract the control flow graph'); this.guardActive(); (0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined for control flow extraction'); (0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined for control flow extraction'); (0, assert_1.guard)(this.config !== undefined, 'config should be defined for control flow extraction'); const ast = this.normalizedAst; const dfg = this.dataflow.graph; const config = this.config; this.controlFlow = this.measureSimpleStep('extract control flow graph', () => (0, extract_cfg_1.extractCfg)(ast, config, dfg)); } /** * Infer the shape of data frames using abstract interpretation with {@link inferDataFrameShapes} * * @returns The statistics of the data frame shape inference */ inferDataFrameShapes() { exports.benchmarkLogger.trace('try to infer shapes for data frames'); (0, assert_1.guard)(this.stats !== undefined && !this.finished, 'need to call init before, and can not do after finish!'); (0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined for data frame shape inference'); (0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined for data frame shape inference'); (0, assert_1.guard)(this.controlFlow !== undefined, 'controlFlow should be defined for data frame shape inference'); (0, assert_1.guard)(this.config !== undefined, 'config should be defined for data frame shape inference'); const ast = this.normalizedAst; const dfg = this.dataflow.graph; const cfinfo = this.controlFlow; const config = this.config; const stats = { numberOfDataFrameFiles: 0, numberOfNonDataFrameFiles: 0, numberOfResultConstraints: 0, numberOfResultingValues: 0, numberOfResultingTop: 0, numberOfResultingBottom: 0, numberOfEmptyNodes: 0, numberOfOperationNodes: 0, numberOfValueNodes: 0, sizeOfInfo: 0, perNodeStats: new Map() }; const result = this.measureSimpleStep('infer data frame shapes', () => (0, shape_inference_1.inferDataFrameShapes)(cfinfo, dfg, ast, config)); stats.numberOfResultConstraints = result.size; for (const value of result.values()) { if ((0, domain_1.equalDataFrameDomain)(value, domain_1.DataFrameTop)) { stats.numberOfResultingTop++; } else if ((0, domain_1.equalDataFrameDomain)(value, domain_1.DataFrameBottom)) { stats.numberOfResultingBottom++; } else { stats.numberOfResultingValues++; } } (0, visitor_1.visitAst)(this.normalizedAst.ast, (node) => { if (node.info.dataFrame === undefined) { return; } stats.sizeOfInfo += (0, size_of_1.safeSizeOf)([node.info.dataFrame]); const expression = (0, absint_info_1.hasDataFrameExpressionInfo)(node) ? node.info.dataFrame : undefined; const value = node.info.dataFrame.domain?.get(node.info.id); // Only store per-node information for nodes representing expressions or nodes with abstract values if (expression === undefined && value === undefined) { stats.numberOfEmptyNodes++; return; } const nodeStats = { numberOfEntries: node.info.dataFrame?.domain?.size ?? 0 }; if (expression !== undefined) { nodeStats.mappedOperations = expression.operations.map(op => op.operation); stats.numberOfOperationNodes++; if (value !== undefined) { nodeStats.inferredColNames = value.colnames === domain_1.ColNamesTop ? 'top' : value.colnames.length; nodeStats.inferredColCount = this.getInferredSize(value.cols); nodeStats.inferredRowCount = this.getInferredSize(value.rows); nodeStats.approxRangeColCount = value.cols === domain_1.IntervalBottom ? 0 : value.cols[1] - value.cols[0]; nodeStats.approxRangeRowCount = value.rows === domain_1.IntervalBottom ? 0 : value.rows[1] - value.rows[0]; } } if (value !== undefined) { stats.numberOfValueNodes++; } stats.perNodeStats.set(node.info.id, nodeStats); }); if (stats.numberOfOperationNodes > 0) { stats.numberOfDataFrameFiles = 1; } else { stats.numberOfNonDataFrameFiles = 1; } this.stats.dataFrameShape = stats; return stats; } getInferredSize(value) { if ((0, domain_1.equalInterval)(value, domain_1.IntervalTop)) { return 'top'; } else if (value === domain_1.IntervalBottom) { return 'bottom'; } else if (!isFinite(value[1])) { return 'infinite'; } return Math.floor((value[0] + value[1]) / 2); } /** Bridging the gap between the new internal and the old names for the benchmarking */ async measureCommonStep(expectedStep, keyToMeasure) { const memoryInit = process.memoryUsage(); const { result } = await this.commonMeasurements.measureAsync(keyToMeasure, () => this.executor.nextStep(expectedStep)); const memoryEnd = process.memoryUsage(); this.deltas.set(keyToMeasure, { heap: memoryEnd.heapUsed - memoryInit.heapUsed, rss: memoryEnd.rss - memoryInit.rss, external: memoryEnd.external - memoryInit.external, buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers }); return result; } measureSimpleStep(keyToMeasure, measurement) { const memoryInit = process.memoryUsage(); const result = this.commonMeasurements.measure(keyToMeasure, measurement); const memoryEnd = process.memoryUsage(); this.deltas.set(keyToMeasure, { heap: memoryEnd.heapUsed - memoryInit.heapUsed, rss: memoryEnd.rss - memoryInit.rss, external: memoryEnd.external - memoryInit.external, buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers }); return result; } async measureSliceStep(expectedStep, measure, keyToMeasure) { const { result } = await measure.measureAsync(keyToMeasure, () => this.executor.nextStep(expectedStep)); return result; } guardActive() { (0, assert_1.guard)(this.stats !== undefined && !this.finished, 'need to call init before, and can not do after finish!'); } /** * Call {@link slice} for all slicing criteria that match the given filter. * See {@link collectAllSlicingCriteria} for details. * <p> * the `report` function will be called *before* each *individual* slice is performed. * * @returns The number of slices that were produced * * @see collectAllSlicingCriteria * @see SlicingCriteriaFilter */ async sliceForAll(filter, report = () => { }, options = {}) { const { sampleCount, maxSliceCount, sampleStrategy } = { sampleCount: -1, maxSliceCount: -1, sampleStrategy: 'random', ...options }; this.guardActive(); let count = 0; let allCriteria = [...(0, collect_all_1.collectAllSlicingCriteria)(this.normalizedAst.ast, filter)]; // Cancel slicing if the number of slices exceeds the limit if (maxSliceCount > 0 && allCriteria.length > maxSliceCount) { return -allCriteria.length; } if (sampleCount > 0) { if (sampleStrategy === 'equidistant') { allCriteria = (0, arrays_1.equidistantSampling)(allCriteria, sampleCount, 'ceil'); } else { const random = options.seed ? (0, seedrandom_1.default)(options.seed) : Math.random; allCriteria.sort(() => random() - 0.5); allCriteria.length = Math.min(allCriteria.length, sampleCount); } } for (const slicingCriteria of allCriteria) { report(count, allCriteria.length, allCriteria); await this.slice(...slicingCriteria); count++; } return count; } /** * Retrieves the final stats and closes the shell session. * Can be called multiple times to retrieve the stored stats, but will only close the session once (the first time). */ finish() { (0, assert_1.guard)(this.stats !== undefined, 'need to call init before finish'); if (!this.finished) { this.commonMeasurements.measure('close R session', () => this.parser.close()); this.totalStopwatch.stop(); this.finished = true; } this.stats.commonMeasurements = this.commonMeasurements.get(); const retrieveTime = Number(this.stats.commonMeasurements.get('retrieve AST from R code')); const normalizeTime = Number(this.stats.commonMeasurements.get('normalize R AST')); const dataflowTime = Number(this.stats.commonMeasurements.get('produce dataflow information')); const controlFlowTime = Number(this.stats.commonMeasurements.get('extract control flow graph')); const dataFrameShapeTime = Number(this.stats.commonMeasurements.get('infer data frame shapes')); this.stats.retrieveTimePerToken = { raw: retrieveTime / this.stats.input.numberOfRTokens, normalized: retrieveTime / this.stats.input.numberOfNormalizedTokens }; this.stats.normalizeTimePerToken = { raw: normalizeTime / this.stats.input.numberOfRTokens, normalized: normalizeTime / this.stats.input.numberOfNormalizedTokens }; this.stats.dataflowTimePerToken = { raw: dataflowTime / this.stats.input.numberOfRTokens, normalized: dataflowTime / this.stats.input.numberOfNormalizedTokens }; this.stats.totalCommonTimePerToken = { raw: (retrieveTime + normalizeTime + dataflowTime) / this.stats.input.numberOfRTokens, normalized: (retrieveTime + normalizeTime + dataflowTime) / this.stats.input.numberOfNormalizedTokens }; this.stats.controlFlowTimePerToken = !isNaN(controlFlowTime) ? { raw: controlFlowTime / this.stats.input.numberOfRTokens, normalized: controlFlowTime / this.stats.input.numberOfNormalizedTokens, } : undefined; this.stats.dataFrameShapeTimePerToken = !isNaN(dataFrameShapeTime) ? { raw: dataFrameShapeTime / this.stats.input.numberOfRTokens, normalized: dataFrameShapeTime / this.stats.input.numberOfNormalizedTokens, } : undefined; return { stats: this.stats, parse: typeof this.loadedXml === 'string' ? this.loadedXml : JSON.stringify(this.loadedXml), dataflow: this.dataflow, normalize: this.normalizedAst }; } /** * Only call in case of an error - if the session must be closed and the benchmark itself is to be considered failed/dead. */ ensureSessionClosed() { this.parser?.close(); } } exports.BenchmarkSlicer = BenchmarkSlicer; //# sourceMappingURL=slicer.js.map