@eagleoutice/flowr
Version:
Static Dataflow Analyzer and Program Slicer for the R Programming Language
481 lines • 24.1 kB
JavaScript
"use strict";
/**
* Provides a top-level slicer that can be used to slice code *and* retrieve stats.
* @module
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.BenchmarkSlicer = exports.benchmarkLogger = void 0;
const stopwatch_1 = require("./stopwatch");
const seedrandom_1 = __importDefault(require("seedrandom"));
const log_1 = require("../util/log");
const assert_1 = require("../util/assert");
const strings_1 = require("../util/text/strings");
const parse_1 = require("../slicing/criterion/parse");
const default_pipelines_1 = require("../core/steps/pipeline/default-pipelines");
const retriever_1 = require("../r-bridge/retriever");
const collect_all_1 = require("../slicing/criterion/collect-all");
const size_of_1 = require("./stats/size-of");
const shell_1 = require("../r-bridge/shell");
const tree_sitter_types_1 = require("../r-bridge/lang-4.x/tree-sitter/tree-sitter-types");
const tree_sitter_executor_1 = require("../r-bridge/lang-4.x/tree-sitter/tree-sitter-executor");
const vertex_1 = require("../dataflow/graph/vertex");
const arrays_1 = require("../util/collections/arrays");
const config_1 = require("../config");
const extract_cfg_1 = require("../control-flow/extract-cfg");
const shape_inference_1 = require("../abstract-interpretation/data-frame/shape-inference");
const lattice_1 = require("../abstract-interpretation/domains/lattice");
const set_range_domain_1 = require("../abstract-interpretation/domains/set-range-domain");
const fs_1 = __importDefault(require("fs"));
const flowr_analyzer_context_1 = require("../project/context/flowr-analyzer-context");
const r_project_1 = require("../r-bridge/lang-4.x/ast/model/nodes/r-project");
const r_comment_1 = require("../r-bridge/lang-4.x/ast/model/nodes/r-comment");
const call_graph_1 = require("../dataflow/graph/call-graph");
/**
* The logger to be used for benchmarking as a global object.
* It is a sub-logger named `benchmark`, derived from the shared `log` instance exported by `../util/log`.
*/
exports.benchmarkLogger = log_1.log.getSubLogger({ name: 'benchmark' });
/**
* Top-level slicer used for benchmarking: it runs the parse, normalize and dataflow steps once (see `init`),
* can then slice for arbitrary criteria (see `slice` / `sliceForAll`), and records timing and memory
* statistics along the way, which are returned by `finish`.
*/
class BenchmarkSlicer {
/** Measures all data recorded *once* per slicer (complete setup up to the dataflow graph creation) */
commonMeasurements = new stopwatch_1.Measurements();
/** Statistics of every performed slice; `slice` stores them under the criteria array it was called with */
perSliceMeasurements = new Map();
/** Differences in `process.memoryUsage()` taken around each measured step, keyed by the measurement name */
deltas = new Map();
/** Name of the parser engine to use; `init` creates an `RShell` for 'r-shell' and a `TreeSitterExecutor` otherwise */
parserName;
/** Analyzer context created from the request and the config in `init` */
context;
/** Collected statistics; `undefined` until `init` has completed, which is what the "initialized" guards check */
stats;
/** The `parsed` results of all files returned by the 'parse' step */
loadedXml;
/** Result of the 'dataflow' step */
dataflow;
/** Result of the 'normalize' step */
normalizedAst;
/** Control flow information, only set after `extractCFG` was called */
controlFlow;
/** Call graph, only set after `extractCG` was called */
callGraph;
/** Stopwatch for the 'total' measurement; started in the constructor and stopped by the first call to `finish` */
totalStopwatch;
/** Set to `true` by the first call to `finish`; afterwards no further steps may be performed */
finished = false;
// Yes, this is unclean, but we know that we assign the executor during the initialization and this saves us from having to check for nullability every time
executor = null;
// The parser is likewise only assigned in `init` and stays `null` until then
parser = null;
constructor(parserName) {
this.totalStopwatch = this.commonMeasurements.start('total');
this.parserName = parserName;
}
/**
* Initialize the slicer on the given request.
* Can only be called once for each instance.
*/
async init(request, config, autoSelectIf, threshold) {
(0, assert_1.guard)(this.stats === undefined, 'cannot initialize the slicer twice');
// we know these are in sync so we just cast to one of them
this.parser = await this.commonMeasurements.measure('initialize R session', async () => {
if (this.parserName === 'r-shell') {
return new shell_1.RShell(config_1.FlowrConfig.getForEngine(config, 'r-shell'));
}
else {
await tree_sitter_executor_1.TreeSitterExecutor.initTreeSitter(config_1.FlowrConfig.getForEngine(config, 'tree-sitter'));
return new tree_sitter_executor_1.TreeSitterExecutor();
}
});
this.context = (0, flowr_analyzer_context_1.contextFromInput)({ ...request }, config);
this.executor = (0, default_pipelines_1.createSlicePipeline)(this.parser, {
context: this.context,
criterion: [],
autoSelectIf,
threshold,
});
this.loadedXml = (await this.measureCommonStep('parse', 'retrieve AST from R code')).files.map(p => p.parsed);
this.normalizedAst = await this.measureCommonStep('normalize', 'normalize R AST');
this.dataflow = await this.measureCommonStep('dataflow', 'produce dataflow information');
this.executor.switchToRequestStage();
await this.calculateStatsAfterInit(request);
}
async calculateStatsAfterInit(request) {
const loadedContent = request.request === 'text' ? request.content : fs_1.default.readFileSync(request.content, 'utf-8');
let numberOfRTokens;
let numberOfRTokensNoComments;
if (this.parser.name === 'r-shell') {
// retrieve number of R tokens - flowr_parsed should still contain the last parsed code
numberOfRTokens = await (0, retriever_1.retrieveNumberOfRTokensOfLastParse)(this.parser);
numberOfRTokensNoComments = await (0, retriever_1.retrieveNumberOfRTokensOfLastParse)(this.parser, true);
}
else {
const countChildren = function (node, ignoreComments = false) {
let ret = node.type === tree_sitter_types_1.TreeSitterType.Comment && ignoreComments ? 0 : 1;
for (const child of node.children) {
ret += countChildren(child, ignoreComments);
}
return ret;
};
const root = this.loadedXml.map(t => t.rootNode);
numberOfRTokens = root.map(r => countChildren(r)).reduce((a, b) => a + b, 0);
numberOfRTokensNoComments = root.map(r => countChildren(r, true)).reduce((a, b) => a + b, 0);
}
(0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined after initialization');
(0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined after initialization');
// collect dataflow graph size
const vertices = this.dataflow.graph.vertices(true);
let numberOfEdges = 0;
let numberOfCalls = 0;
let numberOfDefinitions = 0;
for (const [n, info] of vertices) {
const outgoingEdges = this.dataflow.graph.outgoingEdges(n);
numberOfEdges += outgoingEdges?.size ?? 0;
if (info.tag === vertex_1.VertexType.FunctionCall) {
numberOfCalls++;
}
else if (info.tag === vertex_1.VertexType.FunctionDefinition) {
numberOfDefinitions++;
}
}
let nodes = 0;
let nodesNoComments = 0;
let commentChars = 0;
let commentCharsNoWhitespace = 0;
r_project_1.RProject.visitAst(this.normalizedAst.ast, t => {
nodes++;
const comments = t.info.adToks?.filter(r_comment_1.RComment.is);
if (comments && comments.length > 0) {
const content = comments.map(c => c.lexeme ?? '').join('');
commentChars += content.length;
commentCharsNoWhitespace += (0, strings_1.withoutWhitespace)(content).length;
}
else {
nodesNoComments++;
}
return false;
});
const split = loadedContent.split('\n');
const nonWhitespace = (0, strings_1.withoutWhitespace)(loadedContent).length;
this.stats = {
perSliceMeasurements: this.perSliceMeasurements,
memory: this.deltas,
request,
input: {
numberOfLines: split.length,
numberOfNonEmptyLines: split.filter(l => l.trim().length > 0).length,
numberOfCharacters: loadedContent.length,
numberOfCharactersNoComments: loadedContent.length - commentChars,
numberOfNonWhitespaceCharacters: nonWhitespace,
numberOfNonWhitespaceCharactersNoComments: nonWhitespace - commentCharsNoWhitespace,
numberOfRTokens: numberOfRTokens,
numberOfRTokensNoComments: numberOfRTokensNoComments,
numberOfNormalizedTokens: nodes,
numberOfNormalizedTokensNoComments: nodesNoComments
},
dataflow: {
numberOfNodes: this.dataflow.graph.vertices(true).toArray().length,
numberOfEdges: numberOfEdges,
numberOfCalls: numberOfCalls,
numberOfFunctionDefinitions: numberOfDefinitions,
sizeOfObject: (0, size_of_1.getSizeOfDfGraph)(this.dataflow.graph),
},
// these are all properly initialized in finish()
commonMeasurements: new Map(),
retrieveTimePerToken: { raw: 0, normalized: 0 },
normalizeTimePerToken: { raw: 0, normalized: 0 },
dataflowTimePerToken: { raw: 0, normalized: 0 },
totalCommonTimePerToken: { raw: 0, normalized: 0 }
};
}
/**
* Slice for the given {@link SlicingCriteria}.
* @see SingleSlicingCriterion
* @returns The per slice stats retrieved for this slicing criteria
*/
async slice(...slicingCriteria) {
exports.benchmarkLogger.trace(`try to slice for criteria ${JSON.stringify(slicingCriteria)}`);
this.guardActive();
(0, assert_1.guard)(!this.perSliceMeasurements.has(slicingCriteria), 'do not slice the same criteria combination twice');
const measurements = new stopwatch_1.Measurements();
const stats = {
measurements: undefined,
slicingCriteria: [],
numberOfDataflowNodesSliced: 0,
timesHitThreshold: 0,
reconstructedCode: {
code: '',
linesWithAutoSelected: 0
}
};
this.perSliceMeasurements.set(slicingCriteria, stats);
this.executor.updateRequest({ criterion: slicingCriteria });
const totalStopwatch = measurements.start('total');
const slicedOutput = await this.measureSliceStep('slice', measurements, 'static slicing');
const decodedCriteria = parse_1.SlicingCriteria.decodeAll(slicingCriteria, this.normalizedAst.idMap);
stats.slicingCriteria = Array.from(decodedCriteria);
stats.reconstructedCode = await this.measureSliceStep('reconstruct', measurements, 'reconstruct code');
totalStopwatch.stop();
exports.benchmarkLogger.debug(`Produced code for ${JSON.stringify(slicingCriteria)}: ${stats.reconstructedCode.code}`);
const results = this.executor.getResults(false);
if (exports.benchmarkLogger.settings.minLevel >= 3 /* LogLevel.Info */) {
exports.benchmarkLogger.info(`mapped slicing criteria: ${slicedOutput.slicedFor.map(id => {
const node = results.normalize.idMap.get(id);
return `\n- id: ${id}, location: ${JSON.stringify(node?.location)}, lexeme: ${JSON.stringify(node?.lexeme)}`;
}).join('')}`);
}
// if it is not in the dataflow graph it was kept to be safe and should not count to the included nodes
stats.numberOfDataflowNodesSliced = Array.from(slicedOutput.result).filter(id => results.dataflow.graph.hasVertex(id, false)).length;
stats.timesHitThreshold = slicedOutput.timesHitThreshold;
stats.measurements = measurements.get();
return {
stats,
slice: slicedOutput,
code: stats.reconstructedCode
};
}
/**
* Extract the control flow graph using {@link extractCFG}
*/
extractCFG() {
exports.benchmarkLogger.trace('try to extract the control flow graph');
this.guardActive();
(0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined for control flow extraction');
const ast = this.normalizedAst;
this.controlFlow = this.measureSimpleStep('extract control flow graph', () => (0, extract_cfg_1.extractCfg)(ast, this.context, undefined, undefined, true));
}
extractCG() {
exports.benchmarkLogger.trace('try to extract the call graph');
this.guardActive();
const g = this.dataflow?.graph;
(0, assert_1.guard)(g !== undefined, 'dataflow should be defined for call graph extraction');
this.callGraph = this.measureSimpleStep('extract call graph', () => call_graph_1.CallGraph.compute(g));
}
/**
* Infer the shape of data frames using abstract interpretation with {@link inferDataFrameShapes}
* @returns The statistics of the data frame shape inference
*/
inferDataFrameShapes() {
exports.benchmarkLogger.trace('try to infer shapes for data frames');
(0, assert_1.guard)(this.stats !== undefined && !this.finished, 'need to call init before, and can not do after finish!');
(0, assert_1.guard)(this.normalizedAst !== undefined, 'normalizedAst should be defined for data frame shape inference');
(0, assert_1.guard)(this.dataflow !== undefined, 'dataflow should be defined for data frame shape inference');
(0, assert_1.guard)(this.controlFlow !== undefined, 'controlFlow should be defined for data frame shape inference');
(0, assert_1.guard)(this.context !== undefined, 'context should be defined for data frame shape inference');
const ast = this.normalizedAst;
const dfg = this.dataflow.graph;
const cfinfo = this.controlFlow;
const stats = {
numberOfDataFrameFiles: 0,
numberOfNonDataFrameFiles: 0,
numberOfResultConstraints: 0,
numberOfResultingValues: 0,
numberOfResultingBottom: 0,
numberOfResultingTop: 0,
numberOfEmptyNodes: 0,
numberOfOperationNodes: 0,
numberOfValueNodes: 0,
sizeOfInfo: 0,
perNodeStats: new Map()
};
const inference = new shape_inference_1.DataFrameShapeInferenceVisitor({ controlFlow: cfinfo, dfg, normalizedAst: ast, ctx: this.context });
this.measureSimpleStep('infer data frame shapes', () => inference.start());
const result = inference.getEndState();
stats.numberOfResultConstraints = result.value.size;
stats.sizeOfInfo = (0, size_of_1.safeSizeOf)([inference.getAbstractTrace()]);
for (const value of result.value.values()) {
if (value.isTop()) {
stats.numberOfResultingTop++;
}
else if (value.isBottom()) {
stats.numberOfResultingBottom++;
}
else {
stats.numberOfResultingValues++;
}
}
r_project_1.RProject.visitAst(this.normalizedAst.ast, node => {
const operations = inference.getAbstractOperations(node.info.id);
const value = inference.getAbstractValue(node.info.id);
// Only store per-node information for nodes representing expressions or nodes with abstract values
if (operations === undefined && value === undefined) {
stats.numberOfEmptyNodes++;
return;
}
const nodeStats = {
numberOfEntries: inference.getAbstractState(node.info.id)?.value.size ?? 0
};
if (operations !== undefined) {
nodeStats.mappedOperations = operations.map(op => op.operation);
stats.numberOfOperationNodes++;
if (value !== undefined) {
nodeStats.inferredColNames = this.getInferredNumber(value.colnames);
nodeStats.inferredColCount = this.getInferredNumber(value.cols);
nodeStats.inferredRowCount = this.getInferredNumber(value.rows);
nodeStats.approxRangeColNames = this.getInferredRange(value.colnames);
nodeStats.approxRangeColCount = this.getInferredRange(value.cols);
nodeStats.approxRangeRowCount = this.getInferredRange(value.rows);
}
}
if (value !== undefined) {
stats.numberOfValueNodes++;
}
stats.perNodeStats.set(node.info.id, nodeStats);
});
if (stats.numberOfOperationNodes > 0) {
stats.numberOfDataFrameFiles = 1;
}
else {
stats.numberOfNonDataFrameFiles = 1;
}
this.stats.dataFrameShape = stats;
return stats;
}
getInferredRange(value) {
if (value.isValue()) {
if (value instanceof set_range_domain_1.SetRangeDomain) {
return value.value.range === lattice_1.Top ? Infinity : value.value.range.size;
}
else {
return value.value[1] - value.value[0];
}
}
return 0;
}
getInferredNumber(value) {
if (value.isTop()) {
return 'top';
}
else if (value.isValue()) {
if (value instanceof set_range_domain_1.SetRangeDomain) {
if (value.value.range === lattice_1.Top) {
return 'infinite';
}
return Math.floor(value.value.min.size + (value.value.range.size / 2));
}
else {
if (!isFinite(value.value[1])) {
return 'infinite';
}
return Math.floor((value.value[0] + value.value[1]) / 2);
}
}
return 'bottom';
}
/** Bridging the gap between the new internal and the old names for the benchmarking */
async measureCommonStep(expectedStep, keyToMeasure) {
const memoryInit = process.memoryUsage();
const { result } = await this.commonMeasurements.measureAsync(keyToMeasure, () => this.executor.nextStep(expectedStep));
const memoryEnd = process.memoryUsage();
this.deltas.set(keyToMeasure, {
heap: memoryEnd.heapUsed - memoryInit.heapUsed,
rss: memoryEnd.rss - memoryInit.rss,
external: memoryEnd.external - memoryInit.external,
buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers
});
return result;
}
measureSimpleStep(keyToMeasure, measurement) {
const memoryInit = process.memoryUsage();
const result = this.commonMeasurements.measure(keyToMeasure, measurement);
const memoryEnd = process.memoryUsage();
this.deltas.set(keyToMeasure, {
heap: memoryEnd.heapUsed - memoryInit.heapUsed,
rss: memoryEnd.rss - memoryInit.rss,
external: memoryEnd.external - memoryInit.external,
buffs: memoryEnd.arrayBuffers - memoryInit.arrayBuffers
});
return result;
}
async measureSliceStep(expectedStep, measure, keyToMeasure) {
const { result } = await measure.measureAsync(keyToMeasure, () => this.executor.nextStep(expectedStep));
return result;
}
guardActive() {
(0, assert_1.guard)(this.stats !== undefined && !this.finished, 'need to call init before, and can not do after finish!');
}
/**
* Call {@link slice} for all slicing criteria that match the given filter.
* See {@link collectAllSlicingCriteria} for details.
* <p>
* the `report` function will be called *before* each *individual* slice is performed.
* @returns The number of slices that were produced
* @see collectAllSlicingCriteria
* @see SlicingCriteriaFilter
*/
async sliceForAll(filter, report = () => { }, options = {}) {
const { sampleCount, maxSliceCount, sampleStrategy } = { sampleCount: -1, maxSliceCount: -1, sampleStrategy: 'random', ...options };
this.guardActive();
let count = 0;
let allCriteria = [...(0, collect_all_1.collectAllSlicingCriteria)(this.normalizedAst.ast, filter)];
// Cancel slicing if the number of slices exceeds the limit
if (maxSliceCount > 0 && allCriteria.length > maxSliceCount) {
return -allCriteria.length;
}
if (sampleCount > 0) {
if (sampleStrategy === 'equidistant') {
allCriteria = (0, arrays_1.equidistantSampling)(allCriteria, sampleCount, 'ceil');
}
else {
const random = options.seed ? (0, seedrandom_1.default)(options.seed) : Math.random;
allCriteria.sort(() => random() - 0.5);
allCriteria.length = Math.min(allCriteria.length, sampleCount);
}
}
for (const slicingCriteria of allCriteria) {
report(count, allCriteria.length, allCriteria);
await this.slice(...slicingCriteria);
count++;
}
return count;
}
/**
* Retrieves the final stats and closes the shell session.
* Can be called multiple times to retrieve the stored stats, but will only close the session once (the first time).
*/
finish() {
(0, assert_1.guard)(this.stats !== undefined, 'need to call init before finish');
if (!this.finished) {
this.commonMeasurements.measure('close R session', () => this.parser.close());
this.totalStopwatch.stop();
this.finished = true;
}
this.stats.commonMeasurements = this.commonMeasurements.get();
const retrieveTime = Number(this.stats.commonMeasurements.get('retrieve AST from R code'));
const normalizeTime = Number(this.stats.commonMeasurements.get('normalize R AST'));
const dataflowTime = Number(this.stats.commonMeasurements.get('produce dataflow information'));
const controlFlowTime = Number(this.stats.commonMeasurements.get('extract control flow graph'));
const callGraphTime = Number(this.stats.commonMeasurements.get('extract call graph'));
const dataFrameShapeTime = Number(this.stats.commonMeasurements.get('infer data frame shapes'));
this.stats.retrieveTimePerToken = {
raw: retrieveTime / this.stats.input.numberOfRTokens,
normalized: retrieveTime / this.stats.input.numberOfNormalizedTokens
};
this.stats.normalizeTimePerToken = {
raw: normalizeTime / this.stats.input.numberOfRTokens,
normalized: normalizeTime / this.stats.input.numberOfNormalizedTokens
};
this.stats.dataflowTimePerToken = {
raw: dataflowTime / this.stats.input.numberOfRTokens,
normalized: dataflowTime / this.stats.input.numberOfNormalizedTokens
};
this.stats.totalCommonTimePerToken = {
raw: (retrieveTime + normalizeTime + dataflowTime) / this.stats.input.numberOfRTokens,
normalized: (retrieveTime + normalizeTime + dataflowTime) / this.stats.input.numberOfNormalizedTokens
};
this.stats.controlFlowTimePerToken = Number.isNaN(controlFlowTime) ? undefined : {
raw: controlFlowTime / this.stats.input.numberOfRTokens,
normalized: controlFlowTime / this.stats.input.numberOfNormalizedTokens,
};
this.stats.callGraphTimePerToken = Number.isNaN(callGraphTime) ? undefined : {
raw: callGraphTime / this.stats.input.numberOfRTokens,
normalized: callGraphTime / this.stats.input.numberOfNormalizedTokens,
};
this.stats.dataFrameShapeTimePerToken = Number.isNaN(dataFrameShapeTime) ? undefined : {
raw: dataFrameShapeTime / this.stats.input.numberOfRTokens,
normalized: dataFrameShapeTime / this.stats.input.numberOfNormalizedTokens,
};
return {
stats: this.stats,
parse: typeof this.loadedXml === 'string' ? this.loadedXml : JSON.stringify(this.loadedXml),
dataflow: this.dataflow,
normalize: this.normalizedAst
};
}
/**
* Only call in case of an error - if the session must be closed and the benchmark itself is to be considered failed/dead.
*/
ensureSessionClosed() {
this.parser?.close();
}
}
exports.BenchmarkSlicer = BenchmarkSlicer;
//# sourceMappingURL=slicer.js.map