UNPKG

@eagleoutice/flowr

Version:

Static Dataflow Analyzer and Program Slicer for the R Programming Language

128 lines 6.5 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.histogramsFromClusters = histogramsFromClusters;
exports.histogramFromNumbers = histogramFromNumbers;
exports.histograms2table = histograms2table;
const fs_1 = __importDefault(require("fs"));
const bimap_1 = require("../../../util/bimap");
const defaultmap_1 = require("../../../util/defaultmap");
const assert_1 = require("../../../util/assert");
const summarizer_1 = require("../../../util/summarizer");
const arrays_1 = require("../../../util/arrays");

/**
 * Builds one histogram per clustered value of a {@link ClusterReport}.
 *
 * For every context id in `report.contextIdMap`, the count of each value in that context is
 * collected (optionally divided by the number of lines of the corresponding file), and the
 * resulting list of numbers per value is handed to {@link histogramFromNumbers}.
 *
 * @param report - The report to collect histogram information from
 * @param binSize - Size of each bin (see {@link histogramFromNumbers} for why the bin-size and not the bin-count is given)
 * @param relateValuesToNumberOfLines - If true, each count is divided by the number of lines of the file it was
 *                                      counted in (the file is read from disk and split at `\n`)
 * @param filter - If given, only histograms for these values are produced; if empty, all values are used
 * @returns One histogram per (selected) value
 */
function histogramsFromClusters(report, binSize, relateValuesToNumberOfLines, ...filter) {
    const valueInfos = [...report.valueInfoMap.entries()];
    const idToFilename = new bimap_1.BiMap(report.contextIdMap.entries());
    // step 1: gather, for each value, its (possibly normalized) count in every context
    const countsPerValue = new defaultmap_1.DefaultMap(() => []);
    for (const contextId of report.contextIdMap.values()) {
        // resolve the file behind the context id
        const filename = idToFilename.getKey(contextId);
        (0, assert_1.guard)(filename !== undefined, `filename for id ${contextId} is undefined`);
        // without normalization every count is divided by 1, i.e., left untouched
        let divisor = 1;
        if (relateValuesToNumberOfLines) {
            divisor = fs_1.default.readFileSync(filename, 'utf-8').split('\n').length;
        }
        for (const [value, perContext] of valueInfos) {
            countsPerValue.get(value).push(perContext.get(contextId) / divisor);
        }
    }
    // step 2: turn the collected numbers into histograms, honoring the optional filter
    const result = [];
    for (const [name, counts] of countsPerValue.entries()) {
        const selected = filter.length === 0 || filter.includes(name);
        if (selected) {
            result.push(histogramFromNumbers(name, binSize, counts));
        }
    }
    return result;
}

/**
 * Creates a histogram for the given numbers.
 *
 * The bin-size is required (instead of a desired number of bins from which the size would be derived),
 * because several histograms of different datasets are produced and should remain comparable.
 * Bin `0` is an extra bin that only counts values equal to the minimum; every other value `v` is
 * counted in bin `floor((v - min) / binSize) + 1`.
 *
 * @param name - Name stored in the resulting histogram
 * @param binSize - Width of each bin, must be greater than 0
 * @param values - The numbers to bin, must not be empty
 * @returns The histogram (`name`, `bins`, `binSize`) together with the summary of `values`
 */
function histogramFromNumbers(name, binSize, values) {
    (0, assert_1.guard)(binSize > 0, `binSize must be greater than 0, but was ${binSize}`);
    (0, assert_1.guard)(values.length > 0, 'values must not be empty');
    const summarized = (0, summarizer_1.summarizeMeasurement)(values);
    const { min, max } = summarized;
    // "+ 1" at the end accounts for the dedicated minimum bin at index 0
    const numberOfBins = Math.ceil((max - min + 1) / binSize) + 1;
    const bins = new Array(numberOfBins).fill(0);
    for (const value of values) {
        if (value === min) {
            bins[0]++;
        }
        else {
            bins[Math.floor((value - min) / binSize) + 1]++;
        }
    }
    return { name, bins, binSize, ...summarized };
}

/**
 * Converts histograms created by {@link histogramFromNumbers} into a CSV-like table.
 * All histograms must share the same bin-size (this is checked), and the `min` and `binSize` of the
 * first histogram are used to label the bins.
 *
 * The table has the following columns:
 * - `bin`  - The index of the bin
 * - `from` - `min + (bin - 1) * binSize` (for bin `0`: `min`, formatted with three decimals)
 * - `to`   - `min + bin * binSize` (for bin `0`: `min`, formatted with three decimals)
 * - one column per histogram (header is the JSON-encoded name), holding the count of values in that bin
 *
 * NOTE(review): {@link histogramFromNumbers} assigns bins with `floor`, so a value equal to `to` is counted
 * in the next bin; confirm whether the bounds are meant to be read as exclusive-lower/inclusive-upper.
 *
 * @param histograms - The histograms to convert (assumed to have the same ranges and bins)
 * @param countAsDensity - If true, each count is divided by the total number of values of its histogram
 *                         (individually for each histogram, similar to the pgfplots `hist/density` option)
 * @returns The table as `header` and `rows` of strings
 */
function histograms2table(histograms, countAsDensity = false) {
    (0, assert_1.guard)(histograms.length > 0, 'there must be at least one histogram to convert to a table');
    const mostBins = guardForLargestBinSize(histograms);
    const names = histograms.map(h => JSON.stringify(h.name));
    const header = ['bin', 'from', 'to', ...names];
    const sums = histograms.map(h => (0, arrays_1.arraySum)(h.bins));
    const { min, binSize } = histograms[0];
    const rows = [];
    for (let binIndex = 0; binIndex < mostBins; binIndex++) {
        const row = new Array(histograms.length + 3);
        const isMinimumBin = binIndex === 0;
        row[0] = String(binIndex);
        row[1] = isMinimumBin ? min.toFixed(3) : String((binIndex - 1) * binSize + min);
        row[2] = isMinimumBin ? min.toFixed(3) : String((binIndex) * binSize + min);
        // the remaining columns hold the per-histogram counts
        writeRoResultsForHistograms(histograms, binIndex, row, countAsDensity, sums);
        rows.push(row);
    }
    return { header, rows };
}

/**
 * Ensures that all histograms share the bin-size of the first one and
 * returns the largest number of bins found among them.
 */
function guardForLargestBinSize(histograms) {
    const [reference, ...others] = histograms;
    let mostBins = reference.bins.length;
    for (const other of others) {
        (0, assert_1.guard)(other.binSize === reference.binSize, `histograms must have the same bin-size, but ${other.name} has ${other.binSize} instead of ${reference.binSize}`);
        mostBins = Math.max(mostBins, other.bins.length);
    }
    return mostBins;
}

/**
 * Writes the count (or density) of every histogram for the bin `binIndex` into `row`,
 * starting at column 3 (the first three columns are `bin`, `from`, and `to`).
 */
function writeRoResultsForHistograms(histograms, binIndex, row, countAsDensity, sums) {
    for (const [column, { bins }] of histograms.entries()) {
        // does not have to be performant...
        if (binIndex < bins.length) {
            const count = bins[binIndex];
            row[column + 3] = String(countAsDensity ? count / sums[column] : count);
        }
        else {
            /* in a histogram, 0 is the best default value for bins that are not present -- no value appeared in the corresponding bin */
            row[column + 3] = '0';
        }
    }
}
//# sourceMappingURL=histogram.js.map