/*
 * @eagleoutice/flowr
 * Version:
 * Static Dataflow Analyzer and Program Slicer for the R Programming Language
 * 128 lines • 6.5 kB
 * JavaScript
 */
;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.histogramsFromClusters = histogramsFromClusters;
exports.histogramFromNumbers = histogramFromNumbers;
exports.histograms2table = histograms2table;
const fs_1 = __importDefault(require("fs"));
const bimap_1 = require("../../../util/bimap");
const defaultmap_1 = require("../../../util/defaultmap");
const assert_1 = require("../../../util/assert");
const summarizer_1 = require("../../../util/summarizer");
const arrays_1 = require("../../../util/arrays");
/**
* Produces column-wise histogram-information based on a {@link ClusterReport}.
*
* Let's suppose you want histograms for the Assignments feature.
* By default, for each clustered value, a histogram is produced (can be configured by `filter`).
*
* @param report - The report to collect histogram information from
* @param binSize - Size of each bin (see {@link histogramFromNumbers} for details on why we do not specify the bin-count)
* @param relateValuesToNumberOfLines - If true, each value (like `<-` appeared in file 'x' exactly `N` times) will be divided by the number of lines in the file 'x'.
* @param filter - If given, only produce histograms for the given values
*/
function histogramsFromClusters(report, binSize, relateValuesToNumberOfLines, ...filter) {
const contexts = [...report.valueInfoMap.entries()];
const filenameFromId = new bimap_1.BiMap(report.contextIdMap.entries());
// first, we collect the number of appearances for each value
const valueCounts = new defaultmap_1.DefaultMap(() => []);
for (const id of report.contextIdMap.values()) {
// calculate the number of lines within the file given by the id
const filename = filenameFromId.getKey(id);
(0, assert_1.guard)(filename !== undefined, `filename for id ${id} is undefined`);
const numberOfLines = relateValuesToNumberOfLines ? fs_1.default.readFileSync(filename, 'utf-8').split('\n').length : 1;
for (const [value, counts] of contexts) {
valueCounts.get(value).push(counts.get(id) / numberOfLines);
}
}
return [...valueCounts.entries()].map(([name, counts]) => filter.length === 0 || filter.includes(name) ? histogramFromNumbers(name, binSize, counts) : undefined).filter(assert_1.isNotUndefined);
}
/**
 * Produces a histogram from a list of numbers.
 * Because we need to create several histograms of different datasets and want to compare them,
 * we do not accept a desired number of bins (which would force us to derive the bin-size from the data).
 * Instead, we require the bin-size to be given.
 *
 * Layout of the resulting `bins`:
 * - bin `0` *always* exists and only counts values equal to the minimum,
 * - bin `k >= 1` counts the remaining values `v` with `min + (k - 1) * binSize <= v < min + k * binSize`.
 *
 * The number of bins is derived from the very same expression that assigns values to bins,
 * so the last bin is the one containing the maximum; no surplus trailing bins are created
 * for fractional bin sizes or fractional data.
 *
 * @param name    - Name to attach to the histogram
 * @param binSize - Width of each bin; guarded to be greater than 0
 * @param values  - The numbers to bin; guarded to be non-empty
 * @returns An object with `name`, `bins`, `binSize`, and all fields of the summarized measurement
 *          (spread last, so fields of the summary win on a name clash)
 */
function histogramFromNumbers(name, binSize, values) {
    (0, assert_1.guard)(binSize > 0, `binSize must be greater than 0, but was ${binSize}`);
    (0, assert_1.guard)(values.length > 0, 'values must not be empty');
    const summarized = (0, summarizer_1.summarizeMeasurement)(values);
    const { min, max } = summarized;
    // bin index for any value that is not the minimum (bin 0 is reserved for the minimum)
    const binOf = (v) => Math.floor((v - min) / binSize) + 1;
    // the maximum lands in the last bin, hence we need exactly one more bin than its index
    const numberOfBins = binOf(max) + 1;
    const histogram = new Array(numberOfBins).fill(0);
    for (const v of values) {
        histogram[v === min ? 0 : binOf(v)]++;
    }
    return {
        name,
        bins: histogram,
        binSize,
        ...summarized
    };
}
/**
 * Converts histograms (as created by {@link histogramFromNumbers}) into a table with a header and string rows,
 * e.g., to be written as CSV. All histograms must share the same bin-size (this is guarded).
 *
 * Columns of the table:
 * - `bin`  - the bin index
 * - `from` - for bin `0` the minimum (three decimals); otherwise `min + (bin - 1) * binSize`
 * - `to`   - for bin `0` the minimum (three decimals); otherwise `min + bin * binSize`
 * - one column per histogram (header is the JSON-quoted name) with its count for that bin
 *
 * `min` and `binSize` are always taken from the *first* histogram, so all histograms are assumed to
 * cover the same range. With the binning of {@link histogramFromNumbers}, a bin `>= 1` holds the values
 * `from <= v < to` (the minimum itself is counted in bin `0`).
 *
 * @param histograms - The histograms to convert (at least one; assumed to have the same ranges and bins)
 * @param countAsDensity - If true, each count is divided by the sum of all bins of its histogram
 *                         (individually for each histogram, similar to the pgfplots `hist/density` option)
 * @returns An object `{ header, rows }` where every row is an array of strings
 */
function histograms2table(histograms, countAsDensity = false) {
    (0, assert_1.guard)(histograms.length > 0, 'there must be at least one histogram to convert to a table');
    const rowCount = guardForLargestBinSize(histograms);
    const header = ['bin', 'from', 'to'].concat(histograms.map(h => JSON.stringify(h.name)));
    const totals = histograms.map(h => (0, arrays_1.arraySum)(h.bins));
    // range information is always taken from the first histogram
    const reference = histograms[0];
    const rows = [];
    for (let bin = 0; bin < rowCount; bin++) {
        let lower;
        let upper;
        if (bin > 0) {
            lower = String((bin - 1) * reference.binSize + reference.min);
            upper = String((bin) * reference.binSize + reference.min);
        }
        else {
            // bin 0 only holds the minimum, so both bounds collapse to it
            lower = reference.min.toFixed(3);
            upper = reference.min.toFixed(3);
        }
        const row = [String(bin), lower, upper];
        // appends one cell per histogram (columns 3 and onwards)
        writeRoResultsForHistograms(histograms, bin, row, countAsDensity, totals);
        rows.push(row);
    }
    return { header, rows };
}
/**
 * Checks (via `guard`) that every histogram has the same bin-size as the first one
 * and returns the largest number of bins found among all given histograms.
 *
 * @param histograms - The histograms to inspect; the first one serves as the reference
 */
function guardForLargestBinSize(histograms) {
    const [reference, ...others] = histograms;
    let mostBins = reference.bins.length;
    for (const candidate of others) {
        (0, assert_1.guard)(candidate.binSize === reference.binSize, `histograms must have the same bin-size, but ${candidate.name} has ${candidate.binSize} instead of ${reference.binSize}`);
        if (mostBins < candidate.bins.length) {
            mostBins = candidate.bins.length;
        }
    }
    return mostBins;
}
/**
 * Fills the per-histogram cells of a table row in place: the cell for histogram `j` is written to `row[j + 3]`.
 *
 * @param histograms - The histograms providing the bins
 * @param binIndex - Index of the bin the row represents
 * @param row - The row to write to (mutated)
 * @param countAsDensity - If truthy, the count is divided by `sums[j]` before it is stringified
 * @param sums - For each histogram (same order), the divisor used when `countAsDensity` is set
 */
function writeRoResultsForHistograms(histograms, binIndex, row, countAsDensity, sums) {
    for (const [column, { bins }] of histograms.entries()) {
        // does not have to be performant...
        if (binIndex >= bins.length) {
            /* in a histogram, 0 is the best default value for bins that are not present -- no value appeared in the corresponding bin */
            row[column + 3] = '0';
            continue;
        }
        const count = bins[binIndex];
        row[column + 3] = String(countAsDensity ? count / sums[column] : count);
    }
}
//# sourceMappingURL=histogram.js.map