datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
440 lines • 15.2 kB
JavaScript
"use strict";
/**
* Online/Incremental Statistical Algorithms
* Memory-efficient streaming statistics using proven algorithms
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.BoundedFrequencyCounter = exports.OnlineCovariance = exports.ReservoirSampler = exports.P2Quantile = exports.OnlineStatistics = void 0;
/**
 * Streaming accumulator for count, sum, min, max, mean and the second to
 * fourth central moments, updated one value at a time with Welford-style
 * recurrences.
 *
 * Only the running aggregates below are stored, never the observations, so
 * memory use does not grow with the number of values.
 */
class OnlineStatistics {
    count = 0;
    mean = 0;
    M2 = 0; // Running sum of squared deviations from the mean
    M3 = 0; // Running sum of cubed deviations from the mean
    M4 = 0; // Running sum of fourth-power deviations from the mean
    min = Number.POSITIVE_INFINITY;
    max = Number.NEGATIVE_INFINITY;
    sum = 0;

    /**
     * Fold one observation into every running aggregate.
     *
     * Anything that is not a finite number (NaN, ±Infinity, null, undefined,
     * strings, booleans, ...) is ignored.
     *
     * @param {number} value - The observation to add.
     * @returns {void}
     */
    update(value) {
        // Number.isFinite never coerces. The global isNaN/isFinite would let
        // null, booleans and numeric strings through, after which
        // `sum += value` could concatenate strings and min/max could hold
        // non-numbers.
        if (!Number.isFinite(value)) {
            return;
        }
        const previousCount = this.count;
        this.count++;
        this.sum += value;
        if (value < this.min) {
            this.min = value;
        }
        if (value > this.max) {
            this.max = value;
        }
        const delta = value - this.mean;
        const deltaOverN = delta / this.count;
        const deltaOverNSquared = deltaOverN * deltaOverN;
        const term1 = delta * deltaOverN * previousCount;
        this.mean += deltaOverN;
        // Order matters: M4 reads the old M3 and M2, and M3 reads the old M2,
        // so the moments are updated from highest to lowest.
        this.M4 +=
            term1 * deltaOverNSquared * (this.count * this.count - 3 * this.count + 3) +
                6 * deltaOverNSquared * this.M2 -
                4 * deltaOverN * this.M3;
        this.M3 += term1 * deltaOverN * (this.count - 2) - 3 * deltaOverN * this.M2;
        this.M2 += term1;
    }

    /**
     * @returns {number} Number of accepted observations.
     */
    getCount() {
        return this.count;
    }

    /**
     * @returns {number} Sum of accepted observations.
     */
    getSum() {
        return this.sum;
    }

    /**
     * @returns {number} Running mean, or 0 when nothing has been recorded.
     */
    getMean() {
        return this.count > 0 ? this.mean : 0;
    }

    /**
     * @returns {number} Smallest accepted observation, or 0 when empty.
     */
    getMin() {
        return this.count > 0 ? this.min : 0;
    }

    /**
     * @returns {number} Largest accepted observation, or 0 when empty.
     */
    getMax() {
        return this.count > 0 ? this.max : 0;
    }

    /**
     * @returns {number} max - min, or 0 when empty.
     */
    getRange() {
        return this.count > 0 ? this.getMax() - this.getMin() : 0;
    }

    /**
     * @returns {number} Population variance (M2 divided by n, not n - 1);
     *     0 when fewer than two observations have been recorded.
     */
    getVariance() {
        return this.count < 2 ? 0 : this.M2 / this.count;
    }

    /**
     * @returns {number} Square root of getVariance().
     */
    getStandardDeviation() {
        return Math.sqrt(this.getVariance());
    }

    /**
     * @returns {number} sqrt(n) * M3 / M2^1.5; 0 when fewer than three
     *     observations have been recorded or when M2 is zero.
     */
    getSkewness() {
        if (this.count < 3 || this.M2 === 0) {
            return 0;
        }
        return (Math.sqrt(this.count) * this.M3) / Math.pow(this.M2, 1.5);
    }

    /**
     * @returns {number} Excess kurtosis n * M4 / M2^2 - 3; 0 when fewer than
     *     four observations have been recorded or when M2 is zero.
     */
    getKurtosis() {
        if (this.count < 4 || this.M2 === 0) {
            return 0;
        }
        return (this.count * this.M4) / (this.M2 * this.M2) - 3;
    }

    /**
     * @returns {number} Standard deviation divided by |mean|, or 0 when the
     *     mean is 0.
     */
    getCoefficientOfVariation() {
        const mean = this.getMean();
        return mean !== 0 ? this.getStandardDeviation() / Math.abs(mean) : 0;
    }

    /**
     * Combine this accumulator with another one as if both had seen a single
     * concatenated stream. Neither operand is modified.
     *
     * When one side is empty the other side is returned as-is (the same
     * object, not a copy); otherwise a new instance is returned.
     *
     * @param {OnlineStatistics} other - Accumulator to combine with.
     * @returns {OnlineStatistics} The combined statistics.
     */
    merge(other) {
        if (other.count === 0) {
            return this;
        }
        if (this.count === 0) {
            return other;
        }
        const nA = this.count;
        const nB = other.count;
        const n = nA + nB;
        const combined = new OnlineStatistics();
        combined.count = n;
        combined.sum = this.sum + other.sum;
        combined.min = Math.min(this.min, other.min);
        combined.max = Math.max(this.max, other.max);
        const delta = other.mean - this.mean;
        const delta2 = delta * delta;
        const delta3 = delta * delta2;
        const delta4 = delta2 * delta2;
        combined.mean = (nA * this.mean + nB * other.mean) / n;
        combined.M2 = this.M2 + other.M2 + (delta2 * nA * nB) / n;
        combined.M3 =
            this.M3 +
                other.M3 +
                (delta3 * nA * nB * (nA - nB)) / (n * n) +
                (3 * delta * (nA * other.M2 - nB * this.M2)) / n;
        combined.M4 =
            this.M4 +
                other.M4 +
                (delta4 * nA * nB * (nA * nA - nA * nB + nB * nB)) / (n * n * n) +
                (6 * delta2 * (nA * nA * other.M2 + nB * nB * this.M2)) / (n * n) +
                (4 * delta * (nA * other.M3 - nB * this.M3)) / n;
        return combined;
    }
}
exports.OnlineStatistics = OnlineStatistics;
/**
 * P² (P-square) streaming quantile estimator.
 *
 * Tracks a single target quantile with five markers (heights plus positions)
 * instead of storing the observations. Until five values have been seen the
 * values themselves are kept in `markers` and the quantile is computed
 * exactly from them.
 */
class P2Quantile {
    quantile;
    markers = new Array(5); // Marker heights (raw values until initialized)
    positions = [1, 2, 3, 4, 5]; // Actual 1-based marker positions
    desired = new Array(5); // Desired (fractional) marker positions
    count = 0;
    initialized = false;

    /**
     * @param {number} quantile - Target quantile as a fraction, e.g. 0.5 for
     *     the median. NOTE(review): the value is not range-checked here;
     *     callers are assumed to pass something in [0, 1].
     */
    constructor(quantile) {
        this.quantile = quantile;
        this.updateDesiredPositions(5);
    }

    /**
     * Recompute the desired marker positions for `n` observations.
     *
     * The five markers follow the minimum, the p/2 quantile, the p quantile,
     * the (1 + p)/2 quantile and the maximum, so that the middle marker is
     * the one that tracks the requested quantile.
     *
     * @param {number} n - Number of observations seen so far.
     * @returns {void}
     */
    updateDesiredPositions(n) {
        const p = this.quantile;
        const span = n - 1;
        this.desired[0] = 1;
        this.desired[1] = 1 + (span * p) / 2;
        this.desired[2] = 1 + span * p;
        this.desired[3] = 1 + (span * (1 + p)) / 2;
        this.desired[4] = n;
    }

    /**
     * Feed one observation to the estimator. Values that are not finite
     * numbers are ignored.
     *
     * @param {number} value - The observation to add.
     * @returns {void}
     */
    update(value) {
        // Number.isFinite never coerces, so null/strings/booleans are
        // rejected instead of ending up inside the marker arithmetic.
        if (!Number.isFinite(value)) {
            return;
        }
        this.count++;
        if (!this.initialized) {
            // The first five observations become the initial marker heights.
            this.markers[this.count - 1] = value;
            if (this.count === 5) {
                this.markers.sort((a, b) => a - b);
                this.initialized = true;
            }
            return;
        }
        // Locate the cell the value falls into; k is the first marker whose
        // position shifts right. New extremes replace the outer markers.
        let k = 0;
        if (value < this.markers[0]) {
            this.markers[0] = value;
            k = 1;
        }
        else if (value >= this.markers[4]) {
            this.markers[4] = value;
            k = 4;
        }
        else {
            for (let i = 1; i < 5; i++) {
                if (value < this.markers[i]) {
                    k = i;
                    break;
                }
            }
        }
        for (let i = k; i < 5; i++) {
            this.positions[i]++;
        }
        this.updateDesiredPositions(this.count);
        // Move each interior marker one step towards its desired position
        // when it is off by at least one and there is room to move.
        for (let i = 1; i < 4; i++) {
            const offset = this.desired[i] - this.positions[i];
            const canMoveRight = offset >= 1 && this.positions[i + 1] - this.positions[i] > 1;
            const canMoveLeft = offset <= -1 && this.positions[i - 1] - this.positions[i] < -1;
            if (!canMoveRight && !canMoveLeft) {
                continue;
            }
            const step = offset >= 0 ? 1 : -1;
            const candidate = this.parabolic(i, step);
            // Use the parabolic estimate only if it keeps the heights
            // strictly ordered; otherwise fall back to linear interpolation.
            if (this.markers[i - 1] < candidate && candidate < this.markers[i + 1]) {
                this.markers[i] = candidate;
            }
            else {
                this.markers[i] = this.linear(i, step);
            }
            this.positions[i] += step;
        }
    }

    /**
     * Piecewise-parabolic height for marker `i` after moving it by `d`.
     *
     * @param {number} i - Interior marker index (1..3).
     * @param {number} d - Direction of the move, +1 or -1.
     * @returns {number} Candidate marker height.
     */
    parabolic(i, d) {
        const qi = this.markers[i];
        const qim1 = this.markers[i - 1];
        const qip1 = this.markers[i + 1];
        const ni = this.positions[i];
        const nim1 = this.positions[i - 1];
        const nip1 = this.positions[i + 1];
        return (qi +
            (d / (nip1 - nim1)) *
                (((ni - nim1 + d) * (qip1 - qi)) / (nip1 - ni) +
                    ((nip1 - ni - d) * (qi - qim1)) / (ni - nim1)));
    }

    /**
     * Linear-interpolation height for marker `i` after moving it by `d`.
     *
     * @param {number} i - Interior marker index (1..3).
     * @param {number} d - Direction of the move, +1 or -1.
     * @returns {number} Candidate marker height.
     */
    linear(i, d) {
        const qi = this.markers[i];
        const neighbourHeight = d > 0 ? this.markers[i + 1] : this.markers[i - 1];
        const ni = this.positions[i];
        const neighbourPosition = d > 0 ? this.positions[i + 1] : this.positions[i - 1];
        return qi + (d * (neighbourHeight - qi)) / (neighbourPosition - ni);
    }

    /**
     * Current estimate of the target quantile.
     *
     * With fewer than five observations the result is computed exactly from
     * the stored values (linear interpolation between order statistics);
     * with no observations at all it is 0. Afterwards it is the height of
     * the middle marker.
     *
     * @returns {number} The quantile estimate.
     */
    getQuantile() {
        if (this.initialized) {
            return this.markers[2];
        }
        const sorted = this.markers.slice(0, this.count).sort((a, b) => a - b);
        if (sorted.length === 0) {
            return 0;
        }
        if (this.quantile === 0.5 && sorted.length % 2 === 0) {
            // Even-length median: plain average of the two middle values.
            const upperMid = sorted.length / 2;
            return (sorted[upperMid - 1] + sorted[upperMid]) / 2;
        }
        const index = this.quantile * (sorted.length - 1);
        const lower = Math.floor(index);
        const upper = Math.ceil(index);
        if (lower === upper) {
            return sorted[lower] ?? 0;
        }
        return sorted[lower] + (index - lower) * (sorted[upper] - sorted[lower]);
    }
}
exports.P2Quantile = P2Quantile;
/**
 * Fixed-size reservoir sampler.
 *
 * Keeps at most `size` items from a stream of unknown length. Once the
 * reservoir is full, each new item replaces a randomly chosen slot with
 * probability size / count.
 */
class ReservoirSampler {
    size;
    reservoir = [];
    count = 0;
    rng;

    /**
     * @param {number} size - Maximum number of items to retain.
     * @param {number} [seed] - When given, a deterministic generator seeded
     *     with this value is used instead of Math.random.
     */
    constructor(size, seed) {
        this.size = size;
        this.rng = seed !== undefined ? this.createSeededRandom(seed) : Math.random;
    }

    /**
     * Offer one item from the stream to the reservoir.
     *
     * @param {*} item - The item to consider.
     * @returns {void}
     */
    sample(item) {
        this.count++;
        if (this.reservoir.length < this.size) {
            this.reservoir.push(item);
            return;
        }
        // Pick a slot in [0, count); only indices inside the reservoir cause
        // a replacement, which happens with probability size / count.
        const slot = Math.floor(this.rng() * this.count);
        if (slot < this.size) {
            this.reservoir[slot] = item;
        }
    }

    /**
     * Build a deterministic pseudo-random generator returning values in
     * [0, 1), using a linear congruential recurrence with the POSIX
     * multiplier and increment and modulus 2^31.
     *
     * @param {number} seed - Initial generator state.
     * @returns {function(): number} Generator function.
     */
    createSeededRandom(seed) {
        const MODULUS = 2147483648;
        let state = seed;
        return () => {
            // NOTE: the product can exceed 2^53, so this is not bit-exact
            // with an integer LCG. The arithmetic is deliberately left
            // unchanged so non-negative seeds keep producing the same
            // sequence as before.
            const next = (state * 1103515245 + 12345) % MODULUS;
            // `%` keeps the sign of the dividend, so a negative seed would
            // otherwise yield negative "random" numbers and negative array
            // indices in sample(). Fold the state back into [0, MODULUS).
            state = next < 0 ? next + MODULUS : next;
            return state / MODULUS;
        };
    }

    /**
     * @returns {Array} A shallow copy of the current sample.
     */
    getSample() {
        return [...this.reservoir];
    }

    /**
     * @returns {number} Number of items offered so far.
     */
    getCount() {
        return this.count;
    }

    /**
     * Drop the current sample and reset the item counter. The random
     * generator state is not reset.
     *
     * @returns {void}
     */
    clear() {
        this.reservoir = [];
        this.count = 0;
    }
}
exports.ReservoirSampler = ReservoirSampler;
/**
 * Streaming covariance and Pearson correlation for paired observations.
 *
 * Means, the co-moment and both sums of squared deviations are maintained
 * with Welford-style recurrences. No raw sums of squares are kept, because
 * `sum(x^2) - sum(x)^2 / n` loses nearly all precision when the mean is
 * large compared with the spread.
 */
class OnlineCovariance {
    count = 0;
    meanX = 0;
    meanY = 0;
    C = 0; // Running sum of (x - meanX) * (y - meanY)
    m2X = 0; // Running sum of (x - meanX)^2
    m2Y = 0; // Running sum of (y - meanY)^2

    /**
     * Add one (x, y) pair. The pair is ignored unless both members are
     * finite numbers.
     *
     * @param {number} x - First coordinate.
     * @param {number} y - Second coordinate.
     * @returns {void}
     */
    update(x, y) {
        // Number.isFinite never coerces, so null/strings/booleans are
        // rejected instead of being counted as numbers.
        if (!Number.isFinite(x) || !Number.isFinite(y)) {
            return;
        }
        this.count++;
        const deltaX = x - this.meanX;
        const deltaY = y - this.meanY;
        this.meanX += deltaX / this.count;
        this.meanY += deltaY / this.count;
        // Each accumulator pairs a deviation from the OLD mean with a
        // deviation from the NEW mean.
        this.C += deltaX * (y - this.meanY);
        this.m2X += deltaX * (x - this.meanX);
        this.m2Y += deltaY * (y - this.meanY);
    }

    /**
     * @returns {number} Population covariance (co-moment divided by n);
     *     0 when fewer than two pairs have been recorded.
     */
    getCovariance() {
        return this.count < 2 ? 0 : this.C / this.count;
    }

    /**
     * Pearson correlation coefficient of the pairs seen so far.
     *
     * @returns {number} A value in [-1, 1]; 0 when fewer than two pairs have
     *     been recorded or when either sample variance is below 1e-12.
     */
    getCorrelation() {
        if (this.count < 2) {
            return 0;
        }
        const varX = this.getVarianceX();
        const varY = this.getVarianceY();
        // Correlation is undefined for (effectively) constant input; the
        // absolute threshold is the one this class has always used.
        const epsilon = 1e-12;
        if (varX < epsilon || varY < epsilon) {
            return 0;
        }
        const sampleCovariance = this.C / (this.count - 1);
        const correlation = sampleCovariance / Math.sqrt(varX * varY);
        // Clamp to absorb rounding error slightly outside [-1, 1].
        return Math.max(-1, Math.min(1, correlation));
    }

    /**
     * @returns {number} Number of accepted pairs.
     */
    getCount() {
        return this.count;
    }

    /**
     * @returns {number} Running mean of x (0 when empty).
     */
    getMeanX() {
        return this.meanX;
    }

    /**
     * @returns {number} Running mean of y (0 when empty).
     */
    getMeanY() {
        return this.meanY;
    }

    /**
     * @returns {number} Sample variance of x (n - 1 denominator); 0 when
     *     fewer than two pairs have been recorded.
     */
    getVarianceX() {
        return this.count < 2 ? 0 : this.m2X / (this.count - 1);
    }

    /**
     * @returns {number} Sample variance of y (n - 1 denominator); 0 when
     *     fewer than two pairs have been recorded.
     */
    getVarianceY() {
        return this.count < 2 ? 0 : this.m2Y / (this.count - 1);
    }
}
exports.OnlineCovariance = OnlineCovariance;
/**
 * Frequency counter with a cap on the number of distinct keys.
 *
 * Counts live in a Map. Whenever the number of distinct keys exceeds
 * `maxEntries`, the map is pruned down to the most frequent keys, so counts
 * of pruned keys are lost and totals become lower bounds.
 */
class BoundedFrequencyCounter {
    frequencies = new Map();
    maxEntries;

    /**
     * @param {number} [maxEntries=10000] - Maximum number of distinct keys
     *     tolerated before pruning kicks in.
     */
    constructor(maxEntries = 10000) {
        this.maxEntries = maxEntries;
    }

    /**
     * Count one occurrence of `item`, pruning afterwards if the key limit
     * has been exceeded.
     *
     * @param {*} item - Key to count (any valid Map key).
     * @returns {void}
     */
    update(item) {
        const seen = this.frequencies.get(item) ?? 0;
        this.frequencies.set(item, seen + 1);
        if (this.frequencies.size > this.maxEntries) {
            this.pruneToTopFrequencies();
        }
    }

    /**
     * Shrink the map to the most frequent keys: 80% of `maxEntries`, rounded
     * down, but never fewer than one key when `maxEntries` is at least 1
     * (otherwise a limit of 1 would wipe the whole map on every prune).
     * Ties keep the keys that were inserted first.
     *
     * @returns {void}
     */
    pruneToTopFrequencies() {
        const target = Math.floor(this.maxEntries * 0.8);
        const keepCount = this.maxEntries >= 1 ? Math.max(1, target) : target;
        const survivors = this.getTopK(keepCount);
        // Refill the existing Map rather than replacing it, so the same Map
        // object stays in `frequencies`.
        this.frequencies.clear();
        for (const [key, occurrences] of survivors) {
            this.frequencies.set(key, occurrences);
        }
    }

    /**
     * @returns {Map} A copy of the key -> count map.
     */
    getFrequencies() {
        return new Map(this.frequencies);
    }

    /**
     * @param {number} k - Number of entries to return.
     * @returns {Array<Array>} Up to `k` `[key, count]` pairs, most frequent
     *     first.
     */
    getTopK(k) {
        return Array.from(this.frequencies.entries())
            .sort((a, b) => b[1] - a[1])
            .slice(0, k);
    }

    /**
     * @param {*} item - Key to look up.
     * @returns {number} Current count for `item`, or 0 if it is not tracked.
     */
    getCount(item) {
        return this.frequencies.get(item) ?? 0;
    }

    /**
     * @returns {number} Sum of the counts of all keys currently tracked
     *     (occurrences of pruned keys are not included).
     */
    getTotalCount() {
        let total = 0;
        for (const occurrences of this.frequencies.values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Forget every key and count.
     *
     * @returns {void}
     */
    clear() {
        this.frequencies.clear();
    }
}
exports.BoundedFrequencyCounter = BoundedFrequencyCounter;
//# sourceMappingURL=online-statistics.js.map