UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

440 lines 15.2 kB
"use strict";
/**
 * Online/Incremental Statistical Algorithms
 * Memory-efficient streaming statistics using proven algorithms
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.BoundedFrequencyCounter = exports.OnlineCovariance = exports.ReservoirSampler = exports.P2Quantile = exports.OnlineStatistics = void 0;

/**
 * Coerce an observation to a finite number.
 *
 * Accepts the same inputs the global `isFinite` accepts (anything `Number()`
 * converts to a finite value), but hands back the converted number so callers
 * never accumulate the raw value. Without this, a numeric string would turn
 * `sum += value` into string concatenation and `value < min` into a
 * lexicographic comparison.
 *
 * @param {*} value - Raw observation.
 * @returns {number|undefined} The finite number, or `undefined` when the
 *   value is NaN, infinite, or not convertible.
 */
function toFiniteNumber(value) {
    const numeric = Number(value);
    return Number.isFinite(numeric) ? numeric : undefined;
}

/**
 * Welford's Online Algorithm for Mean, Variance, Skewness, and Kurtosis
 * Computes all four moments incrementally with constant memory
 */
class OnlineStatistics {
    count = 0;
    mean = 0;
    M2 = 0; // Sum of squared deviations from the running mean
    M3 = 0; // Sum of cubed deviations (third central moment accumulator)
    M4 = 0; // Sum of fourth-power deviations (fourth central moment accumulator)
    min = Number.POSITIVE_INFINITY;
    max = Number.NEGATIVE_INFINITY;
    sum = 0;

    /**
     * Add a new value and update all statistics.
     * Values that are not convertible to a finite number are ignored.
     *
     * @param {*} value - Observation to add.
     * @returns {void}
     */
    update(value) {
        const x = toFiniteNumber(value);
        if (x === undefined) {
            return;
        }
        const previousCount = this.count;
        this.count++;
        const n = this.count;
        this.sum += x;
        if (x < this.min) {
            this.min = x;
        }
        if (x > this.max) {
            this.max = x;
        }
        const delta = x - this.mean;
        const deltaN = delta / n;
        const deltaN2 = deltaN * deltaN;
        const term1 = delta * deltaN * previousCount;
        this.mean += deltaN;
        // Order matters: M4 reads the previous M3 and M2, and M3 reads the
        // previous M2, so the highest moment is updated first.
        this.M4 +=
            term1 * deltaN2 * (n * n - 3 * n + 3) +
            6 * deltaN2 * this.M2 -
            4 * deltaN * this.M3;
        this.M3 += term1 * deltaN * (n - 2) - 3 * deltaN * this.M2;
        this.M2 += term1;
    }

    /**
     * Get basic count (for backward compatibility)
     * @returns {number} Number of accepted observations.
     */
    getCount() {
        return this.count;
    }

    /** @returns {number} Sum of accepted observations. */
    getSum() {
        return this.sum;
    }

    /** @returns {number} Running mean, or 0 when nothing was observed. */
    getMean() {
        return this.count > 0 ? this.mean : 0;
    }

    /** @returns {number} Smallest observation, or 0 when nothing was observed. */
    getMin() {
        if (this.count === 0 || this.min === Number.POSITIVE_INFINITY) {
            return 0;
        }
        return this.min;
    }

    /** @returns {number} Largest observation, or 0 when nothing was observed. */
    getMax() {
        if (this.count === 0 || this.max === Number.NEGATIVE_INFINITY) {
            return 0;
        }
        return this.max;
    }

    /** @returns {number} `max - min`, or 0 when nothing was observed. */
    getRange() {
        return this.count > 0 ? this.getMax() - this.getMin() : 0;
    }

    /**
     * Population variance (divides by `count`, not `count - 1`).
     * @returns {number} Variance, or 0 with fewer than two observations.
     */
    getVariance() {
        return this.count < 2 ? 0 : this.M2 / this.count;
    }

    /** @returns {number} Square root of {@link getVariance}. */
    getStandardDeviation() {
        return Math.sqrt(this.getVariance());
    }

    /**
     * Skewness computed as `sqrt(n) * M3 / M2^1.5`.
     * @returns {number} Skewness, or 0 with fewer than three observations or
     *   zero spread.
     */
    getSkewness() {
        if (this.count < 3 || this.M2 === 0) {
            return 0;
        }
        return (Math.sqrt(this.count) * this.M3) / Math.pow(this.M2, 1.5);
    }

    /**
     * Excess kurtosis computed as `n * M4 / M2^2 - 3`.
     * @returns {number} Excess kurtosis, or 0 with fewer than four
     *   observations or zero spread.
     */
    getKurtosis() {
        if (this.count < 4 || this.M2 === 0) {
            return 0;
        }
        return (this.count * this.M4) / (this.M2 * this.M2) - 3;
    }

    /** @returns {number} `stddev / |mean|`, or 0 when the mean is 0. */
    getCoefficientOfVariation() {
        const mean = this.getMean();
        return mean !== 0 ? this.getStandardDeviation() / Math.abs(mean) : 0;
    }

    /**
     * Merge with another OnlineStatistics instance.
     *
     * Neither operand is modified and the result is always a new instance,
     * including when one side is empty, so updating the result can never
     * mutate an input.
     *
     * @param {OnlineStatistics} other - Statistics to combine with this one.
     * @returns {OnlineStatistics} Statistics of the combined stream.
     */
    merge(other) {
        if (other.count === 0) {
            return cloneStatistics(this);
        }
        if (this.count === 0) {
            return cloneStatistics(other);
        }
        const nA = this.count;
        const nB = other.count;
        const n = nA + nB;
        const delta = other.mean - this.mean;
        const delta2 = delta * delta;
        const delta3 = delta * delta2;
        const delta4 = delta2 * delta2;

        const combined = new OnlineStatistics();
        combined.count = n;
        combined.sum = this.sum + other.sum;
        combined.min = Math.min(this.min, other.min);
        combined.max = Math.max(this.max, other.max);
        combined.mean = (nA * this.mean + nB * other.mean) / n;
        combined.M2 = this.M2 + other.M2 + (delta2 * nA * nB) / n;
        combined.M3 =
            this.M3 +
            other.M3 +
            (delta3 * nA * nB * (nA - nB)) / (n * n) +
            (3 * delta * (nA * other.M2 - nB * this.M2)) / n;
        combined.M4 =
            this.M4 +
            other.M4 +
            (delta4 * nA * nB * (nA * nA - nA * nB + nB * nB)) / (n * n * n) +
            (6 * delta2 * (nA * nA * other.M2 + nB * nB * this.M2)) / (n * n) +
            (4 * delta * (nA * other.M3 - nB * this.M3)) / n;
        return combined;
    }
}

/**
 * Copy the accumulator fields of one OnlineStatistics into a new instance.
 *
 * @param {OnlineStatistics} source - Instance to copy.
 * @returns {OnlineStatistics} Independent copy.
 */
function cloneStatistics(source) {
    const copy = new OnlineStatistics();
    copy.count = source.count;
    copy.mean = source.mean;
    copy.M2 = source.M2;
    copy.M3 = source.M3;
    copy.M4 = source.M4;
    copy.min = source.min;
    copy.max = source.max;
    copy.sum = source.sum;
    return copy;
}
exports.OnlineStatistics = OnlineStatistics;

/**
 * P² Algorithm for Quantile Estimation
 * Estimates any quantile using only 5 markers
 */
class P2Quantile {
    quantile;
    markers = new Array(5); // Marker heights (estimated values)
    positions = [1, 2, 3, 4, 5]; // Actual 1-based marker positions
    desired = new Array(5); // Desired (fractional) marker positions
    count = 0;
    initialized = false;

    /**
     * @param {number} quantile - Target quantile as a fraction (0.5 = median).
     */
    constructor(quantile) {
        this.quantile = quantile;
        // The first adjustment happens once five observations are buffered.
        this.setDesiredPositions(5);
    }

    /**
     * Recompute the desired marker positions for `n` observations.
     *
     * The five markers track the minimum, the q/2 quantile, the q quantile,
     * the (1+q)/2 quantile and the maximum, so the middle marker is the
     * estimate returned by {@link getQuantile}.
     *
     * @param {number} n - Number of observations seen so far.
     * @returns {void}
     */
    setDesiredPositions(n) {
        const q = this.quantile;
        const span = n - 1;
        this.desired[0] = 1;
        this.desired[1] = 1 + (span * q) / 2;
        this.desired[2] = 1 + span * q;
        this.desired[3] = 1 + (span * (1 + q)) / 2;
        this.desired[4] = n;
    }

    /**
     * Feed one observation into the estimator.
     * Values that are not convertible to a finite number are ignored.
     *
     * @param {*} value - Observation to add.
     * @returns {void}
     */
    update(value) {
        const x = toFiniteNumber(value);
        if (x === undefined) {
            return;
        }
        this.count++;

        if (!this.initialized) {
            // Buffer the first five observations; they become the markers.
            if (this.count <= 5) {
                this.markers[this.count - 1] = x;
                if (this.count === 5) {
                    this.markers.sort((a, b) => a - b);
                    this.initialized = true;
                }
            }
            return;
        }

        // Find the first marker whose position shifts because of this value,
        // extending the extreme markers when the value falls outside them.
        let k = 0;
        if (x < this.markers[0]) {
            this.markers[0] = x;
            k = 1;
        } else if (x >= this.markers[4]) {
            this.markers[4] = x;
            k = 4;
        } else {
            for (let i = 1; i < 5; i++) {
                if (x < this.markers[i]) {
                    k = i;
                    break;
                }
            }
        }
        for (let i = k; i < 5; i++) {
            this.positions[i]++;
        }

        this.setDesiredPositions(this.count);

        // Move each interior marker by at most one position toward its
        // desired position, provided there is room next to its neighbour.
        for (let i = 1; i < 4; i++) {
            const offset = this.desired[i] - this.positions[i];
            const canMoveRight = offset >= 1 && this.positions[i + 1] - this.positions[i] > 1;
            const canMoveLeft = offset <= -1 && this.positions[i - 1] - this.positions[i] < -1;
            if (!canMoveRight && !canMoveLeft) {
                continue;
            }
            const sign = offset >= 0 ? 1 : -1;
            const candidate = this.parabolic(i, sign);
            // Fall back to linear interpolation when the parabola would
            // break the ordering of the marker heights.
            const keepsOrder = this.markers[i - 1] < candidate && candidate < this.markers[i + 1];
            this.markers[i] = keepsOrder ? candidate : this.linear(i, sign);
            this.positions[i] += sign;
        }
    }

    /**
     * Piecewise-parabolic height prediction for marker `i` moved by `d`.
     *
     * @param {number} i - Interior marker index (1..3).
     * @param {number} d - Direction of the move, +1 or -1.
     * @returns {number} Predicted marker height.
     */
    parabolic(i, d) {
        const qi = this.markers[i];
        const qim1 = this.markers[i - 1];
        const qip1 = this.markers[i + 1];
        const ni = this.positions[i];
        const nim1 = this.positions[i - 1];
        const nip1 = this.positions[i + 1];
        return (
            qi +
            (d / (nip1 - nim1)) *
                (((ni - nim1 + d) * (qip1 - qi)) / (nip1 - ni) +
                    ((nip1 - ni - d) * (qi - qim1)) / (ni - nim1))
        );
    }

    /**
     * Linear height prediction for marker `i` moved by `d`.
     *
     * @param {number} i - Interior marker index (1..3).
     * @param {number} d - Direction of the move, +1 or -1.
     * @returns {number} Predicted marker height.
     */
    linear(i, d) {
        const neighbour = d > 0 ? i + 1 : i - 1;
        const qi = this.markers[i];
        const ni = this.positions[i];
        return qi + (d * (this.markers[neighbour] - qi)) / (this.positions[neighbour] - ni);
    }

    /**
     * Current estimate of the configured quantile.
     *
     * With fewer than five observations the buffered values are sorted and
     * the quantile is computed exactly (median: middle element or mean of the
     * two middle elements; other quantiles: linear interpolation). Afterwards
     * the middle P² marker is returned.
     *
     * @returns {number} Quantile estimate, or 0 when nothing was observed.
     */
    getQuantile() {
        if (this.initialized) {
            return this.markers[2];
        }
        const sorted = this.markers.slice(0, this.count).sort((a, b) => a - b);
        const size = sorted.length;
        if (size === 0) {
            return 0;
        }
        if (this.quantile === 0.5) {
            const upperMid = Math.floor(size / 2);
            if (size % 2 === 1) {
                return sorted[upperMid];
            }
            return (sorted[upperMid - 1] + sorted[upperMid]) / 2;
        }
        const index = this.quantile * (size - 1);
        const lower = Math.floor(index);
        const upper = Math.ceil(index);
        if (lower === upper) {
            return sorted[lower] ?? 0;
        }
        return sorted[lower] + (index - lower) * (sorted[upper] - sorted[lower]);
    }
}
exports.P2Quantile = P2Quantile;

/**
 * Reservoir Sampling for Representative Samples
 * Maintains a fixed-size random sample with uniform probability
 */
class ReservoirSampler {
    size;
    reservoir = [];
    count = 0;
    rng;

    /**
     * @param {number} size - Maximum number of items kept in the sample.
     * @param {number} [seed] - Optional seed; when given, a deterministic
     *   generator is used instead of `Math.random`.
     */
    constructor(size, seed) {
        this.size = size;
        this.rng = seed !== undefined ? this.createSeededRandom(seed) : Math.random;
    }

    /**
     * Offer one item to the sample.
     *
     * @param {*} item - Item from the stream.
     * @returns {void}
     */
    sample(item) {
        this.count++;
        if (this.reservoir.length < this.size) {
            this.reservoir.push(item);
            return;
        }
        // Replace a random slot with probability size/count.
        const slot = Math.floor(this.rng() * this.count);
        if (slot < this.size) {
            this.reservoir[slot] = item;
        }
    }

    /**
     * Creates a seeded pseudo-random number generator (PRNG).
     * Uses a simple linear congruential generator (LCG) for simplicity.
     *
     * The state is kept in [0, modulus) so that a negative seed cannot
     * produce negative outputs (which would yield negative reservoir
     * indices). For non-negative seeds the sequence is unchanged.
     *
     * NOTE: `state * multiplier` can exceed 2^53, so low-order bits of the
     * product are rounded; the arithmetic is left as-is so existing seeds
     * keep reproducing the same samples.
     *
     * @param {number} seed - Initial generator state.
     * @returns {function(): number} Generator returning values in [0, 1)
     *   for integer seeds.
     */
    createSeededRandom(seed) {
        const multiplier = 1103515245; // LCG parameters from POSIX
        const increment = 12345;
        const modulus = 2147483648;
        let state = seed;
        return () => {
            state = (state * multiplier + increment) % modulus;
            if (state < 0) {
                // JavaScript's % keeps the sign of the dividend.
                state += modulus;
            }
            return state / modulus;
        };
    }

    /** @returns {Array} Shallow copy of the current sample. */
    getSample() {
        return [...this.reservoir];
    }

    /** @returns {number} Number of items offered so far. */
    getCount() {
        return this.count;
    }

    /**
     * Drop the sample and reset the item counter.
     * The random generator state is not reset.
     * @returns {void}
     */
    clear() {
        this.reservoir = [];
        this.count = 0;
    }
}
exports.ReservoirSampler = ReservoirSampler;

/**
 * Online Covariance for Streaming Correlation Calculation
 */
class OnlineCovariance {
    count = 0;
    meanX = 0;
    meanY = 0;
    C = 0; // Co-moment accumulator: sum of (x - meanX) * (y - meanY)
    M2X = 0; // Sum of squared deviations of x (Welford)
    M2Y = 0; // Sum of squared deviations of y (Welford)
    // Raw sums are still maintained for code that reads these fields directly.
    sumX = 0;
    sumY = 0;
    sumXX = 0;
    sumYY = 0;

    /**
     * Add one (x, y) pair. The pair is ignored unless both values are
     * convertible to finite numbers.
     *
     * @param {*} x - First coordinate.
     * @param {*} y - Second coordinate.
     * @returns {void}
     */
    update(x, y) {
        const xv = toFiniteNumber(x);
        const yv = toFiniteNumber(y);
        if (xv === undefined || yv === undefined) {
            return;
        }
        this.count++;
        this.sumX += xv;
        this.sumY += yv;
        this.sumXX += xv * xv;
        this.sumYY += yv * yv;

        const deltaX = xv - this.meanX;
        this.meanX += deltaX / this.count;
        const deltaY = yv - this.meanY;
        this.meanY += deltaY / this.count;
        // Each accumulator pairs the deviation from the OLD mean with the
        // deviation from the NEW mean, which keeps the update numerically
        // stable even when the values are large compared with their spread.
        this.C += deltaX * (yv - this.meanY);
        this.M2X += deltaX * (xv - this.meanX);
        this.M2Y += deltaY * (yv - this.meanY);
    }

    /**
     * Population covariance (divides by `count`).
     * @returns {number} Covariance, or 0 with fewer than two pairs.
     */
    getCovariance() {
        return this.count < 2 ? 0 : this.C / this.count;
    }

    /**
     * Pearson correlation coefficient, clamped to [-1, 1].
     *
     * @returns {number} Correlation, or 0 with fewer than two pairs or when
     *   either sample variance is below 1e-12 (treated as zero variance).
     */
    getCorrelation() {
        if (this.count < 2) {
            return 0;
        }
        const varX = this.getVarianceX();
        const varY = this.getVarianceY();
        const epsilon = 1e-12;
        if (varX < epsilon || varY < epsilon) {
            // Correlation is undefined when a variable is effectively constant.
            return 0;
        }
        const sampleCovariance = this.C / (this.count - 1);
        // Separate square roots avoid overflow of varX * varY.
        const correlation = sampleCovariance / (Math.sqrt(varX) * Math.sqrt(varY));
        // Clamp to absorb floating-point rounding.
        return Math.max(-1, Math.min(1, correlation));
    }

    /** @returns {number} Number of accepted pairs. */
    getCount() {
        return this.count;
    }

    /** @returns {number} Running mean of x. */
    getMeanX() {
        return this.meanX;
    }

    /** @returns {number} Running mean of y. */
    getMeanY() {
        return this.meanY;
    }

    /**
     * Sample variance of x (divides by `count - 1`).
     * @returns {number} Variance, or 0 with fewer than two pairs.
     */
    getVarianceX() {
        if (this.count < 2) {
            return 0;
        }
        return Math.max(0, this.M2X / (this.count - 1));
    }

    /**
     * Sample variance of y (divides by `count - 1`).
     * @returns {number} Variance, or 0 with fewer than two pairs.
     */
    getVarianceY() {
        if (this.count < 2) {
            return 0;
        }
        return Math.max(0, this.M2Y / (this.count - 1));
    }
}
exports.OnlineCovariance = OnlineCovariance;

/**
 * Frequency Counter with Memory Bounds
 * Uses a simple map with automatic pruning when memory limit is reached
 */
class BoundedFrequencyCounter {
    frequencies = new Map();
    maxEntries;

    /**
     * @param {number} [maxEntries=10000] - Number of distinct keys allowed
     *   before the least frequent ones are pruned.
     */
    constructor(maxEntries = 10000) {
        this.maxEntries = maxEntries;
    }

    /**
     * Count one occurrence of `item`, pruning when the key limit is exceeded.
     *
     * @param {*} item - Key to count.
     * @returns {void}
     */
    update(item) {
        const seen = this.frequencies.get(item) ?? 0;
        this.frequencies.set(item, seen + 1);
        if (this.frequencies.size > this.maxEntries) {
            this.pruneToTopFrequencies();
        }
    }

    /**
     * Keep only the most frequent keys: 80% of `maxEntries`, but never fewer
     * than one key when `maxEntries >= 1`, so a tiny limit cannot wipe out
     * the dominant key. Counts of pruned keys are discarded.
     *
     * @returns {void}
     */
    pruneToTopFrequencies() {
        const target = Math.floor(this.maxEntries * 0.8);
        const keepCount = this.maxEntries >= 1 ? Math.max(1, target) : 0;
        const survivors = this.getTopK(keepCount);
        this.frequencies.clear();
        for (const [key, occurrences] of survivors) {
            this.frequencies.set(key, occurrences);
        }
    }

    /** @returns {Map} Copy of the key -> count map. */
    getFrequencies() {
        return new Map(this.frequencies);
    }

    /**
     * @param {number} k - Number of entries wanted.
     * @returns {Array<Array>} Up to `k` `[key, count]` pairs, most frequent
     *   first.
     */
    getTopK(k) {
        return Array.from(this.frequencies.entries())
            .sort((a, b) => b[1] - a[1])
            .slice(0, k);
    }

    /**
     * @param {*} item - Key to look up.
     * @returns {number} Count for `item`, or 0 when it is not tracked.
     */
    getCount(item) {
        return this.frequencies.get(item) ?? 0;
    }

    /**
     * @returns {number} Sum of the counts currently tracked (occurrences of
     *   pruned keys are not included).
     */
    getTotalCount() {
        let total = 0;
        for (const occurrences of this.frequencies.values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Forget all counts.
     * @returns {void}
     */
    clear() {
        this.frequencies.clear();
    }
}
exports.BoundedFrequencyCounter = BoundedFrequencyCounter;
//# sourceMappingURL=online-statistics.js.map