semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
413 lines • 15.9 kB
JavaScript
;
// Flag this CommonJS module with the `__esModule` marker (presumably emitted
// by a transpiler so import interop treats it as an ES module — confirm
// against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export; the real value is assigned after the class definition.
exports.StatisticalTests = void 0;
/**
 * Statistical tests and distance measures for comparing two numeric samples
 * (or two categorical frequency vectors) when looking for distribution drift.
 */
class StatisticalTests {
    /** Number of quantile bins the PSI methods use when the caller passes none. */
    DEFAULT_BINS = 10;
    /** Exclusive upper bounds of the PSI score for each stability category. */
    PSI_THRESHOLDS = {
        stable: 0.1,
        minor_shift: 0.15,
        major_shift: 0.25
    };
    /**
     * Two-sample Kolmogorov-Smirnov test.
     *
     * @param {number[]} sample1 - First sample (not mutated).
     * @param {number[]} sample2 - Second sample (not mutated).
     * @param {number} [alpha=0.05] - Significance level used for the critical value.
     * @returns {{statistic: number, p_value: number, critical_value: number, is_significant: boolean}}
     *   The maximum distance between the two empirical CDFs, an asymptotic
     *   p-value, the critical value for `alpha`, and whether the statistic
     *   exceeds that critical value.
     * @throws {Error} If either sample is empty.
     */
    kolmogorovSmirnovTest(sample1, sample2, alpha = 0.05) {
        if (sample1.length === 0 || sample2.length === 0) {
            throw new Error('Both samples must contain at least one element');
        }
        const sorted1 = [...sample1].sort((a, b) => a - b);
        const sorted2 = [...sample2].sort((a, b) => a - b);
        const n1 = sorted1.length;
        const n2 = sorted2.length;
        // Both empirical CDFs are step functions that only change at observed
        // values, so checking every distinct pooled value finds the maximum.
        const distinctValues = new Set([...sorted1, ...sorted2]);
        let maxDifference = 0;
        for (const value of distinctValues) {
            const cdf1 = this.countLessEqual(sorted1, value) / n1;
            const cdf2 = this.countLessEqual(sorted2, value) / n2;
            const difference = Math.abs(cdf1 - cdf2);
            if (difference > maxDifference) {
                maxDifference = difference;
            }
        }
        const criticalValue = this.getKSCriticalValue(n1, n2, alpha);
        const pValue = this.calculateKSPValue(maxDifference, n1, n2);
        return {
            statistic: maxDifference,
            p_value: pValue,
            critical_value: criticalValue,
            is_significant: maxDifference > criticalValue
        };
    }
    /**
     * Population Stability Index (PSI) between an expected and an actual dataset.
     *
     * @param {number[]} expected - Reference data; its quantiles define the bins.
     * @param {number[]} actual - Data compared against the reference.
     * @param {number} [bins=this.DEFAULT_BINS] - Requested number of quantile bins.
     * @returns {number} The PSI score.
     * @throws {Error} If either dataset is empty.
     */
    populationStabilityIndex(expected, actual, bins = this.DEFAULT_BINS) {
        if (expected.length === 0 || actual.length === 0) {
            throw new Error('Both datasets must contain at least one element');
        }
        const psiResult = this.calculatePSIDetailed(expected, actual, bins);
        return psiResult.psi_score;
    }
    /**
     * PSI with a per-bin breakdown and a stability category.
     *
     * Bin edges are quantiles of `expected`. Duplicate quantiles are merged, so
     * tied or small datasets can produce fewer bins than requested; the score is
     * computed over the bins that actually exist.
     *
     * @param {number[]} expected - Reference data; its quantiles define the bins.
     * @param {number[]} actual - Data compared against the reference.
     * @param {number} [bins=this.DEFAULT_BINS] - Requested number of quantile bins.
     * @returns {{psi_score: number, stability_category: string, bin_contributions: Array<{bin: string, expected_pct: number, actual_pct: number, contribution: number}>}}
     * @throws {Error} If either dataset is empty or `bins` is below 1.
     */
    calculatePSIDetailed(expected, actual, bins = this.DEFAULT_BINS) {
        if (expected.length === 0 || actual.length === 0) {
            throw new Error('Both datasets must contain at least one element');
        }
        const sortedExpected = [...expected].sort((a, b) => a - b);
        const binBoundaries = this.calculateBinBoundaries(sortedExpected, bins);
        const expectedBinCounts = this.binData(expected, binBoundaries);
        const actualBinCounts = this.binData(actual, binBoundaries);
        // Use the number of bins that survived de-duplication, not the number
        // requested; indexing past it would read undefined counts and edges.
        const binCount = expectedBinCounts.length;
        let psiScore = 0;
        const binContributions = [];
        for (let i = 0; i < binCount; i++) {
            // Floor the proportions so an empty bin cannot produce log(0).
            const expectedPct = Math.max(expectedBinCounts[i] / expected.length, 0.0001);
            const actualPct = Math.max(actualBinCounts[i] / actual.length, 0.0001);
            const contribution = (actualPct - expectedPct) * Math.log(actualPct / expectedPct);
            psiScore += contribution;
            const lowerEdge = binBoundaries[i].toFixed(2);
            binContributions.push({
                bin: i < binCount - 1
                    ? `${lowerEdge}-${binBoundaries[i + 1].toFixed(2)}`
                    : `${lowerEdge}+`,
                expected_pct: expectedPct,
                actual_pct: actualPct,
                contribution: contribution
            });
        }
        let stabilityCategory;
        if (psiScore < this.PSI_THRESHOLDS.stable) {
            stabilityCategory = 'stable';
        }
        else if (psiScore < this.PSI_THRESHOLDS.minor_shift) {
            stabilityCategory = 'minor_shift';
        }
        else if (psiScore < this.PSI_THRESHOLDS.major_shift) {
            stabilityCategory = 'major_shift';
        }
        else {
            stabilityCategory = 'significant_shift';
        }
        return {
            psi_score: psiScore,
            stability_category: stabilityCategory,
            bin_contributions: binContributions
        };
    }
    /**
     * Pearson chi-square goodness-of-fit test on two frequency vectors.
     *
     * @param {number[]} observed - Observed frequencies per category.
     * @param {number[]} expected - Expected frequencies per category (all > 0).
     * @param {number} [alpha=0.05] - Significance level compared with the p-value.
     * @returns {{statistic: number, p_value: number, degrees_of_freedom: number, is_significant: boolean}}
     * @throws {Error} If the arrays differ in length, are empty, or any expected
     *   frequency is not strictly positive.
     */
    chiSquareTest(observed, expected, alpha = 0.05) {
        if (observed.length !== expected.length) {
            throw new Error('Observed and expected arrays must have the same length');
        }
        if (observed.length === 0) {
            throw new Error('Arrays must contain at least one element');
        }
        let chiSquareStatistic = 0;
        for (let i = 0; i < observed.length; i++) {
            // Written as a negated comparison so negatives and NaN are rejected too.
            if (!(expected[i] > 0)) {
                throw new Error('Expected frequencies must be greater than 0');
            }
            const deviation = observed[i] - expected[i];
            chiSquareStatistic += (deviation * deviation) / expected[i];
        }
        const degreesOfFreedom = observed.length - 1;
        const pValue = this.calculateChiSquarePValue(chiSquareStatistic, degreesOfFreedom);
        return {
            statistic: chiSquareStatistic,
            p_value: pValue,
            degrees_of_freedom: degreesOfFreedom,
            is_significant: pValue < alpha
        };
    }
    /**
     * Two-sample Anderson-Darling test.
     *
     * Computes A^2 = 1/(n1*n2) * sum_j l_j * (N*M_j - n1*B_j)^2 / (B_j*(N - B_j))
     * over the distinct pooled values except the largest, where l_j is the
     * number of pooled observations equal to the j-th value, B_j the number of
     * pooled observations <= it, and M_j the number of sample1 observations <= it.
     * The statistic is 0 for identical samples and grows as the samples separate.
     *
     * @param {number[]} sample1 - First sample (not mutated).
     * @param {number[]} sample2 - Second sample (not mutated).
     * @returns {{statistic: number, is_significant: boolean}} The statistic and
     *   whether it exceeds the asymptotic 5% critical value.
     * @throws {Error} If either sample is empty.
     */
    andersonDarlingTest(sample1, sample2) {
        const n1 = sample1.length;
        const n2 = sample2.length;
        if (n1 === 0 || n2 === 0) {
            throw new Error('Both samples must contain at least one element');
        }
        const sorted1 = [...sample1].sort((a, b) => a - b);
        const sorted2 = [...sample2].sort((a, b) => a - b);
        const distinctValues = [...new Set([...sorted1, ...sorted2])].sort((a, b) => a - b);
        const N = n1 + n2;
        let weightedSum = 0;
        let previousPooledCount = 0;
        // The largest value is skipped: there B_j = N and the weight is undefined.
        for (let j = 0; j < distinctValues.length - 1; j++) {
            const value = distinctValues[j];
            const countInSample1 = this.countLessEqual(sorted1, value);
            const pooledCount = countInSample1 + this.countLessEqual(sorted2, value);
            const tiesAtValue = pooledCount - previousPooledCount;
            const deviation = N * countInSample1 - n1 * pooledCount;
            weightedSum += (tiesAtValue * deviation * deviation) / (pooledCount * (N - pooledCount));
            previousPooledCount = pooledCount;
        }
        const adStatistic = weightedSum / (n1 * n2);
        // Asymptotic upper 5% point of the Anderson-Darling A^2 distribution.
        const criticalValue = 2.492;
        return {
            statistic: adStatistic,
            is_significant: adStatistic > criticalValue
        };
    }
    /**
     * 1-Wasserstein (Earth Mover's) distance between two empirical distributions,
     * computed as the area between their empirical CDFs.
     *
     * @param {number[]} sample1 - First sample (not mutated).
     * @param {number[]} sample2 - Second sample (not mutated).
     * @returns {number} The distance, in the units of the data.
     * @throws {Error} If either sample is empty.
     */
    wassersteinDistance(sample1, sample2) {
        if (sample1.length === 0 || sample2.length === 0) {
            throw new Error('Both samples must contain at least one element');
        }
        const sorted1 = [...sample1].sort((a, b) => a - b);
        const sorted2 = [...sample2].sort((a, b) => a - b);
        const distinctValues = [...new Set([...sorted1, ...sorted2])].sort((a, b) => a - b);
        let distance = 0;
        for (let i = 0; i < distinctValues.length - 1; i++) {
            const value = distinctValues[i];
            // Derive each CDF from a cumulative count instead of accumulating
            // fractions, which avoids both a rescan per value and rounding drift.
            const cdf1 = this.countLessEqual(sorted1, value) / sorted1.length;
            const cdf2 = this.countLessEqual(sorted2, value) / sorted2.length;
            distance += Math.abs(cdf1 - cdf2) * (distinctValues[i + 1] - value);
        }
        return distance;
    }
    /**
     * PSI computed on evenly strided subsamples of at most `sampleSize` items.
     *
     * @param {number[]} expected - Reference data.
     * @param {number[]} actual - Data compared against the reference.
     * @param {number} [sampleSize=10000] - Maximum items kept from each dataset.
     * @returns {number} The PSI score of the subsamples.
     * @throws {Error} If either (sub)sample is empty.
     */
    fastPSI(expected, actual, sampleSize = 10000) {
        const sampledExpected = this.sampleArray(expected, sampleSize);
        const sampledActual = this.sampleArray(actual, sampleSize);
        return this.populationStabilityIndex(sampledExpected, sampledActual);
    }
    // Helper methods
    /**
     * Counts the elements of an ascending-sorted array that are <= `value`.
     * Uses binary search, so each call is logarithmic in the array length.
     *
     * @param {number[]} sortedArray - Array sorted in ascending order.
     * @param {number} value - Threshold.
     * @returns {number} Number of elements <= `value`.
     */
    countLessEqual(sortedArray, value) {
        let low = 0;
        let high = sortedArray.length;
        while (low < high) {
            const mid = (low + high) >>> 1;
            if (sortedArray[mid] <= value) {
                low = mid + 1;
            }
            else {
                high = mid;
            }
        }
        return low;
    }
    /**
     * Counts the elements strictly equal to `value`.
     *
     * @param {number[]} sortedArray - Array to scan.
     * @param {number} value - Value to match with `===`.
     * @returns {number} Number of matching elements.
     */
    countEqual(sortedArray, value) {
        let matches = 0;
        for (const item of sortedArray) {
            if (item === value) {
                matches++;
            }
        }
        return matches;
    }
    /**
     * Picks `bins + 1` evenly spaced quantile positions from sorted data and
     * returns the distinct values found there, in ascending order.
     *
     * @param {number[]} sortedData - Data sorted in ascending order.
     * @param {number} bins - Requested number of bins (at least 1).
     * @returns {number[]} Distinct bin edges; may hold fewer than `bins + 1` values.
     * @throws {Error} If `bins` is below 1 or not a number.
     */
    calculateBinBoundaries(sortedData, bins) {
        if (!(bins >= 1)) {
            throw new Error('Number of bins must be at least 1');
        }
        const lastIndex = sortedData.length - 1;
        const boundaries = new Set();
        for (let i = 0; i <= bins; i++) {
            boundaries.add(sortedData[Math.floor((i * lastIndex) / bins)]);
        }
        return [...boundaries].sort((a, b) => a - b);
    }
    /**
     * Counts how many values fall into each bin described by `boundaries`.
     *
     * Bin `i` receives values <= `boundaries[i + 1]` that did not fit an earlier
     * bin; values above the last edge go to the last bin. A single boundary
     * yields one bin that receives everything.
     *
     * @param {number[]} data - Values to bin.
     * @param {number[]} boundaries - Ascending bin edges.
     * @returns {number[]} Count per bin.
     */
    binData(data, boundaries) {
        const binCount = Math.max(boundaries.length - 1, 1);
        const counts = new Array(binCount).fill(0);
        for (const value of data) {
            let binIndex = binCount - 1; // Default to last bin
            for (let i = 0; i < boundaries.length - 1; i++) {
                if (value <= boundaries[i + 1]) {
                    binIndex = i;
                    break;
                }
            }
            counts[binIndex]++;
        }
        return counts;
    }
    /**
     * Large-sample critical value for the two-sample KS statistic.
     *
     * Uses the tabulated coefficients for alpha 0.01, 0.05 and 0.10, and the
     * closed form c(alpha) = sqrt(-ln(alpha / 2) / 2) for any other alpha in
     * (0, 1). An alpha outside (0, 1) falls back to the 5% coefficient.
     *
     * @param {number} n1 - Size of the first sample.
     * @param {number} n2 - Size of the second sample.
     * @param {number} alpha - Significance level.
     * @returns {number} Critical value for the maximum CDF distance.
     */
    getKSCriticalValue(n1, n2, alpha) {
        const tabulated = {
            0.01: 1.63,
            0.05: 1.36,
            0.10: 1.22
        };
        let coefficient = tabulated[alpha];
        if (coefficient === undefined) {
            coefficient = alpha > 0 && alpha < 1
                ? Math.sqrt(-0.5 * Math.log(alpha / 2))
                : 1.36;
        }
        return coefficient * Math.sqrt((n1 + n2) / (n1 * n2));
    }
    /**
     * Asymptotic two-sided p-value of the two-sample KS statistic, from the
     * Kolmogorov series 2 * sum_{k>=1} (-1)^(k-1) * exp(-2 k^2 lambda^2).
     *
     * @param {number} statistic - Maximum CDF distance.
     * @param {number} n1 - Size of the first sample.
     * @param {number} n2 - Size of the second sample.
     * @returns {number} p-value clamped to [0, 1].
     */
    calculateKSPValue(statistic, n1, n2) {
        const effectiveSize = (n1 * n2) / (n1 + n2);
        const lambda = statistic * Math.sqrt(effectiveSize);
        // For small lambda the alternating series has not converged after 100
        // terms (at lambda = 0 it sums to exactly 0). The true value there is 1
        // to within about 1e-12, so return it directly.
        if (lambda < 0.2) {
            return 1;
        }
        let seriesSum = 0;
        let sign = 1;
        for (let k = 1; k <= 100; k++) {
            seriesSum += sign * Math.exp(-2 * k * k * lambda * lambda);
            sign = -sign;
        }
        return Math.max(0, Math.min(1, 2 * seriesSum));
    }
    /**
     * Upper-tail p-value of a chi-square statistic.
     *
     * @param {number} statistic - Chi-square statistic (>= 0).
     * @param {number} df - Degrees of freedom.
     * @returns {number} P(X >= statistic) for X ~ chi-square(df), within [0, 1].
     */
    calculateChiSquarePValue(statistic, df) {
        if (df === 1) {
            // With one degree of freedom the statistic is a squared standard normal.
            const z = Math.sqrt(statistic);
            return 2 * (1 - this.normalCDF(z));
        }
        const tail = 1 - this.incompleteGamma(df / 2, statistic / 2);
        return Math.max(0, Math.min(1, tail));
    }
    /**
     * Standard normal cumulative distribution function.
     *
     * @param {number} x - Point at which to evaluate the CDF.
     * @returns {number} Approximate P(Z <= x).
     */
    normalCDF(x) {
        return 0.5 * (1 + this.erf(x / Math.sqrt(2)));
    }
    /**
     * Polynomial approximation of the error function.
     *
     * @param {number} x - Argument.
     * @returns {number} Approximate erf(x).
     */
    erf(x) {
        const a1 = 0.254829592;
        const a2 = -0.284496736;
        const a3 = 1.421413741;
        const a4 = -1.453152027;
        const a5 = 1.061405429;
        const p = 0.3275911;
        // erf is odd: evaluate on |x| and restore the sign at the end.
        const sign = x >= 0 ? 1 : -1;
        const magnitude = Math.abs(x);
        const t = 1.0 / (1.0 + p * magnitude);
        const y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * Math.exp(-magnitude * magnitude);
        return sign * y;
    }
    /**
     * Regularized lower incomplete gamma function P(a, x).
     *
     * Uses the power series for x < a + 1 and a continued fraction for the
     * complement otherwise, so both converge quickly.
     *
     * @param {number} a - Shape parameter.
     * @param {number} x - Upper integration limit.
     * @returns {number} P(a, x) in [0, 1]; 0 when x <= 0, 1 when a <= 0 and x > 0.
     */
    incompleteGamma(a, x) {
        if (x <= 0) {
            return 0;
        }
        if (a <= 0) {
            return 1;
        }
        // log of x^a * e^(-x) / Gamma(a); kept in log space to avoid overflow.
        const logPrefactor = a * Math.log(x) - x - this.logGamma(a);
        if (x < a + 1) {
            // Series: P = prefactor * sum_{n>=0} x^n / (a (a+1) ... (a+n)).
            let term = 1 / a;
            let sum = term;
            for (let n = 1; n <= 10000; n++) {
                term *= x / (a + n);
                sum += term;
                if (Math.abs(term) < Math.abs(sum) * 1e-15) {
                    break;
                }
            }
            return Math.min(1, sum * Math.exp(logPrefactor));
        }
        // Continued fraction for Q = 1 - P, evaluated with the modified Lentz method.
        const tiny = 1e-300;
        let b = x + 1 - a;
        let c = 1 / tiny;
        let d = 1 / b;
        let fraction = d;
        for (let i = 1; i <= 10000; i++) {
            const numerator = -i * (i - a);
            b += 2;
            d = numerator * d + b;
            if (Math.abs(d) < tiny) {
                d = tiny;
            }
            c = b + numerator / c;
            if (Math.abs(c) < tiny) {
                c = tiny;
            }
            d = 1 / d;
            const delta = d * c;
            fraction *= delta;
            if (Math.abs(delta - 1) < 1e-15) {
                break;
            }
        }
        const upperTail = Math.exp(logPrefactor) * fraction;
        return Math.max(0, Math.min(1, 1 - upperTail));
    }
    /**
     * Gamma function.
     *
     * @param {number} x - Argument.
     * @returns {number} Gamma(x); uses the reflection formula below 0.5.
     */
    gamma(x) {
        if (x === 1) {
            return 1;
        }
        if (x === 0.5) {
            return Math.sqrt(Math.PI);
        }
        if (x < 0.5) {
            return Math.PI / (Math.sin(Math.PI * x) * this.gamma(1 - x));
        }
        return Math.exp(this.logGamma(x));
    }
    /**
     * Natural logarithm of the gamma function for x > 0 (Lanczos approximation).
     *
     * @param {number} x - Positive argument.
     * @returns {number} ln(Gamma(x)).
     */
    logGamma(x) {
        if (x < 0.5) {
            // Reflection keeps the Lanczos series in its accurate range.
            return Math.log(Math.PI / Math.sin(Math.PI * x)) - this.logGamma(1 - x);
        }
        const coefficients = [
            0.99999999999980993,
            676.5203681218851,
            -1259.1392167224028,
            771.32342877765313,
            -176.61502916214059,
            12.507343278686905,
            -0.13857109526572012,
            9.9843695780195716e-6,
            1.5056327351493116e-7
        ];
        const z = x - 1;
        let series = coefficients[0];
        for (let i = 1; i < coefficients.length; i++) {
            series += coefficients[i] / (z + i);
        }
        const t = z + 7.5;
        return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(series);
    }
    /**
     * Evenly strided subsample of an array.
     *
     * @param {Array} array - Source array.
     * @param {number} sampleSize - Maximum number of items to keep.
     * @returns {Array} The same array when it already fits, otherwise a new
     *   array of `sampleSize` items taken at a constant stride.
     */
    sampleArray(array, sampleSize) {
        if (array.length <= sampleSize) {
            return array;
        }
        const step = array.length / sampleSize;
        const sampled = [];
        for (let i = 0; i < sampleSize; i++) {
            sampled.push(array[Math.floor(i * step)]);
        }
        return sampled;
    }
    /**
     * Runs the KS test on each pair of reference/test samples, index by index,
     * up to the length of the shorter list.
     *
     * A pair whose test throws (for example because a sample is empty) yields a
     * neutral, non-significant placeholder result instead of aborting the batch.
     *
     * @param {number[][]} referenceSamples - Reference samples.
     * @param {number[][]} testSamples - Samples to compare, aligned by index.
     * @param {number} [alpha=0.05] - Significance level for every test.
     * @returns {Array<{statistic: number, p_value: number, critical_value: number, is_significant: boolean}>}
     */
    batchKSTest(referenceSamples, testSamples, alpha = 0.05) {
        const pairCount = Math.min(referenceSamples.length, testSamples.length);
        const results = [];
        for (let i = 0; i < pairCount; i++) {
            try {
                results.push(this.kolmogorovSmirnovTest(referenceSamples[i], testSamples[i], alpha));
            }
            catch {
                // Deliberate best effort: one bad pair must not fail the batch.
                results.push({
                    statistic: 0,
                    p_value: 1,
                    critical_value: 0,
                    is_significant: false
                });
            }
        }
        return results;
    }
    /**
     * Combined drift report built from the KS test and, optionally, PSI and the
     * Wasserstein distance.
     *
     * Severity starts at 'medium' when the KS test is significant and 'none'
     * otherwise. PSI then overrides it: a significant shift gives 'high', a
     * major shift gives 'medium', and a minor shift gives 'low' only when the KS
     * test found nothing, so extra evidence never lowers the severity.
     *
     * @param {number[]} expected - Reference data.
     * @param {number[]} actual - Data compared against the reference.
     * @param {{alpha?: number, bins?: number, includePSI?: boolean, includeWasserstein?: boolean}} [options={}]
     * @returns {{ks_test: object, psi_analysis: (object|undefined), wasserstein_distance: (number|undefined), summary: {drift_detected: boolean, severity: string, primary_indicator: string}}}
     * @throws {Error} If either dataset is empty.
     */
    compareDistributions(expected, actual, options = {}) {
        const { alpha = 0.05, bins = this.DEFAULT_BINS, includePSI = true, includeWasserstein = false } = options;
        const ksTest = this.kolmogorovSmirnovTest(expected, actual, alpha);
        const psiAnalysis = includePSI
            ? this.calculatePSIDetailed(expected, actual, bins)
            : undefined;
        const wassersteinDistance = includeWasserstein
            ? this.wassersteinDistance(expected, actual)
            : undefined;
        // Establish the KS-based baseline first so PSI can only escalate it.
        let driftDetected = ksTest.is_significant;
        let severity = ksTest.is_significant ? 'medium' : 'none';
        let primaryIndicator = 'KS test';
        if (psiAnalysis) {
            switch (psiAnalysis.stability_category) {
                case 'significant_shift':
                    driftDetected = true;
                    severity = 'high';
                    primaryIndicator = 'PSI';
                    break;
                case 'major_shift':
                    driftDetected = true;
                    severity = 'medium';
                    primaryIndicator = 'PSI';
                    break;
                case 'minor_shift':
                    if (severity === 'none') {
                        severity = 'low';
                        primaryIndicator = 'PSI';
                    }
                    break;
            }
        }
        return {
            ks_test: ksTest,
            psi_analysis: psiAnalysis,
            wasserstein_distance: wassersteinDistance,
            summary: {
                drift_detected: driftDetected,
                severity: severity,
                primary_indicator: primaryIndicator
            }
        };
    }
}
// Publish the class as the module's named `StatisticalTests` export.
exports.StatisticalTests = StatisticalTests;
//# sourceMappingURL=statistical-tests.js.map