datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
522 lines • 24 kB
JavaScript
"use strict";
/**
* Streaming Bivariate Analysis Engine
* Processes pair relationships incrementally using online algorithms
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.StreamingBivariateAnalyzer = void 0;
const online_statistics_1 = require("./online-statistics");
const statistical_tests_1 = require("./statistical-tests");
const hypothesis_tests_1 = require("../statistical-tests/hypothesis-tests");
const types_1 = require("../eda/types");
/**
 * Streaming Bivariate Analyzer
 * Processes column pairs incrementally without storing all data.
 *
 * Lifecycle: call `initializePairs()` once with the pairs to track, feed every
 * row through `processRow()`, then call `finalize()` to obtain the results.
 * Accumulators come from `./online-statistics`; significance tests come from
 * `./statistical-tests` and `../statistical-tests/hypothesis-tests`.
 */
class StreamingBivariateAnalyzer {
    // pairKey -> covariance accumulator (numerical vs numerical)
    numericalPairs = new Map();
    // pairKey -> frequency counter over encoded category combinations
    categoricalPairs = new Map();
    // pairKey -> Map(category -> per-group statistics accumulator)
    numericalCategoricalPairs = new Map();
    numericalCategoricalSamples = new Map(); // For Kruskal-Wallis
    // pairKey -> sampler of [x, y] points
    scatterSamples = new Map();
    warnings = [];
    maxPairs;
    columnNameToIndex = new Map();
    // pairKey -> { col1Name, col2Name, numericalIsFirst }. Kept so the column
    // names never have to be re-parsed out of the pair key, which is ambiguous
    // as soon as a column name itself contains "__".
    pairMetadata = new Map();
    /**
     * @param {number} [maxPairs=50] Maximum number of column pairs to track.
     */
    constructor(maxPairs = 50) {
        this.maxPairs = maxPairs;
    }
    /**
     * Initialize tracking for column pairs.
     *
     * Each pair is read for `col1Name`, `col1Index`, `col1Type`, `col2Name`,
     * `col2Index` and `col2Type`. Only the first `maxPairs` pairs are tracked;
     * a warning is recorded when the input is longer than that. Pairs whose
     * type combination is not numerical/categorical are ignored.
     *
     * @param {Array<object>} pairs Column pair descriptors.
     */
    initializePairs(pairs) {
        // Build column name to index mapping
        this.columnNameToIndex.clear();
        for (const pair of pairs) {
            this.columnNameToIndex.set(pair.col1Name, pair.col1Index);
            this.columnNameToIndex.set(pair.col2Name, pair.col2Index);
        }
        // Limit number of pairs to prevent memory explosion
        const limitedPairs = pairs.slice(0, this.maxPairs);
        if (pairs.length > this.maxPairs) {
            this.warnings.push({
                category: 'performance',
                severity: 'medium',
                message: `Too many column pairs (${pairs.length}). Analyzing first ${this.maxPairs} pairs.`,
                impact: 'Some correlations not computed',
                suggestion: 'Increase maxPairs limit or reduce number of columns',
            });
        }
        for (const pair of limitedPairs) {
            // NOTE: the key format is kept for compatibility; names are read
            // back from pairMetadata, never by splitting this key.
            const pairKey = `${pair.col1Name}__${pair.col2Name}`;
            const firstIsNumerical = this.isNumericalType(pair.col1Type);
            const secondIsNumerical = this.isNumericalType(pair.col2Type);
            const firstIsCategorical = this.isCategoricalType(pair.col1Type);
            const secondIsCategorical = this.isCategoricalType(pair.col2Type);
            if (firstIsNumerical && secondIsNumerical) {
                // Numerical vs Numerical
                this.numericalPairs.set(pairKey, new online_statistics_1.OnlineCovariance());
                this.scatterSamples.set(pairKey, new online_statistics_1.ReservoirSampler(50, 42)); // Reduced from 1000, seeded for deterministic results
            }
            else if (firstIsCategorical && secondIsCategorical) {
                // Categorical vs Categorical
                this.categoricalPairs.set(pairKey, new online_statistics_1.BoundedFrequencyCounter(200)); // Reduced from 5000
            }
            else if ((firstIsNumerical && secondIsCategorical) ||
                (firstIsCategorical && secondIsNumerical)) {
                // Numerical vs Categorical
                this.numericalCategoricalPairs.set(pairKey, new Map());
                this.numericalCategoricalSamples.set(pairKey, new Map());
            }
            else {
                // Unsupported type combination: nothing is tracked for it.
                continue;
            }
            this.pairMetadata.set(pairKey, {
                col1Name: pair.col1Name,
                col2Name: pair.col2Name,
                numericalIsFirst: firstIsNumerical,
            });
        }
    }
    /**
     * Process a row of data for all initialized pairs.
     *
     * Rows in which either value of a pair is missing (see
     * `extractNumericValue` / `extractStringValue`) are skipped for that pair.
     *
     * @param {Array<*>} row Cell values, indexed by column index.
     * @param {Array<*>} columnTypes Column types indexed like `row`. Only
     *   consulted for numerical/categorical pairs that have no metadata
     *   recorded by `initializePairs()`.
     */
    processRow(row, columnTypes) {
        // Process numerical vs numerical pairs
        for (const [pairKey, covariance] of this.numericalPairs) {
            const pair = this.resolvePair(pairKey);
            const col1Index = this.findColumnIndex(pair.col1Name, row);
            const col2Index = this.findColumnIndex(pair.col2Name, row);
            if (!(col1Index >= 0 && col2Index >= 0)) {
                continue;
            }
            const val1 = this.extractNumericValue(row[col1Index]);
            const val2 = this.extractNumericValue(row[col2Index]);
            if (val1 === null || val2 === null) {
                continue;
            }
            covariance.update(val1, val2);
            // Sample for scatter plot insights
            this.scatterSamples.get(pairKey)?.sample([val1, val2]);
        }
        // Process categorical vs categorical pairs
        for (const [pairKey, frequencyCounter] of this.categoricalPairs) {
            const pair = this.resolvePair(pairKey);
            const col1Index = this.findColumnIndex(pair.col1Name, row);
            const col2Index = this.findColumnIndex(pair.col2Name, row);
            if (!(col1Index >= 0 && col2Index >= 0)) {
                continue;
            }
            const val1 = this.extractStringValue(row[col1Index]);
            const val2 = this.extractStringValue(row[col2Index]);
            if (val1 !== null && val2 !== null) {
                frequencyCounter.update(this.encodeCategoryPair(val1, val2));
            }
        }
        // Process numerical vs categorical pairs
        for (const [pairKey, categoryGroups] of this.numericalCategoricalPairs) {
            const pair = this.resolvePair(pairKey);
            const col1Index = this.findColumnIndex(pair.col1Name, row);
            const col2Index = this.findColumnIndex(pair.col2Name, row);
            if (!(col1Index >= 0 && col2Index >= 0)) {
                continue;
            }
            // Determine which is numerical and which is categorical. The types
            // recorded at initialization classified the pair, so they win.
            const numericalIsFirst = pair.numericalIsFirst ?? this.isNumericalType(columnTypes[col1Index]);
            const numValue = this.extractNumericValue(row[numericalIsFirst ? col1Index : col2Index]);
            const catValue = this.extractStringValue(row[numericalIsFirst ? col2Index : col1Index]);
            if (numValue === null || catValue === null) {
                continue;
            }
            // Update statistics
            let groupStats = categoryGroups.get(catValue);
            if (!groupStats) {
                groupStats = new online_statistics_1.OnlineStatistics();
                categoryGroups.set(catValue, groupStats);
            }
            groupStats.update(numValue); // Only need to update with the numerical value
            // Update samples for Kruskal-Wallis test
            let categorySamples = this.numericalCategoricalSamples.get(pairKey);
            if (!categorySamples) {
                categorySamples = new Map();
                this.numericalCategoricalSamples.set(pairKey, categorySamples);
            }
            let groupSampler = categorySamples.get(catValue);
            if (!groupSampler) {
                groupSampler = new online_statistics_1.ReservoirSampler(30, 42); // Smaller sample per group, seeded
                categorySamples.set(catValue, groupSampler);
            }
            groupSampler.sample(numValue);
        }
    }
    /**
     * Finalize analysis and return results.
     *
     * @param {Array<string>} headers Column headers (passed through to the
     *   numerical analysis, which currently does not use them).
     * @returns {{numericalVsNumerical: object, numericalVsCategorical: Array<object>, categoricalVsCategorical: Array<object>}}
     */
    finalize(headers) {
        const numericalVsNumerical = this.finalizeNumericalAnalysis(headers);
        const numericalVsCategorical = this.finalizeNumericalCategoricalAnalysis();
        const categoricalVsCategorical = this.finalizeCategoricalAnalysis();
        return {
            numericalVsNumerical,
            numericalVsCategorical,
            categoricalVsCategorical,
        };
    }
    /**
     * Build the correlation report for all numerical pairs that received at
     * least one complete observation, sorted by absolute correlation.
     */
    finalizeNumericalAnalysis(_headers) {
        const correlationPairs = [];
        for (const [pairKey, covariance] of this.numericalPairs) {
            const { col1Name, col2Name } = this.resolvePair(pairKey);
            const correlation = covariance.getCorrelation();
            const count = covariance.getCount();
            if (count > 0) {
                const significanceTest = statistical_tests_1.CorrelationSignificanceTest.test(correlation, count);
                const strength = this.interpretCorrelationStrength(Math.abs(correlation));
                const sign = correlation > 0 ? 'positive' : correlation < 0 ? 'negative' : 'zero';
                const roundedCorrelation = Number(correlation.toFixed(4));
                correlationPairs.push({
                    variable1: col1Name,
                    variable2: col2Name,
                    correlation: roundedCorrelation,
                    pearsonCorrelation: roundedCorrelation,
                    pValue: significanceTest.pValue,
                    strength,
                    direction: correlation > 0 ? 'Positive' : correlation < 0 ? 'Negative' : 'None',
                    significance: significanceTest.interpretation,
                    sampleSize: count,
                    interpretation: `${strength} ${sign} correlation (${significanceTest.interpretation})`,
                });
            }
        }
        // Sort by absolute correlation strength
        correlationPairs.sort((a, b) => Math.abs(b.correlation) - Math.abs(a.correlation));
        const scatterPlotInsights = this.generateScatterPlotInsights();
        const strongCorrelations = correlationPairs.filter((pair) => Math.abs(pair.correlation) > 0.5);
        return {
            totalPairsAnalyzed: correlationPairs.length,
            correlationPairs: correlationPairs.slice(0, 50), // Top 50 correlations
            // The list is sorted by magnitude, so the first match of each sign
            // is the strongest one.
            strongestPositiveCorrelation: correlationPairs.find((p) => p.correlation > 0) || null,
            strongestNegativeCorrelation: correlationPairs.find((p) => p.correlation < 0) || null,
            strongCorrelations,
            scatterPlotInsights,
            regressionInsights: [], // Would need additional computation
        };
    }
    /**
     * Build per-category group statistics and group-difference tests for every
     * numerical/categorical pair that has at least one non-empty group.
     */
    finalizeNumericalCategoricalAnalysis() {
        const results = [];
        for (const [pairKey, categoryGroups] of this.numericalCategoricalPairs) {
            const pair = this.resolvePair(pairKey);
            const groupComparisons = [];
            for (const [category, stats] of categoryGroups) {
                const count = stats.getCount();
                if (count > 0) {
                    // Extract statistics from the OnlineStatistics object
                    const mean = stats.getMean();
                    const stdDev = stats.getStandardDeviation();
                    // Use statistical approximations for quartiles (OnlineStatistics doesn't have getQuantile)
                    const q1 = mean - 0.675 * stdDev; // Approximation for Q1 in normal distribution
                    const median = mean; // Use mean as median approximation for streaming data
                    const q3 = mean + 0.675 * stdDev; // Approximation for Q3 in normal distribution
                    groupComparisons.push({
                        category,
                        count,
                        mean: Number(mean.toFixed(4)),
                        median: Number(median.toFixed(4)),
                        standardDeviation: Number(stdDev.toFixed(4)),
                        quartile1st: Number(q1.toFixed(4)),
                        quartile3rd: Number(q3.toFixed(4)),
                    });
                }
            }
            if (groupComparisons.length > 0) {
                // Use the column types recorded at initialization. The name
                // heuristic only remains as a fallback for pairs that were not
                // registered through initializePairs().
                const numericalIsFirst = pair.numericalIsFirst ?? pair.col1Name.includes('numerical');
                results.push({
                    numericalVariable: numericalIsFirst ? pair.col1Name : pair.col2Name,
                    categoricalVariable: numericalIsFirst ? pair.col2Name : pair.col1Name,
                    groupComparisons,
                    statisticalTests: this.generateRealStatisticalTests(pairKey, groupComparisons), // Real ANOVA and Kruskal-Wallis tests
                    summary: this.generateGroupSummary(groupComparisons),
                });
            }
        }
        return results;
    }
    /**
     * Build a contingency table, association tests and a textual insight for
     * every categorical pair.
     */
    finalizeCategoricalAnalysis() {
        const results = [];
        for (const [pairKey, frequencyCounter] of this.categoricalPairs) {
            const { col1Name, col2Name } = this.resolvePair(pairKey);
            const contingencyTable = this.buildContingencyTable(frequencyCounter.getFrequencies());
            const associationTests = this.generateAssociationTests(contingencyTable);
            results.push({
                variable1: col1Name,
                variable2: col2Name,
                contingencyTable,
                associationTests,
                insights: this.generateCategoricalInsights(contingencyTable, associationTests),
            });
        }
        return results;
    }
    /**
     * Describe the sampled scatter of each numerical pair. Pairs with 10 or
     * fewer sampled points are skipped.
     */
    generateScatterPlotInsights() {
        const insights = [];
        for (const [pairKey, sampler] of this.scatterSamples) {
            const sample = sampler.getSample();
            if (sample.length <= 10) {
                continue;
            }
            const { col1Name, col2Name } = this.resolvePair(pairKey);
            // Analyze scatter pattern
            const xValues = sample.map((point) => point[0]);
            const yValues = sample.map((point) => point[1]);
            const xRange = Math.max(...xValues) - Math.min(...xValues);
            const yRange = Math.max(...yValues) - Math.min(...yValues);
            // A zero range on either axis means there is no slope to speak of.
            const pattern = xRange === 0 || yRange === 0 ? 'Constant' : 'Linear';
            insights.push({
                variable1: col1Name,
                variable2: col2Name,
                pattern,
                outlierCount: 0, // Would need outlier detection
                recommendedVisualization: 'Scatter plot with trend line',
                insights: `${sample.length} point sample shows ${pattern.toLowerCase()} relationship`,
            });
        }
        return insights;
    }
    /**
     * Turn encoded category-pair frequencies into a contingency table.
     *
     * The returned dictionaries have a null prototype: their keys come from
     * the data, and on a plain object a category named "__proto__" or
     * "constructor" would either pollute Object.prototype or be mistaken for
     * an existing entry.
     *
     * @param {Iterable<[string, number]>} frequencies Pairs of encoded
     *   combination key (see `encodeCategoryPair`) and count.
     * @returns {{table: object, rowTotals: object, columnTotals: object}}
     */
    buildContingencyTable(frequencies) {
        const table = Object.create(null);
        const rowTotals = Object.create(null);
        const columnTotals = Object.create(null);
        for (const [combinedKey, count] of frequencies) {
            const [row, col] = this.decodeCategoryPair(combinedKey);
            table[row] ??= Object.create(null);
            table[row][col] = count;
            rowTotals[row] = (rowTotals[row] ?? 0) + count;
            columnTotals[col] = (columnTotals[col] ?? 0) + count;
        }
        return { table, rowTotals, columnTotals };
    }
    /**
     * Generate real statistical tests using proper ANOVA F-test and Kruskal-Wallis test.
     *
     * Never throws: if a test fails, a warning is logged and a neutral result
     * (statistic 0, p-value 1) is returned instead.
     *
     * @param {string} pairKey Key of the numerical/categorical pair.
     * @param {Array<object>} groupComparisons Group rows produced by
     *   `finalizeNumericalCategoricalAnalysis`.
     * @returns {{anova: object, kruskalWallis: object}}
     */
    generateRealStatisticalTests(pairKey, groupComparisons) {
        try {
            // Convert group comparisons to GroupData format for statistical tests
            const groupData = groupComparisons.map((group) => ({
                name: group.category,
                count: group.count,
                mean: group.mean,
                variance: Math.pow(group.standardDeviation, 2), // Convert std dev to variance
                values: this.extractGroupValues(pairKey, group.category), // Get raw values if available
            }));
            // Handle edge cases
            if (groupData.length < 2) {
                return this.buildNeutralTestResult('Insufficient groups for statistical comparison (need ≥2 groups)');
            }
            // Perform ANOVA F-test
            const anovaResult = (0, hypothesis_tests_1.anovaFTest)(groupData);
            // Perform Kruskal-Wallis test
            const kwResult = (0, hypothesis_tests_1.kruskalWallisTest)(groupData);
            return {
                anova: {
                    fStatistic: anovaResult.statistic,
                    pValue: anovaResult.pValue,
                    interpretation: this.formatAnovaInterpretation(anovaResult),
                },
                kruskalWallis: {
                    hStatistic: kwResult.statistic,
                    pValue: kwResult.pValue,
                    interpretation: this.formatKruskalWallisInterpretation(kwResult),
                },
            };
        }
        catch (error) {
            // Fallback to safe default values if statistical tests fail
            console.warn(`Statistical tests failed for ${pairKey}:`, error);
            return this.buildNeutralTestResult('Statistical test failed due to insufficient or invalid data');
        }
    }
    /**
     * Neutral ANOVA / Kruskal-Wallis result used when no test could be run.
     */
    buildNeutralTestResult(interpretation) {
        return {
            anova: {
                fStatistic: 0,
                pValue: 1,
                interpretation,
            },
            kruskalWallis: {
                hStatistic: 0,
                pValue: 1,
                interpretation,
            },
        };
    }
    /**
     * Extract raw values for a specific group if available from reservoir samples.
     *
     * @returns {Array<number>|undefined} Sampled values, or undefined when the
     *   pair or the category has no sampler.
     */
    extractGroupValues(pairKey, categoryName) {
        return this.numericalCategoricalSamples.get(pairKey)?.get(categoryName)?.getSample();
    }
    /**
     * Map a p-value onto the significance phrase shared by the ANOVA and
     * Kruskal-Wallis interpretations.
     */
    describeSignificance(pValue) {
        if (pValue < 0.001) {
            return 'highly significant (p < 0.001)';
        }
        if (pValue < 0.01) {
            return 'very significant (p < 0.01)';
        }
        if (pValue < 0.05) {
            return 'significant (p < 0.05)';
        }
        return 'not significant (p ≥ 0.05)';
    }
    /**
     * Format ANOVA result interpretation for compact display.
     *
     * @param {{statistic: number, pValue: number, degreesOfFreedom: (number|Array<number>)}} result
     */
    formatAnovaInterpretation(result) {
        const significance = this.describeSignificance(result.pValue);
        const conclusion = result.pValue < 0.05
            ? 'Group means differ significantly'
            : 'No significant difference between group means';
        const degreesOfFreedom = Array.isArray(result.degreesOfFreedom)
            ? result.degreesOfFreedom.join(',')
            : result.degreesOfFreedom;
        return `F(${degreesOfFreedom}) = ${result.statistic.toFixed(3)}, p = ${result.pValue.toFixed(4)} (${significance}). ${conclusion}.`;
    }
    /**
     * Format Kruskal-Wallis result interpretation for compact display.
     *
     * @param {{statistic: number, pValue: number, degreesOfFreedom: number}} result
     */
    formatKruskalWallisInterpretation(result) {
        const significance = this.describeSignificance(result.pValue);
        const conclusion = result.pValue < 0.05
            ? 'Group distributions differ significantly'
            : 'No significant difference between group distributions';
        return `H = ${result.statistic.toFixed(3)}, df = ${result.degreesOfFreedom}, p = ${result.pValue.toFixed(4)} (${significance}). ${conclusion}.`;
    }
    /**
     * Label an association coefficient (Cramér's V or the contingency
     * coefficient) using the 0.5 / 0.3 / 0.1 thresholds.
     */
    describeAssociationStrength(coefficient) {
        if (coefficient > 0.5) {
            return 'Strong association';
        }
        if (coefficient > 0.3) {
            return 'Moderate association';
        }
        if (coefficient > 0.1) {
            return 'Weak association';
        }
        return 'Very weak association';
    }
    /**
     * Run the chi-squared test on a contingency table and derive Cramér's V
     * and the contingency coefficient.
     *
     * @param {{table: object}} contingencyTable Result of `buildContingencyTable`.
     * @returns {{chiSquare: object, cramersV: object, contingencyCoefficient: object}}
     */
    generateAssociationTests(contingencyTable) {
        const { table } = contingencyTable;
        // Convert contingency table to matrix format for chi-squared test
        const rows = Object.keys(table);
        // Collect columns from every row: a combination that never occurred
        // has no cell, so the first row alone may not list all columns.
        const cols = [...new Set(rows.flatMap((row) => Object.keys(table[row])))];
        if (rows.length < 2 || cols.length < 2) {
            return {
                chiSquare: {
                    statistic: 0,
                    pValue: 1,
                    degreesOfFreedom: 0,
                    interpretation: 'Insufficient data for chi-squared test',
                },
                cramersV: {
                    statistic: 0,
                    interpretation: 'Cannot calculate association strength',
                },
                contingencyCoefficient: {
                    statistic: 0,
                    interpretation: 'Cannot calculate contingency coefficient',
                },
            };
        }
        // Missing cells are combinations that were never observed.
        const matrix = rows.map((row) => cols.map((col) => table[row][col] ?? 0));
        const chiSquaredResult = statistical_tests_1.ChiSquaredTest.test(matrix);
        // Calculate contingency coefficient: sqrt(chi2 / (chi2 + n))
        const totalObservations = matrix.flat().reduce((sum, val) => sum + val, 0);
        const denominator = chiSquaredResult.statistic + totalObservations;
        // Guard the 0/0 case so the coefficient is 0 rather than NaN.
        const contingencyCoeff = denominator === 0 ? 0 : Math.sqrt(chiSquaredResult.statistic / denominator);
        return {
            chiSquare: {
                statistic: chiSquaredResult.statistic,
                pValue: chiSquaredResult.pValue,
                degreesOfFreedom: chiSquaredResult.degreesOfFreedom,
                interpretation: chiSquaredResult.interpretation,
            },
            cramersV: {
                statistic: chiSquaredResult.cramersV,
                interpretation: this.describeAssociationStrength(chiSquaredResult.cramersV),
            },
            contingencyCoefficient: {
                statistic: Number(contingencyCoeff.toFixed(4)),
                interpretation: this.describeAssociationStrength(contingencyCoeff),
            },
        };
    }
    /**
     * One-line summary naming the groups with the highest and lowest mean.
     */
    generateGroupSummary(comparisons) {
        if (comparisons.length === 0) {
            return 'No groups to compare';
        }
        // Sort a copy so the caller's array keeps its order.
        const sorted = [...comparisons].sort((a, b) => b.mean - a.mean);
        const highest = sorted[0];
        const lowest = sorted[sorted.length - 1];
        return `${highest.category} has highest mean (${highest.mean.toFixed(2)}), ${lowest.category} has lowest (${lowest.mean.toFixed(2)})`;
    }
    /**
     * One-line summary naming the most frequent category combination and the
     * association strength derived from Cramér's V.
     */
    generateCategoricalInsights(contingencyTable, associationTests) {
        const { table } = contingencyTable;
        // Find most common combination
        let maxCount = 0;
        let maxCombination = '';
        for (const [row, cols] of Object.entries(table)) {
            for (const [col, count] of Object.entries(cols)) {
                if (count > maxCount) {
                    maxCount = count;
                    maxCombination = `${row} & ${col}`;
                }
            }
        }
        const cramersV = associationTests.cramersV.statistic;
        const strength = cramersV > 0.5 ? 'strong' : cramersV > 0.3 ? 'moderate' : 'weak';
        return `Most common combination: ${maxCombination} (${maxCount} occurrences). Association strength: ${strength}.`;
    }
    /**
     * Label an absolute correlation value.
     *
     * @param {number} absCorr Absolute value of the correlation coefficient.
     */
    interpretCorrelationStrength(absCorr) {
        if (absCorr >= 0.8) {
            return 'Very Strong';
        }
        if (absCorr >= 0.6) {
            return 'Strong';
        }
        if (absCorr >= 0.4) {
            return 'Moderate';
        }
        if (absCorr >= 0.2) {
            return 'Weak';
        }
        return 'Very Weak';
    }
    isNumericalType(type) {
        return type === types_1.EdaDataType.NUMERICAL_FLOAT || type === types_1.EdaDataType.NUMERICAL_INTEGER;
    }
    isCategoricalType(type) {
        return type === types_1.EdaDataType.CATEGORICAL;
    }
    /**
     * Look up a column's index; returns -1 for names that were not part of any
     * pair passed to `initializePairs()`.
     */
    findColumnIndex(columnName, _row) {
        // Use the column name to index mapping built during initialization
        return this.columnNameToIndex.get(columnName) ?? -1;
    }
    /**
     * Return the column names (and, when known, which side is numerical) for a
     * pair key.
     *
     * Keys registered through `initializePairs()` are answered from stored
     * metadata. Any other key falls back to splitting on "__", which is only
     * reliable when neither column name contains "__"; `numericalIsFirst` is
     * null in that case.
     */
    resolvePair(pairKey) {
        const known = this.pairMetadata.get(pairKey);
        if (known) {
            return known;
        }
        const [col1Name, col2Name] = pairKey.split('__');
        return { col1Name, col2Name, numericalIsFirst: null };
    }
    /**
     * Encode two category values as one frequency-counter key. JSON is used so
     * that no character inside a value can be mistaken for the separator.
     */
    encodeCategoryPair(first, second) {
        return JSON.stringify([first, second]);
    }
    /**
     * Decode a frequency-counter key into `[row, col]`. Accepts keys produced
     * by `encodeCategoryPair` as well as the older "row||col" form.
     */
    decodeCategoryPair(combinedKey) {
        if (combinedKey.startsWith('[')) {
            try {
                const parsed = JSON.parse(combinedKey);
                if (Array.isArray(parsed) &&
                    parsed.length === 2 &&
                    typeof parsed[0] === 'string' &&
                    typeof parsed[1] === 'string') {
                    return parsed;
                }
            }
            catch {
                // Not JSON after all: treat it as a "row||col" key below.
            }
        }
        const [row, col] = combinedKey.split('||');
        return [row, col];
    }
    /**
     * Convert a cell to a finite number, or null when it is missing.
     *
     * Blank and whitespace-only strings count as missing (plain `Number(' ')`
     * would yield 0), as do NaN and ±Infinity, which would otherwise poison
     * every running statistic they are folded into.
     */
    extractNumericValue(value) {
        if (value === null || value === undefined) {
            return null;
        }
        if (typeof value === 'string' && value.trim() === '') {
            return null;
        }
        const num = typeof value === 'number' ? value : Number(value);
        return Number.isFinite(num) ? num : null;
    }
    /**
     * Convert a cell to a string, or null when it is null, undefined or ''.
     */
    extractStringValue(value) {
        if (value === null || value === undefined || value === '') {
            return null;
        }
        return String(value);
    }
    /**
     * @returns {Array<object>} A copy of the warnings recorded so far.
     */
    getWarnings() {
        return [...this.warnings];
    }
}
exports.StreamingBivariateAnalyzer = StreamingBivariateAnalyzer;
//# sourceMappingURL=streaming-bivariate-analyzer.js.map