datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
1,128 lines • 46.6 kB
JavaScript
"use strict";
/**
* Streaming Univariate Analysis Engine
* Processes data incrementally using online algorithms
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.StreamingTextAnalyzer = exports.StreamingBooleanAnalyzer = exports.StreamingDateTimeAnalyzer = exports.StreamingCategoricalAnalyzer = exports.StreamingNumericalAnalyzer = void 0;
const online_statistics_1 = require("./online-statistics");
const statistical_tests_1 = require("./statistical-tests");
const types_1 = require("../eda/types");
/**
 * Streaming Numerical Column Analyzer
 *
 * Consumes the values of one column one at a time (processValue) and builds a
 * numeric profile on demand (finalize). Aggregates such as mean and variance are
 * read from an OnlineStatistics instance that sees every valid value; percentiles
 * are read from one P2Quantile estimator per tracked percentile; MAD, outlier
 * counts, normality tests and pattern checks are computed on the
 * ReservoirSampler sample only, so those figures describe the sample rather than
 * the full stream.
 */
class StreamingNumericalAnalyzer {
  columnName;
  detectedType;
  semanticType;
  stats = new online_statistics_1.OnlineStatistics();
  quantiles;
  reservoir = new online_statistics_1.ReservoirSampler(100, 42); // Reduced from 1000, seeded for deterministic results
  frequencies = new online_statistics_1.BoundedFrequencyCounter(100); // Reduced from 1000
  warnings = [];
  totalValues = 0;
  validValues = 0;
  nullValues = 0;

  /**
   * @param columnName Column label used in the profile and in warnings.
   * @param detectedType Detected data type, copied verbatim into the profile.
   * @param semanticType Semantic type, copied verbatim into the profile.
   */
  constructor(columnName, detectedType, semanticType = types_1.SemanticType.UNKNOWN) {
    this.columnName = columnName;
    this.detectedType = detectedType;
    this.semanticType = semanticType;
    // Initialize quantile estimators for common percentiles (keyed by percentile)
    this.quantiles = new Map([
      [1, new online_statistics_1.P2Quantile(0.01)],
      [5, new online_statistics_1.P2Quantile(0.05)],
      [10, new online_statistics_1.P2Quantile(0.1)],
      [25, new online_statistics_1.P2Quantile(0.25)],
      [50, new online_statistics_1.P2Quantile(0.5)],
      [75, new online_statistics_1.P2Quantile(0.75)],
      [90, new online_statistics_1.P2Quantile(0.9)],
      [95, new online_statistics_1.P2Quantile(0.95)],
      [99, new online_statistics_1.P2Quantile(0.99)],
    ]);
  }

  /**
   * Feed one raw cell into the analyzer.
   *
   * null, undefined, empty or whitespace-only strings, and anything that does not
   * convert to a finite number are counted as missing; everything else updates
   * every streaming accumulator.
   *
   * @param value Raw cell value (number or something convertible with Number()).
   */
  processValue(value) {
    this.totalValues++;
    if (value === null || value === undefined || value === '') {
      this.nullValues++;
      return;
    }
    // Number('   ') evaluates to 0, so blank strings must be rejected before conversion
    if (typeof value === 'string' && value.trim() === '') {
      this.nullValues++;
      return;
    }
    const numValue = typeof value === 'number' ? value : Number(value);
    // Rejects NaN and also +/-Infinity, which would turn mean/variance into Infinity/NaN
    if (!Number.isFinite(numValue)) {
      this.nullValues++;
      return;
    }
    this.validValues++;
    // Update all streaming statistics
    this.stats.update(numValue);
    this.quantiles.forEach((quantile) => quantile.update(numValue));
    this.reservoir.sample(numValue);
    this.frequencies.update(numValue);
  }

  /**
   * Build the complete profile from the state accumulated so far.
   *
   * Records a high-severity warning when no valid numeric value has been seen.
   *
   * @returns Base profile fields plus descriptiveStats, quantileStats,
   *   distributionAnalysis, normalityTests, outlierAnalysis and numericalPatterns.
   */
  finalize() {
    if (this.validValues === 0) {
      this.warnings.push({
        category: 'data',
        severity: 'high',
        message: `Column ${this.columnName} has no valid numeric values`,
        impact: 'Statistical analysis not possible',
        suggestion: 'Check data type detection or data quality',
      });
    }
    return {
      ...this.createBaseProfile(),
      descriptiveStats: this.getDescriptiveStatistics(),
      quantileStats: this.getQuantileStatistics(),
      distributionAnalysis: this.getDistributionAnalysis(),
      normalityTests: this.getNormalityTests(),
      outlierAnalysis: this.getOutlierAnalysis(),
      numericalPatterns: this.getNumericalPatterns(),
    };
  }

  /**
   * Column-level counts shared by every analyzer type.
   *
   * uniqueValues is the number of entries held by the frequency counter.
   * With zero processed values the ratios are treated as 0 instead of dividing by zero.
   */
  createBaseProfile() {
    const uniqueValues = this.frequencies.getFrequencies().size;
    const hasRows = this.totalValues > 0;
    const validRatio = hasRows ? this.validValues / this.totalValues : 0;
    const missingRatio = hasRows ? this.nullValues / this.totalValues : 0;
    let dataQualityFlag = 'Poor';
    if (validRatio > 0.95) {
      dataQualityFlag = 'Good';
    }
    else if (validRatio > 0.8) {
      dataQualityFlag = 'Moderate';
    }
    return {
      columnName: this.columnName,
      detectedDataType: this.detectedType,
      inferredSemanticType: this.semanticType,
      dataQualityFlag,
      totalValues: this.totalValues,
      missingValues: this.nullValues,
      missingPercentage: Number((missingRatio * 100).toFixed(2)),
      uniqueValues,
      uniquePercentage: this.calculateUniquePercentage(uniqueValues, this.validValues),
    };
  }

  /**
   * Min/max/sum/mean/median/modes/dispersion. All zeros when nothing valid was seen.
   *
   * Modes are the entries among the five most frequent values that share the
   * highest count. The median comes from the 50th-percentile estimator, falling
   * back to the mean if that estimator is missing.
   */
  getDescriptiveStatistics() {
    if (this.validValues === 0) {
      return {
        minimum: 0,
        maximum: 0,
        range: 0,
        sum: 0,
        mean: 0,
        median: 0,
        modes: [],
        standardDeviation: 0,
        variance: 0,
        coefficientOfVariation: 0,
      };
    }
    // Calculate modes from frequency data
    const topFrequencies = this.frequencies.getTopK(5);
    const maxFreq = topFrequencies.length > 0 ? topFrequencies[0][1] : 0;
    const modes = topFrequencies
      .filter(([, freq]) => freq === maxFreq)
      .map(([value, frequency]) => ({
        value,
        frequency,
        percentage: Number(((frequency / this.validValues) * 100).toFixed(2)),
      }));
    const medianQuantile = this.quantiles.get(50);
    const medianValue = medianQuantile ? medianQuantile.getQuantile() : this.stats.getMean();
    return {
      minimum: this.stats.getMin(),
      maximum: this.stats.getMax(),
      range: this.stats.getRange(),
      sum: Number(this.stats.getSum().toFixed(6)),
      mean: Number(this.stats.getMean().toFixed(6)),
      median: Number(medianValue.toFixed(6)),
      modes,
      standardDeviation: Number(this.stats.getStandardDeviation().toFixed(6)),
      variance: Number(this.stats.getVariance().toFixed(6)),
      coefficientOfVariation: Number(this.stats.getCoefficientOfVariation().toFixed(4)),
    };
  }

  /**
   * Percentiles from the streaming estimators plus IQR and a sample-based MAD.
   * All zeros when nothing valid was seen; a missing estimator reads as 0.
   */
  getQuantileStatistics() {
    if (this.validValues === 0) {
      return {
        percentile1st: 0,
        percentile5th: 0,
        percentile10th: 0,
        quartile1st: 0,
        quartile3rd: 0,
        percentile90th: 0,
        percentile95th: 0,
        percentile99th: 0,
        interquartileRange: 0,
        medianAbsoluteDeviation: 0,
      };
    }
    const getQuantileValue = (percentile) => {
      const quantile = this.quantiles.get(percentile);
      return quantile ? quantile.getQuantile() : 0;
    };
    const q1 = getQuantileValue(25);
    const q3 = getQuantileValue(75);
    const median = getQuantileValue(50);
    // MAD is taken from the reservoir sample (middle element of the sorted deviations)
    const absoluteDeviations = this.reservoir
      .getSample()
      .map((val) => Math.abs(val - median))
      .sort((a, b) => a - b);
    const mad = absoluteDeviations.length > 0
      ? absoluteDeviations[Math.floor(absoluteDeviations.length / 2)]
      : 0;
    return {
      percentile1st: Number(getQuantileValue(1).toFixed(6)),
      percentile5th: Number(getQuantileValue(5).toFixed(6)),
      percentile10th: Number(getQuantileValue(10).toFixed(6)),
      quartile1st: Number(q1.toFixed(6)),
      quartile3rd: Number(q3.toFixed(6)),
      percentile90th: Number(getQuantileValue(90).toFixed(6)),
      percentile95th: Number(getQuantileValue(95).toFixed(6)),
      percentile99th: Number(getQuantileValue(99).toFixed(6)),
      interquartileRange: Number((q3 - q1).toFixed(6)),
      medianAbsoluteDeviation: Number(mad.toFixed(6)),
    };
  }

  /**
   * Skewness/kurtosis with a textual interpretation and a short histogram note.
   * Needs at least three valid values.
   */
  getDistributionAnalysis() {
    if (this.validValues < 3) {
      return {
        skewness: 0,
        skewnessInterpretation: 'Insufficient data',
        kurtosis: 0,
        kurtosisInterpretation: 'Insufficient data',
        histogramSummary: 'Too few values for distribution analysis',
      };
    }
    const skewness = this.stats.getSkewness();
    const kurtosis = this.stats.getKurtosis();
    // Once |x| >= 0.5 only the sign matters; testing "> 0.5" again would send a
    // value of exactly 0.5 to the negative branch.
    let skewnessInterpretation = 'Left-skewed (negative skew)';
    if (Math.abs(skewness) < 0.5) {
      skewnessInterpretation = 'Approximately symmetric';
    }
    else if (skewness > 0) {
      skewnessInterpretation = 'Right-skewed (positive skew)';
    }
    let kurtosisInterpretation = 'Platykurtic (light tails)';
    if (Math.abs(kurtosis) < 0.5) {
      kurtosisInterpretation = 'Mesokurtic (normal-like tails)';
    }
    else if (kurtosis > 0) {
      kurtosisInterpretation = 'Leptokurtic (heavy tails)';
    }
    const range = this.stats.getRange();
    const bins = Math.min(10, Math.ceil(Math.sqrt(this.validValues)));
    let histogramSummary = `Distribution spans ${bins} bins`;
    if (range === 0) {
      histogramSummary = 'All values are identical';
    }
    else if (bins <= 3) {
      histogramSummary = 'Distribution is highly concentrated';
    }
    return {
      skewness: Number(skewness.toFixed(4)),
      skewnessInterpretation,
      kurtosis: Number(kurtosis.toFixed(4)),
      kurtosisInterpretation,
      histogramSummary,
    };
  }

  /**
   * Run the Shapiro-Wilk, Jarque-Bera and Kolmogorov-Smirnov tests from the
   * statistical-tests module on the reservoir sample. Returns a neutral
   * placeholder (statistic 0, p-value 1) when fewer than three values are available.
   */
  getNormalityTests() {
    const placeholder = (interpretation) => {
      const result = { statistic: 0, pValue: 1, interpretation };
      return { shapiroWilk: result, jarqueBera: result, kolmogorovSmirnov: result };
    };
    if (this.validValues < 3) {
      return placeholder('Insufficient data for normality testing');
    }
    // Get sample data for testing (from reservoir sampler)
    const sampleData = this.reservoir.getSample();
    if (sampleData.length < 3) {
      return placeholder('Insufficient sample data for normality testing');
    }
    const pick = ({ statistic, pValue, interpretation }) => ({ statistic, pValue, interpretation });
    return {
      shapiroWilk: pick(statistical_tests_1.ShapiroWilkTest.test(sampleData)),
      jarqueBera: pick(statistical_tests_1.JarqueBeraTest.test(sampleData)),
      kolmogorovSmirnov: pick(statistical_tests_1.KolmogorovSmirnovTest.test(sampleData)),
    };
  }

  /**
   * Outlier counts by IQR fences, z-score (> 3) and modified z-score (> 3.5).
   *
   * Fences use the streaming quartiles, the z-score uses the streaming mean and
   * standard deviation, and all counts are taken over the reservoir sample.
   * The summary counts each sampled observation once if any method flags it.
   * Returns an all-zero result with fewer than three valid values or an empty sample.
   */
  getOutlierAnalysis() {
    const sample = this.reservoir.getSample();
    if (this.validValues < 3 || sample.length === 0) {
      const emptyResult = {
        lowerFence: 0,
        upperFence: 0,
        lowerOutliers: 0,
        upperOutliers: 0,
        lowerPercentage: 0,
        upperPercentage: 0,
        extremeOutliers: 0,
        extremePercentage: 0,
      };
      return {
        iqrMethod: emptyResult,
        zScoreMethod: { threshold: 3, lowerOutliers: 0, upperOutliers: 0 },
        modifiedZScoreMethod: { threshold: 3.5, outliers: 0 },
        summary: {
          totalOutliers: 0,
          totalPercentage: 0,
          minOutlierValue: 0,
          maxOutlierValue: 0,
          potentialImpact: 'No outliers detected',
        },
      };
    }
    const q1Quantile = this.quantiles.get(25);
    const q3Quantile = this.quantiles.get(75);
    const q1 = q1Quantile ? q1Quantile.getQuantile() : 0;
    const q3 = q3Quantile ? q3Quantile.getQuantile() : 0;
    const iqr = q3 - q1;
    const lowerFence = q1 - 1.5 * iqr;
    const upperFence = q3 + 1.5 * iqr;
    const extremeLowerFence = q1 - 3 * iqr;
    const extremeUpperFence = q3 + 3 * iqr;
    // "Mild" outliers sit between the 1.5*IQR and 3*IQR fences; beyond 3*IQR is extreme
    const lowerOutliers = sample.filter((val) => val < lowerFence && val >= extremeLowerFence).length;
    const upperOutliers = sample.filter((val) => val > upperFence && val <= extremeUpperFence).length;
    const extremeOutliers = sample.filter((val) => val < extremeLowerFence || val > extremeUpperFence).length;
    const mean = this.stats.getMean();
    const stdDev = this.stats.getStandardDeviation();
    const medianQuantile = this.quantiles.get(50);
    const median = medianQuantile ? medianQuantile.getQuantile() : this.stats.getMean();
    const absoluteDeviations = sample.map((val) => Math.abs(val - median)).sort((a, b) => a - b);
    const mad = absoluteDeviations[Math.floor(absoluteDeviations.length / 2)] || 0;
    const isIqrOutlier = (val) => val < lowerFence || val > upperFence;
    // A zero spread makes the score undefined, so that method flags nothing
    const isZScoreOutlier = (val) => stdDev > 0 && Math.abs((val - mean) / stdDev) > 3;
    const isModifiedZOutlier = (val) => mad > 0 && Math.abs((0.6745 * (val - median)) / mad) > 3.5;
    const zScoreOutliers = sample.filter(isZScoreOutlier);
    const modifiedZOutliers = sample.filter(isModifiedZOutlier);
    // Union per observation: de-duplicating by value would count repeated outlier
    // values only once and understate the total.
    const flagged = sample.filter((val) => isIqrOutlier(val) || isZScoreOutlier(val) || isModifiedZOutlier(val));
    const toPercent = (count) => Number(((count / sample.length) * 100).toFixed(2));
    return {
      iqrMethod: {
        lowerFence: Number(lowerFence.toFixed(6)),
        upperFence: Number(upperFence.toFixed(6)),
        lowerOutliers,
        upperOutliers,
        lowerPercentage: toPercent(lowerOutliers),
        upperPercentage: toPercent(upperOutliers),
        extremeOutliers,
        extremePercentage: toPercent(extremeOutliers),
      },
      zScoreMethod: {
        threshold: 3,
        lowerOutliers: zScoreOutliers.filter((val) => val < mean).length,
        upperOutliers: zScoreOutliers.filter((val) => val > mean).length,
      },
      modifiedZScoreMethod: {
        threshold: 3.5,
        outliers: modifiedZOutliers.length,
      },
      summary: {
        totalOutliers: flagged.length,
        totalPercentage: toPercent(flagged.length),
        minOutlierValue: flagged.length > 0 ? Math.min(...flagged) : 0,
        maxOutlierValue: flagged.length > 0 ? Math.max(...flagged) : 0,
        potentialImpact: flagged.length > sample.length * 0.05
          ? 'High outlier presence may affect analysis'
          : 'Low outlier impact',
      },
    };
  }

  /**
   * Sample-based pattern notes: share of zeros and negatives, share of multiples
   * of five, and whether a log transform looks worthwhile (all sampled values
   * positive and at least one above 1000). Percentages are 0 for an empty sample.
   */
  getNumericalPatterns() {
    const sample = this.reservoir.getSample();
    const toPercent = (count) => (sample.length > 0 ? (count / sample.length) * 100 : 0);
    const zeroCount = sample.filter((val) => val === 0).length;
    const negativeCount = sample.filter((val) => val < 0).length;
    // Every multiple of 10 is also a multiple of 5, so one test covers both
    const roundPercentage = toPercent(sample.filter((val) => val % 5 === 0).length);
    let roundNumbersNote = 'No significant rounding detected';
    if (roundPercentage > 30) {
      roundNumbersNote = 'High proportion of round numbers suggests potential data rounding';
    }
    else if (roundPercentage > 10) {
      roundNumbersNote = 'Moderate rounding detected';
    }
    const allPositive = sample.every((val) => val > 0);
    const logTransformationPotential = allPositive && sample.some((val) => val > 1000)
      ? 'Good candidate for log transformation due to wide range'
      : 'Log transformation may not be beneficial';
    return {
      zeroValuePercentage: Number(toPercent(zeroCount).toFixed(2)),
      negativeValuePercentage: Number(toPercent(negativeCount).toFixed(2)),
      roundNumbersNote,
      logTransformationPotential,
    };
  }

  /** @returns A copy of the warnings collected so far. */
  getWarnings() {
    return [...this.warnings];
  }

  /** Replace the reservoir with a fresh one so the held sample can be released. */
  clearMemory() {
    this.reservoir = new online_statistics_1.ReservoirSampler(100, 42);
  }

  /**
   * Calculate uniqueness percentage using consistent formula across all sections
   * Fixes inter-section consistency bug
   *
   * @returns uniqueCount as a percentage of validValueCount (2 decimals), or 0
   *   when validValueCount is 0.
   */
  calculateUniquePercentage(uniqueCount, validValueCount) {
    if (validValueCount === 0) {
      return 0;
    }
    return Number(((uniqueCount / validValueCount) * 100).toFixed(2));
  }
}
exports.StreamingNumericalAnalyzer = StreamingNumericalAnalyzer;
/**
 * Streaming Categorical Column Analyzer
 *
 * Counts category labels (values converted with String()) in a
 * BoundedFrequencyCounter and tracks label lengths in an OnlineStatistics
 * instance. finalize() derives the frequency table, diversity metrics, label
 * length summary and encoding recommendations from those two accumulators.
 */
class StreamingCategoricalAnalyzer {
  columnName;
  detectedType;
  semanticType;
  frequencies = new online_statistics_1.BoundedFrequencyCounter(500); // Reduced from 10000
  warnings = [];
  totalValues = 0;
  validValues = 0;
  nullValues = 0;
  lengthStats = new online_statistics_1.OnlineStatistics();

  /**
   * @param columnName Column label used in the profile.
   * @param detectedType Detected data type, copied verbatim into the profile.
   * @param semanticType Semantic type, copied verbatim into the profile.
   */
  constructor(columnName, detectedType, semanticType = types_1.SemanticType.UNKNOWN) {
    this.columnName = columnName;
    this.detectedType = detectedType;
    this.semanticType = semanticType;
  }

  /**
   * Feed one raw cell. null, undefined and '' are counted as missing; anything
   * else is stringified and counted as a category label.
   */
  processValue(value) {
    this.totalValues++;
    if (value === null || value === undefined || value === '') {
      this.nullValues++;
      return;
    }
    const label = String(value);
    this.validValues++;
    this.frequencies.update(label);
    this.lengthStats.update(label.length);
  }

  /**
   * Build the complete categorical profile.
   *
   * @returns Base profile fields plus the most/second-most/least frequent
   *   categories (an empty placeholder when absent), the first 20 rows of the
   *   frequency table, diversity metrics, label analysis and recommendations.
   */
  finalize() {
    const baseProfile = this.createBaseProfile();
    const frequencies = this.getFrequencyDistribution();
    const emptyCategory = () => ({ label: '', count: 0, percentage: 0, cumulativePercentage: 0 });
    return {
      ...baseProfile,
      uniqueCategories: this.frequencies.getFrequencies().size,
      mostFrequentCategory: frequencies[0] || emptyCategory(),
      secondMostFrequentCategory: frequencies[1] || emptyCategory(),
      leastFrequentCategory: frequencies[frequencies.length - 1] || emptyCategory(),
      frequencyDistribution: frequencies.slice(0, 20),
      diversityMetrics: this.getDiversityMetrics(frequencies),
      labelAnalysis: this.getLabelAnalysis(),
      recommendations: this.getRecommendations(frequencies, baseProfile.uniqueValues),
    };
  }

  /**
   * Column-level counts shared by every analyzer type.
   * With zero processed values the ratios are treated as 0 instead of dividing by zero.
   */
  createBaseProfile() {
    const uniqueValues = this.frequencies.getFrequencies().size;
    const hasRows = this.totalValues > 0;
    const validRatio = hasRows ? this.validValues / this.totalValues : 0;
    const missingRatio = hasRows ? this.nullValues / this.totalValues : 0;
    let dataQualityFlag = 'Poor';
    if (validRatio > 0.95) {
      dataQualityFlag = 'Good';
    }
    else if (validRatio > 0.8) {
      dataQualityFlag = 'Moderate';
    }
    return {
      columnName: this.columnName,
      detectedDataType: this.detectedType,
      inferredSemanticType: this.semanticType,
      dataQualityFlag,
      totalValues: this.totalValues,
      missingValues: this.nullValues,
      missingPercentage: Number((missingRatio * 100).toFixed(2)),
      uniqueValues,
      uniquePercentage: this.calculateUniquePercentage(uniqueValues, this.validValues),
    };
  }

  /**
   * Frequency table sorted by descending count, each row carrying its share of
   * the valid values and the running cumulative share.
   */
  getFrequencyDistribution() {
    const ranked = Array.from(this.frequencies.getFrequencies().entries())
      .map(([label, count]) => ({
        label,
        count,
        percentage: Number(((count / this.validValues) * 100).toFixed(2)),
        cumulativePercentage: 0,
      }))
      .sort((a, b) => b.count - a.count);
    // Accumulate raw counts, not the already-rounded percentages: summing rounded
    // values drifts (three equal categories would end at 99.99 instead of 100).
    let runningCount = 0;
    for (const row of ranked) {
      runningCount += row.count;
      row.cumulativePercentage = Number(((runningCount / this.validValues) * 100).toFixed(2));
    }
    return ranked;
  }

  /**
   * Shannon entropy (base 2), its maximum for the observed number of categories,
   * Gini impurity, and two textual readings: balance (entropy / max entropy) and
   * dominance (share of the first row of the table).
   *
   * @param frequencies Rows produced by getFrequencyDistribution().
   */
  getDiversityMetrics(frequencies) {
    if (frequencies.length === 0) {
      return {
        shannonEntropy: 0,
        maxEntropy: 0,
        giniImpurity: 0,
        balanceInterpretation: 'No categories',
        majorCategoryDominance: 'No data',
      };
    }
    let shannonEntropy = 0;
    let sumOfSquares = 0;
    for (const { count } of frequencies) {
      const probability = count / this.validValues;
      shannonEntropy -= probability * Math.log2(probability);
      sumOfSquares += Math.pow(probability, 2);
    }
    const giniImpurity = 1 - sumOfSquares;
    const maxEntropy = Math.log2(frequencies.length);
    // A single category has maxEntropy 0; report it as fully unbalanced rather than 0/0
    const normalizedEntropy = maxEntropy > 0 ? shannonEntropy / maxEntropy : 0;
    let balanceInterpretation = 'Highly unbalanced distribution';
    if (normalizedEntropy > 0.9) {
      balanceInterpretation = 'Highly balanced distribution';
    }
    else if (normalizedEntropy > 0.7) {
      balanceInterpretation = 'Moderately balanced distribution';
    }
    else if (normalizedEntropy > 0.4) {
      balanceInterpretation = 'Unbalanced distribution';
    }
    const topCategoryPercentage = frequencies[0]?.percentage || 0;
    let majorCategoryDominance = 'Well distributed';
    if (topCategoryPercentage > 80) {
      majorCategoryDominance = 'Single category dominates';
    }
    else if (topCategoryPercentage > 60) {
      majorCategoryDominance = 'Major category present';
    }
    else if (topCategoryPercentage > 40) {
      majorCategoryDominance = 'Moderate concentration';
    }
    return {
      shannonEntropy: Number(shannonEntropy.toFixed(4)),
      maxEntropy: Number(maxEntropy.toFixed(4)),
      giniImpurity: Number(giniImpurity.toFixed(4)),
      balanceInterpretation,
      majorCategoryDominance,
    };
  }

  /** Min/max/average label length over all valid values; zeros when there are none. */
  getLabelAnalysis() {
    if (this.validValues === 0) {
      return { minLabelLength: 0, maxLabelLength: 0, avgLabelLength: 0, emptyLabelsCount: 0 };
    }
    return {
      minLabelLength: this.lengthStats.getMin(),
      maxLabelLength: this.lengthStats.getMax(),
      avgLabelLength: Number(this.lengthStats.getMean().toFixed(1)),
      emptyLabelsCount: 0, // Empty strings already filtered out
    };
  }

  /**
   * Optional notes: a high-cardinality warning above 100 categories and a
   * rare-category note when more than half of the categories are below 1% each.
   *
   * @param frequencies Rows produced by getFrequencyDistribution().
   * @param uniqueCount Number of distinct categories.
   * @returns Object holding only the notes that apply (possibly empty).
   */
  getRecommendations(frequencies, uniqueCount) {
    const recommendations = {};
    if (uniqueCount > 100) {
      recommendations.highCardinalityWarning = `High cardinality (${uniqueCount} categories) may require grouping or encoding strategies`;
    }
    const rareCategories = frequencies.filter((freq) => freq.percentage < 1).length;
    if (rareCategories > uniqueCount * 0.5) {
      recommendations.rareCategoriesNote = `${rareCategories} rare categories (<1% each) present - consider grouping into 'Other'`;
    }
    return recommendations;
  }

  /** @returns A copy of the warnings collected so far. */
  getWarnings() {
    return [...this.warnings];
  }

  /**
   * Calculate uniqueness percentage using consistent formula across all sections
   * Fixes inter-section consistency bug
   *
   * @returns uniqueCount as a percentage of validValueCount (2 decimals), or 0
   *   when validValueCount is 0.
   */
  calculateUniquePercentage(uniqueCount, validValueCount) {
    if (validValueCount === 0) {
      return 0;
    }
    return Number(((uniqueCount / validValueCount) * 100).toFixed(2));
  }
}
exports.StreamingCategoricalAnalyzer = StreamingCategoricalAnalyzer;
/**
 * Streaming DateTime Column Analyzer
 *
 * Parses each value with `new Date(String(value))`. Year/month/weekday/hour
 * counts, distinct-value counts and the running minimum/maximum timestamp cover
 * every valid value; interval, gap, validity and granularity analysis use only
 * the first `maxDateSamples` valid values kept in `dateValues` /
 * `originalStringValues`.
 */
class StreamingDateTimeAnalyzer {
  columnName;
  detectedType;
  semanticType;
  warnings = [];
  totalValues = 0;
  validValues = 0;
  nullValues = 0;
  dateValues = [];
  originalStringValues = []; // Store original format for proper granularity detection
  maxDateSamples = 50; // Strict limit
  // Running extremes (epoch ms) over every valid value, not just the stored sample
  minTimestamp = Infinity;
  maxTimestamp = -Infinity;
  dateValueFrequencies = new online_statistics_1.BoundedFrequencyCounter(200); // Track unique date values
  yearCounts = new online_statistics_1.BoundedFrequencyCounter(50);
  monthCounts = new online_statistics_1.BoundedFrequencyCounter(12);
  dayOfWeekCounts = new online_statistics_1.BoundedFrequencyCounter(7);
  hourCounts = new online_statistics_1.BoundedFrequencyCounter(24);

  /**
   * @param columnName Column label used in the profile and in warnings.
   * @param detectedType Detected data type, copied verbatim into the profile.
   * @param semanticType Semantic type, copied verbatim into the profile.
   */
  constructor(columnName, detectedType, semanticType = types_1.SemanticType.UNKNOWN) {
    this.columnName = columnName;
    this.detectedType = detectedType;
    this.semanticType = semanticType;
  }

  /**
   * Feed one raw cell. null, undefined, '' and anything the Date constructor
   * cannot parse are counted as missing.
   */
  processValue(value) {
    this.totalValues++;
    if (value === null || value === undefined || value === '') {
      this.nullValues++;
      return;
    }
    const rawText = String(value);
    const dateValue = new Date(rawText);
    const timestamp = dateValue.getTime();
    if (Number.isNaN(timestamp)) {
      this.nullValues++;
      return;
    }
    this.validValues++;
    if (timestamp < this.minTimestamp) {
      this.minTimestamp = timestamp;
    }
    if (timestamp > this.maxTimestamp) {
      this.maxTimestamp = timestamp;
    }
    const trimmedText = rawText.trim();
    // Track unique date values
    this.dateValueFrequencies.update(trimmedText);
    // Store a sample of dates (strict limit to prevent memory growth)
    if (this.dateValues.length < this.maxDateSamples) {
      this.dateValues.push(dateValue);
      this.originalStringValues.push(trimmedText);
    }
    // Component counters use the Date object's local-time getters
    this.yearCounts.update(dateValue.getFullYear());
    this.monthCounts.update(dateValue.getMonth() + 1); // 1-based months
    this.dayOfWeekCounts.update(dateValue.getDay()); // 0=Sunday
    this.hourCounts.update(dateValue.getHours());
  }

  /**
   * Build the complete datetime profile.
   *
   * Records a high-severity warning when no valid value has been seen; in that
   * case minDateTime/maxDateTime fall back to the current time.
   *
   * @returns Base profile fields plus min/max, time span, granularity, most
   *   common components, temporal patterns, gap analysis and validity notes.
   */
  finalize() {
    if (this.validValues === 0) {
      this.warnings.push({
        category: 'data',
        severity: 'high',
        message: `Column ${this.columnName} has no valid datetime values`,
        impact: 'Temporal analysis not possible',
        suggestion: 'Check data type detection or data quality',
      });
    }
    const baseProfile = this.createBaseProfile();
    const sortedSample = this.getSortedSample();
    // Prefer the stream-wide extremes; the stored sample only covers the first
    // maxDateSamples values and would miss later minima/maxima.
    const minDateTime = Number.isFinite(this.minTimestamp)
      ? new Date(this.minTimestamp)
      : sortedSample[0] || new Date();
    const maxDateTime = Number.isFinite(this.maxTimestamp)
      ? new Date(this.maxTimestamp)
      : sortedSample[sortedSample.length - 1] || new Date();
    const mostCommonComponents = this.getMostCommonComponents();
    return {
      ...baseProfile,
      minDateTime,
      maxDateTime,
      timeSpan: this.calculateTimeSpan(minDateTime, maxDateTime),
      detectedGranularity: this.detectGranularity(),
      implicitPrecision: this.detectPrecision(),
      mostCommonYears: mostCommonComponents.years,
      mostCommonMonths: mostCommonComponents.months,
      mostCommonDaysOfWeek: mostCommonComponents.daysOfWeek,
      mostCommonHours: mostCommonComponents.hours,
      temporalPatterns: this.analyzeTemporalPatterns(),
      gapAnalysis: this.analyzeGaps(),
      validityNotes: this.generateValidityNotes(),
    };
  }

  /**
   * Chronologically sorted copy of the stored sample. Sorting a copy keeps
   * `dateValues` index-aligned with `originalStringValues`.
   */
  getSortedSample() {
    return [...this.dateValues].sort((a, b) => a.getTime() - b.getTime());
  }

  /**
   * Column-level counts shared by every analyzer type.
   * With zero processed values the ratios are treated as 0 instead of dividing by zero.
   */
  createBaseProfile() {
    const uniqueValues = this.dateValueFrequencies.getFrequencies().size;
    const hasRows = this.totalValues > 0;
    const validRatio = hasRows ? this.validValues / this.totalValues : 0;
    const missingRatio = hasRows ? this.nullValues / this.totalValues : 0;
    let dataQualityFlag = 'Poor';
    if (validRatio > 0.95) {
      dataQualityFlag = 'Good';
    }
    else if (validRatio > 0.8) {
      dataQualityFlag = 'Moderate';
    }
    return {
      columnName: this.columnName,
      detectedDataType: this.detectedType,
      inferredSemanticType: this.semanticType,
      dataQualityFlag,
      totalValues: this.totalValues,
      missingValues: this.nullValues,
      missingPercentage: Number((missingRatio * 100).toFixed(2)),
      uniqueValues,
      uniquePercentage: this.calculateUniquePercentage(uniqueValues, this.validValues),
    };
  }

  /**
   * Human-readable span between two dates using 365-day years and 30-day months.
   *
   * @param minDate Start of the span.
   * @param maxDate End of the span.
   * @returns e.g. "1 years, 1 months, 5 days", "2 months, 3 days" or "12 days".
   */
  calculateTimeSpan(minDate, maxDate) {
    const MS_PER_DAY = 1000 * 60 * 60 * 24;
    const diffDays = Math.floor((maxDate.getTime() - minDate.getTime()) / MS_PER_DAY);
    const diffYears = Math.floor(diffDays / 365);
    const daysIntoYear = diffDays % 365;
    const diffMonths = Math.floor(daysIntoYear / 30);
    // Remainder must be taken after removing whole years, otherwise the day
    // count is inconsistent with the year/month parts (400 days -> 5, not 10).
    const remainingDays = daysIntoYear % 30;
    if (diffYears > 0) {
      return `${diffYears} years, ${diffMonths} months, ${remainingDays} days`;
    }
    if (diffMonths > 0) {
      return `${diffMonths} months, ${remainingDays} days`;
    }
    return `${diffDays} days`;
  }

  /**
   * Finest time unit that appears in the sampled original strings.
   *
   * Works on the raw text rather than on Date getters so that timezone
   * conversion cannot introduce a time component that was never written.
   *
   * @returns 'Second', 'Minute', 'Hour' or 'Day' ('Day' also for an empty sample).
   */
  detectGranularity() {
    if (this.originalStringValues.length === 0) {
      return 'Day'; // Safe fallback
    }
    let hasSeconds = false;
    let hasMinutes = false;
    let hasHours = false;
    for (const dateStr of this.originalStringValues) {
      // Seconds: trailing :MM:SS (optional fraction / zone) or :SS.fraction
      if (/:\d{2}:\d{2}(\.\d+)?([+-]\d{2}:?\d{2}|Z)?$/i.test(dateStr) ||
        /:\d{2}\.\d+([+-]\d{2}:?\d{2}|Z)?$/i.test(dateStr)) {
        hasSeconds = true;
      }
      // Minutes: trailing HH:MM (optional zone)
      else if (/\d{1,2}:\d{2}([+-]\d{2}:?\d{2}|Z)?$/i.test(dateStr) ||
        /T\d{1,2}:\d{2}([+-]\d{2}:?\d{2}|Z)?$/i.test(dateStr)) {
        hasMinutes = true;
      }
      // Hours: trailing THH or whitespace + HH (optional zone)
      else if (/T\d{1,2}([+-]\d{2}:?\d{2}|Z)?$/i.test(dateStr) ||
        /\s+\d{1,2}([+-]\d{2}:?\d{2}|Z)?$/i.test(dateStr)) {
        hasHours = true;
      }
      // Any T/whitespace followed by digits, anywhere in the string.
      // NOTE(review): date-only text with a space before a number (e.g.
      // "15 Jan 2024") also matches here and is reported as 'Hour' - confirm
      // whether such formats reach this analyzer.
      else if (/[T\s]\d{1,2}(:\d{2})?(:\d{2})?/.test(dateStr)) {
        if (/:\d{2}:\d{2}/.test(dateStr)) {
          hasSeconds = true;
        }
        else if (/:\d{2}/.test(dateStr)) {
          hasMinutes = true;
        }
        else {
          hasHours = true;
        }
      }
    }
    // Return the highest precision found
    if (hasSeconds) {
      return 'Second';
    }
    if (hasMinutes) {
      return 'Minute';
    }
    if (hasHours) {
      return 'Hour';
    }
    // If no explicit time components found in strings, it's date-only data
    return 'Day';
  }

  /** @returns The detected granularity followed by ' level precision detected'. */
  detectPrecision() {
    return this.detectGranularity() + ' level precision detected';
  }

  /**
   * Top three years, months (by name), weekdays (by name) and hours ("H:00")
   * according to the component counters.
   */
  getMostCommonComponents() {
    const monthNames = [
      'January',
      'February',
      'March',
      'April',
      'May',
      'June',
      'July',
      'August',
      'September',
      'October',
      'November',
      'December',
    ];
    const dayNames = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'];
    return {
      years: this.yearCounts.getTopK(3).map(([year]) => String(year)),
      months: this.monthCounts.getTopK(3).map(([month]) => monthNames[month - 1]),
      daysOfWeek: this.dayOfWeekCounts.getTopK(3).map(([day]) => dayNames[day]),
      hours: this.hourCounts.getTopK(3).map(([hour]) => `${hour}:00`),
    };
  }

  /**
   * Classify the average interval between consecutive sampled dates.
   * Needs at least ten sampled dates.
   */
  analyzeTemporalPatterns() {
    if (this.dateValues.length < 10) {
      return 'Insufficient data for temporal pattern analysis';
    }
    const sortedDates = this.getSortedSample();
    let totalInterval = 0;
    for (let i = 1; i < sortedDates.length; i++) {
      totalInterval += sortedDates[i].getTime() - sortedDates[i - 1].getTime();
    }
    const avgDays = totalInterval / (sortedDates.length - 1) / (1000 * 60 * 60 * 24);
    if (avgDays < 1) {
      return 'High frequency data (sub-daily intervals)';
    }
    if (avgDays < 7) {
      return 'Daily to weekly patterns detected';
    }
    if (avgDays < 32) {
      return 'Weekly to monthly patterns detected';
    }
    return 'Sparse temporal distribution (monthly+ intervals)';
  }

  /**
   * Largest gap, in rounded days, between consecutive sampled dates.
   * Needs at least two sampled dates.
   */
  analyzeGaps() {
    if (this.dateValues.length < 2) {
      return 'Insufficient data for gap analysis';
    }
    const sortedDates = this.getSortedSample();
    let maxGap = -Infinity;
    for (let i = 1; i < sortedDates.length; i++) {
      maxGap = Math.max(maxGap, sortedDates[i].getTime() - sortedDates[i - 1].getTime());
    }
    const maxGapDays = maxGap / (1000 * 60 * 60 * 24);
    return `Largest gap between consecutive records: ${Math.round(maxGapDays)} days`;
  }

  /**
   * Flag sampled dates that lie in the future or before 1900-01-01.
   *
   * @returns The findings joined by '; ', or a "no issues" message.
   */
  generateValidityNotes() {
    const validityIssues = [];
    const now = new Date();
    const futureDates = this.dateValues.filter((d) => d > now).length;
    if (futureDates > 0) {
      validityIssues.push(`${futureDates} future dates detected`);
    }
    const cutoffDate = new Date('1900-01-01');
    const oldDates = this.dateValues.filter((d) => d < cutoffDate).length;
    if (oldDates > 0) {
      validityIssues.push(`${oldDates} dates before 1900 detected`);
    }
    return validityIssues.length > 0
      ? validityIssues.join('; ')
      : 'No obvious validity issues detected';
  }

  /** @returns A copy of the warnings collected so far. */
  getWarnings() {
    return [...this.warnings];
  }

  /**
   * Calculate uniqueness percentage using consistent formula across all sections
   * Fixes inter-section consistency bug
   *
   * @returns uniqueCount as a percentage of validValueCount (2 decimals), or 0
   *   when validValueCount is 0.
   */
  calculateUniquePercentage(uniqueCount, validValueCount) {
    if (validValueCount === 0) {
      return 0;
    }
    return Number(((uniqueCount / validValueCount) * 100).toFixed(2));
  }
}
exports.StreamingDateTimeAnalyzer = StreamingDateTimeAnalyzer;
/**
 * Streaming Boolean Column Analyzer
 *
 * Counts true-like and false-like tokens (case-insensitive, trimmed); everything
 * else, including null/undefined/'', is counted as missing. finalize() reports
 * the counts, their shares of the valid values and a short interpretation.
 */
class StreamingBooleanAnalyzer {
  // Lower-case tokens accepted as true / false; built once instead of per call
  static TRUE_TOKENS = new Set(['true', 'yes', 'y', '1', 'on', 'enabled', 'active']);
  static FALSE_TOKENS = new Set(['false', 'no', 'n', '0', 'off', 'disabled', 'inactive']);
  columnName;
  detectedType;
  semanticType;
  warnings = [];
  totalValues = 0;
  trueCount = 0;
  falseCount = 0;
  nullValues = 0;

  /**
   * @param columnName Column label used in the profile and in warnings.
   * @param detectedType Detected data type, copied verbatim into the profile.
   * @param semanticType Semantic type, copied verbatim into the profile.
   */
  constructor(columnName, detectedType, semanticType = types_1.SemanticType.STATUS) {
    this.columnName = columnName;
    this.detectedType = detectedType;
    this.semanticType = semanticType;
  }

  /**
   * Feed one raw cell. The value is stringified, lower-cased and trimmed before
   * being matched against the accepted token lists.
   */
  processValue(value) {
    this.totalValues++;
    if (value === null || value === undefined || value === '') {
      this.nullValues++;
      return;
    }
    const token = String(value).toLowerCase().trim();
    if (StreamingBooleanAnalyzer.TRUE_TOKENS.has(token)) {
      this.trueCount++;
    }
    else if (StreamingBooleanAnalyzer.FALSE_TOKENS.has(token)) {
      this.falseCount++;
    }
    else {
      // Unrecognised tokens are treated as missing
      this.nullValues++;
    }
  }

  /**
   * Build the complete boolean profile.
   *
   * Records a high-severity warning when no valid boolean has been seen. All
   * ratios are 0 (never NaN) when their denominator is 0.
   *
   * @returns Base profile fields plus trueCount, falseCount, truePercentage,
   *   falsePercentage and interpretation.
   */
  finalize() {
    const validValues = this.trueCount + this.falseCount;
    if (validValues === 0) {
      this.warnings.push({
        category: 'data',
        severity: 'high',
        message: `Column ${this.columnName} has no valid boolean values`,
        impact: 'Boolean analysis not possible',
        suggestion: 'Check data type detection or data quality',
      });
    }
    const hasRows = this.totalValues > 0;
    const validRatio = hasRows ? validValues / this.totalValues : 0;
    const missingRatio = hasRows ? this.nullValues / this.totalValues : 0;
    let dataQualityFlag = 'Poor';
    if (validRatio > 0.95) {
      dataQualityFlag = 'Good';
    }
    else if (validRatio > 0.8) {
      dataQualityFlag = 'Moderate';
    }
    // 0, 1 or 2 depending on which of true/false were observed
    const uniqueValues = (this.trueCount > 0 ? 1 : 0) + (this.falseCount > 0 ? 1 : 0);
    const shareOfValid = (count) => validValues > 0 ? Number(((count / validValues) * 100).toFixed(2)) : 0;
    const truePercentage = shareOfValid(this.trueCount);
    const falsePercentage = shareOfValid(this.falseCount);
    let interpretation = 'No valid boolean values';
    if (validValues > 0) {
      if (truePercentage > 75) {
        interpretation = 'Predominantly True';
      }
      else if (falsePercentage > 75) {
        interpretation = 'Predominantly False';
      }
      else {
        interpretation = 'Balanced distribution';
      }
    }
    return {
      columnName: this.columnName,
      detectedDataType: this.detectedType,
      inferredSemanticType: this.semanticType,
      dataQualityFlag,
      totalValues: this.totalValues,
      missingValues: this.nullValues,
      missingPercentage: Number((missingRatio * 100).toFixed(2)),
      uniqueValues,
      // Shared helper guards the zero-valid-values case that an inline division would turn into NaN
      uniquePercentage: this.calculateUniquePercentage(uniqueValues, validValues),
      trueCount: this.trueCount,
      falseCount: this.falseCount,
      truePercentage,
      falsePercentage,
      interpretation,
    };
  }

  /** @returns A copy of the warnings collected so far. */
  getWarnings() {
    return [...this.warnings];
  }

  /**
   * Calculate uniqueness percentage using consistent formula across all sections
   * Fixes inter-section consistency bug
   *
   * @returns uniqueCount as a percentage of validValueCount (2 decimals), or 0
   *   when validValueCount is 0.
   */
  calculateUniquePercentage(uniqueCount, validValueCount) {
    if (validValueCount === 0) {
      return 0;
    }
    return Number(((uniqueCount / validValueCount) * 100).toFixed(2));
  }
}
exports.StreamingBooleanAnalyzer = StreamingBooleanAnalyzer;
/**
* Streaming Text Column Analyzer
*/
class StreamingTextAnalyzer {
columnName;
detectedType;
semanticType;
warnings = [];
totalValues = 0;
validValues = 0;
nullValues = 0;
charLengths = [];
wordCounts = [];
maxTextSamples = 100; // Strict limit
emptyStrings = 0;
numericTexts = 0;
urlCount = 0;
emailCount = 0;
wordFrequencies = new online_statistics_1.BoundedFrequencyCounter(50); // Reduced from 1000
valueFrequencies = new online_statistics_1.BoundedFrequencyCounter(100); // Track unique text values
constructor(columnName, detectedType, semanticType = types_1.SemanticType.UNKNOWN) {
this.columnName = columnName;
this.detectedType = detectedType;
this.semanticType = semanticType;
}
processValue(value) {
this.totalValues++;
if (value === null || value === undefined) {
this.nullValues++;
return;
}
const strValue = String(value);
if (strValue === '') {
this.emptyStrings++;
this.nullValues++;
return;
}
this.validValues++;
// Track unique text values
this.valueFrequencies.update(strValue);
// Analyze text characteristics
const charLength = strValue.length;
const wordCount = strValue.trim().split(/\s+/).length;
// Store samples for statistics (strict limit to prevent memory growth)
if (this.charLengths.length < this.maxTextSamples) {
this.charLengths.push(charLength);
this.wordCounts.push(wordCount);
}
// Pattern detection
if (/^\d+$/.test(strValue.trim())) {
this.numericTexts++;
}
if (/^https?:\/\//.test(strValue)) {
this.urlCount++;
}
if (/^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(strValue)) {
this.emailCount++;
}
// Word frequency analysis (for shorter texts)
if (charLength < 500) {
const words = strValue
.toLowerCase()
.replace(/[^\w\s]/g, ' ')
.split(/\s+/)
.filter((word) => word.length > 2); // Skip very short words
words.forEach((word) => this.wordFrequencies.update(word));
}
}
finalize() {
if (this.validValues === 0) {
this.warnings.push({
category: 'data',
severity: 'high',
message: `Column ${this.columnName} has no valid text values`,
impact: 'Text analysis not possible',
suggestion: 'Check data type detection or data quality',
});
}
const baseProfile = this.createBaseProfile();
const textStatistics = this.getTextStatistics();
const textPatterns = this.getTextPatterns();
const topFrequentWords = this.getTopFrequentWords();
return {
...baseProfile,
textStatistics,
textPatterns,
topFrequentWords,
};
}
createBaseProfile() {
return {
columnName: this.columnName,
detectedDataType: this.detectedType,
inferredSemanticType: this.semanticType,
dataQualityFlag: this.validValues / this.totalValues > 0.95
? 'Good'
: this.validValues / this.totalValues > 0.8
? 'Moderate'
: 'Poor',
totalValues: this.totalValues,
missingValues: this.nullValues,
missingPercentage: Number(((this.nullValues / this.totalValues) * 100).toFixed(2)),
uniqueValues: this.valueFrequencies.getFrequencies().size,
uniquePercentage: this.calculateUniquePercentage(this.valueFrequencies.getFrequencies().size, this.validValues),
};
}
getTextStatistics() {
if (this.charLengths.length === 0) {
return {
minCharLength: 0,
maxCharLength: 0,
avgCharLength: 0,
medianCharLength: 0,
stdCharLength: 0,
minWordCount: 0,
maxWordCount: 0,
avgWordCount: 0,
};
}
const sortedLengths = [...this.charLengths].sort((a, b) => a - b);
const avgCharLength = this.charLengths.reduce((sum, len) => sum + len, 0) / this.charLengths.length;
const avgWordCount = this.wordCounts.reduce((sum, count) => sum + count, 0) / this.wordCounts.length;
// Calculate standard deviation
const variance = this.charLengths.reduce((sum, len) => sum + Math.pow(len - avgCharLength, 2), 0) /
this.charLengths.length;
const stdCharLength = Math.sqrt(variance);
return {
minCharLength: Math.min(...this.charLengths),
maxCharLength: Math.max(...this.charLengths),
avgCharLength: Number(avgCharLength.toFixed(2)),
medianCharLength: sortedLengths[Math.floor(sortedLengths.length / 2)],
stdCharLength: Number(stdCharLength.toFixed(2)),
minWordCount: Math.min(...this.wordCounts),
maxWordCount: Math.max(...this.wordCounts),
avgWordCount: Number(avgWordCount.toFixed(2)),
};
}
getTextPatterns() {
const emptyStringPercentage = Number(((this.emptyStrings / this.totalValues) * 100).toFixed(2));
const numericTextPercentage = Number(((this.numericTexts / this.validValues) * 100).toFixed(2));
const urlPercentage = Number(((this.urlCount / this.validValues) * 100).toFixed(2));
const emailPercentage = Number(((this.emailCount / this.validValues) * 100).toFixed(2));
return {
emptyStringPercentage,
numericTextPercentage,
urlCount: this.urlCount,
emailCount: this.emailCount,
urlPercentage,
emailPercentage,
};
}
getTopFrequentWords() {
return this.wordFrequencies.getTopK(5).map(([word]) => word);
}
getWarnings() {
return [...this.warnings];
}
/**
* Calculate uniqueness percentage using consistent formula across all sections
* Fixes inter-section consistency bug
*/
calculateUniquePercentage(uniqueCount, validValueCount) {
if (validValueCount === 0)
return 0;
const percentage = (uniqueCount / validValueCount) * 100;
return Number(percentage.toFixed(2));
}
}
exports.StreamingTextAnalyzer = StreamingTextAnalyzer;
//# sourceMappingURL=streaming-univariate-analyzer.js.map