datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
1,125 lines • 66.5 kB
JavaScript
"use strict";
/**
* Section 2: Data Quality & Integrity Audit - Main Analyzer
* Orchestrates all quality dimensions and generates comprehensive report
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.Section2Analyzer = void 0;
const completeness_analyzer_1 = require("./completeness-analyzer");
const uniqueness_analyzer_1 = require("./uniqueness-analyzer");
const validity_analyzer_1 = require("./validity-analyzer");
const business_rule_engine_1 = require("./business-rule-engine");
const pattern_validation_engine_1 = require("./pattern-validation-engine");
const config_1 = require("../../core/config");
const version_1 = require("../../utils/version");
class Section2Analyzer {
data;
headers;
columnTypes;
rowCount;
columnCount;
config;
onProgress;
warnings = [];
startTime = 0;
section3Result;
// Performance optimization: Pre-computed column lookups
columnIndexMap = new Map();
entityColumnCache = new Set();
dateColumnCache = new Set();
numericColumnCache = new Set();
constructor(input) {
this.data = input.data;
this.headers = input.headers;
this.columnTypes = input.columnTypes;
this.rowCount = input.rowCount;
this.columnCount = input.columnCount;
this.config = this.mergeConfig(input.config);
this.onProgress = input.onProgress;
this.section3Result = input.section3Result;
// Pre-build column index maps for O(1) lookups
this.buildColumnIndexMaps();
}
/**
* Pre-build column index maps for performance optimization
*/
buildColumnIndexMaps() {
// Build column name to index mapping
this.headers.forEach((header, index) => {
this.columnIndexMap.set(header.toLowerCase(), index);
const lowerHeader = header.toLowerCase();
const columnType = this.columnTypes[index];
// Cache entity identifier columns
if (this.isEntityIdentifierColumn(lowerHeader)) {
this.entityColumnCache.add(index);
}
// Cache date columns
if (this.isDateColumn(lowerHeader)) {
this.dateColumnCache.add(index);
}
// Cache numeric columns
if (this.isNumericColumn(columnType)) {
this.numericColumnCache.add(index);
}
});
}
isEntityIdentifierColumn(headerLower) {
return (headerLower.includes('customer') ||
headerLower.includes('client') ||
headerLower.includes('person') ||
headerLower.includes('user') ||
headerLower.includes('product') ||
headerLower.includes('item') ||
headerLower.includes('sku') ||
headerLower.includes('part') ||
headerLower.includes('company') ||
headerLower.includes('organization') ||
headerLower.includes('vendor') ||
headerLower.includes('supplier') ||
headerLower.includes('location') ||
headerLower.includes('address') ||
headerLower.includes('city') ||
headerLower.includes('region') ||
(headerLower.includes('id') && !headerLower.includes('_id_')) ||
headerLower.includes('identifier') ||
headerLower === 'key');
}
isDateColumn(headerLower) {
return /(date|time|created|updated|timestamp|modified)/i.test(headerLower);
}
isNumericColumn(columnType) {
return columnType === 'number' || columnType === 'integer' || columnType === 'float';
}
async analyze() {
this.startTime = performance.now();
const performanceMetrics = {};
try {
this.reportProgress('completeness', 0, 'Starting data quality audit');
// 1. Completeness Analysis
const completenessStart = performance.now();
this.reportProgress('completeness', 10, 'Analyzing data completeness');
const completeness = await this.analyzeCompleteness();
performanceMetrics.completeness = performance.now() - completenessStart;
// 2. Uniqueness Analysis
const uniquenessStart = performance.now();
this.reportProgress('uniqueness', 25, 'Detecting duplicates and analyzing uniqueness');
const uniqueness = await this.analyzeUniqueness();
performanceMetrics.uniqueness = performance.now() - uniquenessStart;
// 3. Validity Analysis
const validityStart = performance.now();
this.reportProgress('validity', 50, 'Validating data types and formats');
const validity = await this.analyzeValidity();
performanceMetrics.validity = performance.now() - validityStart;
// 4. Enhanced Business Rule and Pattern Validation
const businessRuleStart = performance.now();
this.reportProgress('accuracy', 60, 'Validating business rules and cross-field consistency');
const { accuracy, consistency } = await this.analyzeBusinessRulesAndPatterns();
performanceMetrics.businessRules = performance.now() - businessRuleStart;
// Additional dimensions (enhanced with statistical insights)
const timeliness = this.createPlaceholderTimeliness();
const integrity = this.createEnhancedIntegrity();
const reasonableness = this.createPlaceholderReasonableness();
const precision = this.createPlaceholderPrecision();
const representational = this.createPlaceholderRepresentational();
// 5. Generate profiling insights
this.reportProgress('report-generation', 90, 'Generating profiling insights');
const profilingInsights = this.generateProfilingInsights();
// 6. Create cockpit with composite scoring
this.reportProgress('report-generation', 95, 'Creating quality cockpit');
const cockpit = this.createDataQualityCockpit({
completeness: completeness.score,
accuracy: accuracy.score,
consistency: consistency.score,
timeliness: timeliness.score,
uniqueness: uniqueness.score,
validity: validity.score,
integrity: integrity.score,
reasonableness: reasonableness.score,
precision: precision.score,
representational: representational.score,
});
// 7. Assemble final report
this.reportProgress('report-generation', 100, 'Finalizing quality audit report');
const qualityAudit = {
cockpit,
completeness,
accuracy,
consistency,
timeliness,
uniqueness,
validity,
integrity,
reasonableness,
precision,
representational,
profilingInsights,
generatedAt: new Date(),
version: (0, version_1.getDataPilotVersion)(),
};
const totalAnalysisTime = performance.now() - this.startTime;
performanceMetrics.total = totalAnalysisTime;
return {
qualityAudit,
warnings: this.warnings,
performanceMetrics: {
totalAnalysisTime,
phases: performanceMetrics,
},
};
}
catch (error) {
this.warnings.push({
category: 'computation',
severity: 'high',
message: `Quality analysis failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
impact: 'Quality audit incomplete',
});
throw error;
}
}
async analyzeCompleteness() {
const analyzer = new completeness_analyzer_1.CompletenessAnalyzer({
data: this.data,
headers: this.headers,
columnTypes: this.columnTypes,
rowCount: this.rowCount,
columnCount: this.columnCount,
});
return analyzer.analyze();
}
async analyzeUniqueness() {
const analyzer = new uniqueness_analyzer_1.UniquenessAnalyzer({
data: this.data,
headers: this.headers,
columnTypes: this.columnTypes,
rowCount: this.rowCount,
columnCount: this.columnCount,
});
return analyzer.analyze();
}
async analyzeValidity() {
const analyzer = new validity_analyzer_1.ValidityAnalyzer({
data: this.data,
headers: this.headers,
columnTypes: this.columnTypes,
rowCount: this.rowCount,
columnCount: this.columnCount,
businessRules: this.config.customBusinessRules,
customPatterns: this.config.externalReferences?.customPatterns,
customRanges: this.config.externalReferences?.customRanges,
});
return analyzer.analyze();
}
/**
* Enhanced business rule and pattern validation analysis
*/
async analyzeBusinessRulesAndPatterns() {
// Business Rule Validation
const businessRuleConfig = {
enabledRuleTypes: ['cross_field', 'intra_record', 'business_logic'],
enableBuiltInRules: true,
maxViolationsToTrack: 1000,
customRules: this.config.customBusinessRules,
};
const businessRuleEngine = new business_rule_engine_1.BusinessRuleEngine(this.data, this.headers, this.columnTypes, businessRuleConfig);
const businessRuleResults = businessRuleEngine.validateData();
const businessRuleSummary = businessRuleEngine.getViolationSummary();
// Pattern Validation
const patternConfig = {
enableBuiltInPatterns: true,
enableFormatStandardization: true,
maxViolationsPerPattern: 100,
customPatterns: this.config.customPatterns,
};
const patternEngine = new pattern_validation_engine_1.PatternValidationEngine(this.data, this.headers, patternConfig);
const patternResults = patternEngine.validatePatterns();
const patternSummary = patternEngine.getPatternSummary();
// Enhanced Accuracy Analysis
const accuracy = {
valueConformity: this.performExternalReferenceValidation(),
crossFieldValidation: businessRuleResults.crossFieldValidations,
outlierImpact: this.analyzeOutlierImpact(),
patternValidation: patternResults.patternValidations,
businessRuleSummary: {
totalRules: businessRuleSummary.totalRulesEvaluated,
totalViolations: businessRuleSummary.totalViolations,
criticalViolations: businessRuleResults.criticalViolations,
violationsBySeverity: businessRuleSummary.violationsBySeverity,
},
score: this.calculateAccuracyScore(businessRuleResults, patternResults),
};
// Enhanced Consistency Analysis with statistical insights
const statisticalInsights = this.extractStatisticalInsights();
const consistency = {
intraRecord: businessRuleResults.intraRecordConsistency,
interRecord: this.performEntityResolution(),
formatConsistency: patternResults.formatConsistency,
patternSummary: {
totalPatterns: patternSummary.totalPatternsEvaluated,
totalViolations: patternSummary.totalViolations,
violationsBySeverity: patternSummary.violationsBySeverity,
problematicColumns: patternSummary.mostProblematicColumns,
},
statisticalConsistency: {
correlationStability: statisticalInsights.multicollinearity?.severity === 'none' ? 'stable' : 'unstable',
normalityConsistency: statisticalInsights.normality?.violatedVariables === 0 ? 'consistent' : 'inconsistent',
outliersImpact: statisticalInsights.outliers?.percentage || 0 < 5 ? 'minimal' : 'significant',
},
score: this.calculateEnhancedConsistencyScore(businessRuleResults, patternResults, statisticalInsights),
};
// Add warnings for high violation counts
if (businessRuleResults.criticalViolations > 0) {
this.warnings.push({
category: 'business_rules',
severity: 'high',
message: `${businessRuleResults.criticalViolations} critical business rule violations detected`,
impact: 'Data may not meet business requirements',
});
}
if (patternSummary.violationsBySeverity.critical > 0) {
this.warnings.push({
category: 'pattern_validation',
severity: 'high',
message: `${patternSummary.violationsBySeverity.critical} critical pattern validation failures`,
impact: 'Data format issues may affect downstream processing',
});
}
return { accuracy, consistency };
}
calculateAccuracyScore(businessRuleResults, patternResults) {
const totalRows = this.rowCount;
const totalViolations = businessRuleResults.totalViolations + patternResults.totalViolations;
const criticalViolations = businessRuleResults.criticalViolations +
(patternResults.patternValidations?.filter((p) => p.severity === 'critical').length ||
0);
// Calculate base score (0-100)
let score = 100;
// Deduct for violations (more severe = higher deduction)
const violationRate = totalViolations / totalRows;
score -= violationRate * 50; // Up to 50 points for violation rate
// Extra deduction for critical violations
const criticalRate = criticalViolations / totalRows;
score -= criticalRate * 30; // Up to 30 additional points for critical violations
score = Math.max(0, Math.round(score));
let interpretation;
if (score >= 95)
interpretation = 'Excellent';
else if (score >= 85)
interpretation = 'Good';
else if (score >= 70)
interpretation = 'Fair';
else if (score >= 50)
interpretation = 'Needs Improvement';
else
interpretation = 'Poor';
return {
score,
interpretation,
details: `${totalViolations} total rule violations, ${criticalViolations} critical`,
};
}
calculateConsistencyScore(businessRuleResults, patternResults) {
return this.calculateEnhancedConsistencyScore(businessRuleResults, patternResults, {
hasStatisticalTests: false,
});
}
/**
* Enhanced consistency scoring with statistical insights
*/
calculateEnhancedConsistencyScore(businessRuleResults, patternResults, statisticalInsights) {
const totalFormatIssues = patternResults.formatConsistency?.length || 0;
const intraRecordIssues = businessRuleResults.intraRecordConsistency?.length || 0;
// Base score calculation
let score = 100;
const details = [];
// Deduct for format inconsistencies
if (totalFormatIssues > 0) {
const formatPenalty = totalFormatIssues * 10;
score -= formatPenalty;
details.push(`${totalFormatIssues} format inconsistencies (-${formatPenalty})`);
}
// Deduct for intra-record consistency issues
if (intraRecordIssues > 0) {
const recordPenalty = intraRecordIssues * 15;
score -= recordPenalty;
details.push(`${intraRecordIssues} intra-record issues (-${recordPenalty})`);
}
// Factor in statistical consistency
if (statisticalInsights.hasStatisticalTests) {
// Multicollinearity affects consistency
if (statisticalInsights.multicollinearity) {
const severity = statisticalInsights.multicollinearity.severity;
if (severity === 'severe') {
score -= 15;
details.push('Severe multicollinearity affecting variable consistency (-15)');
}
else if (severity === 'moderate') {
score -= 8;
details.push('Moderate multicollinearity detected (-8)');
}
else {
score += 3; // Bonus for good correlation structure
details.push('Stable correlation structure (+3)');
}
}
// Normality consistency
if (statisticalInsights.normality) {
const violatedVars = statisticalInsights.normality.violatedVariables;
if (violatedVars > 0) {
const normalityPenalty = Math.min(8, violatedVars * 3);
score -= normalityPenalty;
details.push(`${violatedVars} variables violate normality (-${normalityPenalty})`);
}
else {
score += 2;
details.push('Variables satisfy normality assumptions (+2)');
}
}
// Outlier impact on consistency
if (statisticalInsights.outliers) {
const outlierPercentage = statisticalInsights.outliers.percentage;
if (outlierPercentage > 10) {
score -= 12;
details.push(`High outlier rate (${outlierPercentage.toFixed(1)}%) affects consistency (-12)`);
}
else if (outlierPercentage > 5) {
score -= 6;
details.push(`Moderate outlier rate (${outlierPercentage.toFixed(1)}%) (-6)`);
}
else {
score += 2;
details.push(`Low outlier rate (${outlierPercentage.toFixed(1)}%) supports consistency (+2)`);
}
}
}
score = Math.max(0, Math.min(100, Math.round(score)));
let interpretation;
if (score >= 95)
interpretation = 'Excellent';
else if (score >= 85)
interpretation = 'Good';
else if (score >= 70)
interpretation = 'Fair';
else if (score >= 50)
interpretation = 'Needs Improvement';
else
interpretation = 'Poor';
const detailsText = details.length > 0 ? details.join('; ') : 'No consistency issues detected';
return {
score,
interpretation,
details: detailsText,
};
}
// Previous placeholder methods removed - now using enhanced implementations
createPlaceholderTimeliness() {
// Enhanced timeliness analysis
const dateColumns = this.findDateColumns();
const dataFreshness = this.analyzeDataFreshness(dateColumns);
const updateFrequency = this.analyzeUpdateFrequency(dateColumns);
let score = 75; // Default neutral score
let interpretation = 'Fair';
let details = 'Timeliness analysis based on available date/timestamp columns';
if (dateColumns.length === 0) {
score = 85;
interpretation = 'Good';
details = 'Timeliness not applicable - dataset contains static reference data without temporal elements';
}
else if (dataFreshness.latestTimestamp) {
const daysSinceUpdate = dataFreshness.daysSinceLatest || 0;
if (daysSinceUpdate <= 1) {
score = 95;
interpretation = 'Excellent';
details = 'Data is very recent (updated within 24 hours)';
}
else if (daysSinceUpdate <= 7) {
score = 85;
interpretation = 'Good';
details = 'Data is relatively fresh (updated within a week)';
}
else if (daysSinceUpdate <= 30) {
score = 70;
interpretation = 'Fair';
details = 'Data is moderately fresh (updated within a month)';
}
else if (daysSinceUpdate <= 365) {
score = 60;
interpretation = 'Needs Improvement';
details = 'Data may be stale (updated within a year)';
}
else {
score = 40;
interpretation = 'Poor';
details = 'Data appears to be very stale (over a year old)';
}
}
return {
dataFreshness,
updateFrequency,
score: {
score,
interpretation,
details,
},
};
}
/**
* Enhanced integrity analysis with statistical test insights
*/
createEnhancedIntegrity() {
const statisticalInsights = this.extractStatisticalInsights();
let baseScore = 85;
const issues = [];
const strengths = [];
// Factor in statistical test results
if (statisticalInsights.hasStatisticalTests) {
// Multicollinearity affects data integrity
if (statisticalInsights.multicollinearity) {
const severity = statisticalInsights.multicollinearity.severity;
if (severity === 'severe') {
baseScore -= 20;
issues.push('Severe multicollinearity detected - data relationships may be unstable');
}
else if (severity === 'moderate') {
baseScore -= 10;
issues.push('Moderate multicollinearity detected - some variables highly correlated');
}
else {
strengths.push('No significant multicollinearity detected');
}
}
// Outlier analysis affects integrity
if (statisticalInsights.outliers) {
const outlierPercentage = statisticalInsights.outliers.percentage;
if (outlierPercentage > 10) {
baseScore -= 15;
issues.push(`High outlier rate (${outlierPercentage.toFixed(1)}%) may indicate data quality issues`);
}
else if (outlierPercentage > 5) {
baseScore -= 8;
issues.push(`Moderate outlier rate (${outlierPercentage.toFixed(1)}%) detected`);
}
else {
strengths.push(`Low outlier rate (${outlierPercentage.toFixed(1)}%) indicates good data integrity`);
}
}
// Normality assumption violations
if (statisticalInsights.normality) {
const violatedVars = statisticalInsights.normality.violatedVariables;
if (violatedVars > 0) {
const penalty = Math.min(10, violatedVars * 2);
baseScore -= penalty;
issues.push(`${violatedVars} variables violate normality assumptions`);
}
else {
strengths.push('Variables satisfy normality assumptions');
}
}
// Clustering structure indicates natural data groupings (positive for integrity)
if (statisticalInsights.clustering?.hasNaturalClusters) {
baseScore += 5;
strengths.push(`Natural data clustering structure detected (${statisticalInsights.clustering.optimalClusters} clusters)`);
}
}
baseScore = Math.max(0, Math.min(100, Math.round(baseScore)));
let interpretation;
if (baseScore >= 95)
interpretation = 'Excellent';
else if (baseScore >= 85)
interpretation = 'Good';
else if (baseScore >= 70)
interpretation = 'Fair';
else if (baseScore >= 50)
interpretation = 'Needs Improvement';
else
interpretation = 'Poor';
const details = issues.length > 0
? `Statistical analysis reveals: ${issues.join('; ')}`
: strengths.length > 0
? `Statistical analysis confirms: ${strengths.join('; ')}`
: 'Enhanced integrity analysis with statistical validation';
return {
orphanedRecords: [],
cardinalityViolations: [],
statisticalValidation: {
multicollinearityCheck: statisticalInsights.multicollinearity,
outlierAnalysis: statisticalInsights.outliers,
normalityAssessment: statisticalInsights.normality,
clusteringStructure: statisticalInsights.clustering,
},
score: {
score: baseScore,
interpretation,
details,
},
};
}
/**
* Extract statistical insights from Section 3 results for quality scoring
*/
extractStatisticalInsights() {
if (!this.section3Result?.edaAnalysis?.multivariateAnalysis) {
return { hasStatisticalTests: false };
}
const multivariateAnalysis = this.section3Result.edaAnalysis.multivariateAnalysis;
const insights = { hasStatisticalTests: true };
try {
// Extract multicollinearity insights from correlation analysis
const correlationPairs = this.section3Result.edaAnalysis.bivariateAnalysis?.numericalVsNumerical?.correlationPairs ||
[];
if (correlationPairs.length > 0) {
const highCorrelations = correlationPairs.filter((pair) => Math.abs(pair.correlation) > 0.8);
const veryHighCorrelations = correlationPairs.filter((pair) => Math.abs(pair.correlation) > 0.95);
let severity = 'none';
const affectedVariables = [];
if (veryHighCorrelations.length > 0) {
severity = 'severe';
veryHighCorrelations.forEach((pair) => {
if (!affectedVariables.includes(pair.variable1))
affectedVariables.push(pair.variable1);
if (!affectedVariables.includes(pair.variable2))
affectedVariables.push(pair.variable2);
});
}
else if (highCorrelations.length > 0) {
severity = 'moderate';
highCorrelations.forEach((pair) => {
if (!affectedVariables.includes(pair.variable1))
affectedVariables.push(pair.variable1);
if (!affectedVariables.includes(pair.variable2))
affectedVariables.push(pair.variable2);
});
}
insights.multicollinearity = {
severity,
affectedVariables,
maxVIF: Math.max(...correlationPairs.map((p) => 1 / (1 - p.correlation * p.correlation))),
};
}
// Extract outlier insights
const outlierAnalysis = multivariateAnalysis.outlierDetection;
if (outlierAnalysis?.isApplicable) {
insights.outliers = {
count: outlierAnalysis.totalOutliers,
percentage: outlierAnalysis.outlierPercentage,
method: outlierAnalysis.method,
};
}
// Extract normality insights
const normalityTests = multivariateAnalysis.normalityTests;
if (normalityTests) {
const violatedCount = normalityTests.overallAssessment.violations.length;
insights.normality = {
violatedVariables: violatedCount,
totalTested: 1, // Simplified - multivariate normality
};
}
// Extract clustering insights
const clusteringAnalysis = multivariateAnalysis.clusteringAnalysis;
if (clusteringAnalysis?.isApplicable) {
insights.clustering = {
hasNaturalClusters: clusteringAnalysis.finalClustering.validation.silhouetteScore > 0.3,
optimalClusters: clusteringAnalysis.optimalClusters,
qualityScore: clusteringAnalysis.finalClustering.validation.silhouetteScore,
};
}
}
catch (error) {
console.warn('Error extracting statistical insights for quality scoring:', error);
}
return insights;
}
createPlaceholderReasonableness() {
return {
statisticalPlausibility: [],
semanticPlausibility: [],
contextualAnomalies: [],
score: {
score: 80,
interpretation: 'Good',
details: 'Reasonableness analysis not yet implemented',
},
};
}
createPlaceholderPrecision() {
// Enhanced precision analysis
const numericPrecision = this.analyzeNumericPrecision();
const temporalGranularity = this.analyzeTemporalGranularity();
const categoricalSpecificity = this.analyzeCategoricalSpecificity();
// Calculate score based on precision consistency
let score = 85; // Default good score
let interpretation = 'Good';
let details = 'Precision analysis based on numeric scale, temporal granularity, and categorical specificity';
// Deduct points for precision inconsistencies
const precisionIssues = numericPrecision.filter((p) => p.inconsistentPrecision).length +
temporalGranularity.filter((t) => t.mixedGranularity).length +
categoricalSpecificity.filter((c) => c.lowSpecificity).length;
if (precisionIssues > 0) {
score -= Math.min(precisionIssues * 5, 30); // 5 points per issue, max 30 point penalty
if (score >= 90) {
interpretation = 'Excellent';
details = 'High precision and consistency across data types';
}
else if (score >= 75) {
interpretation = 'Good';
details = 'Generally good precision with minor consistency issues';
}
else if (score >= 60) {
interpretation = 'Fair';
details = 'Some precision inconsistencies detected that may affect analysis quality';
}
else if (score >= 40) {
interpretation = 'Needs Improvement';
details = 'Significant precision issues that should be addressed';
}
else {
interpretation = 'Poor';
details = 'Major precision problems affecting data reliability';
}
}
return {
numericPrecision,
temporalGranularity,
categoricalSpecificity,
score: {
score: Math.max(0, score),
interpretation,
details,
},
};
}
createPlaceholderRepresentational() {
return {
unitStandardization: [],
codeStandardization: [],
textFormatting: [],
score: {
score: 80,
interpretation: 'Good',
details: 'Representational analysis not yet implemented',
},
};
}
generateProfilingInsights() {
return {
valueLengthAnalysis: [],
characterSetAnalysis: [],
specialCharacterAnalysis: [],
};
}
createDataQualityCockpit(scores) {
// Use configurable weights for quality scoring
const configManager = (0, config_1.getConfig)();
const qualityConfig = configManager.getQualityConfig();
const weights = qualityConfig.qualityWeights;
let compositeScore = 0;
for (const [dimension, score] of Object.entries(scores)) {
const weight = weights[dimension] || 0;
compositeScore += score.score * weight;
}
const compositeQualityScore = {
score: Math.round(compositeScore * 100) / 100,
interpretation: this.interpretScore(compositeScore),
details: `Weighted average of ${Object.keys(scores).length} quality dimensions`,
};
// Identify strengths and weaknesses
const strengths = this.identifyStrengths(scores);
const weaknesses = this.identifyWeaknesses(scores);
// Estimate technical debt
const technicalDebt = this.estimateTechnicalDebt(scores, weaknesses);
return {
compositeScore: compositeQualityScore,
dimensionScores: scores,
topStrengths: strengths,
topWeaknesses: weaknesses,
technicalDebt,
};
}
interpretScore(score) {
const configManager = (0, config_1.getConfig)();
const qualityConfig = configManager.getQualityConfig();
const thresholds = qualityConfig.qualityThresholds;
if (score >= thresholds.excellent)
return 'Excellent';
if (score >= thresholds.good)
return 'Good';
if (score >= thresholds.fair)
return 'Fair';
if (score >= thresholds.needsImprovement)
return 'Needs Improvement';
return 'Poor';
}
identifyStrengths(dimensionScores) {
const strengths = [];
for (const [dimension, score] of Object.entries(dimensionScores)) {
if (score.score >= 90) {
strengths.push({
description: `Excellent ${dimension} with ${score.score}% score`,
category: dimension,
impact: score.score >= 95 ? 'high' : 'medium',
});
}
}
return strengths.slice(0, 3); // Top 3 strengths
}
identifyWeaknesses(dimensionScores) {
const weaknesses = [];
const sortedDimensions = Object.entries(dimensionScores).sort(([, a], [, b]) => a.score - b.score);
for (const [dimension, score] of sortedDimensions.slice(0, 3)) {
let severity;
let priority;
if (score.score < 50) {
severity = 'critical';
priority = 10;
}
else if (score.score < 70) {
severity = 'high';
priority = 8;
}
else if (score.score < 85) {
severity = 'medium';
priority = 6;
}
else {
severity = 'low';
priority = 4;
}
weaknesses.push({
description: `${dimension} quality needs attention (${score.score}% score)`,
category: dimension,
severity,
priority,
estimatedEffort: this.estimateEffortForDimension(dimension, score.score),
});
}
return weaknesses;
}
estimateEffortForDimension(_dimension, score) {
const effort = 100 - score; // Inverse relationship
if (effort > 50)
return '8-16 hours';
if (effort > 30)
return '4-8 hours';
if (effort > 15)
return '2-4 hours';
return '1-2 hours';
}
estimateTechnicalDebt(dimensionScores, weaknesses) {
const totalEffortHours = weaknesses.reduce((sum, weakness) => {
const effort = weakness.estimatedEffort || '2-4 hours';
const hours = this.parseEffortHours(effort);
return sum + hours;
}, 0);
let complexityLevel;
if (totalEffortHours > 20)
complexityLevel = 'High';
else if (totalEffortHours > 8)
complexityLevel = 'Medium';
else
complexityLevel = 'Low';
const automatedFixableIssues = this.countAutomatedFixableIssues(dimensionScores);
return {
timeEstimate: `${Math.round(totalEffortHours)} hours estimated cleanup`,
complexityLevel,
primaryDebtContributors: weaknesses.map((w) => w.description),
automatedCleaningPotential: {
fixableIssues: automatedFixableIssues,
examples: [
'Trimming leading/trailing spaces',
'Standardizing text casing',
'Date format normalization',
],
},
};
}
parseEffortHours(effort) {
const match = effort.match(/(\d+)-(\d+)/);
if (match) {
return (parseInt(match[1]) + parseInt(match[2])) / 2;
}
return 4; // Default
}
countAutomatedFixableIssues(dimensionScores) {
// Simplified heuristic for counting automatically fixable issues
let count = 0;
// Validity issues are often automatically fixable
if (dimensionScores.validity && dimensionScores.validity.score < 90) {
count += Math.round((90 - dimensionScores.validity.score) / 10);
}
// Consistency issues are often automatically fixable
if (dimensionScores.consistency && dimensionScores.consistency.score < 90) {
count += Math.round((90 - dimensionScores.consistency.score) / 15);
}
return count;
}
reportProgress(phase, progress, operation) {
if (this.onProgress) {
const elapsed = performance.now() - this.startTime;
this.onProgress({
phase,
progress,
currentOperation: operation,
timeElapsed: elapsed,
estimatedTimeRemaining: progress > 0 ? (elapsed / progress) * (100 - progress) : undefined,
});
}
}
mergeConfig(userConfig) {
const defaultConfig = {
enabledDimensions: ['completeness', 'accuracy', 'consistency', 'uniqueness', 'validity'],
strictMode: false,
maxOutlierDetection: 1000,
semanticDuplicateThreshold: 0.8,
customBusinessRules: [],
externalReferences: {},
};
return { ...defaultConfig, ...userConfig };
}
/**
* External Reference Validation (TODO item 1)
* Validates data against external reference lists and standards
*/
performExternalReferenceValidation() {
const conformityChecks = [];
const externalRefs = this.config.externalReferences || {};
// Country code validation
if (externalRefs.countryCodesList) {
const countryColumns = this.headers
.map((header, index) => ({ header, index }))
.filter(({ header }) => /country|nation|ctry|cntry/i.test(header) ||
header.toLowerCase().includes('country_code') ||
header.toLowerCase() === 'cc');
countryColumns.forEach(({ header, index }) => {
const violations = this.validateAgainstReferenceList(index, externalRefs.countryCodesList, 'Country Code Standard');
if (violations.violationsFound > 0) {
conformityChecks.push({
columnName: header,
standard: 'ISO 3166 Country Codes',
violationsFound: violations.violationsFound,
examples: violations.examples,
description: `${violations.violationsFound} invalid country codes found`,
});
}
});
}
// Currency code validation
if (externalRefs.currencyCodesList) {
const currencyColumns = this.headers
.map((header, index) => ({ header, index }))
.filter(({ header }) => /currency|curr|money/i.test(header) ||
header.toLowerCase().includes('currency_code') ||
header.toLowerCase() === 'ccy');
currencyColumns.forEach(({ header, index }) => {
const violations = this.validateAgainstReferenceList(index, externalRefs.currencyCodesList, 'Currency Code Standard');
if (violations.violationsFound > 0) {
conformityChecks.push({
columnName: header,
standard: 'ISO 4217 Currency Codes',
violationsFound: violations.violationsFound,
examples: violations.examples,
description: `${violations.violationsFound} invalid currency codes found`,
});
}
});
}
// Product master list validation
if (externalRefs.productMasterList) {
const productColumns = this.headers
.map((header, index) => ({ header, index }))
.filter(({ header }) => /product|item|sku|part/i.test(header));
productColumns.forEach(({ header, index }) => {
const violations = this.validateAgainstReferenceList(index, externalRefs.productMasterList, 'Product Master List');
if (violations.violationsFound > 0) {
conformityChecks.push({
columnName: header,
standard: 'Product Master Reference',
violationsFound: violations.violationsFound,
examples: violations.examples,
description: `${violations.violationsFound} products not found in master list`,
});
}
});
}
return conformityChecks;
}
/**
* Validates column values against a reference list (Optimized O(n))
*/
validateAgainstReferenceList(columnIndex, referenceList, standardName) {
const configManager = (0, config_1.getConfig)();
const qualityConfig = configManager.getQualityConfig();
const referenceSet = new Set(referenceList.map((ref) => ref.toLowerCase().trim()));
const violations = [];
let violationsFound = 0;
// Use configurable sample size for performance
const sampleSize = Math.min(qualityConfig.externalValidation.maxSampleSize, this.data.length);
const maxExamples = qualityConfig.externalValidation.maxExampleViolations;
// Single pass through the data
for (let rowIndex = 0; rowIndex < sampleSize; rowIndex++) {
const value = this.data[rowIndex][columnIndex];
if (value && typeof value === 'string') {
const normalizedValue = value.toLowerCase().trim();
if (normalizedValue && !referenceSet.has(normalizedValue)) {
violationsFound++;
if (violations.length < maxExamples) {
violations.push(value);
}
}
}
}
return {
violationsFound,
examples: violations,
};
}
/**
* Outlier Impact Analysis (TODO item 2)
* Links with Section 3 outlier analysis results to assess impact on accuracy
*/
analyzeOutlierImpact() {
if (!this.section3Result?.edaAnalysis?.multivariateAnalysis?.outlierDetection) {
return {
percentageErrornousOutliers: 0,
description: 'Outlier analysis not available - Section 3 results required',
};
}
const outlierAnalysis = this.section3Result.edaAnalysis.multivariateAnalysis.outlierDetection;
if (!outlierAnalysis.isApplicable) {
return {
percentageErrornousOutliers: 0,
description: 'Outlier detection not applicable for this dataset',
};
}
// Calculate erroneous outlier percentage based on business context
const totalOutliers = outlierAnalysis.totalOutliers;
const outlierPercentage = outlierAnalysis.outlierPercentage;
// Heuristic: assume outliers in certain contexts are more likely to be errors
let errorLikelihood = 0.3; // Base 30% likelihood of outliers being errors
// Increase likelihood for specific column types
const numericColumns = this.headers
.map((header, index) => ({
header: header.toLowerCase(),
index,
type: this.columnTypes[index],
}))
.filter(({ type }) => type === 'number' || type === 'integer' || type === 'float');
// Check for error-prone contexts
let contextualErrorLikelihood = 0;
numericColumns.forEach(({ header }) => {
if (header.includes('age') || header.includes('score') || header.includes('rating')) {
contextualErrorLikelihood += 0.2; // 20% increase for age/score fields
}
if (header.includes('price') || header.includes('amount') || header.includes('cost')) {
contextualErrorLikelihood += 0.15; // 15% increase for financial fields
}
if (header.includes('quantity') || header.includes('count')) {
contextualErrorLikelihood += 0.1; // 10% increase for quantity fields
}
});
errorLikelihood = Math.min(0.8, errorLikelihood + contextualErrorLikelihood / numericColumns.length);
const percentageErrornousOutliers = outlierPercentage * errorLikelihood;
let description = `Estimated ${percentageErrornousOutliers.toFixed(2)}% of data may contain outlier-related errors`;
if (percentageErrornousOutliers > 10) {
description += ' - High outlier error rate suggests data quality issues';
}
else if (percentageErrornousOutliers > 5) {
description += ' - Moderate outlier error rate detected';
}
else {
description += ' - Low outlier error rate indicates good data quality';
}
return {
percentageErrornousOutliers,
description,
outlierDetails: {
totalOutliers,
outlierPercentage,
method: outlierAnalysis.method,
errorLikelihood: Math.round(errorLikelihood * 100) / 100,
},
};
}
/**
* Entity Resolution (Optimized O(n*m) where m is avg entities per type)
* Identifies and resolves duplicate entities across records
*/
performEntityResolution() {
const entityResolutionResults = [];
// Use cached entity columns for O(1) lookup
const entityColumns = this.getEntityColumnsFromCache();
if (entityColumns.length === 0) {
return [
{
entityType: 'Generic Records',
inconsistentEntities: 0,
examples: [],
analysis: 'No clear entity identifier columns found for resolution analysis',
},
];
}
// Limit entity columns to prevent excessive computation
const maxEntityColumns = 3; // Configurable limit
const limitedEntityColumns = entityColumns.slice(0, maxEntityColumns);
// Perform entity resolution for each identified entity type
limitedEntityColumns.forEach((entityCol) => {
const resolution = this.resolveEntitiesForColumnOptimized(entityCol);
if (resolution.inconsistentEntities > 0) {
entityResolutionResults.push(resolution);
}
});
return entityResolutionResults;
}
/**
* Get entity columns from pre-computed cache (O(1))
*/
getEntityColumnsFromCache() {
const entityColumns = [];
// Use pre-computed cache instead of scanning all headers
this.entityColumnCache.forEach((index) => {
const header = this.headers[index];
const lowerHeader = header.toLowerCase();
let entityType = 'Generic Entity';
// Determine entity type (optimized with early returns)
if (lowerHeader.includes('customer') ||
lowerHeader.includes('client') ||
lowerHeader.includes('person') ||
lowerHeader.includes('user')) {
entityType = 'Customer/Person';
}
else if (lowerHeader.includes('product') ||
lowerHeader.includes('item') ||
lowerHeader.includes('sku') ||
lowerHeader.includes('part')) {
entityType = 'Product';
}
else if (lowerHeader.includes('company') ||
lowerHeader.includes('organization') ||
lowerHeader.includes('vendor') ||
lowerHeader.includes('supplier')) {
entityType = 'Organization';
}
else if (lowerHeader.includes('location') ||
lowerHeader.includes('address') ||
lowerHeader.includes('city') ||
lowerHeader.includes('region')) {
entityType = 'Location';
}
entityColumns.push({
name: header,
index,
entityType,
});
});
return entityColumns;
}
/**
* Identifies columns that likely represent entity identifiers (Legacy method for compatibility)
*/
identifyEntityColumns() {
return this.getEntityColumnsFromCache();
}
/**
* Performs entity resolution for a specific column (Optimized O(n))
*/
resolveEntitiesForColumnOptimized(entityCol) {
const configMana