semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
465 lines • 20.9 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DriftDetector = void 0;
const statistical_tests_1 = require("./statistical-tests");
const pattern_drift_1 = require("./pattern-drift");
const alert_generator_1 = require("./alert-generator");
/**
 * Compares a stored column anchor (its historical fingerprint) against the
 * current column data/fingerprint and reports distribution, format, unit,
 * joinability and confidence drift, together with alerts and recommendations.
 */
class DriftDetector {
    config;
    statisticalTests;
    patternDetector;
    alertGenerator;
    /**
     * @param {object} [config] - Overrides merged on top of the default thresholds below.
     */
    constructor(config) {
        this.config = {
            ks_test_threshold: 0.05,
            psi_threshold: 0.1,
            pattern_similarity_threshold: 0.8,
            uniqueness_threshold: 0.05,
            scale_change_threshold: 5.0,
            confidence_degradation_threshold: 0.1,
            sample_size_limit: 100000,
            enable_performance_mode: true,
            ...config
        };
        this.statisticalTests = new statistical_tests_1.StatisticalTests();
        this.patternDetector = new pattern_drift_1.PatternDriftDetector();
        this.alertGenerator = new alert_generator_1.AlertGenerator();
    }
    /**
     * Runs every drift check for one column and assembles the result record.
     *
     * @param {object} historicalAnchor - Anchor with `anchor_id`, optional `confidence`
     *   and `fingerprint` (a JSON string, or an already-parsed fingerprint object).
     * @param {object} currentColumn - Column with `name` and `values`.
     * @param {object} currentFingerprint - Fingerprint of the current column.
     * @returns {Promise<object>} Drift result (types, severity, details, alerts,
     *   recommended actions and timing metrics).
     * @throws {SyntaxError} When `fingerprint` is a string that is not valid JSON.
     */
    async detectDrift(historicalAnchor, currentColumn, currentFingerprint) {
        const startTime = Date.now();
        // Accept an already-parsed fingerprint object; JSON.parse would coerce it to
        // "[object Object]" and throw. Everything else goes through JSON.parse as before.
        const rawFingerprint = historicalAnchor.fingerprint;
        const historicalFingerprint = rawFingerprint !== null && typeof rawFingerprint === 'object'
            ? rawFingerprint
            : JSON.parse(rawFingerprint);
        // Optionally down-sample the column before the statistical checks.
        const optimizedData = this.config.enable_performance_mode
            ? this.optimizeForPerformance(currentColumn)
            : currentColumn;
        const driftTypes = [];
        const details = {};
        const alerts = [];
        // 1. Distribution drift
        const distributionDrift = await this.detectDistributionDrift(historicalFingerprint, currentFingerprint, optimizedData);
        if (distributionDrift) {
            driftTypes.push(distributionDrift);
            details.distribution_drift = this.getDistributionDetails(distributionDrift);
        }
        // 2. Format/pattern drift
        const formatDrift = await this.detectFormatDrift(historicalFingerprint, currentFingerprint);
        if (formatDrift) {
            driftTypes.push(formatDrift);
            details.format_drift = this.getFormatDetails(formatDrift);
        }
        // 3. Unit/scale drift
        const unitDrift = this.detectUnitDrift(historicalFingerprint, currentFingerprint);
        if (unitDrift) {
            driftTypes.push(unitDrift);
            details.unit_drift = this.getUnitDetails(unitDrift);
        }
        // 4. Joinability drift
        const joinabilityDrift = this.detectJoinabilityDrift(historicalFingerprint, currentFingerprint);
        if (joinabilityDrift) {
            driftTypes.push(joinabilityDrift);
            details.joinability_drift = this.getJoinabilityDetails(joinabilityDrift);
        }
        // 5. Confidence drift is derived from the drifts found so far, so it runs last.
        const confidenceDrift = this.detectConfidenceDrift(historicalAnchor, driftTypes, details);
        if (confidenceDrift) {
            driftTypes.push(confidenceDrift);
            details.confidence_drift = this.getConfidenceDetails(confidenceDrift);
        }
        const severity = this.calculateOverallSeverity(driftTypes);
        const confidenceScore = this.calculateConfidenceScore(driftTypes, details);
        // Alerts are generated one at a time, in drift order.
        for (const drift of driftTypes) {
            const alert = await this.alertGenerator.generateAlert(drift, historicalAnchor, currentColumn, details);
            alerts.push(alert);
        }
        const recommendedActions = this.generateRecommendations(driftTypes, severity);
        const detectionTime = Date.now() - startTime;
        return {
            anchor_id: historicalAnchor.anchor_id,
            column_name: currentColumn.name,
            drift_detected: driftTypes.length > 0,
            drift_types: driftTypes,
            severity: severity,
            confidence_score: confidenceScore,
            details: details,
            alerts: alerts,
            recommended_actions: recommendedActions,
            performance_metrics: {
                detection_time_ms: detectionTime,
                samples_processed: optimizedData.values.length
            }
        };
    }
    /**
     * Down-samples a column to at most `config.sample_size_limit` values.
     *
     * Values are picked at evenly spaced positions across the whole column, so the
     * sample covers the tail as well as the head even when the column is only
     * slightly larger than the limit.
     *
     * @param {object} column - Column with a `values` array.
     * @returns {object} The same column object when no sampling is needed (including
     *   when the limit is not a number), otherwise a shallow copy with sampled `values`.
     */
    optimizeForPerformance(column) {
        const total = column.values.length;
        const limit = this.config.sample_size_limit;
        if (!(total > limit)) {
            return column;
        }
        // A limit below 1 still yields a single sampled value.
        const sampleSize = Math.max(1, Math.floor(limit));
        // total > sampleSize, so consecutive indices are strictly increasing and < total.
        const sampledValues = Array.from({ length: sampleSize }, (_, i) => column.values[Math.floor((i * total) / sampleSize)]);
        return {
            ...column,
            values: sampledValues
        };
    }
    /**
     * Detects a shift in the numeric value distribution.
     *
     * Uses a relative mean-shift heuristic when fewer than 50 historical samples are
     * available; otherwise combines the KS test and PSI from `statisticalTests`.
     *
     * @param {object} historical - Historical fingerprint (`dtype`, `sample_values`).
     * @param {object} current - Current fingerprint (`dtype`).
     * @param {object} currentColumn - Column whose `values` are compared.
     * @returns {Promise<object|null>} A `distribution` drift record, or null.
     */
    async detectDistributionDrift(historical, current, currentColumn) {
        if (!this.isNumericType(historical.dtype) || !this.isNumericType(current.dtype)) {
            return null;
        }
        // Missing value lists are treated as empty rather than crashing.
        const toNumbers = (values) => (values ?? [])
            .map(v => parseFloat(v))
            .filter(v => !Number.isNaN(v));
        const historicalValues = toNumbers(historical.sample_values);
        const currentValues = toNumbers(currentColumn.values);
        if (historicalValues.length === 0 || currentValues.length === 0) {
            return null;
        }
        // With a very small baseline, fall back to a relative mean-shift heuristic.
        if (historicalValues.length < 50) {
            const mean = (arr) => arr.reduce((a, b) => a + b, 0) / arr.length;
            const hMean = mean(historicalValues);
            const cMean = mean(currentValues);
            // Denominator is floored at 1 so baselines near zero do not blow up the ratio.
            const rel = Math.abs(cMean - hMean) / Math.max(1, Math.abs(hMean));
            if (rel > 0.2) {
                return {
                    type: 'distribution',
                    severity: rel > 1 ? 'high' : 'medium',
                    metric_value: rel,
                    threshold: 0.2,
                    description: `Distribution mean shift ~${(rel * 100).toFixed(1)}% (small baseline sample)`
                };
            }
            return null;
        }
        const ksResult = this.statisticalTests.kolmogorovSmirnovTest(historicalValues, currentValues);
        const psiScore = this.statisticalTests.populationStabilityIndex(historicalValues, currentValues);
        const largeSample = (historicalValues.length + currentValues.length) >= 12000;
        const ksTriggered = ksResult.p_value < this.config.ks_test_threshold;
        const psiTriggered = psiScore > this.config.psi_threshold;
        // For large samples, KS can flag tiny shifts; require PSI confirmation.
        const driftTriggered = largeSample ? psiTriggered : (ksTriggered || psiTriggered);
        if (!driftTriggered) {
            return null;
        }
        // The KS p-value only contributes to severity when it was allowed to trigger;
        // for large samples it is ignored here for the same reason as above.
        const ksBelow = (level) => !largeSample && ksResult.p_value < level;
        let severity = 'low';
        if (psiScore > 0.25 || ksBelow(0.001)) {
            severity = 'critical';
        }
        else if (psiScore > 0.15 || ksBelow(0.01)) {
            severity = 'high';
        }
        else if (psiScore > 0.1 || ksBelow(0.05)) {
            severity = 'medium';
        }
        return {
            type: 'distribution',
            severity: severity,
            metric_value: Math.max(psiScore, 1 - ksResult.p_value),
            threshold: Math.min(this.config.psi_threshold, this.config.ks_test_threshold),
            description: `Distribution shift detected: PSI=${psiScore.toFixed(4)}, KS p-value=${ksResult.p_value.toFixed(4)}`,
            meta: {
                ks_statistic: ksResult.statistic,
                ks_p_value: ksResult.p_value,
                ks_critical_value: ksResult.critical_value,
                psi_score: psiScore
            }
        };
    }
    /**
     * Detects a change in value format/patterns via `patternDetector`.
     *
     * @param {object} historical - Historical fingerprint.
     * @param {object} current - Current fingerprint.
     * @returns {Promise<object|null>} A `format` drift record, or null when both
     *   columns are numeric or the similarity meets the configured threshold.
     */
    async detectFormatDrift(historical, current) {
        // Skip format drift for numeric types to avoid false positives on numeric string patterns.
        if (this.isNumericType(historical.dtype) && this.isNumericType(current.dtype)) {
            return null;
        }
        // Run the full analysis so actionable details can be attached to the record.
        const analysis = await this.patternDetector.analyzePatternDrift(historical, current);
        if (analysis.similarity_score >= this.config.pattern_similarity_threshold) {
            return null;
        }
        let severity = 'low';
        switch (analysis.format_stability) {
            case 'format_shift':
                severity = 'critical';
                break;
            case 'major_change':
                severity = 'high';
                break;
            case 'minor_change':
                severity = 'medium';
                break;
            case 'stable':
                severity = 'low';
                break;
        }
        // Losing a pattern that carries a semantic type bumps low/medium up one level.
        // `lost_patterns` is treated as optional, matching getFormatDetails.
        const lostSemanticPattern = (analysis.lost_patterns ?? []).some(p => p.semantic_type);
        if (lostSemanticPattern) {
            if (severity === 'low') {
                severity = 'medium';
            }
            else if (severity === 'medium') {
                severity = 'high';
            }
        }
        const patternChange = 1 - analysis.similarity_score;
        return {
            type: 'format',
            severity,
            metric_value: patternChange,
            threshold: 1 - this.config.pattern_similarity_threshold,
            description: `Format drift detected: ${(100 * (1 - analysis.similarity_score)).toFixed(1)}% pattern change`,
            meta: { analysis }
        };
    }
    /**
     * Detects a unit/scale change by comparing the min-max ranges of two numeric fingerprints.
     *
     * @param {object} historical - Historical fingerprint (`dtype`, `min`, `max`).
     * @param {object} current - Current fingerprint (`dtype`, `min`, `max`).
     * @returns {object|null} A `unit` drift record, or null when either side is
     *   non-numeric, either range is zero, or the ratio is within the threshold.
     */
    detectUnitDrift(historical, current) {
        if (!this.isNumericType(historical.dtype) || !this.isNumericType(current.dtype)) {
            return null;
        }
        // Bounds may be numbers or numeric strings; falsy values count as 0.
        const asNumber = (value) => (typeof value === 'number' ? value : parseFloat(value || '0'));
        const historicalMin = asNumber(historical.min);
        const historicalMax = asNumber(historical.max);
        const currentMin = asNumber(current.min);
        const currentMax = asNumber(current.max);
        const historicalRange = historicalMax - historicalMin;
        const currentRange = currentMax - currentMin;
        if (historicalRange === 0 || currentRange === 0) {
            return null;
        }
        const scaleFactor = currentRange / historicalRange;
        const limit = this.config.scale_change_threshold;
        const withinLimit = !(scaleFactor > limit || scaleFactor < 1 / limit);
        if (withinLimit) {
            return null;
        }
        let severity = 'low';
        if (scaleFactor > 100 || scaleFactor < 0.01) {
            severity = 'critical';
        }
        else if (scaleFactor > 50 || scaleFactor < 0.02) {
            severity = 'high';
        }
        else if (scaleFactor > 10 || scaleFactor < 0.1) {
            severity = 'medium';
        }
        return {
            type: 'unit',
            severity: severity,
            metric_value: scaleFactor,
            threshold: limit,
            description: `Unit/scale change detected: ${scaleFactor.toFixed(2)}x magnitude change`,
            meta: {
                scale_factor: scaleFactor,
                old_range: [historicalMin, historicalMax],
                new_range: [currentMin, currentMax]
            }
        };
    }
    /**
     * Detects a change in `unique_ratio` between the two fingerprints.
     *
     * @param {object} historical - Historical fingerprint (`unique_ratio`).
     * @param {object} current - Current fingerprint (`unique_ratio`).
     * @returns {object|null} A `joinability` drift record, or null when the absolute
     *   change does not exceed `config.uniqueness_threshold`.
     */
    detectJoinabilityDrift(historical, current) {
        const uniquenessChange = Math.abs(historical.unique_ratio - current.unique_ratio);
        if (!(uniquenessChange > this.config.uniqueness_threshold)) {
            return null;
        }
        let severity = 'low';
        if (uniquenessChange > 0.5) {
            severity = 'critical';
        }
        else if (uniquenessChange > 0.25) {
            severity = 'high';
        }
        else if (uniquenessChange > 0.1) {
            severity = 'medium';
        }
        return {
            type: 'joinability',
            severity: severity,
            metric_value: uniquenessChange,
            threshold: this.config.uniqueness_threshold,
            description: `Joinability degradation: uniqueness changed by ${(uniquenessChange * 100).toFixed(1)}%`,
            meta: {
                old_unique_ratio: historical.unique_ratio,
                new_unique_ratio: current.unique_ratio,
                duplicate_increase: (1 - current.unique_ratio) - (1 - historical.unique_ratio),
                key_integrity_score: 1 - uniquenessChange
            }
        };
    }
    /**
     * Infers a confidence drop from the drifts already detected.
     *
     * Each non-confidence drift contributes `weight(type) * penalty(severity)`; the
     * anchor's confidence is scaled down by the summed degradation.
     *
     * @param {object} historical - Anchor with an optional `confidence`.
     * @param {object[]} driftTypes - Drift records detected so far.
     * @param {object} details - Unused here; kept for interface compatibility.
     * @returns {object|null} A `confidence` drift record, or null.
     */
    detectConfidenceDrift(historical, driftTypes, details) {
        // A null confidence also yields null: it degrades to a zero change below.
        if (historical.confidence == null) {
            return null;
        }
        const weights = {
            distribution: 0.2,
            format: 0.3,
            unit: 0.3,
            joinability: 0.2,
            confidence: 0
        };
        const severityPenalty = {
            low: 0.05,
            medium: 0.10,
            high: 0.20,
            critical: 0.35
        };
        const degradation = driftTypes
            .filter(d => d.type !== 'confidence')
            .reduce((sum, d) => sum + (weights[d.type] || 0) * severityPenalty[d.severity], 0);
        const oldConf = historical.confidence;
        const newConf = Math.max(0, Math.min(1, oldConf * (1 - degradation)));
        const confidenceChange = Math.max(0, oldConf - newConf);
        if (!(confidenceChange > this.config.confidence_degradation_threshold)) {
            return null;
        }
        // NOTE(review): with one drift per type the degradation tops out at 0.35, so
        // 'critical' (> 0.4) looks unreachable for confidences in [0, 1] - confirm intent.
        let severity = 'low';
        if (confidenceChange > 0.4) {
            severity = 'critical';
        }
        else if (confidenceChange > 0.25) {
            severity = 'high';
        }
        else if (confidenceChange > 0.15) {
            severity = 'medium';
        }
        return {
            type: 'confidence',
            severity: severity,
            metric_value: confidenceChange,
            threshold: this.config.confidence_degradation_threshold,
            description: `Confidence degradation: ${(confidenceChange * 100).toFixed(1)}% decrease in mapping certainty`,
            meta: { old_confidence: oldConf, new_confidence: newConf, inferred_degradation: degradation }
        };
    }
    /**
     * Reduces the per-drift severities to one overall severity.
     *
     * The result is the highest individual severity. (An average can never exceed
     * the maximum, so average-based thresholds would add nothing.)
     *
     * @param {object[]} driftTypes - Drift records with a `severity` field.
     * @returns {string} 'low', 'medium', 'high' or 'critical'; 'low' for no drifts
     *   or unrecognized severities.
     */
    calculateOverallSeverity(driftTypes) {
        const levels = ['low', 'medium', 'high', 'critical'];
        // indexOf yields -1 for unknown labels, which never beats the 'low' floor of 0.
        const highest = driftTypes.reduce((top, drift) => Math.max(top, levels.indexOf(drift.severity)), 0);
        return levels[highest];
    }
    /**
     * Scores how confident the detector is in its drift verdict.
     *
     * @param {object[]} driftTypes - Detected drift records.
     * @param {object} details - Unused here; kept for interface compatibility.
     * @returns {number} 1.0 when nothing drifted, otherwise a value clamped to [0.1, 1.0].
     */
    calculateConfidenceScore(driftTypes, details) {
        if (driftTypes.length === 0) {
            return 1.0;
        }
        const baseConfidence = 0.5;
        // Evidence saturates at three independent drift signals.
        const evidenceWeight = Math.min(driftTypes.length / 3, 1.0);
        const penaltyBySeverity = new Map([
            ['critical', 0.3],
            ['high', 0.2],
            ['medium', 0.1],
            ['low', 0.05]
        ]);
        const severityPenalty = driftTypes.reduce((penalty, drift) => (penaltyBySeverity.has(drift.severity)
            ? penalty + penaltyBySeverity.get(drift.severity)
            : penalty), 0);
        return Math.max(0.1, Math.min(1.0, baseConfidence + evidenceWeight - severityPenalty));
    }
    /**
     * Builds the de-duplicated list of recommended actions.
     *
     * @param {object[]} driftTypes - Detected drift records.
     * @param {string} severity - Overall severity.
     * @returns {string[]} Recommendations in insertion order, without duplicates.
     */
    generateRecommendations(driftTypes, severity) {
        const actionsByType = new Map([
            ['distribution', [
                    'Review data ingestion pipeline for upstream changes',
                    'Validate data quality and preprocessing steps'
                ]],
            ['format', [
                    'Check for schema changes or data format updates',
                    'Update regex patterns and validation rules'
                ]],
            ['unit', [
                    'Verify unit consistency across data sources',
                    'Implement unit conversion or normalization'
                ]],
            ['joinability', [
                    'Check for duplicate key generation or data quality issues',
                    'Review primary key constraints and uniqueness rules'
                ]],
            ['confidence', [
                    'Re-evaluate semantic mapping rules',
                    'Consider additional training data or pattern updates'
                ]]
        ]);
        const recommendations = [];
        if (severity === 'critical') {
            recommendations.push('Immediate investigation required - critical drift detected');
            recommendations.push('Consider halting automated processes until drift is resolved');
        }
        for (const drift of driftTypes) {
            recommendations.push(...(actionsByType.get(drift.type) ?? []));
        }
        if (recommendations.length === 0) {
            recommendations.push('Continue monitoring - no immediate action required');
        }
        // A Set keeps first-seen order while dropping duplicates.
        return [...new Set(recommendations)];
    }
    /**
     * @param {*} dtype - Fingerprint dtype label.
     * @returns {boolean} True for 'int64', 'float64' or 'number' (case-insensitive);
     *   false for anything else, including missing or non-string values.
     */
    isNumericType(dtype) {
        return typeof dtype === 'string'
            && ['int64', 'float64', 'number'].includes(dtype.toLowerCase());
    }
    // Helper methods that flatten a drift record into its `details` entry.
    /**
     * @param {object} drift - A `distribution` drift record.
     * @returns {object} KS/PSI figures; zeros (and `metric_value` for PSI) when `meta` is absent.
     */
    getDistributionDetails(drift) {
        return {
            ks_statistic: drift.meta?.ks_statistic ?? 0,
            ks_p_value: drift.meta?.ks_p_value ?? 0,
            psi_score: drift.meta?.psi_score ?? drift.metric_value,
            distribution_change: drift.description
        };
    }
    /**
     * @param {object} drift - A `format` drift record.
     * @returns {object} Pattern similarity plus new/lost pattern strings and sample changes.
     */
    getFormatDetails(drift) {
        const analysis = drift.meta?.analysis;
        return {
            pattern_similarity: 1 - drift.metric_value,
            new_patterns: analysis?.new_patterns?.map((p) => p.pattern) ?? [],
            lost_patterns: analysis?.lost_patterns?.map((p) => p.pattern) ?? [],
            sample_changes: analysis?.sample_analysis?.character_set_changes ?? []
        };
    }
    /**
     * @param {object} drift - A `unit` drift record.
     * @returns {object} Scale factor, description and old/new value ranges.
     */
    getUnitDetails(drift) {
        return {
            scale_factor: drift.meta?.scale_factor ?? drift.metric_value,
            magnitude_change: drift.description,
            value_range_shift: { old_range: drift.meta?.old_range ?? [0, 0], new_range: drift.meta?.new_range ?? [0, 0] }
        };
    }
    /**
     * @param {object} drift - A `joinability` drift record.
     * @returns {object} Uniqueness change, duplicate increase and key integrity score.
     */
    getJoinabilityDetails(drift) {
        return {
            uniqueness_change: drift.metric_value,
            duplicate_increase: drift.meta?.duplicate_increase ?? 0,
            key_integrity_score: drift.meta?.key_integrity_score ?? (1 - drift.metric_value)
        };
    }
    /**
     * @param {object} drift - A `confidence` drift record.
     * @returns {object} Confidence change figures derived from `metric_value`.
     */
    getConfidenceDetails(drift) {
        return {
            confidence_change: drift.metric_value,
            mapping_uncertainty_increase: drift.metric_value,
            semantic_alignment_degradation: drift.metric_value * 0.5
        };
    }
    /**
     * Runs detectDrift for several columns, one after another.
     *
     * The three arrays are matched by index; only positions present in all three
     * are processed.
     *
     * @param {object[]} historicalAnchors
     * @param {object[]} currentColumns
     * @param {object[]} currentFingerprints
     * @returns {Promise<object[]>} One drift result per processed position.
     */
    async detectDriftBatch(historicalAnchors, currentColumns, currentFingerprints) {
        const count = Math.min(historicalAnchors.length, currentColumns.length, currentFingerprints.length);
        const results = [];
        for (let i = 0; i < count; i++) {
            results.push(await this.detectDrift(historicalAnchors[i], currentColumns[i], currentFingerprints[i]));
        }
        return results;
    }
    /**
     * Performance monitoring placeholder.
     *
     * @returns {object} Always zeroed metrics; nothing is tracked yet.
     */
    getPerformanceMetrics() {
        // This would track actual performance metrics in a real implementation
        return {
            average_detection_time: 0,
            total_detections: 0,
            cache_hit_rate: 0
        };
    }
}
// Expose the DriftDetector class as a named CommonJS export.
exports.DriftDetector = DriftDetector;
//# sourceMappingURL=drift-detector.js.map