semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
604 lines • 25.9 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.HistoricalComparisonEngine = void 0;
const statistical_tests_1 = require("./statistical-tests");
/**
 * Compares the current state of a column against its recorded history and
 * derives drift, stability, trend, anomaly and seasonality summaries.
 *
 * Historical data points are objects carrying at least a `timestamp` and a
 * `fingerprint` (the code reads `cardinality`, `null_ratio`, `unique_ratio`,
 * `dtype` and `regex_patterns` from the fingerprint).
 *
 * Several analyses are intentionally simplified placeholders that return
 * fixed or empty results; each one is labelled as such on the method.
 */
class HistoricalComparisonEngine {
    statisticalTests;
    TIME_SERIES_WINDOW_SIZES = [7, 30, 90, 365]; // days
    ANOMALY_DETECTION_SENSITIVITY = 0.05;
    FORECAST_HORIZONS = [7, 30, 90]; // days
    constructor() {
        this.statisticalTests = new statistical_tests_1.StatisticalTests();
    }
    /**
     * Runs the full historical comparison for one column.
     *
     * @param {object} currentData - Current column; `name` and `values` are read.
     * @param {object} currentFingerprint - Fingerprint of the current column.
     * @param {Array<object>} historicalData - Historical data points. The array
     *   is copied before sorting, so the caller's ordering is left untouched.
     * @param {object} [options]
     * @param {number} [options.baseline_days=30] - Length of the baseline window in days.
     * @param {boolean} [options.include_seasonality=true] - Run seasonality analysis.
     * @param {boolean} [options.anomaly_detection=true] - Run anomaly detection.
     * @param {boolean} [options.forecasting=true] - Run forecasting inside the trend analysis.
     * @returns {Promise<object>} Combined comparison result. Disabled analyses are
     *   replaced by their empty counterparts.
     */
    async compareWithHistory(currentData, currentFingerprint, historicalData, options = {}) {
        const { baseline_days = 30, include_seasonality = true, anomaly_detection = true, forecasting = true } = options ?? {};
        const MS_PER_DAY = 24 * 60 * 60 * 1000;
        // Sort a copy by timestamp: Array.prototype.sort works in place and
        // would otherwise reorder the caller's array.
        const sortedHistory = [...historicalData].sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
        // Define comparison periods
        const currentDate = new Date();
        const baselineStart = new Date(currentDate.getTime() - baseline_days * MS_PER_DAY);
        const baselinePeriod = this.extractTimeWindow(sortedHistory, baselineStart, currentDate);
        const comparisonPeriod = this.createCurrentTimeWindow(currentData, currentFingerprint);
        // Analyze drift evolution
        const driftEvolution = await this.analyzeDriftEvolution(sortedHistory);
        // Calculate stability metrics
        const stabilityMetrics = this.calculateStabilityMetrics(sortedHistory);
        // Perform trend analysis (honours the `forecasting` option)
        const trendAnalysis = await this.performTrendAnalysis(sortedHistory, { forecasting });
        // Anomaly detection
        let anomalyAnalysis;
        if (anomaly_detection) {
            anomalyAnalysis = await this.detectAnomalies(sortedHistory);
        }
        // Seasonality analysis
        let seasonalityPatterns;
        if (include_seasonality) {
            seasonalityPatterns = await this.analyzeSeasonality(sortedHistory);
        }
        // Generate recommendations
        const recommendations = this.generateHistoricalRecommendations(driftEvolution, stabilityMetrics, trendAnalysis, anomalyAnalysis);
        return {
            baseline_period: baselinePeriod,
            comparison_period: comparisonPeriod,
            drift_evolution: driftEvolution,
            stability_metrics: stabilityMetrics,
            trend_analysis: trendAnalysis,
            anomaly_detection: anomalyAnalysis || this.createEmptyAnomalyAnalysis(),
            seasonality_patterns: seasonalityPatterns || this.createEmptySeasonalityAnalysis(),
            recommendations: recommendations
        };
    }
    /**
     * Selects the data points whose timestamp lies within [startDate, endDate]
     * (both inclusive) and summarises them.
     *
     * @param {Array<object>} historicalData - Data points with a `timestamp`.
     * @param {Date} startDate
     * @param {Date} endDate
     * @returns {object} Window with ISO start/end dates, the points and their summary statistics.
     */
    extractTimeWindow(historicalData, startDate, endDate) {
        const windowData = historicalData.filter(point => {
            const pointDate = new Date(point.timestamp);
            return pointDate >= startDate && pointDate <= endDate;
        });
        const summaryStats = this.calculateSummaryStatistics(windowData);
        return {
            start_date: startDate.toISOString(),
            end_date: endDate.toISOString(),
            data_points: windowData,
            summary_statistics: summaryStats
        };
    }
    /**
     * Wraps the current column and fingerprint into a single-point time window
     * stamped with the current time.
     *
     * @param {object} currentData - Current column; `name` and `values.length` are read.
     * @param {object} currentFingerprint - Fingerprint of the current column.
     * @returns {object} Window containing exactly one synthetic data point.
     */
    createCurrentTimeWindow(currentData, currentFingerprint) {
        const currentTimestamp = new Date().toISOString();
        const mockAnchor = {
            dataset: 'current',
            column_name: currentData.name,
            anchor_id: 'current',
            fingerprint: JSON.stringify(currentFingerprint),
            first_seen: currentTimestamp,
            last_seen: currentTimestamp
        };
        const dataPoint = {
            timestamp: currentTimestamp,
            anchor_snapshot: mockAnchor,
            fingerprint: currentFingerprint,
            column_data: currentData,
            metadata: {
                data_source: 'current_analysis',
                processing_version: '1.0',
                sample_size: currentData.values.length,
                quality_score: 1.0
            }
        };
        const summaryStats = this.calculateSummaryStatistics([dataPoint]);
        return {
            start_date: currentTimestamp,
            end_date: currentTimestamp,
            data_points: [dataPoint],
            summary_statistics: summaryStats
        };
    }
    /**
     * Builds the drift trajectory between consecutive data points and flags
     * transitions whose magnitude exceeds 0.5 as critical events.
     *
     * @param {Array<object>} historicalData - Data points in chronological order.
     * @returns {Promise<object>} Trajectory, velocity analysis, acceleration
     *   patterns, critical events and recovery patterns.
     */
    async analyzeDriftEvolution(historicalData) {
        const trajectoryPoints = [];
        const criticalEvents = [];
        // Calculate drift trajectory
        for (let i = 1; i < historicalData.length; i++) {
            const previous = historicalData[i - 1];
            const current = historicalData[i];
            const driftMagnitude = this.calculateDriftMagnitude(previous, current);
            const driftType = this.identifyDriftType(previous, current);
            trajectoryPoints.push({
                timestamp: current.timestamp,
                drift_magnitude: driftMagnitude,
                drift_type: driftType,
                confidence: 0.8, // Simplified confidence calculation
                contributing_factors: this.identifyContributingFactors(previous, current)
            });
            // Detect critical events
            if (driftMagnitude > 0.5) { // Threshold for critical events
                // The signed cardinality change tells spike from drop.
                const cardinalityDelta = current.fingerprint.cardinality - previous.fingerprint.cardinality;
                criticalEvents.push({
                    timestamp: current.timestamp,
                    event_type: this.classifyEventType(driftMagnitude, driftType, cardinalityDelta),
                    severity: this.determineSeverity(driftMagnitude),
                    description: `Significant drift detected: ${driftType}`,
                    impact_duration: this.estimateImpactDuration(driftMagnitude)
                });
            }
        }
        // Analyze velocity
        const velocityAnalysis = this.analyzeVelocity(trajectoryPoints);
        // Detect acceleration patterns
        const accelerationPatterns = this.detectAccelerationPatterns(trajectoryPoints);
        // Find recovery patterns
        const recoveryPatternsDetected = this.detectRecoveryPatterns(criticalEvents, trajectoryPoints);
        return {
            drift_trajectory: trajectoryPoints,
            velocity_analysis: velocityAnalysis,
            acceleration_patterns: accelerationPatterns,
            critical_events: criticalEvents,
            recovery_patterns: recoveryPatternsDetected
        };
    }
    /**
     * Derives stability figures from the drift magnitudes of the history.
     *
     * @param {Array<object>} historicalData - Data points in chronological order.
     * @returns {object} Stability score (1 - volatility, floored at 0), trend
     *   label, volatility, predictability, consistency metrics and stability periods.
     */
    calculateStabilityMetrics(historicalData) {
        const driftMagnitudes = this.extractDriftMagnitudes(historicalData);
        // Calculate overall stability score
        const volatility = this.calculateVolatility(driftMagnitudes);
        const stabilityScore = Math.max(0, 1 - volatility);
        // Determine stability trend: rising drift means degrading stability
        const recentTrend = this.calculateRecentTrend(driftMagnitudes);
        let stabilityTrend = 'stable';
        if (recentTrend > 0.1) {
            stabilityTrend = 'degrading';
        }
        else if (recentTrend < -0.1) {
            stabilityTrend = 'improving';
        }
        // Calculate predictability
        const predictabilityScore = this.calculatePredictability(driftMagnitudes);
        // Consistency metrics
        const consistencyMetrics = this.calculateConsistencyMetrics(historicalData);
        // Identify stability periods
        const stabilityPeriods = this.identifyStabilityPeriods(historicalData);
        return {
            overall_stability_score: stabilityScore,
            stability_trend: stabilityTrend,
            volatility_index: volatility,
            predictability_score: predictabilityScore,
            consistency_metrics: consistencyMetrics,
            stability_periods: stabilityPeriods
        };
    }
    /**
     * Runs trend, breakpoint, forecasting and correlation analysis on the
     * cardinality series of the history.
     *
     * @param {Array<object>} historicalData - Data points in chronological order.
     * @param {object} [options]
     * @param {boolean} [options.forecasting=true] - When false, an empty
     *   forecasting result is returned instead of running the forecast.
     * @returns {Promise<object>} Trend analysis result.
     */
    async performTrendAnalysis(historicalData, options = {}) {
        const { forecasting: forecastingEnabled = true } = options ?? {};
        const values = this.extractTrendValues(historicalData);
        // Detect long-term trend
        const longTermTrend = this.detectLongTermTrend(values);
        const trendStrength = this.calculateTrendStrength(values);
        const trendConfidence = this.calculateTrendConfidence(values);
        // Breakpoint analysis
        const breakpointAnalysis = await this.performBreakpointAnalysis(values, historicalData);
        // Forecasting (if enabled)
        const forecasting = forecastingEnabled
            ? await this.performForecasting(values, historicalData)
            : this.createEmptyForecastingAnalysis();
        // Correlation analysis
        const correlationAnalysis = await this.performCorrelationAnalysis(historicalData);
        return {
            long_term_trend: longTermTrend,
            trend_strength: trendStrength,
            trend_confidence: trendConfidence,
            breakpoint_analysis: breakpointAnalysis,
            forecasting: forecasting,
            correlation_analysis: correlationAnalysis
        };
    }
    /**
     * Collects statistical (z-score) and pattern-based anomalies plus an
     * outlier summary for the cardinality series.
     *
     * @param {Array<object>} historicalData - Data points in chronological order.
     * @returns {Promise<object>} Anomaly analysis result.
     */
    async detectAnomalies(historicalData) {
        const anomalyPeriods = [];
        const anomalyPatterns = [];
        // Statistical anomaly detection
        const values = this.extractTrendValues(historicalData);
        const anomalies = this.detectStatisticalAnomalies(values, historicalData);
        for (const anomaly of anomalies) {
            anomalyPeriods.push({
                start_date: anomaly.timestamp,
                end_date: anomaly.timestamp, // Point anomaly
                anomaly_type: 'statistical',
                severity: anomaly.severity,
                description: anomaly.description,
                potential_causes: anomaly.potential_causes,
                resolution_status: 'resolved' // Simplified
            });
        }
        // Pattern-based anomaly detection
        const patternAnomalies = this.detectPatternAnomalies(historicalData);
        anomalyPeriods.push(...patternAnomalies);
        // Outlier analysis
        const outlierAnalysis = this.performOutlierAnalysis(values);
        return {
            anomaly_periods: anomalyPeriods,
            anomaly_patterns: anomalyPatterns,
            outlier_analysis: outlierAnalysis,
            seasonality_adjusted_anomalies: anomalyPeriods.filter(a => a.anomaly_type !== 'seasonal')
        };
    }
    /**
     * Runs the seasonality-related analyses on the cardinality series.
     *
     * @param {Array<object>} historicalData - Data points in chronological order.
     * @returns {Promise<object>} Seasonal patterns, cycle detection, seasonal
     *   adjustment and (always empty) holiday effects.
     */
    async analyzeSeasonality(historicalData) {
        const values = this.extractTrendValues(historicalData);
        const timestamps = historicalData.map(d => new Date(d.timestamp));
        // Detect seasonal patterns
        const seasonalPatterns = this.detectSeasonalPatterns(values, timestamps);
        // Cycle detection
        const cycleDetection = this.performCycleDetection(values, timestamps);
        // Seasonal adjustment
        const seasonalAdjustment = this.performSeasonalAdjustment(values, timestamps);
        // Holiday effects (simplified)
        const holidayEffects = [];
        return {
            seasonal_patterns: seasonalPatterns,
            cycle_detection: cycleDetection,
            seasonal_adjustment: seasonalAdjustment,
            holiday_effects: holidayEffects
        };
    }
    // Helper methods for analysis
    /**
     * Absolute relative change between two cardinalities.
     *
     * A zero baseline makes the ratio undefined, so it is handled explicitly:
     * 0 -> 0 counts as no change and 0 -> anything else as a full (1.0) change.
     *
     * @param {number} previousCardinality
     * @param {number} currentCardinality
     * @returns {number} Non-negative relative change.
     */
    relativeCardinalityChange(previousCardinality, currentCardinality) {
        if (previousCardinality === 0) {
            return currentCardinality === 0 ? 0 : 1;
        }
        return Math.abs((currentCardinality - previousCardinality) / previousCardinality);
    }
    /**
     * Arithmetic mean of a numeric array; 0 for an empty array.
     *
     * @param {number[]} values
     * @returns {number}
     */
    calculateMean(values) {
        if (values.length === 0) {
            return 0;
        }
        return values.reduce((a, b) => a + b, 0) / values.length;
    }
    /**
     * Simplified drift magnitude: the mean of the relative cardinality change
     * and the absolute changes in null ratio and unique ratio.
     *
     * @param {object} previous - Earlier data point.
     * @param {object} current - Later data point.
     * @returns {number} Drift magnitude.
     */
    calculateDriftMagnitude(previous, current) {
        const prevFingerprint = previous.fingerprint;
        const currFingerprint = current.fingerprint;
        // Calculate difference in key metrics
        const cardinalityDiff = this.relativeCardinalityChange(prevFingerprint.cardinality, currFingerprint.cardinality);
        const nullRatioDiff = Math.abs(currFingerprint.null_ratio - prevFingerprint.null_ratio);
        const uniqueRatioDiff = Math.abs(currFingerprint.unique_ratio - prevFingerprint.unique_ratio);
        return (cardinalityDiff + nullRatioDiff + uniqueRatioDiff) / 3;
    }
    /**
     * Labels the kind of drift between two data points. Checks run in order:
     * dtype change, cardinality change above 20%, regex-pattern similarity
     * below 0.8; anything else is statistical drift.
     *
     * @param {object} previous - Earlier data point.
     * @param {object} current - Later data point.
     * @returns {string} 'type_change', 'cardinality_drift', 'pattern_drift' or 'statistical_drift'.
     */
    identifyDriftType(previous, current) {
        const prevFingerprint = previous.fingerprint;
        const currFingerprint = current.fingerprint;
        if (prevFingerprint.dtype !== currFingerprint.dtype) {
            return 'type_change';
        }
        const cardinalityChange = this.relativeCardinalityChange(prevFingerprint.cardinality, currFingerprint.cardinality);
        if (cardinalityChange > 0.2) {
            return 'cardinality_drift';
        }
        const patternSimilarity = this.calculatePatternSimilarity(prevFingerprint.regex_patterns, currFingerprint.regex_patterns);
        if (patternSimilarity < 0.8) {
            return 'pattern_drift';
        }
        return 'statistical_drift';
    }
    /**
     * Jaccard similarity (intersection size / union size) of two pattern
     * lists; two empty lists are considered identical (1).
     *
     * @param {Iterable<string>} patterns1
     * @param {Iterable<string>} patterns2
     * @returns {number} Similarity in [0, 1].
     */
    calculatePatternSimilarity(patterns1, patterns2) {
        const set1 = new Set(patterns1);
        const set2 = new Set(patterns2);
        const intersection = new Set([...set1].filter(x => set2.has(x)));
        const union = new Set([...set1, ...set2]);
        return union.size === 0 ? 1 : intersection.size / union.size;
    }
    /**
     * Lists the fingerprint changes that contributed to a drift.
     *
     * @param {object} previous - Earlier data point.
     * @param {object} current - Later data point.
     * @returns {string[]} Any of 'data_type_change', 'cardinality_increase',
     *   'cardinality_decrease', 'null_ratio_change'.
     */
    identifyContributingFactors(previous, current) {
        const factors = [];
        if (previous.fingerprint.dtype !== current.fingerprint.dtype) {
            factors.push('data_type_change');
        }
        // A zero baseline yields Infinity (reported as an increase) or, for
        // 0/0, NaN (fails both comparisons, so no factor is reported).
        const cardinalityRatio = current.fingerprint.cardinality / previous.fingerprint.cardinality;
        if (cardinalityRatio > 1.5) {
            factors.push('cardinality_increase');
        }
        else if (cardinalityRatio < 0.5) {
            factors.push('cardinality_decrease');
        }
        if (Math.abs(current.fingerprint.null_ratio - previous.fingerprint.null_ratio) > 0.1) {
            factors.push('null_ratio_change');
        }
        return factors;
    }
    /**
     * Classifies a critical event.
     *
     * @param {number} magnitude - Drift magnitude of the event.
     * @param {string} driftType - Drift type label.
     * @param {number} [cardinalityDelta=0] - Signed cardinality change; a
     *   negative value yields 'drop', anything else 'spike'.
     * @returns {string} 'format_change', 'system_change', 'spike' or 'drop'.
     */
    classifyEventType(magnitude, driftType, cardinalityDelta = 0) {
        if (driftType.includes('pattern') || driftType.includes('type')) {
            return 'format_change';
        }
        if (magnitude > 0.8) {
            return 'system_change';
        }
        // Deterministic: direction follows the sign of the cardinality change.
        return cardinalityDelta < 0 ? 'drop' : 'spike';
    }
    /**
     * Maps a drift magnitude to a severity label.
     *
     * @param {number} magnitude
     * @returns {string} 'critical' (> 0.8), 'high' (> 0.6), 'medium' (> 0.3) or 'low'.
     */
    determineSeverity(magnitude) {
        if (magnitude > 0.8) {
            return 'critical';
        }
        if (magnitude > 0.6) {
            return 'high';
        }
        if (magnitude > 0.3) {
            return 'medium';
        }
        return 'low';
    }
    /**
     * Maps a drift magnitude to an estimated impact duration label, using the
     * same thresholds as {@link determineSeverity}.
     *
     * @param {number} magnitude
     * @returns {string} Duration label.
     */
    estimateImpactDuration(magnitude) {
        if (magnitude > 0.8) {
            return '24+ hours';
        }
        if (magnitude > 0.6) {
            return '4-24 hours';
        }
        if (magnitude > 0.3) {
            return '1-4 hours';
        }
        return '< 1 hour';
    }
    /**
     * Turns the analysis results into recommendation lists.
     *
     * @param {object} driftEvolution - Reads `critical_events` and `drift_trajectory`.
     * @param {object} stabilityMetrics - Reads `overall_stability_score` and `stability_trend`.
     * @param {object} trendAnalysis - Reads `long_term_trend`.
     * @param {object} [anomalyAnalysis] - Currently not consulted.
     * @returns {object} Recommendation lists. A drift-magnitude threshold is
     *   only recommended when the trajectory has at least one point.
     */
    generateHistoricalRecommendations(driftEvolution, stabilityMetrics, trendAnalysis, anomalyAnalysis) {
        const immediateActions = [];
        const monitoringAdjustments = [];
        const thresholdRecommendations = [];
        const processImprovements = [];
        const predictionStrategies = [];
        // Based on stability
        if (stabilityMetrics.overall_stability_score < 0.5) {
            immediateActions.push("Investigate root causes of instability");
            monitoringAdjustments.push("Increase monitoring frequency");
        }
        if (stabilityMetrics.stability_trend === 'degrading') {
            processImprovements.push("Implement proactive drift prevention measures");
        }
        // Based on trend analysis
        if (trendAnalysis.long_term_trend === 'degrading') {
            immediateActions.push("Address degrading trend before it becomes critical");
            predictionStrategies.push("Implement predictive alerting based on trend analysis");
        }
        // Based on critical events
        if (driftEvolution.critical_events.length > 0) {
            immediateActions.push("Review and address recurring critical events");
            processImprovements.push("Implement event prevention strategies");
        }
        // Threshold recommendations based on historical patterns. Without any
        // trajectory points the average is undefined, so nothing is recommended.
        const trajectory = driftEvolution.drift_trajectory;
        if (trajectory.length > 0) {
            const avgDriftMagnitude = trajectory
                .reduce((sum, point) => sum + point.drift_magnitude, 0) / trajectory.length;
            thresholdRecommendations.push({
                metric: 'drift_magnitude',
                current_threshold: 0.1,
                recommended_threshold: avgDriftMagnitude * 1.5,
                rationale: 'Based on historical drift patterns',
                confidence: 0.8
            });
        }
        return {
            immediate_actions: immediateActions,
            monitoring_adjustments: monitoringAdjustments,
            threshold_recommendations: thresholdRecommendations,
            process_improvements: processImprovements,
            prediction_strategies: predictionStrategies
        };
    }
    // Additional helper methods (simplified implementations)
    /**
     * Summary statistics over the fingerprint cardinalities of the given
     * data points. All numeric fields are 0 when `data` is empty.
     *
     * @param {Array<object>} data - Data points with `fingerprint.cardinality`.
     * @returns {object} count, mean, median, std_dev, min, max, percentiles,
     *   plus the fixed placeholders `distribution_type` and `outlier_count`.
     */
    calculateSummaryStatistics(data) {
        const values = data.map(d => d.fingerprint.cardinality);
        const isEmpty = values.length === 0;
        return {
            count: values.length,
            mean: this.calculateMean(values),
            median: this.calculateMedian(values),
            std_dev: this.calculateStdDev(values),
            // reduce instead of Math.min(...values): spreading a very large
            // array as arguments can exceed the engine's argument limit.
            min: isEmpty ? 0 : values.reduce((lo, v) => Math.min(lo, v), Infinity),
            max: isEmpty ? 0 : values.reduce((hi, v) => Math.max(hi, v), -Infinity),
            percentiles: this.calculatePercentiles(values),
            distribution_type: 'normal', // Simplified
            outlier_count: 0 // Simplified
        };
    }
    /**
     * Median of a numeric array (mean of the two middle values for even
     * lengths); 0 for an empty array. The input is not mutated.
     *
     * @param {number[]} values
     * @returns {number}
     */
    calculateMedian(values) {
        if (values.length === 0) {
            return 0;
        }
        const sorted = [...values].sort((a, b) => a - b);
        const mid = Math.floor(sorted.length / 2);
        return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
    }
    /**
     * Population standard deviation (divides by n); 0 for an empty array.
     *
     * @param {number[]} values
     * @returns {number}
     */
    calculateStdDev(values) {
        if (values.length === 0) {
            return 0;
        }
        const mean = this.calculateMean(values);
        const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length;
        return Math.sqrt(variance);
    }
    /**
     * Reads a percentile from an already ascending-sorted array using the
     * index floor(p/100 * n), clamped to the last element; 0 when empty.
     *
     * @param {number[]} sorted - Ascending-sorted values.
     * @param {number} percentile - Percentile in [0, 100].
     * @returns {number}
     */
    percentileFromSorted(sorted, percentile) {
        if (sorted.length === 0) {
            return 0;
        }
        const index = Math.floor((percentile / 100) * sorted.length);
        return sorted[Math.min(index, sorted.length - 1)];
    }
    /**
     * Computes the 25th, 50th, 75th, 90th, 95th and 99th percentiles.
     *
     * @param {number[]} values
     * @returns {object} Map with keys `p25`, `p50`, `p75`, `p90`, `p95`, `p99`.
     */
    calculatePercentiles(values) {
        const sorted = [...values].sort((a, b) => a - b);
        const percentiles = [25, 50, 75, 90, 95, 99];
        const result = {};
        for (const p of percentiles) {
            result[`p${p}`] = this.percentileFromSorted(sorted, p);
        }
        return result;
    }
    // Additional simplified helper methods
    /**
     * Summarises the drift magnitudes of the trajectory as "velocity".
     *
     * @param {Array<object>} trajectoryPoints - Points with `drift_magnitude`.
     * @returns {object} Average velocity, fixed placeholder trend and peak
     *   periods, and the 25/50/75/95 percentile distribution.
     */
    analyzeVelocity(trajectoryPoints) {
        const velocities = trajectoryPoints.map(p => p.drift_magnitude);
        const sortedVelocities = [...velocities].sort((a, b) => a - b);
        return {
            average_velocity: this.calculateMean(velocities),
            velocity_trend: 'stable', // Simplified
            peak_velocity_periods: [], // Simplified
            velocity_distribution: {
                percentile_25: this.percentileFromSorted(sortedVelocities, 25),
                percentile_50: this.percentileFromSorted(sortedVelocities, 50),
                percentile_75: this.percentileFromSorted(sortedVelocities, 75),
                percentile_95: this.percentileFromSorted(sortedVelocities, 95)
            }
        };
    }
    /**
     * Single percentile of an unsorted array; 0 for an empty array. The
     * input is not mutated.
     *
     * @param {number[]} values
     * @param {number} percentile - Percentile in [0, 100].
     * @returns {number}
     */
    calculatePercentile(values, percentile) {
        const sorted = [...values].sort((a, b) => a - b);
        return this.percentileFromSorted(sorted, percentile);
    }
    /** Placeholder: always returns an empty list. */
    detectAccelerationPatterns(trajectoryPoints) {
        // Simplified implementation
        return [];
    }
    /** Placeholder: always returns an empty list. */
    detectRecoveryPatterns(events, trajectoryPoints) {
        // Simplified implementation
        return [];
    }
    /**
     * Drift magnitude of every consecutive pair of data points.
     *
     * @param {Array<object>} historicalData - Data points in chronological order.
     * @returns {number[]} One magnitude per transition (length n - 1, or empty
     *   for fewer than two points).
     */
    extractDriftMagnitudes(historicalData) {
        const magnitudes = [];
        for (let i = 1; i < historicalData.length; i++) {
            magnitudes.push(this.calculateDriftMagnitude(historicalData[i - 1], historicalData[i]));
        }
        return magnitudes;
    }
    /**
     * Volatility, defined here as the population standard deviation.
     *
     * @param {number[]} values
     * @returns {number}
     */
    calculateVolatility(values) {
        return this.calculateStdDev(values);
    }
    /**
     * Difference between the mean of the second half and the mean of the
     * first half of the last (up to) 10 values; 0 for fewer than two values.
     *
     * @param {number[]} values
     * @returns {number} Positive when recent values are rising.
     */
    calculateRecentTrend(values) {
        if (values.length < 2) {
            return 0;
        }
        const recentValues = values.slice(-10); // Last 10 values
        const splitAt = Math.floor(recentValues.length / 2);
        const firstAvg = this.calculateMean(recentValues.slice(0, splitAt));
        const secondAvg = this.calculateMean(recentValues.slice(splitAt));
        return secondAvg - firstAvg;
    }
    /**
     * Simplified predictability: 1 minus the standard deviation, floored at 0.
     *
     * @param {number[]} values
     * @returns {number}
     */
    calculatePredictability(values) {
        return Math.max(0, 1 - this.calculateStdDev(values));
    }
    /** Placeholder: returns fixed consistency scores. */
    calculateConsistencyMetrics(historicalData) {
        return {
            format_consistency: 0.8, // Simplified
            distribution_consistency: 0.7, // Simplified
            pattern_consistency: 0.9 // Simplified
        };
    }
    /** Placeholder: always returns an empty list. */
    identifyStabilityPeriods(historicalData) {
        // Simplified implementation
        return [];
    }
    /**
     * Extracts the series analysed for trends: the fingerprint cardinality
     * of each data point.
     *
     * @param {Array<object>} historicalData
     * @returns {number[]}
     */
    extractTrendValues(historicalData) {
        return historicalData.map(d => d.fingerprint.cardinality);
    }
    /**
     * Compares the mean of the first third of the series with the mean of
     * the last third.
     *
     * @param {number[]} values
     * @returns {string} 'improving' for a relative change above +10%,
     *   'degrading' below -10%, otherwise 'stable' (also for fewer than 3 values).
     */
    detectLongTermTrend(values) {
        if (values.length < 3) {
            return 'stable';
        }
        const thirdSize = Math.floor(values.length / 3);
        const firstAvg = this.calculateMean(values.slice(0, thirdSize));
        const lastAvg = this.calculateMean(values.slice(-thirdSize));
        // With firstAvg === 0 this is ±Infinity or NaN; NaN fails both
        // comparisons below and therefore reads as 'stable'.
        const change = (lastAvg - firstAvg) / firstAvg;
        if (change > 0.1) {
            return 'improving';
        }
        if (change < -0.1) {
            return 'degrading';
        }
        return 'stable';
    }
    /** Placeholder: returns a fixed trend strength of 0.5. */
    calculateTrendStrength(values) {
        // Simplified linear regression R-squared
        return 0.5; // Placeholder
    }
    /** Placeholder: returns a fixed trend confidence of 0.8. */
    calculateTrendConfidence(values) {
        return 0.8; // Placeholder
    }
    /** Placeholder: resolves to an empty breakpoint analysis. */
    async performBreakpointAnalysis(values, historicalData) {
        return {
            detected_breakpoints: [],
            structural_changes: [],
            regime_periods: []
        };
    }
    /** Placeholder: resolves to empty forecasts with fixed model metrics. */
    async performForecasting(values, historicalData) {
        return {
            short_term_forecast: [],
            medium_term_forecast: [],
            long_term_forecast: [],
            forecast_confidence: 0.7,
            model_performance: {
                mae: 0.1,
                rmse: 0.15,
                mape: 0.05,
                r_squared: 0.8,
                validation_period: '30 days'
            }
        };
    }
    /** Placeholder: resolves to an empty correlation analysis. */
    async performCorrelationAnalysis(historicalData) {
        return {
            external_correlations: [],
            internal_correlations: [],
            causal_relationships: []
        };
    }
    /**
     * Z-score anomaly detection: flags values more than 2.5 population
     * standard deviations from the mean.
     *
     * @param {number[]} values - Series to scan.
     * @param {Array<object>} historicalData - Data points aligned with `values`;
     *   supplies the timestamp of each anomaly.
     * @returns {Array<object>} One entry per anomaly (timestamp, severity,
     *   description, potential causes).
     */
    detectStatisticalAnomalies(values, historicalData) {
        const mean = this.calculateMean(values);
        const stdDev = this.calculateStdDev(values);
        // A zero spread (or no data) means nothing can deviate from the mean.
        if (values.length === 0 || stdDev === 0) {
            return [];
        }
        const threshold = 2.5; // 2.5 standard deviations
        const anomalies = [];
        for (let i = 0; i < values.length; i++) {
            const zScore = Math.abs((values[i] - mean) / stdDev);
            if (zScore > threshold) {
                anomalies.push({
                    timestamp: historicalData[i].timestamp,
                    severity: zScore > 3 ? 0.9 : 0.6,
                    description: `Statistical outlier detected (z-score: ${zScore.toFixed(2)})`,
                    potential_causes: ['data_quality_issue', 'system_change', 'external_factor']
                });
            }
        }
        return anomalies;
    }
    /** Placeholder: always returns an empty list. */
    detectPatternAnomalies(historicalData) {
        // Simplified pattern anomaly detection
        return [];
    }
    /**
     * Counts values more than 2 population standard deviations from the mean.
     *
     * @param {number[]} values
     * @returns {object} Outlier totals, rate (0 for empty input) and the
     *   split into values above ('high') and below ('low') the mean.
     */
    performOutlierAnalysis(values) {
        const mean = this.calculateMean(values);
        const stdDev = this.calculateStdDev(values);
        const threshold = 2.0;
        // With stdDev === 0 the ratio is NaN, which never exceeds the threshold.
        const outliers = values.filter(v => Math.abs((v - mean) / stdDev) > threshold);
        return {
            outlier_detection_method: 'z_score',
            total_outliers: outliers.length,
            outlier_rate: values.length === 0 ? 0 : outliers.length / values.length,
            outlier_distribution: { 'high': outliers.filter(o => o > mean).length, 'low': outliers.filter(o => o < mean).length },
            clustering_results: []
        };
    }
    /** Placeholder: always returns an empty list. */
    detectSeasonalPatterns(values, timestamps) {
        // Simplified seasonal pattern detection
        return [];
    }
    /** Placeholder: returns an empty cycle detection result. */
    performCycleDetection(values, timestamps) {
        return {
            detected_cycles: [],
            dominant_frequency: 0,
            cycle_stability: 0
        };
    }
    /** Placeholder: returns a zeroed seasonal adjustment labelled 'moving_average'. */
    performSeasonalAdjustment(values, timestamps) {
        return {
            adjustment_method: 'moving_average',
            seasonal_factors: {},
            trend_after_adjustment: 0,
            residual_analysis: {
                residual_autocorrelation: [],
                white_noise_test: 0,
                heteroscedasticity_test: 0,
                normality_test: 0
            }
        };
    }
    /** Anomaly analysis result used when anomaly detection is disabled. */
    createEmptyAnomalyAnalysis() {
        return {
            anomaly_periods: [],
            anomaly_patterns: [],
            outlier_analysis: {
                outlier_detection_method: 'none',
                total_outliers: 0,
                outlier_rate: 0,
                outlier_distribution: {},
                clustering_results: []
            },
            seasonality_adjusted_anomalies: []
        };
    }
    /** Seasonality analysis result used when seasonality analysis is disabled. */
    createEmptySeasonalityAnalysis() {
        return {
            seasonal_patterns: [],
            cycle_detection: {
                detected_cycles: [],
                dominant_frequency: 0,
                cycle_stability: 0
            },
            seasonal_adjustment: {
                adjustment_method: 'none',
                seasonal_factors: {},
                trend_after_adjustment: 0,
                residual_analysis: {
                    residual_autocorrelation: [],
                    white_noise_test: 0,
                    heteroscedasticity_test: 0,
                    normality_test: 0
                }
            },
            holiday_effects: []
        };
    }
    /** Forecasting result used when forecasting is disabled. */
    createEmptyForecastingAnalysis() {
        return {
            short_term_forecast: [],
            medium_term_forecast: [],
            long_term_forecast: [],
            forecast_confidence: 0,
            model_performance: {
                mae: 0,
                rmse: 0,
                mape: 0,
                r_squared: 0,
                validation_period: 'none'
            }
        };
    }
    // Batch processing capabilities
    /**
     * Runs {@link compareWithHistory} for each request, one after another.
     *
     * @param {Array<object>} requests - Each with `currentData`,
     *   `currentFingerprint`, `historicalData` and optional `options`.
     * @returns {Promise<Array<object>>} Results in request order.
     */
    async batchHistoricalComparison(requests) {
        const results = [];
        for (const request of requests) {
            const result = await this.compareWithHistory(request.currentData, request.currentFingerprint, request.historicalData, request.options);
            results.push(result);
        }
        return results;
    }
}
exports.HistoricalComparisonEngine = HistoricalComparisonEngine;
//# sourceMappingURL=historical-comparison.js.map