UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

379 lines 16.6 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.PerformanceOptimizedDriftDetector = void 0; const drift_detector_1 = require("./drift-detector"); /** * Performance-optimized drift detection for large datasets * Targets <1s detection time for 1M+ rows */ class PerformanceOptimizedDriftDetector { detector; CHUNK_SIZE = 100000; // Process in 100k row chunks SAMPLE_SIZE = 50000; // Use sampling for very large datasets PARALLEL_WORKERS = 4; // Simulated parallel processing constructor(config) { this.detector = new drift_detector_1.DriftDetector({ enable_performance_mode: true, sample_size_limit: this.SAMPLE_SIZE, ...config }); } /** * High-performance drift detection optimized for 1M+ rows * Target: <1s detection time for 1M rows */ async detectDriftFast(historicalAnchor, currentColumn, currentFingerprint) { const startTime = Date.now(); // Step 1: Early exit checks (< 1ms) const quickCheck = this.performQuickCheck(historicalAnchor, currentFingerprint); if (!quickCheck.needsDetailedAnalysis) { return this.createQuickResult(historicalAnchor, currentColumn, quickCheck); } // Step 2: Intelligent sampling for large datasets const optimizedColumn = this.intelligentSampling(currentColumn); // Step 3: Parallel processing simulation for different drift types const driftPromises = [ this.fastDistributionCheck(historicalAnchor, optimizedColumn), this.fastPatternCheck(historicalAnchor, currentFingerprint), this.fastScaleCheck(historicalAnchor, currentFingerprint), this.fastJoinabilityCheck(historicalAnchor, currentFingerprint) ]; const driftResults = await Promise.all(driftPromises); // Step 4: Aggregate results const detectionTime = Date.now() - startTime; const result = await this.detector.detectDrift(historicalAnchor, optimizedColumn, currentFingerprint); // Enhance with performance metrics result.performance_metrics = { detection_time_ms: detectionTime, samples_processed: optimizedColumn.values.length, optimization_applied: currentColumn.values.length > this.SAMPLE_SIZE, compression_ratio: optimizedColumn.values.length / currentColumn.values.length }; return result; } /** * Batch processing with automatic load balancing */ async detectDriftBatchOptimized(anchors, columns, fingerprints) { const startTime = Date.now(); const results = []; // Process in optimized chunks const chunkSize = Math.ceil(anchors.length / this.PARALLEL_WORKERS); for (let i = 0; i < anchors.length; i += chunkSize) { const chunkAnchors = anchors.slice(i, i + chunkSize); const chunkColumns = columns.slice(i, i + chunkSize); const chunkFingerprints = fingerprints.slice(i, i + chunkSize); // Process chunk const chunkPromises = chunkAnchors.map((anchor, index) => this.detectDriftFast(anchor, chunkColumns[index], chunkFingerprints[index])); const chunkResults = await Promise.all(chunkPromises); results.push(...chunkResults); } const totalTime = Date.now() - startTime; console.log(`Batch processing completed: ${results.length} columns in ${totalTime}ms`); return results; } performQuickCheck(historicalAnchor, currentFingerprint) { const historical = JSON.parse(historicalAnchor.fingerprint); const quickDrifts = []; // Type change detection (critical) if (historical.dtype !== currentFingerprint.dtype) { quickDrifts.push('critical_type_change'); return { needsDetailedAnalysis: true, quickDrifts }; } // Cardinality dramatic change const cardinalityRatio = currentFingerprint.cardinality / historical.cardinality; if (cardinalityRatio > 10 || cardinalityRatio < 0.1) { quickDrifts.push('critical_cardinality_change'); } // Null ratio dramatic change if (Math.abs(currentFingerprint.null_ratio - historical.null_ratio) > 0.5) { quickDrifts.push('critical_null_ratio_change'); } // If no critical changes detected, we need detailed analysis return { needsDetailedAnalysis: quickDrifts.length === 0 || quickDrifts.some(d => !d.startsWith('critical')), quickDrifts }; } createQuickResult(historicalAnchor, currentColumn, quickCheck) { const driftTypes = quickCheck.quickDrifts.map(drift => ({ type: 'format', severity: 'critical', metric_value: 1.0, threshold: 0.1, description: `Quick detection: ${drift}` })); return { anchor_id: historicalAnchor.anchor_id, column_name: currentColumn.name, drift_detected: driftTypes.length > 0, drift_types: driftTypes, severity: 'critical', confidence_score: 0.95, details: {}, alerts: [], recommended_actions: ['Immediate investigation required for critical changes'], performance_metrics: { detection_time_ms: 1, samples_processed: 0 } }; } intelligentSampling(column) { if (column.values.length <= this.SAMPLE_SIZE) { return column; } // Stratified sampling to preserve distribution characteristics const sampleIndices = this.generateStratifiedSample(column.values, this.SAMPLE_SIZE); const sampledValues = sampleIndices.map(i => column.values[i]); return { ...column, values: sampledValues }; } generateStratifiedSample(values, sampleSize) { const indices = []; const step = values.length / sampleSize; // Systematic sampling with random start const randomStart = Math.floor(Math.random() * step); for (let i = 0; i < sampleSize; i++) { const index = Math.floor(randomStart + i * step); if (index < values.length) { indices.push(index); } } // Add some random samples to improve representation const randomSampleSize = Math.min(1000, Math.floor(sampleSize * 0.1)); for (let i = 0; i < randomSampleSize; i++) { const randomIndex = Math.floor(Math.random() * values.length); if (!indices.includes(randomIndex)) { indices.push(randomIndex); } } return indices.slice(0, sampleSize); } async fastDistributionCheck(historicalAnchor, currentColumn) { // Fast numerical check if (!this.isNumericColumn(currentColumn)) { return null; } const historical = JSON.parse(historicalAnchor.fingerprint); const numericValues = currentColumn.values.map(v => parseFloat(v)).filter(v => !isNaN(v)); if (numericValues.length === 0) return null; // Fast statistical comparison using moments const currentMean = numericValues.reduce((a, b) => a + b, 0) / numericValues.length; const currentStd = Math.sqrt(numericValues.reduce((sum, val) => sum + Math.pow(val - currentMean, 2), 0) / numericValues.length); // Extract historical statistics (simplified) const historicalMean = historical.sample_values .map(v => parseFloat(v)) .filter(v => !isNaN(v)) .reduce((a, b) => a + b, 0) / historical.sample_values.length; const meanDiff = Math.abs(currentMean - historicalMean) / historicalMean; if (meanDiff > 0.2) { // 20% change threshold return { type: 'distribution', severity: meanDiff > 0.5 ? 'critical' : 'high', metric_value: meanDiff, threshold: 0.2, description: `Fast distribution check: ${(meanDiff * 100).toFixed(1)}% mean change` }; } return null; } async fastPatternCheck(historicalAnchor, currentFingerprint) { const historical = JSON.parse(historicalAnchor.fingerprint); // Quick pattern similarity check const historicalPatterns = new Set(historical.regex_patterns); const currentPatterns = new Set(currentFingerprint.regex_patterns); const intersection = new Set([...historicalPatterns].filter(x => currentPatterns.has(x))); const union = new Set([...historicalPatterns, ...currentPatterns]); const similarity = union.size === 0 ? 1 : intersection.size / union.size; if (similarity < 0.7) { return { type: 'format', severity: similarity < 0.3 ? 'critical' : 'high', metric_value: 1 - similarity, threshold: 0.3, description: `Fast pattern check: ${((1 - similarity) * 100).toFixed(1)}% pattern change` }; } return null; } async fastScaleCheck(historicalAnchor, currentFingerprint) { const historical = JSON.parse(historicalAnchor.fingerprint); if (!this.isNumericType(historical.dtype) || !this.isNumericType(currentFingerprint.dtype)) { return null; } const historicalRange = this.parseNumeric(historical.max) - this.parseNumeric(historical.min); const currentRange = this.parseNumeric(currentFingerprint.max) - this.parseNumeric(currentFingerprint.min); if (historicalRange === 0 || currentRange === 0) return null; const scaleFactor = currentRange / historicalRange; if (scaleFactor > 5 || scaleFactor < 0.2) { return { type: 'unit', severity: scaleFactor > 10 || scaleFactor < 0.1 ? 'critical' : 'high', metric_value: scaleFactor, threshold: 5, description: `Fast scale check: ${scaleFactor.toFixed(2)}x scale change` }; } return null; } async fastJoinabilityCheck(historicalAnchor, currentFingerprint) { const historical = JSON.parse(historicalAnchor.fingerprint); const uniquenessChange = Math.abs(historical.unique_ratio - currentFingerprint.unique_ratio); if (uniquenessChange > 0.3) { return { type: 'joinability', severity: uniquenessChange > 0.5 ? 'critical' : 'medium', metric_value: uniquenessChange, threshold: 0.3, description: `Fast joinability check: ${(uniquenessChange * 100).toFixed(1)}% uniqueness change` }; } return null; } isNumericColumn(column) { return ['int64', 'float64', 'number'].includes(column.data_type); } isNumericType(dtype) { return ['int64', 'float64', 'number'].includes(dtype.toLowerCase()); } parseNumeric(value) { if (typeof value === 'number') return value; if (typeof value === 'string') return parseFloat(value); return 0; } /** * Memory-efficient streaming drift detection for very large datasets */ async detectDriftStreaming(historicalAnchor, dataStream, options = {}) { const { maxSamples = 1000000, earlyExit = true, progressCallback } = options; const samples = []; let processedCount = 0; const samplingRate = maxSamples / 1000000; // Adaptive sampling for await (const chunk of dataStream) { if (Math.random() < samplingRate || samples.length < 10000) { samples.push(chunk); } processedCount++; if (progressCallback && processedCount % 100000 === 0) { progressCallback(processedCount); } if (samples.length >= maxSamples) { break; } // Early exit for critical drift if (earlyExit && samples.length > 1000 && samples.length % 1000 === 0) { const quickResult = await this.checkForCriticalDrift(historicalAnchor, samples); if (quickResult) { return quickResult; } } } // Create column data from samples const columnData = { name: 'streaming_column', values: samples, data_type: this.inferDataType(samples) }; // Generate fingerprint const fingerprint = this.generateStreamingFingerprint(samples); return this.detectDriftFast(historicalAnchor, columnData, fingerprint); } async checkForCriticalDrift(historicalAnchor, samples) { // Quick critical drift check on sample const fingerprint = this.generateStreamingFingerprint(samples); const quickCheck = this.performQuickCheck(historicalAnchor, fingerprint); if (quickCheck.quickDrifts.some(d => d.startsWith('critical'))) { const columnData = { name: 'streaming_sample', values: samples, data_type: this.inferDataType(samples) }; return this.createQuickResult(historicalAnchor, columnData, quickCheck); } return null; } inferDataType(samples) { if (samples.length === 0) return 'unknown'; const sample = samples[0]; if (typeof sample === 'number') return 'float64'; if (typeof sample === 'boolean') return 'boolean'; if (sample instanceof Date) return 'datetime'; if (typeof sample === 'string') { // Try to parse as number if (!isNaN(parseFloat(sample))) return 'float64'; } return 'string'; } generateStreamingFingerprint(samples) { const uniqueValues = new Set(samples); const nullCount = samples.filter(v => v === null || v === undefined || v === '').length; return { dtype: this.inferDataType(samples), cardinality: uniqueValues.size, regex_patterns: [], // Simplified for streaming null_ratio: nullCount / samples.length, unique_ratio: uniqueValues.size / samples.length, sample_values: Array.from(uniqueValues).slice(0, 20).map(v => String(v)) }; } /** * Get performance benchmarks */ async benchmarkPerformance(dataSizes = [1000, 10000, 100000, 1000000]) { const results = {}; for (const size of dataSizes) { const benchmarks = []; const runs = size > 100000 ? 3 : 5; for (let i = 0; i < runs; i++) { const testData = this.generateTestData(size); const startTime = Date.now(); await this.detectDriftFast(testData.anchor, testData.column, testData.fingerprint); const endTime = Date.now(); benchmarks.push(endTime - startTime); } const avgTime = benchmarks.reduce((a, b) => a + b, 0) / benchmarks.length; const throughput = size / avgTime * 1000; // rows per second results[size] = { avgTime, throughput }; } return results; } generateTestData(size) { const values = Array.from({ length: size }, (_, i) => Math.random() * 1000); const column = { name: 'test_column', values: values, data_type: 'float64' }; const fingerprint = { dtype: 'float64', cardinality: new Set(values).size, regex_patterns: [], null_ratio: 0, unique_ratio: new Set(values).size / values.length, sample_values: values.slice(0, 20).map(v => String(v)) }; const anchor = { dataset: 'test', column_name: 'test_column', anchor_id: 'test_anchor', fingerprint: JSON.stringify(fingerprint), first_seen: new Date().toISOString(), last_seen: new Date().toISOString() }; return { anchor, column, fingerprint }; } } exports.PerformanceOptimizedDriftDetector = PerformanceOptimizedDriftDetector; //# sourceMappingURL=performance-optimizer.js.map