/*
 * semantic-ds-toolkit
 * Version:
 * Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
 * 379 lines • 16.6 kB
 * JavaScript
 */
;
// Flag the CommonJS exports object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export as undefined; the class is assigned to it at the end of the file.
exports.PerformanceOptimizedDriftDetector = void 0;
const drift_detector_1 = require("./drift-detector");
/**
* Performance-optimized drift detection for large datasets
* Targets <1s detection time for 1M+ rows
*/
class PerformanceOptimizedDriftDetector {
    /** Wrapped DriftDetector that performs the detailed analysis. */
    detector;
    /** Chunk size in rows. Declared here but not read by any method of this class. */
    CHUNK_SIZE = 100000;
    /** Columns holding more values than this are down-sampled before analysis. */
    SAMPLE_SIZE = 50000;
    /** Number of groups a batch is divided into by detectDriftBatchOptimized. */
    PARALLEL_WORKERS = 4;

    /**
     * @param {object} [config] Extra DriftDetector options. Keys supplied here
     *   override the two performance defaults set below.
     */
    constructor(config) {
        this.detector = new drift_detector_1.DriftDetector({
            enable_performance_mode: true,
            sample_size_limit: this.SAMPLE_SIZE,
            ...config
        });
    }

    /**
     * Detect drift for one column, short-circuiting on dramatic fingerprint
     * changes and down-sampling large columns before the detailed analysis.
     *
     * @param {object} historicalAnchor Anchor whose `fingerprint` is a JSON string.
     * @param {object} currentColumn Column with `name`, `values` and `data_type`.
     * @param {object} currentFingerprint Fingerprint of the current column.
     * @returns {Promise<object>} Either the quick result, or the wrapped
     *   detector's result with a `performance_metrics` object attached.
     */
    async detectDriftFast(historicalAnchor, currentColumn, currentFingerprint) {
        const startTime = Date.now();

        // Step 1: fingerprint-only comparison; may allow an early exit.
        const quickCheck = this.performQuickCheck(historicalAnchor, currentFingerprint);
        if (!quickCheck.needsDetailedAnalysis) {
            return this.createQuickResult(historicalAnchor, currentColumn, quickCheck);
        }

        // Step 2: cap the number of values handed to the checks below.
        const optimizedColumn = this.intelligentSampling(currentColumn);

        // Step 3: cheap per-dimension checks; each resolves to a finding or null.
        const fastFindings = await Promise.all([
            this.fastDistributionCheck(historicalAnchor, optimizedColumn),
            this.fastPatternCheck(historicalAnchor, currentFingerprint),
            this.fastScaleCheck(historicalAnchor, currentFingerprint),
            this.fastJoinabilityCheck(historicalAnchor, currentFingerprint)
        ]);

        // Step 4: detailed analysis by the wrapped detector.
        const result = await this.detector.detectDrift(historicalAnchor, optimizedColumn, currentFingerprint);

        const totalRows = currentColumn.values.length;
        const sampledRows = optimizedColumn.values.length;
        result.performance_metrics = {
            // Taken after the detailed analysis so the figure covers the whole call.
            detection_time_ms: Date.now() - startTime,
            samples_processed: sampledRows,
            optimization_applied: totalRows > this.SAMPLE_SIZE,
            // An empty column has nothing to compress; report 1 instead of NaN.
            compression_ratio: totalRows === 0 ? 1 : sampledRows / totalRows,
            // Findings of the fast checks, previously computed and then dropped.
            fast_check_drifts: fastFindings.filter((finding) => finding !== null)
        };
        return result;
    }

    /**
     * Run detectDriftFast over parallel arrays of anchors, columns and
     * fingerprints. The input is cut into PARALLEL_WORKERS groups; groups are
     * handled one after another, and the detections inside a group are
     * started together and awaited with Promise.all.
     *
     * @param {object[]} anchors
     * @param {object[]} columns Same order as `anchors`.
     * @param {object[]} fingerprints Same order as `anchors`.
     * @returns {Promise<object[]>} One result per anchor, in input order.
     */
    async detectDriftBatchOptimized(anchors, columns, fingerprints) {
        const startTime = Date.now();
        const results = [];
        const chunkSize = Math.ceil(anchors.length / this.PARALLEL_WORKERS);
        for (let offset = 0; offset < anchors.length; offset += chunkSize) {
            const end = offset + chunkSize;
            const chunkColumns = columns.slice(offset, end);
            const chunkFingerprints = fingerprints.slice(offset, end);
            const chunkResults = await Promise.all(
                anchors
                    .slice(offset, end)
                    .map((anchor, index) => this.detectDriftFast(anchor, chunkColumns[index], chunkFingerprints[index]))
            );
            results.push(...chunkResults);
        }
        const totalTime = Date.now() - startTime;
        console.log(`Batch processing completed: ${results.length} columns in ${totalTime}ms`);
        return results;
    }

    /**
     * Compare the stored and current fingerprints for dramatic changes.
     *
     * @returns {{needsDetailedAnalysis: boolean, quickDrifts: string[]}}
     */
    performQuickCheck(historicalAnchor, currentFingerprint) {
        const historical = JSON.parse(historicalAnchor.fingerprint);
        const quickDrifts = [];

        // NOTE(review): a dtype mismatch is labelled critical but is still sent
        // to detailed analysis, unlike the two findings below - confirm intended.
        if (historical.dtype !== currentFingerprint.dtype) {
            quickDrifts.push('critical_type_change');
            return { needsDetailedAnalysis: true, quickDrifts };
        }

        // More than a 10x change in distinct-value count, in either direction.
        const cardinalityRatio = currentFingerprint.cardinality / historical.cardinality;
        if (cardinalityRatio > 10 || cardinalityRatio < 0.1) {
            quickDrifts.push('critical_cardinality_change');
        }

        // Null share moved by more than 0.5 in absolute terms.
        if (Math.abs(currentFingerprint.null_ratio - historical.null_ratio) > 0.5) {
            quickDrifts.push('critical_null_ratio_change');
        }

        // Every entry pushed above is a critical one, so detailed analysis is
        // needed exactly when nothing was flagged.
        return { needsDetailedAnalysis: quickDrifts.length === 0, quickDrifts };
    }

    /**
     * Build a result object directly from quick-check findings.
     * Every finding is reported with type 'format' and severity 'critical',
     * and `detection_time_ms` is the constant 1 rather than a measurement.
     */
    createQuickResult(historicalAnchor, currentColumn, quickCheck) {
        const driftTypes = quickCheck.quickDrifts.map((drift) => ({
            type: 'format',
            severity: 'critical',
            metric_value: 1.0,
            threshold: 0.1,
            description: `Quick detection: ${drift}`
        }));
        return {
            anchor_id: historicalAnchor.anchor_id,
            column_name: currentColumn.name,
            drift_detected: driftTypes.length > 0,
            drift_types: driftTypes,
            severity: 'critical',
            confidence_score: 0.95,
            details: {},
            alerts: [],
            recommended_actions: ['Immediate investigation required for critical changes'],
            performance_metrics: {
                detection_time_ms: 1,
                samples_processed: 0
            }
        };
    }

    /**
     * Return the column unchanged when it holds at most SAMPLE_SIZE values,
     * otherwise a shallow copy whose `values` is a SAMPLE_SIZE-element sample.
     */
    intelligentSampling(column) {
        if (column.values.length <= this.SAMPLE_SIZE) {
            return column;
        }
        const sampleIndices = this.generateStratifiedSample(column.values, this.SAMPLE_SIZE);
        return {
            ...column,
            values: sampleIndices.map((i) => column.values[i])
        };
    }

    /**
     * Choose distinct indices into `values`: mostly evenly spaced picks with a
     * random start, topped up with uniformly random picks.
     *
     * @param {Array} values Source values; only the length is used.
     * @param {number} sampleSize Requested number of indices.
     * @returns {number[]} Ascending, distinct indices; `sampleSize` of them, or
     *   every index when `sampleSize` is at least `values.length`.
     */
    generateStratifiedSample(values, sampleSize) {
        const total = values.length;
        const target = Math.min(Math.floor(sampleSize), total);
        if (!(target > 0)) {
            return [];
        }
        if (target === total) {
            return Array.from({ length: total }, (_, i) => i);
        }

        // Reserve room for the random picks up front. Previously the evenly
        // spaced pass filled every slot and the random picks were sliced away.
        const randomQuota = Math.min(1000, Math.floor(target * 0.1));
        const systematicQuota = target - randomQuota;
        const picked = new Set();

        // step > 1 here, so consecutive floor() results are distinct.
        const step = total / systematicQuota;
        const randomStart = Math.floor(Math.random() * step);
        for (let i = 0; i < systematicQuota; i++) {
            const index = Math.floor(randomStart + i * step);
            if (index < total) {
                picked.add(index);
            }
        }

        // target < total, so unused indices always remain and this terminates.
        while (picked.size < target) {
            picked.add(Math.floor(Math.random() * total));
        }

        // Ascending order keeps the sampled values in their original row order.
        return Array.from(picked).sort((a, b) => a - b);
    }

    /**
     * Compare the mean of the current numeric values with the mean of the
     * anchor's stored `sample_values`.
     *
     * @returns {Promise<object|null>} A 'distribution' finding when the
     *   relative change of the mean exceeds 20%, otherwise null. Also null for
     *   non-numeric columns or when either side has no parseable numbers.
     */
    async fastDistributionCheck(historicalAnchor, currentColumn) {
        if (!this.isNumericColumn(currentColumn)) {
            return null;
        }
        const parseNumbers = (raw) => raw.map((v) => parseFloat(v)).filter((v) => !Number.isNaN(v));
        const mean = (numbers) => numbers.reduce((sum, v) => sum + v, 0) / numbers.length;

        const historical = JSON.parse(historicalAnchor.fingerprint);
        const currentNumbers = parseNumbers(currentColumn.values);
        if (currentNumbers.length === 0) {
            return null;
        }
        const historicalNumbers = parseNumbers(
            Array.isArray(historical.sample_values) ? historical.sample_values : []
        );
        if (historicalNumbers.length === 0) {
            return null;
        }

        const currentMean = mean(currentNumbers);
        // Averaged over the parsed numbers only, so non-numeric samples no
        // longer drag the historical mean towards zero.
        const historicalMean = mean(historicalNumbers);

        // Relative to the magnitude of the baseline, so a negative mean cannot
        // turn the change negative and hide it from the threshold test.
        const baseline = Math.abs(historicalMean);
        const delta = Math.abs(currentMean - historicalMean);
        let meanDiff;
        if (baseline === 0) {
            meanDiff = delta === 0 ? 0 : Infinity;
        } else {
            meanDiff = delta / baseline;
        }

        if (meanDiff > 0.2) { // 20% change threshold
            return {
                type: 'distribution',
                severity: meanDiff > 0.5 ? 'critical' : 'high',
                metric_value: meanDiff,
                threshold: 0.2,
                description: `Fast distribution check: ${(meanDiff * 100).toFixed(1)}% mean change`
            };
        }
        return null;
    }

    /**
     * Compare the stored and current `regex_patterns` lists as sets.
     *
     * @returns {Promise<object|null>} A 'format' finding when the
     *   intersection-over-union similarity is below 0.7, otherwise null.
     */
    async fastPatternCheck(historicalAnchor, currentFingerprint) {
        const historical = JSON.parse(historicalAnchor.fingerprint);
        const before = new Set(historical.regex_patterns);
        const after = new Set(currentFingerprint.regex_patterns);
        const combined = new Set([...before, ...after]);
        let shared = 0;
        for (const pattern of before) {
            if (after.has(pattern)) {
                shared++;
            }
        }
        // Two empty pattern sets count as identical.
        const similarity = combined.size === 0 ? 1 : shared / combined.size;
        if (similarity < 0.7) {
            return {
                type: 'format',
                severity: similarity < 0.3 ? 'critical' : 'high',
                metric_value: 1 - similarity,
                threshold: 0.3,
                description: `Fast pattern check: ${((1 - similarity) * 100).toFixed(1)}% pattern change`
            };
        }
        return null;
    }

    /**
     * Compare the max-min range of the stored and current fingerprints.
     *
     * @returns {Promise<object|null>} A 'unit' finding when the range changed
     *   by more than 5x in either direction; null otherwise, and null when
     *   either dtype is not numeric or either range is zero.
     */
    async fastScaleCheck(historicalAnchor, currentFingerprint) {
        const historical = JSON.parse(historicalAnchor.fingerprint);
        if (!this.isNumericType(historical.dtype) || !this.isNumericType(currentFingerprint.dtype)) {
            return null;
        }
        const historicalRange = this.parseNumeric(historical.max) - this.parseNumeric(historical.min);
        const currentRange = this.parseNumeric(currentFingerprint.max) - this.parseNumeric(currentFingerprint.min);
        if (historicalRange === 0 || currentRange === 0) {
            return null;
        }
        const scaleFactor = currentRange / historicalRange;
        if (scaleFactor > 5 || scaleFactor < 0.2) {
            return {
                type: 'unit',
                severity: scaleFactor > 10 || scaleFactor < 0.1 ? 'critical' : 'high',
                metric_value: scaleFactor,
                threshold: 5,
                description: `Fast scale check: ${scaleFactor.toFixed(2)}x scale change`
            };
        }
        return null;
    }

    /**
     * Compare `unique_ratio` between the stored and current fingerprints.
     *
     * @returns {Promise<object|null>} A 'joinability' finding when the
     *   absolute difference exceeds 0.3, otherwise null.
     */
    async fastJoinabilityCheck(historicalAnchor, currentFingerprint) {
        const historical = JSON.parse(historicalAnchor.fingerprint);
        const uniquenessChange = Math.abs(historical.unique_ratio - currentFingerprint.unique_ratio);
        if (uniquenessChange > 0.3) {
            return {
                type: 'joinability',
                severity: uniquenessChange > 0.5 ? 'critical' : 'medium',
                metric_value: uniquenessChange,
                threshold: 0.3,
                description: `Fast joinability check: ${(uniquenessChange * 100).toFixed(1)}% uniqueness change`
            };
        }
        return null;
    }

    /** True when the column's `data_type` is one of the numeric type names. */
    isNumericColumn(column) {
        // Delegates so columns and fingerprints share one case-insensitive rule.
        return this.isNumericType(column.data_type);
    }

    /**
     * True when `dtype` is 'int64', 'float64' or 'number', ignoring case.
     * Non-string input yields false instead of throwing.
     */
    isNumericType(dtype) {
        return typeof dtype === 'string' && ['int64', 'float64', 'number'].includes(dtype.toLowerCase());
    }

    /** Numbers pass through, strings go through parseFloat, anything else is 0. */
    parseNumeric(value) {
        if (typeof value === 'number') {
            return value;
        }
        if (typeof value === 'string') {
            return parseFloat(value);
        }
        return 0;
    }

    /**
     * Memory-efficient streaming drift detection for very large datasets.
     * Each item yielded by `dataStream` is treated as one value.
     *
     * @param {object} historicalAnchor Anchor to compare against.
     * @param {AsyncIterable|Iterable} dataStream Source of values.
     * @param {object} [options]
     * @param {number} [options.maxSamples=1000000] Stop reading once this many
     *   values are retained.
     * @param {boolean} [options.earlyExit=true] Return as soon as a periodic
     *   check on the retained values reports a critical drift.
     * @param {function(number): void} [options.progressCallback] Invoked with
     *   the processed count after every 100000 items.
     * @returns {Promise<object>} Drift result for the retained values.
     */
    async detectDriftStreaming(historicalAnchor, dataStream, options = {}) {
        const { maxSamples = 1000000, earlyExit = true, progressCallback } = options;
        const samples = [];
        let processedCount = 0;
        const samplingRate = maxSamples / 1000000; // Adaptive sampling
        for await (const chunk of dataStream) {
            // The first 10000 items are always retained; later ones by chance.
            const sampled = Math.random() < samplingRate || samples.length < 10000;
            if (sampled) {
                samples.push(chunk);
            }
            processedCount++;
            if (progressCallback && processedCount % 100000 === 0) {
                progressCallback(processedCount);
            }
            if (samples.length >= maxSamples) {
                break;
            }
            // Only re-check when the retained set actually grew; otherwise the
            // same samples would be analysed again for every skipped item.
            if (earlyExit && sampled && samples.length > 1000 && samples.length % 1000 === 0) {
                const quickResult = await this.checkForCriticalDrift(historicalAnchor, samples);
                if (quickResult) {
                    return quickResult;
                }
            }
        }
        const columnData = {
            name: 'streaming_column',
            values: samples,
            data_type: this.inferDataType(samples)
        };
        const fingerprint = this.generateStreamingFingerprint(samples);
        return this.detectDriftFast(historicalAnchor, columnData, fingerprint);
    }

    /**
     * Fingerprint the samples collected so far and run the quick check.
     *
     * @returns {Promise<object|null>} A quick result when any critical change
     *   is flagged, otherwise null.
     */
    async checkForCriticalDrift(historicalAnchor, samples) {
        const fingerprint = this.generateStreamingFingerprint(samples);
        const quickCheck = this.performQuickCheck(historicalAnchor, fingerprint);
        if (!quickCheck.quickDrifts.some((d) => d.startsWith('critical'))) {
            return null;
        }
        const columnData = {
            name: 'streaming_sample',
            values: samples,
            data_type: this.inferDataType(samples)
        };
        return this.createQuickResult(historicalAnchor, columnData, quickCheck);
    }

    /**
     * Guess a dtype name from one representative sample.
     *
     * @returns {string} 'unknown' for an empty array, else 'float64',
     *   'boolean', 'datetime' or 'string'.
     */
    inferDataType(samples) {
        if (samples.length === 0) {
            return 'unknown';
        }
        // Skip leading null/undefined/'' entries (the values the streaming
        // fingerprint counts as nulls) so one missing first row does not decide
        // the type. With no usable entry, fall back to the first element.
        const firstPresent = samples.find((v) => v !== null && v !== undefined && v !== '');
        const sample = firstPresent === undefined ? samples[0] : firstPresent;
        if (typeof sample === 'number') {
            return 'float64';
        }
        if (typeof sample === 'boolean') {
            return 'boolean';
        }
        if (sample instanceof Date) {
            return 'datetime';
        }
        // Strings that parseFloat can read a leading number from count as numeric.
        if (typeof sample === 'string' && !Number.isNaN(parseFloat(sample))) {
            return 'float64';
        }
        return 'string';
    }

    /**
     * Build a reduced fingerprint from in-memory samples. `regex_patterns` is
     * always empty and no min/max are recorded.
     */
    generateStreamingFingerprint(samples) {
        const uniqueValues = new Set(samples);
        const nullCount = samples.filter((v) => v === null || v === undefined || v === '').length;
        const total = samples.length;
        // Ratios are 0 rather than NaN when there are no samples.
        const ratioOf = (count) => (total === 0 ? 0 : count / total);
        return {
            dtype: this.inferDataType(samples),
            cardinality: uniqueValues.size,
            regex_patterns: [], // Simplified for streaming
            null_ratio: ratioOf(nullCount),
            unique_ratio: ratioOf(uniqueValues.size),
            sample_values: Array.from(uniqueValues).slice(0, 20).map((v) => String(v))
        };
    }

    /**
     * Time detectDriftFast on generated data for each requested size.
     * Sizes above 100000 get 3 runs, others 5.
     *
     * @param {number[]} [dataSizes]
     * @returns {Promise<Object<number, {avgTime: number, throughput: number}>>}
     *   Average milliseconds and rows per second, keyed by size. `throughput`
     *   is Infinity when the average rounds to 0 ms.
     */
    async benchmarkPerformance(dataSizes = [1000, 10000, 100000, 1000000]) {
        const results = {};
        for (const size of dataSizes) {
            const timings = [];
            const runs = size > 100000 ? 3 : 5;
            for (let run = 0; run < runs; run++) {
                // Data generation happens outside the timed section.
                const testData = this.generateTestData(size);
                const startTime = Date.now();
                await this.detectDriftFast(testData.anchor, testData.column, testData.fingerprint);
                timings.push(Date.now() - startTime);
            }
            const avgTime = timings.reduce((a, b) => a + b, 0) / timings.length;
            const throughput = size / avgTime * 1000; // rows per second
            results[size] = { avgTime, throughput };
        }
        return results;
    }

    /**
     * Create a random float column of `size` values in [0, 1000) together with
     * a fingerprint of it and an anchor holding that same fingerprint.
     */
    generateTestData(size) {
        const values = Array.from({ length: size }, () => Math.random() * 1000);
        // Built once and reused for both cardinality and unique_ratio.
        const distinctCount = new Set(values).size;
        const column = {
            name: 'test_column',
            values: values,
            data_type: 'float64'
        };
        const fingerprint = {
            dtype: 'float64',
            cardinality: distinctCount,
            regex_patterns: [],
            null_ratio: 0,
            unique_ratio: distinctCount / values.length,
            sample_values: values.slice(0, 20).map((v) => String(v))
        };
        const anchor = {
            dataset: 'test',
            column_name: 'test_column',
            anchor_id: 'test_anchor',
            fingerprint: JSON.stringify(fingerprint),
            first_seen: new Date().toISOString(),
            last_seen: new Date().toISOString()
        };
        return { anchor, column, fingerprint };
    }
}
// Publish the class on the CommonJS exports object.
exports.PerformanceOptimizedDriftDetector = PerformanceOptimizedDriftDetector;
//# sourceMappingURL=performance-optimizer.js.map