UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

github.com/kneelinghorse/semantic-ds-toolkit

411 lines • 18.3 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.JoinCostModel = exports.SemanticJoinPlanner = void 0; class SemanticJoinPlanner { costModel; constructor() { this.costModel = new JoinCostModel(); } planOptimalJoin(leftDf, rightDf, leftContext, rightContext, options) { // Gather statistics const leftStats = this.analyzeDataFrameStatistics(leftDf); const rightStats = this.analyzeDataFrameStatistics(rightDf); // Resolve join columns const leftColumns = this.resolveJoinColumns(options.leftOn, leftDf.columns); const rightColumns = this.resolveJoinColumns(options.rightOn, rightDf.columns); // Estimate join selectivity const selectivity = this.estimateJoinSelectivity(leftDf, rightDf, leftColumns, rightColumns, leftContext, rightContext, options); // Plan normalization const normalizationPlan = this.planNormalization(leftDf, rightDf, leftColumns, rightColumns, leftContext, rightContext, options); // Choose optimal strategy const strategy = this.selectJoinStrategy(leftStats, rightStats, selectivity, options); // Plan indexing const indexingStrategy = this.planIndexing(leftStats, rightStats, strategy); // Plan batching and parallelism const batchingStrategy = this.planBatching(leftStats, rightStats, options); // Plan caching const cacheStrategy = this.planCaching(leftStats, rightStats, normalizationPlan); // Calculate total cost const estimatedCost = this.costModel.calculateTotalCost(leftStats, rightStats, strategy, selectivity, normalizationPlan, batchingStrategy); // Generate optimizations const optimizations = this.generateOptimizations(leftStats, rightStats, selectivity, normalizationPlan); return { strategy, estimatedCost, estimatedRows: selectivity.expectedMatches, optimizations, indexingStrategy, batchingStrategy, cacheStrategy, normalizationPlan }; } analyzeDataFrameStatistics(df) { const rowCount = df.shape[0]; const columnCount = df.shape[1]; const uniquenessRatios = {}; const nullRatios = {}; const dataTypes = {}; let totalSize = 0; for (const column of df.columns) { const values = df.getColumn(column); const nonNullValues = values.filter(v => v != null && v !== ''); // Calculate uniqueness ratio const uniqueCount = new Set(nonNullValues).size; uniquenessRatios[column] = nonNullValues.length > 0 ? uniqueCount / nonNullValues.length : 0; // Calculate null ratio nullRatios[column] = (values.length - nonNullValues.length) / values.length; // Infer data type dataTypes[column] = df.dtypes[column] || this.inferDataType(values); // Estimate column size const avgValueSize = this.estimateAverageValueSize(values.slice(0, 100)); totalSize += avgValueSize * rowCount; } return { rowCount, columnCount, avgRowSize: totalSize / rowCount, uniquenessRatios, nullRatios, dataTypes, estimatedSize: totalSize }; } estimateJoinSelectivity(leftDf, rightDf, leftColumns, rightColumns, leftContext, rightContext, options) { let totalSelectivity = 1.0; let totalConfidence = 1.0; const reasoning = []; for (let i = 0; i < leftColumns.length; i++) { const leftCol = leftColumns[i]; const rightCol = rightColumns[i]; const leftValues = leftDf.getColumn(leftCol); const rightValues = rightDf.getColumn(rightCol); const leftUnique = new Set(leftValues).size; const rightUnique = new Set(rightValues).size; // Calculate basic selectivity let columnSelectivity = Math.min(leftUnique, rightUnique) / Math.max(leftValues.length, rightValues.length); // Adjust for semantic context const leftCtx = leftContext[leftCol]; const rightCtx = rightContext[rightCol]; if (leftCtx && rightCtx) { if (leftCtx.semantic_type === rightCtx.semantic_type) { columnSelectivity *= 1.2; // Boost for same semantic type totalConfidence *= Math.max(leftCtx.confidence, rightCtx.confidence); reasoning.push(`Semantic match: ${leftCtx.semantic_type}`); } else if (this.areTypesCompatible(leftCtx.semantic_type, rightCtx.semantic_type)) { columnSelectivity *= 0.8; // Slight reduction for compatible types totalConfidence *= 0.8; reasoning.push(`Compatible types: ${leftCtx.semantic_type} ↔ ${rightCtx.semantic_type}`); } else { columnSelectivity *= 0.3; // Major reduction for incompatible types totalConfidence *= 0.5; reasoning.push(`Incompatible types: ${leftCtx.semantic_type} ↔ ${rightCtx.semantic_type}`); } } // Adjust for fuzzy matching if (options.enableFuzzyMatching) { columnSelectivity *= 1.5; // Fuzzy matching increases matches reasoning.push('Fuzzy matching enabled'); } // Adjust for data quality const leftNullRatio = leftValues.filter(v => v == null).length / leftValues.length; const rightNullRatio = rightValues.filter(v => v == null).length / rightValues.length; const avgNullRatio = (leftNullRatio + rightNullRatio) / 2; columnSelectivity *= (1 - avgNullRatio); // Reduce for nulls if (avgNullRatio > 0.1) { reasoning.push(`High null ratio: ${(avgNullRatio * 100).toFixed(1)}%`); } totalSelectivity *= columnSelectivity; } // Cap selectivity to reasonable bounds totalSelectivity = Math.max(0.001, Math.min(1.0, totalSelectivity)); const expectedMatches = Math.ceil(leftDf.shape[0] * rightDf.shape[0] * totalSelectivity); return { expectedMatches, selectivity: totalSelectivity, confidence: totalConfidence, reasoning }; } planNormalization(leftDf, rightDf, leftColumns, rightColumns, leftContext, rightContext, options) { const leftPlans = []; const rightPlans = []; let totalCost = 0; for (let i = 0; i < leftColumns.length; i++) { const leftCol = leftColumns[i]; const rightCol = rightColumns[i]; // Plan left column normalization const leftPlan = this.planColumnNormalization(leftCol, leftDf, leftContext[leftCol], 'left'); leftPlans.push(leftPlan); totalCost += leftPlan.costEstimate; // Plan right column normalization const rightPlan = this.planColumnNormalization(rightCol, rightDf, rightContext[rightCol], 'right'); rightPlans.push(rightPlan); totalCost += rightPlan.costEstimate; } // Decide whether to precompute normalization const precomputeNormalization = this.shouldPrecomputeNormalization(leftDf.shape[0] + rightDf.shape[0], totalCost, options); return { leftColumns: leftPlans, rightColumns: rightPlans, precomputeNormalization, estimatedNormalizationCost: totalCost }; } planColumnNormalization(column, df, context, side) { const values = df.getColumn(column); const uniqueValues = new Set(values); let normalizer = 'default'; let confidence = 0.5; if (context) { const normalizerMapping = { 'email_address': 'email', 'phone_number': 'phone', 'display_name': 'name', 'monetary_value': 'numeric', 'temporal': 'date', 'categorical_attribute': 'categorical', 'categorical_code': 'categorical' }; normalizer = normalizerMapping[context.semantic_type] || 'default'; confidence = context.confidence; } const selectivityEstimate = uniqueValues.size / values.length; const cardinalityEstimate = uniqueValues.size; // Estimate normalization cost based on complexity const normalizerCosts = { 'default': 1, 'email': 2, 'phone': 3, 'name': 4, 'address': 5, 'numeric': 2, 'date': 3, 'categorical': 1 }; const costEstimate = (normalizerCosts[normalizer] || 1) * values.length; return { column, normalizer, confidence, selectivityEstimate, cardinalityEstimate, costEstimate }; } selectJoinStrategy(leftStats, rightStats, selectivity, options) { const leftRows = leftStats.rowCount; const rightRows = rightStats.rowCount; const expectedMatches = selectivity.expectedMatches; // Very small datasets - use nested loop if (leftRows < 1000 && rightRows < 1000) { return 'nested_loop'; } // One side is very small - broadcast join if (leftRows < 10000 || rightRows < 10000) { return 'broadcast_join'; } // High selectivity - sort merge might be efficient if (selectivity.selectivity > 0.1 && expectedMatches > 100000) { return 'sort_merge'; } // Default to hash join for most cases return 'hash_join'; } planIndexing(leftStats, rightStats, strategy) { if (strategy === 'nested_loop') { return 'none'; } if (strategy === 'broadcast_join') { return leftStats.rowCount < rightStats.rowCount ? 'build_left' : 'build_right'; } if (strategy === 'hash_join') { // Build index on smaller side return leftStats.rowCount < rightStats.rowCount ? 'build_left' : 'build_right'; } if (strategy === 'sort_merge') { // Dual indexing beneficial for sort-merge return 'dual_index'; } return 'none'; } planBatching(leftStats, rightStats, options) { const totalRows = leftStats.rowCount + rightStats.rowCount; // Enable batching for large datasets if (totalRows > 100000) { const batchSize = options.batchSize || Math.min(50000, Math.max(10000, totalRows / 10)); const parallelism = Math.min(4, Math.ceil(totalRows / batchSize)); return { enabled: true, batchSize, parallelism }; } return { enabled: false, batchSize: totalRows, parallelism: 1 }; } planCaching(leftStats, rightStats, normalizationPlan) { const totalRows = leftStats.rowCount + rightStats.rowCount; const normalizationCost = normalizationPlan.estimatedNormalizationCost; // Enable value caching if normalization is expensive const enableValueCache = normalizationCost > totalRows * 2; // Enable index caching for large datasets const enableIndexCache = totalRows > 50000; // Size cache based on memory constraints const cacheSize = Math.min(100000, Math.max(10000, totalRows / 5)); return { enableValueCache, enableIndexCache, cacheSize }; } generateOptimizations(leftStats, rightStats, selectivity, normalizationPlan) { const optimizations = []; // Suggest column ordering if (leftStats.columnCount > 5 || rightStats.columnCount > 5) { optimizations.push('Consider column pruning to reduce memory usage'); } // Suggest filtering if (selectivity.selectivity < 0.01) { optimizations.push('Low selectivity detected - consider pre-filtering data'); } // Suggest normalization strategy if (normalizationPlan.estimatedNormalizationCost > (leftStats.rowCount + rightStats.rowCount) * 5) { optimizations.push('High normalization cost - consider simpler normalizers'); } // Suggest indexing const totalRows = leftStats.rowCount + rightStats.rowCount; if (totalRows > 1000000) { optimizations.push('Large dataset - consider pre-building persistent indices'); } // Memory optimization const estimatedMemoryUsage = (leftStats.estimatedSize + rightStats.estimatedSize) / (1024 * 1024); // MB if (estimatedMemoryUsage > 1000) { optimizations.push('High memory usage - consider streaming or disk-based processing'); } return optimizations; } shouldPrecomputeNormalization(totalRows, normalizationCost, options) { // Precompute if normalization is expensive relative to dataset size const costPerRow = normalizationCost / totalRows; // Precompute for expensive normalizers or when caching is enabled return costPerRow > 3 || (options.cacheNormalizedValues !== false); } inferDataType(values) { if (values.length === 0) return 'unknown'; const sample = values.slice(0, 100); let numericCount = 0; let dateCount = 0; for (const value of sample) { if (value != null) { if (typeof value === 'number' || (!isNaN(parseFloat(String(value))) && isFinite(parseFloat(String(value))))) { numericCount++; } if (!isNaN(new Date(String(value)).getTime()) && String(value).length > 6) { dateCount++; } } } if (numericCount / sample.length > 0.8) return 'numeric'; if (dateCount / sample.length > 0.7) return 'date'; return 'string'; } estimateAverageValueSize(values) { if (values.length === 0) return 0; const sample = values.slice(0, Math.min(100, values.length)); let totalSize = 0; for (const value of sample) { if (value != null) { if (typeof value === 'string') { totalSize += value.length * 2; // Unicode characters } else if (typeof value === 'number') { totalSize += 8; // 64-bit number } else { totalSize += String(value).length * 2; } } } return sample.length > 0 ? totalSize / sample.length : 0; } areTypesCompatible(type1, type2) { const compatibilityGroups = [ ['identifier', 'high_cardinality_attribute'], ['monetary_value', 'numeric_value'], ['display_name', 'generic_attribute'], ['categorical_attribute', 'categorical_code'], ['temporal', 'datetime', 'timestamp'] ]; return compatibilityGroups.some(group => group.includes(type1) && group.includes(type2)); } resolveJoinColumns(columns, availableColumns) { if (!columns) { throw new Error('Join columns must be specified'); } const cols = Array.isArray(columns) ? columns : [columns]; for (const col of cols) { if (!availableColumns.includes(col)) { throw new Error(`Column '${col}' not found. Available: ${availableColumns.join(', ')}`); } } return cols; } } exports.SemanticJoinPlanner = SemanticJoinPlanner; class JoinCostModel { // Cost constants (adjust based on profiling) NESTED_LOOP_COST = 1.0; HASH_BUILD_COST = 0.5; HASH_PROBE_COST = 0.3; SORT_COST = 0.8; MERGE_COST = 0.4; NORMALIZATION_BASE_COST = 2.0; calculateTotalCost(leftStats, rightStats, strategy, selectivity, normalizationPlan, batchingStrategy) { const leftRows = leftStats.rowCount; const rightRows = rightStats.rowCount; let joinCost = 0; switch (strategy) { case 'nested_loop': joinCost = leftRows * rightRows * this.NESTED_LOOP_COST; break; case 'hash_join': const buildCost = Math.min(leftRows, rightRows) * this.HASH_BUILD_COST; const probeCost = Math.max(leftRows, rightRows) * this.HASH_PROBE_COST; joinCost = buildCost + probeCost; break; case 'sort_merge': const sortCostLeft = leftRows * Math.log2(leftRows) * this.SORT_COST; const sortCostRight = rightRows * Math.log2(rightRows) * this.SORT_COST; const mergeCost = (leftRows + rightRows) * this.MERGE_COST; joinCost = sortCostLeft + sortCostRight + mergeCost; break; case 'broadcast_join': const broadcastCost = Math.min(leftRows, rightRows) * 0.1; // Broadcasting overhead const hashJoinCost = Math.max(leftRows, rightRows) * this.HASH_PROBE_COST; joinCost = broadcastCost + hashJoinCost; break; default: joinCost = leftRows * rightRows * this.NESTED_LOOP_COST; } // Add normalization cost const normalizationCost = normalizationPlan.estimatedNormalizationCost * this.NORMALIZATION_BASE_COST; // Apply batching discount let batchingMultiplier = 1.0; if (batchingStrategy.enabled && batchingStrategy.parallelism > 1) { batchingMultiplier = 0.8; // 20% efficiency gain from parallelism } return (joinCost + normalizationCost) * batchingMultiplier; } } exports.JoinCostModel = JoinCostModel; //# sourceMappingURL=join-planner.js.map