semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
317 lines • 14.9 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.SmartAnchorReconciler = void 0;
const anchors_1 = require("./anchors");
class SmartAnchorReconciler {
anchorSystem;
strategies = new Map();
constructor() {
this.anchorSystem = new anchors_1.StableColumnAnchorSystem();
this.initializeStrategies();
}
initializeStrategies() {
this.strategies.set('conservative', {
name: 'conservative',
confidence_threshold: 0.9,
drift_tolerance: 0.1,
semantic_weight: 0.4,
structural_weight: 0.6,
enable_fuzzy_matching: false
});
this.strategies.set('balanced', {
name: 'balanced',
confidence_threshold: 0.8,
drift_tolerance: 0.2,
semantic_weight: 0.3,
structural_weight: 0.7,
enable_fuzzy_matching: true
});
this.strategies.set('aggressive', {
name: 'aggressive',
confidence_threshold: 0.7,
drift_tolerance: 0.3,
semantic_weight: 0.2,
structural_weight: 0.8,
enable_fuzzy_matching: true
});
this.strategies.set('semantic_first', {
name: 'semantic_first',
confidence_threshold: 0.75,
drift_tolerance: 0.25,
semantic_weight: 0.6,
structural_weight: 0.4,
enable_fuzzy_matching: true
});
}
reconcileAnchorsAdvanced(datasetName, newColumns, existingAnchors, strategyName = 'balanced', customOptions) {
const startTime = Date.now();
const strategy = this.strategies.get(strategyName) || this.strategies.get('balanced');
const reconciliationOptions = {
confidence_threshold: strategy.confidence_threshold,
allow_multiple_matches: false,
create_new_anchors: true,
drift_tolerance: strategy.drift_tolerance,
...customOptions
};
const baseResult = this.anchorSystem.reconcileAnchors(datasetName, newColumns, existingAnchors, reconciliationOptions);
const enhancedMatches = this.enhanceMatches(newColumns, existingAnchors, baseResult, strategy);
const confidenceMetrics = this.calculateConfidenceMetrics(enhancedMatches, newColumns.length);
const driftAnalysis = this.analyzeDrift(enhancedMatches, existingAnchors, strategy);
const issues = this.identifyPotentialIssues(enhancedMatches, confidenceMetrics, driftAnalysis);
const recommendations = this.generateRecommendations(confidenceMetrics, driftAnalysis, issues);
const reconciliationTime = Date.now() - startTime;
return {
matched_anchors: enhancedMatches.matched_anchors,
unmatched_columns: enhancedMatches.unmatched_columns,
new_anchors: enhancedMatches.new_anchors,
confidence_metrics: confidenceMetrics,
strategy_used: strategyName,
reconciliation_time_ms: reconciliationTime,
potential_issues: issues,
recommendations: recommendations
};
}
enhanceMatches(newColumns, existingAnchors, baseResult, strategy) {
if (!strategy.enable_fuzzy_matching) {
return baseResult;
}
const unmatchedColumns = [...baseResult.unmatched_columns];
const enhancedMatches = [...baseResult.matched_anchors];
const remainingAnchors = existingAnchors.filter(anchor => !enhancedMatches.some(match => match.anchor_id === anchor.anchor_id));
for (const columnName of unmatchedColumns) {
const column = newColumns.find(col => col.name === columnName);
if (!column)
continue;
const fuzzyMatch = this.findFuzzyMatch(column, remainingAnchors, strategy);
if (fuzzyMatch) {
enhancedMatches.push(fuzzyMatch);
const index = unmatchedColumns.indexOf(columnName);
if (index > -1)
unmatchedColumns.splice(index, 1);
}
}
return {
matched_anchors: enhancedMatches,
unmatched_columns: unmatchedColumns,
new_anchors: baseResult.new_anchors
};
}
findFuzzyMatch(column, anchors, strategy) {
let bestMatch = null;
for (const anchor of anchors) {
const fingerprint = this.anchorSystem.generateFingerprint(column);
const score = this.anchorSystem.calculateMatchScore(fingerprint, anchor, column.name, strategy.drift_tolerance);
const adjustedConfidence = this.calculateAdjustedConfidence(score, strategy);
if (adjustedConfidence >= strategy.confidence_threshold * 0.8) {
if (!bestMatch || adjustedConfidence > bestMatch.score.confidence) {
bestMatch = { anchor, score: { ...score, confidence: adjustedConfidence } };
}
}
}
if (bestMatch) {
return {
anchor_id: bestMatch.anchor.anchor_id,
column_name: column.name,
confidence: bestMatch.score.confidence,
match_reason: [...this.getMatchReasons(bestMatch.score), 'fuzzy_match']
};
}
return null;
}
calculateAdjustedConfidence(score, strategy) {
const semanticScore = (score.component_scores.regex_match +
score.component_scores.name_similarity) / 2;
const structuralScore = (score.component_scores.dtype_match +
score.component_scores.cardinality_similarity +
score.component_scores.statistical_similarity) / 3;
return (semanticScore * strategy.semantic_weight +
structuralScore * strategy.structural_weight);
}
getMatchReasons(score) {
const reasons = [];
if (score.component_scores.dtype_match === 1.0) {
reasons.push('data_type_match');
}
if (score.component_scores.cardinality_similarity > 0.8) {
reasons.push('cardinality_similar');
}
if (score.component_scores.regex_match > 0.5) {
reasons.push('pattern_match');
}
if (score.component_scores.statistical_similarity > 0.8) {
reasons.push('statistical_similarity');
}
if (score.component_scores.name_similarity > 0.7) {
reasons.push('name_similarity');
}
return reasons;
}
calculateConfidenceMetrics(result, totalColumns) {
const matches = result.matched_anchors;
const strongMatches = matches.filter(m => m.confidence >= 0.9).length;
const weakMatches = matches.filter(m => m.confidence >= 0.7 && m.confidence < 0.9).length;
const noMatches = result.unmatched_columns.length;
const newColumns = result.new_anchors.length;
const overallConfidence = matches.length > 0
? matches.reduce((sum, m) => sum + m.confidence, 0) / matches.length
: 0;
const semanticMatches = matches.filter(m => m.match_reason.includes('pattern_match') || m.match_reason.includes('name_similarity'));
const semanticConfidence = semanticMatches.length > 0
? semanticMatches.reduce((sum, m) => sum + m.confidence, 0) / semanticMatches.length
: 0;
const structuralMatches = matches.filter(m => m.match_reason.includes('data_type_match') || m.match_reason.includes('statistical_similarity'));
const structuralConfidence = structuralMatches.length > 0
? structuralMatches.reduce((sum, m) => sum + m.confidence, 0) / structuralMatches.length
: 0;
const statisticalMatches = matches.filter(m => m.match_reason.includes('statistical_similarity'));
const statisticalConfidence = statisticalMatches.length > 0
? statisticalMatches.reduce((sum, m) => sum + m.confidence, 0) / statisticalMatches.length
: 0;
const nameMatches = matches.filter(m => m.match_reason.includes('name_similarity'));
const nameConfidence = nameMatches.length > 0
? nameMatches.reduce((sum, m) => sum + m.confidence, 0) / nameMatches.length
: 0;
return {
overall_confidence: overallConfidence,
semantic_confidence: semanticConfidence,
structural_confidence: structuralConfidence,
statistical_confidence: statisticalConfidence,
name_confidence: nameConfidence,
breakdown: {
strong_matches: strongMatches,
weak_matches: weakMatches,
no_matches: noMatches,
new_columns: newColumns
}
};
}
analyzeDrift(result, existingAnchors, strategy) {
const drifts = [];
for (const match of result.matched_anchors) {
const anchor = existingAnchors.find(a => a.anchor_id === match.anchor_id);
if (!anchor)
continue;
if (match.confidence < strategy.confidence_threshold * 1.1 && match.confidence >= strategy.confidence_threshold) {
let driftType = 'statistical';
let severity = 'low';
if (match.confidence < strategy.confidence_threshold * 1.05) {
severity = 'medium';
}
if (!match.match_reason.includes('pattern_match') && match.match_reason.includes('statistical_similarity')) {
driftType = 'semantic';
severity = 'high';
}
drifts.push({
anchor_id: match.anchor_id,
column_name: match.column_name,
drift_type: driftType,
severity: severity,
details: {
confidence: match.confidence,
match_reasons: match.match_reason,
threshold: strategy.confidence_threshold
},
suggested_action: severity === 'high' ? 'investigate' : 'accept'
});
}
}
return drifts;
}
identifyPotentialIssues(result, metrics, drifts) {
const issues = [];
if (metrics.overall_confidence < 0.7) {
issues.push('Low overall confidence in reconciliation results');
}
if (metrics.breakdown.weak_matches > metrics.breakdown.strong_matches) {
issues.push('More weak matches than strong matches detected');
}
if (metrics.breakdown.no_matches > result.matched_anchors.length) {
issues.push('High number of unmatched columns');
}
const highSeverityDrifts = drifts.filter(d => d.severity === 'high').length;
if (highSeverityDrifts > 0) {
issues.push(`${highSeverityDrifts} high-severity anchor drift(s) detected`);
}
if (metrics.semantic_confidence < 0.6 && metrics.structural_confidence > 0.8) {
issues.push('Structural matches found but semantic alignment is weak');
}
return issues;
}
generateRecommendations(metrics, drifts, issues) {
const recommendations = [];
if (metrics.overall_confidence < 0.7) {
recommendations.push('Consider using a more conservative reconciliation strategy');
recommendations.push('Review column naming conventions and data types');
}
if (metrics.breakdown.no_matches > 0) {
recommendations.push('Verify that unmatched columns are genuinely new');
recommendations.push('Consider manual semantic annotation for unmatched columns');
}
const semanticDrifts = drifts.filter(d => d.drift_type === 'semantic');
if (semanticDrifts.length > 0) {
recommendations.push('Review semantic patterns for drifted anchors');
recommendations.push('Consider updating anchor fingerprints to reflect schema evolution');
}
if (metrics.semantic_confidence < 0.6) {
recommendations.push('Enhance semantic pattern detection rules');
recommendations.push('Consider domain-specific semantic vocabularies');
}
if (issues.includes('High number of unmatched columns')) {
recommendations.push('Enable auto-inference for new semantic patterns');
recommendations.push('Review data ingestion pipeline for schema changes');
}
return recommendations;
}
getReconciliationStrategies() {
return Array.from(this.strategies.keys());
}
addCustomStrategy(name, strategy) {
this.strategies.set(name, strategy);
}
getStrategy(name) {
return this.strategies.get(name);
}
analyzeAnchorEvolution(anchorHistory, windowSize = 5) {
if (anchorHistory.length < 2) {
return {
stability_score: 1.0,
trending_patterns: [],
evolution_summary: { message: 'Insufficient history for analysis' }
};
}
let stabilityScore = 1.0;
const trendingPatterns = [];
const anchorChanges = {};
for (let i = 1; i < Math.min(anchorHistory.length, windowSize + 1); i++) {
const prev = new Set(anchorHistory[i - 1].map(a => a.anchor_id));
const curr = new Set(anchorHistory[i].map(a => a.anchor_id));
const added = new Set([...curr].filter(id => !prev.has(id)));
const removed = new Set([...prev].filter(id => !curr.has(id)));
const changeRate = (added.size + removed.size) / Math.max(prev.size, curr.size);
stabilityScore *= (1 - changeRate);
if (added.size > 0) {
anchorChanges['additions'] = (anchorChanges['additions'] || 0) + added.size;
}
if (removed.size > 0) {
anchorChanges['removals'] = (anchorChanges['removals'] || 0) + removed.size;
}
}
if (anchorChanges['additions'] > anchorChanges['removals']) {
trendingPatterns.push('schema_expansion');
}
else if (anchorChanges['removals'] > anchorChanges['additions']) {
trendingPatterns.push('schema_consolidation');
}
return {
stability_score: Math.max(0, stabilityScore),
trending_patterns: trendingPatterns,
evolution_summary: {
total_changes: Object.values(anchorChanges).reduce((a, b) => a + b, 0),
change_breakdown: anchorChanges,
analysis_window: Math.min(anchorHistory.length - 1, windowSize)
}
};
}
}
exports.SmartAnchorReconciler = SmartAnchorReconciler;
//# sourceMappingURL=reconciler.js.map