semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
489 lines • 22.1 kB
JavaScript
"use strict";
// Flag the CommonJS exports object with `__esModule: true` (the marker
// transpiled ES-module output conventionally sets for interop).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export as undefined; the class itself is assigned to this
// property after its definition further down in the file.
exports.JoinConfidenceCalculator = void 0;
/**
 * Scores how trustworthy a proposed join between two columns is.
 *
 * Seven component scores (semantic type, statistics, value patterns,
 * cardinality, domain, data quality, normalization) are combined with the
 * configured weights, then adjusted by additive "factors" derived from the
 * same inputs. The result also carries a reliability estimate, a coarse
 * confidence level and human-readable explanations.
 */
class JoinConfidenceCalculator {
    /** Map of joinType -> confusion counts recorded by updateCalibration(). */
    calibrationData = new Map();
    /** Component name -> weight, read by aggregateOverallConfidence(). */
    weightingScheme;

    /**
     * @param {object} [weightingScheme] Optional partial override of the
     *   default component weights; any key supplied replaces the default.
     */
    constructor(weightingScheme) {
        this.weightingScheme = {
            semanticTypeMatch: 0.25,
            statisticalSimilarity: 0.20,
            valuePatternMatch: 0.15,
            cardinalityAlignment: 0.10,
            domainCompatibility: 0.10,
            dataQualityScore: 0.10,
            normalizationEffectiveness: 0.10,
            ...weightingScheme
        };
    }

    /**
     * Entry point: computes every component, the adjustment factors, the
     * aggregated confidence and the accompanying explanations.
     *
     * `leftValues` / `rightValues` are accepted and forwarded but are not read
     * by any scoring step in this class.
     *
     * @param {object} [leftContext] Semantic context of the left column; the
     *   code reads `semantic_type`, `confidence`, `domain_specific_tags`.
     * @param {object} [rightContext] Semantic context of the right column.
     * @param {Array} leftValues Sample values of the left column (unused here).
     * @param {Array} rightValues Sample values of the right column (unused here).
     * @param {object} leftStats Column statistics; the code reads `dataType`,
     *   `uniqueCount`, `uniquePercentage`, `nullPercentage`, `avgLength`,
     *   `numericStats`.
     * @param {object} rightStats Column statistics of the right column.
     * @param {Array<object>} matchEvidence Sample matches; each entry is read
     *   for `matchType`, `similarity`, `leftValue`, `rightValue`.
     * @param {object} [cidCompatibility] Optional `{ left, right }` arrays of
     *   entries exposing `concept.cid` / `concept.parent_cid`.
     * @returns {object} `{ overall, components, factors, confidence_level,
     *   reliability, explanation, warnings, recommendations }`.
     */
    calculateMatchConfidence(leftContext, rightContext, leftValues, rightValues, leftStats, rightStats, matchEvidence, cidCompatibility) {
        const components = this.calculateConfidenceComponents(leftContext, rightContext, leftValues, rightValues, leftStats, rightStats, matchEvidence, cidCompatibility);
        const factors = this.generateConfidenceFactors(leftContext, rightContext, leftStats, rightStats, matchEvidence, components);
        const overall = this.aggregateOverallConfidence(components, factors);
        const reliability = this.calculateReliability(components, factors, matchEvidence);
        const { explanation, warnings, recommendations } = this.generateExplanations(components, factors, overall, matchEvidence);
        return {
            overall,
            components,
            factors,
            confidence_level: this.classifyConfidenceLevel(overall),
            reliability,
            explanation,
            warnings,
            recommendations
        };
    }

    /**
     * Computes the seven component scores keyed by the same names used in
     * `weightingScheme`.
     *
     * @returns {object} One numeric score per component.
     */
    calculateConfidenceComponents(leftContext, rightContext, leftValues, rightValues, leftStats, rightStats, matchEvidence, cidCompatibility) {
        return {
            semanticTypeMatch: this.calculateSemanticTypeMatch(leftContext, rightContext),
            statisticalSimilarity: this.calculateStatisticalSimilarity(leftStats, rightStats),
            valuePatternMatch: this.calculateValuePatternMatch(matchEvidence),
            cardinalityAlignment: this.calculateCardinalityAlignment(leftStats, rightStats),
            domainCompatibility: this.calculateDomainCompatibility(leftContext, rightContext, cidCompatibility),
            dataQualityScore: this.calculateDataQualityScore(leftStats, rightStats),
            normalizationEffectiveness: this.calculateNormalizationEffectiveness(matchEvidence)
        };
    }

    /**
     * Scores agreement between the two columns' semantic types.
     *
     * Order of checks: missing context (0.3), identical type (boosted average
     * confidence, capped at 0.95), compatible type (matrix lookup), shared
     * domain tags (Jaccard > 0.3), otherwise a floor derived from the
     * contexts' confidences (never below 0.1).
     *
     * @returns {number} Semantic match score.
     */
    calculateSemanticTypeMatch(leftContext, rightContext) {
        if (!leftContext || !rightContext) {
            return 0.3; // Neutral score when no semantic context
        }
        const leftType = leftContext.semantic_type;
        const rightType = rightContext.semantic_type;
        const confidenceSum = leftContext.confidence + rightContext.confidence;
        // Exact semantic type match: boost by 10%, capped at 0.95.
        if (leftType === rightType) {
            return Math.min(0.95, (confidenceSum / 2) * 1.1);
        }
        // Related-but-different types are discounted by the matrix value and 0.8.
        const compatibility = this.getSemanticTypeCompatibility(leftType, rightType);
        if (compatibility > 0) {
            return (confidenceSum / 2) * compatibility * 0.8;
        }
        // Fall back to overlap of domain tags.
        const domainSimilarity = this.calculateDomainSimilarity(leftContext.domain_specific_tags, rightContext.domain_specific_tags);
        if (domainSimilarity > 0.3) {
            return domainSimilarity * 0.6;
        }
        // Incompatible semantic types
        return Math.max(0.1, confidenceSum / 4);
    }

    /**
     * Ratio of the smaller to the larger unique count.
     *
     * Returns 0 when the larger count is 0 so that two empty columns yield a
     * finite score instead of the NaN produced by 0 / 0.
     *
     * @param {number} leftCount Unique-value count of the left column.
     * @param {number} rightCount Unique-value count of the right column.
     * @returns {number} Ratio in [0, 1] for non-negative counts.
     */
    calculateCardinalityRatio(leftCount, rightCount) {
        const larger = Math.max(leftCount, rightCount);
        return larger > 0 ? Math.min(leftCount, rightCount) / larger : 0;
    }

    /**
     * Weighted similarity of the two columns' summary statistics.
     *
     * Contributions: data type (0.3 equal / 0.15 compatible), cardinality
     * ratio (x0.25), null-percentage closeness (x0.15), average length ratio
     * (x0.2, only when both are truthy) and numeric range overlap (x0.1, only
     * when both sides have `numericStats`).
     *
     * @returns {number} Sum of the applicable contributions.
     */
    calculateStatisticalSimilarity(leftStats, rightStats) {
        let similarity = 0;
        // Data type similarity
        if (leftStats.dataType === rightStats.dataType) {
            similarity += 0.3;
        }
        else if (this.areDataTypesCompatible(leftStats.dataType, rightStats.dataType)) {
            similarity += 0.15;
        }
        // Cardinality similarity (guarded against 0 / 0).
        similarity += this.calculateCardinalityRatio(leftStats.uniqueCount, rightStats.uniqueCount) * 0.25;
        // Null percentage similarity; percentages are divided by 100 here.
        const nullDiff = Math.abs(leftStats.nullPercentage - rightStats.nullPercentage) / 100;
        similarity += (1 - nullDiff) * 0.15;
        // Length similarity; skipped when either average length is missing or 0.
        if (leftStats.avgLength && rightStats.avgLength) {
            const shorter = Math.min(leftStats.avgLength, rightStats.avgLength);
            const longer = Math.max(leftStats.avgLength, rightStats.avgLength);
            similarity += (shorter / longer) * 0.2;
        }
        // Numeric statistics similarity
        if (leftStats.numericStats && rightStats.numericStats) {
            similarity += this.calculateNumericRangeOverlap(leftStats.numericStats, rightStats.numericStats) * 0.1;
        }
        return similarity;
    }

    /**
     * Average evidence similarity scaled by a match-type quality score
     * (exact = 1.0, normalized = 0.8, fuzzy = 0.6, anything else = 0).
     *
     * @param {Array<object>} matchEvidence Sample matches.
     * @returns {number} 0 when there is no evidence.
     */
    calculateValuePatternMatch(matchEvidence) {
        const total = matchEvidence.length;
        if (total === 0) {
            return 0;
        }
        const counts = { exact: 0, normalized: 0, fuzzy: 0 };
        let similaritySum = 0;
        for (const { similarity, matchType } of matchEvidence) {
            similaritySum += similarity;
            // Only the three known match types are tallied.
            if (matchType === 'exact' || matchType === 'normalized' || matchType === 'fuzzy') {
                counts[matchType]++;
            }
        }
        // Boost exact matches, moderate normalized matches, lower fuzzy matches
        const matchTypeScore = (counts.exact * 1.0 + counts.normalized * 0.8 + counts.fuzzy * 0.6) / total;
        return (similaritySum / total) * matchTypeScore;
    }

    /**
     * Scores how well the unique counts line up.
     *
     * Ratios below 0.1 are halved, ratios above 0.8 are boosted by 10% (capped
     * at 0.95), everything else is returned unchanged.
     *
     * @returns {number} 0 when either column has no unique values.
     */
    calculateCardinalityAlignment(leftStats, rightStats) {
        const leftCardinality = leftStats.uniqueCount;
        const rightCardinality = rightStats.uniqueCount;
        if (leftCardinality === 0 || rightCardinality === 0) {
            return 0;
        }
        const ratio = Math.min(leftCardinality, rightCardinality) / Math.max(leftCardinality, rightCardinality);
        if (ratio < 0.1) {
            return ratio * 0.5; // Penalize extreme mismatches
        }
        if (ratio > 0.8) {
            return Math.min(0.95, ratio * 1.1); // Reward good alignment
        }
        return ratio;
    }

    /**
     * Domain compatibility, starting from a neutral 0.5 and only ever raised:
     * shared CIDs lift it to at least 0.8, related CIDs to their relatedness,
     * and overlapping domain tags to their Jaccard similarity.
     *
     * @returns {number} Score of at least 0.5.
     */
    calculateDomainCompatibility(leftContext, rightContext, cidCompatibility) {
        let domainScore = 0.5; // Default neutral score
        if (cidCompatibility) {
            const leftCids = new Set(cidCompatibility.left.map(c => c.concept.cid));
            const rightCids = new Set(cidCompatibility.right.map(c => c.concept.cid));
            const hasSharedCid = [...leftCids].some(cid => rightCids.has(cid));
            if (hasSharedCid) {
                domainScore = Math.max(domainScore, 0.8);
            }
            else if (leftCids.size > 0 && rightCids.size > 0) {
                // No shared concept: look for parent/child or sibling concepts.
                const relatedness = this.calculateCIDRelatedness(cidCompatibility.left, cidCompatibility.right);
                domainScore = Math.max(domainScore, relatedness);
            }
        }
        if (leftContext && rightContext) {
            const domainSimilarity = this.calculateDomainSimilarity(leftContext.domain_specific_tags, rightContext.domain_specific_tags);
            domainScore = Math.max(domainScore, domainSimilarity);
        }
        return domainScore;
    }

    /**
     * Data quality multiplier built from null rates and uniqueness.
     *
     * @returns {number} Score floored at 0.1.
     */
    calculateDataQualityScore(leftStats, rightStats) {
        // Penalize high null rates; the null multiplier itself is floored at 0.2.
        const avgNullRate = (leftStats.nullPercentage + rightStats.nullPercentage) / 2;
        let qualityScore = 1.0 * Math.max(0.2, 1 - avgNullRate / 100);
        const leftUniqueness = leftStats.uniquePercentage;
        const rightUniqueness = rightStats.uniquePercentage;
        // Penalize very low uniqueness (likely not useful for joining)
        if (leftUniqueness < 5 || rightUniqueness < 5) {
            qualityScore *= 0.5;
        }
        // Penalize extremely high uniqueness (might indicate identifiers that won't match)
        if (leftUniqueness > 99 && rightUniqueness > 99) {
            qualityScore *= 0.7;
        }
        return Math.max(0.1, qualityScore);
    }

    /**
     * Estimates how much normalization improved on raw string similarity.
     *
     * For every `normalized` / `fuzzy` evidence entry the raw values are
     * compared via edit-distance similarity; entries whose recorded
     * `similarity` exceeds that raw value count as improvements.
     *
     * @param {Array<object>} matchEvidence Sample matches.
     * @returns {number} 0.5 with no evidence; 0.8 / 0.5 when nothing improved
     *   (depending on whether exact matches exist); otherwise
     *   `avgImprovement * improvementRate + 0.3`, capped at 0.95.
     */
    calculateNormalizationEffectiveness(matchEvidence) {
        if (matchEvidence.length === 0) {
            return 0.5;
        }
        let improvementCount = 0;
        let totalImprovement = 0;
        for (const evidence of matchEvidence) {
            if (evidence.matchType !== 'normalized' && evidence.matchType !== 'fuzzy') {
                continue;
            }
            // `??` (not `||`) so that real values such as 0 or false are
            // stringified instead of being replaced by the empty string.
            const rawSimilarity = this.calculateStringSimilarity(String(evidence.leftValue ?? ''), String(evidence.rightValue ?? ''));
            const gain = evidence.similarity - rawSimilarity;
            if (evidence.similarity > rawSimilarity) {
                improvementCount++;
                totalImprovement += gain;
            }
        }
        if (improvementCount === 0) {
            // If no normalization was beneficial, but we have exact matches, that's still good
            const hasExactMatch = matchEvidence.some(e => e.matchType === 'exact');
            return hasExactMatch ? 0.8 : 0.5;
        }
        const avgImprovement = totalImprovement / improvementCount;
        const improvementRate = improvementCount / matchEvidence.length;
        return Math.min(0.95, avgImprovement * improvementRate + 0.3);
    }

    /**
     * Builds the list of additive adjustment factors. Each factor carries
     * `factor`, `impact`, `weight`, `evidence` and `confidence`; the product
     * `impact * weight * confidence` is later added to the base confidence.
     *
     * @returns {Array<object>} Possibly empty list of factors.
     */
    generateConfidenceFactors(leftContext, rightContext, leftStats, rightStats, matchEvidence, components) {
        const factors = [];
        // Semantic type factor
        if (leftContext && rightContext) {
            if (leftContext.semantic_type === rightContext.semantic_type) {
                factors.push({
                    factor: 'semantic_type_exact_match',
                    impact: 0.3,
                    weight: 0.9,
                    evidence: `Both columns have semantic type: ${leftContext.semantic_type}`,
                    confidence: Math.min(leftContext.confidence, rightContext.confidence)
                });
            }
            else {
                factors.push({
                    factor: 'semantic_type_mismatch',
                    impact: -0.2,
                    weight: 0.7,
                    evidence: `Left: ${leftContext.semantic_type}, Right: ${rightContext.semantic_type}`,
                    confidence: 0.8
                });
            }
        }
        // Data quality factor
        const avgNullRate = (leftStats.nullPercentage + rightStats.nullPercentage) / 2;
        if (avgNullRate > 20) {
            factors.push({
                factor: 'high_null_rate',
                impact: -0.3,
                weight: 0.8,
                evidence: `Average null rate: ${avgNullRate.toFixed(1)}%`,
                confidence: 0.9
            });
        }
        // Match quality factor
        if (matchEvidence.length > 0) {
            const exactMatches = matchEvidence.filter(e => e.matchType === 'exact').length;
            const exactMatchRate = exactMatches / matchEvidence.length;
            if (exactMatchRate > 0.8) {
                factors.push({
                    factor: 'high_exact_match_rate',
                    impact: 0.25,
                    weight: 0.9,
                    evidence: `${(exactMatchRate * 100).toFixed(1)}% exact matches`,
                    confidence: 0.95
                });
            }
            else if (exactMatchRate < 0.2) {
                factors.push({
                    factor: 'low_exact_match_rate',
                    impact: -0.15,
                    weight: 0.7,
                    evidence: `Only ${(exactMatchRate * 100).toFixed(1)}% exact matches`,
                    confidence: 0.85
                });
            }
        }
        // Cardinality factor. Two columns that both report zero unique values
        // are not a "mismatch" with each other, so no factor is emitted for them
        // (this also avoids evaluating 0 / 0).
        const largestUniqueCount = Math.max(leftStats.uniqueCount, rightStats.uniqueCount);
        const cardinalityRatio = this.calculateCardinalityRatio(leftStats.uniqueCount, rightStats.uniqueCount);
        if (largestUniqueCount > 0 && cardinalityRatio < 0.1) {
            factors.push({
                factor: 'extreme_cardinality_mismatch',
                impact: -0.4,
                weight: 0.8,
                evidence: `Cardinality ratio: ${cardinalityRatio.toFixed(3)}`,
                confidence: 0.9
            });
        }
        return factors;
    }

    /**
     * Weighted sum of the component scores plus every factor's
     * `impact * weight * confidence`, clamped to [0, 1].
     *
     * @returns {number} Overall confidence.
     */
    aggregateOverallConfidence(components, factors) {
        let confidence = 0;
        for (const [component, score] of Object.entries(components)) {
            // Components without a configured weight contribute nothing.
            const weight = this.weightingScheme[component] || 0;
            confidence += score * weight;
        }
        for (const { impact, weight, confidence: factorConfidence } of factors) {
            confidence += impact * weight * factorConfidence;
        }
        return Math.max(0, Math.min(1, confidence));
    }

    /**
     * Reliability of the estimate: starts at 0.5 and is raised by the amount
     * of evidence, by agreement between components (low standard deviation)
     * and by the number of factors with confidence above 0.8.
     *
     * @returns {number} Reliability clamped to [0, 1].
     */
    calculateReliability(components, factors, matchEvidence) {
        let reliability = 0.5;
        // Evidence quantity boosts reliability
        if (matchEvidence.length > 10) {
            reliability += 0.2;
        }
        else if (matchEvidence.length > 5) {
            reliability += 0.1;
        }
        // Consistent components boost reliability (up to +0.3).
        const spread = this.calculateStandardDeviation(Object.values(components));
        reliability += Math.max(0, 0.3 - spread);
        // High-confidence factors boost reliability
        const highConfidenceFactors = factors.filter(f => f.confidence > 0.8).length;
        reliability += highConfidenceFactors * 0.05;
        return Math.max(0, Math.min(1, reliability));
    }

    /**
     * Maps a confidence value to a label using the thresholds
     * 0.9 / 0.75 / 0.5 / 0.25.
     *
     * @param {number} confidence Confidence value.
     * @returns {string} One of `very_high`, `high`, `medium`, `low`, `very_low`.
     */
    classifyConfidenceLevel(confidence) {
        if (confidence >= 0.9) {
            return 'very_high';
        }
        if (confidence >= 0.75) {
            return 'high';
        }
        if (confidence >= 0.5) {
            return 'medium';
        }
        if (confidence >= 0.25) {
            return 'low';
        }
        return 'very_low';
    }

    /**
     * Produces the textual summary plus warning and recommendation lists
     * triggered by fixed thresholds on the component scores.
     *
     * @returns {{explanation: string, warnings: string[], recommendations: string[]}}
     */
    generateExplanations(components, factors, overall, matchEvidence) {
        const asPercent = (value) => (value * 100).toFixed(1);
        const explanation = `Overall confidence: ${asPercent(overall)}% based on ` +
            `semantic alignment (${asPercent(components.semanticTypeMatch)}%), ` +
            `statistical similarity (${asPercent(components.statisticalSimilarity)}%), ` +
            `and ${matchEvidence.length} sample matches.`;
        const warnings = [];
        const recommendations = [];
        // Generate warnings
        if (components.semanticTypeMatch < 0.3) {
            warnings.push('Semantic types may be incompatible - verify column meanings match');
        }
        if (components.dataQualityScore < 0.5) {
            warnings.push('Data quality issues detected - high null rates or poor uniqueness');
        }
        if (overall < 0.3) {
            warnings.push('Very low confidence - manual verification strongly recommended');
        }
        // Generate recommendations
        if (components.normalizationEffectiveness < 0.6) {
            recommendations.push('Consider using different normalization strategies');
        }
        if (components.cardinalityAlignment < 0.3) {
            recommendations.push('Large cardinality differences - consider pre-filtering or sampling');
        }
        if (matchEvidence.length < 5) {
            recommendations.push('Limited sample evidence - test with larger sample for better assessment');
        }
        return { explanation, warnings, recommendations };
    }

    // Helper methods

    /**
     * Looks up a compatibility score for two different semantic types. The
     * table is consulted in both directions, so the relation is symmetric.
     *
     * @returns {number} Table value, or 0 when the pair is not listed.
     */
    getSemanticTypeCompatibility(type1, type2) {
        const compatibilityMatrix = {
            'identifier': { 'high_cardinality_attribute': 0.7 },
            'email_address': { 'contact_method': 0.8, 'user_identifier': 0.6 },
            'phone_number': { 'contact_method': 0.8 },
            'monetary_value': { 'numeric_value': 0.9, 'quantitative_measure': 0.7 },
            'temporal': { 'datetime': 0.95, 'timestamp': 0.9, 'event_timestamp': 0.8 },
            'display_name': { 'generic_attribute': 0.6, 'categorical_attribute': 0.4 },
            'categorical_attribute': { 'categorical_code': 0.8 }
        };
        return compatibilityMatrix[type1]?.[type2] ||
            compatibilityMatrix[type2]?.[type1] || 0;
    }

    /**
     * Jaccard similarity (|intersection| / |union|) of two tag lists.
     *
     * @param {Array<string>} [tags1] Tags of the first column.
     * @param {Array<string>} [tags2] Tags of the second column.
     * @returns {number} 0 when either list is missing or empty.
     */
    calculateDomainSimilarity(tags1, tags2) {
        // Optional chaining: a context without tags counts as "no overlap"
        // rather than raising a TypeError.
        if (!tags1?.length || !tags2?.length) {
            return 0;
        }
        const set1 = new Set(tags1);
        const set2 = new Set(tags2);
        let shared = 0;
        for (const tag of set1) {
            if (set2.has(tag)) {
                shared++;
            }
        }
        // |A ∪ B| = |A| + |B| - |A ∩ B|
        return shared / (set1.size + set2.size - shared);
    }

    /**
     * Whether two differing data types appear together in one of the
     * compatibility groups.
     *
     * @returns {boolean}
     */
    areDataTypesCompatible(type1, type2) {
        const compatibleGroups = [
            ['numeric', 'mixed'],
            ['string', 'mixed'],
            ['date', 'string']
        ];
        return compatibleGroups.some(group => group.includes(type1) && group.includes(type2));
    }

    /**
     * Overlap of two numeric [min, max] ranges as a fraction of their
     * combined span.
     *
     * Ranges that merely touch, or that have zero width, score 0 because the
     * overlap must have positive length.
     *
     * @returns {number} Fraction in [0, 1].
     */
    calculateNumericRangeOverlap(stats1, stats2) {
        const { min: min1, max: max1 } = stats1;
        const { min: min2, max: max2 } = stats2;
        const overlapStart = Math.max(min1, min2);
        const overlapEnd = Math.min(max1, max2);
        if (overlapStart >= overlapEnd) {
            return 0;
        }
        const totalRange = Math.max(max1, max2) - Math.min(min1, min2);
        return totalRange > 0 ? (overlapEnd - overlapStart) / totalRange : 0;
    }

    /**
     * Relatedness of two CID lists: 0.7 if any pair is parent/child, 0.5 if
     * any pair shares a (truthy) parent, otherwise 0. The best pair wins.
     *
     * @returns {number} 0, 0.5 or 0.7.
     */
    calculateCIDRelatedness(left, right) {
        let maxRelatedness = 0;
        for (const leftCid of left) {
            const leftConcept = leftCid.concept;
            for (const rightCid of right) {
                const rightConcept = rightCid.concept;
                const isParentChild = leftConcept.parent_cid === rightConcept.cid ||
                    rightConcept.parent_cid === leftConcept.cid;
                if (isParentChild) {
                    maxRelatedness = Math.max(maxRelatedness, 0.7);
                }
                else if (leftConcept.parent_cid === rightConcept.parent_cid &&
                    leftConcept.parent_cid) {
                    maxRelatedness = Math.max(maxRelatedness, 0.5);
                }
            }
        }
        return maxRelatedness;
    }

    /**
     * Normalized edit-distance similarity: `(maxLen - distance) / maxLen`.
     *
     * @returns {number} 1 for two empty strings, otherwise a value in [0, 1].
     */
    calculateStringSimilarity(str1, str2) {
        const maxLen = Math.max(str1.length, str2.length);
        if (maxLen === 0) {
            return 1;
        }
        return (maxLen - this.levenshteinDistance(str1, str2)) / maxLen;
    }

    /**
     * Levenshtein edit distance (unit-cost insert, delete, substitute),
     * compared per UTF-16 code unit.
     *
     * Only two rows of the dynamic-programming table are kept at a time, so
     * memory is proportional to `str1.length` rather than to the product of
     * both lengths.
     *
     * @returns {number} Minimum number of single-unit edits.
     */
    levenshteinDistance(str1, str2) {
        // previousRow[i] = distance between str1[0..i) and the str2 prefix
        // processed so far (initially the empty prefix).
        let previousRow = Array.from({ length: str1.length + 1 }, (_, i) => i);
        for (let j = 1; j <= str2.length; j++) {
            const currentRow = [j];
            for (let i = 1; i <= str1.length; i++) {
                const substitutionCost = str1[i - 1] === str2[j - 1] ? 0 : 1;
                currentRow[i] = Math.min(currentRow[i - 1] + 1, previousRow[i] + 1, previousRow[i - 1] + substitutionCost);
            }
            previousRow = currentRow;
        }
        return previousRow[str1.length];
    }

    /**
     * Population standard deviation (divides by `values.length`).
     *
     * @param {number[]} values Numbers to summarize.
     * @returns {number} 0 for an empty list.
     */
    calculateStandardDeviation(values) {
        if (values.length === 0) {
            return 0;
        }
        const mean = values.reduce((a, b) => a + b, 0) / values.length;
        const variance = values.reduce((acc, val) => acc + Math.pow(val - mean, 2), 0) / values.length;
        return Math.sqrt(variance);
    }

    // Public API for calibration

    /**
     * Records one observed outcome for a join type. A prediction counts as
     * "positive" when `predictedConfidence > 0.5`.
     *
     * @param {*} joinType Key under which the counts are stored.
     * @param {boolean} actualOutcome Whether the join turned out to be valid.
     * @param {number} predictedConfidence Confidence that was predicted.
     */
    updateCalibration(joinType, actualOutcome, predictedConfidence) {
        if (!this.calibrationData.has(joinType)) {
            this.calibrationData.set(joinType, {
                truePositives: 0,
                falsePositives: 0,
                trueNegatives: 0,
                falseNegatives: 0,
                precisionByConfidenceLevel: {},
                recallByConfidenceLevel: {}
            });
        }
        const data = this.calibrationData.get(joinType);
        const predictedPositive = predictedConfidence > 0.5;
        if (actualOutcome) {
            if (predictedPositive) {
                data.truePositives++;
            }
            else {
                data.falseNegatives++;
            }
        }
        else if (predictedPositive) {
            data.falsePositives++;
        }
        else {
            data.trueNegatives++;
        }
    }

    /**
     * Scales a raw confidence by the observed precision for the join type.
     *
     * @param {*} joinType Key used with updateCalibration().
     * @param {number} rawConfidence Uncalibrated confidence.
     * @returns {number} `rawConfidence` unchanged when the join type has no
     *   calibration record; `min(1, rawConfidence)` when no positive
     *   prediction has been recorded yet; otherwise
     *   `min(1, rawConfidence * precision)`.
     */
    getCalibratedConfidence(joinType, rawConfidence) {
        const data = this.calibrationData.get(joinType);
        if (!data) {
            return rawConfidence;
        }
        const positivePredictions = data.truePositives + data.falsePositives;
        // Precision is undefined without positive predictions; only then is
        // the neutral factor 1 used. A measured precision of exactly 0 is a
        // real signal and must not be replaced by 1.
        const calibrationFactor = positivePredictions > 0
            ? data.truePositives / positivePredictions
            : 1;
        return Math.min(1, rawConfidence * calibrationFactor);
    }
}
// Publish the class as the named CommonJS export `JoinConfidenceCalculator`.
exports.JoinConfidenceCalculator = JoinConfidenceCalculator;
//# sourceMappingURL=join-confidence.js.map