semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
430 lines • 16.7 kB
JavaScript
;
// Flag this CommonJS module with `__esModule` (the marker transpilers emit for
// modules that were authored as ES modules).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named export; the real value is assigned after the class
// definition at the bottom of this file.
exports.SuggestionEngine = void 0;
/**
 * Produces semantic-mapping suggestions for newly added columns, reports
 * schema drift, and turns accepted suggestions into anchors.
 *
 * The engine only relies on two methods of the injected store:
 * `getAllAnchors()` and `saveAnchor(anchor)`.
 */
class SuggestionEngine {
    /**
     * Ordered column-name rules used by `inferSemanticTypeFromPattern`.
     * The first rule whose pattern matches the lower-cased column name wins.
     */
    static NAME_PATTERN_RULES = Object.freeze([
        {
            pattern: /^(user_?)?id$/,
            type: 'identity.user_id',
            confidence: 0.9,
            evidence: ['Column name matches user ID pattern']
        },
        {
            pattern: /^(customer|cust)_?id$/,
            type: 'identity.customer_id',
            confidence: 0.9,
            evidence: ['Column name matches customer ID pattern']
        },
        {
            pattern: /^email$/,
            type: 'identity.email',
            confidence: 0.95,
            evidence: ['Column name is "email"']
        },
        {
            pattern: /^(created_at|timestamp|date_created)$/,
            type: 'temporal.created_at',
            confidence: 0.85,
            evidence: ['Column name indicates creation timestamp']
        },
        {
            pattern: /^(amount|price|cost|total)$/,
            type: 'money.amount',
            confidence: 0.8,
            evidence: ['Column name indicates monetary value']
        },
        {
            pattern: /^(name|title|label)$/,
            type: 'text.name',
            confidence: 0.7,
            evidence: ['Column name indicates text label']
        },
        {
            pattern: /^(phone|telephone)$/,
            type: 'contact.phone',
            confidence: 0.85,
            evidence: ['Column name indicates phone number']
        },
        {
            pattern: /^(address|addr)$/,
            type: 'location.address',
            confidence: 0.8,
            evidence: ['Column name indicates address']
        }
    ]);

    /** Store used to read existing anchors and persist accepted ones. */
    anchorStore;

    /** Suggestions returned by `generateSuggestions`, keyed by suggestion id, awaiting acceptance. */
    pendingSuggestions = new Map();

    /**
     * @param {object} anchorStore - Must provide `getAllAnchors()` and `saveAnchor(anchor)`.
     */
    constructor(anchorStore) {
        this.anchorStore = anchorStore;
    }

    /**
     * Runs the full suggestion pipeline for one analysis result and registers
     * every produced mapping in `pendingSuggestions`.
     *
     * @param {object} analysis - Reads `schemaChanges`, `dataFileChanges` and `riskLevel`.
     * @returns {Promise<object>} `{ newMappings, driftDetections, healthMetrics, acceptAllId, processingTimeMs }`.
     */
    async generateSuggestions(analysis) {
        const startTime = Date.now();
        const newMappings = await this.generateSemanticMappings(analysis);
        const driftDetections = await this.detectDrift(analysis);
        const healthMetrics = await this.calculateHealthMetrics(analysis, newMappings);
        const acceptAllId = this.generateAcceptAllId(newMappings);
        // Remember the suggestions so they can later be accepted by id.
        for (const mapping of newMappings) {
            this.pendingSuggestions.set(mapping.id, mapping);
        }
        return {
            newMappings,
            driftDetections,
            healthMetrics,
            acceptAllId,
            processingTimeMs: Date.now() - startTime
        };
    }

    /**
     * Collects mapping suggestions for added columns and added data files.
     *
     * @param {object} analysis - Reads `schemaChanges` and `dataFileChanges`.
     * @returns {Promise<object[]>} Suggested mappings (possibly empty).
     */
    async generateSemanticMappings(analysis) {
        const mappings = [];
        // New columns reported by schema changes.
        for (const schemaChange of analysis.schemaChanges) {
            if (schemaChange.type !== 'column_added') {
                continue;
            }
            const mapping = await this.suggestMappingForColumn(schemaChange.table, schemaChange.column, schemaChange.after);
            if (mapping) {
                mappings.push(mapping);
            }
        }
        // Newly added data files that carry a patch.
        for (const fileChange of analysis.dataFileChanges) {
            if (fileChange.status === 'added' && fileChange.patch) {
                const fileMappings = await this.suggestMappingsForDataFile(fileChange);
                mappings.push(...fileMappings);
            }
        }
        return mappings;
    }

    /**
     * Suggests a mapping for a single column, preferring a similar existing
     * anchor and falling back to name/type pattern inference.
     *
     * @param {string} table - Dataset/table name.
     * @param {string} column - Column name.
     * @param {object} [metadata] - Optional `{ type, constraints }`.
     * @returns {Promise<object|null>} The suggestion, or `null` when nothing is confident enough.
     */
    async suggestMappingForColumn(table, column, metadata) {
        const columnInfo = {
            name: column,
            table: table,
            type: metadata?.type || 'unknown',
            constraints: metadata?.constraints || []
        };
        const similarAnchors = await this.findSimilarAnchors(columnInfo);
        if (similarAnchors.length > 0) {
            const bestMatch = similarAnchors[0];
            const semanticType = await this.inferSemanticType(columnInfo, bestMatch.anchor);
            // One id for both the mapping and its URL; otherwise the URL would
            // reference a suggestion that was never registered.
            const suggestionId = this.generateSuggestionId();
            return {
                id: suggestionId,
                column: column,
                dataset: table,
                semantic_type: semanticType,
                confidence: bestMatch.confidence,
                anchor_id: bestMatch.anchor.anchor_id,
                evidence: this.generateEvidence(columnInfo, bestMatch),
                quick_accept_url: this.generateQuickAcceptUrl(suggestionId)
            };
        }
        // Fallback to pattern-based inference.
        const inferredType = this.inferSemanticTypeFromPattern(columnInfo);
        if (inferredType.confidence > 0.5) {
            const suggestionId = this.generateSuggestionId();
            return {
                id: suggestionId,
                column: column,
                dataset: table,
                semantic_type: inferredType.type,
                confidence: inferredType.confidence,
                evidence: inferredType.evidence,
                quick_accept_url: this.generateQuickAcceptUrl(suggestionId)
            };
        }
        return null;
    }

    /**
     * Scores every stored anchor against the column and keeps those scoring
     * above 0.3, best first.
     *
     * @param {object} columnInfo - `{ name, table, type, constraints }`.
     * @returns {Promise<Array<{anchor: object, confidence: number}>>}
     */
    async findSimilarAnchors(columnInfo) {
        const allAnchors = await this.anchorStore.getAllAnchors();
        const scores = [];
        for (const anchor of allAnchors) {
            const confidence = this.calculateSimilarityScore(columnInfo, anchor);
            if (confidence > 0.3) {
                scores.push({ anchor, confidence });
            }
        }
        return scores.sort((a, b) => b.confidence - a.confidence);
    }

    /**
     * Reads an anchor's fingerprint without throwing.
     *
     * @param {object} anchor - Reads `fingerprint` (JSON string or object) and `anchor_id`.
     * @returns {object} The parsed fingerprint, or `{}` when it is missing or unusable.
     */
    parseFingerprint(anchor) {
        const raw = anchor.fingerprint;
        if (raw !== null && typeof raw === 'object') {
            return raw;
        }
        if (typeof raw !== 'string' || raw === '') {
            return {};
        }
        try {
            const parsed = JSON.parse(raw);
            return parsed !== null && typeof parsed === 'object' ? parsed : {};
        }
        catch (error) {
            // A single corrupt anchor must not abort suggestion generation;
            // score it on the column name alone.
            console.warn(`Ignoring unparseable fingerprint for anchor "${anchor.anchor_id}":`, error.message);
            return {};
        }
    }

    /**
     * Weighted similarity between a column and an anchor: name (0.4), data
     * type (0.3, when both sides have one) and regex patterns (0.3, when the
     * fingerprint has any). The result is normalised by the weights used.
     *
     * @param {object} columnInfo - `{ name, type, ... }`.
     * @param {object} anchor - Reads `column_name` and `fingerprint`.
     * @returns {number} Score in the range 0..1.
     */
    calculateSimilarityScore(columnInfo, anchor) {
        let score = 0;
        let factors = 0;
        // Name similarity (Levenshtein based).
        const nameSimilarity = this.calculateNameSimilarity(columnInfo.name, anchor.column_name);
        score += nameSimilarity * 0.4;
        factors += 0.4;
        const fingerprint = this.parseFingerprint(anchor);
        // Type similarity.
        if (fingerprint.dtype && columnInfo.type) {
            const typeSimilarity = this.calculateTypeSimilarity(columnInfo.type, fingerprint.dtype);
            score += typeSimilarity * 0.3;
            factors += 0.3;
        }
        // Pattern similarity (if available).
        if (Array.isArray(fingerprint.regex_patterns) && fingerprint.regex_patterns.length > 0) {
            const patternScore = this.calculatePatternSimilarity(columnInfo, fingerprint.regex_patterns);
            score += patternScore * 0.3;
            factors += 0.3;
        }
        return factors > 0 ? score / factors : 0;
    }

    /**
     * Normalised edit-distance similarity of two names (case-sensitive).
     *
     * @param {string} name1
     * @param {string} name2
     * @returns {number} 1.0 for identical (or both empty) names, lower for more edits.
     */
    calculateNameSimilarity(name1, name2) {
        const longer = name1.length > name2.length ? name1 : name2;
        const shorter = name1.length > name2.length ? name2 : name1;
        if (longer.length === 0) {
            return 1.0;
        }
        const distance = this.levenshteinDistance(longer, shorter);
        return (longer.length - distance) / longer.length;
    }

    /**
     * Levenshtein edit distance computed with a full (len2+1) x (len1+1) matrix.
     *
     * @param {string} str1
     * @param {string} str2
     * @returns {number} Minimum number of single-character edits.
     */
    levenshteinDistance(str1, str2) {
        const matrix = [];
        // First column: cost of deleting i characters of str2.
        for (let i = 0; i <= str2.length; i++) {
            matrix[i] = [i];
        }
        // First row: cost of inserting j characters of str1.
        for (let j = 0; j <= str1.length; j++) {
            matrix[0][j] = j;
        }
        for (let i = 1; i <= str2.length; i++) {
            for (let j = 1; j <= str1.length; j++) {
                if (str2.charAt(i - 1) === str1.charAt(j - 1)) {
                    matrix[i][j] = matrix[i - 1][j - 1];
                }
                else {
                    matrix[i][j] = Math.min(
                        matrix[i - 1][j - 1] + 1, // substitution
                        matrix[i][j - 1] + 1, // insertion
                        matrix[i - 1][j] + 1 // deletion
                    );
                }
            }
        }
        return matrix[str2.length][str1.length];
    }

    /**
     * Compares two type names after normalisation.
     *
     * @param {string} type1
     * @param {string} type2
     * @returns {number} 1.0 when equal, 0.8 when in the same type family, otherwise 0.
     */
    calculateTypeSimilarity(type1, type2) {
        const normalized1 = this.normalizeType(type1);
        const normalized2 = this.normalizeType(type2);
        if (normalized1 === normalized2) {
            return 1.0;
        }
        // Families of types treated as "similar".
        const similarities = {
            'integer': ['int', 'bigint', 'smallint', 'int64'],
            'string': ['text', 'varchar', 'char', 'str'],
            'float': ['double', 'decimal', 'numeric', 'float64'],
            'date': ['datetime', 'timestamp', 'time']
        };
        for (const [baseType, variants] of Object.entries(similarities)) {
            if ((normalized1 === baseType && variants.includes(normalized2)) ||
                (normalized2 === baseType && variants.includes(normalized1)) ||
                (variants.includes(normalized1) && variants.includes(normalized2))) {
                return 0.8;
            }
        }
        return 0;
    }

    /**
     * Lower-cases a type name and strips every non-word character.
     *
     * @param {string} type
     * @returns {string}
     */
    normalizeType(type) {
        return type.toLowerCase().replace(/[^\w]/g, '');
    }

    /**
     * Tests the lower-cased column name against the anchor's regex patterns
     * (sample data is not available here, so the name is used instead).
     *
     * @param {object} columnInfo - Reads `name`.
     * @param {string[]} patterns - Regex sources; invalid ones are skipped.
     * @returns {number} 0.7 when any pattern matches, otherwise 0.
     */
    calculatePatternSimilarity(columnInfo, patterns) {
        const columnName = columnInfo.name.toLowerCase();
        for (const pattern of patterns) {
            try {
                const regex = new RegExp(pattern, 'i');
                if (regex.test(columnName)) {
                    return 0.7;
                }
            }
            catch {
                // Invalid regex source: deliberately skipped (best effort).
            }
        }
        return 0;
    }

    /**
     * Picks the semantic type for a column matched to an anchor.
     *
     * @param {object} columnInfo
     * @param {object} anchor - Reads `mapped_cid`.
     * @returns {Promise<string>} The anchor's `mapped_cid` when set, otherwise the pattern-inferred type.
     */
    async inferSemanticType(columnInfo, anchor) {
        if (anchor.mapped_cid) {
            return anchor.mapped_cid;
        }
        return this.inferSemanticTypeFromPattern(columnInfo).type;
    }

    /**
     * Infers a semantic type from the column name, then from the column type.
     *
     * @param {object} columnInfo - Reads `name` and optional `type`.
     * @returns {{type: string, confidence: number, evidence: string[]}}
     */
    inferSemanticTypeFromPattern(columnInfo) {
        const name = columnInfo.name.toLowerCase();
        const type = columnInfo.type?.toLowerCase() || '';
        const rule = SuggestionEngine.NAME_PATTERN_RULES.find((candidate) => candidate.pattern.test(name));
        if (rule) {
            return {
                type: rule.type,
                confidence: rule.confidence,
                // Copy so callers cannot mutate the shared rule table.
                evidence: [...rule.evidence]
            };
        }
        // Type-based inference ('bigint' also contains 'int').
        if (type.includes('int')) {
            return {
                type: 'numeric.integer',
                confidence: 0.6,
                evidence: ['Column type is integer']
            };
        }
        if (type.includes('varchar') || type.includes('text')) {
            // Note: 0.5 does not pass the `> 0.5` threshold used by
            // suggestMappingForColumn, so this alone never yields a suggestion.
            return {
                type: 'text.string',
                confidence: 0.5,
                evidence: ['Column type is text']
            };
        }
        return {
            type: 'unknown',
            confidence: 0,
            evidence: ['No pattern matched']
        };
    }

    /**
     * Placeholder: analysing file contents (CSV headers, JSON structure) is
     * not implemented, so no mappings are produced.
     *
     * @param {object} fileChange - Currently unused.
     * @returns {Promise<object[]>} Always an empty array.
     */
    async suggestMappingsForDataFile(fileChange) {
        return [];
    }

    /**
     * Reports drift for type changes (medium severity) and removed columns
     * (high severity).
     *
     * @param {object} analysis - Reads `schemaChanges`.
     * @returns {Promise<object[]>} Drift entries.
     */
    async detectDrift(analysis) {
        const drifts = [];
        for (const schemaChange of analysis.schemaChanges) {
            if (schemaChange.type === 'type_changed') {
                drifts.push({
                    column: schemaChange.column,
                    dataset: schemaChange.table,
                    description: `Type changed from ${schemaChange.before?.type || 'unknown'} to ${schemaChange.after?.type || 'unknown'}`,
                    severity: 'medium',
                    suggested_action: 'Validate existing semantic mappings',
                    file: `${schemaChange.table}.yml`
                });
            }
            if (schemaChange.type === 'column_removed') {
                drifts.push({
                    column: schemaChange.column,
                    dataset: schemaChange.table,
                    description: `Column removed from schema`,
                    severity: 'high',
                    suggested_action: 'Remove semantic mapping',
                    file: `${schemaChange.table}.yml`
                });
            }
        }
        return drifts;
    }

    /**
     * Summarises coverage and quality of the new mappings.
     *
     * @param {object} analysis - Reads `schemaChanges` and `riskLevel`.
     * @param {object[]} newMappings - Suggestions with a numeric `confidence`.
     * @returns {Promise<object>} `{ coverage, driftRisk, qualityScore, mappedColumns, totalColumns }`.
     */
    async calculateHealthMetrics(analysis, newMappings) {
        // Only added columns count towards the total.
        const totalColumns = analysis.schemaChanges.filter(c => c.type === 'column_added').length;
        const mappedColumns = newMappings.length;
        const coverage = totalColumns > 0 ? mappedColumns / totalColumns : 1;
        const driftRisk = analysis.riskLevel;
        // Quality score is the mean confidence as a percentage (100 when there are no mappings).
        const avgConfidence = newMappings.length > 0
            ? newMappings.reduce((sum, m) => sum + m.confidence, 0) / newMappings.length
            : 1;
        const qualityScore = avgConfidence * 100;
        return {
            coverage,
            driftRisk,
            qualityScore,
            mappedColumns,
            totalColumns
        };
    }

    /**
     * Builds human-readable evidence for an anchor-based match.
     *
     * @param {object} columnInfo - Reads `name`.
     * @param {{anchor: object, confidence: number}} match
     * @returns {string[]}
     */
    generateEvidence(columnInfo, match) {
        const evidence = [];
        if (match.confidence > 0.8) {
            evidence.push(`High similarity to existing anchor "${match.anchor.anchor_id}"`);
        }
        if (columnInfo.name === match.anchor.column_name) {
            evidence.push('Exact column name match');
        }
        else {
            evidence.push(`Similar column name to "${match.anchor.column_name}"`);
        }
        return evidence;
    }

    /**
     * @returns {string} A new id of the form `sugg_<ms timestamp>_<random base-36 suffix>`.
     */
    generateSuggestionId() {
        return `sugg_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
    }

    /**
     * @returns {string} A fallback anchor id of the form `sca_<ms timestamp>_<random base-36 suffix>`.
     *   The random suffix keeps ids distinct when several anchors are created
     *   within the same millisecond.
     */
    generateAnchorId() {
        return `sca_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
    }

    /**
     * Derives the "accept all" id from the sorted mapping ids.
     *
     * NOTE(review): only the first 10 base64 characters are kept, which cover
     * little more than the shared `sugg_` prefix of the ids, so this value is
     * unlikely to distinguish batches. `acceptAllSuggestions` does not depend on it.
     *
     * @param {object[]} mappings - Reads `id`.
     * @returns {string}
     */
    generateAcceptAllId(mappings) {
        const ids = mappings.map(m => m.id).sort().join(',');
        return `accept_all_${Buffer.from(ids).toString('base64').slice(0, 10)}`;
    }

    /**
     * @param {string} suggestionId
     * @returns {string} URL carrying the suggestion id as the `suggestion` query parameter.
     */
    generateQuickAcceptUrl(suggestionId) {
        return `https://github.com/owner/repo/actions/runs/accept?suggestion=${suggestionId}`;
    }

    /**
     * Builds the anchor record persisted when a suggestion is accepted.
     *
     * NOTE(review): when the suggestion carries an `anchor_id` (anchor-based
     * match) that id is reused; whether the store then replaces the existing
     * anchor depends on `saveAnchor` - confirm this is intended.
     *
     * @param {object} suggestion - Reads `dataset`, `column`, `anchor_id`, `semantic_type`, `confidence`.
     * @returns {object} Anchor with a placeholder fingerprint.
     */
    buildAnchorFromSuggestion(suggestion) {
        const seenAt = new Date().toISOString();
        return {
            dataset: suggestion.dataset,
            column_name: suggestion.column,
            anchor_id: suggestion.anchor_id || this.generateAnchorId(),
            fingerprint: JSON.stringify({
                dtype: 'inferred',
                cardinality: 0,
                regex_patterns: [],
                null_ratio: 0,
                unique_ratio: 0,
                sample_values: []
            }),
            first_seen: seenAt,
            last_seen: seenAt,
            mapped_cid: suggestion.semantic_type,
            confidence: suggestion.confidence
        };
    }

    /**
     * Accepts a pending suggestion by id and removes it from the pending set.
     *
     * @param {string} suggestionId
     * @returns {Promise<boolean>} `false` when the id is unknown or saving fails (the error is logged).
     */
    async acceptSuggestion(suggestionId) {
        const suggestion = this.pendingSuggestions.get(suggestionId);
        if (!suggestion) {
            return false;
        }
        try {
            await this.anchorStore.saveAnchor(this.buildAnchorFromSuggestion(suggestion));
            this.pendingSuggestions.delete(suggestionId);
            return true;
        }
        catch (error) {
            console.error('Error accepting suggestion:', error);
            return false;
        }
    }

    /**
     * Accepts a suggestion supplied directly by the caller; the pending set
     * is neither consulted nor modified.
     *
     * @param {object} suggestion
     * @returns {Promise<boolean>} `false` when saving fails (the error is logged).
     */
    async acceptSuggestionByData(suggestion) {
        try {
            await this.anchorStore.saveAnchor(this.buildAnchorFromSuggestion(suggestion));
            return true;
        }
        catch (error) {
            console.error('Error accepting suggestion by data:', error);
            return false;
        }
    }

    /**
     * Accepts every currently pending suggestion, one after another.
     *
     * @param {string} acceptAllId - Currently not used to select suggestions;
     *   all pending suggestions are accepted regardless of its value.
     * @returns {Promise<{accepted: number, failed: number}>}
     */
    async acceptAllSuggestions(acceptAllId) {
        let accepted = 0;
        let failed = 0;
        // Snapshot first: acceptSuggestion removes entries from the map.
        const suggestions = Array.from(this.pendingSuggestions.values());
        for (const suggestion of suggestions) {
            const success = await this.acceptSuggestion(suggestion.id);
            if (success) {
                accepted++;
            }
            else {
                failed++;
            }
        }
        return { accepted, failed };
    }
}
// Publish the class as this module's named CommonJS export.
exports.SuggestionEngine = SuggestionEngine;
//# sourceMappingURL=suggestion-engine.js.map