/*
 * semantic-ds-toolkit
 * Version:
 * Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
 * 613 lines • 26 kB
 * JavaScript
 */
;
/**
 * Compiler-emitted interop helper: exposes property `key` of `source` on
 * `target` under the name `alias` (defaults to `key`).
 *
 * When property descriptors are available the source descriptor is reused
 * as-is unless it is missing, is an accessor on a non-ES-module object, or is
 * a writable/configurable data property; in those cases an enumerable getter
 * that reads `source[key]` on every access is installed instead, so later
 * changes on the source remain visible through the target.
 * Without `Object.create` the current value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        let needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            needsLiveGetter = !source.__esModule;
        }
        else {
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = { enumerable: true, get: () => source[key] };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        const exportName = alias === undefined ? key : alias;
        target[exportName] = source[key];
    });
/**
 * Compiler-emitted interop helper: stores `moduleValue` as the `default`
 * member of `namespace`.
 *
 * With property descriptors available the member is defined as an enumerable
 * data property (not writable, not configurable, since only `enumerable` and
 * `value` are supplied); otherwise it is assigned directly.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, moduleValue) {
        const descriptor = { enumerable: true, value: moduleValue };
        Object.defineProperty(namespace, "default", descriptor);
    }
    : function (namespace, moduleValue) {
        namespace["default"] = moduleValue;
    });
/**
 * Compiler-emitted interop helper: turns the result of `require()` into a
 * namespace-style object.
 *
 * Objects already flagged with `__esModule` are returned untouched. For
 * anything else a fresh object is built: every own property except "default"
 * is re-exported through `__createBinding`, and the original module value is
 * attached as `default` through `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Self-replacing function: the first call picks the key-listing strategy
    // (native getOwnPropertyNames or a for-in fallback) and reuses it afterwards.
    let listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (subject) {
            const names = [];
            for (const name in subject) {
                if (Object.prototype.hasOwnProperty.call(subject, name)) {
                    names.push(name);
                }
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const namespace = {};
        if (mod != null) {
            const keys = listOwnKeys(mod);
            for (let idx = 0; idx < keys.length; idx++) {
                const key = keys[idx];
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.StableColumnAnchorSystem = void 0;
const crypto = __importStar(require("crypto"));
/**
 * Regex source that recognises generic identifier column names. An overlap on
 * this pattern alone is treated as too weak to link two columns.
 */
const GENERIC_ID_PATTERN = '(^|_)(id|pk|key)$';
/** Column-name tokens that carry no meaning on their own when comparing names. */
const GENERIC_NAME_TOKENS = new Set(['id', 'pk', 'key']);
/**
 * Creates "stable column anchors" (a random id plus a serialized statistical
 * fingerprint of a column) and reconciles the columns of a dataset against
 * previously created anchors.
 *
 * A column is an object with `name`, `data_type` and `values`; an anchor is
 * the object returned by {@link StableColumnAnchorSystem#createAnchor}.
 */
class StableColumnAnchorSystem {
    config;
    compiledPatterns = [];
    /**
     * @param {object} [config] Overrides merged over the defaults below.
     *   `sample_size` caps the number of sample values kept per column and
     *   `regex_patterns` lists the regex sources used for pattern detection.
     *   `min_cardinality_threshold` and `max_unique_ratio` are stored but not
     *   read by any method of this class.
     */
    constructor(config = {}) {
        this.config = {
            sample_size: 200,
            regex_patterns: [
                '(^|_)(id|pk|key)$',
                '(^|_)(cust|customer|user|person)(_id)?$',
                '(^|_)(email|mail)$',
                '(^|_)(phone|tel|mobile)$',
                '(^|_)(amount|price|cost|value)$',
                '(^|_)(date|time|timestamp)$',
                '(^|_)(name|title|label)$',
                '(^|_)(code|cd|abbr)$'
            ],
            min_cardinality_threshold: 10,
            max_unique_ratio: 0.99,
            ...config
        };
        this.compiledPatterns = this.compilePatterns(this.config.regex_patterns);
    }
    /**
     * @returns {string} `sca_` followed by 16 hex characters (8 random bytes).
     */
    generateAnchorId() {
        return `sca_${crypto.randomBytes(8).toString('hex')}`;
    }
    /**
     * Scans a column once and collects row/null/unique counts, min/max and a
     * bounded sample of distinct values.
     *
     * `null`, `undefined` and `''` count as nulls. For `int64`/`float64`
     * columns values are compared numerically (values that do not convert to
     * a number are counted as distinct strings but ignored for min/max);
     * every other column is compared as strings.
     *
     * @param {{name: string, data_type: string, values: Array}} column
     * @returns {{total_rows: number, null_count: number, unique_count: number,
     *   min_value: (number|string|undefined), max_value: (number|string|undefined),
     *   data_type: string, sample_values: string[]}}
     */
    computeColumnStatistics(column) {
        const totalRows = column.values.length;
        const isNumeric = column.data_type === 'int64' || column.data_type === 'float64';
        const uniqueValues = new Set();
        let nullCount = 0;
        let minValue;
        let maxValue;
        for (let i = 0; i < column.values.length; i++) {
            const raw = column.values[i];
            if (raw === null || raw === undefined || raw === '') {
                nullCount++;
                continue;
            }
            const asString = String(raw);
            if (isNumeric) {
                const num = Number(raw);
                if (Number.isNaN(num)) {
                    // Not convertible: still a distinct value, but it cannot take part in min/max.
                    uniqueValues.add(asString);
                    continue;
                }
                uniqueValues.add(num);
                if (minValue === undefined || num < minValue) {
                    minValue = num;
                }
                if (maxValue === undefined || num > maxValue) {
                    maxValue = num;
                }
            }
            else {
                uniqueValues.add(asString);
                if (minValue === undefined || String(minValue) > asString) {
                    minValue = asString;
                }
                if (maxValue === undefined || String(maxValue) < asString) {
                    maxValue = asString;
                }
            }
        }
        // Samples are the first distinct values in insertion order, stringified.
        const sampleSize = Math.min(this.config.sample_size, uniqueValues.size);
        const sampleValues = [];
        if (sampleSize > 0) {
            for (const v of uniqueValues) {
                sampleValues.push(String(v));
                if (sampleValues.length >= sampleSize)
                    break;
            }
        }
        return {
            total_rows: totalRows,
            null_count: nullCount,
            unique_count: uniqueValues.size,
            min_value: minValue,
            max_value: maxValue,
            data_type: column.data_type,
            sample_values: sampleValues
        };
    }
    /**
     * Returns the raw source of every configured pattern that applies to the
     * column. A pattern applies when, in this order: its regex matches the
     * column name; one of its tokens occurs in the column name; or more than
     * 80% of the sample values match (regex first, token containment only if
     * the regex matched no sample at all).
     *
     * @param {string} columnName
     * @param {string[]} sampleValues
     * @returns {string[]} Matching pattern sources, in configuration order.
     */
    detectRegexPatterns(columnName, sampleValues) {
        const matchedPatterns = [];
        for (const compiled of this.compiledPatterns) {
            const { raw, re, tokens } = compiled;
            // Strong match on column name via regex
            if (re.test(columnName)) {
                matchedPatterns.push(raw);
                continue;
            }
            // Relaxed token match on column name (e.g., "cust_pk" should indicate customer)
            if (this.tokenMatchWithTokens(columnName, tokens)) {
                matchedPatterns.push(raw);
                continue;
            }
            if (sampleValues.length === 0) {
                continue;
            }
            // Evaluate value-based matches using counters to avoid array allocations
            let regexMatches = 0;
            for (let i = 0; i < sampleValues.length; i++) {
                if (re.test(sampleValues[i]))
                    regexMatches++;
            }
            let matchRatio = regexMatches / sampleValues.length;
            // If regex is too strict, fall back to relaxed token-based value matching
            if (matchRatio === 0) {
                let tokenMatches = 0;
                for (let i = 0; i < sampleValues.length; i++) {
                    if (this.tokenMatchWithTokens(sampleValues[i], tokens))
                        tokenMatches++;
                }
                matchRatio = tokenMatches / sampleValues.length;
            }
            if (matchRatio > 0.8) {
                matchedPatterns.push(raw);
            }
        }
        return matchedPatterns;
    }
    /**
     * Pre-compiles pattern sources.
     *
     * @param {string[]} patterns Regex sources.
     * @returns {Array<{raw: string, re: RegExp, tokens: string[]}>} Source,
     *   case-insensitive regex and extracted tokens for each pattern.
     */
    compilePatterns(patterns) {
        return patterns.map(raw => ({ raw, re: new RegExp(raw, 'i'), tokens: this.extractTokens(raw) }));
    }
    /**
     * Extracts the alternatives of every parenthesised group in a pattern
     * source, lower-cased and stripped of non-letters. Alternatives shorter
     * than 3 letters are dropped; duplicates are removed.
     *
     * @param {string} pattern Regex source.
     * @returns {string[]}
     */
    extractTokens(pattern) {
        const groups = Array.from(pattern.matchAll(/\(([^)]+)\)/g));
        const tokens = [];
        for (const g of groups) {
            const parts = g[1].split('|');
            for (const p of parts) {
                const t = p.toLowerCase();
                const normalized = t.replace(/[^a-z]/g, '');
                if (normalized.length >= 3)
                    tokens.push(normalized);
            }
        }
        return Array.from(new Set(tokens));
    }
    /**
     * @param {*} text Value to inspect (stringified and lower-cased).
     * @param {string[]} tokens
     * @returns {boolean} True when any token is a substring of `text`.
     */
    tokenMatchWithTokens(text, tokens) {
        if (tokens.length === 0)
            return false;
        const lower = String(text).toLowerCase();
        return tokens.some(tok => lower.includes(tok));
    }
    /**
     * Splits at lower-to-upper case boundaries and at runs of
     * non-alphanumeric characters. The result may contain empty strings.
     *
     * @param {string} input
     * @returns {string[]}
     */
    splitCamelCase(input) {
        return input.replace(/([a-z])([A-Z])/g, '$1 $2').split(/[^a-zA-Z0-9]+/);
    }
    /**
     * Tokenizes a column name: lower-cased words plus, for words of at least
     * 3 characters, their 3-character prefix. Duplicates are removed.
     *
     * @param {string} name
     * @returns {string[]}
     */
    tokenizeName(name) {
        const rawTokens = this.splitCamelCase(name).filter(Boolean).map(t => t.toLowerCase());
        const tokens = new Set();
        for (const t of rawTokens) {
            tokens.add(t);
            if (t.length >= 3)
                tokens.add(t.slice(0, 3));
        }
        return Array.from(tokens);
    }
    /**
     * Jaccard similarity of two token lists (treated as sets).
     *
     * @param {string[]} a
     * @param {string[]} b
     * @returns {number} 1 when both are empty, 0 when exactly one is empty,
     *   otherwise |intersection| / |union|.
     */
    jaccardSimilarityFromTokens(a, b) {
        if (a.length === 0 && b.length === 0)
            return 1.0;
        if (a.length === 0 || b.length === 0)
            return 0.0;
        const setA = new Set(a);
        const setB = new Set(b);
        let intersection = 0;
        let union = setA.size;
        for (const tok of setB) {
            if (setA.has(tok))
                intersection++;
            else
                union++;
        }
        return union === 0 ? 0 : intersection / union;
    }
    /**
     * Guesses a data type from at most the first 100 non-null values
     * (`null`, `undefined` and `''` are skipped).
     *
     * @param {Array} values
     * @returns {string} `'unknown'` when nothing was considered; otherwise the
     *   first of `float64` / `int64` (share >= 0.8), `boolean` / `datetime`
     *   (share > 0.8) that qualifies, else `'string'`.
     */
    inferDataType(values) {
        let considered = 0;
        let intCount = 0;
        let floatCount = 0;
        let boolCount = 0;
        let dateCount = 0;
        for (let i = 0; i < values.length && considered < 100; i++) {
            const value = values[i];
            if (value === null || value === undefined || value === '')
                continue;
            considered++;
            const originalValue = value;
            const strValue = String(value).trim();
            if (typeof originalValue === 'boolean' || strValue === 'true' || strValue === 'false') {
                boolCount++;
            }
            else if (typeof originalValue === 'number') {
                if (Number.isInteger(originalValue)) {
                    intCount++;
                }
                else {
                    floatCount++;
                }
            }
            else if (/^-?\d+$/.test(strValue)) {
                intCount++;
            }
            else if (/^-?\d*\.\d+$/.test(strValue)) {
                floatCount++;
            }
            else if (/^\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}(?::\d{2}(?:\.\d{1,6})?)?(?:Z|[+-]\d{2}:?\d{2})?)?/.test(strValue) ||
                /^\d{1,2}\/\d{1,2}\/\d{4}(?:[ T]\d{2}:\d{2}(?::\d{2})?)?/.test(strValue)) {
                dateCount++;
            }
        }
        if (considered === 0)
            return 'unknown';
        if (floatCount / considered >= 0.8)
            return 'float64';
        if (intCount / considered >= 0.8)
            return 'int64';
        if (boolCount / considered > 0.8)
            return 'boolean';
        if (dateCount / considered > 0.8)
            return 'datetime';
        return 'string';
    }
    /**
     * Builds the structured fingerprint of a column. The declared
     * `data_type` wins unless it is missing or `'unknown'`, in which case the
     * inferred type is used.
     *
     * @param {{name: string, data_type: string, values: Array}} column
     * @returns {{min: *, max: *, dtype: string, cardinality: number,
     *   regex_patterns: string[], null_ratio: number, unique_ratio: number,
     *   sample_values: string[]}}
     */
    generateFingerprint(column) {
        const stats = this.computeColumnStatistics(column);
        const inferredType = this.inferDataType(column.values);
        const patterns = this.detectRegexPatterns(column.name, stats.sample_values);
        // An empty column would otherwise yield 0/0 = NaN, which poisons both the
        // serialized fingerprint and every score computed from it.
        const rows = stats.total_rows;
        return {
            min: stats.min_value,
            max: stats.max_value,
            dtype: column.data_type && column.data_type !== 'unknown' ? column.data_type : inferredType,
            cardinality: stats.unique_count,
            regex_patterns: patterns,
            null_ratio: rows > 0 ? stats.null_count / rows : 0,
            unique_ratio: rows > 0 ? stats.unique_count / rows : 0,
            sample_values: stats.sample_values
        };
    }
    /**
     * Serializes a fingerprint as `key=value` pairs joined by `;`
     * (`min`, `max`, `dtype`, `card`, `null_ratio`, `unique_ratio` and, when
     * present, `patterns` joined by `,`). Sample values are not serialized.
     *
     * @param {object} fingerprint As returned by `generateFingerprint`.
     * @returns {string}
     */
    fingerprintToString(fingerprint) {
        const parts = [
            // `??` rather than `||`: a bound of 0 is a real value, only a missing bound is 'null'.
            `min=${fingerprint.min ?? 'null'}`,
            `max=${fingerprint.max ?? 'null'}`,
            `dtype=${fingerprint.dtype}`,
            `card=${fingerprint.cardinality}`,
            `null_ratio=${fingerprint.null_ratio.toFixed(3)}`,
            `unique_ratio=${fingerprint.unique_ratio.toFixed(3)}`
        ];
        if (fingerprint.regex_patterns.length > 0) {
            parts.push(`patterns=${fingerprint.regex_patterns.join(',')}`);
        }
        // Use ';' as a safe delimiter to avoid conflicts with '|' inside regex patterns
        return parts.join(';');
    }
    /**
     * Creates a new anchor for a column, with a fresh id and today's date
     * (UTC, `YYYY-MM-DD`) as both `first_seen` and `last_seen`.
     *
     * @param {string} dataset
     * @param {{name: string, data_type: string, values: Array}} column
     * @param {*} [mappedCid] Stored as `mapped_cid`.
     * @param {number} [confidence] Stored as `confidence`.
     * @returns {object} The anchor record.
     */
    createAnchor(dataset, column, mappedCid, confidence) {
        const fingerprint = this.generateFingerprint(column);
        const timestamp = new Date().toISOString().split('T')[0];
        return {
            dataset,
            column_name: column.name,
            anchor_id: this.generateAnchorId(),
            fingerprint: this.fingerprintToString(fingerprint),
            first_seen: timestamp,
            last_seen: timestamp,
            mapped_cid: mappedCid,
            confidence: confidence
        };
    }
    /**
     * Scores how well a column fingerprint matches an existing anchor.
     *
     * The base score is a weighted sum of five components (dtype 0.3,
     * cardinality 0.25, regex 0.2, statistics 0.15, name 0.1), followed by a
     * series of guards that zero, cap or boost the total.
     *
     * @param {object} columnFingerprint As returned by `generateFingerprint`.
     * @param {object} anchor Anchor with `anchor_id`, `column_name`, `fingerprint`.
     * @param {string} columnName Name of the column being matched.
     * @param {number} [driftTolerance=0.2] Ratio difference tolerated without penalty.
     * @returns {{anchor_id: string, total_score: number,
     *   component_scores: object, confidence: number}}
     */
    calculateMatchScore(columnFingerprint, anchor, columnName, driftTolerance = 0.2) {
        const anchorFingerprint = this.parseFingerprintString(anchor.fingerprint);
        const componentScores = {
            dtype_match: columnFingerprint.dtype === anchorFingerprint.dtype ? 1.0 : 0.0,
            cardinality_similarity: this.calculateCardinalitySimilarity(columnFingerprint.cardinality, anchorFingerprint.cardinality),
            regex_match: this.calculateRegexSimilarity(columnFingerprint.regex_patterns, anchorFingerprint.regex_patterns),
            statistical_similarity: this.calculateStatisticalSimilarity(columnFingerprint, anchorFingerprint, driftTolerance),
            name_similarity: this.calculateNameSimilarity(columnName, anchor.column_name)
        };
        const weights = {
            dtype_match: 0.3,
            cardinality_similarity: 0.25,
            regex_match: 0.2,
            statistical_similarity: 0.15,
            name_similarity: 0.1
        };
        // Penalize weak generic overlaps when names are dissimilar
        let adjustedCardinality = componentScores.cardinality_similarity;
        let adjustedStats = componentScores.statistical_similarity;
        if (componentScores.regex_match <= 0.25 && componentScores.name_similarity < 0.4) {
            adjustedCardinality *= 0.2;
            adjustedStats *= 0.5;
        }
        let totalScore = componentScores.dtype_match * weights.dtype_match +
            adjustedCardinality * weights.cardinality_similarity +
            componentScores.regex_match * weights.regex_match +
            adjustedStats * weights.statistical_similarity +
            componentScores.name_similarity * weights.name_similarity;
        // Require semantic overlap (patterns) or strong name similarity; otherwise disallow
        const hasAnyPatterns = (columnFingerprint.regex_patterns?.length || 0) > 0 && (anchorFingerprint.regex_patterns?.length || 0) > 0;
        if (hasAnyPatterns && componentScores.regex_match === 0 && componentScores.name_similarity < 0.6) {
            totalScore = 0;
        }
        // Strong guarded boosts for unambiguous matches
        if (componentScores.dtype_match === 1.0 &&
            componentScores.statistical_similarity >= 0.9 &&
            componentScores.regex_match >= 0.95 &&
            componentScores.name_similarity >= 0.95) {
            totalScore = 1.0;
        }
        // Guardrail: if the only overlapping pattern is generic ID and names are dissimilar, cap the score
        const colSet = new Set(columnFingerprint.regex_patterns || []);
        const ancSet = new Set(anchorFingerprint.regex_patterns || []);
        let overlapGenericOnly = false;
        if (colSet.size > 0 && ancSet.size > 0) {
            let interCount = 0;
            for (const p of colSet) {
                if (ancSet.has(p))
                    interCount++;
            }
            overlapGenericOnly = interCount === 1 &&
                (colSet.has(GENERIC_ID_PATTERN) && ancSet.has(GENERIC_ID_PATTERN)) &&
                (colSet.size === 1 || ancSet.size === 1);
        }
        if (overlapGenericOnly && componentScores.name_similarity < 0.6) {
            totalScore = Math.min(totalScore, 0.6);
        }
        // If pattern overlap is only generic ID and there is no meaningful token overlap, strongly cap
        if (overlapGenericOnly) {
            const anchorTokens = new Set(this.tokenizeName(anchor.column_name));
            const meaningfulOverlap = this.tokenizeName(columnName)
                .some(t => !GENERIC_NAME_TOKENS.has(t) && anchorTokens.has(t));
            if (!meaningfulOverlap) {
                totalScore = Math.min(totalScore, 0.49);
            }
        }
        const confidence = Math.min(totalScore, 1.0);
        return {
            anchor_id: anchor.anchor_id,
            total_score: totalScore,
            component_scores: componentScores,
            confidence
        };
    }
    /**
     * Parses a string produced by `fingerprintToString` back into a partial
     * fingerprint. Unknown keys are ignored; `regex_patterns` defaults to `[]`.
     *
     * Each pair is split on its FIRST `=` only, so values that themselves
     * contain `=` are kept intact. Values containing the pair delimiter
     * cannot be recovered, because the serializer does not escape them.
     *
     * @param {string} fingerprintStr
     * @returns {object} Partial fingerprint (`min`, `max`, `dtype`,
     *   `cardinality`, `null_ratio`, `unique_ratio`, `regex_patterns`).
     */
    parseFingerprintString(fingerprintStr) {
        // Support new ';' delimiter and fallback to legacy '|' if needed
        const parts = fingerprintStr.split(fingerprintStr.includes(';') ? ';' : '|');
        const fingerprint = {
            regex_patterns: []
        };
        // 'null' marks a missing bound; numeric-looking text becomes a number.
        const parseBound = (text) => {
            if (text === undefined || text === 'null')
                return undefined;
            const num = Number(text);
            return text.trim() === '' || Number.isNaN(num) ? text : num;
        };
        for (const part of parts) {
            const eq = part.indexOf('=');
            const key = eq === -1 ? part : part.slice(0, eq);
            const value = eq === -1 ? undefined : part.slice(eq + 1);
            switch (key) {
                case 'min':
                    fingerprint.min = parseBound(value);
                    break;
                case 'max':
                    fingerprint.max = parseBound(value);
                    break;
                case 'dtype':
                    fingerprint.dtype = value;
                    break;
                case 'card':
                    fingerprint.cardinality = Number.parseInt(value, 10);
                    break;
                case 'null_ratio':
                    fingerprint.null_ratio = Number.parseFloat(value);
                    break;
                case 'unique_ratio':
                    fingerprint.unique_ratio = Number.parseFloat(value);
                    break;
                case 'patterns':
                    fingerprint.regex_patterns = value ? value.split(',') : [];
                    break;
            }
        }
        return fingerprint;
    }
    /**
     * @param {number} card1
     * @param {number} card2
     * @returns {number} 1 when both are 0, 0 when exactly one is 0, otherwise
     *   smaller / larger.
     */
    calculateCardinalitySimilarity(card1, card2) {
        if (card1 === 0 && card2 === 0)
            return 1.0;
        if (card1 === 0 || card2 === 0)
            return 0.0;
        return Math.min(card1, card2) / Math.max(card1, card2);
    }
    /**
     * Jaccard similarity of two pattern lists, except that an overlap
     * consisting solely of the generic-ID pattern counts as no overlap.
     *
     * @param {string[]} patterns1
     * @param {string[]} patterns2
     * @returns {number} 1 when both are empty, 0 when exactly one is empty or
     *   the overlap is empty/generic-only, otherwise |intersection| / |union|.
     */
    calculateRegexSimilarity(patterns1, patterns2) {
        if (patterns1.length === 0 && patterns2.length === 0)
            return 1.0;
        if (patterns1.length === 0 || patterns2.length === 0)
            return 0.0;
        const set1 = new Set(patterns1);
        const set2 = new Set(patterns2);
        const intersection = new Set([...set1].filter(x => set2.has(x)));
        const union = new Set([...set1, ...set2]);
        if (intersection.size === 0)
            return 0.0;
        if (intersection.size === 1 && intersection.has(GENERIC_ID_PATTERN)) {
            return 0.0;
        }
        return intersection.size / union.size;
    }
    /**
     * Averages the similarity of `null_ratio` and `unique_ratio`. A
     * difference within `driftTolerance` scores 1; beyond it the score falls
     * linearly to 0. A ratio that is missing or not a finite number on
     * either side (e.g. `NaN` stored in an older fingerprint) is skipped
     * rather than turning the whole score into `NaN`.
     *
     * @param {object} fp1
     * @param {object} fp2
     * @param {number} [driftTolerance=0]
     * @returns {number} Mean of the comparable components, or 0 if none.
     */
    calculateStatisticalSimilarity(fp1, fp2, driftTolerance = 0) {
        const ratioSimilarity = (a, b) => {
            const diff = Math.abs(a - b);
            return diff <= driftTolerance
                ? 1
                : Math.max(0, 1 - (diff - driftTolerance) / (1 - driftTolerance));
        };
        let similarity = 0;
        let components = 0;
        for (const key of ['null_ratio', 'unique_ratio']) {
            if (Number.isFinite(fp1[key]) && Number.isFinite(fp2[key])) {
                similarity += ratioSimilarity(fp1[key], fp2[key]);
                components++;
            }
        }
        return components > 0 ? similarity / components : 0;
    }
    /**
     * Compares two column names. Names that are equal after lower-casing and
     * removing non-alphanumerics score 1; otherwise the Jaccard similarity
     * of their name tokens, with generic tokens removed.
     *
     * @param {string} name1
     * @param {string} name2
     * @returns {number}
     */
    calculateNameSimilarity(name1, name2) {
        const norm1 = name1.toLowerCase().replace(/[^a-z0-9]/g, '');
        const norm2 = name2.toLowerCase().replace(/[^a-z0-9]/g, '');
        if (norm1 === norm2)
            return 1.0;
        if (norm1.length === 0 || norm2.length === 0)
            return 0.0;
        // Token-based similarity only, filtering out generic tokens
        const tokens1 = this.tokenizeName(name1).filter(t => !GENERIC_NAME_TOKENS.has(t));
        const tokens2 = this.tokenizeName(name2).filter(t => !GENERIC_NAME_TOKENS.has(t));
        return this.jaccardSimilarityFromTokens(tokens1, tokens2);
    }
    /**
     * Matches the columns of a dataset against existing anchors.
     *
     * Every (column, anchor) pair is scored; pairs that pass the semantic
     * guards and the confidence threshold become candidates, which are then
     * assigned greedily in priority order (regex, dtype, statistics, name,
     * confidence). Each column is matched at most once; an anchor is reused
     * only when `allow_multiple_matches` is set. Remaining columns either get
     * a new anchor or are reported as unmatched.
     *
     * @param {string} dataset Dataset name recorded on newly created anchors.
     * @param {Array} columns Columns to reconcile.
     * @param {Array} existingAnchors Anchors to match against.
     * @param {object} [options] Used as given (not merged with the defaults),
     *   so supply all of `confidence_threshold`, `allow_multiple_matches`,
     *   `create_new_anchors` and `drift_tolerance` when overriding.
     * @returns {{matched_anchors: Array, unmatched_columns: string[], new_anchors: Array}}
     */
    reconcileAnchors(dataset, columns, existingAnchors, options = {
        confidence_threshold: 0.8,
        allow_multiple_matches: false,
        create_new_anchors: true,
        drift_tolerance: 0.2
    }) {
        const matchedAnchors = [];
        const unmatched_columns = [];
        const newAnchors = [];
        // Precompute fingerprints for all columns (reduces repeated work)
        const columnFingerprints = columns.map(col => ({
            column: col,
            fp: this.generateFingerprint(col)
        }));
        // Anchor pattern sets do not depend on the column: parse each anchor at most once.
        const anchorPatternCache = new Map();
        const candidates = [];
        for (let ci = 0; ci < columnFingerprints.length; ci++) {
            const { column, fp } = columnFingerprints[ci];
            const colPatterns = new Set(fp.regex_patterns || []);
            for (let ai = 0; ai < existingAnchors.length; ai++) {
                const anchor = existingAnchors[ai];
                const score = this.calculateMatchScore(fp, anchor, column.name, options.drift_tolerance);
                let ancPatterns = anchorPatternCache.get(ai);
                if (ancPatterns === undefined) {
                    ancPatterns = new Set(this.parseFingerprintString(anchor.fingerprint).regex_patterns || []);
                    anchorPatternCache.set(ai, ancPatterns);
                }
                // Hard semantic guard: if both sides have patterns but overlap is none or generic-only, skip
                let overlapCount = 0;
                for (const p of colPatterns) {
                    if (ancPatterns.has(p))
                        overlapCount++;
                }
                const overlapGenericOnly = overlapCount === 1 &&
                    colPatterns.has(GENERIC_ID_PATTERN) && ancPatterns.has(GENERIC_ID_PATTERN);
                const hasAnyPatterns = colPatterns.size > 0 && ancPatterns.size > 0;
                if (hasAnyPatterns && (overlapCount === 0 || overlapGenericOnly)) {
                    continue;
                }
                const hasSemanticSignal = score.component_scores.regex_match > 0 || score.component_scores.name_similarity >= 0.8;
                if (score.confidence >= options.confidence_threshold && hasSemanticSignal) {
                    candidates.push({ columnIndex: ci, anchorIndex: ai, score });
                }
            }
        }
        // Sort candidates by strong semantic signals first to reduce cross-assignments
        const componentPriority = ['regex_match', 'dtype_match', 'statistical_similarity', 'name_similarity'];
        candidates.sort((a, b) => {
            for (const key of componentPriority) {
                const diff = a.score.component_scores[key] - b.score.component_scores[key];
                if (diff !== 0)
                    return diff > 0 ? -1 : 1;
            }
            const diff = a.score.confidence - b.score.confidence;
            return diff > 0 ? -1 : diff < 0 ? 1 : 0;
        });
        const usedColumns = new Set();
        const usedAnchors = new Set();
        for (const cand of candidates) {
            if (usedColumns.has(cand.columnIndex))
                continue;
            if (usedAnchors.has(cand.anchorIndex) && !options.allow_multiple_matches)
                continue;
            const { column } = columnFingerprints[cand.columnIndex];
            const anchor = existingAnchors[cand.anchorIndex];
            matchedAnchors.push({
                anchor_id: anchor.anchor_id,
                column_name: column.name,
                confidence: cand.score.confidence,
                match_reason: this.getMatchReasons(cand.score)
            });
            usedColumns.add(cand.columnIndex);
            usedAnchors.add(cand.anchorIndex);
        }
        // Process remaining columns
        for (let ci = 0; ci < columnFingerprints.length; ci++) {
            if (usedColumns.has(ci))
                continue;
            const { column } = columnFingerprints[ci];
            if (options.create_new_anchors) {
                newAnchors.push(this.createAnchor(dataset, column));
            }
            else {
                unmatched_columns.push(column.name);
            }
        }
        return {
            matched_anchors: matchedAnchors,
            unmatched_columns,
            new_anchors: newAnchors
        };
    }
    /**
     * Lists the component scores that clear their reporting threshold.
     *
     * @param {{component_scores: object}} matchScore
     * @returns {string[]} Any of `data_type_match`, `cardinality_similar`,
     *   `pattern_match`, `statistical_similarity`, `name_similarity`.
     */
    getMatchReasons(matchScore) {
        const scores = matchScore.component_scores;
        const reasons = [];
        if (scores.dtype_match === 1.0) {
            reasons.push('data_type_match');
        }
        if (scores.cardinality_similarity > 0.8) {
            reasons.push('cardinality_similar');
        }
        if (scores.regex_match > 0.5) {
            reasons.push('pattern_match');
        }
        if (scores.statistical_similarity > 0.8) {
            reasons.push('statistical_similarity');
        }
        if (scores.name_similarity > 0.7) {
            reasons.push('name_similarity');
        }
        return reasons;
    }
    /**
     * @param {object} anchor
     * @returns {object} Copy of the anchor with `last_seen` set to today's
     *   date (UTC, `YYYY-MM-DD`); the input is not modified.
     */
    updateAnchorLastSeen(anchor) {
        return {
            ...anchor,
            last_seen: new Date().toISOString().split('T')[0]
        };
    }
}
exports.StableColumnAnchorSystem = StableColumnAnchorSystem;
//# sourceMappingURL=anchors.js.map