semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
772 lines • 35.1 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SemanticJoinOperator = void 0;
const shadow_semantics_1 = require("../core/shadow-semantics");
const statistical_analyzer_1 = require("../inference/statistical-analyzer");
const dataframe_adapters_1 = require("../core/dataframe-adapters");
const dataframe_join_adapters_1 = require("./dataframe-join-adapters");
const join_planner_1 = require("./join-planner");
const join_confidence_1 = require("./join-confidence");
// xxHash64 implementation for high-performance fingerprinting
/**
 * 64-bit xxHash (XXH64, seed 0) over a string or byte sequence.
 *
 * The digest is used as the lookup key of the exact-match join index, so two
 * different inputs producing the same digest would be reported as an exact
 * match. The full XXH64 mixing/avalanche steps are therefore implemented
 * instead of a plain polynomial hash, which collides on inputs as simple as
 * "Aa" / "BB".
 */
class XXHash64 {
    static PRIME1 = BigInt('11400714785074694791');
    static PRIME2 = BigInt('14029467366897019727');
    static PRIME3 = BigInt('1609587929392839161');
    static PRIME4 = BigInt('9650029242287828579');
    static PRIME5 = BigInt('2870177450012600261');
    // All arithmetic is reduced modulo 2^64 with this mask.
    static MASK64 = BigInt('0xFFFFFFFFFFFFFFFF');
    /** Rotates a 64-bit value (already reduced to 64 bits) left by `bits`. */
    static rotl(value, bits) {
        return ((value << bits) | (value >> (64n - bits))) & XXHash64.MASK64;
    }
    /** One XXH64 accumulator round: mixes a 64-bit input lane into `acc`. */
    static round(acc, lane) {
        const mixed = (acc + lane * XXHash64.PRIME2) & XXHash64.MASK64;
        return (XXHash64.rotl(mixed, 31n) * XXHash64.PRIME1) & XXHash64.MASK64;
    }
    /** Folds one of the four stripe accumulators into the converged hash. */
    static mergeRound(hash, acc) {
        const folded = hash ^ XXHash64.round(0n, acc);
        return (folded * XXHash64.PRIME1 + XXHash64.PRIME4) & XXHash64.MASK64;
    }
    /** Reads `byteCount` bytes starting at `offset` as a little-endian unsigned integer. */
    static readLE(data, offset, byteCount) {
        let value = 0n;
        for (let i = byteCount - 1; i >= 0; i--) {
            value = (value << 8n) | BigInt(data[offset + i]);
        }
        return value;
    }
    /**
     * Hashes the input with XXH64 (seed 0).
     *
     * @param {string|ArrayLike<number>} input - A string (hashed as its UTF-8
     *   bytes) or an indexable sequence of byte values with a `length`.
     * @returns {string} The digest as 16 lowercase hexadecimal characters.
     */
    static hash(input) {
        const { PRIME1, PRIME2, PRIME3, PRIME4, PRIME5, MASK64 } = XXHash64;
        const data = typeof input === 'string' ? Buffer.from(input, 'utf8') : input;
        const length = data.length;
        let offset = 0;
        let hash;
        if (length >= 32) {
            // Bulk phase: four independent accumulators consume 32-byte stripes.
            let v1 = (PRIME1 + PRIME2) & MASK64;
            let v2 = PRIME2;
            let v3 = 0n;
            let v4 = (-PRIME1) & MASK64; // seed - PRIME1, wrapped to 64 bits
            const lastStripeStart = length - 32;
            while (offset <= lastStripeStart) {
                v1 = XXHash64.round(v1, XXHash64.readLE(data, offset, 8));
                v2 = XXHash64.round(v2, XXHash64.readLE(data, offset + 8, 8));
                v3 = XXHash64.round(v3, XXHash64.readLE(data, offset + 16, 8));
                v4 = XXHash64.round(v4, XXHash64.readLE(data, offset + 24, 8));
                offset += 32;
            }
            hash = (XXHash64.rotl(v1, 1n) + XXHash64.rotl(v2, 7n) +
                XXHash64.rotl(v3, 12n) + XXHash64.rotl(v4, 18n)) & MASK64;
            hash = XXHash64.mergeRound(hash, v1);
            hash = XXHash64.mergeRound(hash, v2);
            hash = XXHash64.mergeRound(hash, v3);
            hash = XXHash64.mergeRound(hash, v4);
        }
        else {
            hash = PRIME5;
        }
        hash = (hash + BigInt(length)) & MASK64;
        // Tail phase: remaining bytes in 8-, 4- and 1-byte steps.
        while (offset + 8 <= length) {
            hash ^= XXHash64.round(0n, XXHash64.readLE(data, offset, 8));
            hash = (XXHash64.rotl(hash, 27n) * PRIME1 + PRIME4) & MASK64;
            offset += 8;
        }
        if (offset + 4 <= length) {
            hash ^= (XXHash64.readLE(data, offset, 4) * PRIME1) & MASK64;
            hash = (XXHash64.rotl(hash, 23n) * PRIME2 + PRIME3) & MASK64;
            offset += 4;
        }
        while (offset < length) {
            hash ^= (BigInt(data[offset]) * PRIME5) & MASK64;
            hash = (XXHash64.rotl(hash, 11n) * PRIME1) & MASK64;
            offset++;
        }
        // Final avalanche so every input bit affects every output bit.
        hash ^= hash >> 33n;
        hash = (hash * PRIME2) & MASK64;
        hash ^= hash >> 29n;
        hash = (hash * PRIME3) & MASK64;
        hash ^= hash >> 32n;
        return hash.toString(16).padStart(16, '0');
    }
}
/**
 * Joins two tabular inputs on key columns whose values are first passed
 * through per-column normalizers (email, phone, name, ...). Rows whose
 * normalized keys are identical are exact matches; optionally, rows whose
 * normalized keys are close in edit distance are fuzzy matches. Every match
 * is then re-scored by the configured confidence calculator.
 */
class SemanticJoinOperator {
    semanticsLayer;
    cidRegistry;
    statisticalAnalyzer;
    adapterRegistry;
    joinPlanner;
    confidenceCalculator;
    normalizers;
    cache = new Map();
    cacheStats = { hits: 0, misses: 0 };
    cacheEnabled = true;
    // Per-function ids used in cache keys, so two normalizers never share an
    // entry even when they are anonymous or happen to have the same name.
    normalizerIds = new WeakMap();
    nextNormalizerId = 0;
    /**
     * @param cidRegistry - Registry queried with `lookupByLabel(columnName)`.
     * @param [semanticsLayer] - Defaults to a new ShadowSemanticsLayer.
     * @param [statisticalAnalyzer] - Defaults to a new StatisticalAnalyzer.
     */
    constructor(cidRegistry, semanticsLayer, statisticalAnalyzer) {
        this.cidRegistry = cidRegistry;
        this.semanticsLayer = semanticsLayer || new shadow_semantics_1.ShadowSemanticsLayer();
        this.statisticalAnalyzer = statisticalAnalyzer || new statistical_analyzer_1.StatisticalAnalyzer();
        this.adapterRegistry = new dataframe_adapters_1.DataFrameAdapterRegistry();
        this.joinPlanner = new join_planner_1.SemanticJoinPlanner();
        this.confidenceCalculator = new join_confidence_1.JoinConfidenceCalculator();
        this.normalizers = this.initializeNormalizers();
    }
    /**
     * Runs the full join pipeline: adapt inputs, pick a normalizer per key
     * column pair, match rows, score the matches and materialize joined rows.
     *
     * @param left - Left input; adapted through a join adapter when one is
     *   registered for its type, otherwise through the adapter registry.
     * @param right - Right input, adapted the same way.
     * @param {object} [options] - `leftOn` / `rightOn` (column name or array
     *   of names, required) plus optional overrides for `how`,
     *   `confidenceThreshold`, `enableFuzzyMatching`, `fuzzyThreshold`,
     *   `cacheNormalizedValues`, `batchSize`, `autoSelectNormalizers` and
     *   `preserveOriginalColumns`.
     * @returns {Promise<object>} `{ data, matches, performance, statistics }`.
     * @throws {Error} When an input cannot be adapted, join columns are
     *   missing or unknown, or the left/right column counts differ.
     */
    async semanticJoin(left, right, options = {}) {
        const startTime = performance.now();
        // Find specialized join adapters for the input types
        const leftJoinAdapter = (0, dataframe_join_adapters_1.getJoinAdapter)(left);
        const rightJoinAdapter = (0, dataframe_join_adapters_1.getJoinAdapter)(right);
        const baseOptions = {
            how: 'inner',
            confidenceThreshold: 0.7,
            enableFuzzyMatching: true,
            fuzzyThreshold: 0.8,
            cacheNormalizedValues: true,
            batchSize: 10000,
            autoSelectNormalizers: true,
            preserveOriginalColumns: true
        };
        let opts = {
            ...baseOptions,
            ...options
        };
        // Apply type-specific optimizations (once per distinct adapter)
        if (leftJoinAdapter) {
            opts = leftJoinAdapter.optimizeForType(opts);
        }
        if (rightJoinAdapter && rightJoinAdapter !== leftJoinAdapter) {
            opts = rightJoinAdapter.optimizeForType(opts);
        }
        // Adapt dataframes
        const leftDf = leftJoinAdapter
            ? leftJoinAdapter.toDataFrameLike(left)
            : this.adapterRegistry.adapt(left);
        const rightDf = rightJoinAdapter
            ? rightJoinAdapter.toDataFrameLike(right)
            : this.adapterRegistry.adapt(right);
        if (!leftDf || !rightDf) {
            throw new Error('Unable to adapt input data frames');
        }
        // Attach semantic context
        const leftSemantic = this.semanticsLayer.attachSemanticsShadow(leftDf, {
            dataset_name: 'left_join_input'
        });
        const rightSemantic = this.semanticsLayer.attachSemanticsShadow(rightDf, {
            dataset_name: 'right_join_input'
        });
        // Determine join columns
        const leftColumns = this.resolveJoinColumns(opts.leftOn, leftDf.columns);
        const rightColumns = this.resolveJoinColumns(opts.rightOn, rightDf.columns);
        if (leftColumns.length !== rightColumns.length) {
            throw new Error('Number of left and right join columns must match');
        }
        const leftContextMap = this.buildContextMap(leftSemantic.dataframe_id, leftColumns);
        const rightContextMap = this.buildContextMap(rightSemantic.dataframe_id, rightColumns);
        const joinPlan = this.joinPlanner.planOptimalJoin(leftDf, rightDf, leftContextMap, rightContextMap, opts);
        // Planner suggestions only apply where the caller gave no explicit value.
        if (options.batchSize == null && joinPlan.batchingStrategy.enabled) {
            opts.batchSize = joinPlan.batchingStrategy.batchSize;
        }
        if (options.cacheNormalizedValues == null) {
            opts.cacheNormalizedValues = joinPlan.cacheStrategy.enableValueCache;
        }
        const normalizerPlanIndexLeft = new Map(joinPlan.normalizationPlan.leftColumns.map(plan => [plan.column, plan]));
        const normalizerPlanIndexRight = new Map(joinPlan.normalizationPlan.rightColumns.map(plan => [plan.column, plan]));
        const normalizerPairs = [];
        for (let i = 0; i < leftColumns.length; i++) {
            const leftCol = leftColumns[i];
            const rightCol = rightColumns[i];
            // Start from the planner's choice, then let CID and inference
            // candidates replace it when they are more confident.
            const plannedLeft = normalizerPlanIndexLeft.get(leftCol);
            const plannedRight = normalizerPlanIndexRight.get(rightCol);
            let normalizerName = plannedLeft?.normalizer ||
                plannedRight?.normalizer || 'default';
            let confidence = Math.max(plannedLeft?.confidence ?? 0.5, plannedRight?.confidence ?? 0.5);
            let normalizer = this.normalizers[normalizerName] || this.normalizers.default;
            const leftContext = leftContextMap[leftCol] || null;
            const rightContext = rightContextMap[rightCol] || null;
            const leftValues = leftDf.getColumn(leftCol);
            const rightValues = rightDf.getColumn(rightCol);
            const leftStats = this.statisticalAnalyzer.analyzeColumn(leftValues);
            const rightStats = this.statisticalAnalyzer.analyzeColumn(rightValues);
            const cidMatchesLeft = this.cidRegistry.lookupByLabel(leftCol);
            const cidMatchesRight = this.cidRegistry.lookupByLabel(rightCol);
            const cidSelection = this.selectNormalizerFromCID(cidMatchesLeft, cidMatchesRight);
            // Only switch when the named normalizer is actually registered;
            // otherwise `normalizer` would become undefined.
            const cidNormalizer = cidSelection ? this.normalizers[cidSelection.name] : undefined;
            if (cidNormalizer && cidSelection.confidence > confidence) {
                normalizer = cidNormalizer;
                normalizerName = cidSelection.name;
                confidence = cidSelection.confidence;
            }
            if (opts.autoSelectNormalizers) {
                const selection = this.selectOptimalNormalizer(leftContext, rightContext, leftValues, rightValues);
                if (selection.confidence >= confidence) {
                    const inferredName = this.identifyNormalizer(selection.normalizer);
                    normalizerName = inferredName ?? 'custom';
                    normalizer = selection.normalizer;
                    confidence = selection.confidence;
                }
            }
            normalizerPairs.push({
                left: leftCol,
                right: rightCol,
                normalizer,
                normalizerName,
                confidence,
                leftValues,
                rightValues,
                leftStats,
                rightStats,
                leftContext,
                rightContext,
                cidMatches: { left: cidMatchesLeft, right: cidMatchesRight }
            });
        }
        this.cacheEnabled = opts.cacheNormalizedValues !== false;
        const normalizationStart = performance.now();
        const rawMatches = await this.performSemanticMatch(leftDf, rightDf, normalizerPairs, opts);
        const matches = this.calculateConfidenceScores(rawMatches, normalizerPairs, leftDf, rightDf, opts.confidenceThreshold);
        const normalizationEnd = performance.now();
        // Execute the actual join
        const joinStart = performance.now();
        const joinedRows = this.executeJoin(left, right, leftDf, rightDf, matches, opts);
        let joinedData = joinedRows;
        // Use specialized join adapter if available for the result format
        const resultAdapter = leftJoinAdapter || rightJoinAdapter;
        if (resultAdapter) {
            // The adapter only receives rows and matches; the metric fields
            // are zeroed placeholders.
            const tempResult = {
                data: joinedRows,
                matches,
                performance: { totalTime: 0, normalizationTime: 0, matchingTime: 0, joinTime: 0, cacheHits: 0, totalOperations: 0 },
                statistics: { inputRowsLeft: 0, inputRowsRight: 0, outputRows: 0, matchedRows: 0, confidence: { average: 0, median: 0, distribution: {} } }
            };
            joinedData = resultAdapter.fromJoinResult(tempResult, left, right, opts);
        }
        const joinEnd = performance.now();
        const endTime = performance.now();
        return {
            data: joinedData,
            matches,
            performance: {
                totalTime: endTime - startTime,
                normalizationTime: normalizationEnd - normalizationStart,
                matchingTime: 0, // included in normalization
                joinTime: joinEnd - joinStart,
                // NOTE: cumulative since the last clearCache(), not per call.
                cacheHits: this.cacheStats.hits,
                totalOperations: matches.length
            },
            statistics: {
                inputRowsLeft: leftDf.shape[0],
                inputRowsRight: rightDf.shape[0],
                outputRows: Array.isArray(joinedData) ? joinedData.length : 0,
                matchedRows: matches.length,
                confidence: this.summarizeConfidence(matches.map(m => m.confidence))
            }
        };
    }
    /**
     * Summarizes match confidences.
     *
     * @param {number[]} confidenceValues
     * @returns {{average: number, median: number, distribution: object}}
     *   Average and median are 0 for an empty list; the median of an
     *   even-length list is the mean of the two middle values.
     */
    summarizeConfidence(confidenceValues) {
        const count = confidenceValues.length;
        const average = confidenceValues.reduce((a, b) => a + b, 0) / count || 0;
        const sorted = [...confidenceValues].sort((a, b) => a - b);
        let median = 0;
        if (count > 0) {
            const mid = Math.floor(count / 2);
            median = count % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
        }
        return {
            average,
            median,
            distribution: {
                'high (0.9-1.0)': confidenceValues.filter(c => c >= 0.9).length,
                'medium (0.7-0.9)': confidenceValues.filter(c => c >= 0.7 && c < 0.9).length,
                'low (0.5-0.7)': confidenceValues.filter(c => c >= 0.5 && c < 0.7).length,
                'very_low (<0.5)': confidenceValues.filter(c => c < 0.5).length
            }
        };
    }
    /**
     * Indexes every left row by its composite key, then probes the index with
     * every right row: an identical key yields 'exact' matches, otherwise
     * (when enabled) the index is scanned for fuzzy matches.
     *
     * @param leftDf - Object exposing `shape[0]` (row count) and `getColumn(name)`.
     * @param rightDf - Same shape of object for the right side.
     * @param {object[]} normalizerPairs - Entries with `left`, `right`,
     *   `normalizer`, `normalizerName` and `confidence`.
     * @param {object} opts - Reads `batchSize`, `enableFuzzyMatching`,
     *   `fuzzyThreshold` and `confidenceThreshold`.
     * @returns {Promise<object[]>} Matches at or above the confidence threshold.
     */
    async performSemanticMatch(leftDf, rightDf, normalizerPairs, opts) {
        const matches = [];
        // Everything below is the same for every row, so compute it once.
        const leftSpecs = normalizerPairs.map(p => ({ column: p.left, normalizer: p.normalizer, values: leftDf.getColumn(p.left) }));
        const rightSpecs = normalizerPairs.map(p => ({ column: p.right, normalizer: p.normalizer, values: rightDf.getColumn(p.right) }));
        const pairConfidence = Math.min(...normalizerPairs.map(p => p.confidence));
        const normalizerUsed = this.describeNormalizers(normalizerPairs);
        // Build left index; each entry keeps the normalized values so fuzzy
        // matching compares real tokens rather than hashed keys.
        const leftIndices = new Map();
        const leftRows = leftDf.shape[0];
        for (let i = 0; i < leftRows; i++) {
            const { compositeKey, normalizedValues } = this.createCompositeKey(leftDf, i, leftSpecs);
            let entry = leftIndices.get(compositeKey);
            if (!entry) {
                entry = { indices: [], metadata: { normalizedValues } };
                leftIndices.set(compositeKey, entry);
            }
            entry.indices.push(i);
        }
        // Probe with right rows, walking them in batches
        const rightRows = rightDf.shape[0];
        const batchSize = Math.max(1, opts.batchSize || rightRows);
        for (let batchStart = 0; batchStart < rightRows; batchStart += batchSize) {
            const batchEnd = Math.min(batchStart + batchSize, rightRows);
            for (let j = batchStart; j < batchEnd; j++) {
                const { compositeKey, normalizedValues } = this.createCompositeKey(rightDf, j, rightSpecs);
                const exactMatches = leftIndices.get(compositeKey);
                if (exactMatches) {
                    for (const leftIndex of exactMatches.indices) {
                        matches.push({
                            leftIndex,
                            rightIndex: j,
                            confidence: pairConfidence,
                            matchType: 'exact',
                            normalizerUsed,
                            metadata: {
                                compositeKey,
                                normalizedLeft: exactMatches.metadata.normalizedValues,
                                normalizedRight: normalizedValues
                            }
                        });
                    }
                }
                else if (opts.enableFuzzyMatching) {
                    const fuzzyMatches = this.findFuzzyMatches({ key: compositeKey, normalizedValues }, leftIndices, opts.fuzzyThreshold);
                    for (const fuzzyMatch of fuzzyMatches) {
                        matches.push({
                            leftIndex: fuzzyMatch.index,
                            rightIndex: j,
                            confidence: fuzzyMatch.similarity * pairConfidence,
                            matchType: 'fuzzy',
                            normalizerUsed,
                            metadata: {
                                compositeKey,
                                fuzzyKey: fuzzyMatch.key,
                                similarity: fuzzyMatch.similarity,
                                normalizedLeft: fuzzyMatch.normalizedValues,
                                normalizedRight: normalizedValues
                            }
                        });
                    }
                }
            }
        }
        // Filter by confidence threshold
        return matches.filter(m => m.confidence >= opts.confidenceThreshold);
    }
    /**
     * Replaces each match's confidence with the calculator's overall score
     * and attaches per-column evidence to its metadata.
     *
     * @param {object[]} matches - Output of performSemanticMatch.
     * @param {object[]} normalizerPairs - Pairs carrying values, stats,
     *   contexts and CID matches for each join column.
     * @param leftDf - Unused here; kept for signature compatibility.
     * @param rightDf - Unused here; kept for signature compatibility.
     * @param {number} threshold - Minimum re-scored confidence to keep.
     * @returns {object[]} Re-scored matches; the input is returned untouched
     *   when either list is empty.
     */
    calculateConfidenceScores(matches, normalizerPairs, leftDf, rightDf, threshold) {
        if (matches.length === 0 || normalizerPairs.length === 0) {
            return matches;
        }
        const aggregatedCid = {
            left: normalizerPairs.flatMap(pair => pair.cidMatches.left),
            right: normalizerPairs.flatMap(pair => pair.cidMatches.right)
        };
        const representativeLeftContext = this.resolveDominantContext(normalizerPairs, 'left');
        const representativeRightContext = this.resolveDominantContext(normalizerPairs, 'right');
        const representativeLeftStats = this.resolveRepresentativeStats(normalizerPairs, 'left');
        const representativeRightStats = this.resolveRepresentativeStats(normalizerPairs, 'right');
        const representativeLeftValues = normalizerPairs[0].leftValues;
        const representativeRightValues = normalizerPairs[0].rightValues;
        const enhancedMatches = matches.map(match => {
            const normalizedLeftValues = Array.isArray((match.metadata || {}).normalizedLeft)
                ? match.metadata.normalizedLeft
                : [];
            const normalizedRightValues = Array.isArray((match.metadata || {}).normalizedRight)
                ? match.metadata.normalizedRight
                : [];
            const evidence = normalizerPairs.map((pair, idx) => {
                const leftValue = pair.leftValues[match.leftIndex];
                const rightValue = pair.rightValues[match.rightIndex];
                // `??` rather than `||`: an empty normalized string is a real
                // result and must not be replaced by the raw value.
                const normalizedLeft = normalizedLeftValues[idx] ?? String(leftValue ?? '');
                const normalizedRight = normalizedRightValues[idx] ?? String(rightValue ?? '');
                const similarity = match.matchType === 'exact'
                    ? 1
                    : match.metadata?.similarity ??
                        this.calculateStringSimilarity(normalizedLeft, normalizedRight);
                return {
                    leftValue,
                    rightValue,
                    normalizedLeft,
                    normalizedRight,
                    similarity,
                    matchType: match.matchType,
                    semanticAlignment: this.computeSemanticAlignment(pair.leftContext, pair.rightContext),
                    contextualRelevance: this.computeContextualRelevance(pair.leftContext, pair.rightContext)
                };
            });
            const confidenceScore = this.confidenceCalculator.calculateMatchConfidence(representativeLeftContext, representativeRightContext, representativeLeftValues, representativeRightValues, representativeLeftStats, representativeRightStats, evidence, aggregatedCid);
            return {
                ...match,
                confidence: confidenceScore.overall,
                metadata: {
                    ...match.metadata,
                    confidenceScore,
                    matchEvidence: evidence
                }
            };
        });
        return enhancedMatches.filter(m => m.confidence >= threshold);
    }
    /** Maps each column name to the semantic context stored for it on the given dataframe id. */
    buildContextMap(dataframeId, columns) {
        const contextMap = {};
        for (const column of columns) {
            contextMap[column] = this.semanticsLayer.getSemanticContext(dataframeId, column);
        }
        return contextMap;
    }
    /** Returns the registered name of a normalizer function, or null when it is not registered. */
    identifyNormalizer(normalizer) {
        for (const [name, fn] of Object.entries(this.normalizers)) {
            if (fn === normalizer) {
                return name;
            }
        }
        return null;
    }
    /**
     * Picks a normalizer name from CID lookups of both columns.
     * A name suggested by both sides wins (mean confidence + 0.2, capped at
     * 0.95); otherwise the first left, then first right candidate is used
     * (confidence + 0.2, capped at 0.85).
     *
     * @returns {{name: string, confidence: number}|null}
     */
    selectNormalizerFromCID(leftMatches, rightMatches) {
        const toCandidates = (cidMatches) => cidMatches
            .map(match => ({
                name: this.inferNormalizerFromConcept(match),
                confidence: match.confidence
            }))
            .filter((candidate) => candidate.name != null);
        const leftCandidates = toCandidates(leftMatches);
        const rightCandidates = toCandidates(rightMatches);
        for (const leftCandidate of leftCandidates) {
            const rightCandidate = rightCandidates.find(candidate => candidate.name === leftCandidate.name);
            if (rightCandidate) {
                return {
                    name: leftCandidate.name,
                    confidence: Math.min(0.95, (leftCandidate.confidence + rightCandidate.confidence) / 2 + 0.2)
                };
            }
        }
        const fallback = leftCandidates[0] ?? rightCandidates[0];
        if (!fallback) {
            return null;
        }
        return {
            name: fallback.name,
            confidence: Math.min(0.85, fallback.confidence + 0.2)
        };
    }
    /**
     * Maps a CID match to a normalizer name: keywords in the concept's cid or
     * labels are checked first, then its facets. Returns null when nothing applies.
     */
    inferNormalizerFromConcept(match) {
        const concept = match.concept;
        const cid = concept.cid.toLowerCase();
        const labelString = concept.labels.map(label => label.toLowerCase()).join(' ');
        // Order matters: the first keyword found wins.
        for (const keyword of ['email', 'phone', 'name', 'address']) {
            if (cid.includes(keyword) || labelString.includes(keyword)) {
                return keyword;
            }
        }
        if (concept.facets?.temporal) {
            return 'date';
        }
        if (concept.facets?.numerical || concept.facets?.identifier) {
            return 'numeric';
        }
        if (concept.facets?.categorical) {
            return 'categorical';
        }
        return null;
    }
    /** Returns the highest-confidence context on the given side ('left' or 'right'), or null if no pair has one. */
    resolveDominantContext(normalizerPairs, side) {
        let bestContext = null;
        let bestConfidence = -1;
        for (const pair of normalizerPairs) {
            const context = side === 'left' ? pair.leftContext : pair.rightContext;
            if (context && context.confidence > bestConfidence) {
                bestContext = context;
                bestConfidence = context.confidence;
            }
        }
        return bestContext;
    }
    /** Returns the column stats of the pair whose context on the given side is most confident (first pair by default). */
    resolveRepresentativeStats(normalizerPairs, side) {
        let bestStats = side === 'left' ? normalizerPairs[0].leftStats : normalizerPairs[0].rightStats;
        let bestConfidence = side === 'left'
            ? normalizerPairs[0].leftContext?.confidence ?? 0
            : normalizerPairs[0].rightContext?.confidence ?? 0;
        for (const pair of normalizerPairs) {
            const context = side === 'left' ? pair.leftContext : pair.rightContext;
            if (context && context.confidence > bestConfidence) {
                bestConfidence = context.confidence;
                bestStats = side === 'left' ? pair.leftStats : pair.rightStats;
            }
        }
        return bestStats;
    }
    /** Scores type agreement: 1 for equal semantic types, 0.75 for compatible, 0.3 otherwise; 0.5 when a context is missing. */
    computeSemanticAlignment(leftContext, rightContext) {
        if (!leftContext || !rightContext) {
            return 0.5;
        }
        if (leftContext.semantic_type === rightContext.semantic_type) {
            return 1;
        }
        return this.areTypesCompatible(leftContext.semantic_type, rightContext.semantic_type) ? 0.75 : 0.3;
    }
    /** Mean of the two context confidences (0.5 for a missing one), capped at 1. */
    computeContextualRelevance(leftContext, rightContext) {
        const leftConfidence = leftContext?.confidence ?? 0.5;
        const rightConfidence = rightContext?.confidence ?? 0.5;
        return Math.min(1, (leftConfidence + rightConfidence) / 2);
    }
    /**
     * Normalizes one row's key columns and hashes them into a composite key.
     *
     * @param df - Object exposing `getColumn(name)`.
     * @param {number} rowIndex
     * @param {object[]} columnNormalizers - `{ column, normalizer }` entries;
     *   an optional `values` array skips the `getColumn` lookup.
     * @returns {{compositeKey: string, normalizedValues: string[]}}
     */
    createCompositeKey(df, rowIndex, columnNormalizers) {
        const keyParts = [];
        for (const { column, normalizer, values } of columnNormalizers) {
            const columnValues = values ?? df.getColumn(column);
            keyParts.push(this.normalizeWithCache(columnValues[rowIndex], normalizer));
        }
        const normalizedKey = keyParts.join('|');
        return {
            compositeKey: XXHash64.hash(normalizedKey),
            normalizedValues: keyParts
        };
    }
    /**
     * Applies a normalizer, memoizing results per (normalizer, value).
     * null/undefined normalize to '' without calling the normalizer.
     */
    normalizeWithCache(value, normalizer) {
        if (value == null) {
            return '';
        }
        if (!this.cacheEnabled) {
            return normalizer(value);
        }
        const valueKey = this.cacheKeyForValue(value);
        if (valueKey === null) {
            // No reliable string identity for this value; skip the cache.
            return normalizer(value);
        }
        const key = `${this.normalizerCacheId(normalizer)}:${valueKey}`;
        if (this.cache.has(key)) {
            this.cacheStats.hits++;
            return this.cache.get(key);
        }
        this.cacheStats.misses++;
        const normalized = normalizer(value);
        // Keep the cache bounded: once it exceeds 100000 entries, drop the
        // 10000 oldest insertions (FIFO; Map iterates in insertion order).
        if (this.cache.size > 100000) {
            let remaining = 10000;
            for (const staleKey of this.cache.keys()) {
                if (remaining === 0) {
                    break;
                }
                this.cache.delete(staleKey);
                remaining--;
            }
        }
        this.cache.set(key, normalized);
        return normalized;
    }
    /**
     * Builds the value part of a cache key, or returns null when the value
     * cannot be keyed safely. The type tag keeps 1 and '1' apart.
     */
    cacheKeyForValue(value) {
        if (value instanceof Date) {
            return `date:${value.getTime()}`;
        }
        const type = typeof value;
        if (type === 'string' || type === 'number' || type === 'boolean' || type === 'bigint') {
            return `${type}:${String(value)}`;
        }
        return null;
    }
    /** Returns a stable numeric id for a normalizer function, assigning one on first use. */
    normalizerCacheId(normalizer) {
        let id = this.normalizerIds.get(normalizer);
        if (id === undefined) {
            id = this.nextNormalizerId++;
            this.normalizerIds.set(normalizer, id);
        }
        return id;
    }
    /**
     * Scans the whole index for entries whose normalized values are similar
     * to the target's.
     *
     * @param {{key: string, normalizedValues: string[]}} target
     * @param {Map} index - Composite key -> `{ indices, metadata: { normalizedValues } }`.
     * @param {number} threshold - Minimum similarity in [0, 1].
     * @returns {object[]} One result per indexed row, most similar first.
     */
    findFuzzyMatches(target, index, threshold) {
        const results = [];
        const targetText = target.normalizedValues.join('|');
        for (const [key, entry] of index.entries()) {
            const similarity = this.calculateStringSimilarity(targetText, entry.metadata.normalizedValues.join('|'));
            if (similarity >= threshold) {
                for (const idx of entry.indices) {
                    results.push({ key, index: idx, similarity, normalizedValues: entry.metadata.normalizedValues });
                }
            }
        }
        return results.sort((a, b) => b.similarity - a.similarity);
    }
    /** Formats the pairs as comma-separated `left:right:normalizerName` entries. */
    describeNormalizers(normalizerPairs) {
        return normalizerPairs
            .map(pair => `${pair.left}:${pair.right}:${pair.normalizerName}`)
            .join(',');
    }
    /** Returns 1 - editDistance / longerLength; two empty strings score 1. */
    calculateStringSimilarity(str1, str2) {
        const maxLen = Math.max(str1.length, str2.length);
        if (maxLen === 0) {
            return 1;
        }
        return (maxLen - this.levenshteinDistance(str1, str2)) / maxLen;
    }
    /**
     * Levenshtein edit distance over UTF-16 code units, using two rolling
     * rows instead of a full (len2+1) x (len1+1) matrix.
     */
    levenshteinDistance(str1, str2) {
        let previous = Array.from({ length: str1.length + 1 }, (_, i) => i);
        let current = new Array(str1.length + 1).fill(0);
        for (let j = 1; j <= str2.length; j++) {
            current[0] = j;
            for (let i = 1; i <= str1.length; i++) {
                const substitutionCost = str1[i - 1] === str2[j - 1] ? 0 : 1;
                current[i] = Math.min(current[i - 1] + 1, previous[i] + 1, previous[i - 1] + substitutionCost);
            }
            [previous, current] = [current, previous];
        }
        return previous[str1.length];
    }
    /**
     * Materializes the joined rows as plain objects.
     *
     * @param originalLeft - Unused; kept for signature compatibility.
     * @param originalRight - Unused; kept for signature compatibility.
     * @param leftDf - Object exposing `columns`, `shape[0]` and `getColumn(name)`.
     * @param rightDf - Same shape of object for the right side.
     * @param {object[]} matches - Entries with `leftIndex`, `rightIndex`,
     *   `confidence`, `matchType` and `normalizerUsed`.
     * @param {object} opts - Reads `how` ('inner' | 'left' | 'right' |
     *   'outer') and `preserveOriginalColumns`.
     * @returns {object[]} Matched rows first, then unmatched left rows (for
     *   'left'/'outer'), then unmatched right rows (for 'right'/'outer').
     *   Every row carries a `_semantic_join_meta` entry.
     */
    executeJoin(originalLeft, originalRight, leftDf, rightDf, matches, opts) {
        // For now, return array of objects - can be extended for different output formats
        const result = [];
        const leftMatched = new Set(matches.map(m => m.leftIndex));
        const rightMatched = new Set(matches.map(m => m.rightIndex));
        const leftColumnSet = new Set(leftDf.columns);
        const mapLeftColumn = (col) => opts.preserveOriginalColumns ? `left_${col}` : col;
        const mapRightColumn = (col) => {
            if (opts.preserveOriginalColumns) {
                return `right_${col}`;
            }
            return leftColumnSet.has(col) ? `${col}_right` : col;
        };
        // Resolve output names and column arrays once instead of per row.
        const leftOutputs = leftDf.columns.map(col => ({ name: mapLeftColumn(col), values: leftDf.getColumn(col) }));
        const rightOutputs = rightDf.columns.map(col => ({ name: mapRightColumn(col), values: rightDf.getColumn(col) }));
        const noMatchMeta = () => ({
            confidence: 0,
            matchType: 'no_match',
            normalizerUsed: null
        });
        // Add matched rows
        for (const match of matches) {
            const row = {};
            for (const { name, values } of leftOutputs) {
                row[name] = values[match.leftIndex];
            }
            for (const { name, values } of rightOutputs) {
                row[name] = values[match.rightIndex];
            }
            row._semantic_join_meta = {
                confidence: match.confidence,
                matchType: match.matchType,
                normalizerUsed: match.normalizerUsed
            };
            result.push(row);
        }
        // Handle unmatched rows based on join type
        if (opts.how === 'left' || opts.how === 'outer') {
            for (let i = 0; i < leftDf.shape[0]; i++) {
                if (leftMatched.has(i)) {
                    continue;
                }
                const row = {};
                for (const { name, values } of leftOutputs) {
                    row[name] = values[i];
                }
                for (const { name } of rightOutputs) {
                    row[name] = null;
                }
                row._semantic_join_meta = noMatchMeta();
                result.push(row);
            }
        }
        if (opts.how === 'right' || opts.how === 'outer') {
            for (let j = 0; j < rightDf.shape[0]; j++) {
                if (rightMatched.has(j)) {
                    continue;
                }
                const row = {};
                for (const { name } of leftOutputs) {
                    row[name] = null;
                }
                for (const { name, values } of rightOutputs) {
                    row[name] = values[j];
                }
                row._semantic_join_meta = noMatchMeta();
                result.push(row);
            }
        }
        return result;
    }
    /**
     * Infers a normalizer from semantic contexts, falling back to statistics
     * of the first 100 values per side when the contexts give less than 0.7.
     *
     * @returns {{normalizer: Function, confidence: number}} Confidence is 0.9
     *   for equal semantic types, 0.7 for compatible ones, 0.6 for equal
     *   statistical data types and 0.5 otherwise.
     */
    selectOptimalNormalizer(leftContext, rightContext, leftValues, rightValues) {
        // Default fallback
        let bestNormalizer = this.normalizers.default;
        let bestConfidence = 0.5;
        // Use semantic types to select normalizer
        if (leftContext && rightContext) {
            const leftType = leftContext.semantic_type;
            const rightType = rightContext.semantic_type;
            if (leftType === rightType) {
                bestConfidence = 0.9;
                switch (leftType) {
                    case 'email_address':
                        bestNormalizer = this.normalizers.email;
                        break;
                    case 'phone_number':
                        bestNormalizer = this.normalizers.phone;
                        break;
                    case 'display_name':
                        bestNormalizer = this.normalizers.name;
                        break;
                    case 'monetary_value':
                    case 'high_cardinality_attribute':
                        bestNormalizer = this.normalizers.numeric;
                        break;
                    case 'temporal':
                        bestNormalizer = this.normalizers.date;
                        break;
                    case 'categorical_attribute':
                    case 'categorical_code':
                        bestNormalizer = this.normalizers.categorical;
                        break;
                }
            }
            else if (this.areTypesCompatible(leftType, rightType)) {
                bestConfidence = 0.7;
                bestNormalizer = this.normalizers.default;
            }
        }
        // Statistical analysis as fallback
        if (bestConfidence < 0.7) {
            const leftStats = this.statisticalAnalyzer.analyzeColumn(leftValues.slice(0, 100));
            const rightStats = this.statisticalAnalyzer.analyzeColumn(rightValues.slice(0, 100));
            if (leftStats.dataType === rightStats.dataType) {
                bestConfidence = 0.6;
                switch (leftStats.dataType) {
                    case 'numeric':
                        bestNormalizer = this.normalizers.numeric;
                        break;
                    case 'date':
                        bestNormalizer = this.normalizers.date;
                        break;
                    case 'string':
                        // Low-uniqueness strings on both sides are treated as categories.
                        bestNormalizer = leftStats.uniquePercentage < 20 && rightStats.uniquePercentage < 20
                            ? this.normalizers.categorical
                            : this.normalizers.default;
                        break;
                }
            }
        }
        return { normalizer: bestNormalizer, confidence: bestConfidence };
    }
    /** True when both semantic types belong to the same compatibility group. */
    areTypesCompatible(type1, type2) {
        const compatibilityGroups = [
            ['identifier', 'high_cardinality_attribute'],
            ['monetary_value', 'numeric_value'],
            ['display_name', 'generic_attribute'],
            ['categorical_attribute', 'categorical_code']
        ];
        return compatibilityGroups.some(group => group.includes(type1) && group.includes(type2));
    }
    /**
     * Validates a join-column option and returns it as an array.
     *
     * @param {string|string[]} columns - One column name or a list of names.
     * @param {string[]} availableColumns - Columns present in the dataframe.
     * @returns {string[]}
     * @throws {Error} When no column is given (including an empty list, which
     *   would otherwise join every row with every row) or a name is unknown.
     */
    resolveJoinColumns(columns, availableColumns) {
        if (!columns) {
            throw new Error('Join columns must be specified');
        }
        const cols = Array.isArray(columns) ? columns : [columns];
        if (cols.length === 0) {
            throw new Error('Join columns must be specified');
        }
        // Validate columns exist
        for (const col of cols) {
            if (!availableColumns.includes(col)) {
                throw new Error(`Column '${col}' not found in dataframe. Available: ${availableColumns.join(', ')}`);
            }
        }
        return cols;
    }
    /** Builds the built-in normalizers; each maps a raw value to a canonical string. */
    initializeNormalizers() {
        return {
            // Lowercase and strip all whitespace.
            email: (value) => {
                const str = String(value || '').toLowerCase().trim();
                return str.replace(/\s+/g, '');
            },
            // Keep digits only.
            phone: (value) => {
                const str = String(value || '');
                return str.replace(/[^\d]/g, '');
            },
            // Lowercase, drop punctuation, collapse whitespace runs.
            name: (value) => {
                const str = String(value || '').toLowerCase().trim();
                return str.replace(/[^\w\s]/g, '').replace(/\s+/g, ' ');
            },
            // Lowercase, punctuation to spaces, drop common street-type words.
            address: (value) => {
                const str = String(value || '').toLowerCase().trim();
                return str
                    .replace(/[^\w\s\d]/g, ' ')
                    .replace(/\b(st|street|ave|avenue|rd|road|blvd|boulevard|dr|drive|ln|lane|ct|court)\b/g, '')
                    .replace(/\s+/g, ' ');
            },
            // Two-decimal fixed notation; unparseable input yields '0'.
            numeric: (value) => {
                const num = parseFloat(String(value || '0'));
                return Number.isNaN(num) ? '0' : num.toFixed(2);
            },
            // ISO calendar date (YYYY-MM-DD); invalid input yields ''.
            date: (value) => {
                try {
                    const date = new Date(value);
                    return Number.isNaN(date.getTime()) ? '' : date.toISOString().split('T')[0];
                }
                catch {
                    return '';
                }
            },
            // Lowercase and trim.
            categorical: (value) => {
                return String(value || '').toLowerCase().trim();
            },
            // Lowercase, trim, collapse whitespace runs.
            default: (value) => {
                return String(value || '').toLowerCase().trim().replace(/\s+/g, ' ');
            }
        };
    }
    // Public API for cache management
    /** Empties the normalization cache and resets the hit/miss counters. */
    clearCache() {
        this.cache.clear();
        this.cacheStats = { hits: 0, misses: 0 };
    }
    /** Returns `{ hits, misses, hitRate }`; hitRate is 0 before any lookup. */
    getCacheStats() {
        const total = this.cacheStats.hits + this.cacheStats.misses;
        return {
            ...this.cacheStats,
            hitRate: total > 0 ? this.cacheStats.hits / total : 0
        };
    }
    // Public API for adding custom normalizers
    /** Registers (or replaces) a normalizer function under the given name. */
    addNormalizer(name, normalizer) {
        this.normalizers[name] = normalizer;
    }
}
exports.SemanticJoinOperator = SemanticJoinOperator;
//# sourceMappingURL=semantic-join.js.map