datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
470 lines • 20.3 kB
JavaScript
;
/**
* Intelligent Column Matching Engine
* Phase 1: Foundation Architecture - Advanced column similarity detection
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.IntelligentColumnMatcher = void 0;
const types_1 = require("./types");
class IntelligentColumnMatcher {
semanticCache = new Map();
commonPatterns;
domainKnowledge;
constructor() {
this.initializePatterns();
this.initializeDomainKnowledge();
}
/**
* Calculate semantic similarity between two column names
* Uses multiple strategies: exact matching, pattern matching, domain knowledge
*/
semanticSimilarity(col1, col2) {
const cacheKey = `${col1.toLowerCase()}:${col2.toLowerCase()}`;
if (this.semanticCache.has(cacheKey)) {
return this.semanticCache.get(cacheKey);
}
let similarity = 0;
// 1. Exact match (highest score)
if (col1.toLowerCase() === col2.toLowerCase()) {
similarity = 1.0;
}
// 2. Cleaned name match (remove prefixes/suffixes)
else {
const clean1 = this.cleanColumnName(col1);
const clean2 = this.cleanColumnName(col2);
if (clean1 === clean2) {
similarity = 0.95;
}
// 3. Domain knowledge matching
else {
similarity = Math.max(this.domainBasedSimilarity(clean1, clean2), this.patternBasedSimilarity(col1, col2), this.structuralSimilarity(col1, col2), this.fuzzyStringSimilarity(clean1, clean2));
}
}
this.semanticCache.set(cacheKey, similarity);
return similarity;
}
/**
* Analyze distribution similarity between column data
*/
distributionSimilarity(data1, data2) {
const stats1 = this.calculateDetailedStats(data1);
const stats2 = this.calculateDetailedStats(data2);
const statistical = this.compareStatisticalProperties(stats1, stats2);
const structural = this.compareStructuralProperties(stats1, stats2);
const domain = this.compareDomainProperties(stats1, stats2);
const semantic = this.compareSemanticProperties(stats1, stats2);
const overall = (statistical + structural + domain + semantic) / 4;
return {
overall,
statistical,
structural,
domain,
semantic
};
}
/**
* Detect cardinality relationship between two columns
*/
detectCardinality(table1, table2, col1, col2) {
const schema1 = table1.schema.find(s => s.name === col1);
const schema2 = table2.schema.find(s => s.name === col2);
if (!schema1 || !schema2) {
return types_1.CardinalityType.MANY_TO_MANY;
}
// Analyze uniqueness patterns
const unique1 = schema1.unique || schema1.distinctCount === table1.rowCount;
const unique2 = schema2.unique || schema2.distinctCount === table2.rowCount;
if (unique1 && unique2) {
return types_1.CardinalityType.ONE_TO_ONE;
}
else if (unique1 && !unique2) {
return types_1.CardinalityType.ONE_TO_MANY;
}
else if (!unique1 && unique2) {
return types_1.CardinalityType.MANY_TO_ONE;
}
else {
return types_1.CardinalityType.MANY_TO_MANY;
}
}
/**
* Infer foreign key relationships using statistical signatures
*/
inferForeignKeys(tables) {
const candidates = [];
for (let i = 0; i < tables.length; i++) {
for (let j = 0; j < tables.length; j++) {
if (i === j)
continue;
const fkCandidates = this.detectForeignKeysBetweenTables(tables[i], tables[j]);
candidates.push(...fkCandidates);
}
}
return candidates
.filter(c => c.confidence >= 0.6)
.sort((a, b) => b.confidence - a.confidence);
}
/**
* Suggest optimal join strategy based on column characteristics
*/
suggestJoinStrategy(col1, col2, semanticSim, distributionSim) {
// Exact match strategy
if (semanticSim >= 0.95 && distributionSim.overall >= 0.9) {
return types_1.JoinStrategy.EXACT_MATCH;
}
// Semantic match for similar naming
if (semanticSim >= 0.8) {
return types_1.JoinStrategy.SEMANTIC_MATCH;
}
// Pattern match for formatted data
if (this.hasCommonPatterns(col1, col2)) {
return types_1.JoinStrategy.PATTERN_MATCH;
}
// Range overlap for numeric/date data
if (this.isNumericOrDate(col1.type) && this.isNumericOrDate(col2.type)) {
return types_1.JoinStrategy.RANGE_OVERLAP;
}
// Fuzzy match for string data with variations
if (col1.type === types_1.DataType.STRING && col2.type === types_1.DataType.STRING) {
return types_1.JoinStrategy.FUZZY_MATCH;
}
return types_1.JoinStrategy.STATISTICAL_MATCH;
}
// Private helper methods
initializePatterns() {
this.commonPatterns = new Map([
['id', ['identifier', 'key', 'pk', 'primary_key', 'id_number']],
['name', ['title', 'label', 'full_name', 'display_name', 'description']],
['email', ['mail', 'email_address', 'e_mail', 'contact_email']],
['phone', ['telephone', 'mobile', 'contact_number', 'phone_number']],
['date', ['time', 'timestamp', 'created_at', 'updated_at', 'date_time']],
['address', ['location', 'street', 'addr', 'postal_address']],
['code', ['abbreviation', 'short_code', 'symbol', 'reference']],
['amount', ['price', 'cost', 'value', 'total', 'sum']],
['count', ['quantity', 'number', 'num', 'total_count']],
['status', ['state', 'condition', 'flag', 'is_active']]
]);
}
initializeDomainKnowledge() {
this.domainKnowledge = new Map([
// Customer domain
['customer', ['client', 'user', 'account', 'member', 'subscriber']],
['order', ['purchase', 'transaction', 'sale', 'booking', 'reservation']],
['product', ['item', 'article', 'good', 'service', 'offering']],
// Financial domain
['payment', ['transaction', 'charge', 'billing', 'invoice']],
['account', ['wallet', 'balance', 'ledger', 'financial_account']],
// Geographic domain
['country', ['nation', 'territory', 'state', 'region']],
['city', ['town', 'municipality', 'locality', 'urban_area']],
// Temporal domain
['start', ['begin', 'commence', 'initiate', 'open']],
['end', ['finish', 'complete', 'close', 'terminate']],
// Business domain
['employee', ['staff', 'worker', 'personnel', 'team_member']],
['department', ['division', 'unit', 'section', 'group']],
['manager', ['supervisor', 'lead', 'director', 'head']],
// Technical domain
['server', ['host', 'machine', 'node', 'instance']],
['database', ['db', 'datastore', 'repository', 'storage']],
['application', ['app', 'system', 'platform', 'service']]
]);
}
cleanColumnName(name) {
return name.toLowerCase()
// Remove table prefixes
.replace(/^(tbl_|table_|tb_|t_)/, '')
// Remove common suffixes
.replace(/_(id|key|fk|pk|number|num|code|cd)$/, '')
// Remove common prefixes
.replace(/^(is_|has_|can_|should_)/, '')
// Normalize separators
.replace(/[_\s\-\.]+/g, '_')
// Remove trailing/leading underscores
.replace(/^_+|_+$/g, '')
.trim();
}
domainBasedSimilarity(name1, name2) {
for (const [domain, synonyms] of this.domainKnowledge) {
const inDomain1 = name1.includes(domain) || synonyms.some(s => name1.includes(s));
const inDomain2 = name2.includes(domain) || synonyms.some(s => name2.includes(s));
if (inDomain1 && inDomain2) {
return 0.85;
}
}
return 0;
}
patternBasedSimilarity(name1, name2) {
for (const [pattern, variations] of this.commonPatterns) {
const matches1 = name1.toLowerCase().includes(pattern) ||
variations.some(v => name1.toLowerCase().includes(v));
const matches2 = name2.toLowerCase().includes(pattern) ||
variations.some(v => name2.toLowerCase().includes(v));
if (matches1 && matches2) {
return 0.8;
}
}
return 0;
}
structuralSimilarity(name1, name2) {
// Compare structural patterns
const pattern1 = this.extractStructuralPattern(name1);
const pattern2 = this.extractStructuralPattern(name2);
if (pattern1 === pattern2) {
return 0.7;
}
// Check for similar prefixes/suffixes
const commonPrefix = this.longestCommonPrefix(name1.toLowerCase(), name2.toLowerCase());
const commonSuffix = this.longestCommonSuffix(name1.toLowerCase(), name2.toLowerCase());
const totalLength = Math.max(name1.length, name2.length);
const similarity = (commonPrefix.length + commonSuffix.length) / totalLength;
return Math.min(similarity * 0.6, 0.6); // Cap at 0.6 for structural similarity
}
fuzzyStringSimilarity(str1, str2) {
const distance = this.levenshteinDistance(str1, str2);
const maxLength = Math.max(str1.length, str2.length);
if (maxLength === 0)
return 1;
const similarity = 1 - (distance / maxLength);
// Reduced threshold from 0.3 to 0.15 to be more lenient
return Math.max(similarity - 0.15, 0);
}
calculateDetailedStats(data) {
const nonNullData = data.filter(d => d != null);
const stringData = nonNullData.filter(d => typeof d === 'string');
const numericData = nonNullData.filter(d => typeof d === 'number' || !isNaN(Number(d)));
return {
total: data.length,
nonNull: nonNullData.length,
unique: new Set(nonNullData).size,
// String statistics
avgLength: stringData.length > 0 ?
stringData.reduce((sum, s) => sum + String(s).length, 0) / stringData.length : 0,
maxLength: stringData.length > 0 ?
Math.max(...stringData.map(s => String(s).length)) : 0,
minLength: stringData.length > 0 ?
Math.min(...stringData.map(s => String(s).length)) : 0,
// Numeric statistics
min: numericData.length > 0 ? Math.min(...numericData.map(Number)) : null,
max: numericData.length > 0 ? Math.max(...numericData.map(Number)) : null,
mean: numericData.length > 0 ?
numericData.reduce((sum, n) => sum + Number(n), 0) / numericData.length : null,
// Patterns
patterns: this.extractDataPatterns(nonNullData),
types: this.analyzeDataTypes(nonNullData),
// Distribution
distribution: this.calculateDistribution(nonNullData)
};
}
compareStatisticalProperties(stats1, stats2) {
let score = 0;
let factors = 0;
// Compare ranges for numeric data
if (stats1.min !== null && stats2.min !== null) {
const range1 = stats1.max - stats1.min;
const range2 = stats2.max - stats2.min;
const overlap = Math.max(0, Math.min(stats1.max, stats2.max) - Math.max(stats1.min, stats2.min));
if (range1 > 0 && range2 > 0) {
score += overlap / Math.max(range1, range2);
factors++;
}
}
// Compare string lengths
if (stats1.avgLength > 0 && stats2.avgLength > 0) {
const lengthSim = 1 - Math.abs(stats1.avgLength - stats2.avgLength) / Math.max(stats1.avgLength, stats2.avgLength);
score += lengthSim;
factors++;
}
// Compare uniqueness ratios
const uniqueRatio1 = stats1.unique / stats1.nonNull;
const uniqueRatio2 = stats2.unique / stats2.nonNull;
const uniqueSim = 1 - Math.abs(uniqueRatio1 - uniqueRatio2);
score += uniqueSim;
factors++;
return factors > 0 ? score / factors : 0;
}
compareStructuralProperties(stats1, stats2) {
let score = 0;
let factors = 0;
// Compare data type distributions
const typeOverlap = this.calculateTypeOverlap(stats1.types, stats2.types);
score += typeOverlap;
factors++;
// Compare pattern similarities
const patternOverlap = this.calculatePatternOverlap(stats1.patterns, stats2.patterns);
score += patternOverlap;
factors++;
return factors > 0 ? score / factors : 0;
}
compareDomainProperties(stats1, stats2) {
// Compare domain-specific patterns (emails, phones, dates, etc.)
const domainScore = this.calculateDomainSpecificSimilarity(stats1.patterns, stats2.patterns);
return domainScore;
}
compareSemanticProperties(stats1, stats2) {
// This would integrate with NLP libraries for semantic analysis
// For now, return a basic score based on pattern matching
return this.calculatePatternOverlap(stats1.patterns, stats2.patterns) * 0.5;
}
detectForeignKeysBetweenTables(table1, table2) {
const candidates = [];
for (const col1 of table1.schema) {
for (const col2 of table2.schema) {
const semantic = this.semanticSimilarity(col1.name, col2.name);
if (semantic >= 0.6) {
// Additional checks for FK likelihood
const isForeignKeyLikely = this.assessForeignKeyLikelihood(col1, col2, table1, table2);
if (isForeignKeyLikely) {
candidates.push({
table: table1.tableName,
column: col1.name,
referencedTable: table2.tableName,
referencedColumn: col2.name,
confidence: semantic * isForeignKeyLikely,
matchingRows: 0, // Would be calculated from actual data
totalRows: table1.rowCount,
violations: 0
});
}
}
}
}
return candidates;
}
assessForeignKeyLikelihood(col1, col2, table1, table2) {
let likelihood = 0.5; // Base likelihood
// FK columns are often non-unique in the referencing table
if (!col1.unique && col2.unique) {
likelihood += 0.3;
}
// FK columns typically have high cardinality but not 100% unique
const cardinality1 = col1.distinctCount / table1.rowCount;
if (cardinality1 > 0.1 && cardinality1 < 0.9) {
likelihood += 0.2;
}
// Same data types
if (col1.type === col2.type) {
likelihood += 0.2;
}
// Naming patterns suggesting relationships
if (col1.name.toLowerCase().includes(table2.tableName.toLowerCase()) ||
col1.name.toLowerCase().includes('id')) {
likelihood += 0.3;
}
return Math.min(likelihood, 1.0);
}
hasCommonPatterns(col1, col2) {
const patterns1 = new Set(col1.patterns);
const patterns2 = new Set(col2.patterns);
const intersection = new Set([...patterns1].filter(p => patterns2.has(p)));
return intersection.size > 0;
}
isNumericOrDate(type) {
return [types_1.DataType.INTEGER, types_1.DataType.FLOAT, types_1.DataType.DATE, types_1.DataType.DATETIME].includes(type);
}
extractStructuralPattern(name) {
return name.replace(/[a-zA-Z]/g, 'X').replace(/[0-9]/g, 'N').replace(/[^XN]/g, '_');
}
longestCommonPrefix(str1, str2) {
let i = 0;
while (i < Math.min(str1.length, str2.length) && str1[i] === str2[i]) {
i++;
}
return str1.substring(0, i);
}
longestCommonSuffix(str1, str2) {
let i = 0;
while (i < Math.min(str1.length, str2.length) &&
str1[str1.length - 1 - i] === str2[str2.length - 1 - i]) {
i++;
}
return str1.substring(str1.length - i);
}
levenshteinDistance(str1, str2) {
const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null));
for (let i = 0; i <= str1.length; i++)
matrix[0][i] = i;
for (let j = 0; j <= str2.length; j++)
matrix[j][0] = j;
for (let j = 1; j <= str2.length; j++) {
for (let i = 1; i <= str1.length; i++) {
const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
matrix[j][i] = Math.min(matrix[j][i - 1] + 1, matrix[j - 1][i] + 1, matrix[j - 1][i - 1] + indicator);
}
}
return matrix[str2.length][str1.length];
}
extractDataPatterns(data) {
const patterns = [];
const sample = data.slice(0, 100);
// Email pattern
if (sample.some(d => /\S+@\S+\.\S+/.test(String(d)))) {
patterns.push('email');
}
// Phone pattern
if (sample.some(d => /^\+?[\d\s\-\(\)]+$/.test(String(d)))) {
patterns.push('phone');
}
// UUID pattern
if (sample.some(d => /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(String(d)))) {
patterns.push('uuid');
}
// URL pattern
if (sample.some(d => /^https?:\/\//.test(String(d)))) {
patterns.push('url');
}
// Date patterns
if (sample.some(d => /^\d{4}-\d{2}-\d{2}$/.test(String(d)))) {
patterns.push('date_iso');
}
if (sample.some(d => /^\d{2}\/\d{2}\/\d{4}$/.test(String(d)))) {
patterns.push('date_us');
}
return patterns;
}
analyzeDataTypes(data) {
const types = {};
for (const item of data) {
const type = typeof item;
types[type] = (types[type] || 0) + 1;
}
return types;
}
calculateDistribution(data) {
const freq = {};
for (const item of data) {
const key = String(item);
freq[key] = (freq[key] || 0) + 1;
}
return {
uniqueValues: Object.keys(freq).length,
mostFrequent: Object.entries(freq).sort(([, a], [, b]) => b - a)[0],
distribution: freq
};
}
calculateTypeOverlap(types1, types2) {
const keys1 = new Set(Object.keys(types1));
const keys2 = new Set(Object.keys(types2));
const intersection = new Set([...keys1].filter(k => keys2.has(k)));
const union = new Set([...keys1, ...keys2]);
return intersection.size / union.size;
}
calculatePatternOverlap(patterns1, patterns2) {
const set1 = new Set(patterns1);
const set2 = new Set(patterns2);
const intersection = new Set([...set1].filter(p => set2.has(p)));
const union = new Set([...set1, ...set2]);
return union.size > 0 ? intersection.size / union.size : 0;
}
calculateDomainSpecificSimilarity(patterns1, patterns2) {
const domainPatterns = ['email', 'phone', 'uuid', 'url', 'date_iso', 'date_us'];
const domain1 = patterns1.filter(p => domainPatterns.includes(p));
const domain2 = patterns2.filter(p => domainPatterns.includes(p));
if (domain1.length === 0 && domain2.length === 0)
return 0;
const commonDomain = domain1.filter(p => domain2.includes(p));
return commonDomain.length / Math.max(domain1.length, domain2.length);
}
}
exports.IntelligentColumnMatcher = IntelligentColumnMatcher;
//# sourceMappingURL=column-matcher.js.map