semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
413 lines • 19.1 kB
JavaScript
"use strict";
/**
* Semantic Join Examples and Usage Patterns
*
* This file demonstrates how to use the semantic join system for various
* real-world data integration scenarios.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.customerDataIntegrationExample = customerDataIntegrationExample;
exports.productCatalogSyncExample = productCatalogSyncExample;
exports.employeeDeduplicationExample = employeeDeduplicationExample;
exports.highPerformanceJoinExample = highPerformanceJoinExample;
exports.multiColumnSemanticJoinExample = multiColumnSemanticJoinExample;
exports.runAllExamples = runAllExamples;
const cid_registry_1 = require("../registry/cid-registry");
const index_1 = require("./index");
/**
 * Example 1: Customer Data Integration
 * Scenario: Joining CRM customer data with transaction records
 *
 * Registers a customer-domain concept pack, obtains a join operator from
 * SemanticJoinFactory.createOptimized, and left-joins sample CRM rows
 * (`customer_id`) with sample transaction rows (`cust_id`). The join result
 * is handed to SemanticJoinMetrics and summarized on the console, followed by
 * a check of the reported total time against
 * PERFORMANCE_TARGETS.MAX_TIME_100K_ROWS scaled by the number of input rows.
 *
 * @returns {Promise<void>} Resolves once the summary has been printed.
 */
async function customerDataIntegrationExample() {
    // Concepts describing the customer columns that appear in the sample data.
    const customerConcepts = [
        {
            cid: 'customer.identifier',
            labels: ['customer_id', 'cust_id', 'client_id', 'account_id'],
            description: 'Customer identifier',
            facets: { identifier: true },
            examples: ['CUST-12345', 'C001', 'ACC-98765']
        },
        {
            cid: 'customer.email',
            labels: ['email', 'email_address', 'customer_email', 'contact_email'],
            description: 'Customer email address',
            facets: { pii: true },
            examples: ['customer@example.com']
        },
        {
            cid: 'customer.name',
            labels: ['name', 'customer_name', 'full_name', 'client_name'],
            description: 'Customer full name',
            facets: { pii: true },
            examples: ['John Smith']
        }
    ];
    const registry = new cid_registry_1.CIDRegistry();
    registry.registerPack({
        pack: 'customer-domain',
        version: '1.0.0',
        description: 'Customer management concepts',
        concepts: customerConcepts
    });
    const operator = index_1.SemanticJoinFactory.createOptimized(registry, {
        enableHighAccuracy: true
    });
    // CRM side of the join (one row per customer).
    const crmRecords = {
        customer_id: ['CUST-001', 'CUST-002', 'CUST-003', 'CUST-004'],
        email: ['alice@company.com', 'bob@startup.io', 'charlie@corp.net', 'diana@firm.org'],
        full_name: ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince'],
        registration_date: ['2023-01-15', '2023-02-20', '2023-03-10', '2023-04-05'],
        customer_segment: ['Premium', 'Standard', 'Premium', 'Enterprise']
    };
    // Transaction side of the join (possibly from a different system).
    const transactionRecords = {
        cust_id: ['CUST-001', 'CUST-002', 'CUST-005', 'CUST-001', 'CUST-003'],
        purchase_amount: [150.00, 299.99, 89.50, 75.25, 450.00],
        purchase_date: ['2023-01-20', '2023-02-25', '2023-04-10', '2023-01-25', '2023-03-15'],
        product_category: ['Electronics', 'Books', 'Clothing', 'Electronics', 'Software']
    };
    // Start from the factory defaults for customer matching; the explicit
    // keys below take precedence over any defaults with the same name.
    const presetOptions = index_1.SemanticJoinFactory.getDefaultOptions('customer_matching');
    const joinSettings = {
        ...presetOptions,
        leftOn: 'customer_id',
        rightOn: 'cust_id',
        how: 'left' // Keep all customers, even without transactions
    };
    const joined = await operator.semanticJoin(crmRecords, transactionRecords, joinSettings);
    index_1.SemanticJoinMetrics.recordJoinPerformance('customer-integration-001', joined);
    const { statistics: stats, performance: timing } = joined;
    console.log('=== Customer Data Integration Results ===');
    console.log(`Input: ${stats.inputRowsLeft} customers, ${stats.inputRowsRight} transactions`);
    console.log(`Output: ${stats.outputRows} enriched customer records`);
    console.log(`Matches: ${stats.matchedRows} customers with transactions`);
    console.log(`Average confidence: ${(stats.confidence.average * 100).toFixed(1)}%`);
    console.log(`Execution time: ${timing.totalTime.toFixed(2)}ms`);
    // The target is expressed per 100K rows, so scale it to this input size.
    const totalInputRows = stats.inputRowsLeft + stats.inputRowsRight;
    const timeBudgetMs = index_1.PERFORMANCE_TARGETS.MAX_TIME_100K_ROWS * totalInputRows / 100000;
    const withinBudget = timing.totalTime <= timeBudgetMs;
    console.log(withinBudget ? '✓ Performance target met' : '⚠ Performance target missed');
}
/**
 * Example 2: Product Catalog Synchronization
 * Scenario: Joining product master data with inventory levels from warehouse system
 *
 * Registers a product-domain concept pack, constructs a SemanticJoinOperator
 * directly, and outer-joins the sample catalog (`sku`) with the sample
 * inventory (`product_code`) with fuzzy matching disabled. Prints a summary
 * and, when the catalog row count exceeds the number of exact matches, a
 * warning with the difference.
 *
 * @returns {Promise<void>} Resolves once the summary has been printed.
 */
async function productCatalogSyncExample() {
    const registry = new cid_registry_1.CIDRegistry();
    const productConcepts = [
        {
            cid: 'product.sku',
            labels: ['sku', 'product_code', 'item_code', 'part_number'],
            description: 'Product SKU identifier',
            facets: { identifier: true },
            examples: ['SKU-12345', 'PROD-001', 'ITM-98765']
        },
        {
            cid: 'product.name',
            labels: ['product_name', 'item_name', 'title', 'description'],
            description: 'Product name or title',
            facets: { categorical: true },
            examples: ['Wireless Headphones']
        }
    ];
    registry.registerPack({
        pack: 'product-domain',
        version: '1.0.0',
        description: 'Product catalog concepts',
        concepts: productConcepts
    });
    const operator = new index_1.SemanticJoinOperator(registry);
    // Product master data (left side).
    const masterCatalog = {
        sku: ['SKU-001', 'SKU-002', 'SKU-003', 'SKU-004'],
        product_name: ['Wireless Headphones', 'Smart Watch', 'Bluetooth Speaker', 'USB Charger'],
        category: ['Electronics', 'Wearables', 'Audio', 'Accessories'],
        price: [99.99, 299.99, 79.99, 19.99],
        brand: ['TechCorp', 'SmartTech', 'AudioPlus', 'TechCorp']
    };
    // Warehouse inventory data (right side).
    const warehouseInventory = {
        product_code: ['SKU-001', 'SKU-002', 'SKU-005', 'SKU-003'],
        warehouse_location: ['WH-North', 'WH-South', 'WH-East', 'WH-North'],
        stock_quantity: [150, 75, 200, 89],
        last_restocked: ['2023-03-01', '2023-03-15', '2023-03-20', '2023-02-28'],
        reorder_point: [50, 25, 100, 30]
    };
    const syncOptions = {
        leftOn: 'sku',
        rightOn: 'product_code',
        how: 'outer', // Show all products and all inventory items
        confidenceThreshold: 0.95, // High confidence for exact product matching
        enableFuzzyMatching: false // Exact matches only for product codes
    };
    const synced = await operator.semanticJoin(masterCatalog, warehouseInventory, syncOptions);
    const { statistics: stats, matches } = synced;
    console.log('\n=== Product Catalog Synchronization Results ===');
    console.log(`Catalog products: ${stats.inputRowsLeft}`);
    console.log(`Inventory items: ${stats.inputRowsRight}`);
    console.log(`Synchronized records: ${stats.outputRows}`);
    console.log(`Exact matches: ${stats.matchedRows}`);
    console.log(`Join accuracy: ${(stats.confidence.average * 100).toFixed(1)}%`);
    // Catalog rows minus exact matches is reported as products without inventory.
    const exactMatchCount = matches.reduce((total, match) => (match.matchType === 'exact' ? total + 1 : total), 0);
    const missingInventory = stats.inputRowsLeft - exactMatchCount;
    if (missingInventory > 0) {
        console.log(`⚠ ${missingInventory} products missing inventory data`);
    }
}
/**
 * Example 3: Employee Data Deduplication
 * Scenario: Finding duplicate employee records across HR and payroll systems
 *
 * Registers an employee-domain concept pack and outer-joins sample HR rows
 * with sample payroll rows on (name, email) column pairs, with fuzzy matching
 * and automatic normalizer selection enabled. Prints record counts, the
 * number of matches with confidence above 0.8, the number of matches whose
 * type is 'fuzzy', and every entry of the reported confidence distribution.
 *
 * @returns {Promise<void>} Resolves once the report has been printed.
 */
async function employeeDeduplicationExample() {
    const registry = new cid_registry_1.CIDRegistry();
    const employeeConcepts = [
        {
            cid: 'employee.email',
            labels: ['email', 'work_email', 'corporate_email'],
            description: 'Employee work email',
            facets: { pii: true, identifier: true },
            examples: ['john.doe@company.com']
        },
        {
            cid: 'employee.name',
            labels: ['name', 'full_name', 'employee_name'],
            description: 'Employee full name',
            facets: { pii: true },
            examples: ['John Doe']
        }
    ];
    registry.registerPack({
        pack: 'employee-domain',
        version: '1.0.0',
        description: 'Employee management concepts',
        concepts: employeeConcepts
    });
    const operator = new index_1.SemanticJoinOperator(registry);
    // Rows as held by the HR system.
    const hrRows = {
        emp_id: ['EMP001', 'EMP002', 'EMP003', 'EMP004'],
        employee_name: ['John A. Doe', 'Jane M. Smith', 'Robert Johnson', 'Sarah Wilson'],
        email: ['john.doe@company.com', 'j.smith@company.com', 'bob.johnson@company.com', 'sarah.w@company.com'],
        department: ['Engineering', 'Marketing', 'Sales', 'Engineering'],
        hire_date: ['2020-01-15', '2019-06-20', '2021-03-10', '2022-07-01']
    };
    // Rows as held by the payroll system (names/emails may vary from HR).
    const payrollRows = {
        employee_id: ['PAY001', 'PAY002', 'PAY003', 'PAY004', 'PAY005'],
        full_name: ['John Doe', 'Jane Smith', 'R. Johnson', 'Sara Wilson', 'Mike Brown'],
        work_email: ['john.doe@company.com', 'jane.smith@company.com', 'robert.j@company.com', 'sarah.wilson@company.com', 'mike.brown@company.com'],
        salary: [95000, 75000, 68000, 82000, 71000]
    };
    // Fuzzy matching is switched on to tolerate the name/email variations.
    const dedupeOptions = {
        leftOn: ['employee_name', 'email'],
        rightOn: ['full_name', 'work_email'],
        how: 'outer',
        confidenceThreshold: 0.6,
        enableFuzzyMatching: true,
        fuzzyThreshold: 0.7,
        autoSelectNormalizers: true
    };
    const merged = await operator.semanticJoin(hrRows, payrollRows, dedupeOptions);
    const isConfident = (match) => match.confidence > 0.8;
    const isFuzzy = (match) => match.matchType === 'fuzzy';
    console.log('\n=== Employee Deduplication Results ===');
    console.log(`HR records: ${merged.statistics.inputRowsLeft}`);
    console.log(`Payroll records: ${merged.statistics.inputRowsRight}`);
    console.log(`Total deduplicated view: ${merged.statistics.outputRows}`);
    console.log(`Confident matches: ${merged.matches.filter(isConfident).length}`);
    console.log(`Fuzzy matches: ${merged.matches.filter(isFuzzy).length}`);
    // One line per bucket of the confidence distribution.
    console.log('\nConfidence Distribution:');
    Object.entries(merged.statistics.confidence.distribution).forEach(([bucket, total]) => {
        console.log(` ${bucket}: ${total} matches`);
    });
}
/**
 * Example 4: High-Performance Large Dataset Join
 * Scenario: Processing 100K+ records efficiently
 *
 * Generates 100,000 left rows and 80,000 right rows in memory (the first
 * 60,000 right ids reuse left ids), asks
 * SemanticJoinFactory.analyzeAndSuggestJoinConfig for suggested options, runs
 * an inner join on `user_id` / `uid`, and prints timing, throughput, cache
 * hit rate and the scaled performance target before recording the result
 * with SemanticJoinMetrics.
 *
 * The synthetic calendar dates are built from UTC components. Building them
 * in local time and serializing with toISOString() (which is always UTC)
 * shifts every date back by one day in time zones ahead of UTC.
 *
 * @returns {Promise<void>} Resolves once the report has been printed and the
 *     metrics have been recorded.
 */
async function highPerformanceJoinExample() {
    const cidRegistry = new cid_registry_1.CIDRegistry();
    cidRegistry.registerPack({
        pack: 'performance-test',
        version: '1.0.0',
        description: 'Performance testing concepts',
        concepts: [
            {
                cid: 'user.id',
                labels: ['user_id', 'id', 'uid'],
                description: 'User identifier',
                facets: { identifier: true },
                examples: ['USER-12345']
            }
        ]
    });
    // Create high-performance optimized operator
    const joinOperator = index_1.SemanticJoinFactory.createOptimized(cidRegistry, {
        enableHighPerformance: true,
        enableLargeDatasets: true
    });
    console.log('\n=== High-Performance Large Dataset Join ===');
    console.log('Generating large test datasets...');
    // Generate large datasets
    const leftSize = 100000;
    const rightSize = 80000;
    const overlapSize = 60000;
    // Zero-padded id such as USER-000042.
    const formatUserId = (sequence) => `USER-${sequence.toString().padStart(6, '0')}`;
    // YYYY-MM-DD for January 1st of `year` plus `dayOffset` days, computed in
    // UTC so the result does not depend on the host time zone. Date.UTC
    // normalizes day values that overflow the month.
    const utcDay = (year, dayOffset) => new Date(Date.UTC(year, 0, 1 + dayOffset)).toISOString().split('T')[0];
    const leftData = {
        user_id: Array.from({ length: leftSize }, (_, i) => formatUserId(i + 1)),
        email: Array.from({ length: leftSize }, (_, i) => `user${i + 1}@domain${Math.floor(i / 1000)}.com`),
        registration_date: Array.from({ length: leftSize }, (_, i) => utcDay(2020, i % 1000))
    };
    const rightData = {
        // Create overlap by reusing the first `overlapSize` left ids; the
        // remaining ids are offset past the left range so they never match.
        uid: Array.from({ length: rightSize }, (_, i) => formatUserId(i < overlapSize ? i + 1 : leftSize + i + 1)),
        last_login: Array.from({ length: rightSize }, (_, i) => utcDay(2023, i % 365)),
        activity_score: Array.from({ length: rightSize }, () => Math.floor(Math.random() * 1000))
    };
    console.log(`Left dataset: ${leftSize.toLocaleString()} records`);
    console.log(`Right dataset: ${rightSize.toLocaleString()} records`);
    console.log(`Expected overlap: ${overlapSize.toLocaleString()} records`);
    // Analyze and get suggested configuration
    const analysis = await index_1.SemanticJoinFactory.analyzeAndSuggestJoinConfig(leftData, rightData, ['user_id'], ['uid'], cidRegistry);
    console.log('\nJoin Analysis:');
    console.log(`Configuration confidence: ${(analysis.confidence * 100).toFixed(1)}%`);
    console.log('Reasoning:', analysis.reasoning);
    if (analysis.warnings.length > 0) {
        console.log('Warnings:', analysis.warnings);
    }
    const startTime = performance.now();
    // Perform the join with suggested optimizations; the explicit keys below
    // take precedence over suggested options with the same name.
    const result = await joinOperator.semanticJoin(leftData, rightData, {
        ...analysis.suggestedOptions,
        leftOn: 'user_id',
        rightOn: 'uid',
        how: 'inner',
        confidenceThreshold: 0.9
    });
    const endTime = performance.now();
    console.log('\nPerformance Results:');
    console.log(`Total execution time: ${(endTime - startTime).toFixed(2)}ms`);
    console.log(`Join processing time: ${result.performance.totalTime.toFixed(2)}ms`);
    console.log(`Matched records: ${result.statistics.matchedRows.toLocaleString()}`);
    console.log(`Throughput: ${((leftSize + rightSize) / (result.performance.totalTime / 1000)).toFixed(0)} records/second`);
    const cacheStats = joinOperator.getCacheStats();
    console.log(`Cache hit rate: ${(cacheStats.hitRate * 100).toFixed(1)}%`);
    // Validate performance targets (target is per 100K rows, scaled to input size)
    const targetTime = index_1.PERFORMANCE_TARGETS.MAX_TIME_100K_ROWS * (leftSize + rightSize) / 100000;
    if (result.performance.totalTime <= targetTime) {
        console.log(`✓ Performance target met (${targetTime.toFixed(0)}ms target)`);
    }
    else {
        console.log(`⚠ Performance target missed (${targetTime.toFixed(0)}ms target)`);
    }
    index_1.SemanticJoinMetrics.recordJoinPerformance('large-dataset-001', result);
}
/**
 * Example 5: Multi-Column Semantic Join
 * Scenario: Joining on multiple related columns with different semantic types
 *
 * Registers first-name, last-name and birth-year concepts, then inner-joins
 * sample survey rows with sample customer rows on three column pairs with
 * fuzzy matching and automatic normalizer selection enabled. Prints summary
 * statistics followed by one line per entry of the returned matches.
 *
 * @returns {Promise<void>} Resolves once all lines have been printed.
 */
async function multiColumnSemanticJoinExample() {
    const registry = new cid_registry_1.CIDRegistry();
    const personConcepts = [
        {
            cid: 'person.first_name',
            labels: ['first_name', 'fname', 'given_name'],
            description: 'First name',
            facets: { pii: true },
            examples: ['John']
        },
        {
            cid: 'person.last_name',
            labels: ['last_name', 'lname', 'surname', 'family_name'],
            description: 'Last name',
            facets: { pii: true },
            examples: ['Smith']
        },
        {
            cid: 'temporal.birth_year',
            labels: ['birth_year', 'year_born', 'birth_date'],
            description: 'Birth year',
            facets: { temporal: true },
            examples: ['1985']
        }
    ];
    registry.registerPack({
        pack: 'multi-column-domain',
        version: '1.0.0',
        description: 'Multi-column join concepts',
        concepts: personConcepts
    });
    const operator = new index_1.SemanticJoinOperator(registry);
    // Dataset 1: survey responses.
    const surveyRows = {
        first_name: ['John', 'Jane', 'Bob', 'Alice', 'Charlie'],
        last_name: ['Smith', 'Doe', 'Johnson', 'Brown', 'Wilson'],
        birth_year: [1985, 1990, 1978, 1982, 1975],
        survey_score: [85, 92, 78, 88, 95]
    };
    // Dataset 2: customer records; some first names differ from the survey spelling.
    const customerRows = {
        fname: ['John', 'Jane', 'Robert', 'Alice', 'Charles'],
        lname: ['Smith', 'Doe', 'Johnson', 'Brown', 'Wilson'],
        year_born: [1985, 1990, 1978, 1982, 1975],
        customer_segment: ['Premium', 'Standard', 'Premium', 'Premium', 'Enterprise']
    };
    const joinSettings = {
        leftOn: ['first_name', 'last_name', 'birth_year'],
        rightOn: ['fname', 'lname', 'year_born'],
        how: 'inner',
        confidenceThreshold: 0.8,
        enableFuzzyMatching: true,
        autoSelectNormalizers: true
    };
    const joined = await operator.semanticJoin(surveyRows, customerRows, joinSettings);
    const { statistics: stats } = joined;
    console.log('\n=== Multi-Column Semantic Join Results ===');
    console.log(`Survey responses: ${stats.inputRowsLeft}`);
    console.log(`Customer records: ${stats.inputRowsRight}`);
    console.log(`Matched records: ${stats.matchedRows}`);
    console.log(`Match confidence: ${(stats.confidence.average * 100).toFixed(1)}%`);
    // One detail line per match, numbered from 1.
    for (const [position, match] of joined.matches.entries()) {
        const percent = (match.confidence * 100).toFixed(1);
        console.log(`Match ${position + 1}: Confidence ${percent}%, Type: ${match.matchType}`);
    }
}
/**
 * Run all examples
 *
 * Awaits each example one after another (the large-dataset example runs
 * last), then prints the aggregate report returned by
 * SemanticJoinMetrics.getPerformanceReport(). Any error thrown along the way
 * is logged with console.error and not rethrown, so the returned promise
 * resolves either way.
 *
 * @returns {Promise<void>} Resolves when the run has finished or failed.
 */
async function runAllExamples() {
    console.log('🚀 Starting Semantic Join Examples\n');
    try {
        // Execution order; each example finishes before the next one starts.
        const examplesInOrder = [
            customerDataIntegrationExample,
            productCatalogSyncExample,
            employeeDeduplicationExample,
            multiColumnSemanticJoinExample,
            highPerformanceJoinExample
        ];
        for (const runExample of examplesInOrder) {
            await runExample();
        }
        // Show overall performance report
        const report = index_1.SemanticJoinMetrics.getPerformanceReport();
        console.log('\n📊 Overall Performance Report');
        console.log(`Total joins executed: ${report.totalJoins}`);
        console.log(`Average throughput: ${report.averageThroughput.toFixed(0)} records/second`);
        console.log(`Average confidence: ${(report.averageConfidence * 100).toFixed(1)}%`);
        console.log(`Average execution time: ${report.performanceBreakdown.totalTime.toFixed(2)}ms`);
        console.log('\n✅ All examples completed successfully!');
    }
    catch (failure) {
        console.error('❌ Example execution failed:', failure);
    }
}
// Run every example when this file is executed directly rather than required
// as a module. runAllExamples() logs example failures itself; the handler
// below only guards against an unexpected rejection so the promise is never
// left floating, and marks the process as failed in that case.
if (require.main === module) {
    runAllExamples().catch((error) => {
        console.error('❌ Example execution failed:', error);
        process.exitCode = 1;
    });
}
//# sourceMappingURL=semantic-join-examples.js.map