UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

450 lines 22.6 kB
"use strict";
/**
 * Jest suite for SemanticJoinOperator (CommonJS build output).
 *
 * Every test drives `joinOperator.semanticJoin(left, right, options)` and
 * asserts on the returned `statistics`, `matches` and `performance` fields.
 * Inputs are either column-oriented objects ({ column: [values...] }) or,
 * in the last group, arrays of row objects.
 */
Object.defineProperty(exports, "__esModule", { value: true });
const { describe, it, expect, beforeEach } = require("@jest/globals");
const { SemanticJoinOperator } = require("./semantic-join");
const { CIDRegistry } = require("../registry/cid-registry");
const { ShadowSemanticsLayer } = require("../core/shadow-semantics");
const { StatisticalAnalyzer } = require("../inference/statistical-analyzer");

describe('SemanticJoinOperator', () => {
    let joinOperator;
    let cidRegistry;
    let semanticsLayer;
    let statisticalAnalyzer;

    // Build a fresh operator and registry for every test so no state is shared between cases.
    beforeEach(() => {
        cidRegistry = new CIDRegistry();
        semanticsLayer = new ShadowSemanticsLayer();
        statisticalAnalyzer = new StatisticalAnalyzer();
        joinOperator = new SemanticJoinOperator(cidRegistry, semanticsLayer, statisticalAnalyzer);

        // Register basic CID concepts for testing
        cidRegistry.registerPack({
            pack: 'test-pack',
            version: '1.0.0',
            description: 'Test concepts',
            concepts: [
                {
                    cid: 'person.email',
                    labels: ['email', 'email_address', 'user_email'],
                    description: 'Email address',
                    facets: { pii: true },
                    examples: ['user@example.com']
                },
                {
                    cid: 'person.phone',
                    labels: ['phone', 'phone_number', 'mobile'],
                    description: 'Phone number',
                    facets: { pii: true },
                    examples: ['+1-555-123-4567']
                },
                {
                    cid: 'person.name',
                    labels: ['name', 'full_name', 'customer_name'],
                    description: 'Person name',
                    facets: { pii: true },
                    examples: ['John Doe']
                },
                {
                    cid: 'identifier.customer_id',
                    labels: ['customer_id', 'cust_id', 'user_id'],
                    description: 'Customer identifier',
                    facets: { identifier: true },
                    examples: ['CUST-12345']
                }
            ]
        });
    });

    // Plain key joins: inner / left / outer row accounting.
    describe('Basic Join Functionality', () => {
        it('should perform exact match join on simple data', async () => {
            const leftData = {
                id: [1, 2, 3, 4],
                email: ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com']
            };
            const rightData = {
                user_id: [2, 3, 4, 5],
                name: ['Bob Smith', 'Charlie Brown', 'David Wilson', 'Eve Davis']
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'id',
                rightOn: 'user_id',
                how: 'inner',
                confidenceThreshold: 0.5
            });

            expect(result.data).toBeDefined();
            expect(result.statistics.outputRows).toBe(3); // Should match IDs 2, 3, 4
            expect(result.statistics.matchedRows).toBe(3);
            expect(result.matches).toHaveLength(3);
            expect(result.performance.totalTime).toBeGreaterThan(0);
        });

        it('should handle left join with unmatched rows', async () => {
            const leftData = {
                id: [1, 2, 3],
                name: ['Alice', 'Bob', 'Charlie']
            };
            const rightData = {
                user_id: [2, 4],
                score: [85, 92]
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'id',
                rightOn: 'user_id',
                how: 'left',
                confidenceThreshold: 0.5
            });

            expect(result.statistics.outputRows).toBe(3); // All left rows preserved
            expect(result.statistics.matchedRows).toBe(1); // Only ID 2 matches
        });

        it('should handle outer join with all unmatched rows', async () => {
            const leftData = {
                id: [1, 2],
                name: ['Alice', 'Bob']
            };
            const rightData = {
                user_id: [3, 4],
                score: [85, 92]
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'id',
                rightOn: 'user_id',
                how: 'outer',
                confidenceThreshold: 0.5
            });

            expect(result.statistics.outputRows).toBe(4); // 2 left + 2 right
            expect(result.statistics.matchedRows).toBe(0); // No matches
        });
    });

    // Joins where values differ in formatting and are expected to match via normalization or fuzzy matching.
    describe('Semantic Type Matching', () => {
        it('should perform semantic join on email addresses with normalization', async () => {
            const leftData = {
                customer_email: ['ALICE@EXAMPLE.COM', 'Bob@Example.Com', 'charlie@example.com'],
                order_id: ['ORD-001', 'ORD-002', 'ORD-003']
            };
            const rightData = {
                user_email: ['alice@example.com', 'bob@example.com', 'eve@example.com'],
                user_name: ['Alice Smith', 'Bob Jones', 'Eve Davis']
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'customer_email',
                rightOn: 'user_email',
                how: 'inner',
                confidenceThreshold: 0.5,
                autoSelectNormalizers: true
            });

            expect(result.statistics.matchedRows).toBe(2); // Alice and Bob should match after normalization
            expect(result.matches.some((m) => m.matchType === 'normalized')).toBe(true);
            expect(result.statistics.confidence.average).toBeGreaterThan(0.5);
        });

        it('should handle phone number normalization', async () => {
            const leftData = {
                phone: ['+1-555-123-4567', '555.234.5678', '(555) 345-6789'],
                customer_id: ['C001', 'C002', 'C003']
            };
            const rightData = {
                mobile: ['15551234567', '15552345678', '15556789012'],
                region: ['West', 'East', 'South']
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'phone',
                rightOn: 'mobile',
                how: 'inner',
                confidenceThreshold: 0.5,
                autoSelectNormalizers: true
            });

            expect(result.statistics.matchedRows).toBe(2); // First two should match after phone normalization
            expect(result.matches.some((m) => m.matchType === 'normalized')).toBe(true);
        });

        it('should handle name matching with fuzzy logic', async () => {
            const leftData = {
                customer_name: ['John Doe', 'Jane Smith', 'Robert Johnson'],
                account_id: ['A001', 'A002', 'A003']
            };
            const rightData = {
                full_name: ['John D.', 'Jane Smyth', 'Bob Johnson'],
                credit_score: [750, 680, 720]
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'customer_name',
                rightOn: 'full_name',
                how: 'inner',
                confidenceThreshold: 0.6,
                enableFuzzyMatching: true,
                fuzzyThreshold: 0.7,
                autoSelectNormalizers: true
            });

            expect(result.statistics.matchedRows).toBeGreaterThan(0);
            expect(result.matches.some((m) => m.matchType === 'fuzzy')).toBe(true);
        });
    });

    // Composite keys: leftOn / rightOn are passed as parallel arrays of column names.
    describe('Multi-Column Joins', () => {
        it('should perform multi-column semantic join', async () => {
            const leftData = {
                first_name: ['John', 'Jane', 'Bob'],
                last_name: ['Doe', 'Smith', 'Johnson'],
                birth_year: [1985, 1990, 1975]
            };
            const rightData = {
                fname: ['John', 'Jane', 'Robert'],
                lname: ['Doe', 'Smith', 'Johnson'],
                year_born: [1985, 1990, 1975]
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: ['first_name', 'last_name'],
                rightOn: ['fname', 'lname'],
                how: 'inner',
                confidenceThreshold: 0.7,
                autoSelectNormalizers: true
            });

            expect(result.statistics.matchedRows).toBe(2); // John Doe and Jane Smith should match
            expect(result.statistics.confidence.average).toBeGreaterThan(0.7);
        });

        it('should handle mixed data types in multi-column join', async () => {
            const leftData = {
                customer_id: ['C001', 'C002', 'C003'],
                order_date: ['2023-01-15', '2023-02-20', '2023-03-10'],
                amount: [100.50, 250.00, 75.25]
            };
            const rightData = {
                cust_id: ['C001', 'C002', 'C004'],
                purchase_date: ['2023-01-15', '2023-02-20', '2023-04-05'],
                total: [100.5, 250, 125.75]
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: ['customer_id', 'order_date'],
                rightOn: ['cust_id', 'purchase_date'],
                how: 'inner',
                confidenceThreshold: 0.8
            });

            expect(result.statistics.matchedRows).toBe(2);
        });
    });

    describe('Performance and Optimization', () => {
        it('should handle large datasets efficiently', async () => {
            // Generate larger test datasets
            const leftSize = 10000;
            const rightSize = 8000;
            const overlapSize = 5000;

            const leftData = {
                id: Array.from({ length: leftSize }, (_, i) => i + 1),
                email: Array.from({ length: leftSize }, (_, i) => `user${i + 1}@example.com`)
            };
            const rightData = {
                // The first `overlapSize` ids are copied from the left side; the remainder
                // start above `leftSize` so they cannot collide with any left id. Left ids
                // span 1..leftSize, so a tail generated inside that range would add
                // unintended matches and break the `overlapSize` assertion below.
                user_id: Array.from({ length: rightSize }, (_, i) => (i < overlapSize ? leftData.id[i] : leftSize + 1 + i)),
                name: Array.from({ length: rightSize }, (_, i) => `User ${i + 1000}`)
            };

            const startTime = performance.now();
            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'id',
                rightOn: 'user_id',
                how: 'inner',
                confidenceThreshold: 0.5,
                batchSize: 5000
            });
            const endTime = performance.now();

            expect(result.statistics.matchedRows).toBe(overlapSize);
            expect(endTime - startTime).toBeLessThan(5000); // Should complete within 5 seconds
            expect(result.performance.totalTime).toBeGreaterThan(0);
            expect(result.performance.cacheHits).toBeGreaterThanOrEqual(0);
        });

        it('should cache normalized values effectively', async () => {
            const leftData = {
                // 'user1@EXAMPLE.COM' is repeated on purpose to produce a cache hit.
                email: ['user1@EXAMPLE.COM', 'USER2@example.com', 'User3@Example.Com', 'user1@EXAMPLE.COM'],
                id: [1, 2, 3, 1]
            };
            const rightData = {
                user_email: ['user1@example.com', 'user2@example.com', 'user4@example.com'],
                score: [85, 92, 78]
            };

            // Only the cache side effect is inspected, so the join result itself is not kept.
            await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'email',
                rightOn: 'user_email',
                how: 'inner',
                cacheNormalizedValues: true
            });

            const cacheStats = joinOperator.getCacheStats();
            expect(cacheStats.hits).toBeGreaterThan(0); // Should have cache hits for repeated values
            expect(cacheStats.hitRate).toBeGreaterThan(0);
        });
    });

    describe('Confidence Scoring', () => {
        it('should provide detailed confidence metrics', async () => {
            const leftData = {
                customer_email: ['alice@example.com', 'bob@company.com', 'charlie@test.org'],
                customer_id: ['C001', 'C002', 'C003']
            };
            const rightData = {
                user_email: ['alice@example.com', 'robert@company.com', 'charles@test.org'],
                user_score: [95, 87, 76]
            };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'customer_email',
                rightOn: 'user_email',
                how: 'inner',
                confidenceThreshold: 0.5,
                enableFuzzyMatching: true
            });

            const { confidence } = result.statistics;
            expect(confidence).toBeDefined();
            expect(confidence.average).toBeGreaterThan(0);
            expect(confidence.median).toBeGreaterThan(0);
            expect(confidence.distribution).toBeDefined();

            // The distribution buckets must account for every matched row.
            const totalDistribution = Object.values(confidence.distribution).reduce((a, b) => a + b, 0);
            expect(totalDistribution).toBe(result.statistics.matchedRows);
        });

        it('should handle low confidence matches appropriately', async () => {
            const leftData = {
                name: ['John Smith', 'Jane Doe', 'Bob Johnson'],
                id: [1, 2, 3]
            };
            const rightData = {
                full_name: ['Johnny Smithers', 'Janet Doe-Wilson', 'Robert Johns'],
                score: [85, 92, 78]
            };

            const highThresholdResult = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'name',
                rightOn: 'full_name',
                how: 'inner',
                confidenceThreshold: 0.9, // Very high threshold
                enableFuzzyMatching: true
            });
            const lowThresholdResult = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'name',
                rightOn: 'full_name',
                how: 'inner',
                confidenceThreshold: 0.3, // Low threshold
                enableFuzzyMatching: true
            });

            // Lowering the threshold must never reduce the number of accepted matches.
            expect(lowThresholdResult.statistics.matchedRows)
                .toBeGreaterThanOrEqual(highThresholdResult.statistics.matchedRows);
        });
    });

    describe('Error Handling', () => {
        it('should throw error for missing join columns', async () => {
            const leftData = { id: [1, 2, 3] };
            const rightData = { user_id: [1, 2, 3] };

            await expect(joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'missing_column',
                rightOn: 'user_id'
            })).rejects.toThrow('Column \'missing_column\' not found');
        });

        it('should throw error for mismatched join column counts', async () => {
            const leftData = { id: [1, 2, 3], name: ['a', 'b', 'c'] };
            const rightData = { user_id: [1, 2, 3] };

            await expect(joinOperator.semanticJoin(leftData, rightData, {
                leftOn: ['id', 'name'],
                rightOn: ['user_id'] // Only one column
            })).rejects.toThrow('Number of left and right join columns must match');
        });

        it('should handle empty datasets gracefully', async () => {
            const leftData = { id: [], name: [] };
            const rightData = { user_id: [1, 2, 3], score: [85, 92, 78] };

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'id',
                rightOn: 'user_id',
                how: 'inner'
            });

            expect(result.statistics.outputRows).toBe(0);
            expect(result.statistics.matchedRows).toBe(0);
        });
    });

    describe('Real-world Scenarios', () => {
        it('should handle customer data integration scenario', async () => {
            // Scenario: Joining CRM customer data with transaction data
            const crmData = {
                customer_id: ['CUST-001', 'CUST-002', 'CUST-003', 'CUST-004'],
                email: ['alice@company.com', 'bob@startup.io', 'charlie@corp.net', 'diana@firm.org'],
                full_name: ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince'],
                registration_date: ['2023-01-15', '2023-02-20', '2023-03-10', '2023-04-05']
            };
            const transactionData = {
                cust_id: ['CUST-001', 'CUST-002', 'CUST-005', 'CUST-001'],
                purchase_amount: [150.00, 299.99, 89.50, 75.25],
                purchase_date: ['2023-01-20', '2023-02-25', '2023-04-10', '2023-01-25'],
                product_category: ['Electronics', 'Books', 'Clothing', 'Electronics']
            };

            const result = await joinOperator.semanticJoin(crmData, transactionData, {
                leftOn: 'customer_id',
                rightOn: 'cust_id',
                how: 'left', // Keep all customers, even without transactions
                confidenceThreshold: 0.8
            });

            expect(result.statistics.inputRowsLeft).toBe(4);
            expect(result.statistics.inputRowsRight).toBe(4);
            // NOTE(review): CUST-001 appears twice in transactionData. Expecting 4 output
            // rows together with 3 matched rows presumably relies on the operator emitting
            // one output row per left row rather than one per matching pair; confirm
            // against SemanticJoinOperator's left-join semantics.
            expect(result.statistics.outputRows).toBe(4); // All customers preserved
            expect(result.statistics.matchedRows).toBe(3); // 3 matching transactions
        });

        it('should handle product catalog integration', async () => {
            // Scenario: Joining product catalog with inventory data
            const catalogData = {
                sku: ['SKU-001', 'SKU-002', 'SKU-003'],
                product_name: ['Wireless Headphones', 'Smart Watch', 'Bluetooth Speaker'],
                category: ['Electronics', 'Wearables', 'Audio'],
                price: [99.99, 299.99, 79.99]
            };
            const inventoryData = {
                product_code: ['SKU-001', 'SKU-002', 'SKU-004'],
                warehouse_location: ['WH-A', 'WH-B', 'WH-C'],
                stock_quantity: [150, 75, 200],
                last_restocked: ['2023-03-01', '2023-03-15', '2023-03-20']
            };

            const result = await joinOperator.semanticJoin(catalogData, inventoryData, {
                leftOn: 'sku',
                rightOn: 'product_code',
                how: 'outer', // Show all products and inventory items
                confidenceThreshold: 0.9 // High confidence for exact product matching
            });

            expect(result.statistics.matchedRows).toBe(2); // SKU-001 and SKU-002 match
            expect(result.statistics.outputRows).toBe(4); // 3 catalog + 1 unmatched inventory
        });

        it('should handle employee data deduplication scenario', async () => {
            // Scenario: Deduplicating employee records from different systems
            const hrSystemData = {
                emp_id: ['EMP001', 'EMP002', 'EMP003'],
                employee_name: ['John A. Doe', 'Jane M. Smith', 'Robert Johnson'],
                email: ['john.doe@company.com', 'j.smith@company.com', 'bob.johnson@company.com'],
                department: ['Engineering', 'Marketing', 'Sales']
            };
            const payrollSystemData = {
                employee_id: ['PAY001', 'PAY002', 'PAY003'],
                full_name: ['John Doe', 'Jane Smith', 'R. Johnson'],
                work_email: ['john.doe@company.com', 'jane.smith@company.com', 'robert.j@company.com'],
                salary: [95000, 75000, 68000]
            };

            // Try matching on both name and email for better accuracy
            const result = await joinOperator.semanticJoin(hrSystemData, payrollSystemData, {
                leftOn: ['employee_name', 'email'],
                rightOn: ['full_name', 'work_email'],
                how: 'outer',
                confidenceThreshold: 0.6,
                enableFuzzyMatching: true,
                fuzzyThreshold: 0.7
            });

            expect(result.statistics.matchedRows).toBeGreaterThan(0);

            // Should identify John Doe as definite match (same email)
            const johnDoeMatch = result.matches.find((m) => m.confidence > 0.8 && m.matchType === 'exact');
            expect(johnDoeMatch).toBeDefined();
        });
    });

    // Row-oriented input: arrays of objects instead of column-oriented objects.
    describe('DataFrame Integration', () => {
        it('should work with array of objects format', async () => {
            const leftData = [
                { id: 1, name: 'Alice', email: 'alice@example.com' },
                { id: 2, name: 'Bob', email: 'bob@example.com' },
                { id: 3, name: 'Charlie', email: 'charlie@example.com' }
            ];
            const rightData = [
                { user_id: 1, score: 85, region: 'West' },
                { user_id: 2, score: 92, region: 'East' },
                { user_id: 4, score: 78, region: 'South' }
            ];

            const result = await joinOperator.semanticJoin(leftData, rightData, {
                leftOn: 'id',
                rightOn: 'user_id',
                how: 'inner'
            });

            expect(result.statistics.matchedRows).toBe(2); // IDs 1 and 2 match
        });
    });
});
//# sourceMappingURL=semantic-join.test.js.map