UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

304 lines 16.4 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const globals_1 = require("@jest/globals"); const semantic_join_1 = require("./semantic-join"); const cid_registry_1 = require("../registry/cid-registry"); (0, globals_1.describe)('SemanticJoinOperator Performance Tests', () => { let joinOperator; let cidRegistry; (0, globals_1.beforeEach)(() => { cidRegistry = new cid_registry_1.CIDRegistry(); joinOperator = new semantic_join_1.SemanticJoinOperator(cidRegistry); // Register performance test concepts cidRegistry.registerPack({ pack: 'performance-test-pack', version: '1.0.0', description: 'Performance test concepts', concepts: [ { cid: 'identifier.user_id', labels: ['user_id', 'id', 'customer_id'], description: 'User identifier', facets: { identifier: true }, examples: ['USER-12345'] }, { cid: 'person.email', labels: ['email', 'email_address', 'user_email'], description: 'Email address', facets: { pii: true }, examples: ['user@example.com'] } ] }); }); (0, globals_1.describe)('Scalability Tests', () => { (0, globals_1.it)('should handle 100K rows in under 100ms for simple exact matches', async () => { const leftSize = 100000; const rightSize = 100000; const overlapSize = 50000; console.log(`Generating ${leftSize} left rows and ${rightSize} right rows...`); // Generate test data const leftData = generateTestData('left', leftSize, overlapSize); const rightData = generateTestData('right', rightSize, overlapSize); console.log('Starting performance test...'); const startTime = performance.now(); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'id', rightOn: 'user_id', how: 'inner', confidenceThreshold: 0.9, batchSize: 50000, cacheNormalizedValues: true, enableFuzzyMatching: false // Disable for pure performance test }); const endTime = performance.now(); const executionTime = endTime - startTime; console.log(`Execution time: ${executionTime.toFixed(2)}ms`); console.log(`Matched rows: ${result.statistics.matchedRows}`); console.log(`Cache hit rate: ${(joinOperator.getCacheStats().hitRate * 100).toFixed(1)}%`); // Performance assertions (0, globals_1.expect)(executionTime).toBeLessThan(200); // Allow some tolerance for CI environments (0, globals_1.expect)(result.statistics.matchedRows).toBe(overlapSize); (0, globals_1.expect)(result.performance.totalTime).toBeLessThan(200); // Memory efficiency check (0, globals_1.expect)(result.performance.cacheHits).toBeGreaterThan(0); }); (0, globals_1.it)('should maintain performance with fuzzy matching on 10K rows', async () => { const leftSize = 10000; const rightSize = 10000; const leftData = generateEmailTestData('left', leftSize); const rightData = generateEmailTestData('right', rightSize, 0.3); // 30% variation const startTime = performance.now(); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'email', rightOn: 'user_email', how: 'inner', confidenceThreshold: 0.6, enableFuzzyMatching: true, fuzzyThreshold: 0.8, autoSelectNormalizers: true, batchSize: 5000 }); const endTime = performance.now(); const executionTime = endTime - startTime; console.log(`Fuzzy matching execution time: ${executionTime.toFixed(2)}ms`); console.log(`Matched rows: ${result.statistics.matchedRows}`); console.log(`Average confidence: ${(result.statistics.confidence.average * 100).toFixed(1)}%`); // Should complete within reasonable time even with fuzzy matching (0, globals_1.expect)(executionTime).toBeLessThan(5000); (0, globals_1.expect)(result.statistics.matchedRows).toBeGreaterThan(0); }); (0, globals_1.it)('should handle multi-column joins efficiently', async () => { const dataSize = 25000; const leftData = { first_name: generateNames('first', dataSize), last_name: generateNames('last', dataSize), birth_year: generateYears(dataSize), id: Array.from({ length: dataSize }, (_, i) => i + 1) }; const rightData = { fname: leftData.first_name.slice(0, dataSize * 0.7), // 70% overlap lname: leftData.last_name.slice(0, dataSize * 0.7), year_born: leftData.birth_year.slice(0, dataSize * 0.7), score: Array.from({ length: dataSize * 0.7 }, () => Math.floor(Math.random() * 100) + 1) }; const startTime = performance.now(); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: ['first_name', 'last_name', 'birth_year'], rightOn: ['fname', 'lname', 'year_born'], how: 'inner', confidenceThreshold: 0.8, batchSize: 10000 }); const endTime = performance.now(); const executionTime = endTime - startTime; console.log(`Multi-column join execution time: ${executionTime.toFixed(2)}ms`); console.log(`Matched rows: ${result.statistics.matchedRows}`); (0, globals_1.expect)(executionTime).toBeLessThan(3000); (0, globals_1.expect)(result.statistics.matchedRows).toBeGreaterThan(0); }); }); (0, globals_1.describe)('Memory Efficiency Tests', () => { (0, globals_1.it)('should handle large datasets without excessive memory usage', async () => { const leftSize = 50000; const rightSize = 50000; const leftData = { id: Array.from({ length: leftSize }, (_, i) => `ID-${(i + 1).toString().padStart(6, '0')}`), email: Array.from({ length: leftSize }, (_, i) => `user${i + 1}@company${Math.floor(i / 1000)}.com`), name: Array.from({ length: leftSize }, (_, i) => `User ${i + 1}`), data: Array.from({ length: leftSize }, (_, i) => `Some long string data ${i} that takes up memory space`) }; const rightData = { user_id: leftData.id.slice(10000, 40000), // Subset with offset score: Array.from({ length: 30000 }, () => Math.floor(Math.random() * 1000)), metadata: Array.from({ length: 30000 }, (_, i) => `Metadata ${i} with additional information`) }; // Monitor memory usage (simplified) const initialMemory = process.memoryUsage(); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'id', rightOn: 'user_id', how: 'inner', batchSize: 25000, // Large batches for efficiency cacheNormalizedValues: true }); const finalMemory = process.memoryUsage(); const memoryIncrease = finalMemory.heapUsed - initialMemory.heapUsed; console.log(`Memory increase: ${(memoryIncrease / 1024 / 1024).toFixed(2)} MB`); console.log(`Cache hit rate: ${(joinOperator.getCacheStats().hitRate * 100).toFixed(1)}%`); (0, globals_1.expect)(result.statistics.matchedRows).toBe(30000); // Memory increase should be reasonable (less than 100MB for this test) (0, globals_1.expect)(memoryIncrease).toBeLessThan(100 * 1024 * 1024); }); (0, globals_1.it)('should benefit from normalization caching', async () => { const dataSize = 20000; const duplicateRate = 0.3; // 30% duplicate values const emailVariations = [ '@gmail.com', '@GMAIL.COM', '@Gmail.Com', '@yahoo.com', '@YAHOO.COM', '@Yahoo.Com', '@outlook.com', '@OUTLOOK.COM', '@Outlook.Com' ]; const leftData = { email: Array.from({ length: dataSize }, (_, i) => { const baseIndex = Math.floor(i * (1 - duplicateRate)); const domain = emailVariations[baseIndex % emailVariations.length]; return `user${baseIndex}${domain}`; }), id: Array.from({ length: dataSize }, (_, i) => i + 1) }; const rightData = { user_email: leftData.email.slice(5000, 15000), // Subset for joining score: Array.from({ length: 10000 }, () => Math.floor(Math.random() * 100)) }; // First run to warm up cache await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'email', rightOn: 'user_email', how: 'inner', cacheNormalizedValues: true, autoSelectNormalizers: true }); const cacheStatsAfterWarmup = joinOperator.getCacheStats(); // Second run should benefit from cache const startTime = performance.now(); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'email', rightOn: 'user_email', how: 'inner', cacheNormalizedValues: true, autoSelectNormalizers: true }); const endTime = performance.now(); const cachedRunTime = endTime - startTime; const finalCacheStats = joinOperator.getCacheStats(); console.log(`Cached run time: ${cachedRunTime.toFixed(2)}ms`); console.log(`Cache hit rate: ${(finalCacheStats.hitRate * 100).toFixed(1)}%`); console.log(`Cache hits gained: ${finalCacheStats.hits - cacheStatsAfterWarmup.hits}`); (0, globals_1.expect)(finalCacheStats.hitRate).toBeGreaterThan(0.5); // Should have good cache utilization (0, globals_1.expect)(finalCacheStats.hits).toBeGreaterThan(cacheStatsAfterWarmup.hits); }); }); (0, globals_1.describe)('Batching and Parallelization Tests', () => { (0, globals_1.it)('should process data in batches for large datasets', async () => { const leftSize = 75000; const rightSize = 75000; const batchSize = 15000; const leftData = generateTestData('left', leftSize, leftSize * 0.6); const rightData = generateTestData('right', rightSize, rightSize * 0.6); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'id', rightOn: 'user_id', how: 'inner', batchSize: batchSize, confidenceThreshold: 0.8 }); // Verify that batching was effective (0, globals_1.expect)(result.performance.totalOperations).toBeGreaterThan(0); (0, globals_1.expect)(result.statistics.matchedRows).toBeGreaterThan(0); // Should handle batching without errors (0, globals_1.expect)(result.data).toBeDefined(); }); }); (0, globals_1.describe)('Stress Tests', () => { (0, globals_1.it)('should handle edge cases without performance degradation', async () => { const testCases = [ // High cardinality { leftSize: 10000, rightSize: 10000, overlap: 0.9, description: 'High overlap' }, // Low cardinality { leftSize: 10000, rightSize: 10000, overlap: 0.1, description: 'Low overlap' }, // Skewed sizes { leftSize: 50000, rightSize: 5000, overlap: 0.5, description: 'Skewed left' }, { leftSize: 5000, rightSize: 50000, overlap: 0.5, description: 'Skewed right' } ]; for (const testCase of testCases) { console.log(`Testing ${testCase.description}...`); const leftData = generateTestData('left', testCase.leftSize, Math.floor(testCase.leftSize * testCase.overlap)); const rightData = generateTestData('right', testCase.rightSize, Math.floor(testCase.rightSize * testCase.overlap)); const startTime = performance.now(); const result = await joinOperator.semanticJoin(leftData, rightData, { leftOn: 'id', rightOn: 'user_id', how: 'inner', batchSize: 10000 }); const endTime = performance.now(); const executionTime = endTime - startTime; console.log(` ${testCase.description} - Time: ${executionTime.toFixed(2)}ms, Matches: ${result.statistics.matchedRows}`); // Should complete within reasonable time for all cases (0, globals_1.expect)(executionTime).toBeLessThan(10000); (0, globals_1.expect)(result.statistics.matchedRows).toBeGreaterThanOrEqual(0); } }); }); // Helper functions for generating test data function generateTestData(prefix, size, overlapSize) { const data = { id: Array.from({ length: size }, (_, i) => `${prefix.toUpperCase()}-${(i + 1).toString().padStart(6, '0')}`), name: Array.from({ length: size }, (_, i) => `${prefix} User ${i + 1}`), email: Array.from({ length: size }, (_, i) => `${prefix}user${i + 1}@example.com`) }; if (prefix === 'right') { // Create overlap by using some IDs from left pattern for (let i = 0; i < overlapSize; i++) { data.id[i] = `LEFT-${(i + 1).toString().padStart(6, '0')}`; } // Rename id field for right data return { user_id: data.id, name: data.name, email: data.email, score: Array.from({ length: size }, () => Math.floor(Math.random() * 100) + 1) }; } return data; } function generateEmailTestData(prefix, size, variationRate = 0) { const domains = ['gmail.com', 'yahoo.com', 'outlook.com', 'company.com', 'example.org']; const variations = ['', '.', '_', '-']; return { email: Array.from({ length: size }, (_, i) => { const baseName = `${prefix}user${Math.floor(i * (1 - variationRate))}`; const variation = Math.random() < variationRate ? variations[Math.floor(Math.random() * variations.length)] : ''; const domain = domains[Math.floor(Math.random() * domains.length)]; return `${baseName}${variation}@${domain}`; }), [prefix === 'left' ? 'id' : 'user_id']: Array.from({ length: size }, (_, i) => i + 1) }; } function generateNames(type, size) { const firstNames = ['John', 'Jane', 'Bob', 'Alice', 'Charlie', 'Diana', 'Frank', 'Grace']; const lastNames = ['Smith', 'Johnson', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor']; const namePool = type === 'first' ? firstNames : lastNames; return Array.from({ length: size }, (_, i) => { const baseName = namePool[i % namePool.length]; return i % 3 === 0 ? baseName : `${baseName}${Math.floor(i / namePool.length)}`; }); } function generateYears(size) { return Array.from({ length: size }, () => 1950 + Math.floor(Math.random() * 70) // Years 1950-2020 ); } }); //# sourceMappingURL=semantic-join-performance.test.js.map