UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

github.com/kneelinghorse/semantic-ds-toolkit

427 lines • 21.4 kB

JavaScript

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); const src_1 = require("../../src"); const test_data_generator_1 = require("../fixtures/test-data-generator"); const fs_1 = require("fs"); const path_1 = require("path"); const perf_hooks_1 = require("perf_hooks"); const os = __importStar(require("os")); describe('End-to-End: Performance Validation', () => { let tempDir; let anchorSystem; let inferenceEngine; let driftDetector; let fuzzyMatcher; beforeAll(() => { tempDir = (0, path_1.join)(os.tmpdir(), 'semantic-performance-tests'); anchorSystem = new src_1.StableColumnAnchorSystem(); inferenceEngine = new src_1.InferenceEngine(); driftDetector = new src_1.DriftDetector(); fuzzyMatcher = new src_1.FuzzyMatcher(); // Force garbage collection if available if (global.gc) { global.gc(); } }); beforeEach(() => { // Clean up memory before each test if (global.gc) { global.gc(); } }); describe('1M+ Row Processing Targets', () => { it('should achieve 1M+ rows/second throughput for batch processing', async () => { const ROWS = 1_000_000; const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(ROWS); const startTime = perf_hooks_1.performance.now(); const initialMemory = process.memoryUsage(); // Process in batches to simulate real-world usage const BATCH_SIZE = 100_000; const batches = Math.ceil(ROWS / BATCH_SIZE); const results = []; for (let i = 0; i < batches; i++) { const batchStart = i * BATCH_SIZE; const batchEnd = Math.min(batchStart + BATCH_SIZE, ROWS); const batchData = dataset.data.slice(batchStart, batchEnd); const batchDataFrame = { columns: dataset.columns.map(c => c.name), data: batchData, rows: batchData }; const batchResult = await processBatch(batchDataFrame); results.push(batchResult); } const endTime = perf_hooks_1.performance.now(); const finalMemory = process.memoryUsage(); const duration = (endTime - startTime) / 1000; // Convert to seconds const throughput = ROWS / duration; const memoryUsageMB = (finalMemory.heapUsed - initialMemory.heapUsed) / 1024 / 1024; const metrics = { throughputRowsPerSecond: throughput, memoryUsageMB, duration }; console.log(`Batch Processing Performance:`, metrics); expect(throughput).toBeGreaterThan(1_000_000); // 1M+ rows/second target expect(memoryUsageMB).toBeLessThan(500); // Memory usage should be reasonable expect(results.length).toEqual(batches); }); it('should maintain <100ms inference latency for 1M rows', async () => { const ROWS = 1_000_000; const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(ROWS); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset); const csvPath = (0, path_1.join)(tempDir, 'inference_1m.csv'); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); const startTime = perf_hooks_1.performance.now(); const inferenceResults = await inferenceEngine.inferSchema(dataFrame, { optimizeForSpeed: true, maxSampleSize: 10000, // Use sampling for large datasets confidenceThreshold: 0.7 }); const endTime = perf_hooks_1.performance.now(); const duration = endTime - startTime; console.log(`Inference Duration for 1M rows: ${duration.toFixed(2)}ms`); expect(duration).toBeLessThan(100); // <100ms target expect(inferenceResults.columns.length).toEqual(8); // Verify inference quality despite speed optimization const highConfidenceColumns = inferenceResults.columns.filter(c => c.confidence > 0.7); expect(highConfidenceColumns.length).toBeGreaterThanOrEqual(4); }); it('should achieve >90% cache hit rate for repeated operations', async () => { const ROWS = 500_000; const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(ROWS); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset); const csvPath = (0, path_1.join)(tempDir, 'cache_test.csv'); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); // First run to populate cache const firstRun = await measureCachedOperation(dataFrame); // Second run should hit cache const secondRun = await measureCachedOperation(dataFrame); // Third run with slight modification const modifiedDF = { ...dataFrame, modified: true }; const thirdRun = await measureCachedOperation(modifiedDF); const totalOperations = 3; const cacheHits = calculateCacheHits([firstRun, secondRun, thirdRun]); const cacheHitRate = cacheHits / totalOperations; console.log(`Cache Hit Rate: ${(cacheHitRate * 100).toFixed(1)}%`); console.log(`Performance improvement: ${(firstRun.duration / secondRun.duration).toFixed(2)}x`); expect(cacheHitRate).toBeGreaterThan(0.90); // >90% cache hit rate expect(secondRun.duration).toBeLessThan(firstRun.duration * 0.2); // 5x speedup from cache }); }); describe('Memory Efficiency Validation', () => { it('should process 5M rows without memory leaks', async () => { const ROWS = 5_000_000; const BATCH_SIZE = 250_000; const batches = Math.ceil(ROWS / BATCH_SIZE); const initialMemory = process.memoryUsage(); const memorySnapshots = []; for (let i = 0; i < batches; i++) { const batchDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(BATCH_SIZE); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(batchDataset); const csvPath = (0, path_1.join)(tempDir, `memory_test_batch_${i}.csv`); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); await processBatch(dataFrame); // Force garbage collection if (global.gc) { global.gc(); } const currentMemory = process.memoryUsage(); memorySnapshots.push(currentMemory.heapUsed); // Clean up the file require('fs').unlinkSync(csvPath); } const finalMemory = process.memoryUsage(); const memoryGrowth = (finalMemory.heapUsed - initialMemory.heapUsed) / 1024 / 1024; // Check for consistent memory usage (no significant growth trend) const memoryTrend = calculateMemoryTrend(memorySnapshots); console.log(`Memory growth after processing 5M rows: ${memoryGrowth.toFixed(2)}MB`); console.log(`Memory trend: ${memoryTrend > 0 ? 'increasing' : 'stable'}`); expect(memoryGrowth).toBeLessThan(100); // <100MB growth allowed expect(Math.abs(memoryTrend)).toBeLessThan(0.1); // Stable memory usage }); it('should handle concurrent processing without memory explosion', async () => { const CONCURRENT_TASKS = 5; const ROWS_PER_TASK = 200_000; const initialMemory = process.memoryUsage(); const concurrentPromises = Array.from({ length: CONCURRENT_TASKS }, async (_, i) => { const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(ROWS_PER_TASK); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset); const csvPath = (0, path_1.join)(tempDir, `concurrent_${i}.csv`); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); const startTime = perf_hooks_1.performance.now(); const result = await processBatch(dataFrame); const duration = perf_hooks_1.performance.now() - startTime; require('fs').unlinkSync(csvPath); return { result, duration, taskId: i }; }); const results = await Promise.all(concurrentPromises); const finalMemory = process.memoryUsage(); const totalRows = CONCURRENT_TASKS * ROWS_PER_TASK; const avgDuration = results.reduce((sum, r) => sum + r.duration, 0) / results.length; const memoryUsage = (finalMemory.heapUsed - initialMemory.heapUsed) / 1024 / 1024; console.log(`Concurrent processing: ${totalRows} rows, avg duration: ${avgDuration.toFixed(2)}ms`); console.log(`Memory usage: ${memoryUsage.toFixed(2)}MB`); expect(results.length).toEqual(CONCURRENT_TASKS); expect(memoryUsage).toBeLessThan(300); // Memory should not explode expect(avgDuration).toBeLessThan(5000); // Reasonable processing time }); }); describe('Real-time Processing Performance', () => { it('should maintain low latency for streaming inference', async () => { const STREAM_SIZE = 100_000; const BATCH_SIZE = 1000; const latencies = []; const throughputs = []; for (let i = 0; i < STREAM_SIZE; i += BATCH_SIZE) { const batchDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(BATCH_SIZE); const dataFrame = { columns: batchDataset.columns.map(c => c.name), data: batchDataset.data, rows: batchDataset.data }; const startTime = perf_hooks_1.performance.now(); const inferenceResults = await inferenceEngine.inferSchema(dataFrame, { optimizeForSpeed: true, useCache: true }); const endTime = perf_hooks_1.performance.now(); const latency = endTime - startTime; const throughput = BATCH_SIZE / (latency / 1000); latencies.push(latency); throughputs.push(throughput); // Simulate real-time constraints expect(latency).toBeLessThan(50); // <50ms per batch } const avgLatency = latencies.reduce((sum, l) => sum + l, 0) / latencies.length; const avgThroughput = throughputs.reduce((sum, t) => sum + t, 0) / throughputs.length; const p95Latency = percentile(latencies, 95); console.log(`Streaming Performance - Avg Latency: ${avgLatency.toFixed(2)}ms, P95: ${p95Latency.toFixed(2)}ms`); console.log(`Average Throughput: ${avgThroughput.toFixed(0)} rows/sec`); expect(avgLatency).toBeLessThan(30); // Average <30ms expect(p95Latency).toBeLessThan(100); // P95 <100ms expect(avgThroughput).toBeGreaterThan(50_000); // >50k rows/sec }); it('should scale linearly with data size up to memory limits', async () => { const sizes = [100_000, 250_000, 500_000, 1_000_000]; const scalingResults = []; for (const size of sizes) { const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(size); const dataFrame = { columns: dataset.columns.map(c => c.name), data: dataset.data, rows: dataset.data }; const startTime = perf_hooks_1.performance.now(); await processBatch(dataFrame); const endTime = perf_hooks_1.performance.now(); const duration = endTime - startTime; const throughput = size / (duration / 1000); scalingResults.push({ size, duration, throughput }); console.log(`Size: ${size}, Duration: ${duration.toFixed(2)}ms, Throughput: ${throughput.toFixed(0)} rows/sec`); } // Check for linear scaling (throughput should remain relatively constant) const throughputs = scalingResults.map(r => r.throughput); const throughputVariation = (Math.max(...throughputs) - Math.min(...throughputs)) / Math.min(...throughputs); console.log(`Throughput variation: ${(throughputVariation * 100).toFixed(1)}%`); expect(throughputVariation).toBeLessThan(0.5); // <50% variation indicates good scaling }); }); describe('Edge Case Performance', () => { it('should handle wide datasets (many columns) efficiently', async () => { const ROWS = 50_000; const COLUMNS = 500; // Very wide dataset const wideDataset = generateWideDataset(ROWS, COLUMNS); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(wideDataset); const csvPath = (0, path_1.join)(tempDir, 'wide_dataset.csv'); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); const startTime = perf_hooks_1.performance.now(); const anchors = await anchorSystem.createAnchors(dataFrame); const endTime = perf_hooks_1.performance.now(); const duration = endTime - startTime; const columnsPerSecond = COLUMNS / (duration / 1000); console.log(`Wide dataset processing: ${COLUMNS} columns in ${duration.toFixed(2)}ms`); console.log(`Column processing rate: ${columnsPerSecond.toFixed(0)} columns/sec`); expect(anchors.length).toEqual(COLUMNS); expect(duration).toBeLessThan(10000); // <10 seconds for 500 columns expect(columnsPerSecond).toBeGreaterThan(50); // >50 columns/sec }); it('should handle sparse datasets with many nulls efficiently', async () => { const ROWS = 500_000; const NULL_PERCENTAGE = 0.8; // 80% nulls const sparseDataset = generateSparseDataset(ROWS, NULL_PERCENTAGE); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(sparseDataset); const csvPath = (0, path_1.join)(tempDir, 'sparse_dataset.csv'); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); const startTime = perf_hooks_1.performance.now(); const inferenceResults = await inferenceEngine.inferSchema(dataFrame); const endTime = perf_hooks_1.performance.now(); const duration = endTime - startTime; const throughput = ROWS / (duration / 1000); console.log(`Sparse dataset processing: ${ROWS} rows (${NULL_PERCENTAGE * 100}% nulls) in ${duration.toFixed(2)}ms`); expect(duration).toBeLessThan(2000); // Should handle nulls efficiently expect(throughput).toBeGreaterThan(250_000); // Reasonable throughput despite sparsity expect(inferenceResults.columns.length).toEqual(sparseDataset.columns.length); }); }); }); // Helper functions async function processBatch(dataFrame) { // Simulate typical batch processing workflow const inferenceResults = await inferenceEngine.inferSchema(dataFrame, { optimizeForSpeed: true }); const semanticMappings = {}; inferenceResults.columns.forEach(col => { if (col.confidence > 0.7) { semanticMappings[col.name] = { cid: col.semanticType, confidence: col.confidence }; } }); await (0, src_1.attachSemanticsShadow)(dataFrame, semanticMappings); const anchors = await anchorSystem.createAnchors(dataFrame); return { inferenceResults, semanticMappings, anchors, processedRows: dataFrame.data.length }; } async function measureCachedOperation(dataFrame) { const startTime = perf_hooks_1.performance.now(); // Simulate operation that can be cached const cacheKey = JSON.stringify(dataFrame.columns); const cached = global.__PERFORMANCE_CACHE__?.[cacheKey]; if (cached) { const endTime = perf_hooks_1.performance.now(); return { duration: endTime - startTime, fromCache: true }; } // Perform actual operation await processBatch(dataFrame); // Store in cache if (!global.__PERFORMANCE_CACHE__) { global.__PERFORMANCE_CACHE__ = {}; } global.__PERFORMANCE_CACHE__[cacheKey] = true; const endTime = perf_hooks_1.performance.now(); return { duration: endTime - startTime, fromCache: false }; } function calculateCacheHits(results) { return results.filter(r => r.fromCache).length; } function calculateMemoryTrend(snapshots) { if (snapshots.length < 2) return 0; const first = snapshots[0]; const last = snapshots[snapshots.length - 1]; return (last - first) / first; // Relative growth } function percentile(values, p) { const sorted = values.slice().sort((a, b) => a - b); const index = Math.ceil((p / 100) * sorted.length) - 1; return sorted[index]; } function generateWideDataset(rows, columns) { const columnDefs = Array.from({ length: columns }, (_, i) => ({ name: `col_${i}`, type: 'string' })); const data = Array.from({ length: rows }, (_, rowIndex) => { const row = {}; columnDefs.forEach((col, colIndex) => { row[col.name] = `value_${rowIndex}_${colIndex}`; }); return row; }); return { name: 'wide_dataset', description: `Dataset with ${columns} columns`, rows, columns: columnDefs, data }; } function generateSparseDataset(rows, nullPercentage) { const data = Array.from({ length: rows }, (_, i) => ({ id: i, value1: Math.random() < nullPercentage ? null : `value_${i}`, value2: Math.random() < nullPercentage ? null : Math.random() * 100, value3: Math.random() < nullPercentage ? null : new Date().toISOString(), value4: Math.random() < nullPercentage ? null : `category_${i % 10}` })); return { name: 'sparse_dataset', description: `Dataset with ${nullPercentage * 100}% null values`, rows, columns: [ { name: 'id', type: 'number' }, { name: 'value1', type: 'string' }, { name: 'value2', type: 'number' }, { name: 'value3', type: 'string' }, { name: 'value4', type: 'string' } ], data }; } async function loadCSVAsDataFrame(path) { const fs = require('fs'); const content = fs.readFileSync(path, 'utf-8'); const lines = content.split('\n').filter(line => line.trim()); const headers = lines[0].split(','); const rows = lines.slice(1).map(line => { const values = line.split(','); const row = {}; headers.forEach((header, index) => { const value = values[index]; row[header] = value === '' || value === 'null' ? null : value; }); return row; }); return { columns: headers, rows, data: rows }; } //# sourceMappingURL=performance-validation.test.js.map