UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

282 lines 15.4 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted interop helpers. They back the `__importStar(require("os"))`
// call below and are kept functionally identical to the compiler output.
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function (o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function (o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
const anchors_1 = require("../../src/core/anchors");
const inference_engine_1 = require("../../src/inference/inference-engine");
const test_data_generator_1 = require("../fixtures/test-data-generator");
const dataset_loader_1 = require("../fixtures/dataset-loader");
const fs_1 = require("fs");
const path_1 = require("path");
const os = __importStar(require("os"));
/**
 * End-to-end smoke tests covering the basic workflow: generating fixture
 * datasets, round-tripping them through CSV files and the DatasetLoader,
 * running column inference, creating/reconciling column anchors, and a few
 * coarse performance checks.
 *
 * All CSV fixtures are written into a scratch directory that is created in
 * `beforeAll` and removed in `afterAll`.
 */
describe('End-to-End: Basic Workflow Validation', () => {
    let tempDir;
    let anchorSystem;
    let inferenceEngine;
    beforeAll(() => {
        // Create a real, unique scratch directory. Merely computing a path under
        // os.tmpdir() is not enough: writeFileSync() throws ENOENT when the parent
        // directory does not exist, and a fixed name can collide across parallel runs.
        tempDir = (0, fs_1.mkdtempSync)((0, path_1.join)(os.tmpdir(), 'semantic-basic-e2e-'));
        anchorSystem = new anchors_1.StableColumnAnchorSystem();
        inferenceEngine = new inference_engine_1.InferenceEngine();
    });
    afterAll(() => {
        // Remove the scratch directory and anything a failed test left behind.
        // `tempDir` stays undefined if beforeAll itself failed.
        if (tempDir) {
            (0, fs_1.rmSync)(tempDir, { recursive: true, force: true });
        }
    });
    describe('Dataset Loading and Processing', () => {
        it('should successfully load and parse test datasets', async () => {
            const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(100);
            expect(dataset).toBeDefined();
            expect(dataset.rows).toBe(100);
            expect(dataset.columns).toHaveLength(8);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
            expect(csvContent).toContain('customer_id,email,phone');
            expect(csvContent.split('\n')).toHaveLength(102); // Header + 100 rows + trailing newline
        });
        it('should handle CSV parsing with the DatasetLoader', async () => {
            const testData = test_data_generator_1.TestDataGenerator.generateLargeDataset(50);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(testData);
            const csvPath = (0, path_1.join)(tempDir, 'test-dataset.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            try {
                const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
                expect(loadedDataset.name).toBe('test-dataset');
                expect(loadedDataset.rows).toHaveLength(50);
                expect(loadedDataset.columns).toHaveLength(8);
                expect(loadedDataset.metadata.rowCount).toBe(50);
                expect(loadedDataset.metadata.columnCount).toBe(8);
                // Verify data types were inferred
                expect(loadedDataset.metadata.dataTypes['customer_id']).toBe('string');
                expect(loadedDataset.metadata.dataTypes['email']).toBe('email');
                expect(loadedDataset.metadata.dataTypes['purchase_amount']).toBe('float');
                // Test data quality validation
                const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
                expect(qualityResult.score).toBeGreaterThan(0.8);
                expect(qualityResult.issues).toBeDefined();
                expect(qualityResult.recommendations).toBeDefined();
            }
            finally {
                // Always delete the fixture, even when an expectation above fails.
                (0, fs_1.unlinkSync)(csvPath);
            }
        });
        it('should handle edge case datasets', async () => {
            const unicodeData = test_data_generator_1.TestDataGenerator.generateUnicodeDataset();
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(unicodeData);
            const csvPath = (0, path_1.join)(tempDir, 'unicode-test.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            try {
                const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
                expect(loadedDataset.rows.length).toBe(unicodeData.rows);
                expect(loadedDataset.columns).toContain('name');
                expect(loadedDataset.columns).toContain('email');
                // Verify Unicode names are preserved
                const names = loadedDataset.rows.map(row => row.name);
                expect(names.some(name => /[\u0600-\u06FF]/.test(name))).toBe(true); // Arabic
                expect(names.some(name => /[\u4e00-\u9fff]/.test(name))).toBe(true); // Chinese
            }
            finally {
                (0, fs_1.unlinkSync)(csvPath);
            }
        });
        it('should handle messy data gracefully', async () => {
            const messyData = test_data_generator_1.TestDataGenerator.generateMessyDataset();
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(messyData);
            const csvPath = (0, path_1.join)(tempDir, 'messy-test.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            try {
                const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
                expect(loadedDataset.rows.length).toBe(messyData.rows);
                // Verify null handling
                expect(loadedDataset.metadata.nullCount).toBeGreaterThan(0);
                expect(loadedDataset.metadata.nullPercentage).toBeGreaterThan(0);
                // Quality should be lower for messy data
                const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
                expect(qualityResult.score).toBeLessThan(0.7);
                expect(qualityResult.issues.length).toBeGreaterThan(0);
            }
            finally {
                (0, fs_1.unlinkSync)(csvPath);
            }
        });
    });
    describe('Inference Engine Integration', () => {
        it('should perform column type inference', async () => {
            const testData = test_data_generator_1.TestDataGenerator.generateLargeDataset(200);
            // Test email column inference
            const emailValues = testData.data.map(row => row.email);
            const emailInference = await inferenceEngine.inferColumnType('email', emailValues);
            expect(emailInference.columnName).toBe('email');
            expect(emailInference.confidence).toBeGreaterThan(0.7);
            expect(emailInference.semanticType).toContain('email');
            // Test customer ID inference
            const customerIdValues = testData.data.map(row => row.customer_id);
            const idInference = await inferenceEngine.inferColumnType('customer_id', customerIdValues);
            expect(idInference.columnName).toBe('customer_id');
            expect(idInference.confidence).toBeGreaterThan(0.7);
            expect(idInference.semanticType).toContain('identifier');
        });
        it('should handle performance requirements for inference', async () => {
            const ROWS = 10_000;
            const testData = test_data_generator_1.TestDataGenerator.generateLargeDataset(ROWS);
            const startTime = Date.now();
            // Test inference on all columns
            const inferencePromises = testData.columns.map(async (column) => {
                const values = testData.data.map(row => row[column.name]);
                return inferenceEngine.inferColumnType(column.name, values, {
                    performanceMode: 'fast',
                    sampleSize: 1000
                });
            });
            const results = await Promise.all(inferencePromises);
            const endTime = Date.now();
            const duration = endTime - startTime;
            // A 0 ms duration yields Infinity here, which still satisfies the
            // lower-bound throughput expectation below.
            const throughput = ROWS / (duration / 1000);
            expect(results).toHaveLength(testData.columns.length);
            expect(duration).toBeLessThan(5000); // Should complete in <5 seconds
            expect(throughput).toBeGreaterThan(2000); // >2k rows/second minimum
            // Verify all columns got inferences
            results.forEach(result => {
                expect(result.columnName).toBeDefined();
                expect(result.semanticType).toBeDefined();
                expect(result.confidence).toBeGreaterThan(0);
            });
        });
    });
    describe('Anchor System Integration', () => {
        it('should create stable column anchors', async () => {
            const testData = test_data_generator_1.TestDataGenerator.generateLargeDataset(100);
            // Convert to the format expected by anchor system
            const columnData = testData.columns.map(col => ({
                column_name: col.name,
                values: testData.data.map(row => row[col.name]),
                data_type: col.type === 'string' ? 'object' :
                    col.type === 'number' ? 'float64' :
                        col.type === 'date' ? 'datetime64' : 'object'
            }));
            for (const column of columnData) {
                const anchor = anchorSystem.createAnchor({ name: 'test_dataset', columns: columnData }, column);
                expect(anchor).toBeDefined();
                expect(anchor.anchorId).toBeDefined();
                expect(anchor.columnName).toBe(column.column_name);
                expect(anchor.fingerprint).toBeDefined();
                expect(anchor.fingerprint.statistics).toBeDefined();
            }
        });
        it('should handle anchor reconciliation', async () => {
            const originalData = test_data_generator_1.TestDataGenerator.generateLargeDataset(50);
            // Same dataset with two columns renamed; the row values are untouched.
            const modifiedData = {
                ...originalData,
                columns: originalData.columns.map(col => ({
                    ...col,
                    name: col.name === 'customer_id' ? 'cust_id' :
                        col.name === 'email' ? 'email_address' : col.name
                }))
            };
            // Convert to column data format
            const originalColumns = originalData.columns.map(col => ({
                column_name: col.name,
                values: originalData.data.map(row => row[col.name]),
                data_type: 'object'
            }));
            const modifiedColumns = modifiedData.columns.map(col => ({
                column_name: col.name,
                values: originalData.data.map(row => {
                    // Rows are still keyed by the original names, so map the
                    // renamed column back before reading its value.
                    const originalColName = col.name === 'cust_id' ? 'customer_id' :
                        col.name === 'email_address' ? 'email' : col.name;
                    return row[originalColName];
                }),
                data_type: 'object'
            }));
            // Create anchors for original data
            const originalAnchors = originalColumns.map(col => anchorSystem.createAnchor({ name: 'original', columns: originalColumns }, col));
            // Test reconciliation
            const reconciliationResult = anchorSystem.reconcileAnchors(modifiedColumns, originalAnchors, { similarity_threshold: 0.8 });
            expect(reconciliationResult).toBeDefined();
            expect(reconciliationResult.total_columns).toBe(modifiedColumns.length);
            expect(reconciliationResult.matched_columns).toBeGreaterThan(0);
            // Should match renamed columns
            const customerMatch = reconciliationResult.matches.find(match => match.new_column === 'cust_id');
            expect(customerMatch).toBeDefined();
            expect(customerMatch?.confidence_score).toBeGreaterThan(0.8);
        });
    });
    describe('Performance Validation', () => {
        it('should maintain reasonable memory usage', async () => {
            const initialMemory = process.memoryUsage();
            // Process multiple datasets
            for (let i = 0; i < 10; i++) {
                const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
                const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
                const csvPath = (0, path_1.join)(tempDir, `perf-test-${i}.csv`);
                (0, fs_1.writeFileSync)(csvPath, csvContent);
                try {
                    const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
                    await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
                }
                finally {
                    (0, fs_1.unlinkSync)(csvPath);
                }
                // Force garbage collection if available
                // (global.gc only exists when Node is started with --expose-gc)
                if (global.gc) {
                    global.gc();
                }
            }
            const finalMemory = process.memoryUsage();
            // Heap growth in MiB between the start and end of the loop.
            const memoryGrowth = (finalMemory.heapUsed - initialMemory.heapUsed) / 1024 / 1024;
            expect(memoryGrowth).toBeLessThan(50); // <50MB growth for processing 10k rows
        });
        it('should handle concurrent operations', async () => {
            // Array.from with an async mapper yields an array of promises, one per task.
            const concurrentTasks = Array.from({ length: 5 }, async (_, i) => {
                const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(500);
                const csvPath = (0, path_1.join)(tempDir, `concurrent-${i}.csv`);
                const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
                (0, fs_1.writeFileSync)(csvPath, csvContent);
                try {
                    const startTime = Date.now();
                    const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
                    const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
                    const duration = Date.now() - startTime;
                    return {
                        duration,
                        qualityScore: qualityResult.score,
                        taskId: i
                    };
                }
                finally {
                    (0, fs_1.unlinkSync)(csvPath);
                }
            });
            const results = await Promise.all(concurrentTasks);
            expect(results).toHaveLength(5);
            results.forEach(result => {
                expect(result.duration).toBeLessThan(2000); // <2 seconds per task
                expect(result.qualityScore).toBeGreaterThan(0.8);
            });
            const avgDuration = results.reduce((sum, r) => sum + r.duration, 0) / results.length;
            expect(avgDuration).toBeLessThan(1000); // Average <1 second
        });
    });
});
//# sourceMappingURL=basic-e2e.test.js.map