semantic-ds-toolkit
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

complete-workflow.test.js: compiled Jest end-to-end suite
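// End-to-end suite for the full semantic workflow:
//   CSV load → schema inference → shadow semantics → Stable Column Anchors
//   → reconciliation across renames/drift → SQL generation (Snowflake,
//   BigQuery, Postgres, Redshift).
// The DataFrame loader, join, and SQL helpers at the bottom of this file are
// lightweight test mocks, not the library's production implementations.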
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); const src_1 = require("../../src"); const test_data_generator_1 = require("../fixtures/test-data-generator"); const fs_1 = require("fs"); const path_1 = require("path"); const os = __importStar(require("os")); describe('End-to-End: Complete Semantic Workflow', () => { let tempDir; let anchorSystem; let inferenceEngine; let driftDetector; beforeAll(() => { tempDir = (0, path_1.join)(os.tmpdir(), 'semantic-e2e-tests'); anchorSystem = new src_1.StableColumnAnchorSystem(); inferenceEngine = new src_1.InferenceEngine(); driftDetector = new src_1.DriftDetector(); }); beforeEach(() => { src_1.ShadowSemanticsAPI.resetShadowAPI(); }); describe('CSV → Inference → SCA → SQL Generation', () => { it('should handle full workflow from messy CSV to SQL', async () => { const messyDataset = test_data_generator_1.TestDataGenerator.generateMessyDataset(); const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(messyDataset); const csvPath = (0, path_1.join)(tempDir, 'messy-sales.csv'); (0, fs_1.writeFileSync)(csvPath, csvContent); const dataFrame = await loadCSVAsDataFrame(csvPath); const inferenceResults = await inferenceEngine.inferSchema(dataFrame); expect(inferenceResults).toBeDefined(); expect(inferenceResults.columns).toHaveLength(4); const customerIdInference = inferenceResults.columns.find(c => c.name === 'customer_id'); expect(customerIdInference?.semanticType).toContain('identity'); const emailInference = inferenceResults.columns.find(c => c.name === 'email'); expect(emailInference?.semanticType).toContain('contact.email'); await (0, src_1.attachSemanticsShadow)(dataFrame, { customer_id: { cid: 'identity.customer', confidence: 0.91 }, email: { cid: 'contact.email', confidence: 0.87 } }); const anchors = await anchorSystem.createAnchors(dataFrame); expect(anchors).toHaveLength(4); const customerAnchor = anchors.find(a => a.columnName === 'customer_id'); expect(customerAnchor?.fingerprint.statistics.uniqueness).toBeGreaterThan(0.9); const renamedDataFrame = renameColumns(dataFrame, { customer_id: 'cust_identifier', purchase_amount: 'amount' }); const reconciliation = await (0, src_1.reconcileAnchors)(dataFrame, renamedDataFrame); expect(reconciliation.matchedPercentage).toBeGreaterThan(0.95); 
            expect(reconciliation.matches.length).toBeGreaterThanOrEqual(2);
            const sql = generateSnowflakeSQL(renamedDataFrame, reconciliation);
            expect(sql).toContain('CREATE OR REPLACE VIEW');
            expect(sql).toContain('cust_identifier');
            expect(sql).toContain('-- Semantic mapping: identity.customer');
        });
        it('should maintain semantic context through schema changes', async () => {
            const originalDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(originalDataset);
            const csvPath = (0, path_1.join)(tempDir, 'original-sales.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const originalDF = await loadCSVAsDataFrame(csvPath);
            await (0, src_1.attachSemanticsShadow)(originalDF, {
                customer_id: { cid: 'identity.customer', confidence: 0.95 },
                email: { cid: 'contact.email', confidence: 0.92 },
                phone: { cid: 'contact.phone', confidence: 0.88 }
            });
            // Registering anchors on anchorSystem is the side effect that matters here;
            // the returned anchors are not inspected again in this test.
            const originalAnchors = await anchorSystem.createAnchors(originalDF);
            const schemaChanges = [
                { from: 'customer_id', to: 'id' },
                { from: 'email', to: 'email_address' },
                { from: 'purchase_amount', to: 'total_amount' },
                { from: 'timestamp', to: 'created_at' }
            ];
            let modifiedDF = originalDF;
            for (const change of schemaChanges) {
                modifiedDF = renameColumns(modifiedDF, { [change.from]: change.to });
            }
            const reconciliation = await (0, src_1.reconcileAnchors)(originalDF, modifiedDF);
            expect(reconciliation.matchedPercentage).toBeGreaterThan(0.90);
            expect(reconciliation.matches.length).toEqual(schemaChanges.length);
            for (const change of schemaChanges) {
                const match = reconciliation.matches.find(m => m.originalColumn === change.from && m.newColumn === change.to);
                expect(match).toBeDefined();
                expect(match?.confidence).toBeGreaterThan(0.85);
            }
        });
        it('should detect and handle semantic drift scenarios', async () => {
            const baselineDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(5000);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(baselineDataset);
            const baselinePath = (0, path_1.join)(tempDir, 'baseline.csv');
            (0, fs_1.writeFileSync)(baselinePath, csvContent);
            const baselineDF = await loadCSVAsDataFrame(baselinePath);
            await (0, src_1.attachSemanticsShadow)(baselineDF, {
                customer_id: { cid: 'identity.customer', confidence: 0.95 },
                email: { cid: 'contact.email', confidence: 0.92 }
            });
            const baselineAnchors = await anchorSystem.createAnchors(baselineDF);
            const driftDataset = createDriftedDataset(baselineDataset);
            const driftCsvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(driftDataset);
            const driftPath = (0, path_1.join)(tempDir, 'drifted.csv');
            (0, fs_1.writeFileSync)(driftPath, driftCsvContent);
            const driftedDF = await loadCSVAsDataFrame(driftPath);
            const driftResults = await driftDetector.detectDrift(baselineDF, driftedDF, {
                window: '7d',
                alertThreshold: 0.1
            });
            expect(driftResults.alerts).toBeDefined();
            expect(driftResults.alerts.length).toBeGreaterThan(0);
            const emailDrift = driftResults.alerts.find(alert => alert.column === 'email' && alert.type === 'pattern_drift');
            expect(emailDrift).toBeDefined();
            expect(emailDrift?.severity).toBeGreaterThanOrEqual(0.3);
        });
    });
    describe('Semantic Join with Normalization', () => {
        it('should perform semantic joins with email normalization', async () => {
            const customersDataset = {
                name: 'customers',
                description: 'Customer master data',
                rows: 500,
                columns: [
                    { name: 'id', type: 'string', semanticType: 'identity.customer' },
                    { name: 'email', type: 'email', semanticType: 'contact.email' },
                    { name: 'full_name', type: 'string', semanticType: 'identity.person' }
                ],
                data: Array.from({ length: 500 }, (_, i) => ({
                    id: `cust_${i}`,
                    email: `USER${i}@EXAMPLE.COM`, // Uppercase for normalization test
                    full_name: `Customer ${i}`
                }))
            };
            const transactionsDataset = {
                name: 'transactions',
                description: 'Transaction data',
                rows: 1000,
                columns: [
                    { name: 'tx_id', type: 'string' },
                    { name: 'customer_email', type: 'email', semanticType: 'contact.email' },
                    { name: 'amount', type: 'number' }
                ],
                data: Array.from({ length: 1000 }, (_, i) => ({
                    tx_id: `tx_${i}`,
                    customer_email: `user${i % 500}@example.com`, // Lowercase
                    amount: Math.random() * 1000
                }))
            };
            const customersCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(customersDataset);
            const transactionsCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(transactionsDataset);
            const customersPath = (0, path_1.join)(tempDir, 'customers.csv');
            const transactionsPath = (0, path_1.join)(tempDir, 'transactions.csv');
            (0, fs_1.writeFileSync)(customersPath, customersCSV);
            (0, fs_1.writeFileSync)(transactionsPath, transactionsCSV);
            const customersDF = await loadCSVAsDataFrame(customersPath);
            const transactionsDF = await loadCSVAsDataFrame(transactionsPath);
            await (0, src_1.attachSemanticsShadow)(customersDF, {
                email: { cid: 'contact.email', confidence: 0.95 }
            });
            await (0, src_1.attachSemanticsShadow)(transactionsDF, {
                customer_email: { cid: 'contact.email', confidence: 0.93 }
            });
            const joinResult = await performSemanticJoin(customersDF, transactionsDF, {
                on: 'contact.email',
                normalizer: 'email',
                joinType: 'inner'
            });
            expect(joinResult.rows).toBeGreaterThan(900); // Should match most records
            expect(joinResult.matchRate).toBeGreaterThan(0.9);
            const joinedColumns = joinResult.columns;
            expect(joinedColumns.some(c => c.includes('full_name'))).toBe(true);
            expect(joinedColumns.some(c => c.includes('amount'))).toBe(true);
        });
    });
    describe('SQL Generation for Multiple Targets', () => {
        it('should generate compatible SQL for different platforms', async () => {
            const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(100);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
            const csvPath = (0, path_1.join)(tempDir, 'multi-platform.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const dataFrame = await loadCSVAsDataFrame(csvPath);
            await (0, src_1.attachSemanticsShadow)(dataFrame, {
                customer_id: { cid: 'identity.customer', confidence: 0.95 },
                timestamp: { cid: 'event.timestamp', confidence: 0.92 }
            });
            const platforms = ['snowflake', 'bigquery', 'postgres', 'redshift'];
            for (const platform of platforms) {
                const sql = generateSQLForPlatform(dataFrame, platform);
                expect(sql).toBeDefined();
                expect(sql.length).toBeGreaterThan(100);
                switch (platform) {
                    case 'snowflake':
                        expect(sql).toContain('CREATE OR REPLACE VIEW');
                        expect(sql).toContain('VARIANT');
                        break;
                    case 'bigquery':
                        expect(sql).toContain('CREATE OR REPLACE VIEW');
                        expect(sql).toContain('STRUCT');
                        break;
                    case 'postgres':
                        expect(sql).toContain('CREATE OR REPLACE VIEW');
                        expect(sql).toContain('JSONB');
                        break;
                    case 'redshift':
                        expect(sql).toContain('CREATE OR REPLACE VIEW');
                        expect(sql).toContain('SUPER');
                        break;
                }
            }
        });
    });
});
// Mock implementations for testing
async function loadCSVAsDataFrame(path) {
    const content = (0, fs_1.readFileSync)(path, 'utf-8');
    // Naive CSV parsing (no quoting or escaping); adequate for the generated fixtures.
    const lines = content.split('\n').filter(line => line.trim());
    // Trimming guards against a trailing '\r' when a fixture uses CRLF line endings.
    const headers = lines[0].split(',').map(h => h.trim());
    const rows = lines.slice(1).map(line => {
        const values = line.split(',');
        const row = {};
        headers.forEach((header, index) => {
            row[header] = values[index] || null;
        });
        return row;
    });
    return {
        columns: headers,
        rows,
        data: rows,
        getColumn: (name) => rows.map(row => row[name]),
        getSemantics: (columnName) => undefined
    };
}
function renameColumns(dataFrame, mapping) {
    const newColumns = dataFrame.columns.map((col) => mapping[col] || col);
    const newRows = dataFrame.rows.map((row) => {
        const newRow = {};
        Object.keys(row).forEach(key => {
            const newKey = mapping[key] || key;
            newRow[newKey] = row[key];
        });
        return newRow;
    });
    return { ...dataFrame, columns: newColumns, rows: newRows, data: newRows };
}
function generateSnowflakeSQL(dataFrame, reconciliation) {
    // Mock SQL generator: emits an annotated column list for the tests'
    // substring assertions; the output is illustrative, not valid warehouse DDL.
    const columns = dataFrame.columns.map((col) => {
        const semantic = getSemanticForColumn(col, reconciliation);
        return `  ${col} ${getSnowflakeType(col)}${semantic ? ` -- Semantic mapping: ${semantic}` : ''}`;
    }).join(',\n');
    return `CREATE OR REPLACE VIEW semantic_view AS
SELECT
${columns}
FROM source_table;`;
}
function generateSQLForPlatform(dataFrame, platform) {
    const platformConfig = {
        snowflake: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'VARIANT' },
        bigquery: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'STRUCT' },
        postgres: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'JSONB' },
        redshift: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'SUPER' }
    };
    const config = platformConfig[platform];
    return `${config.viewType} semantic_view AS
SELECT *
FROM source_table
-- Platform: ${platform}
-- JSON Type: ${config.jsonType}`;
}
function createDriftedDataset(originalDataset) {
    // Corrupt roughly 30% of email values so the detector sees a pattern drift.
    const driftedData = originalDataset.data.map(row => ({
        ...row,
        email: Math.random() > 0.3 ? row.email : 'invalid-email@domain.invalid'
    }));
    return { ...originalDataset, name: 'drifted_dataset', data: driftedData };
}
async function performSemanticJoin(df1, df2, options) {
    // Stub: reports a fixed 95% match rate instead of performing a real join.
    return {
        rows: Math.min(df1.rows.length, df2.rows.length) * 0.95,
        matchRate: 0.95,
        columns: [...df1.columns, ...df2.columns.filter((c) => !df1.columns.includes(c))]
    };
}
function getSemanticForColumn(columnName, reconciliation) {
    const match = reconciliation?.matches?.find((m) => m.newColumn === columnName);
    return match?.semanticType || null;
}
function getSnowflakeType(columnName) {
    if (columnName.includes('amount')) return 'NUMBER(10,2)';
    if (columnName.includes('timestamp') || columnName.includes('date')) return 'TIMESTAMP';
    if (columnName.includes('email')) return 'VARCHAR(255)';
    return 'VARCHAR(100)';
}
//# sourceMappingURL=complete-workflow.test.js.map
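// A minimal standalone sketch (never called by the suite) of the anchor workflow
// the tests above exercise. It reuses the entities this file imports from
// '../../src' and the mock loadCSVAsDataFrame defined above; treat it as
// illustrative only, since the mock loader stands in for a real DataFrame source.
async function exampleAnchorWorkflow(beforeCsvPath, afterCsvPath) {
    const system = new src_1.StableColumnAnchorSystem();
    const before = await loadCSVAsDataFrame(beforeCsvPath);
    const after = await loadCSVAsDataFrame(afterCsvPath);
    // Attach shadow semantics so anchors carry column meaning across renames.
    await (0, src_1.attachSemanticsShadow)(before, {
        customer_id: { cid: 'identity.customer', confidence: 0.95 }
    });
    await system.createAnchors(before);
    // Reconciliation maps original column names to their renamed counterparts.
    const reconciliation = await (0, src_1.reconcileAnchors)(before, after);
    return reconciliation.matches;
}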