// semantic-ds-toolkit — performance-first semantic layer for modern data
// stacks (Stable Column Anchors & intelligent inference).
// Compiled test file: complete-workflow.test.js (335 lines, 16.5 kB), JavaScript.
"use strict";
// TypeScript-emitted helper: re-exports property `k` of module `m` on `o`
// under the name `k2`. Prefers a getter-based property so the binding stays
// "live" (reflects later mutation of the source module); falls back to a
// plain copy in environments without Object.create.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Re-wrap as a getter unless the source already exposes a suitable accessor.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// TypeScript-emitted helper: installs `v` as the `default` export of the
// namespace object `o` (non-configurable when defineProperty is available).
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// TypeScript-emitted helper implementing `import * as ns` interop for
// CommonJS modules: real ES modules pass through unchanged; CJS modules get
// wrapped in a namespace object with every own key re-bound plus a `default`.
var __importStar = (this && this.__importStar) || (function () {
// Lazily resolved key-enumerator; replaces itself on first call so the
// feature check (Object.getOwnPropertyNames vs. for-in fallback) runs once.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `default` is handled separately by __setModuleDefault below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Mark this compiled module as an ES-module interop target.
Object.defineProperty(exports, "__esModule", { value: true });
// Systems under test, pulled from the project's root barrel export.
const src_1 = require("../../src");
// Deterministic fixture generation for the CSV-based scenarios below.
const test_data_generator_1 = require("../fixtures/test-data-generator");
const fs_1 = require("fs");
const path_1 = require("path");
const os = __importStar(require("os"));
// End-to-end suite: exercises the full semantic pipeline (CSV load → type
// inference → Stable Column Anchors → reconciliation → SQL generation),
// plus drift detection, semantic joins, and multi-platform SQL output.
// CSV fixtures are written into a shared temp directory; the mock helpers
// at the bottom of this file provide CSV loading, renaming, joining and
// SQL generation.
describe('End-to-End: Complete Semantic Workflow', () => {
let tempDir;
let anchorSystem;
let inferenceEngine;
let driftDetector;
beforeAll(() => {
tempDir = (0, path_1.join)(os.tmpdir(), 'semantic-e2e-tests');
// BUG FIX: the fixture directory must exist before the tests call
// writeFileSync(join(tempDir, ...)) — without this, every test fails
// with ENOENT on a clean machine.
(0, fs_1.mkdirSync)(tempDir, { recursive: true });
anchorSystem = new src_1.StableColumnAnchorSystem();
inferenceEngine = new src_1.InferenceEngine();
driftDetector = new src_1.DriftDetector();
});
// Reset shadow-semantics state so attachments from one test never leak
// into the next.
beforeEach(() => {
src_1.ShadowSemanticsAPI.resetShadowAPI();
});
describe('CSV → Inference → SCA → SQL Generation', () => {
it('should handle full workflow from messy CSV to SQL', async () => {
// Step 1: generate a messy CSV fixture and load it as a DataFrame.
const messyDataset = test_data_generator_1.TestDataGenerator.generateMessyDataset();
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(messyDataset);
const csvPath = (0, path_1.join)(tempDir, 'messy-sales.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
const dataFrame = await loadCSVAsDataFrame(csvPath);
// Step 2: semantic type inference over the raw columns.
const inferenceResults = await inferenceEngine.inferSchema(dataFrame);
expect(inferenceResults).toBeDefined();
expect(inferenceResults.columns).toHaveLength(4);
const customerIdInference = inferenceResults.columns.find(c => c.name === 'customer_id');
expect(customerIdInference?.semanticType).toContain('identity');
const emailInference = inferenceResults.columns.find(c => c.name === 'email');
expect(emailInference?.semanticType).toContain('contact.email');
// Step 3: attach shadow semantics without mutating the DataFrame.
await (0, src_1.attachSemanticsShadow)(dataFrame, {
customer_id: { cid: 'identity.customer', confidence: 0.91 },
email: { cid: 'contact.email', confidence: 0.87 }
});
// Step 4: create Stable Column Anchors (fingerprints) per column.
const anchors = await anchorSystem.createAnchors(dataFrame);
expect(anchors).toHaveLength(4);
const customerAnchor = anchors.find(a => a.columnName === 'customer_id');
expect(customerAnchor?.fingerprint.statistics.uniqueness).toBeGreaterThan(0.9);
// Step 5: rename columns and verify anchors survive the schema change.
const renamedDataFrame = renameColumns(dataFrame, {
customer_id: 'cust_identifier',
purchase_amount: 'amount'
});
const reconciliation = await (0, src_1.reconcileAnchors)(dataFrame, renamedDataFrame);
expect(reconciliation.matchedPercentage).toBeGreaterThan(0.95);
expect(reconciliation.matches.length).toBeGreaterThanOrEqual(2);
// Step 6: the generated SQL carries semantic annotations through.
const sql = generateSnowflakeSQL(renamedDataFrame, reconciliation);
expect(sql).toContain('CREATE OR REPLACE VIEW');
expect(sql).toContain('cust_identifier');
expect(sql).toContain('-- Semantic mapping: identity.customer');
});
it('should maintain semantic context through schema changes', async () => {
const originalDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(originalDataset);
const csvPath = (0, path_1.join)(tempDir, 'original-sales.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
const originalDF = await loadCSVAsDataFrame(csvPath);
await (0, src_1.attachSemanticsShadow)(originalDF, {
customer_id: { cid: 'identity.customer', confidence: 0.95 },
email: { cid: 'contact.email', confidence: 0.92 },
phone: { cid: 'contact.phone', confidence: 0.88 }
});
// Anchors are created for their side effects in the anchor system;
// reconciliation below relies on that state.
const originalAnchors = await anchorSystem.createAnchors(originalDF);
// Apply several renames in sequence and expect every one of them to be
// reconciled back to its original column with high confidence.
const schemaChanges = [
{ from: 'customer_id', to: 'id' },
{ from: 'email', to: 'email_address' },
{ from: 'purchase_amount', to: 'total_amount' },
{ from: 'timestamp', to: 'created_at' }
];
let modifiedDF = originalDF;
for (const change of schemaChanges) {
modifiedDF = renameColumns(modifiedDF, { [change.from]: change.to });
}
const reconciliation = await (0, src_1.reconcileAnchors)(originalDF, modifiedDF);
expect(reconciliation.matchedPercentage).toBeGreaterThan(0.90);
expect(reconciliation.matches.length).toEqual(schemaChanges.length);
for (const change of schemaChanges) {
const match = reconciliation.matches.find(m => m.originalColumn === change.from && m.newColumn === change.to);
expect(match).toBeDefined();
expect(match?.confidence).toBeGreaterThan(0.85);
}
});
it('should detect and handle semantic drift scenarios', async () => {
// Baseline: a clean 5k-row dataset with semantics attached.
const baselineDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(5000);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(baselineDataset);
const baselinePath = (0, path_1.join)(tempDir, 'baseline.csv');
(0, fs_1.writeFileSync)(baselinePath, csvContent);
const baselineDF = await loadCSVAsDataFrame(baselinePath);
await (0, src_1.attachSemanticsShadow)(baselineDF, {
customer_id: { cid: 'identity.customer', confidence: 0.95 },
email: { cid: 'contact.email', confidence: 0.92 }
});
const baselineAnchors = await anchorSystem.createAnchors(baselineDF);
// Drifted copy: ~30% of emails replaced with invalid placeholders
// (see createDriftedDataset below) — should trip pattern-drift alerts.
const driftDataset = createDriftedDataset(baselineDataset);
const driftCsvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(driftDataset);
const driftPath = (0, path_1.join)(tempDir, 'drifted.csv');
(0, fs_1.writeFileSync)(driftPath, driftCsvContent);
const driftedDF = await loadCSVAsDataFrame(driftPath);
const driftResults = await driftDetector.detectDrift(baselineDF, driftedDF, {
window: '7d',
alertThreshold: 0.1
});
expect(driftResults.alerts).toBeDefined();
expect(driftResults.alerts.length).toBeGreaterThan(0);
const emailDrift = driftResults.alerts.find(alert => alert.column === 'email' && alert.type === 'pattern_drift');
expect(emailDrift).toBeDefined();
expect(emailDrift?.severity).toBeGreaterThanOrEqual(0.3);
});
});
describe('Semantic Join with Normalization', () => {
it('should perform semantic joins with email normalization', async () => {
// Two datasets share emails that differ only in case — the join must
// normalize before matching.
const customersDataset = {
name: 'customers',
description: 'Customer master data',
rows: 500,
columns: [
{ name: 'id', type: 'string', semanticType: 'identity.customer' },
{ name: 'email', type: 'email', semanticType: 'contact.email' },
{ name: 'full_name', type: 'string', semanticType: 'identity.person' }
],
data: Array.from({ length: 500 }, (_, i) => ({
id: `cust_${i}`,
email: `USER${i}@EXAMPLE.COM`, // Uppercase for normalization test
full_name: `Customer ${i}`
}))
};
const transactionsDataset = {
name: 'transactions',
description: 'Transaction data',
rows: 1000,
columns: [
{ name: 'tx_id', type: 'string' },
{ name: 'customer_email', type: 'email', semanticType: 'contact.email' },
{ name: 'amount', type: 'number' }
],
data: Array.from({ length: 1000 }, (_, i) => ({
tx_id: `tx_${i}`,
customer_email: `user${i % 500}@example.com`, // Lowercase
amount: Math.random() * 1000
}))
};
const customersCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(customersDataset);
const transactionsCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(transactionsDataset);
const customersPath = (0, path_1.join)(tempDir, 'customers.csv');
const transactionsPath = (0, path_1.join)(tempDir, 'transactions.csv');
(0, fs_1.writeFileSync)(customersPath, customersCSV);
(0, fs_1.writeFileSync)(transactionsPath, transactionsCSV);
const customersDF = await loadCSVAsDataFrame(customersPath);
const transactionsDF = await loadCSVAsDataFrame(transactionsPath);
await (0, src_1.attachSemanticsShadow)(customersDF, {
email: { cid: 'contact.email', confidence: 0.95 }
});
await (0, src_1.attachSemanticsShadow)(transactionsDF, {
customer_email: { cid: 'contact.email', confidence: 0.93 }
});
// Join on the shared semantic CID rather than column names.
const joinResult = await performSemanticJoin(customersDF, transactionsDF, {
on: 'contact.email',
normalizer: 'email',
joinType: 'inner'
});
expect(joinResult.rows).toBeGreaterThan(900); // Should match most records
expect(joinResult.matchRate).toBeGreaterThan(0.9);
const joinedColumns = joinResult.columns;
expect(joinedColumns.some(c => c.includes('full_name'))).toBe(true);
expect(joinedColumns.some(c => c.includes('amount'))).toBe(true);
});
});
describe('SQL Generation for Multiple Targets', () => {
it('should generate compatible SQL for different platforms', async () => {
const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(100);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
const csvPath = (0, path_1.join)(tempDir, 'multi-platform.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
const dataFrame = await loadCSVAsDataFrame(csvPath);
await (0, src_1.attachSemanticsShadow)(dataFrame, {
customer_id: { cid: 'identity.customer', confidence: 0.95 },
timestamp: { cid: 'event.timestamp', confidence: 0.92 }
});
// Each target platform embeds its own JSON/semi-structured type.
const platforms = ['snowflake', 'bigquery', 'postgres', 'redshift'];
for (const platform of platforms) {
const sql = generateSQLForPlatform(dataFrame, platform);
expect(sql).toBeDefined();
expect(sql.length).toBeGreaterThan(100);
switch (platform) {
case 'snowflake':
expect(sql).toContain('CREATE OR REPLACE VIEW');
expect(sql).toContain('VARIANT');
break;
case 'bigquery':
expect(sql).toContain('CREATE OR REPLACE VIEW');
expect(sql).toContain('STRUCT');
break;
case 'postgres':
expect(sql).toContain('CREATE OR REPLACE VIEW');
expect(sql).toContain('JSONB');
break;
case 'redshift':
expect(sql).toContain('CREATE OR REPLACE VIEW');
expect(sql).toContain('SUPER');
break;
}
}
});
});
});
// Mock implementations for testing
/**
 * Minimal CSV → DataFrame loader used by the suite above.
 *
 * Fixes over the original:
 * - splits on /\r?\n/ so CRLF files do not leave a trailing '\r' glued to
 *   the last header and the last value of every row;
 * - trims header names so stray whitespace cannot break column lookups.
 *
 * NOTE(review): this parser deliberately does NOT handle quoted fields
 * containing commas — the generated fixtures never produce them.
 *
 * @param {string} path - filesystem path of the CSV file
 * @returns {Promise<object>} DataFrame-like object:
 *   { columns, rows, data, getColumn, getSemantics }
 */
async function loadCSVAsDataFrame(path) {
const content = (0, fs_1.readFileSync)(path, 'utf-8');
// Tolerate both LF and CRLF line endings; drop blank lines (e.g. EOF newline).
const lines = content.split(/\r?\n/).filter(line => line.trim());
const headers = lines[0].split(',').map(h => h.trim());
const rows = lines.slice(1).map(line => {
const values = line.split(',');
const row = {};
headers.forEach((header, index) => {
// Missing or empty cells become null (original behavior preserved).
row[header] = values[index] || null;
});
return row;
});
return {
columns: headers,
rows,
data: rows,
getColumn: (name) => rows.map(row => row[name]),
// Shadow semantics are attached externally; the mock exposes none itself.
getSemantics: (columnName) => undefined
};
}
/**
 * Returns a copy of `dataFrame` with columns renamed per `mapping`
 * (old name → new name). Columns absent from the mapping keep their
 * names; row objects are copied, never mutated in place.
 *
 * @param {object} dataFrame - DataFrame-like object with columns/rows/data
 * @param {Object<string,string>} mapping - oldName → newName
 * @returns {object} a new DataFrame-like object with renamed columns
 */
function renameColumns(dataFrame, mapping) {
const rename = (name) => mapping[name] || name;
const columns = dataFrame.columns.map(rename);
const rows = dataFrame.rows.map((row) => Object.fromEntries(
Object.entries(row).map(([key, value]) => [rename(key), value])
));
return { ...dataFrame, columns, rows, data: rows };
}
/**
 * Builds a Snowflake CREATE VIEW statement for the DataFrame, annotating
 * each column with its semantic mapping (when the reconciliation result
 * carries one) as a trailing SQL comment.
 *
 * @param {object} dataFrame - DataFrame-like object (uses .columns)
 * @param {object} reconciliation - reconcileAnchors result (may be null)
 * @returns {string} the CREATE OR REPLACE VIEW statement
 */
function generateSnowflakeSQL(dataFrame, reconciliation) {
const columnLines = [];
for (const col of dataFrame.columns) {
const semantic = getSemanticForColumn(col, reconciliation);
const annotation = semantic ? ` -- Semantic mapping: ${semantic}` : '';
columnLines.push(` ${col} ${getSnowflakeType(col)}${annotation}`);
}
return `CREATE OR REPLACE VIEW semantic_view AS
SELECT
${columnLines.join(',\n')}
FROM source_table;`;
}
/**
 * Generates a platform-flavored semantic view statement for one of the
 * supported warehouse targets, embedding the platform's semi-structured
 * JSON type as a comment.
 *
 * @param {object} dataFrame - accepted for interface symmetry (unused here)
 * @param {'snowflake'|'bigquery'|'postgres'|'redshift'} platform
 * @returns {string} the view DDL text
 * @throws {Error} if `platform` is not one of the supported targets
 */
function generateSQLForPlatform(dataFrame, platform) {
const platformConfig = {
snowflake: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'VARIANT' },
bigquery: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'STRUCT' },
postgres: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'JSONB' },
redshift: { viewType: 'CREATE OR REPLACE VIEW', jsonType: 'SUPER' }
};
// Fail loudly on typos/unsupported targets instead of the original's
// opaque TypeError when reading `config.viewType` off `undefined`.
// Object.hasOwn also avoids matching inherited keys like 'constructor'.
if (!Object.hasOwn(platformConfig, platform)) {
throw new Error(`Unsupported SQL platform: ${platform}`);
}
const config = platformConfig[platform];
return `${config.viewType} semantic_view AS
SELECT * FROM source_table
-- Platform: ${platform}
-- JSON Type: ${config.jsonType}`;
}
/**
 * Produces a copy of `originalDataset` where roughly 30% of rows have their
 * email replaced with an invalid placeholder, simulating pattern drift for
 * the detector tests. All other fields are left untouched; the input
 * dataset is never mutated.
 *
 * @param {object} originalDataset - dataset with a `data` array of rows
 * @returns {object} drifted copy named 'drifted_dataset'
 */
function createDriftedDataset(originalDataset) {
const INVALID_EMAIL = 'invalid-email@domain.invalid';
const data = originalDataset.data.map((row) => {
// ~70% keep the original email; ~30% get the invalid placeholder.
const keepOriginal = Math.random() > 0.3;
return { ...row, email: keepOriginal ? row.email : INVALID_EMAIL };
});
return { ...originalDataset, name: 'drifted_dataset', data };
}
/**
 * Stubbed semantic join: reports a fixed 95% match rate, a row count of
 * 95% of the smaller input, and a merged column list (df2's columns
 * deduplicated against df1's, df1's order first).
 * NOTE(review): `options` (join key / normalizer / join type) is accepted
 * but unused by this mock.
 *
 * @returns {Promise<{rows:number, matchRate:number, columns:string[]}>}
 */
async function performSemanticJoin(df1, df2, options) {
const mergedColumns = [...df1.columns];
for (const column of df2.columns) {
if (!df1.columns.includes(column)) {
mergedColumns.push(column);
}
}
const smallerRowCount = Math.min(df1.rows.length, df2.rows.length);
return {
rows: smallerRowCount * 0.95,
matchRate: 0.95,
columns: mergedColumns
};
}
/**
 * Looks up the semantic type recorded for `columnName` (as a post-rename
 * column) in a reconciliation result. Returns null when there is no
 * reconciliation, no matching entry, or the entry carries no semantic type.
 */
function getSemanticForColumn(columnName, reconciliation) {
const matches = reconciliation?.matches ?? [];
for (const entry of matches) {
if (entry.newColumn === columnName) {
return entry.semanticType || null;
}
}
return null;
}
/**
 * Heuristic column-name → Snowflake type mapping. Rules are checked in
 * order and the first substring match wins ('amount' takes precedence
 * over 'timestamp'/'date', which take precedence over 'email'); anything
 * else falls back to VARCHAR(100).
 */
function getSnowflakeType(columnName) {
const rules = [
{ needles: ['amount'], sqlType: 'NUMBER(10,2)' },
{ needles: ['timestamp', 'date'], sqlType: 'TIMESTAMP' },
{ needles: ['email'], sqlType: 'VARCHAR(255)' }
];
for (const { needles, sqlType } of rules) {
if (needles.some((needle) => columnName.includes(needle))) {
return sqlType;
}
}
return 'VARCHAR(100)';
}
//# sourceMappingURL=complete-workflow.test.js.map