semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
402 lines • 21.1 kB
JavaScript
"use strict";
// Module-interop helper (appears to be compiler-emitted; the file ends with a source-map reference).
// Exposes property `k` of `m` on `o` under the name `k2`. A pre-existing `this.__createBinding` is reused if present.
//   o  - object that receives the property
//   m  - object the property is read from
//   k  - property name on `m`
//   k2 - property name on `o`; defaults to `k` when undefined
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Reuse m's own descriptor only when it exists and is either an accessor on an object flagged __esModule,
// or a data property that is neither writable nor configurable. In every other case fall back to an
// enumerable getter so that reads through `o` always reflect the current value of m[k].
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback used when Object.create is falsy: copy the current value with a plain assignment (no live binding).
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores `moduleValue` as the "default" member of `target`.
// A pre-existing `this.__setModuleDefault` wins if present. When Object.create is available the member is
// defined as an enumerable data property via Object.defineProperty; otherwise it is assigned directly.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, moduleValue) {
        var defaultDescriptor = { enumerable: true, value: moduleValue };
        Object.defineProperty(target, "default", defaultDescriptor);
    }
    : function (target, moduleValue) {
        target["default"] = moduleValue;
    });
// Module-interop helper used for `os` below: wraps a required module in a namespace-style object.
// A pre-existing `this.__importStar` is reused if present.
// The returned function:
//   - returns `mod` unchanged when it is truthy and flagged `__esModule`;
//   - otherwise builds a fresh object, binds every own key of `mod` except "default" onto it via
//     __createBinding, then sets the whole `mod` as its "default" via __setModuleDefault.
var __importStar = (this && this.__importStar) || (function () {
// ownKeys replaces itself on first call with Object.getOwnPropertyNames, or with a for-in/hasOwnProperty
// fallback when that is unavailable, so the feature check runs only once.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` skips both null and undefined; "default" is excluded here because it is set explicitly below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const src_1 = require("../../src");
const test_data_generator_1 = require("../fixtures/test-data-generator");
const fs_1 = require("fs");
const path_1 = require("path");
const os = __importStar(require("os"));
// End-to-end suite exercising the seams between the toolkit's components: inference, anchors,
// normalizers, fuzzy/pattern matching, drift detection and statistics. Each test writes a CSV
// fixture into a per-run temp directory and loads it back through loadCSVAsDataFrame.
describe('End-to-End: Integration Points Validation', () => {
    let tempDir;
    let anchorSystem;
    let inferenceEngine;
    let driftDetector;
    let fuzzyMatcher;
    let patternMatcher;
    let statisticalAnalyzer;
    beforeAll(() => {
        // The directory must exist before any fixture is written: writeFileSync does not create
        // missing parent directories. mkdtempSync also gives every run its own directory, so
        // concurrent runs cannot clobber each other's fixtures.
        tempDir = (0, fs_1.mkdtempSync)((0, path_1.join)(os.tmpdir(), 'semantic-integration-tests-'));
        anchorSystem = new src_1.StableColumnAnchorSystem();
        inferenceEngine = new src_1.InferenceEngine();
        driftDetector = new src_1.DriftDetector();
        fuzzyMatcher = new src_1.FuzzyMatcher();
        patternMatcher = new src_1.PatternMatcher();
        statisticalAnalyzer = new src_1.StatisticalAnalyzer();
    });
    afterAll(() => {
        // Remove the fixtures written by this run; `force` keeps cleanup from failing the suite
        // if the directory is already gone.
        if (tempDir) {
            (0, fs_1.rmSync)(tempDir, { recursive: true, force: true });
        }
    });
    beforeEach(() => {
        src_1.ShadowSemanticsAPI.resetShadowAPI();
    });
    describe('Inference Engine ↔ Anchor System Integration', () => {
        it('should create anchors with inferred semantic types', async () => {
            const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
            const csvPath = (0, path_1.join)(tempDir, 'inference-anchor.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const dataFrame = await loadCSVAsDataFrame(csvPath);
            const inferenceResults = await inferenceEngine.inferSchema(dataFrame);
            expect(inferenceResults.columns.length).toEqual(8);
            // Only high-confidence inferences are attached as shadow semantics.
            const inferredMappings = {};
            inferenceResults.columns.forEach(col => {
                if (col.semanticType && col.confidence > 0.8) {
                    inferredMappings[col.name] = {
                        cid: col.semanticType,
                        confidence: col.confidence
                    };
                }
            });
            await (0, src_1.attachSemanticsShadow)(dataFrame, inferredMappings);
            const anchors = await anchorSystem.createAnchors(dataFrame);
            expect(anchors.length).toEqual(8);
            const semanticAnchors = anchors.filter(anchor => {
                const semantics = dataFrame.getSemantics?.(anchor.columnName);
                return semantics && semantics.confidence > 0.8;
            });
            expect(semanticAnchors.length).toBeGreaterThanOrEqual(4);
            for (const anchor of semanticAnchors) {
                expect(anchor.fingerprint).toBeDefined();
                expect(anchor.fingerprint.statistics).toBeDefined();
                expect(anchor.fingerprint.patterns).toBeDefined();
            }
        });
        it('should propagate semantic confidence through anchor matching', async () => {
            const originalDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(500);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(originalDataset);
            const originalPath = (0, path_1.join)(tempDir, 'confidence-original.csv');
            (0, fs_1.writeFileSync)(originalPath, csvContent);
            const originalDF = await loadCSVAsDataFrame(originalPath);
            await (0, src_1.attachSemanticsShadow)(originalDF, {
                customer_id: { cid: 'identity.customer', confidence: 0.95 },
                email: { cid: 'contact.email', confidence: 0.88 },
                phone: { cid: 'contact.phone', confidence: 0.82 }
            });
            // The returned anchors are not inspected here; the call is retained in case anchor
            // creation has side effects that reconcileAnchors relies on — TODO confirm.
            await anchorSystem.createAnchors(originalDF);
            const modifiedDF = renameColumns(originalDF, {
                customer_id: 'cust_id',
                email: 'email_addr'
            });
            const reconciliation = await (0, src_1.reconcileAnchors)(originalDF, modifiedDF);
            const customerMatch = reconciliation.matches.find(m => m.originalColumn === 'customer_id');
            expect(customerMatch?.confidence).toBeGreaterThan(0.90);
            const emailMatch = reconciliation.matches.find(m => m.originalColumn === 'email');
            expect(emailMatch?.confidence).toBeGreaterThan(0.85);
            expect(reconciliation.confidenceMetrics.averageConfidence).toBeGreaterThan(0.80);
            expect(reconciliation.confidenceMetrics.highConfidenceMatches).toBeGreaterThanOrEqual(2);
        });
    });
    describe('Normalizers ↔ Fuzzy Matching Integration', () => {
        it('should use normalized values for improved matching accuracy', async () => {
            const customerDataset = {
                name: 'customers_unnormalized',
                description: 'Customer data with unnormalized emails and phones',
                rows: 100,
                columns: [
                    { name: 'id', type: 'string' },
                    { name: 'email', type: 'string' },
                    { name: 'phone', type: 'string' }
                ],
                data: Array.from({ length: 100 }, (_, i) => ({
                    id: `cust_${i}`,
                    email: ` USER${i}@EXAMPLE.COM `, // Whitespace + uppercase
                    phone: `+1 (555) ${String(i).padStart(3, '0')}-${String(i * 2).padStart(4, '0')}` // Various formats
                }))
            };
            const transactionDataset = {
                name: 'transactions_unnormalized',
                description: 'Transaction data with different email/phone formats',
                rows: 100,
                columns: [
                    { name: 'tx_id', type: 'string' },
                    { name: 'customer_email', type: 'string' },
                    { name: 'customer_phone', type: 'string' }
                ],
                data: Array.from({ length: 100 }, (_, i) => ({
                    tx_id: `tx_${i}`,
                    customer_email: `user${i}@example.com`, // Lowercase, no whitespace
                    customer_phone: `555${String(i).padStart(3, '0')}${String(i * 2).padStart(4, '0')}` // No formatting
                }))
            };
            const customersCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(customerDataset);
            const transactionsCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(transactionDataset);
            const customersPath = (0, path_1.join)(tempDir, 'customers_norm.csv');
            const transactionsPath = (0, path_1.join)(tempDir, 'transactions_norm.csv');
            (0, fs_1.writeFileSync)(customersPath, customersCSV);
            (0, fs_1.writeFileSync)(transactionsPath, transactionsCSV);
            const customersDF = await loadCSVAsDataFrame(customersPath);
            const transactionsDF = await loadCSVAsDataFrame(transactionsPath);
            const normalizedCustomerEmails = customersDF.data.map((row) => (0, src_1.normalizeEmail)(row.email, { preserveCase: false, trimWhitespace: true }));
            const normalizedTransactionEmails = transactionsDF.data.map((row) => (0, src_1.normalizeEmail)(row.customer_email, { preserveCase: false, trimWhitespace: true }));
            // Rows are generated in the same order in both datasets, so they are compared by index.
            let exactMatches = 0;
            for (let i = 0; i < normalizedCustomerEmails.length; i++) {
                if (normalizedCustomerEmails[i].normalized === normalizedTransactionEmails[i].normalized) {
                    exactMatches++;
                }
            }
            expect(exactMatches).toBeGreaterThan(95); // Should match almost all
            const phoneMatches = customersDF.data.map((row, index) => {
                const normalizedCustomerPhone = (0, src_1.normalizePhone)(row.phone);
                const normalizedTransactionPhone = (0, src_1.normalizePhone)(transactionsDF.data[index].customer_phone);
                return normalizedCustomerPhone.normalized === normalizedTransactionPhone.normalized;
            }).filter(Boolean).length;
            expect(phoneMatches).toBeGreaterThan(95);
        });
        it('should integrate fuzzy matching with pattern recognition', async () => {
            const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(200);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
            const csvPath = (0, path_1.join)(tempDir, 'fuzzy-pattern.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const dataFrame = await loadCSVAsDataFrame(csvPath);
            const emailColumn = dataFrame.data.map((row) => row.email);
            const patterns = await patternMatcher.detectPatterns(emailColumn);
            expect(patterns.length).toBeGreaterThan(0);
            const emailPattern = patterns.find(p => p.type === 'email');
            expect(emailPattern).toBeDefined();
            expect(emailPattern?.confidence).toBeGreaterThan(0.90);
            const fuzzyResults = await fuzzyMatcher.findSimilar(emailColumn[0], emailColumn.slice(1, 10));
            expect(fuzzyResults.length).toBeGreaterThan(0);
            expect(fuzzyResults[0].similarity).toBeLessThanOrEqual(1.0);
            expect(fuzzyResults[0].similarity).toBeGreaterThan(0.0);
            const highSimilarityMatches = fuzzyResults.filter(r => r.similarity > 0.8);
            // NOTE(review): an array length is never negative, so this assertion cannot fail; it only
            // proves the filter runs. Tighten it once the expected number of close matches is known.
            expect(highSimilarityMatches.length).toBeGreaterThanOrEqual(0);
        });
    });
    describe('Shadow Semantics ↔ Drift Detection Integration', () => {
        it('should detect semantic drift in attached shadow semantics', async () => {
            const baselineDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(baselineDataset);
            const baselinePath = (0, path_1.join)(tempDir, 'semantic-drift-baseline.csv');
            (0, fs_1.writeFileSync)(baselinePath, csvContent);
            const baselineDF = await loadCSVAsDataFrame(baselinePath);
            await (0, src_1.attachSemanticsShadow)(baselineDF, {
                customer_id: { cid: 'identity.customer', confidence: 0.95 },
                email: { cid: 'contact.email', confidence: 0.92 },
                timestamp: { cid: 'event.timestamp', confidence: 0.88 }
            });
            // The drifted copy carries the same semantic labels but corrupted values.
            const driftedDataset = createSemanticDriftDataset(baselineDataset);
            const driftedCSV = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(driftedDataset);
            const driftedPath = (0, path_1.join)(tempDir, 'semantic-drift-drifted.csv');
            (0, fs_1.writeFileSync)(driftedPath, driftedCSV);
            const driftedDF = await loadCSVAsDataFrame(driftedPath);
            await (0, src_1.attachSemanticsShadow)(driftedDF, {
                customer_id: { cid: 'identity.customer', confidence: 0.95 },
                email: { cid: 'contact.email', confidence: 0.92 },
                timestamp: { cid: 'event.timestamp', confidence: 0.88 }
            });
            const driftResults = await driftDetector.detectDrift(baselineDF, driftedDF, {
                includeSemanticDrift: true,
                alertThreshold: 0.1
            });
            expect(driftResults.alerts.length).toBeGreaterThan(0);
            const semanticDriftAlert = driftResults.alerts.find(alert => alert.type === 'semantic_drift');
            expect(semanticDriftAlert).toBeDefined();
            const emailPatternDrift = driftResults.alerts.find(alert => alert.column === 'email' && alert.type === 'pattern_drift');
            expect(emailPatternDrift).toBeDefined();
            expect(emailPatternDrift?.severity).toBeGreaterThan(0.2);
        });
    });
    describe('Statistical Analysis ↔ Performance Optimization Integration', () => {
        it('should maintain statistical accuracy under performance constraints', async () => {
            const largeDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(10000);
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(largeDataset);
            const csvPath = (0, path_1.join)(tempDir, 'stats-perf.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const dataFrame = await loadCSVAsDataFrame(csvPath);
            // The timed window covers both column extraction and the analysis itself.
            const startTime = Date.now();
            const purchaseAmountColumn = dataFrame.data.map((row) => parseFloat(row.purchase_amount));
            const stats = await statisticalAnalyzer.analyze(purchaseAmountColumn);
            const analysisTime = Date.now() - startTime;
            expect(analysisTime).toBeLessThan(1000); // Should complete under 1 second
            expect(stats.mean).toBeGreaterThan(0);
            expect(stats.variance).toBeGreaterThan(0);
            // Compared with a tolerance: two floating-point computations of the same quantity can
            // differ in the last bits, which would make an exact equality check flaky.
            expect(stats.standardDeviation).toBeCloseTo(Math.sqrt(stats.variance), 10);
            expect(stats.min).toBeLessThanOrEqual(stats.max);
            expect(stats.median).toBeGreaterThanOrEqual(stats.min);
            expect(stats.median).toBeLessThanOrEqual(stats.max);
            const anchors = await anchorSystem.createAnchors(dataFrame);
            const purchaseAnchor = anchors.find(a => a.columnName === 'purchase_amount');
            expect(purchaseAnchor?.fingerprint.statistics.mean).toBeCloseTo(stats.mean, 1);
            expect(purchaseAnchor?.fingerprint.statistics.variance).toBeCloseTo(stats.variance, 1);
        });
    });
    describe('Cross-Component Error Handling', () => {
        it('should gracefully handle errors across component boundaries', async () => {
            const corruptedDataset = {
                name: 'corrupted',
                description: 'Dataset with corrupted data for error testing',
                rows: 50,
                columns: [
                    { name: 'id', type: 'string' },
                    { name: 'corrupted_json', type: 'string' },
                    { name: 'invalid_number', type: 'string' }
                ],
                data: Array.from({ length: 50 }, (_, i) => ({
                    id: i % 10 === 0 ? null : `id_${i}`, // Some nulls
                    corrupted_json: i % 5 === 0 ? '{"invalid": json}' : '{"valid": "json"}',
                    invalid_number: i % 3 === 0 ? 'not_a_number' : String(Math.random() * 100)
                }))
            };
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(corruptedDataset);
            const csvPath = (0, path_1.join)(tempDir, 'corrupted.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const dataFrame = await loadCSVAsDataFrame(csvPath);
            // Inference may either succeed or fail on this input; only the error's wording is
            // checked when it does fail.
            let inferenceError = null;
            try {
                await inferenceEngine.inferSchema(dataFrame);
            }
            catch (error) {
                inferenceError = error;
            }
            if (inferenceError) {
                expect(inferenceError.message).toContain('inference');
            }
            // Anchor creation must either succeed or fail with a data-quality error.
            let anchorError = null;
            try {
                await anchorSystem.createAnchors(dataFrame);
            }
            catch (error) {
                anchorError = error;
            }
            const shouldHandleGracefully = anchorError === null ||
                (anchorError && anchorError.message.includes('data quality'));
            expect(shouldHandleGracefully).toBe(true);
        });
        it('should maintain data lineage through error recovery', async () => {
            const partialDataset = test_data_generator_1.TestDataGenerator.generateMessyDataset();
            const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(partialDataset);
            const csvPath = (0, path_1.join)(tempDir, 'partial-recovery.csv');
            (0, fs_1.writeFileSync)(csvPath, csvContent);
            const dataFrame = await loadCSVAsDataFrame(csvPath);
            const partialResults = await inferenceEngine.inferSchema(dataFrame, {
                skipOnError: true,
                requireMinimumConfidence: 0.5
            });
            expect(partialResults.columns.length).toBeGreaterThan(0);
            const successfulColumns = partialResults.columns.filter(c => c.confidence > 0.5);
            expect(successfulColumns.length).toBeGreaterThanOrEqual(2);
            const semanticMappings = {};
            successfulColumns.forEach(col => {
                semanticMappings[col.name] = {
                    cid: col.semanticType,
                    confidence: col.confidence
                };
            });
            await (0, src_1.attachSemanticsShadow)(dataFrame, semanticMappings);
            const anchors = await anchorSystem.createAnchors(dataFrame);
            const validAnchors = anchors.filter(a => a.fingerprint.statistics.dataQualityScore > 0.5);
            expect(validAnchors.length).toBeGreaterThanOrEqual(1);
            for (const anchor of validAnchors) {
                expect(anchor.lineage).toBeDefined();
                expect(anchor.lineage?.source).toBeDefined();
            }
        });
    });
});
// Helper functions
/**
 * Reads a CSV file and wraps it in a minimal data-frame-like object for the tests.
 *
 * Parsing is deliberately naive: the first non-blank line is the header, fields are split on
 * every comma, and quoted fields are NOT interpreted (a quoted value containing a comma would
 * be split). Blank lines are skipped. Both LF and CRLF line endings are accepted.
 *
 * Cell values are kept as strings, except that an empty field, the literal text `null`, or a
 * field missing from a short row becomes `null`.
 *
 * @param {string} path - Location of the CSV file to read.
 * @returns {Promise<{columns: string[], rows: object[], data: object[], getSemantics: Function}>}
 *   `rows` and `data` reference the same array. An empty or blank-only file yields empty
 *   `columns` and `rows`. `getSemantics(columnName)` looks the column up in
 *   `global.__ATTACHED_SEMANTICS__` and returns `undefined` when nothing is registered.
 * @throws Propagates the file-system error when the file cannot be read.
 */
async function loadCSVAsDataFrame(path) {
    const content = (0, fs_1.readFileSync)(path, 'utf-8');
    // Splitting on /\r?\n/ keeps a trailing "\r" from sticking to the last header and the last
    // value of every row when the file uses Windows line endings.
    const lines = content.split(/\r?\n/).filter(line => line.trim());
    // An empty file has no header line at all; treat it as a frame with no columns.
    const headers = lines.length > 0 ? lines[0].split(',') : [];
    const rows = lines.slice(1).map(line => {
        const values = line.split(',');
        const row = {};
        headers.forEach((header, index) => {
            const value = values[index];
            // `undefined` means the row had fewer fields than the header; it is reported as
            // missing in the same way as an explicitly empty field.
            const isMissing = value === undefined || value === '' || value === 'null';
            row[header] = isMissing ? null : value;
        });
        return row;
    });
    return {
        columns: headers,
        rows,
        data: rows,
        getSemantics: (columnName) => {
            // Mock implementation that returns attached semantics
            const semantics = global.__ATTACHED_SEMANTICS__?.[columnName];
            return semantics;
        }
    };
}
/**
 * Returns a copy of `dataFrame` with columns renamed according to `mapping`.
 *
 * The input frame and its row objects are not modified. In the result, `rows` and `data`
 * reference the same new array and `getSemantics` is carried over from the input frame.
 *
 * @param {{columns: string[], rows: object[], getSemantics?: Function}} dataFrame - Frame to copy.
 * @param {Object<string, string>} mapping - Old column name → new column name. Columns without
 *   an own, truthy entry keep their name.
 * @returns {object} A new frame with renamed `columns`, `rows` and `data`.
 */
function renameColumns(dataFrame, mapping) {
    // Only own entries of `mapping` count. A bare `mapping[name]` lookup would also find members
    // inherited from Object.prototype, so a column called "toString" or "constructor" would be
    // "renamed" to a function.
    const resolveName = (name) => {
        const renamed = Object.prototype.hasOwnProperty.call(mapping, name) ? mapping[name] : undefined;
        return renamed || name;
    };
    const newColumns = dataFrame.columns.map((col) => resolveName(col));
    const newRows = dataFrame.rows.map((row) => {
        const newRow = {};
        for (const key of Object.keys(row)) {
            newRow[resolveName(key)] = row[key];
        }
        return newRow;
    });
    return {
        ...dataFrame,
        columns: newColumns,
        rows: newRows,
        data: newRows,
        getSemantics: dataFrame.getSemantics
    };
}
/**
 * Builds a "drifted" copy of a dataset by corrupting selected fields at fixed row intervals.
 *
 * Rows are shallow-copied, so the original dataset and its rows are left untouched. Based on the
 * zero-based row index:
 *   - every 5th row gets a malformed `email`,
 *   - every 7th row gets a `customer_id` of the form `NEW_<index>`,
 *   - every 6th row gets an impossible `timestamp`.
 *
 * @param {{data: object[]}} originalDataset - Dataset whose `data` rows are copied and altered.
 * @returns {object} A copy of the dataset named 'drifted_dataset' with the altered rows as `data`.
 */
function createSemanticDriftDataset(originalDataset) {
    const data = originalDataset.data.map((sourceRow, position) => {
        const overrides = {};
        if (position % 5 === 0) {
            // Email no longer matches an address pattern
            overrides.email = 'invalid.email.format';
        }
        if (position % 7 === 0) {
            // Customer ID switches to a different naming scheme
            overrides.customer_id = `NEW_${position}`;
        }
        if (position % 6 === 0) {
            // Month 13 / day 45: not a valid date
            overrides.timestamp = '2023-13-45';
        }
        return { ...sourceRow, ...overrides };
    });
    return { ...originalDataset, name: 'drifted_dataset', data };
}
// Mock support for shadow-semantics attachment in these tests.
// Keeps a reference to the real attachSemanticsShadow export, and clears the global registry that the
// data frames' getSemantics() reads from before every test.
// NOTE(review): nothing in this file reads originalAttachSemanticsShadow or installs
// mockAttachSemanticsShadow in place of src_1.attachSemanticsShadow, so the registry is only populated
// if something outside this file writes to it — confirm how the mock is meant to be wired in.
const originalAttachSemanticsShadow = src_1.attachSemanticsShadow;
beforeEach(() => {
global.__ATTACHED_SEMANTICS__ = {};
});
/**
 * Test double for attachSemanticsShadow: merges `semantics` into the registry held on
 * `global.__ATTACHED_SEMANTICS__`, with new entries overriding existing ones of the same name.
 * The registry object is replaced rather than mutated.
 *
 * @param {object} dataFrame - Accepted for signature compatibility; not used.
 * @param {object} semantics - Column name → semantics entry to record.
 * @returns {Promise<void>} An already-resolved promise.
 */
function mockAttachSemanticsShadow(dataFrame, semantics) {
    const merged = { ...global.__ATTACHED_SEMANTICS__, ...semantics };
    global.__ATTACHED_SEMANTICS__ = merged;
    return Promise.resolve();
}
//# sourceMappingURL=integration-points.test.js.map