// semantic-ds-toolkit
// Version:
// Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
// 282 lines • 15.4 kB
// JavaScript
;
// Helper emitted by the TypeScript compiler. Defines property `k2` (defaulting
// to `k`) on `o` so that it exposes property `k` of `m`.
//   o  - object receiving the property
//   m  - object the property is taken from
//   k  - property name on `m`
//   k2 - optional property name on `o`
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    var exportName = k2 === undefined ? k : k2;
    var descriptor = Object.getOwnPropertyDescriptor(m, k);
    // Decide whether the source descriptor can be reused as-is or has to be
    // replaced by a getter that reads m[k] on every access.
    var useLiveGetter;
    if (!descriptor) {
        useLiveGetter = true;
    }
    else if ("get" in descriptor) {
        useLiveGetter = !m.__esModule;
    }
    else {
        useLiveGetter = descriptor.writable || descriptor.configurable;
    }
    if (useLiveGetter) {
        descriptor = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, exportName, descriptor);
}) : (function(o, m, k, k2) {
    // Fallback used when Object.create is unavailable: plain value copy.
    o[k2 === undefined ? k : k2] = m[k];
}));
// Helper emitted by the TypeScript compiler. Stores `v` as the "default"
// property of `o`: an enumerable data property when Object.create exists,
// otherwise a plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    var defaultDescriptor = { value: v, enumerable: true };
    Object.defineProperty(o, "default", defaultDescriptor);
}) : function(o, v) {
    o.default = v;
});
// Helper emitted by the TypeScript compiler. Returns `mod` unchanged when it is
// flagged with `__esModule`; otherwise builds a new object that re-exposes every
// own property of `mod` except "default" (via __createBinding) and stores `mod`
// itself under "default" (via __setModuleDefault).
var __importStar = (this && this.__importStar) || (function () {
    // Replaces itself on first call with Object.getOwnPropertyNames, or with a
    // for-in based fallback when that function is unavailable.
    var listOwnKeys = function(target) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) {
                    names.push(key);
                }
            }
            return names;
        };
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, names[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
const anchors_1 = require("../../src/core/anchors");
const inference_engine_1 = require("../../src/inference/inference-engine");
const test_data_generator_1 = require("../fixtures/test-data-generator");
const dataset_loader_1 = require("../fixtures/dataset-loader");
const fs_1 = require("fs");
const path_1 = require("path");
const os = __importStar(require("os"));
describe('End-to-End: Basic Workflow Validation', () => {
let tempDir;
let anchorSystem;
let inferenceEngine;
// Shared fixture setup for the whole suite.
// The tests below write CSV files into tempDir with writeFileSync, which fails
// with ENOENT when the directory is missing, so the directory is actually
// created here. mkdtempSync appends a random suffix to the prefix, so parallel
// or repeated runs do not share (or delete) each other's scratch directory.
beforeAll(() => {
    tempDir = (0, fs_1.mkdtempSync)((0, path_1.join)(os.tmpdir(), 'semantic-basic-e2e-'));
    anchorSystem = new anchors_1.StableColumnAnchorSystem();
    inferenceEngine = new inference_engine_1.InferenceEngine();
});
// Remove the scratch directory once every test in the suite has finished.
afterAll(() => {
    // tempDir stays undefined if beforeAll failed before creating it.
    if (tempDir) {
        (0, fs_1.rmSync)(tempDir, { recursive: true, force: true });
    }
});
describe('Dataset Loading and Processing', () => {
// Generates a 100-row dataset and checks its reported shape and CSV rendering.
it('should successfully load and parse test datasets', async () => {
    const { TestDataGenerator } = test_data_generator_1;
    const generated = TestDataGenerator.generateLargeDataset(100);
    expect(generated).toBeDefined();
    expect(generated.rows).toBe(100);
    expect(generated.columns).toHaveLength(8);
    const csvText = TestDataGenerator.writeDatasetToCSV(generated);
    expect(csvText).toContain('customer_id,email,phone');
    // 1 header line + 100 data lines + the empty piece after the trailing newline.
    const csvLines = csvText.split('\n');
    expect(csvLines).toHaveLength(102);
});
// Writes a generated 50-row dataset to a CSV file, loads it back through
// DatasetLoader and checks the reported shape, inferred types and quality result.
it('should handle CSV parsing with the DatasetLoader', async () => {
    const { TestDataGenerator } = test_data_generator_1;
    const { DatasetLoader } = dataset_loader_1;
    const generated = TestDataGenerator.generateLargeDataset(50);
    const csvText = TestDataGenerator.writeDatasetToCSV(generated);
    const csvFile = (0, path_1.join)(tempDir, 'test-dataset.csv');
    (0, fs_1.writeFileSync)(csvFile, csvText);
    try {
        const loaded = await DatasetLoader.loadDataset(csvFile);
        expect(loaded.name).toBe('test-dataset');
        expect(loaded.rows).toHaveLength(50);
        expect(loaded.columns).toHaveLength(8);
        const { metadata } = loaded;
        expect(metadata.rowCount).toBe(50);
        expect(metadata.columnCount).toBe(8);
        // Per-column data types reported by the loader.
        const { dataTypes } = metadata;
        expect(dataTypes.customer_id).toBe('string');
        expect(dataTypes.email).toBe('email');
        expect(dataTypes.purchase_amount).toBe('float');
        // Quality validation on the loaded dataset.
        const quality = await DatasetLoader.validateDatasetQuality(loaded);
        expect(quality.score).toBeGreaterThan(0.8);
        expect(quality.issues).toBeDefined();
        expect(quality.recommendations).toBeDefined();
    }
    finally {
        // Remove the scratch file whether or not the assertions passed.
        (0, fs_1.unlinkSync)(csvFile);
    }
});
// Round-trips the generator's Unicode dataset through a CSV file and checks that
// Arabic and Chinese characters are still present in the loaded `name` values.
it('should handle edge case datasets', async () => {
    const { TestDataGenerator } = test_data_generator_1;
    const { DatasetLoader } = dataset_loader_1;
    const generated = TestDataGenerator.generateUnicodeDataset();
    const csvText = TestDataGenerator.writeDatasetToCSV(generated);
    const csvFile = (0, path_1.join)(tempDir, 'unicode-test.csv');
    (0, fs_1.writeFileSync)(csvFile, csvText);
    try {
        const loaded = await DatasetLoader.loadDataset(csvFile);
        expect(loaded.rows.length).toBe(generated.rows);
        expect(loaded.columns).toContain('name');
        expect(loaded.columns).toContain('email');
        // At least one loaded name must contain a character from each script range.
        const loadedNames = loaded.rows.map((row) => row.name);
        const anyNameMatches = (pattern) => loadedNames.some((value) => pattern.test(value));
        expect(anyNameMatches(/[\u0600-\u06FF]/)).toBe(true); // Arabic
        expect(anyNameMatches(/[\u4e00-\u9fff]/)).toBe(true); // Chinese
    }
    finally {
        // Remove the scratch file whether or not the assertions passed.
        (0, fs_1.unlinkSync)(csvFile);
    }
});
// Loads the generator's messy dataset from a CSV file and checks that nulls are
// counted and that the quality result flags problems.
it('should handle messy data gracefully', async () => {
    const { TestDataGenerator } = test_data_generator_1;
    const { DatasetLoader } = dataset_loader_1;
    const generated = TestDataGenerator.generateMessyDataset();
    const csvText = TestDataGenerator.writeDatasetToCSV(generated);
    const csvFile = (0, path_1.join)(tempDir, 'messy-test.csv');
    (0, fs_1.writeFileSync)(csvFile, csvText);
    try {
        const loaded = await DatasetLoader.loadDataset(csvFile);
        expect(loaded.rows.length).toBe(generated.rows);
        // Null bookkeeping in the loader's metadata.
        const { metadata } = loaded;
        expect(metadata.nullCount).toBeGreaterThan(0);
        expect(metadata.nullPercentage).toBeGreaterThan(0);
        // Messy input must score below 0.7 and report at least one issue.
        const quality = await DatasetLoader.validateDatasetQuality(loaded);
        expect(quality.score).toBeLessThan(0.7);
        expect(quality.issues.length).toBeGreaterThan(0);
    }
    finally {
        // Remove the scratch file whether or not the assertions passed.
        (0, fs_1.unlinkSync)(csvFile);
    }
});
});
describe('Inference Engine Integration', () => {
// Runs the inference engine on the email and customer_id columns of a 200-row
// dataset and checks the reported column name, confidence and semantic type.
it('should perform column type inference', async () => {
    const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(200);
    // Column to infer, paired with the text its semantic type must contain.
    const expectations = [
        { column: 'email', semanticFragment: 'email' },
        { column: 'customer_id', semanticFragment: 'identifier' },
    ];
    // Processed one after the other, in the order listed above.
    for (const { column, semanticFragment } of expectations) {
        const values = dataset.data.map((row) => row[column]);
        const inference = await inferenceEngine.inferColumnType(column, values);
        expect(inference.columnName).toBe(column);
        expect(inference.confidence).toBeGreaterThan(0.7);
        expect(inference.semanticType).toContain(semanticFragment);
    }
});
// Times inference over every column of a 10,000-row dataset (fast mode, sample
// size 1000) and asserts on wall-clock duration and derived throughput.
it('should handle performance requirements for inference', async () => {
    const rowCount = 10_000;
    const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(rowCount);
    const inferenceOptions = () => ({
        performanceMode: 'fast',
        sampleSize: 1000
    });
    // Timed section: value extraction and inference for all columns.
    const startedAt = Date.now();
    const pending = dataset.columns.map(async (column) => {
        const columnValues = dataset.data.map((row) => row[column.name]);
        return inferenceEngine.inferColumnType(column.name, columnValues, inferenceOptions());
    });
    const inferences = await Promise.all(pending);
    const elapsedMs = Date.now() - startedAt;
    const rowsPerSecond = rowCount / (elapsedMs / 1000);
    expect(inferences).toHaveLength(dataset.columns.length);
    expect(elapsedMs).toBeLessThan(5000); // Should complete in <5 seconds
    expect(rowsPerSecond).toBeGreaterThan(2000); // >2k rows/second minimum
    // Every column must have produced a usable inference.
    for (const inference of inferences) {
        expect(inference.columnName).toBeDefined();
        expect(inference.semanticType).toBeDefined();
        expect(inference.confidence).toBeGreaterThan(0);
    }
});
});
describe('Anchor System Integration', () => {
// Creates an anchor for every column of a 100-row dataset and checks the fields
// exposed on each returned anchor.
it('should create stable column anchors', async () => {
    const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(100);
    // Maps the generator's column type to the data_type string passed to the
    // anchor system; anything other than 'number' or 'date' becomes 'object'.
    const toAnchorDataType = (generatorType) => {
        switch (generatorType) {
            case 'number':
                return 'float64';
            case 'date':
                return 'datetime64';
            default:
                return 'object';
        }
    };
    // Reshape the dataset into { column_name, values, data_type } records.
    const columnData = dataset.columns.map((col) => {
        const values = dataset.data.map((row) => row[col.name]);
        return {
            column_name: col.name,
            values,
            data_type: toAnchorDataType(col.type)
        };
    });
    for (const column of columnData) {
        const datasetDescriptor = { name: 'test_dataset', columns: columnData };
        const anchor = anchorSystem.createAnchor(datasetDescriptor, column);
        expect(anchor).toBeDefined();
        expect(anchor.anchorId).toBeDefined();
        expect(anchor.columnName).toBe(column.column_name);
        expect(anchor.fingerprint).toBeDefined();
        expect(anchor.fingerprint.statistics).toBeDefined();
    }
});
// Renames two columns (customer_id -> cust_id, email -> email_address) while
// keeping their values, then checks that reconciliation against the anchors of
// the original columns still finds a match for 'cust_id'.
it('should handle anchor reconciliation', async () => {
    const originalData = test_data_generator_1.TestDataGenerator.generateLargeDataset(50);
    const renamedByOriginal = new Map([
        ['customer_id', 'cust_id'],
        ['email', 'email_address'],
    ]);
    const originalByRenamed = new Map([
        ['cust_id', 'customer_id'],
        ['email_address', 'email'],
    ]);
    const modifiedData = {
        ...originalData,
        columns: originalData.columns.map((col) => ({
            ...col,
            name: renamedByOriginal.get(col.name) ?? col.name
        }))
    };
    // Builds one column record; values are always read from the original rows
    // under `sourceName`.
    const toObjectColumn = (columnName, sourceName) => ({
        column_name: columnName,
        values: originalData.data.map((row) => row[sourceName]),
        data_type: 'object'
    });
    const originalColumns = originalData.columns.map((col) => toObjectColumn(col.name, col.name));
    const modifiedColumns = modifiedData.columns.map((col) => toObjectColumn(col.name, originalByRenamed.get(col.name) ?? col.name));
    // Anchors are created from the original column set only.
    const originalAnchors = originalColumns.map((col) => {
        const datasetDescriptor = { name: 'original', columns: originalColumns };
        return anchorSystem.createAnchor(datasetDescriptor, col);
    });
    // Reconcile the renamed columns against those anchors.
    const reconciliation = anchorSystem.reconcileAnchors(modifiedColumns, originalAnchors, { similarity_threshold: 0.8 });
    expect(reconciliation).toBeDefined();
    expect(reconciliation.total_columns).toBe(modifiedColumns.length);
    expect(reconciliation.matched_columns).toBeGreaterThan(0);
    // The renamed customer id column should be matched.
    const customerMatch = reconciliation.matches.find((candidate) => candidate.new_column === 'cust_id');
    expect(customerMatch).toBeDefined();
    expect(customerMatch?.confidence_score).toBeGreaterThan(0.8);
});
});
describe('Performance Validation', () => {
// Loads and validates ten 1000-row datasets one after another and asserts that
// heapUsed grew by less than 50 MB over the whole run.
it('should maintain reasonable memory usage', async () => {
    const { TestDataGenerator } = test_data_generator_1;
    const { DatasetLoader } = dataset_loader_1;
    const BYTES_PER_MB = 1024 * 1024;
    const { heapUsed: heapBefore } = process.memoryUsage();
    for (let round = 0; round < 10; round += 1) {
        const generated = TestDataGenerator.generateLargeDataset(1000);
        const csvText = TestDataGenerator.writeDatasetToCSV(generated);
        const csvFile = (0, path_1.join)(tempDir, `perf-test-${round}.csv`);
        (0, fs_1.writeFileSync)(csvFile, csvText);
        try {
            const loaded = await DatasetLoader.loadDataset(csvFile);
            await DatasetLoader.validateDatasetQuality(loaded);
        }
        finally {
            // Remove the scratch file whether or not loading succeeded.
            (0, fs_1.unlinkSync)(csvFile);
        }
        // global.gc only exists when the runtime exposes it; call it if present.
        if (global.gc) {
            global.gc();
        }
    }
    const { heapUsed: heapAfter } = process.memoryUsage();
    const growthMb = (heapAfter - heapBefore) / BYTES_PER_MB;
    expect(growthMb).toBeLessThan(50); // <50MB growth for processing 10k rows
});
// Starts five load-and-validate tasks together, then checks each task's
// duration and quality score plus the average duration.
it('should handle concurrent operations', async () => {
    const { TestDataGenerator } = test_data_generator_1;
    const { DatasetLoader } = dataset_loader_1;
    // One task: write a 500-row CSV, then time loading and validating it.
    const runTask = async (taskId) => {
        const generated = TestDataGenerator.generateLargeDataset(500);
        const csvFile = (0, path_1.join)(tempDir, `concurrent-${taskId}.csv`);
        const csvText = TestDataGenerator.writeDatasetToCSV(generated);
        (0, fs_1.writeFileSync)(csvFile, csvText);
        try {
            const startedAt = Date.now();
            const loaded = await DatasetLoader.loadDataset(csvFile);
            const quality = await DatasetLoader.validateDatasetQuality(loaded);
            const duration = Date.now() - startedAt;
            return { duration, qualityScore: quality.score, taskId };
        }
        finally {
            // Remove the scratch file whether or not the task succeeded.
            (0, fs_1.unlinkSync)(csvFile);
        }
    };
    const pending = Array.from({ length: 5 }, (_, index) => runTask(index));
    const outcomes = await Promise.all(pending);
    expect(outcomes).toHaveLength(5);
    let totalDuration = 0;
    for (const outcome of outcomes) {
        expect(outcome.duration).toBeLessThan(2000); // <2 seconds per task
        expect(outcome.qualityScore).toBeGreaterThan(0.8);
        totalDuration += outcome.duration;
    }
    const averageDuration = totalDuration / outcomes.length;
    expect(averageDuration).toBeLessThan(1000); // Average <1 second
});
});
});
//# sourceMappingURL=basic-e2e.test.js.map