// semantic-ds-toolkit — Performance-first semantic layer for modern data stacks
// (Stable Column Anchors & intelligent inference). Compiled Jest test output.
;
// TypeScript-emitted interop helper (do not hand-edit; regenerate from the .ts
// source): re-exports property `k` of module `m` as property `k2` on namespace
// object `o`, reusing an existing one if the environment already defines it.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
// Live binding: a getter keeps `o[k2]` in sync with later writes to `m[k]`.
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Legacy fallback (engines without Object.create): plain copy, not a live binding.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// TypeScript-emitted interop helper: attach the original module object `v` as
// the `default` property of the namespace object `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (o, v) {
        // Modern engines: non-writable, non-configurable data property
        // (defineProperty defaults), visible to enumeration.
        Object.defineProperty(o, "default", { enumerable: true, value: v });
    }
    : function (o, v) {
        // Legacy engines: plain writable assignment.
        o.default = v;
    });
// TypeScript-emitted interop helper: convert a CommonJS export object into an
// ES-module-style namespace — copies every own key except "default" via
// __createBinding, then sets `default` to the original module object.
// Real ES modules (mod.__esModule set) are returned unchanged.
var __importStar = (this && this.__importStar) || (function () {
// Lazy one-time selection of the key-listing strategy: the outer function
// replaces itself with the chosen implementation on first call.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
// Fallback for very old engines: enumerate own enumerable keys only.
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const test_data_generator_1 = require("../fixtures/test-data-generator");
const dataset_loader_1 = require("../fixtures/dataset-loader");
const fs_1 = require("fs");
const path_1 = require("path");
const os = __importStar(require("os"));
const anchors_1 = require("../../src/core/anchors");
const attachment_api_1 = require("../../src/core/attachment-api");
describe('End-to-End: Simplified Integration Tests', () => {
// Shared scratch directory for the temp CSV fixtures written by these tests.
let tempDir;
beforeAll(() => {
// NOTE(review): individual tests unlink their own CSVs, but this directory
// itself is left behind in os.tmpdir() — presumably intentional; confirm.
tempDir = (0, path_1.join)(os.tmpdir(), 'semantic-simple-e2e');
if (!(0, fs_1.existsSync)(tempDir)) {
(0, fs_1.mkdirSync)(tempDir, { recursive: true });
}
});
// Round-trip sanity: generated fixture datasets must survive CSV write + load.
describe('Test Data Generation and Loading', () => {
it('should generate and load datasets successfully', async () => {
const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(100);
expect(dataset).toBeDefined();
expect(dataset.rows).toBe(100);
expect(dataset.columns).toHaveLength(8);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
// Header row must lead with the expected schema columns.
expect(csvContent).toContain('customer_id,email,phone');
const csvPath = (0, path_1.join)(tempDir, 'test-dataset.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
expect(loadedDataset.rows).toHaveLength(100);
expect(loadedDataset.columns).toHaveLength(8);
// Loader should infer a semantic 'email' type from the column values.
expect(loadedDataset.metadata.dataTypes['email']).toBe('email');
}
finally {
// Remove the temp CSV even when an assertion above throws.
(0, fs_1.unlinkSync)(csvPath);
}
});
it('should handle Unicode datasets', async () => {
const unicodeDataset = test_data_generator_1.TestDataGenerator.generateUnicodeDataset();
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(unicodeDataset);
const csvPath = (0, path_1.join)(tempDir, 'unicode.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
expect(loadedDataset.rows.length).toBeGreaterThan(0);
// Check that Unicode characters are preserved
// (Arabic block U+0600-U+06FF, CJK Unified Ideographs U+4E00-U+9FFF).
const names = loadedDataset.rows.map(row => row.name);
const hasArabic = names.some(name => /[\u0600-\u06FF]/.test(name));
const hasChinese = names.some(name => /[\u4e00-\u9fff]/.test(name));
expect(hasArabic || hasChinese).toBe(true);
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
it('should handle messy data gracefully', async () => {
const messyDataset = test_data_generator_1.TestDataGenerator.generateMessyDataset();
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(messyDataset);
const csvPath = (0, path_1.join)(tempDir, 'messy.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
// Loader must not drop malformed rows: count matches the generator's row count.
expect(loadedDataset.rows.length).toBe(messyDataset.rows);
const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
// Messy datasets should not be perfect; allow generous threshold
expect(qualityResult.score).toBeLessThan(0.95);
expect(qualityResult.issues.length).toBeGreaterThan(0);
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
it('should handle legacy COBOL-style datasets', async () => {
const legacyDataset = test_data_generator_1.TestDataGenerator.generateLegacyDataset();
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(legacyDataset);
const csvPath = (0, path_1.join)(tempDir, 'legacy.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
// Truncated COBOL-style column names must survive loading unchanged.
expect(loadedDataset.columns).toContain('CUSTNO');
expect(loadedDataset.columns).toContain('EMAILADR');
expect(loadedDataset.metadata.dataTypes['EMAILADR']).toBe('email');
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
});
// Throughput, memory and concurrency smoke checks for the dataset loader.
// NOTE(review): wall-clock thresholds (<5 s load, <3 s/task, >2k rows/s, <50 MB
// heap growth) can be flaky on loaded CI machines — confirm margins are safe.
describe('Performance and Memory Tests', () => {
it('should handle large datasets efficiently', async () => {
const ROWS = 10_000;
const largeDataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(ROWS);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(largeDataset);
const csvPath = (0, path_1.join)(tempDir, 'large.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const startTime = Date.now();
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
const loadTime = Date.now() - startTime;
expect(loadedDataset.rows.length).toBe(ROWS);
expect(loadTime).toBeLessThan(5000); // Should load in <5 seconds
// Rows per second; if loadTime is 0 ms this yields Infinity, which still passes.
const throughput = ROWS / (loadTime / 1000);
expect(throughput).toBeGreaterThan(2000); // >2k rows/second
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
it('should maintain reasonable memory usage', async () => {
const initialMemory = process.memoryUsage();
// Process multiple datasets
for (let i = 0; i < 5; i++) {
const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
const csvPath = (0, path_1.join)(tempDir, `memory-test-${i}.csv`);
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
// Force garbage collection if available
// (global.gc exists only under `node --expose-gc`; without it the measured
// heap growth is an upper bound).
if (global.gc) {
global.gc();
}
}
const finalMemory = process.memoryUsage();
// Heap delta in MB across the 5 load/validate cycles.
const memoryGrowth = (finalMemory.heapUsed - initialMemory.heapUsed) / 1024 / 1024;
expect(memoryGrowth).toBeLessThan(50); // <50MB growth
});
it('should support concurrent processing', async () => {
const CONCURRENT_TASKS = 3;
// Each task writes and loads its own CSV, so tasks never contend on a file.
const tasks = Array.from({ length: CONCURRENT_TASKS }, async (_, i) => {
const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(500);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
const csvPath = (0, path_1.join)(tempDir, `concurrent-${i}.csv`);
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const startTime = Date.now();
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
const duration = Date.now() - startTime;
return {
taskId: i,
duration,
rowCount: loadedDataset.rows.length,
qualityScore: qualityResult.score
};
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
const results = await Promise.all(tasks);
expect(results).toHaveLength(CONCURRENT_TASKS);
results.forEach(result => {
expect(result.rowCount).toBe(500);
expect(result.duration).toBeLessThan(3000); // <3 seconds per task
expect(result.qualityScore).toBeGreaterThan(0.8);
});
});
});
// Quality scoring: the validator must flag engineered defects, and summaries
// must describe a dataset's shape and inferred types.
describe('Data Quality and Validation', () => {
it('should detect data quality issues correctly', async () => {
// Create dataset with known quality issues
const problematicDataset = {
name: 'problematic',
description: 'Dataset with quality issues',
rows: 100,
columns: [
{ name: 'id', type: 'string' },
{ name: 'bad col name', type: 'string' }, // Space in name
{ name: 'email', type: 'string' },
{ name: 'amount', type: 'number' }
],
data: Array.from({ length: 100 }, (_, i) => ({
id: i % 10 === 0 ? null : `id_${i}`, // 10% nulls
'bad col name': `value_${i}`,
email: i % 5 === 0 ? 'invalid_email' : `user${i}@example.com`, // 20% invalid emails
amount: Math.random() * 100
}))
};
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(problematicDataset);
const csvPath = (0, path_1.join)(tempDir, 'problematic.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
expect(qualityResult.score).toBeLessThan(0.9); // Should detect issues
expect(qualityResult.issues.length).toBeGreaterThan(0);
expect(qualityResult.recommendations.length).toBeGreaterThan(0);
// Check for specific issues
// (substring match on issue text; deliberately loose so that wording
// changes in the validator do not break this test).
const hasNullIssue = qualityResult.issues.some(issue => issue.includes('null') || issue.includes('missing'));
const hasColumnNameIssue = qualityResult.issues.some(issue => issue.includes('column name'));
expect(hasNullIssue || hasColumnNameIssue).toBe(true);
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
it('should provide useful dataset summaries', async () => {
const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(50);
const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
const csvPath = (0, path_1.join)(tempDir, 'summary.csv');
(0, fs_1.writeFileSync)(csvPath, csvContent);
try {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
// Human-readable summary should mention row/column counts and key columns/types.
const summary = dataset_loader_1.DatasetLoader.getDatasetSummary(loadedDataset);
expect(summary).toContain('50 rows');
expect(summary).toContain('8 columns');
expect(summary).toContain('email');
expect(summary).toContain('string');
}
finally {
(0, fs_1.unlinkSync)(csvPath);
}
});
});
// Checked-in fixture files (when present in the repo) must load and score sanely.
// NOTE(review): every assertion is guarded by existsSync, so these tests pass
// silently when fixtures are missing — consider failing loudly instead.
describe('Benchmark Dataset Validation', () => {
it('should load predefined test datasets', async () => {
// Test that our fixture datasets can be loaded
const fixtureFiles = [
'test/fixtures/edge-cases/unicode-names.csv',
'test/fixtures/edge-cases/messy-data.csv',
'test/fixtures/edge-cases/legacy-cobol.csv'
];
for (const filePath of fixtureFiles) {
// Paths are relative to the repo root (process.cwd() when Jest runs).
const fullPath = (0, path_1.join)(process.cwd(), filePath);
if ((0, fs_1.existsSync)(fullPath)) {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(fullPath);
expect(loadedDataset.rows.length).toBeGreaterThan(0);
expect(loadedDataset.columns.length).toBeGreaterThan(0);
}
}
});
it('should validate fixture data quality', async () => {
const unicodePath = (0, path_1.join)(process.cwd(), 'test/fixtures/edge-cases/unicode-names.csv');
if ((0, fs_1.existsSync)(unicodePath)) {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(unicodePath);
const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
expect(qualityResult.score).toBeGreaterThan(0.7); // Unicode data should be good quality
expect(loadedDataset.metadata.dataTypes['email']).toBe('email');
}
const messyPath = (0, path_1.join)(process.cwd(), 'test/fixtures/edge-cases/messy-data.csv');
if ((0, fs_1.existsSync)(messyPath)) {
const loadedDataset = await dataset_loader_1.DatasetLoader.loadDataset(messyPath);
const qualityResult = await dataset_loader_1.DatasetLoader.validateDatasetQuality(loadedDataset);
expect(qualityResult.score).toBeLessThan(0.95); // Messy data should have lower quality than perfect
expect(qualityResult.issues.length).toBeGreaterThan(0);
}
});
});
// Caching: a second load of the same path must hit the DatasetLoader cache.
describe('Caching and Performance', () => {
    it('should cache loaded datasets efficiently', async () => {
        const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(1000);
        const csvContent = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
        const csvPath = (0, path_1.join)(tempDir, 'cache-test.csv');
        (0, fs_1.writeFileSync)(csvPath, csvContent);
        try {
            // Start from a known-empty cache so both timings are meaningful.
            dataset_loader_1.DatasetLoader.clearCache();
            expect(dataset_loader_1.DatasetLoader.getCacheSize()).toBe(0);
            // First (cold) load populates the cache.
            const startTime1 = Date.now();
            const loadedDataset1 = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
            const loadTime1 = Date.now() - startTime1;
            expect(dataset_loader_1.DatasetLoader.getCacheSize()).toBeGreaterThanOrEqual(1);
            // Second (warm) load should be served from the cache.
            const startTime2 = Date.now();
            const loadedDataset2 = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
            const loadTime2 = Date.now() - startTime2;
            // BUG FIX: the old assertion `loadTime2 < loadTime1 * 0.5` is flaky when the
            // cold load is already near Date.now()'s millisecond resolution (e.g. a 1 ms
            // cached load cannot beat half of a 2 ms cold load). Keep the "significantly
            // faster" intent but allow a small absolute floor.
            expect(loadTime2).toBeLessThanOrEqual(Math.max(loadTime1 * 0.5, 5));
            expect(loadedDataset1.rows.length).toBe(loadedDataset2.rows.length);
        }
        finally {
            (0, fs_1.unlinkSync)(csvPath);
            // Leave no cached entries behind for other test files.
            dataset_loader_1.DatasetLoader.clearCache();
        }
    });
});
describe('Core Integration Smoke', () => {
// Adapt a loaded fixture dataset to the minimal pandas-like DataFrame shape
// consumed by attachment-api / anchors: columns, dtypes, shape, sample, getColumn.
function toDataFrame(dataset) {
    const { columns, rows } = dataset;
    const dtypes = {};
    for (const col of columns) {
        // Loader types default to 'string' when inference produced nothing.
        const loaderType = dataset.metadata.dataTypes[col] || 'string';
        // Numeric loader types map onto the pandas-style 'float64' dtype
        // expected by mapDataType; everything else passes through unchanged.
        dtypes[col] = (loaderType === 'integer' || loaderType === 'float') ? 'float64' : loaderType;
    }
    // Return up to `n` leading rows as a column-name -> values mapping.
    const sample = (n = 1000) => {
        const head = rows.slice(0, Math.min(n, rows.length));
        const out = {};
        for (const col of columns) {
            out[col] = head.map((row) => row[col]);
        }
        return out;
    };
    return {
        columns,
        dtypes,
        shape: [rows.length, columns.length],
        sample,
        // Full (unsampled) column extraction by name.
        getColumn: (colName) => rows.map((row) => row[colName]),
    };
}
it('should attach semantics and reconcile basic columns', async () => {
    const dataset = test_data_generator_1.TestDataGenerator.generateLargeDataset(50);
    const csv = test_data_generator_1.TestDataGenerator.writeDatasetToCSV(dataset);
    const csvPath = (0, path_1.join)(tempDir, 'core-smoke.csv');
    (0, fs_1.writeFileSync)(csvPath, csv);
    try {
        const loaded = await dataset_loader_1.DatasetLoader.loadDataset(csvPath);
        const df = toDataFrame(loaded);
        // Shadow attachment must produce at least one semantic mapping.
        const result = (0, attachment_api_1.attachSemanticsShadow)(df, { dataset_name: 'core_smoke' });
        expect(result.semantic_attachments.length).toBeGreaterThan(0);
        const strategies = result.reconciliation_result.strategy_used;
        expect(typeof strategies).toBe('string');
        // Create anchors and verify fingerprints.
        // FIX: sample once, outside the map — the old code called df.sample(100)
        // inside the callback, re-materializing every column per iteration (O(cols^2)).
        const sampled = df.sample(100);
        const columns = df.columns.map(name => ({
            name,
            values: sampled[name],
            data_type: 'string'
        }));
        const system = new anchors_1.StableColumnAnchorSystem();
        const anchor = system.createAnchor('core_smoke', columns[0]);
        expect(anchor.anchor_id).toMatch(/^sca_/);
        // Compatibility check: a DataFrame must be fully compatible with itself.
        const compat = (0, attachment_api_1.analyzeDataFrameCompatibility)(df, df);
        expect(compat.compatibility_score).toBe(1);
    }
    finally {
        (0, fs_1.unlinkSync)(csvPath);
    }
});
});
});
//# sourceMappingURL=simple-e2e.test.js.map