semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
450 lines • 22.6 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const globals_1 = require("@jest/globals");
const semantic_join_1 = require("./semantic-join");
const cid_registry_1 = require("../registry/cid-registry");
const shadow_semantics_1 = require("../core/shadow-semantics");
const statistical_analyzer_1 = require("../inference/statistical-analyzer");
(0, globals_1.describe)('SemanticJoinOperator', () => {
let joinOperator;
let cidRegistry;
let semanticsLayer;
let statisticalAnalyzer;
(0, globals_1.beforeEach)(() => {
    // Rebuild the operator and its three collaborators before each test.
    cidRegistry = new cid_registry_1.CIDRegistry();
    semanticsLayer = new shadow_semantics_1.ShadowSemanticsLayer();
    statisticalAnalyzer = new statistical_analyzer_1.StatisticalAnalyzer();
    joinOperator = new semantic_join_1.SemanticJoinOperator(cidRegistry, semanticsLayer, statisticalAnalyzer);
    // Small factory so each concept below reads as a single row; it produces the
    // same five keys, in the same order, for every concept.
    const makeConcept = (cid, labels, description, facets, examples) => ({
        cid,
        labels,
        description,
        facets,
        examples
    });
    // Concepts registered for the tests: three PII person fields and one identifier.
    const testConcepts = [
        makeConcept('person.email', ['email', 'email_address', 'user_email'], 'Email address', { pii: true }, ['user@example.com']),
        makeConcept('person.phone', ['phone', 'phone_number', 'mobile'], 'Phone number', { pii: true }, ['+1-555-123-4567']),
        makeConcept('person.name', ['name', 'full_name', 'customer_name'], 'Person name', { pii: true }, ['John Doe']),
        makeConcept('identifier.customer_id', ['customer_id', 'cust_id', 'user_id'], 'Customer identifier', { identifier: true }, ['CUST-12345'])
    ];
    cidRegistry.registerPack({
        pack: 'test-pack',
        version: '1.0.0',
        description: 'Test concepts',
        concepts: testConcepts
    });
});
(0, globals_1.describe)('Basic Join Functionality', () => {
    const { it, expect } = globals_1;
    // Every case in this group keys `id` against `user_id` with the same
    // confidence threshold; only the join type differs.
    const idJoin = (how) => ({
        leftOn: 'id',
        rightOn: 'user_id',
        how,
        confidenceThreshold: 0.5
    });
    it('should perform exact match join on simple data', async () => {
        const accounts = {
            id: [1, 2, 3, 4],
            email: ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'david@example.com']
        };
        const profiles = {
            user_id: [2, 3, 4, 5],
            name: ['Bob Smith', 'Charlie Brown', 'David Wilson', 'Eve Davis']
        };
        const joined = await joinOperator.semanticJoin(accounts, profiles, idJoin('inner'));
        expect(joined.data).toBeDefined();
        // IDs 2, 3 and 4 appear on both sides.
        const { statistics } = joined;
        expect(statistics.outputRows).toBe(3);
        expect(statistics.matchedRows).toBe(3);
        expect(joined.matches).toHaveLength(3);
        expect(joined.performance.totalTime).toBeGreaterThan(0);
    });
    it('should handle left join with unmatched rows', async () => {
        const people = {
            id: [1, 2, 3],
            name: ['Alice', 'Bob', 'Charlie']
        };
        const scores = {
            user_id: [2, 4],
            score: [85, 92]
        };
        const { statistics } = await joinOperator.semanticJoin(people, scores, idJoin('left'));
        // Every left row is kept, but only ID 2 has a partner on the right.
        expect(statistics.outputRows).toBe(3);
        expect(statistics.matchedRows).toBe(1);
    });
    it('should handle outer join with all unmatched rows', async () => {
        const people = {
            id: [1, 2],
            name: ['Alice', 'Bob']
        };
        const scores = {
            user_id: [3, 4],
            score: [85, 92]
        };
        const { statistics } = await joinOperator.semanticJoin(people, scores, idJoin('outer'));
        // Disjoint keys: 2 left rows + 2 right rows, nothing matched.
        expect(statistics.outputRows).toBe(4);
        expect(statistics.matchedRows).toBe(0);
    });
});
(0, globals_1.describe)('Semantic Type Matching', () => {
    const { it, expect } = globals_1;
    // True when at least one reported match carries the given matchType.
    const hasMatchOfType = (matches, wanted) => matches.some(({ matchType }) => matchType === wanted);
    it('should perform semantic join on email addresses with normalization', async () => {
        const orders = {
            customer_email: ['ALICE@EXAMPLE.COM', 'Bob@Example.Com', 'charlie@example.com'],
            order_id: ['ORD-001', 'ORD-002', 'ORD-003']
        };
        const users = {
            user_email: ['alice@example.com', 'bob@example.com', 'eve@example.com'],
            user_name: ['Alice Smith', 'Bob Jones', 'Eve Davis']
        };
        const joinOptions = {
            leftOn: 'customer_email',
            rightOn: 'user_email',
            how: 'inner',
            confidenceThreshold: 0.5,
            autoSelectNormalizers: true
        };
        const joined = await joinOperator.semanticJoin(orders, users, joinOptions);
        // Alice and Bob differ from the right side only by letter case.
        expect(joined.statistics.matchedRows).toBe(2);
        expect(hasMatchOfType(joined.matches, 'normalized')).toBe(true);
        expect(joined.statistics.confidence.average).toBeGreaterThan(0.5);
    });
    it('should handle phone number normalization', async () => {
        const contacts = {
            phone: ['+1-555-123-4567', '555.234.5678', '(555) 345-6789'],
            customer_id: ['C001', 'C002', 'C003']
        };
        const subscribers = {
            mobile: ['15551234567', '15552345678', '15556789012'],
            region: ['West', 'East', 'South']
        };
        const joinOptions = {
            leftOn: 'phone',
            rightOn: 'mobile',
            how: 'inner',
            confidenceThreshold: 0.5,
            autoSelectNormalizers: true
        };
        const joined = await joinOperator.semanticJoin(contacts, subscribers, joinOptions);
        // The first two numbers are expected to line up once formatting is normalized.
        expect(joined.statistics.matchedRows).toBe(2);
        expect(hasMatchOfType(joined.matches, 'normalized')).toBe(true);
    });
    it('should handle name matching with fuzzy logic', async () => {
        const accounts = {
            customer_name: ['John Doe', 'Jane Smith', 'Robert Johnson'],
            account_id: ['A001', 'A002', 'A003']
        };
        const creditRecords = {
            full_name: ['John D.', 'Jane Smyth', 'Bob Johnson'],
            credit_score: [750, 680, 720]
        };
        const joinOptions = {
            leftOn: 'customer_name',
            rightOn: 'full_name',
            how: 'inner',
            confidenceThreshold: 0.6,
            enableFuzzyMatching: true,
            fuzzyThreshold: 0.7,
            autoSelectNormalizers: true
        };
        const joined = await joinOperator.semanticJoin(accounts, creditRecords, joinOptions);
        // No exact count is pinned here: only that something matched and that at
        // least one match was reported as fuzzy.
        expect(joined.statistics.matchedRows).toBeGreaterThan(0);
        expect(hasMatchOfType(joined.matches, 'fuzzy')).toBe(true);
    });
});
(0, globals_1.describe)('Multi-Column Joins', () => {
    const { it, expect } = globals_1;
    it('should perform multi-column semantic join', async () => {
        const people = {
            first_name: ['John', 'Jane', 'Bob'],
            last_name: ['Doe', 'Smith', 'Johnson'],
            birth_year: [1985, 1990, 1975]
        };
        const records = {
            fname: ['John', 'Jane', 'Robert'],
            lname: ['Doe', 'Smith', 'Johnson'],
            year_born: [1985, 1990, 1975]
        };
        const joinOptions = {
            leftOn: ['first_name', 'last_name'],
            rightOn: ['fname', 'lname'],
            how: 'inner',
            confidenceThreshold: 0.7,
            autoSelectNormalizers: true
        };
        const { statistics } = await joinOperator.semanticJoin(people, records, joinOptions);
        // John Doe and Jane Smith agree on both columns; Bob/Robert Johnson is
        // expected not to match.
        expect(statistics.matchedRows).toBe(2);
        expect(statistics.confidence.average).toBeGreaterThan(0.7);
    });
    it('should handle mixed data types in multi-column join', async () => {
        const orders = {
            customer_id: ['C001', 'C002', 'C003'],
            order_date: ['2023-01-15', '2023-02-20', '2023-03-10'],
            amount: [100.50, 250.00, 75.25]
        };
        const purchases = {
            cust_id: ['C001', 'C002', 'C004'],
            purchase_date: ['2023-01-15', '2023-02-20', '2023-04-05'],
            total: [100.5, 250, 125.75]
        };
        const joinOptions = {
            leftOn: ['customer_id', 'order_date'],
            rightOn: ['cust_id', 'purchase_date'],
            how: 'inner',
            confidenceThreshold: 0.8
        };
        const { statistics } = await joinOperator.semanticJoin(orders, purchases, joinOptions);
        // Only the C001 and C002 rows share both id and date.
        expect(statistics.matchedRows).toBe(2);
    });
});
(0, globals_1.describe)('Performance and Optimization', () => {
    (0, globals_1.it)('should handle large datasets efficiently', async () => {
        // Generate larger test datasets
        const leftSize = 10000;
        const rightSize = 8000;
        const overlapSize = 5000;
        // Left ids are the unique integers 1..leftSize.
        const leftData = {
            id: Array.from({ length: leftSize }, (_, i) => i + 1),
            email: Array.from({ length: leftSize }, (_, i) => `user${i + 1}@example.com`)
        };
        // Exactly `overlapSize` right ids mirror left ids. The remaining ids start
        // above `leftSize`, so they cannot collide with any left id and the join
        // has exactly `overlapSize` matching keys. (Filler ids inside 1..leftSize
        // would make every right row match and break the assertion below.)
        const rightData = {
            user_id: Array.from({ length: rightSize }, (_, i) => (i < overlapSize ? leftData.id[i] : leftSize + 1 + i)),
            name: Array.from({ length: rightSize }, (_, i) => `User ${i + 1000}`)
        };
        const startTime = performance.now();
        const result = await joinOperator.semanticJoin(leftData, rightData, {
            leftOn: 'id',
            rightOn: 'user_id',
            how: 'inner',
            confidenceThreshold: 0.5,
            batchSize: 5000
        });
        const endTime = performance.now();
        (0, globals_1.expect)(result.statistics.matchedRows).toBe(overlapSize);
        (0, globals_1.expect)(endTime - startTime).toBeLessThan(5000); // Should complete within 5 seconds
        (0, globals_1.expect)(result.performance.totalTime).toBeGreaterThan(0);
        (0, globals_1.expect)(result.performance.cacheHits).toBeGreaterThanOrEqual(0);
    });
    (0, globals_1.it)('should cache normalized values effectively', async () => {
        // 'user1@EXAMPLE.COM' appears twice on the left so that a repeated value
        // is available to be served from the cache.
        const leftData = {
            email: ['user1@EXAMPLE.COM', 'USER2@example.com', 'User3@Example.Com', 'user1@EXAMPLE.COM'],
            id: [1, 2, 3, 1]
        };
        const rightData = {
            user_email: ['user1@example.com', 'user2@example.com', 'user4@example.com'],
            score: [85, 92, 78]
        };
        // Only the cache counters are inspected, so the join result is not kept.
        await joinOperator.semanticJoin(leftData, rightData, {
            leftOn: 'email',
            rightOn: 'user_email',
            how: 'inner',
            cacheNormalizedValues: true
        });
        const cacheStats = joinOperator.getCacheStats();
        (0, globals_1.expect)(cacheStats.hits).toBeGreaterThan(0); // Should have cache hits for repeated values
        (0, globals_1.expect)(cacheStats.hitRate).toBeGreaterThan(0);
    });
});
(0, globals_1.describe)('Confidence Scoring', () => {
    const { it, expect } = globals_1;
    it('should provide detailed confidence metrics', async () => {
        const customers = {
            customer_email: ['alice@example.com', 'bob@company.com', 'charlie@test.org'],
            customer_id: ['C001', 'C002', 'C003']
        };
        const users = {
            user_email: ['alice@example.com', 'robert@company.com', 'charles@test.org'],
            user_score: [95, 87, 76]
        };
        const joinOptions = {
            leftOn: 'customer_email',
            rightOn: 'user_email',
            how: 'inner',
            confidenceThreshold: 0.5,
            enableFuzzyMatching: true
        };
        const { statistics } = await joinOperator.semanticJoin(customers, users, joinOptions);
        expect(statistics.confidence).toBeDefined();
        expect(statistics.confidence.average).toBeGreaterThan(0);
        expect(statistics.confidence.median).toBeGreaterThan(0);
        expect(statistics.confidence.distribution).toBeDefined();
        // The distribution buckets, summed, must account for every matched row.
        let bucketTotal = 0;
        for (const bucketCount of Object.values(statistics.confidence.distribution)) {
            bucketTotal = bucketTotal + bucketCount;
        }
        expect(bucketTotal).toBe(statistics.matchedRows);
    });
    it('should handle low confidence matches appropriately', async () => {
        const people = {
            name: ['John Smith', 'Jane Doe', 'Bob Johnson'],
            id: [1, 2, 3]
        };
        const candidates = {
            full_name: ['Johnny Smithers', 'Janet Doe-Wilson', 'Robert Johns'],
            score: [85, 92, 78]
        };
        // Same fuzzy join, parameterized only by the confidence threshold.
        const joinAtThreshold = (confidenceThreshold) => joinOperator.semanticJoin(people, candidates, {
            leftOn: 'name',
            rightOn: 'full_name',
            how: 'inner',
            confidenceThreshold,
            enableFuzzyMatching: true
        });
        // Run sequentially: the very high threshold first, then the low one.
        const strict = await joinAtThreshold(0.9);
        const lenient = await joinAtThreshold(0.3);
        // Lowering the threshold must never reduce the number of matches.
        expect(lenient.statistics.matchedRows).toBeGreaterThanOrEqual(strict.statistics.matchedRows);
    });
});
(0, globals_1.describe)('Error Handling', () => {
    const { it, expect } = globals_1;
    it('should throw error for missing join columns', async () => {
        const left = { id: [1, 2, 3] };
        const right = { user_id: [1, 2, 3] };
        // `missing_column` does not exist on the left input.
        const attempt = joinOperator.semanticJoin(left, right, {
            leftOn: 'missing_column',
            rightOn: 'user_id'
        });
        await expect(attempt).rejects.toThrow('Column \'missing_column\' not found');
    });
    it('should throw error for mismatched join column counts', async () => {
        const left = { id: [1, 2, 3], name: ['a', 'b', 'c'] };
        const right = { user_id: [1, 2, 3] };
        // Two key columns on the left, only one on the right.
        const attempt = joinOperator.semanticJoin(left, right, {
            leftOn: ['id', 'name'],
            rightOn: ['user_id']
        });
        await expect(attempt).rejects.toThrow('Number of left and right join columns must match');
    });
    it('should handle empty datasets gracefully', async () => {
        const emptyLeft = { id: [], name: [] };
        const right = { user_id: [1, 2, 3], score: [85, 92, 78] };
        const { statistics } = await joinOperator.semanticJoin(emptyLeft, right, {
            leftOn: 'id',
            rightOn: 'user_id',
            how: 'inner'
        });
        // An inner join against an empty left side yields no rows and no matches.
        expect(statistics.outputRows).toBe(0);
        expect(statistics.matchedRows).toBe(0);
    });
});
(0, globals_1.describe)('Real-world Scenarios', () => {
    const { it, expect } = globals_1;
    it('should handle customer data integration scenario', async () => {
        // Scenario: CRM customer records joined against transaction records.
        const customers = {
            customer_id: ['CUST-001', 'CUST-002', 'CUST-003', 'CUST-004'],
            email: ['alice@company.com', 'bob@startup.io', 'charlie@corp.net', 'diana@firm.org'],
            full_name: ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince'],
            registration_date: ['2023-01-15', '2023-02-20', '2023-03-10', '2023-04-05']
        };
        const transactions = {
            cust_id: ['CUST-001', 'CUST-002', 'CUST-005', 'CUST-001'],
            purchase_amount: [150.00, 299.99, 89.50, 75.25],
            purchase_date: ['2023-01-20', '2023-02-25', '2023-04-10', '2023-01-25'],
            product_category: ['Electronics', 'Books', 'Clothing', 'Electronics']
        };
        // A left join keeps every customer, including those without transactions.
        const joinOptions = {
            leftOn: 'customer_id',
            rightOn: 'cust_id',
            how: 'left',
            confidenceThreshold: 0.8
        };
        const { statistics } = await joinOperator.semanticJoin(customers, transactions, joinOptions);
        expect(statistics.inputRowsLeft).toBe(4);
        expect(statistics.inputRowsRight).toBe(4);
        // NOTE(review): CUST-001 appears twice on the right. A row-per-match left
        // join would emit 5 rows, so outputRows 4 alongside matchedRows 3
        // presumably relies on how semanticJoin counts these - confirm.
        expect(statistics.outputRows).toBe(4);
        expect(statistics.matchedRows).toBe(3);
    });
    it('should handle product catalog integration', async () => {
        // Scenario: product catalog joined against warehouse inventory.
        const catalog = {
            sku: ['SKU-001', 'SKU-002', 'SKU-003'],
            product_name: ['Wireless Headphones', 'Smart Watch', 'Bluetooth Speaker'],
            category: ['Electronics', 'Wearables', 'Audio'],
            price: [99.99, 299.99, 79.99]
        };
        const inventory = {
            product_code: ['SKU-001', 'SKU-002', 'SKU-004'],
            warehouse_location: ['WH-A', 'WH-B', 'WH-C'],
            stock_quantity: [150, 75, 200],
            last_restocked: ['2023-03-01', '2023-03-15', '2023-03-20']
        };
        // Outer join with a high threshold: show every product and every
        // inventory item, pairing only exact product codes.
        const joinOptions = {
            leftOn: 'sku',
            rightOn: 'product_code',
            how: 'outer',
            confidenceThreshold: 0.9
        };
        const { statistics } = await joinOperator.semanticJoin(catalog, inventory, joinOptions);
        // SKU-001 and SKU-002 pair up; SKU-003 and SKU-004 stay unmatched,
        // giving 3 catalog rows + 1 inventory-only row.
        expect(statistics.matchedRows).toBe(2);
        expect(statistics.outputRows).toBe(4);
    });
    it('should handle employee data deduplication scenario', async () => {
        // Scenario: the same employees recorded in an HR system and a payroll system.
        const hrRecords = {
            emp_id: ['EMP001', 'EMP002', 'EMP003'],
            employee_name: ['John A. Doe', 'Jane M. Smith', 'Robert Johnson'],
            email: ['john.doe@company.com', 'j.smith@company.com', 'bob.johnson@company.com'],
            department: ['Engineering', 'Marketing', 'Sales']
        };
        const payrollRecords = {
            employee_id: ['PAY001', 'PAY002', 'PAY003'],
            full_name: ['John Doe', 'Jane Smith', 'R. Johnson'],
            work_email: ['john.doe@company.com', 'jane.smith@company.com', 'robert.j@company.com'],
            salary: [95000, 75000, 68000]
        };
        // Match on name and email together for better accuracy.
        const joinOptions = {
            leftOn: ['employee_name', 'email'],
            rightOn: ['full_name', 'work_email'],
            how: 'outer',
            confidenceThreshold: 0.6,
            enableFuzzyMatching: true,
            fuzzyThreshold: 0.7
        };
        const { statistics, matches } = await joinOperator.semanticJoin(hrRecords, payrollRecords, joinOptions);
        expect(statistics.matchedRows).toBeGreaterThan(0);
        // John Doe shares the same email in both systems, so a confident
        // 'exact' match is expected to be present.
        const confidentExact = matches.find(({ confidence, matchType }) => confidence > 0.8 && matchType === 'exact');
        expect(confidentExact).toBeDefined();
    });
});
(0, globals_1.describe)('DataFrame Integration', () => {
    const { it, expect } = globals_1;
    it('should work with array of objects format', async () => {
        // Inputs are arrays of row objects rather than objects of column arrays.
        const people = [
            [1, 'Alice', 'alice@example.com'],
            [2, 'Bob', 'bob@example.com'],
            [3, 'Charlie', 'charlie@example.com']
        ].map(([id, name, email]) => ({ id, name, email }));
        const scores = [
            [1, 85, 'West'],
            [2, 92, 'East'],
            [4, 78, 'South']
        ].map(([user_id, score, region]) => ({ user_id, score, region }));
        const { statistics } = await joinOperator.semanticJoin(people, scores, {
            leftOn: 'id',
            rightOn: 'user_id',
            how: 'inner'
        });
        // IDs 1 and 2 exist on both sides.
        expect(statistics.matchedRows).toBe(2);
    });
});
});
//# sourceMappingURL=semantic-join.test.js.map