UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

53 lines 3.23 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const join_planner_1 = require("../../src/operators/join-planner"); function df(rows, cols = ['k']) { const rowsArr = Array.from({ length: rows }, (_, i) => Object.fromEntries(cols.map(c => [c, `${c}_${i % 10}`]))); const columns = cols; return { columns, dtypes: Object.fromEntries(columns.map(c => [c, 'string'])), shape: [rowsArr.length, columns.length], sample: (n = 100) => { const out = {}; for (const c of columns) out[c] = rowsArr.slice(0, Math.min(n, rowsArr.length)).map(r => r[c]); return out; }, getColumn: (name) => rowsArr.map(r => r[name]) }; } const ctx = (t, c = 0.9) => ({ anchor_id: '', semantic_type: t, confidence: c, metadata: {}, inferred_relations: [], domain_specific_tags: [] }); describe('Operators: Join planner branches', () => { it('selects nested_loop for very small datasets', () => { const p = new join_planner_1.SemanticJoinPlanner(); const plan = p.planOptimalJoin(df(10), df(20), { k: ctx('identifier') }, { k: ctx('identifier') }, { leftOn: 'k', rightOn: 'k' }); expect(plan.strategy).toBe('nested_loop'); expect(plan.indexingStrategy).toBe('none'); }); it('selects broadcast_join when one side is small', () => { const p = new join_planner_1.SemanticJoinPlanner(); const plan = p.planOptimalJoin(df(5000), df(200000), { k: ctx('identifier') }, { k: ctx('identifier') }, { leftOn: 'k', rightOn: 'k' }); expect(plan.strategy).toBe('broadcast_join'); expect(['build_left', 'build_right']).toContain(plan.indexingStrategy); }); it('selects sort_merge for high selectivity massive expected matches', () => { const p = new join_planner_1.SemanticJoinPlanner(); // Create many rows to satisfy expectedMatches > 100000 const left = df(50000); const right = df(50000); const plan = p.planOptimalJoin(left, right, { k: ctx('identifier', 0.99) }, { k: ctx('identifier', 0.99) }, { leftOn: 'k', rightOn: 'k', enableFuzzyMatching: true }); expect(['sort_merge', 'hash_join']).toContain(plan.strategy); }); it('precompute normalization toggles based on cost and option', () => { const p = new join_planner_1.SemanticJoinPlanner(); const left = df(2000, ['email']); const right = df(2000, ['user_email']); const plan1 = p.planOptimalJoin(left, right, { email: ctx('email_address') }, { user_email: ctx('email_address') }, { leftOn: 'email', rightOn: 'user_email', cacheNormalizedValues: true }); expect(plan1.normalizationPlan.precomputeNormalization).toBe(true); const plan2 = p.planOptimalJoin(left, right, { email: ctx('email_address') }, { user_email: ctx('email_address') }, { leftOn: 'email', rightOn: 'user_email', cacheNormalizedValues: false }); // costPerRow may still be > 3 depending on estimator; allow either true or false but assert property exists expect(typeof plan2.normalizationPlan.precomputeNormalization).toBe('boolean'); }); }); //# sourceMappingURL=join-planner-branches.test.js.map