UNPKG

@astermind/astermind-pro

Version:

Astermind Pro - Premium ML Toolkit with Advanced RAG, Reranking, Summarization, and Information Flow Analysis

90 lines 3.67 kB
// Hybrid retrieval system (sparse + dense + keyword bonus)
// Extracted from workers for reuse
import { requireLicense } from '../core/license.js';
import { tokenize, expandQuery } from '../utils/tokenization.js';
import { toTfidf, cosineSparse, kernelSim, projectToDense } from './vectorization.js';

/**
 * Compute a per-chunk keyword-overlap bonus for a query.
 *
 * Keywords are the unique lowercase query tokens longer than 2 chars.
 * Each keyword found in a chunk's text (`rich` preferred, else `content`)
 * adds one hit; if the query looks syntax/definition-oriented and the chunk
 * contains a fenced code block, 5 extra hits are added. The bonus is
 * `min(1.0, hits * 0.03)`.
 *
 * @param {Array<{rich?: string, content?: string}>} chunks - Candidate chunks.
 * @param {string} query - Raw user query.
 * @returns {number[]} One bonus score in [0, 1] per chunk, same order as `chunks`.
 */
export function keywordBonus(chunks, query) {
  const kws = Array.from(new Set(query.toLowerCase().split(/\W+/).filter(t => t.length > 2)));
  // Queries that mention syntax/definition vocabulary favor chunks with code fences.
  const syntaxBoost = /\b(define|declare|syntax|example|function|struct|map|interface)\b/i.test(query);
  return chunks.map(c => {
    const text = c.rich || c.content || '';
    const lc = text.toLowerCase();
    let hit = 0;
    for (const k of kws) if (lc.includes(k)) hit++;
    if (syntaxBoost && /```/.test(text)) hit += 5; // strong bonus for code presence
    return Math.min(1.0, hit * 0.03);
  });
}

/**
 * Get the indices of the top-K largest values in `arr`, descending by value.
 *
 * @param {number[]} arr - Scores.
 * @param {number} k - How many indices to return.
 * @returns {number[]} Up to `k` indices into `arr`, highest score first.
 */
export function topKIndices(arr, k) {
  const idx = Array.from(arr, (_, i) => i);
  idx.sort((i, j) => (arr[j] - arr[i]));
  return idx.slice(0, k);
}

/**
 * Clamp `x` into the inclusive range [a, b].
 *
 * @param {number} x
 * @param {number} a - Lower bound.
 * @param {number} b - Upper bound.
 * @returns {number}
 */
function clamp(x, a, b) {
  return Math.max(a, Math.min(b, x));
}

/**
 * Perform hybrid retrieval combining sparse (TF-IDF cosine), dense (kernel)
 * and keyword-bonus scores, with ridge damping and a tanh soft clip.
 *
 * Premium feature: throws via `requireLicense()` without a valid license.
 *
 * @param {Object} opts
 * @param {string} opts.query - User query.
 * @param {Array<Object>} opts.chunks - Candidate chunks (need `rich`/`content`/`heading`).
 * @param {*} opts.vocabMap - Vocabulary map for TF-IDF vectorization.
 * @param {*} opts.idf - IDF weights.
 * @param {Array<*>} opts.tfidfDocs - Precomputed sparse doc vectors, parallel to `chunks`.
 * @param {Array<*>} opts.denseDocs - Precomputed dense doc vectors, parallel to `chunks`.
 * @param {*} opts.landmarksIdx - Landmark indices (unused here; kept for interface parity).
 * @param {*} opts.landmarkMat - Landmark matrix for dense projection.
 * @param {number} opts.vocabSize - Vocabulary size for dense projection.
 * @param {*} opts.kernel - Kernel identifier for `kernelSim`/`projectToDense`.
 * @param {number} opts.sigma - Kernel bandwidth.
 * @param {number} opts.alpha - Dense/sparse mixing weight, clamped to [0, 1].
 * @param {number} opts.beta - Keyword-bonus weight.
 * @param {number} [opts.ridge] - Ridge damping lambda; defaults to 0.08 when nullish.
 * @param {number} opts.headingW - Heading weight passed to `toTfidf`.
 * @param {boolean} opts.useStem - Whether to stem query tokens.
 * @param {boolean} opts.expandQuery - Whether to expand the query before tokenizing.
 * @param {number} opts.topK - Number of results to return.
 * @param {number} [opts.prefilter] - Pre-filter pool size (>= topK is effective).
 * @returns {{items: Array<{score: number, heading: *, content: string, index: number}>,
 *           scores: number[], indices: number[],
 *           tfidfScores: number[], denseScores: number[]}}
 */
export function hybridRetrieve(opts) {
  requireLicense(); // Premium feature - requires valid license
  const {
    query,
    chunks,
    vocabMap,
    idf,
    tfidfDocs,
    denseDocs,
    landmarksIdx,
    landmarkMat,
    vocabSize,
    kernel,
    sigma,
    alpha,
    beta,
    ridge,
    headingW,
    useStem,
    expandQuery: shouldExpand,
    topK: k,
    prefilter,
  } = opts;

  // Expand query if needed
  const qexp = shouldExpand ? expandQuery(query) : query;
  const toks = tokenize(qexp, useStem);
  const qvec = toTfidf(toks, idf, vocabMap, headingW);
  const qdense = projectToDense(qvec, vocabSize, landmarkMat, kernel, sigma);

  // Compute sparse (TF-IDF) scores
  const tfidfScores = tfidfDocs.map(v => cosineSparse(v, qvec));

  // Compute dense (kernel) scores
  const denseScores = denseDocs.map((v) => kernelSim(v, qdense, kernel, sigma));

  // Compute keyword bonus
  const bonus = keywordBonus(chunks, query);

  // Hybrid scoring with ridge regularization
  const alphaClamped = clamp(alpha, 0, 1);
  const lambda = ridge ?? 0.08;
  const scores = denseScores.map((d, i) => {
    const t = tfidfScores[i];
    const b = beta * bonus[i];
    // Ridge damping on ALL components (dense, tfidf, and keyword bonus)
    const reg = 1 / (1 + lambda * (d * d + t * t + 0.5 * b * b));
    const s = reg * (alphaClamped * d + (1 - alphaClamped) * t + b);
    // soft clip extremes; helps prevent a single noisy dimension from dominating
    return Math.tanh(s);
  });

  // Pre-filter then final topK (retrieval stage)
  const pre = Math.max(k, prefilter ?? 0);
  const idxs = topKIndices(scores, pre);
  // Note: callback param renamed from `k` to `j` to avoid shadowing the
  // destructured `topK: k` alias above.
  const finalIdxs = topKIndices(idxs.map(i => scores[i]), k).map(j => idxs[j]);

  // Build result items
  const items = finalIdxs.map(i => {
    const c = chunks[i];
    const body = (c.rich && c.rich.trim()) || (c.content && c.content.trim()) || '(see subsections)';
    return {
      score: scores[i],
      heading: c.heading,
      content: body,
      index: i,
    };
  });

  return {
    items,
    scores: finalIdxs.map(i => scores[i]),
    indices: finalIdxs,
    tfidfScores: finalIdxs.map(i => tfidfScores[i]),
    denseScores: finalIdxs.map(i => denseScores[i]),
  };
}
//# sourceMappingURL=hybrid-retriever.js.map