@astermind/astermind-pro
Version:
Astermind Pro - Premium ML Toolkit with Advanced RAG, Reranking, Summarization, and Information Flow Analysis
111 lines • 3.91 kB
JavaScript
// Index building utilities
// Extracted from workers for reuse
import { requireLicense } from '../core/license.js';
import { tokenize } from '../utils/tokenization.js';
import { toTfidf, sparseToDense } from './vectorization.js';
/**
 * Derive a frequency-ranked vocabulary and matching IDF weights from chunks.
 *
 * Each chunk is tokenized from its heading and content joined by ' \n'.
 * Tokens are ranked by document frequency (number of chunks containing the
 * token at least once) and the top `vocabSize` are kept.
 *
 * @param {Array<{heading: *, content: *}>} chunks - Source chunks.
 * @param {number} vocabSize - Maximum number of tokens to keep.
 * @param {*} useStem - Forwarded unchanged to `tokenize`.
 * @returns {{vocabMap: Map<*, number>, idf: number[]}} Token -> column index
 *   map, and the smoothed IDF value ln((N + 1) / (df + 1)) + 1 per column.
 */
export function buildVocabAndIdf(chunks, vocabSize, useStem) {
    const tokenLists = chunks.map((chunk) => {
        const text = chunk.heading + ' \n' + chunk.content;
        return tokenize(text, useStem);
    });
    // Count, for every token, how many chunks mention it at least once.
    const docFreq = new Map();
    tokenLists.forEach((tokens) => {
        new Set(tokens).forEach((term) => {
            const seen = docFreq.get(term) || 0;
            docFreq.set(term, seen + 1);
        });
    });
    // Most widespread tokens first, truncated to the requested vocabulary size.
    const ranked = Array.from(docFreq);
    ranked.sort((left, right) => right[1] - left[1]);
    const kept = ranked.slice(0, vocabSize);
    const vocabMap = new Map();
    kept.forEach((entry, column) => {
        vocabMap.set(entry[0], column);
    });
    const totalDocs = tokenLists.length;
    const idf = kept.map((entry) => {
        const freq = entry[1] || 1;
        return Math.log((totalDocs + 1) / (freq + 1)) + 1;
    });
    return { vocabMap, idf };
}
/**
 * Turn every chunk into a TF-IDF vector.
 *
 * The chunk text is tokenized exactly as in vocabulary building (heading and
 * content joined by ' \n') and handed to `toTfidf`.
 *
 * @param {Array<{heading: *, content: *}>} chunks - Source chunks.
 * @param {Map<*, number>} vocabMap - Token -> column index map.
 * @param {number[]} idf - IDF weight per vocabulary column.
 * @param {*} headingW - Forwarded unchanged to `toTfidf`.
 * @param {*} useStem - Forwarded unchanged to `tokenize`.
 * @returns {Array<*>} One `toTfidf` result per chunk, in chunk order.
 */
export function buildTfidfDocs(chunks, vocabMap, idf, headingW, useStem) {
    const vectorize = (chunk) => {
        const text = chunk.heading + ' \n' + chunk.content;
        const tokens = tokenize(text, useStem);
        return toTfidf(tokens, idf, vocabMap, headingW);
    };
    return chunks.map((chunk) => vectorize(chunk));
}
/**
 * Build Nyström landmarks from TF-IDF documents.
 *
 * Landmarks are picked at a fixed stride through `tfidfDocs` and densified
 * with `sparseToDense`. At least 32 landmarks are always requested; when
 * there are fewer documents than landmarks the indices are clamped, so the
 * last document is repeated to fill the remaining slots.
 *
 * @param {Array<*>} tfidfDocs - TF-IDF vectors, one per document.
 * @param {number} vocabSize - Dimension passed to `sparseToDense`.
 * @param {number} numLandmarks - Requested landmark count (floored at 32).
 * @returns {{landmarksIdx: number[], landmarkMat: Array<*>}} Indices of the
 *   chosen documents and their dense vectors. Both arrays are empty when
 *   `tfidfDocs` is empty.
 */
export function buildLandmarks(tfidfDocs, vocabSize, numLandmarks) {
    const docCount = tfidfDocs.length;
    // With no documents the clamp below would yield index -1 and hand
    // `undefined` to sparseToDense; there is nothing to sample, so stop here.
    if (docCount === 0) {
        return { landmarksIdx: [], landmarkMat: [] };
    }
    const L = Math.max(32, numLandmarks);
    const step = Math.max(1, Math.floor(docCount / L));
    const landmarksIdx = Array.from({ length: L }, (_, k) => Math.min(docCount - 1, k * step));
    const landmarkMat = landmarksIdx.map((i) => sparseToDense(tfidfDocs[i], vocabSize));
    return { landmarksIdx, landmarkMat };
}
/**
 * Project every TF-IDF document onto the landmark set.
 *
 * For each document the kernel value against every landmark is computed with
 * `baseKernel`, and the resulting feature vector is scaled to unit L2 norm
 * (left untouched when its norm is not positive).
 *
 * @param {Array<*>} tfidfDocs - TF-IDF vectors, one per document.
 * @param {number} vocabSize - Dimension passed to `sparseToDense`.
 * @param {Array<*>} landmarkMat - Dense landmark vectors.
 * @param {string} kernel - Kernel name forwarded to `baseKernel`.
 * @param {number} sigma - Kernel width forwarded to `baseKernel`.
 * @returns {Float64Array[]} One feature vector of length
 *   `landmarkMat.length` per document.
 */
export function buildDenseDocs(tfidfDocs, vocabSize, landmarkMat, kernel, sigma) {
    const project = (sparseVec) => {
        const dense = sparseToDense(sparseVec, vocabSize);
        const feats = Float64Array.from(landmarkMat, (landmark) => baseKernel(dense, landmark, kernel, sigma));
        const norm = Math.hypot(...feats);
        if (!(norm > 0)) {
            return feats;
        }
        return feats.map((value) => value / norm);
    };
    return tfidfDocs.map((doc) => project(doc));
}
/**
 * Evaluate the selected kernel between two dense vectors.
 *
 * - 'cosine': dot(a, b) / (|a| * |b|), or 0 when either norm is zero.
 * - 'poly2':  (dot(a, b) + 1) ** 2.
 * - any other value: Gaussian exp(-|a - b|^2 / (2 * sigma^2)), with the
 *   denominator floored at 1e-9.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector; indexed over a's length.
 * @param {string} k - Kernel name.
 * @param {number} sigma - Width used only by the Gaussian fallback.
 * @returns {number} Kernel value.
 */
function baseKernel(a, b, k, sigma) {
    if (k === 'cosine') {
        const dot = dotProd(a, b);
        // Norms are accumulated in a loop rather than via Math.hypot(...v):
        // spreading a vocabulary-sized vector pushes every element onto the
        // call stack and throws RangeError once the engine limit is exceeded.
        const na = l2Norm(a);
        const nb = l2Norm(b);
        return (na && nb) ? (dot / (na * nb)) : 0;
    }
    else if (k === 'poly2') {
        const dot = dotProd(a, b);
        return (dot + 1) ** 2;
    }
    else {
        let s = 0;
        for (let i = 0; i < a.length; i++) {
            const d = a[i] - b[i];
            s += d * d;
        }
        return Math.exp(-s / Math.max(1e-9, 2 * sigma * sigma));
    }
}
/**
 * Euclidean (L2) norm of a vector, computed without spreading it into a call.
 * Plain sum of squares: no overflow rescaling like Math.hypot performs.
 */
function l2Norm(v) {
    let sumSq = 0;
    for (let i = 0; i < v.length; i++)
        sumSq += v[i] * v[i];
    return Math.sqrt(sumSq);
}
/**
 * Inner product of two vectors, iterating over the length of `a`.
 *
 * @param {ArrayLike<number>} a - First vector; its length drives the loop.
 * @param {ArrayLike<number>} b - Second vector, read at the same indices.
 * @returns {number} Sum of a[i] * b[i].
 */
function dotProd(a, b) {
    let total = 0;
    let idx = 0;
    while (idx < a.length) {
        total += a[idx] * b[idx];
        idx += 1;
    }
    return total;
}
/**
 * Build the complete retrieval index for a set of chunks.
 *
 * Runs the pipeline in order: license check, vocabulary + IDF, TF-IDF
 * vectors, landmark selection, dense kernel projections.
 *
 * @param {object} opts - Build options.
 * @param {Array<*>} opts.chunks - Chunks to index.
 * @param {number} opts.vocab - Maximum vocabulary size.
 * @param {number} opts.landmarks - Requested landmark count.
 * @param {*} opts.headingW - Forwarded to TF-IDF building.
 * @param {*} opts.useStem - Forwarded to tokenization.
 * @param {string} opts.kernel - Kernel name for the dense projection.
 * @param {number} opts.sigma - Kernel width for the dense projection.
 * @returns {{vocabMap: *, idf: *, tfidfDocs: *, landmarksIdx: *, landmarkMat: *, denseDocs: *}}
 *   All intermediate and final index artifacts.
 */
export function buildIndex(opts) {
    // Premium feature: the license check runs before any work is done.
    requireLicense();
    const {
        chunks,
        vocab: vocabLimit,
        landmarks: landmarkCount,
        headingW,
        useStem,
        kernel,
        sigma,
    } = opts;
    const vocabulary = buildVocabAndIdf(chunks, vocabLimit, useStem);
    const vocabMap = vocabulary.vocabMap;
    const idf = vocabulary.idf;
    const tfidfDocs = buildTfidfDocs(chunks, vocabMap, idf, headingW, useStem);
    // The effective dimension is the realized vocabulary size, not the requested limit.
    const sampled = buildLandmarks(tfidfDocs, vocabMap.size, landmarkCount);
    const landmarksIdx = sampled.landmarksIdx;
    const landmarkMat = sampled.landmarkMat;
    const denseDocs = buildDenseDocs(tfidfDocs, vocabMap.size, landmarkMat, kernel, sigma);
    return {
        vocabMap,
        idf,
        tfidfDocs,
        landmarksIdx,
        landmarkMat,
        denseDocs,
    };
}
//# sourceMappingURL=index-builder.js.map