@astermind/astermind-pro
Version:
Astermind Pro - Premium ML Toolkit with Advanced RAG, Reranking, Summarization, and Information Flow Analysis
180 lines • 6.88 kB
JavaScript
// Auto-tuning utilities for hyperparameter optimization
// Extracted from dev-worker for reuse
import { requireLicense } from '../core/license.js';
import { tokenize } from './tokenization.js';
import { toTfidf, cosineSparse, projectToDense, kernelSim } from '../retrieval/vectorization.js';
import { topKIndices } from '../retrieval/hybrid-retriever.js';
import { buildLandmarks, buildDenseDocs } from '../retrieval/index-builder.js';
/**
 * Build short synthetic queries by sampling chunks from the corpus.
 *
 * For each query a chunk is chosen uniformly at random (with replacement).
 * The first 400 characters of "<heading> <content>" are tokenized, tokens of
 * three characters or fewer are dropped, at most 40 tokens are kept, and the
 * first six distinct tokens are joined with single spaces.
 *
 * @param {Array<{heading?: string, content?: string}>} chunks - Corpus chunks to sample from.
 * @param {number} n - Number of queries to generate.
 * @param {boolean} useStem - Forwarded unchanged as the second argument of `tokenize`.
 * @returns {string[]} `n` query strings (a query is '' when its chunk yields
 *   no token longer than three characters). Returns an empty array when
 *   `chunks` is empty or missing, because there is nothing to sample from.
 */
export function sampleQueriesFromCorpus(chunks, n, useStem) {
  const out = [];
  // Guard: indexing an empty corpus yields `undefined`, and reading
  // `.heading` from it would throw.
  if (!chunks?.length) {
    return out;
  }
  for (let i = 0; i < n; i++) {
    const s = chunks[Math.floor(Math.random() * chunks.length)];
    // A missing heading/content is treated as empty text so the literal
    // words "undefined"/"null" never leak into a query.
    const text = `${s.heading ?? ''} ${s.content ?? ''}`.slice(0, 400);
    // short synthetic queries from headings + nouns-ish tokens
    const toks = tokenize(text, useStem)
      .filter((t) => t.length > 3)
      .slice(0, 40);
    const uniq = Array.from(new Set(toks));
    out.push(uniq.slice(0, 6).join(' '));
  }
  return out;
}
/**
 * Complexity penalty subtracted from a configuration's tuning score.
 *
 * Each term measures how far a setting sits from its reference value
 * (landmarks 128, vocab 8000, prefilter 200), scaled by a fixed divisor
 * (512, 24000, 1200). The three terms are summed and multiplied by 0.02.
 * The result is 0 at the reference values and negative below them.
 *
 * @param {{landmarks: number, vocab: number, prefilter: number}} cfg - Candidate configuration.
 * @returns {number} The penalty (NaN if any of the three fields is missing).
 */
export function penalty(cfg) {
  const { landmarks, vocab, prefilter } = cfg;
  const landmarkTerm = (landmarks - 128) / 512;
  const vocabTerm = (vocab - 8000) / 24000;
  const prefilterTerm = (prefilter - 200) / 1200;
  const combined = landmarkTerm + vocabTerm + prefilterTerm;
  return combined * 0.02;
}
/**
 * Jaccard similarity of two index collections: |A ∩ B| / |A ∪ B|.
 *
 * Both inputs are converted to Sets first, so repeated entries count once.
 *
 * @param {Iterable<*>} a - First collection (anything `new Set` accepts).
 * @param {Iterable<*>} b - Second collection.
 * @returns {number} A value in [0, 1]; 0 when both collections are empty.
 */
export function jaccard(a, b) {
  const left = new Set(a);
  const right = new Set(b);
  const shared = [...left].filter((item) => right.has(item)).length;
  // Inclusion–exclusion gives the union size without building a third Set.
  const unionSize = left.size + right.size - shared;
  if (unionSize === 0) {
    return 0;
  }
  return shared / unionSize;
}
/**
 * Restrict a number to the range [a, b].
 *
 * @param {number} x - Value to restrict.
 * @param {number} a - Lower bound.
 * @param {number} b - Upper bound.
 * @returns {number} `x` capped at `b`, then raised to at least `a`
 *   (so `a` wins if the bounds are inverted).
 */
function clamp(x, a, b) {
  const capped = Math.min(b, x);
  return Math.max(a, capped);
}
/**
 * Choose one element of an array uniformly at random via `Math.random`.
 *
 * @param {Array<*>} arr - Candidates to choose from.
 * @returns {*} A randomly selected element, or `undefined` for an empty array.
 */
function pick(arr) {
  const roll = Math.random();
  const index = Math.floor(roll * arr.length);
  return arr[index];
}
/**
 * Draw a random number between `a` and `b` using `Math.random`.
 *
 * @param {number} a - Start of the range (returned when the draw is 0).
 * @param {number} b - End of the range.
 * @returns {number} `a` plus a random fraction of the width `b - a`.
 */
function randRange(a, b) {
  const fraction = Math.random();
  const width = b - a;
  return a + fraction * width;
}
/**
 * Produce a new object containing `base`'s own enumerable properties
 * overlaid with those of `patch`. Neither argument is modified.
 *
 * @param {object} base - Starting values.
 * @param {object} patch - Overrides; its keys win over `base`.
 * @returns {object} A fresh shallow copy with the patch applied.
 */
function mutate(base, patch) {
  const merged = {};
  Object.assign(merged, base);
  Object.assign(merged, patch);
  return merged;
}
/**
 * Auto-tune retrieval hyperparameters with random search followed by local
 * refinement around the best configuration found.
 *
 * Synthetic queries are sampled from the corpus and a TF-IDF top-K baseline is
 * computed for each. A candidate configuration is scored as the mean Jaccard
 * overlap between its hybrid (dense + TF-IDF) top-K and that baseline, minus
 * `penalty(cfg)`. The first `floor(0.6 * budget)` trials use random
 * configurations; the remaining trials perturb the current best one.
 *
 * @param {object} opts - Tuning inputs.
 * @param {Array<object>} opts.chunks - Corpus chunks used to sample queries.
 * @param {*} opts.vocabMap - Passed through to `toTfidf`.
 * @param {*} opts.idf - Passed through to `toTfidf`.
 * @param {Array<*>} opts.tfidfDocs - Sparse document vectors, one per document.
 * @param {number} opts.vocabSize - Passed through to the landmark/dense builders.
 * @param {number} [opts.budget=40] - Total number of trials, clamped to [10, 200].
 * @param {number} [opts.sampleQueries=24] - Number of synthetic queries, clamped to [8, 60].
 * @param {object} [opts.currentSettings={}] - Base settings; `topK` (default 8)
 *   and `useStem` (default true) also define the baseline.
 * @param {(trial: number, bestScore: number, note: string) => void} [onProgress]
 *   - Called after every trial with the 1-based trial number, the best score
 *   so far and 'random' or 'refine'.
 * @returns {Promise<{bestSettings: object, bestScore: number, trials: number}>}
 *   Best configuration, its score, and the number of trials actually run.
 */
export async function autoTune(opts, onProgress) {
  requireLicense(); // Premium feature - requires valid license
  const {
    chunks,
    vocabMap,
    idf,
    tfidfDocs,
    vocabSize,
    budget = 40,
    sampleQueries: queryCount = 24,
    currentSettings = {},
  } = opts;
  const budgetClamped = Math.max(10, Math.min(200, budget));
  const queryCountClamped = Math.max(8, Math.min(60, queryCount));
  const useStem = currentSettings.useStem ?? true;
  const queries = sampleQueriesFromCorpus(chunks, queryCountClamped, useStem);

  // Pre-compute TF-IDF top-K for each query (baseline)
  const tfidfTops = queries.map((q) => {
    const qv = toTfidf(tokenize(q, useStem), idf, vocabMap, 1);
    const scores = tfidfDocs.map((v) => cosineSparse(v, qv));
    return topKIndices(scores, currentSettings.topK ?? 8);
  });

  let best = { score: -Infinity, cfg: { ...currentSettings } };

  // Cache of { landmarkMat, denseDocs } keyed by kernel params.
  // The landmark matrix is stored next to the dense docs it produced so that
  // queries are projected with exactly the same landmarks as the documents,
  // and so landmarks are built once per key instead of once per query.
  const projectionCache = new Map();
  const projectionFor = (cfg) => {
    // ridge doesn't affect projection; key on kernel params only
    const key = `${cfg.kernel}:${cfg.landmarks}:${cfg.sigma}`;
    let entry = projectionCache.get(key);
    if (!entry) {
      const { landmarkMat } = buildLandmarks(tfidfDocs, vocabSize, cfg.landmarks);
      const denseDocs = buildDenseDocs(tfidfDocs, vocabSize, landmarkMat, cfg.kernel, cfg.sigma);
      entry = { landmarkMat, denseDocs };
      projectionCache.set(key, entry);
    }
    return entry;
  };

  let trial = 0;
  const tryCfg = (cfg, note) => {
    const { landmarkMat, denseDocs } = projectionFor(cfg);
    const alpha = clamp(cfg.alpha, 0, 1);
    const lambda = cfg.ridge ?? 0.05;
    const jScores = queries.map((q, qi) => {
      const qv = toTfidf(tokenize(q, cfg.useStem), idf, vocabMap, 1);
      const qd = projectToDense(qv, vocabSize, landmarkMat, cfg.kernel, cfg.sigma);
      const tfidfScores = tfidfDocs.map((v) => cosineSparse(v, qv));
      // Compute dense scores using kernel similarity
      const denseScores = denseDocs.map((v) => kernelSim(v, qd, cfg.kernel, cfg.sigma));
      // ridge-regularized hybrid (bonus off during tuning)
      const hybrid = denseScores.map((d, i) => {
        const t = tfidfScores[i];
        const reg = 1 / (1 + lambda * (d * d + t * t));
        return reg * (alpha * d + (1 - alpha) * t);
      });
      return jaccard(tfidfTops[qi], topKIndices(hybrid, cfg.topK));
    });
    const meanOverlap = jScores.reduce((a, b) => a + b, 0) / jScores.length;
    const score = meanOverlap - penalty(cfg);
    if (score > best.score) {
      best = { score, cfg: { ...cfg } };
    }
    // Count every trial, whether or not a progress callback was supplied;
    // the refinement budget and the returned `trials` depend on this counter.
    trial += 1;
    if (onProgress) {
      onProgress(trial, best.score, note);
    }
  };

  // random warmup
  const warmupTrials = Math.floor(budgetClamped * 0.6);
  for (let i = 0; i < warmupTrials; i++) {
    const cfg = mutate(currentSettings, {
      alpha: randRange(0.55, 0.95),
      beta: randRange(0.0, 0.35),
      sigma: randRange(0.18, 0.75),
      kernel: pick(['rbf', 'cosine', 'poly2']),
      vocab: pick([8000, 10000, 12000, 15000]),
      landmarks: pick([128, 192, 256, 320, 384]),
      prefilter: pick([200, 300, 400, 600]),
      topK: pick([4, 6, 8]),
      headingW: randRange(1.5, 4.5),
      chunk: pick([450, 550, 650]),
      overlap: pick([50, 75, 100]),
      penalizeLinks: true,
      stripCode: true,
      expandQuery: true,
      useStem: true,
      ridge: randRange(0.02, 0.18),
    });
    tryCfg(cfg, 'random');
  }

  // refinement: spend whatever is left of the budget perturbing the best
  // configuration's continuous parameters; discrete ones are kept as-is.
  while (trial < budgetClamped) {
    const b = best.cfg;
    const cfg = mutate(b, {
      alpha: clamp(b.alpha + randRange(-0.1, 0.1), 0.4, 0.98),
      beta: clamp(b.beta + randRange(-0.1, 0.1), 0, 0.4),
      sigma: clamp(b.sigma + randRange(-0.08, 0.08), 0.12, 1.0),
      kernel: b.kernel,
      vocab: b.vocab,
      landmarks: b.landmarks,
      prefilter: b.prefilter,
      topK: b.topK,
      headingW: clamp(b.headingW + randRange(-0.4, 0.4), 1.0, 6.0),
      chunk: b.chunk,
      overlap: b.overlap,
      penalizeLinks: b.penalizeLinks,
      stripCode: b.stripCode,
      expandQuery: b.expandQuery,
      useStem: b.useStem,
      ridge: clamp((b.ridge ?? 0.05) + randRange(-0.02, 0.02), 0.0, 0.2),
    });
    tryCfg(cfg, 'refine');
  }

  return {
    bestSettings: best.cfg,
    bestScore: best.score,
    trials: trial,
  };
}
//# sourceMappingURL=autotune.js.map