@astermind/astermind-pro
Version:
Astermind Pro - Premium ML Toolkit with Advanced RAG, Reranking, Summarization, and Information Flow Analysis
413 lines • 16.4 kB
JavaScript
/// <reference path="./worker-types.d.ts" />
// worker.ts — AsterMind KELM Worker with OmegaRR + OmegaSum pipeline
// Retrieval (TF-IDF + Nyström kernels) → Rerank+Filter (OmegaRR) → Grounded Summary (OmegaSum)
//
// - Tree-aware markdown parsing, rich+plain chunk retention, parent backfill
// - Hybrid scoring with ridge-style damping
// - Drop-in reranker (ridge over engineered features) + MMR reducer
// - Constrained summarizer that only uses kept chunk text (with citations)
import { requireLicense } from '../core/license.js';
import { InfoFlowGraph } from '../infoflow/index.js';
import { InfoFlowGraphPWS } from '../infoflow/TransferEntropyPWS.js';
import { rerankAndFilter } from '../reranking/OmegaRR.js';
import { summarizeDeterministic } from '../summarization/OmegaSumDet.js';
// Import extracted modules
import { parseMarkdownToSections, backfillEmptyParents, flattenSections } from '../utils/markdown.js';
import { tokenize, expandQuery } from '../utils/tokenization.js';
import { buildIndex as buildIndexUtil, buildDenseDocs } from '../retrieval/index-builder.js';
import { hybridRetrieve } from '../retrieval/hybrid-retriever.js';
import { projectToDense, toTfidf } from '../retrieval/vectorization.js';
import { autoTune as autoTuneUtil } from '../utils/autotune.js';
import { exportModel as exportModelUtil, importModel as importModelUtil } from '../utils/model-serialization.js';
import { buildLandmarks } from '../retrieval/index-builder.js';
// SerializedModel is imported from types.ts
/* =========================
Global State
========================= */
// Active settings object. Assigned on the 'init' action and by importModel();
// merged in place (Object.assign) on 'reindex', 'ask' and after auto-tuning.
let SETTINGS;
// Information-flow graph, or null when SETTINGS.enableInfoFlow is falsy (see loadAndIndex).
let IFLOW = null;
// NOTE(review): not referenced anywhere else in the visible code — possibly a leftover; confirm before removing.
let CTRL = null;
let sections = []; // legacy flat {heading, content} list (kept for compatibility/debug)
let chunks = []; // flattened section chunks (output of flattenSections); these are what gets indexed
let vocabMap = new Map(); // token -> id
let idf = []; // IDF weights as returned by the index builder
let tfidfDocs = []; // chunk vectors
// Dense (Nyström) state
let landmarksIdx = []; // landmark selection returned by the index builder / buildLandmarks
let landmarkMat = []; // landmark vectors in sparse->dense kernel space
let denseDocs = []; // dense projections of tfidfDocs (see buildDenseDocs)
// Thin wrapper over the worker-global postMessage; accepts any message shape
// so new message kinds like 'kept' can be sent without extra plumbing.
const post = (m) => postMessage(m);
/**
 * Worker message dispatcher.
 *
 * Expects messages of the form `{ action, payload }` and handles:
 *   - 'init'     : replace SETTINGS with payload.settings and load + index the corpus
 *   - 'reindex'  : merge payload.settings into SETTINGS and rebuild the index
 *   - 'ask'      : optionally merge settings, answer payload.q, post answer/results/stats/kept
 *   - 'autotune' : run the auto-tuner with the payload
 *   - 'reset'    : drop all corpus/index state and post 'ready'
 *
 * Every action first passes through requireLicense(). Any error thrown while
 * handling a message is reported back as `{ type: 'error', error }` instead of
 * surfacing as an unhandled rejection. Unknown actions are ignored.
 */
self.addEventListener('message', (e) => {
    // union-safe destructuring: tolerate messages without data or payload
    const msg = e.data;
    const action = (msg && msg.action);
    const payload = (msg && msg.payload) ?? {};
    (async () => {
        try {
            // License check - all worker actions require a valid license
            requireLicense();
            if (action === 'init') {
                // Copy so later Object.assign calls never mutate the caller's object.
                SETTINGS = { ...(payload?.settings || {}) };
                await loadAndIndex(payload?.chaptersPath || '/chapters.json');
            }
            else if (action === 'reindex') {
                Object.assign(SETTINGS, payload?.settings || {});
                await buildIndex();
            }
            else if (action === 'ask') {
                if (payload?.settings) {
                    Object.assign(SETTINGS, payload.settings);
                }
                const res = await answer(payload.q);
                post({ type: 'answer', text: res.answer });
                post({ type: 'results', items: res.items });
                post({ type: 'stats', text: res.stats });
                if (res.kept) {
                    post({ type: 'kept', items: res.kept });
                }
            }
            else if (action === 'autotune') {
                await autoTune(payload || {});
            }
            else if (action === 'reset') {
                sections = [];
                chunks = [];
                vocabMap.clear();
                idf = [];
                tfidfDocs = [];
                // Clear the Nyström landmark state together with denseDocs; leaving it
                // behind would keep projections/exports tied to a corpus that is gone.
                landmarksIdx = [];
                landmarkMat = [];
                denseDocs = [];
                post({ type: 'ready' });
            }
        }
        catch (err) {
            post({ type: 'error', error: String(err?.message || err) });
        }
    })();
});
// Tell the host the worker script has loaded and can accept messages.
post({ type: 'ready' });
/* =========================
Markdown Tree Parsing + Chunking
========================= */
// Markdown parsing functions are now imported from '../utils/markdown.js'
/* =========================
Load + Index
========================= */
/**
 * fetch() wrapper that rejects on non-2xx responses.
 *
 * fetch() itself only rejects on network failure, so without this check an
 * HTTP error page would be parsed as JSON or indexed as if it were markdown.
 *
 * @param {string} url - Resource to fetch.
 * @returns {Promise<Response>} The successful response.
 * @throws {Error} When the response status is not in the 2xx range.
 */
async function fetchOrThrow(url) {
    const res = await fetch(url);
    if (!res.ok) {
        throw new Error(`Failed to fetch ${url}: HTTP ${res.status} ${res.statusText}`);
    }
    return res;
}
/**
 * Loads the chapter manifest and every markdown file it lists, parses them
 * into section trees, flattens them into `chunks` / `sections`, and rebuilds
 * the index. Also (re)creates the InfoFlow graph according to SETTINGS.
 *
 * @param {string} chaptersPath - URL of a JSON manifest shaped like `{ files: string[] }`.
 * @returns {Promise<void>} Resolves once buildIndex() has completed.
 * @throws {Error} When the manifest or any chapter file cannot be fetched.
 */
async function loadAndIndex(chaptersPath) {
    const meta = await (await fetchOrThrow(chaptersPath)).json();
    const files = meta?.files || [];
    if (SETTINGS.enableInfoFlow) {
        const common = {
            window: Math.max(32, SETTINGS.infoFlowWindow ?? 256),
            condLags: Math.max(1, SETTINGS.infoFlowCondLags ?? 1),
            xLags: 1,
            ridge: 1e-6,
            bits: true,
        };
        IFLOW = SETTINGS.infoFlowMode === 'pws'
            ? new InfoFlowGraphPWS({ ...common, usePWS: true, tailQuantile: 0.9, tailBoost: 4, jitterSigma: 0.15, pwsIters: 8 })
            : new InfoFlowGraph(common); // Phase-1
    }
    else {
        IFLOW = null;
    }
    // The chapter files are independent, so download them concurrently.
    // Promise.all keeps the manifest order, so chunk order is unchanged.
    const rawDocs = await Promise.all(files.map(async (f) => (await fetchOrThrow(`/${f}`)).text()));
    const parsedRoots = rawDocs.map((raw) => {
        // Ignore top-level # doc title; start at ## and deeper
        const root = parseMarkdownToSections(raw, { stripCode: SETTINGS.stripCode ?? true, stripLinks: true });
        backfillEmptyParents(root);
        return root;
    });
    // Build flat arrays for compatibility & vectorization
    sections = [];
    chunks = [];
    for (const root of parsedRoots) {
        const flat = flattenSections(root);
        chunks.push(...flat);
        for (const c of flat) {
            sections.push({ heading: c.heading, content: c.content });
        }
    }
    await buildIndex();
}
/**
 * Rebuilds every index structure (vocabulary, IDF, TF-IDF vectors, landmarks,
 * dense projections) from the current `chunks` and SETTINGS, then posts an
 * 'indexed' message with summary stats.
 *
 * Chunks already exist (tree-aware) — they are NOT re-sliced from `sections`.
 *
 * @returns {Promise<void>}
 */
async function buildIndex() {
    const { vocab, landmarks, headingW, useStem, kernel, sigma } = SETTINGS;
    const built = buildIndexUtil({ chunks, vocab, landmarks, headingW, useStem, kernel, sigma });
    // Adopt the freshly built state in one go.
    ({ vocabMap, idf, tfidfDocs, landmarksIdx, landmarkMat, denseDocs } = built);
    const docCount = chunks.length;
    post({
        type: 'indexed',
        docs: docCount,
        stats: `${docCount} chunks • vocab ${vocabMap.size} • L=${SETTINGS.landmarks}`,
    });
}
/* =========================
Retrieval → Rerank/Filter → Summarize (ridge-regularized hybrid)
========================= */
/**
 * Answers a query with the full pipeline:
 * hybrid retrieval → OmegaRR rerank/filter → OmegaSumDet grounded summary.
 * When an InfoFlow graph is active, it also records three signal pairs
 * (query→scores, reranker features→score, kept→summary) and posts a snapshot.
 *
 * @param {string} q - The user query.
 * @returns {Promise<{answer: string, items: Array, stats: string, kept: Array<{heading: string, p: number, rr: number}>}>}
 *   The summary text, the originally retrieved items, a one-line stats string,
 *   and a compact debug list of the chunks the reranker kept.
 */
async function answer(q) {
    // Use hybrid retrieval
    const retrievalResult = hybridRetrieve({
        query: q,
        chunks,
        vocabMap,
        idf,
        tfidfDocs,
        denseDocs,
        landmarksIdx,
        landmarkMat,
        vocabSize: vocabMap.size,
        kernel: SETTINGS.kernel,
        sigma: SETTINGS.sigma,
        alpha: SETTINGS.alpha,
        beta: SETTINGS.beta ?? 0,
        ridge: SETTINGS.ridge ?? 0.08,
        headingW: SETTINGS.headingW ?? 1.0,
        useStem: SETTINGS.useStem,
        expandQuery: SETTINGS.expandQuery ?? false,
        topK: SETTINGS.topK,
        prefilter: SETTINGS.prefilter,
    });
    const finalIdxs = retrievalResult.indices;
    const items = retrievalResult.items;
    // --- TE: Retriever (Query -> Hybrid scores) ---
    if (IFLOW) {
        // Build query vector for InfoFlow tracking
        const qexp = SETTINGS.expandQuery ? expandQuery(q) : q;
        const toks = tokenize(qexp, SETTINGS.useStem);
        const qvec = toTfidf(toks, idf, vocabMap, 1.0);
        const qdense = projectToDense(qvec, vocabMap.size, landmarkMat, SETTINGS.kernel, SETTINGS.sigma);
        // Represent query as a small vector: [avg_tfidf, avg_dense]
        const qSig = [avg(Array.from(qvec.values())), avg(qdense)];
        // Represent scores as a short vector: stats over current candidate pool
        const scoreSig = [
            avg(retrievalResult.tfidfScores), avg(retrievalResult.denseScores), avg(retrievalResult.scores),
        ];
        if (isFiniteVec(qSig) && isFiniteVec(scoreSig)) {
            IFLOW.get('Retriever:Q->Score').push(qSig, scoreSig);
        }
    }
    // ---------- OmegaRR + OmegaSum ----------
    // Prepare reranker input from the SAME selected chunks, passing the hybrid score as a prior.
    const rerankInput = finalIdxs.map((i) => {
        const c = chunks[i];
        return {
            heading: c.heading,
            content: c.content || "", // index/plain text (no code fences needed)
            rich: c.rich, // keep rich for code-aware summarization
            level: c.level,
            secId: c.secId,
            // OmegaRR reads score_base as prior. The hybrid scores live on the
            // retrieval result (indexed like tfidfScores/denseScores); there is no
            // free-standing `scores` binding in this module.
            score_base: retrievalResult.scores[i],
        };
    });
    // ---------- OmegaRR: rerank+filter (single call) ----------
    const kept = rerankAndFilter(q, rerankInput, {
        lambdaRidge: 1e-2,
        probThresh: 0.45,
        epsilonTop: 0.05,
        useMMR: true,
        mmrLambda: 0.7,
        budgetChars: 1200,
        randomProjDim: 32,
    });
    // --- TE: OmegaRR engineered features driving score ---
    if (IFLOW) {
        for (const k of kept) {
            const f = k._features;
            if (f && f.length && isFiniteVec(f) && Number.isFinite(k.score_rr)) {
                IFLOW.get('OmegaRR:Feat->Score').push(f, [k.score_rr]);
            }
        }
    }
    // ---------- OmegaSumDet: deterministic, context-locked summarization ----------
    // Map OmegaRR fields into the simple ScoredChunk shape expected by OmegaSumDet.
    // We treat the array order of `kept` as the rrRank (0..N-1) for stability.
    const detInput = kept.map((k, i) => ({
        heading: k.heading,
        content: k.content || "",
        rich: k.rich,
        level: k.level,
        secId: k.secId,
        rrScore: k.score_rr ?? 0,
        rrRank: i,
    }));
    const sum = summarizeDeterministic(q, detInput, {
        // output shaping
        maxAnswerChars: 1100,
        maxBullets: 3,
        includeCitations: true,
        addFooter: true,
        preferCode: true,
        // weights — conservative rrWeight so reranker doesn’t dominate query-alignment
        teWeight: 0.25,
        queryWeight: 0.50,
        evidenceWeight: 0.15,
        rrWeight: 0.10,
        // bonuses/thresholds
        codeBonus: 0.05,
        headingBonus: 0.04,
        jaccardDedupThreshold: 0.6,
        // HARD gates to prevent off-topic leakage
        allowOffTopic: false,
        minQuerySimForCode: 0.35,
        // keep answers focused on the most aligned heading
        focusTopAlignedHeadings: 1,
        maxSectionsInAnswer: 1,
    });
    // --- TE: Kept -> Summary (grounded influence) ---
    if (IFLOW && kept.length > 0) {
        // Build a compact "kept" signature: average TF-IDF over kept contents
        const keptTokens = kept.map((k) => tokenize(k.content || '', SETTINGS.useStem));
        const keptVecs = keptTokens.map((toks) => toTfidf(toks, idf, vocabMap, 1.0));
        // Average over kept vectors into one dense projection to keep spaces consistent
        const keptDense = new Float64Array(landmarksIdx.length);
        let cnt = 0;
        for (const v of keptVecs) {
            const d = projectToDense(v, vocabMap.size, landmarkMat, SETTINGS.kernel, SETTINGS.sigma);
            // sanitize non-finite
            for (let i = 0; i < d.length; i++) {
                if (!Number.isFinite(d[i])) {
                    d[i] = 0;
                }
            }
            for (let i = 0; i < keptDense.length; i++) {
                keptDense[i] += d[i];
            }
            cnt++;
        }
        if (cnt > 0) {
            for (let i = 0; i < keptDense.length; i++) {
                keptDense[i] /= cnt;
            }
        }
        // Summary signature: project answer text using same pipeline
        const sumTok = tokenize(sum.text || '', SETTINGS.useStem);
        const sumVec = toTfidf(sumTok, idf, vocabMap, 1.0);
        const sumDense = projectToDense(sumVec, vocabMap.size, landmarkMat, SETTINGS.kernel, SETTINGS.sigma);
        for (let i = 0; i < sumDense.length; i++) {
            if (!Number.isFinite(sumDense[i])) {
                sumDense[i] = 0;
            }
        }
        const kd = Array.from(keptDense);
        const sd = Array.from(sumDense);
        if (isFiniteVec(kd) && isFiniteVec(sd)) {
            IFLOW.get('Omega:Kept->Summary').push(kd, sd);
        }
    }
    // Take one snapshot and use it for both the 'infoflow' message and the
    // stats line, so the two always agree and the work is not done twice.
    const teSnap = IFLOW ? IFLOW.snapshot() : null;
    if (IFLOW) {
        post({ type: 'infoflow', te: teSnap });
    }
    const alpha = SETTINGS.alpha;
    const lambda = SETTINGS.ridge ?? 0.08;
    const tf = mean(retrievalResult.tfidfScores, finalIdxs);
    const de = mean(retrievalResult.denseScores, finalIdxs);
    const teLine = teSnap
        ? ` | TE bits — Q→Score ${fmt(teSnap['Retriever:Q->Score'])}, Feat→Score ${fmt(teSnap['OmegaRR:Feat->Score'])}, Kept→Summary ${fmt(teSnap['Omega:Kept->Summary'])}`
        : '';
    const stats = `α=${alpha.toFixed(2)} σ=${(SETTINGS.sigma).toFixed(2)} K=${SETTINGS.kernel} λ=${lambda.toFixed(3)} | tfidf ${tf.toFixed(3)} dense ${de.toFixed(3)} | kept ${kept.length}${teLine}`;
    // Round for display; a missing/non-finite reranker value becomes 0 rather than
    // throwing after the whole answer has already been computed (mirrors the
    // `score_rr ?? 0` fallback used for the summarizer input above).
    const round3 = (x) => (Number.isFinite(x) ? Number(x.toFixed(3)) : 0);
    // Return grounded answer + original retrieved list + debug kept list
    return {
        answer: sum.text,
        items,
        stats,
        kept: kept.map((k) => ({
            heading: k.heading,
            p: round3(k.p_relevant),
            rr: round3(k.score_rr),
        })),
    };
}
/* =========================
Auto-Tune
========================= */
/**
 * Runs the auto-tuner over the current index, merges the winning settings into
 * SETTINGS, rebuilds the landmark and dense state for those settings, and
 * reports progress / completion to the host.
 *
 * @param {{budget?: *, sampleQueries?: *}} payload - Tuning options forwarded to the tuner.
 * @returns {Promise<void>}
 */
async function autoTune(payload) {
    // The tuner only needs the plain heading/content of each chunk.
    const corpus = chunks.map(({ heading, content }) => ({ heading, content }));
    const reportProgress = (trial, best, note) => {
        post({ type: 'autotune/progress', trial, best, note });
    };
    const tuning = await autoTuneUtil({
        chunks: corpus,
        vocabMap,
        idf,
        tfidfDocs,
        vocabSize: vocabMap.size,
        budget: payload.budget,
        sampleQueries: payload.sampleQueries,
        currentSettings: SETTINGS,
    }, reportProgress);
    Object.assign(SETTINGS, tuning.bestSettings);
    // Landmarks and dense projections depend on the tuned settings, so rebuild both.
    const rebuilt = buildLandmarks(tfidfDocs, vocabMap.size, SETTINGS.landmarks);
    landmarksIdx = rebuilt.landmarksIdx;
    landmarkMat = rebuilt.landmarkMat;
    denseDocs = buildDenseDocs(tfidfDocs, vocabMap.size, landmarkMat, SETTINGS.kernel, SETTINGS.sigma);
    post({ type: 'autotune/done', best: SETTINGS, score: tuning.bestScore });
}
/* =========================
Misc helpers (worker-specific)
========================= */
/**
 * Arithmetic mean of an indexable numeric sequence (array or typed array).
 * An empty sequence yields 0 because the divisor is clamped to at least 1.
 *
 * @param {ArrayLike<number>} arr - Values to average.
 * @returns {number} The mean, or 0 for an empty input.
 */
function avg(arr) {
    const count = arr.length;
    let total = 0;
    let pos = 0;
    while (pos < count) {
        total += arr[pos];
        pos += 1;
    }
    return total / Math.max(1, count);
}
/**
 * Tells whether `v` is a non-empty vector made up solely of finite numbers.
 *
 * @param {ArrayLike<number>|null|undefined} v - Candidate vector.
 * @returns {boolean} false for a missing or empty vector, or one holding any
 *   non-finite / non-number entry; true otherwise.
 */
function isFiniteVec(v) {
    if (!v || v.length === 0) {
        return false;
    }
    let pos = 0;
    while (pos < v.length) {
        if (!Number.isFinite(v[pos])) {
            return false;
        }
        pos += 1;
    }
    return true;
}
/**
 * Formats a number with four decimal places for the stats line.
 *
 * @param {*} x - Value to format.
 * @returns {string} `x.toFixed(4)` for finite numbers, otherwise the string '0'.
 */
function fmt(x) {
    if (!Number.isFinite(x)) {
        return '0';
    }
    return x.toFixed(4);
}
/**
 * Mean of the entries of `arr` picked out by the positions listed in `idx`.
 *
 * @param {ArrayLike<number>} arr - Source values.
 * @param {Array<number>} idx - Positions into `arr` to average over.
 * @returns {number} The mean of the selected entries, or 0 when `idx` is empty.
 */
function mean(arr, idx) {
    if (idx.length === 0) {
        return 0;
    }
    const total = [...idx].reduce((acc, pos) => acc + arr[pos], 0);
    return total / idx.length;
}
/**
 * Serializes the worker's current settings and index state by handing them to
 * the model-serialization utility.
 *
 * @param {{includeRich?: boolean, includeDense?: boolean}} [opts] - Optional
 *   flags forwarded unchanged to the serializer.
 * @returns {Promise<*>} Whatever the serializer returns for the current state.
 */
export async function exportModel(opts) {
    const { includeRich, includeDense } = opts ?? {};
    const snapshot = {
        settings: SETTINGS,
        vocabMap,
        idf,
        chunks,
        tfidfDocs,
        landmarksIdx,
        landmarkMat,
        denseDocs,
        includeRich,
        includeDense,
    };
    return exportModelUtil(snapshot);
}
/**
 * Wraps the JSON-serialized model in a Blob, creates an object URL for it, and
 * posts a 'download/url' message so the UI can turn it into an
 * `<a download>` link.
 *
 * @param {*} model - JSON-serializable model object.
 * @param {string} [filename='astermind-elm.model.json'] - Suggested download name.
 * @returns {void}
 */
export function downloadModelJSON(model, filename = 'astermind-elm.model.json') {
    const json = JSON.stringify(model);
    const payloadBlob = new Blob([json], { type: 'application/json' });
    const message = {
        type: 'download/url',
        url: URL.createObjectURL(payloadBlob),
        filename,
    };
    postMessage(message); // your UI can catch this and create an <a download>
}
// Only if you call from Node context, not the worker
/**
 * Writes the JSON-serialized model to disk. Node-only: `node:fs/promises` is
 * imported lazily so that merely loading this module in a worker does not
 * touch it.
 *
 * @param {*} model - JSON-serializable model object.
 * @param {string} path - Destination file path.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
export async function saveModelToFs(model, path) {
    const { writeFile } = await import('node:fs/promises');
    const serialized = JSON.stringify(model);
    await writeFile(path, serialized);
}
/**
 * Restores worker state from a previously exported model so queries can be
 * answered right away, without loading a corpus.
 *
 * @param {*} model - Serialized model, as accepted by the deserialization utility.
 * @param {object} [opts] - Extra options forwarded to the deserializer; a
 *   `buildDense` callback backed by buildDenseDocs is always supplied and
 *   overrides any `buildDense` in `opts`.
 * @returns {Promise<void>}
 */
export async function importModel(model, opts) {
    const rebuildDense = (docs, vocabSize, landmarks, kernel, sigma) => buildDenseDocs(docs, vocabSize, landmarks, kernel, sigma);
    const imported = importModelUtil(model, { ...opts, buildDense: rebuildDense });
    // Adopt every restored piece of state in one step.
    ({
        settings: SETTINGS,
        vocabMap,
        idf,
        chunks,
        tfidfDocs,
        landmarksIdx,
        landmarkMat,
        denseDocs,
    } = imported);
    // legacy `sections` for UI/debug parity (optional)
    sections = chunks.map(({ heading, content }) => ({ heading, content: content || '' }));
    // Done. You can now call answer(q) immediately—no corpus needed.
}
//# sourceMappingURL=dev-worker.js.map