c9ai
Version:
Universal AI assistant with vibe-based workflows, hybrid cloud+local AI, and comprehensive tool integration
90 lines (77 loc) • 3.34 kB
JavaScript
;
// Lightweight streaming quality heuristics — no heavy deps.
// Idea: If novelty stays low and repetition stays high for a few windows, stop.
// Optionally, if we have source text (e.g., PDF), monitor drift from source keywords.
/**
 * Split text into lowercase word tokens.
 *
 * Falsy input is treated as the empty string. Every character that is not a
 * Unicode letter, a Unicode number, whitespace, a hyphen or an apostrophe is
 * turned into a space before the text is split on whitespace runs.
 *
 * @param {*} s - Text to tokenize; coerced with String().
 * @returns {string[]} Non-empty tokens in their original order.
 */
function tokenize(s) {
  const lowered = String(s || "").toLowerCase();
  const cleaned = lowered.replace(/[^\p{L}\p{N}\s\-']/gu, " ");
  const tokens = [];
  for (const piece of cleaned.split(/\s+/)) {
    // Leading/trailing whitespace leaves empty strings behind after the split.
    if (piece) tokens.push(piece);
  }
  return tokens;
}
/**
 * Build the contiguous n-grams of a token list.
 *
 * @param {string[]} tokens - Tokens in sequence order.
 * @param {number} [n=2] - Number of tokens per gram.
 * @returns {string[]} Each run of `n` adjacent tokens joined by a single
 *   space; empty when there are fewer than `n` tokens.
 */
function ngrams(tokens, n = 2) {
  // One gram starts at every index that still has n tokens from it onward.
  const count = Math.max(0, tokens.length - n + 1);
  return Array.from({ length: count }, (_, start) =>
    tokens.slice(start, start + n).join(" "),
  );
}
/**
 * Pick the most frequent content words of a text.
 *
 * The text is tokenized with tokenize(); stop words and tokens of two
 * characters or fewer are ignored. The remaining words are ranked by
 * occurrence count, highest first; words with equal counts keep the order in
 * which they first appeared.
 *
 * @param {*} text - Source text.
 * @param {number} [k=40] - Maximum number of keywords to keep.
 * @returns {Set<string>} The top `k` keywords.
 */
function topKeywords(text, k = 40) {
  const stopWords = new Set([
    "the","and","of","to","a","in","for","is","on","it","that","as","with","at","by",
    "an","be","or","are","from","this","was","but","if","not","we","you","i","they",
    "he","she","them","his","her","their","our","your","so","do","does","did","can",
    "will","would","could","should","have","has","had","about","into","over","than"
  ]);

  const counts = new Map();
  tokenize(text).forEach((word) => {
    const isContentWord = word.length > 2 && !stopWords.has(word);
    if (isContentWord) {
      counts.set(word, (counts.get(word) || 0) + 1);
    }
  });

  const ranked = Array.from(counts);
  ranked.sort(([, countA], [, countB]) => countB - countA);
  const winners = ranked.slice(0, k).map(([word]) => word);
  return new Set(winners);
}
/**
 * Streaming quality guard: accumulates streamed text and signals when the
 * stream should be stopped because its recent output keeps looking bad.
 *
 * Every push() re-scores the last `windowChars` characters of the buffer on:
 *  - novelty:     unique tokens / total tokens
 *  - repeat rate: repeated bigrams / total bigrams
 *  - drift:       (only when `sourceText` was supplied) the share of the
 *                 window's unique tokens found among the source's top keywords
 *
 * A bad window increments `badCount`; a good one decrements it (never below
 * zero). push() reports `ok: false` while `badCount >= badWindowsToStop`.
 */
class StreamGuard {
  /**
   * @param {object} [opts]
   * @param {number} [opts.windowChars=600] - Rolling window size in characters.
   * @param {number} [opts.minNovelty=0.16] - Novelty below this counts as low.
   * @param {number} [opts.maxRepeatRate=0.38] - Repeat rate above this counts as high.
   * @param {number} [opts.minDriftOverlap=0.08] - Keyword overlap below this counts as drifted.
   * @param {number} [opts.driftRepeatRate=0.25] - Repeat rate a drifted window must exceed to count as bad.
   * @param {number} [opts.badWindowsToStop=3] - `badCount` level at which push() reports a stop.
   * @param {string} [opts.sourceText] - Reference text; enables the drift check when non-empty.
   * @param {number} [opts.sourceTopK=50] - Number of source keywords to track.
   */
  constructor(opts = {}) {
    // The default parameter only covers `undefined`; tolerate an explicit null too.
    const o = opts ?? {};
    this.windowChars = o.windowChars ?? 600;
    this.minNovelty = o.minNovelty ?? 0.16;
    this.maxRepeatRate = o.maxRepeatRate ?? 0.38;
    this.minDriftOverlap = o.minDriftOverlap ?? 0.08;
    this.driftRepeatRate = o.driftRepeatRate ?? 0.25;
    this.badWindowsToStop = o.badWindowsToStop ?? 3;
    this.enableDrift = !!o.sourceText;
    this.sourceKeys = o.sourceText ? topKeywords(o.sourceText, o.sourceTopK ?? 50) : null;
    this.buffer = "";
    this.badCount = 0;
  }

  /**
   * Append a chunk of streamed text and re-evaluate the rolling window.
   *
   * @param {string|null|undefined} chunk - Next piece of output; null and
   *   undefined are treated as empty.
   * @returns {{ok: boolean, reason?: string}} `ok: false` plus a reason once
   *   enough bad windows have accumulated, otherwise `ok: true`.
   */
  push(chunk) {
    // Without the fallback, a null/undefined chunk would be concatenated as the
    // literal text "null"/"undefined" and pollute both the buffer and the score.
    this.buffer += chunk ?? "";

    const recent = this.buffer.slice(-this.windowChars);
    const toks = tokenize(recent);
    if (toks.length < 30) return { ok: true }; // too early to judge

    const uniq = new Set(toks);
    const novelty = uniq.size / toks.length;

    // Count every bigram occurrence after its first as a repeat.
    const bigrams = ngrams(toks, 2);
    const seen = new Set();
    let reps = 0;
    for (const gram of bigrams) {
      if (seen.has(gram)) reps++;
      else seen.add(gram);
    }
    const repeatRate = bigrams.length ? reps / bigrams.length : 0;

    let overlap = 1;
    let driftOk = true;
    if (this.enableDrift && this.sourceKeys && this.sourceKeys.size) {
      let hits = 0;
      for (const t of uniq) if (this.sourceKeys.has(t)) hits++;
      overlap = hits / Math.max(1, uniq.size);
      driftOk = overlap >= this.minDriftOverlap;
    }

    // Degeneracy is judged independently of drift, so an off-topic stream is
    // never treated more leniently than an on-topic one.
    const degenerate = novelty < this.minNovelty && repeatRate > this.maxRepeatRate;
    // An off-topic window is bad as soon as it is also moderately repetitive.
    const drifted = !driftOk && repeatRate > this.driftRepeatRate;

    if (degenerate || drifted) this.badCount++;
    else this.badCount = Math.max(0, this.badCount - 1);

    if (this.badCount >= this.badWindowsToStop) {
      // Name the drift rule only when it alone flagged the current window.
      const reason = drifted && !degenerate
        ? `source-drift/high-repeat (${overlap.toFixed(2)}/${repeatRate.toFixed(2)})`
        : `low-novelty/high-repeat (${novelty.toFixed(2)}/${repeatRate.toFixed(2)})`;
      return { ok: false, reason };
    }
    return { ok: true };
  }

  /**
   * @returns {string} Everything pushed so far, concatenated.
   */
  snapshot() { return this.buffer; }
}
// CommonJS export: StreamGuard is the only name exposed by this module.
module.exports = { StreamGuard };