UNPKG

graphlit-client

Version:
184 lines (183 loc) 7.46 kB
/**
 * Breaks an LLM’s streaming token deltas into character, word, or sentence
 * chunks – or lets you plug in your own chunker.
 *
 * Usage
 * -----
 *   const buf = new ChunkBuffer('sentence');
 *   stream.on('delta', d => buf.addToken(d).forEach(pushToUI));
 *   stream.on('end',   () => buf.flush().forEach(pushToUI));
 */
const hasSegmenter = typeof Intl !== "undefined" && "Segmenter" in Intl;

export class ChunkBuffer {
  // ────────────────────────────────────────────────────────────────────
  // state
  // ────────────────────────────────────────────────────────────────────

  /** Text received so far that has not been emitted yet. */
  buffer = "";
  /** "character" | "word" | "custom"; any other value is handled as sentence. */
  strategy;
  /** User-supplied chunker, set only when a function was passed as strategy. */
  customChunker;
  /** A run of word-like text longer than this is force-emitted by flushWords(). */
  MAX_WORD_LEN;
  /** Whitespace-free buffers longer than this are force-split by flushLongRuns(). */
  MAX_BUFFER_NO_BREAK;
  // These are only defined when Intl.Segmenter exists.
  graphemeSeg;
  wordSeg;
  sentenceSeg;

  // ────────────────────────────────────────────────────────────────────
  // public API
  // ────────────────────────────────────────────────────────────────────

  /**
   * @param {"character"|"word"|"sentence"|Function} strategy
   *   Built-in strategy name, or a custom chunker called with the buffered
   *   text and expected to return `{ chunks, remainder }`.
   * @param {{maxWordLen?: number, maxBufferNoBreak?: number}} [opts]
   *   `maxWordLen` defaults to 50, `maxBufferNoBreak` defaults to 400.
   */
  constructor(strategy, opts = {}) {
    if (typeof strategy === "function") {
      this.customChunker = strategy;
      this.strategy = "custom";
    } else {
      this.strategy = strategy;
    }

    // `opts?.` tolerates an explicit `null`, which the `= {}` default does not cover.
    this.MAX_WORD_LEN = opts?.maxWordLen ?? 50;
    this.MAX_BUFFER_NO_BREAK = opts?.maxBufferNoBreak ?? 400;

    if (hasSegmenter) {
      this.graphemeSeg = new Intl.Segmenter(undefined, {
        granularity: "grapheme",
      });
      this.wordSeg = new Intl.Segmenter(undefined, { granularity: "word" });
      this.sentenceSeg = new Intl.Segmenter(undefined, {
        granularity: "sentence",
      });
    }
  }

  /**
   * Feed one LLM delta; receive zero‑or‑more flushed chunks.
   *
   * @param {string|null|undefined} token  Delta text. `null`/`undefined` are
   *   treated as an empty delta instead of being stringified into the buffer.
   * @returns {string[]} Chunks that became complete with this delta.
   */
  addToken(token) {
    this.buffer += token ?? "";

    if (this.customChunker) return this.flushCustom();

    // emergency bailout for giant uninterrupted text
    const forced = this.flushLongRuns();
    const fresh = this.flushByStrategy();
    return forced.concat(fresh);
  }

  /**
   * Call when the stream closes to emit the final remainder.
   *
   * @returns {string[]} Every remaining chunk; the buffer is empty afterwards.
   */
  flush() {
    if (!this.buffer.length) return [];

    if (this.customChunker) {
      // Goes through flushCustom() so a throwing chunker is handled the same
      // way as in addToken(): the buffered text is returned, not lost.
      const chunks = this.flushCustom();
      const rest = this.buffer;
      this.buffer = "";
      return [...chunks, rest].filter(Boolean);
    }

    // Re‑use the normal strategy until nothing more flushes.
    const out = [];
    for (
      let next = this.flushByStrategy();
      next.length;
      next = this.flushByStrategy()
    ) {
      out.push(...next);
    }

    if (this.buffer) out.push(this.buffer);
    this.buffer = "";
    return out;
  }

  // ────────────────────────────────────────────────────────────────────
  // internals
  // ────────────────────────────────────────────────────────────────────

  /** Runs the flusher for the configured built-in strategy (sentence by default). */
  flushByStrategy() {
    switch (this.strategy) {
      case "character":
        return this.flushGraphemes();
      case "word":
        return this.flushWords();
      default:
        return this.flushSentences();
    }
  }

  // -- character ------------------------------------------------------

  /** Emits every grapheme segment except the last, which stays buffered. */
  flushGraphemes() {
    if (!hasSegmenter) return []; // unreachable on modern runtimes

    const segs = Array.from(
      this.graphemeSeg.segment(this.buffer),
      (s) => s.segment,
    );

    /* Strategy: always keep exactly one segment in the buffer.
       If we only have one segment so far, we don’t know whether it’s
       complete (could be half a surrogate pair). Wait for more. */
    if (segs.length <= 1) return [];

    this.buffer = segs[segs.length - 1];
    return segs.slice(0, -1);
  }

  // -- word -----------------------------------------------------------

  /**
   * Emits each word together with the non-word text that follows it, once a
   * later word-like segment shows that this trailing text is complete.
   * Leading non-word text is emitted on its own, and always *before* the
   * words that follow it, so concatenating the chunks reproduces the input.
   */
  flushWords() {
    if (!hasSegmenter) return []; // unreachable on modern runtimes

    const chunks = [];
    let leadNonWord = "";
    let word = "";
    let tailNonWord = "";

    for (const s of this.wordSeg.segment(this.buffer)) {
      if (s.isWordLike) {
        if (leadNonWord) {
          // A word follows: the leading run is complete. Emit it now so it
          // keeps its position ahead of every later chunk.
          chunks.push(leadNonWord);
          leadNonWord = "";
        }
        if (word && tailNonWord) {
          // previous word finished
          chunks.push(word + tailNonWord);
          word = "";
          tailNonWord = "";
        }
        word += s.segment;
        if (word.length > this.MAX_WORD_LEN) {
          // force‑break huge “word”
          chunks.push(word + tailNonWord);
          word = "";
          tailNonWord = "";
        }
      } else if (word) {
        tailNonWord += s.segment; // trailing whitespace / punctuation
      } else {
        leadNonWord += s.segment; // leading whitespace / punctuation
      }
    }

    // At most one of (leadNonWord) and (word + tailNonWord) is non-empty here:
    // the leading run is cleared whenever a word starts.
    this.buffer = leadNonWord + word + tailNonWord;
    return chunks;
  }

  // -- sentence -------------------------------------------------------

  /**
   * Emits the sentences that lie before the last terminator which is followed
   * by whitespace or by the current end of the buffer.
   */
  flushSentences() {
    if (!hasSegmenter) return []; // unreachable on modern runtimes

    // Find the last boundary with a regex. Terminators: . ? ! plus the CJK
    // ideographic full stop (U+3002) and full-width ! ? (U+FF01, U+FF1F),
    // written as escapes so they survive text normalisation.
    // Negative‑look‑behind for ellipsis left out for perf.
    const boundary = /.*?[.?!\u3002\uFF01\uFF1F](\s+|$)/g;
    let last = -1;
    while (boundary.exec(this.buffer)) last = boundary.lastIndex;
    if (last === -1) return [];

    const slice = this.buffer.slice(0, last);
    this.buffer = this.buffer.slice(last);
    return Array.from(this.sentenceSeg.segment(slice), (s) => s.segment).filter(
      Boolean,
    );
  }

  // -- long‑run bailout ----------------------------------------------

  /**
   * If the buffer exceeds MAX_BUFFER_NO_BREAK and contains no whitespace,
   * emits its first MAX_BUFFER_NO_BREAK UTF-16 code units as one chunk
   * (one unit more when the cut would otherwise split a surrogate pair).
   */
  flushLongRuns() {
    if (
      this.buffer.length <= this.MAX_BUFFER_NO_BREAK ||
      /\s/.test(this.buffer)
    ) {
      return [];
    }

    let cut = this.MAX_BUFFER_NO_BREAK;
    const before = this.buffer.charCodeAt(cut - 1);
    const after = this.buffer.charCodeAt(cut); // in range: length > cut
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) cut += 1; // keep both halves of the pair in the head

    const head = this.buffer.slice(0, cut);
    this.buffer = this.buffer.slice(cut);
    return [head];
  }

  // -- custom ---------------------------------------------------------

  /**
   * Runs the custom chunker on the buffer and keeps its remainder buffered.
   * If the chunker throws, the error is logged and the whole buffer is
   * returned as a single chunk.
   */
  flushCustom() {
    try {
      const { chunks, remainder } = this.customChunker(this.buffer);
      // A chunker that omits a field must not turn the buffer into
      // `undefined` (the next `+=` would produce "undefined…").
      this.buffer = remainder ?? "";
      return chunks ?? [];
    } catch (err) {
      console.error(
        "Custom chunker failed – flushing whole buffer to avoid data loss",
        err,
      );
      const all = this.buffer;
      this.buffer = "";
      return [all];
    }
  }
}