node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
JavaScript
import { LlamaText, SpecialTokensText } from "../../utils/LlamaText.js";
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { maxRecentDetokenizerTokens } from "../../consts.js";
/**
 * Chunk the given document using the given context sequence, so the resulting chunks can be used for RAG (Retrieval Augmented Generation) embeddings.
*
* This chunking method is fast and efficient, and utilizes as much parallelization as your hardware allows.
*
* Based on https://github.com/ZeroEntropy-AI/llama-chunk
* @experimental - this API is experimental and may change or be removed in subsequent releases
* @hidden
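 * @example
 * A rough usage sketch (the setup below assumes the package's `getLlama()`, `loadModel()`,
 * `createContext()` and `getSequence()` APIs and a `modelPath` variable pointing at a local GGUF file; adjust to your setup):
 * ```js
 * const llama = await getLlama();
 * const model = await llama.loadModel({modelPath});
 * const context = await model.createContext();
 *
 * const chunks = await experimentalChunkDocument({
 *     contextSequence: context.getSequence(),
 *     document: documentText,
 *     maxChunkSize: 500,
 *     onChunkText(chunkText) {
 *         console.log("chunk:", chunkText);
 *     }
 * });
 * ```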
*/
export async function experimentalChunkDocument(options) {
    const {
        contextSequence,
        document,
        separatorTokens = findAppropriateSeparatorTokens(contextSequence.model),
        getSystemPrompt = getDefaultPrompt,
        maxChunkSize = 500,
        maxChunkSizeAlignmentCurve = 4,
        syntaxAlignment: {
            maxTokens: maxSyntaxAlignment = 4,
            trimmedTexts: syntaxAlignmentTrimmedTexts = ["", ".", ";"]
        } = {},
        skipFirstTokens = 3,
        normalizationTrailSize = 100
    } = options;
const onChunkTokens = safeEventCallback(options.onChunkTokens);
const onChunkText = safeEventCallback(options.onChunkText);
if (separatorTokens.length === 0)
throw new Error("Separator tokens must be provided");
const chatHistory = [{
type: "system",
text: LlamaText(getSystemPrompt({
separatorTokens,
tokenizer: contextSequence.model.tokenizer,
maxChunkSize: maxChunkSize <= 0
? undefined
: maxChunkSize
})).toJSON()
}, {
type: "user",
text: document
}, {
type: "model",
response: [""]
}];
const chatWrapper = resolveChatWrapper(contextSequence.model);
const { contextText } = chatWrapper.generateContextState({ chatHistory });
const initialContextTokens = contextText.tokenize(contextSequence.model.tokenizer, "trimLeadingSpace");
const documentTokens = contextSequence.model.tokenize(document, false, "trimLeadingSpace");
const syntaxAlignmentTrimmedTextsSet = new Set(syntaxAlignmentTrimmedTexts);
if (initialContextTokens.length + documentTokens.length > contextSequence.context.contextSize)
throw new Error("The context size is too small to chunk the given document");
const evaluateInput = initialContextTokens.slice();
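    // queue every document token (except the last) for evaluation,
    // requesting next-token probabilities at each position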
for (let i = 0; i < documentTokens.length - 1; i++) {
const token = documentTokens[i];
evaluateInput.push([token, {
generateNext: {
probabilities: true
}
}]);
}
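    // `weight` is a rolling product of the probabilities of the actual next document tokens
    // over the last `normalizationTrailSize` positions; it's used to boost the separator
    // probability when the recent text has been highly predictable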
let weight = 1;
const recentProbabilitiesTrail = [];
let chunkStartIndex = 0;
let lastPushedSeparatorIndex = 0;
const chunks = [];
const res = [];
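    // close the current chunk at the given index: extend the end over up to `maxSyntaxAlignment`
    // trailing tokens whose trimmed text is in `syntaxAlignmentTrimmedTexts` (so splits don't
    // strand trivial punctuation), then record the chunk and fire the callbacks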
function pushSeparatorIndex(separateIndex, separatorToken) {
lastPushedSeparatorIndex = separateIndex;
if (separateIndex <= chunkStartIndex)
return;
let endIndex = separateIndex;
for (let i = 0; i < maxSyntaxAlignment && documentTokens[endIndex + i] != null; i++) {
const text = contextSequence.model.detokenize([documentTokens[endIndex + i]]);
if (!syntaxAlignmentTrimmedTextsSet.has(text.trim()))
break;
endIndex++;
}
const chunk = documentTokens.slice(chunkStartIndex, endIndex);
const text = contextSequence.model.detokenize(chunk, false, documentTokens.slice(chunkStartIndex - maxRecentDetokenizerTokens, chunkStartIndex));
chunks.push(chunk);
chunkStartIndex = endIndex;
onChunkTokens?.(chunk, separatorToken);
onChunkText?.(text, separatorToken);
res.push(text);
}
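    // evaluate the prompt and the document in a single pass;
    // the callback decides where to split based on the next-token probabilities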
await contextSequence.controlledEvaluate(evaluateInput, {
onTokenResult(inputTokenIndex, result) {
const i = inputTokenIndex - initialContextTokens.length;
const nextProbabilities = result?.next?.probabilities;
const nextDocumentToken = documentTokens[i + 1];
if (nextProbabilities == null)
throw new Error("received no result for token " + i);
const topProbabilityScore = nextProbabilities.entries()
.next().value?.[1];
const [usedSeparatorToken, separatorProbability] = separatorTokens
.filter((token) => token !== nextDocumentToken) // avoid splitting on document tokens
.map((token) => [token, nextProbabilities.get(token)])
.filter((pair) => pair[1] != null)
.reduce(([tokenA, probabilityA], [tokenB, probabilityB]) => {
if (probabilityA >= probabilityB)
return [tokenA, probabilityA];
return [tokenB, probabilityB];
}, [separatorTokens[0], 0]);
if (topProbabilityScore == null || separatorProbability == null || separatorProbability === 0)
return;
// console.log(
// i, contextSequence.model.detokenize([documentTokens[i]!]),
// Array.from(nextProbabilities.entries()).slice(0, 5)
// .map(([token, probability]) => [contextSequence.model.detokenize([token], true), probability])
// );
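            // split right away when the separator is at least as likely as the top candidate token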
if (separatorProbability >= topProbabilityScore)
pushSeparatorIndex(i + 1, usedSeparatorToken);
else if (i > skipFirstTokens) {
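                // otherwise, boost the separator probability using the rolling weight
                // before comparing it against the top candidate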
const adjustedProbability = separatorProbability + (weight * (1 - separatorProbability));
let maxChunkSizeAlignment = 0;
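                // as the current chunk approaches `maxChunkSize`, add an alignment bonus
                // (shaped by `maxChunkSizeAlignmentCurve`) so a split becomes increasingly likely;
                // reaching `maxChunkSize` effectively forces a split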
if (maxChunkSize !== 0 && adjustedProbability < topProbabilityScore) {
const leftProbability = 1 - adjustedProbability;
const currentChunkSize = Math.max(0, 1 + i - chunkStartIndex);
maxChunkSizeAlignment = currentChunkSize === 0
? 0
: adjustExponential(leftProbability * Math.min(1, currentChunkSize / maxChunkSize), maxChunkSizeAlignmentCurve <= 0
? 1
: maxChunkSizeAlignmentCurve, 0.8);
if (currentChunkSize === maxChunkSize)
maxChunkSizeAlignment = 1;
}
if (adjustedProbability + maxChunkSizeAlignment >= topProbabilityScore && adjustedProbability > 0) {
pushSeparatorIndex(i + 1, usedSeparatorToken);
// update the weight of the current token with the adjusted probability in the trail
if (recentProbabilitiesTrail.length > 1) {
weight /= recentProbabilitiesTrail.pop();
recentProbabilitiesTrail.push(adjustedProbability);
weight *= adjustedProbability;
}
}
}
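            // track the probability of the token that actually comes next in the document
            // to keep the rolling weight and its normalization trail up to date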
const nextDocumentTokenProbability = nextDocumentToken == null
? undefined
: nextProbabilities.get(nextDocumentToken);
if (nextDocumentTokenProbability != null && nextDocumentTokenProbability > 0) {
recentProbabilitiesTrail.push(nextDocumentTokenProbability);
weight *= nextDocumentTokenProbability;
if (recentProbabilitiesTrail.length > normalizationTrailSize)
weight /= recentProbabilitiesTrail.shift();
}
}
});
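    // emit whatever remains after the last split as the final chunk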
if (lastPushedSeparatorIndex !== documentTokens.length)
pushSeparatorIndex(documentTokens.length, separatorTokens[0]);
return res;
}
const idealTokenTexts = [
"\u6bb5", // means "section" in Chinese (according to https://github.com/ZeroEntropy-AI/llama-chunk)
"\u987f", // means "pause" in Chinese (according to Llama 3.1 8B and Qwen 2.5 3B)
"\u00a1", // inverted exclamation mark
"|",
"_"
];
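// Scan the model's vocabulary for tokens whose trimmed text matches one of the ideal separator
// texts, preferring tokens whose raw text matches exactly, and return up to `maxTokens` of them
// in the priority order of `idealTokenTexts`.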
function findAppropriateSeparatorTokens(model, maxTokens = 2) {
const idealTextsSet = new Set(idealTokenTexts);
const foundTokens = [];
for (const token of model.iterateAllTokens()) {
if (model.isSpecialToken(token))
continue;
const text = model.detokenize([token]);
const trimmedText = text.trim();
if (idealTextsSet.has(trimmedText)) {
const textIndex = idealTokenTexts.findIndex((idealText) => idealText === trimmedText);
if (foundTokens[textIndex] == null || text === trimmedText)
foundTokens[textIndex] = token;
}
}
const res = [];
for (let i = 0; i < idealTokenTexts.length; i++) {
const token = foundTokens[i];
if (token != null)
res.push(token);
}
return res.slice(0, maxTokens);
}
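// Build the default system prompt that instructs the model to repeat the document verbatim
// while inserting the separator tokens as split markers (adapted from the llama-chunk prompt).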
function getDefaultPrompt({ separatorTokens, tokenizer, maxChunkSize = 500 }) {
if (separatorTokens.length === 0)
throw new Error("No separator tokens provided");
else if (separatorTokens.length > 2)
throw new Error("Maximum of 2 separator tokens are supported");
return LlamaText.joinValues("\n", [
'Your job is to act as a "Chunker", for usage in RAG pipelines. The user will provide a long document.',
"",
"You should repeat the exact same message verbatim. EXCEPT, you should insert split tokens throughout the document.",
"",
"# Instructions",
LlamaText([
"- For splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[0]])),
'` as the "big split token" separator.'
]),
separatorTokens.length > 1 && (LlamaText([
"- For small splits, use `",
new SpecialTokensText(tokenizer.detokenize([separatorTokens[1]])),
            '` as the "small split token" separator.'
])),
"- For example, in text document, small splits will be per-sentence, and big splits will be per-section. Do a big split BEFORE the header that defines a section.",
LlamaText([
"- You may get a user message that is unstructured or not structured cleanly. " +
"Still try to split that input as best as you can, even if it means doing a small split every ", Math.ceil(maxChunkSize / 5),
" characters, and a big split every ", Math.floor(maxChunkSize), " characters."
]),
"- You should prefer to wait until the end of a newline or period to break, instead of breaking one or two tokens before that. If there are no newlines or periods, pick some other reasonable breakpoints instead.",
"- Your input could be anything - code, HTML, markdown, etc. You MUST try to output SOME split regardless of the input. Pick something reasonable! E.g. for nodejs, do a small split after every line or code block, and a big split after every function or class definitions.",
'- For HTML, add a small split token after every closing tag and sentence. Add a big split token after every closing tag of an "important" tag.',
"- Please note that you will sometimes not see your own splits in your previous output, that's OK, you MUST continue to try to output split tokens"
].filter((x) => x !== false));
}
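// Blend a linear response with an exponential curve: `weight` controls how much of
// `value ** exponent` is mixed in; inputs outside [0, 1] are clamped.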
function adjustExponential(value, exponent, weight) {
if (value < 0)
return 0;
else if (value > 1)
return 1;
return (value * (1 - weight)) + (weight * Math.pow(value, exponent));
}
//# sourceMappingURL=chunkDocument.js.map