node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
979 lines • 113 kB
JavaScript
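// Minimal usage sketch of the high-level API built on top of this module
// (illustrative; assumes a GGUF model file is available locally at the given path):
//
//   import {getLlama, LlamaChatSession} from "node-llama-cpp";
//
//   const llama = await getLlama();
//   const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
//   const context = await model.createContext();
//   const session = new LlamaChatSession({contextSequence: context.getSequence()});
//
//   console.log(await session.prompt("Hi there, how are you?"));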
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { isChatModelResponseFunctionCall, isChatModelResponseSegment, allSegmentTypes } from "../../types.js";
import { removeNullFields } from "../../utils/removeNullFields.js";
import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
import { LlamaText, SpecialToken } from "../../utils/LlamaText.js";
import { StopGenerationDetector } from "../../utils/StopGenerationDetector.js";
import { TokenStreamRegulator } from "../../utils/TokenStreamRegulator.js";
import { maxRecentDetokenizerTokens, UNKNOWN_UNICODE_CHAR } from "../../consts.js";
import { getQueuedTokensBeforeStopTrigger } from "../../utils/getQueuedTokensBeforeStopTrigger.js";
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { pushAll } from "../../utils/pushAll.js";
import { resolveLastTokens } from "../../utils/resolveLastTokens.js";
import { LlamaSampler } from "../LlamaContext/LlamaSampler.js";
import { getChatWrapperSegmentDefinition } from "../../utils/getChatWrapperSegmentDefinition.js";
import { jsonDumps } from "../../chatWrappers/utils/jsonDumps.js";
import { eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy } from "./utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js";
import { FunctionCallNameGrammar } from "./utils/FunctionCallNameGrammar.js";
import { FunctionCallParamsGrammar } from "./utils/FunctionCallParamsGrammar.js";
const defaultContextShiftOptions = {
size: (sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)),
strategy: "eraseFirstResponseAndKeepFirstSystem",
lastEvaluationMetadata: null
};
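// With these defaults, each context shift frees a tenth of the context: e.g. for a sequence
// whose context holds 4096 tokens, `size` resolves to Math.floor(4096 / 10) = 409 tokens.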
const defaultRepeatPenaltyLastTokens = 64;
const defaultTrimWhitespaceSuffix = false;
const defaultEvaluationPriority = 5;
export class LlamaChat {
/** @internal */ _chatWrapper;
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _chatLock = {};
/** @internal */ _sequence;
onDispose = new EventRelay();
constructor({ contextSequence, chatWrapper = "auto", autoDisposeSequence = false }) {
if (contextSequence == null)
throw new Error("contextSequence cannot be null");
if (contextSequence.disposed)
throw new DisposedError();
this._sequence = contextSequence;
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._sequence.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._chatWrapper = chatWrapper === "auto"
? resolveChatWrapper(contextSequence.model)
: chatWrapper;
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._sequence == null)
return;
if (disposeSequence)
this._sequence.dispose();
this._sequence = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._sequence == null;
}
get chatWrapper() {
if (this._sequence == null)
throw new DisposedError();
return this._chatWrapper;
}
get sequence() {
if (this._sequence == null)
throw new DisposedError();
return this._sequence;
}
get context() {
return this.sequence.context;
}
get model() {
return this.sequence.model;
}
async generateResponse(history, options = {}) {
const { onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? ""));
const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
onTextChunk,
onToken,
onResponseChunk,
signal,
stopOnAbortSignal,
maxTokens,
temperature,
minP,
topK,
topP,
seed,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
functions,
onFunctionCall,
documentFunctionParams,
maxParallelFunctionCalls,
contextShift,
customStopTriggers,
lastEvaluationContextWindow: {
history: lastEvaluationContextWindowHistory,
minimumOverlapPercentageToPreventContextShift
}
});
if (generateResponseState.grammar != null && generateResponseState.functionsEnabled)
throw new Error("Using both grammar and functions is not supported yet");
return await withLock(this._chatLock, "evaluate", signal, async () => {
try {
generateResponseState.ensureLastHistoryItemIsModel();
generateResponseState.ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded();
const loadContextWindow = async (avoidReloadingHistory = false) => {
await generateResponseState.loadContextWindow(generateResponseState.getResolvedHistoryWithCurrentModelResponse(), generateResponseState.getContextWindowsHistoryWithCurrentModelResponse(), false, avoidReloadingHistory);
};
const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true);
while (true) {
generateResponseState.startTokenLoop();
generateResponseState.canAvoidReloadingHistory = false;
await loadContextWindow();
generateResponseState.addStopGenerationTriggersFromChatWrapper();
if (generateResponseState.generatedTokens === 0) {
generateResponseState.addIgnoreStartTextTriggersFromChatWrapper();
if (generateResponseState.functionsEnabled) {
generateResponseState.initFunctions();
}
}
if (generateResponseState.functionEvaluationMode !== false) {
const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
if (functionsCallsRes != null)
return functionsCallsRes;
await loadContextWindowForFunctionCallingLoop();
}
await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
await generateResponseState.createNewEvaluationIterator();
while (await generateResponseState.iterateEvaluation()) {
if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
generateResponseState.detectAndHandleFunctionStartSyntax();
if (generateResponseState.functionEvaluationMode !== false) {
generateResponseState.canAvoidReloadingHistory = false;
generateResponseState.releasePartiallyFreeTokensBeforeFunctionCallStart();
const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
if (functionsCallsRes != null)
return functionsCallsRes;
}
generateResponseState.recordStopGenerationEvaluation();
generateResponseState.popStreamRegulatorFreeTokens();
generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens();
const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model");
if (stopGenerationTriggerRes != null)
return stopGenerationTriggerRes;
generateResponseState.spliceIgnoreStartTextDetectedTokens();
generateResponseState.moveFreePendingTokensToRes();
}
const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("model");
if (maxTokensTriggerRes != null)
return maxTokensTriggerRes;
if (generateResponseState.updateShouldContextShift())
break;
const abortRes = generateResponseState.handleAbortTrigger("model");
if (abortRes != null)
return abortRes;
}
generateResponseState.isFirstEvaluation = false;
if (generateResponseState.shouldContextShift)
continue;
break;
}
throw new Error("The context size is too small to generate a response");
}
finally {
await generateResponseState.dispose();
}
});
}
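/*
 * Usage sketch for `generateResponse` (illustrative; assumes `contextSequence` was
 * obtained elsewhere via `context.getSequence()`):
 *
 *   const chat = new LlamaChat({contextSequence});
 *   const {response, metadata} = await chat.generateResponse([
 *       {type: "system", text: "You are a helpful assistant"},
 *       {type: "user", text: "Hi there"}
 *   ], {
 *       maxTokens: 256,
 *       onTextChunk(chunk) {
 *           process.stdout.write(chunk);
 *       }
 *   });
 */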
async loadChatAndCompleteUserMessage(history, options = {}) {
const { initialUserPrompt = "", stopOnAbortSignal = false, onTextChunk, onToken, signal, maxTokens = Math.min(256, Math.ceil(this.context.contextSize / 2)), temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.8 } = {} } = options;
this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize((findLastModelMessageInChatHistory(history)?.response ?? [])
.map((item) => {
if (typeof item === "string")
return item;
else if (isChatModelResponseFunctionCall(item))
return null;
else if (isChatModelResponseSegment(item))
return item.text;
void item;
return null;
})
.filter((item) => item != null)
.join(" ")));
const generateResponseState = new GenerateResponseState(this, this._chatWrapper, mergeGeneratedResultWithChatHistory("user", history, [initialUserPrompt]), {
onTextChunk,
onToken,
signal,
stopOnAbortSignal,
maxTokens,
temperature,
minP,
topK,
topP,
seed,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
functions,
documentFunctionParams,
contextShift,
customStopTriggers,
lastEvaluationContextWindow: {
history: mergeGeneratedResultWithChatHistory("user", lastEvaluationContextWindowHistory ?? history, [initialUserPrompt]),
minimumOverlapPercentageToPreventContextShift
}
});
return await withLock(this._chatLock, "evaluate", signal, async () => {
try {
generateResponseState.ensureLastHistoryItemIsUser();
while (true) {
generateResponseState.startTokenLoop();
const { userTextSuffix } = await generateResponseState.loadContextWindow(mergeGeneratedResultWithChatHistory("user", generateResponseState.resolvedHistory, generateResponseState.segmentHandler.getModelResponseSegments()), mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()), true);
generateResponseState.functionEvaluationMode = false;
generateResponseState.addStopGenerationTriggersFromChatWrapper();
if (userTextSuffix != null && userTextSuffix.values.length > 0)
generateResponseState.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer));
await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
if (generateResponseState.maxTokens === 0) {
await generateResponseState.evaluateWithoutGeneratingNewTokens();
return {
completion: "",
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: generateResponseState.lastHistoryCompressionMetadata
},
metadata: {
stopReason: "maxTokens"
}
};
}
await generateResponseState.createNewEvaluationIterator();
while (await generateResponseState.iterateEvaluation()) {
if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
generateResponseState.recordStopGenerationEvaluation();
generateResponseState.popStreamRegulatorFreeTokens();
const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user");
if (stopGenerationTriggerRes != null)
return {
completion: stopGenerationTriggerRes.response,
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: stopGenerationTriggerRes.lastEvaluation.contextShiftMetadata
},
metadata: stopGenerationTriggerRes.metadata
};
generateResponseState.moveFreePendingTokensToRes(false);
}
const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("user");
if (maxTokensTriggerRes != null)
return {
completion: maxTokensTriggerRes.response,
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: maxTokensTriggerRes.lastEvaluation.contextShiftMetadata
},
metadata: maxTokensTriggerRes.metadata
};
if (generateResponseState.updateShouldContextShift())
break;
const abortRes = generateResponseState.handleAbortTrigger("user");
if (abortRes != null)
return {
completion: abortRes.response,
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: abortRes.lastEvaluation.contextShiftMetadata
},
metadata: abortRes.metadata
};
}
generateResponseState.isFirstEvaluation = false;
if (generateResponseState.shouldContextShift)
continue;
break;
}
throw new Error("The context size is too small to generate a completion");
}
finally {
await generateResponseState.dispose();
}
});
}
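/*
 * Usage sketch for `loadChatAndCompleteUserMessage` (illustrative): given a `LlamaChat`
 * instance `chat` and a history that ends with a partial user message, this completes
 * the user's text instead of generating a model reply:
 *
 *   const {completion} = await chat.loadChatAndCompleteUserMessage(history, {
 *       initialUserPrompt: "Write a poem about ",
 *       maxTokens: 64
 *   });
 */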
}
function removeRawFromHistoryItem(historyItem) {
if (historyItem.type === "model") {
const newHistoryItem = { ...historyItem };
newHistoryItem.response = newHistoryItem.response.map((item) => {
if (typeof item === "string")
return item;
else if (isChatModelResponseFunctionCall(item))
return {
...item,
rawCall: undefined
};
else if (isChatModelResponseSegment(item))
return {
...item,
raw: undefined
};
void item;
return item;
});
return newHistoryItem;
}
return historyItem;
}
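// Used when the sequence state is not loaded to memory (see `removeRawFromHistory` in
// getContextWindow below): cached raw renderings of function calls and segments are
// dropped so the chat wrapper re-renders them from the structured history.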
async function compressHistoryToFitContextSize({ history, contextShiftSize, contextShiftStrategy, contextShiftLastEvaluationMetadata, contextSize, tokenizer, chatWrapper, functions, documentFunctionParams }) {
function checkIfHistoryFitsContext(history) {
const { contextText } = chatWrapper.generateContextState({
chatHistory: history,
availableFunctions: functions,
documentFunctionParams
});
const tokens = contextText.tokenize(tokenizer);
return tokens.length <= contextSize - contextShiftSize;
}
if (contextSize - contextShiftSize <= 0)
throw new Error(`The context size (${contextSize}) is too small to fit the context shift size (${contextShiftSize})`);
if (checkIfHistoryFitsContext(history))
return {
compressedHistory: history,
metadata: null
};
if (contextShiftStrategy instanceof Function) {
try {
const { chatHistory, metadata } = await contextShiftStrategy({
chatHistory: history,
maxTokensCount: contextSize - contextShiftSize,
tokenizer,
chatWrapper,
lastShiftMetadata: contextShiftLastEvaluationMetadata
});
if (checkIfHistoryFitsContext(chatHistory))
return {
compressedHistory: chatHistory,
metadata
};
console.warn("The provided context shift strategy did not return a history that fits the context size. " +
"Using the default strategy instead.");
}
catch (err) {
console.error("The provided context shift strategy threw an error. " +
"Using the default strategy instead.", err);
}
}
else if (contextShiftStrategy !== "eraseFirstResponseAndKeepFirstSystem")
console.warn(`Unknown context shift strategy "${contextShiftStrategy}". ` +
"Using the default strategy instead.");
const { chatHistory, metadata } = await eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({
chatHistory: history,
maxTokensCount: contextSize - contextShiftSize,
tokenizer,
chatWrapper,
lastShiftMetadata: contextShiftLastEvaluationMetadata
});
if (!checkIfHistoryFitsContext(chatHistory))
throw new Error("The default context shift strategy did not return a history that fits the context size. " +
"This may happen due to the system prompt being too long");
return {
compressedHistory: chatHistory,
metadata
};
}
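/*
 * A custom `contextShift.strategy` may be supplied as a function with the signature
 * used above (illustrative sketch; the actual trimming logic is left abstract):
 *
 *   async function dropOldestItems({chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata}) {
 *       const trimmedHistory = chatHistory.slice();
 *       // drop or shorten the oldest items until the rendered history fits maxTokensCount;
 *       // any bookkeeping returned as `metadata` is received back on the next shift
 *       // via `lastShiftMetadata`
 *       return {chatHistory: trimmedHistory, metadata: lastShiftMetadata};
 *   }
 */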
function getLastModelMessageFullResponseFromChatHistory(chatHistory) {
const lastModelResponseItem = chatHistory.at(-1);
if (lastModelResponseItem == null || lastModelResponseItem.type !== "model")
return [];
return lastModelResponseItem.response;
}
function getLastUserTextFromChatHistory(chatHistory) {
if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "user")
return "";
return chatHistory[chatHistory.length - 1].text;
}
function setLastUserTextInChatHistory(chatHistory, userText) {
const newChatHistory = chatHistory.slice();
if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "user")
newChatHistory.push({
type: "user",
text: ""
});
const lastUserItem = newChatHistory[newChatHistory.length - 1];
const newLastUserItem = { ...lastUserItem };
newChatHistory[newChatHistory.length - 1] = newLastUserItem;
newLastUserItem.text = userText;
return newChatHistory;
}
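// Example: merging generatedResult ["Hello", " world"] into a history whose last item is
// {type: "model", response: ["Hi"]} yields {type: "model", response: ["HiHello", " world"]}:
// adjacent string parts are concatenated and the remaining parts are appended.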
function mergeGeneratedResultWithChatHistory(itemType, chatHistory, generatedResult) {
if (generatedResult.length === 0 || (generatedResult.length === 1 && generatedResult[0] === ""))
return chatHistory;
const newChatHistory = chatHistory.slice();
if (itemType === "user") {
let lastUserItem = newChatHistory.at(-1);
if (lastUserItem?.type !== "user") {
lastUserItem = {
type: "user",
text: ""
};
newChatHistory.push(lastUserItem);
}
const newLastUserItem = { ...lastUserItem };
newChatHistory[newChatHistory.length - 1] = newLastUserItem;
newLastUserItem.text += generatedResult
.map((item) => {
if (typeof item === "string")
return item;
return item.text;
})
.join("");
return newChatHistory;
}
else {
let lastModelItem = newChatHistory.at(-1);
if (lastModelItem?.type !== "model") {
lastModelItem = {
type: "model",
response: []
};
newChatHistory.push(lastModelItem);
}
const newLastModelItem = { ...lastModelItem };
newChatHistory[newChatHistory.length - 1] = newLastModelItem;
const modelResponse = newLastModelItem.response.slice();
newLastModelItem.response = modelResponse;
const firstGeneratedResultItem = generatedResult[0];
if (firstGeneratedResultItem == null)
return newChatHistory;
const lastModelResponseItem = modelResponse.at(-1);
if (typeof firstGeneratedResultItem === "string" && typeof lastModelResponseItem === "string") {
modelResponse[modelResponse.length - 1] = lastModelResponseItem + firstGeneratedResultItem;
}
else if (typeof firstGeneratedResultItem !== "string" && isChatModelResponseSegment(firstGeneratedResultItem) &&
typeof lastModelResponseItem !== "string" && isChatModelResponseSegment(lastModelResponseItem) &&
!lastModelResponseItem.ended && lastModelResponseItem.segmentType === firstGeneratedResultItem.segmentType) {
modelResponse[modelResponse.length - 1] = {
...lastModelResponseItem,
...firstGeneratedResultItem,
text: lastModelResponseItem.text + firstGeneratedResultItem.text,
ended: firstGeneratedResultItem.ended,
raw: (lastModelResponseItem.raw != null && firstGeneratedResultItem.raw != null)
? LlamaText([
LlamaText.fromJSON(lastModelResponseItem.raw),
LlamaText.fromJSON(firstGeneratedResultItem.raw)
]).toJSON()
: undefined,
startTime: lastModelResponseItem.startTime,
endTime: firstGeneratedResultItem.endTime
};
}
else
modelResponse.push(firstGeneratedResultItem);
pushAll(modelResponse, generatedResult.slice(1));
return newChatHistory;
}
}
function findLastUserMessageInChatHistory(chatHistory) {
for (let i = chatHistory.length - 1; i >= 0; i--) {
const item = chatHistory[i];
if (item.type === "user")
return item;
}
return undefined;
}
function findLastModelMessageInChatHistory(chatHistory) {
for (let i = chatHistory.length - 1; i >= 0; i--) {
const item = chatHistory[i];
if (item.type === "model")
return item;
}
return undefined;
}
function generateContextText(endWithUserText, chatWrapper, options) {
if (endWithUserText)
return generateContextTextThatEndsWithUserText(chatWrapper, options);
return chatWrapper.generateContextState(options);
}
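// The function below renders a context that ends mid-user-message: it appends a random
// marker to the last user text, renders the full chat state, then cuts the rendered
// LlamaText at the marker, returning everything before it as `contextText` and the
// wrapper's trailing user-message syntax as `userTextSuffix`.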
function generateContextTextThatEndsWithUserText(chatWrapper, options) {
const lastUserText = getLastUserTextFromChatHistory(options.chatHistory);
const randomId = "W" + (Math.random()
.toString(36)
.slice(2)) + "W";
const { contextText, ...rest } = chatWrapper.generateContextState({
...options,
chatHistory: setLastUserTextInChatHistory(options.chatHistory, lastUserText + randomId)
});
let newContextText = contextText;
for (let i = 0; i < newContextText.values.length; i++) {
const item = newContextText.values[i];
if (typeof item !== "string")
continue;
const randomTextIndex = item.indexOf(randomId);
if (randomTextIndex < 0)
continue;
const newValue = item.slice(0, randomTextIndex);
newContextText = LlamaText([
...newContextText.values.slice(0, i),
newValue
]);
return {
contextText: newContextText,
userTextSuffix: LlamaText([
item.slice(randomTextIndex + randomId.length),
...newContextText.values.slice(i + 1)
]),
...rest
};
}
throw new Error("The random ID was not found in the context text. " +
`There might be an issue with the chat wrapper "${chatWrapper.wrapperName}" ` +
"where not all user messages are properly added to the the result LlamaText");
}
async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHistoryCompressionMetadata, pendingTokensCount = 0, isFirstEvaluation, chatWrapper, lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift, sequence, minFreeContextTokens = 1, functions, documentFunctionParams, endWithUserText }) {
if (sequence == null)
throw new DisposedError();
const model = sequence.model;
const context = sequence.context;
let removeRawFromHistory = false;
if (isFirstEvaluation && lastEvaluationContextWindowHistory != null && sequence.isLoadedToMemory) {
const newContextWindow = lastEvaluationContextWindowHistory.slice();
if (endWithUserText) {
if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "user")
newContextWindow.push({
type: "user",
text: ""
});
}
else if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "model")
newContextWindow.push({
type: "model",
response: []
});
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: newContextWindow,
availableFunctions: functions,
documentFunctionParams
});
const tokens = contextText.tokenize(model.tokenizer);
if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize) {
const { firstDifferentIndex } = sequence.compareContextTokens(tokens);
const existingEvaluationPercentage = firstDifferentIndex / tokens.length;
if (existingEvaluationPercentage >= minimumOverlapPercentageToPreventContextShift)
return {
history: newContextWindow,
stopGenerationTriggers,
tokens,
removeRawFromHistory,
newHistoryCompressionMetadata: lastHistoryCompressionMetadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
}
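// The fast path above reuses the previous evaluation's context window only when at least
// `minimumOverlapPercentageToPreventContextShift` of its rendered tokens already match the
// sequence state (e.g. with the default of 0.5, a 1000-token window needs its first 500
// tokens to be identical); otherwise the history is re-rendered and possibly compressed below.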
removeRawFromHistory = !sequence.isLoadedToMemory;
resolvedHistory = removeRawFromHistory
? resolvedHistory.map(removeRawFromHistoryItem)
: resolvedHistory.slice();
if (resolvedContextShift.lastEvaluationMetadata != null) {
const contextShiftSize = resolvedContextShift.size instanceof Function
? await resolvedContextShift.size(sequence)
: resolvedContextShift.size;
const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
history: resolvedHistory,
contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
contextShiftStrategy: resolvedContextShift.strategy,
contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
contextSize: context.contextSize,
tokenizer: model.tokenizer,
chatWrapper: chatWrapper,
functions,
documentFunctionParams
});
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: compressedHistory,
availableFunctions: functions,
documentFunctionParams
});
return {
history: compressedHistory,
stopGenerationTriggers,
tokens: contextText.tokenize(model.tokenizer),
removeRawFromHistory,
newHistoryCompressionMetadata: metadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
{
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: resolvedHistory,
availableFunctions: functions,
documentFunctionParams
});
const tokens = contextText.tokenize(model.tokenizer);
if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize)
return {
history: resolvedHistory,
stopGenerationTriggers,
tokens,
removeRawFromHistory,
newHistoryCompressionMetadata: lastHistoryCompressionMetadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
const contextShiftSize = Math.min(context.contextSize, Math.max(1, Math.floor(resolvedContextShift.size instanceof Function
? await resolvedContextShift.size(sequence)
: resolvedContextShift.size)));
const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
history: resolvedHistory,
contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
contextShiftStrategy: resolvedContextShift.strategy,
contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
contextSize: context.contextSize,
tokenizer: model.tokenizer,
chatWrapper: chatWrapper,
functions,
documentFunctionParams
});
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: compressedHistory,
availableFunctions: functions,
documentFunctionParams
});
return {
history: compressedHistory,
stopGenerationTriggers,
tokens: contextText.tokenize(model.tokenizer),
removeRawFromHistory,
newHistoryCompressionMetadata: metadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
class GenerateResponseState {
llamaChat;
chatWrapper;
history;
onTextChunk;
onToken;
onResponseChunk;
signal;
stopOnAbortSignal;
maxTokens;
temperature;
minP;
topK;
topP;
seed;
grammar;
trimWhitespaceSuffix;
tokenBias;
evaluationPriority;
functions;
onFunctionCall;
documentFunctionParams;
maxParallelFunctionCalls;
contextShift;
customStopTriggers;
minimumOverlapPercentageToPreventContextShift;
functionsEnabled;
repeatPenaltyEnabled;
resolvedContextShift;
resolvedRepeatPenalty;
grammarEvaluationState;
functionNameGrammar;
functionsGrammar;
functionsEvaluationState;
streamRegulator = new TokenStreamRegulator();
stopGenerationDetector = new StopGenerationDetector();
customStopGenerationTriggersDetector = new StopGenerationDetector();
functionSyntaxStartDetector = new StopGenerationDetector();
disengageInitiallyEngagedFunctionMode = new StopGenerationDetector();
ignoreStartTextDetector = new StopGenerationDetector();
locksToReleaseOnValidGeneration = [];
resolvedHistory;
noRawInResolvedHistory;
res = [];
pendingTokens = [];
ignoredStartTextTokens = [];
resFunctionCalls = [];
segmentHandler;
pendingPartialTokens = [];
functionEvaluationMode = false;
currentFunctionCallPreviousText = LlamaText([]);
currentFunctionCallCurrentPartTokens = [];
functionEvaluationFunctionName = "";
currentFunctionCallPreviousPartLeftoverText = "";
removedStartTextToIgnore = false;
releasedPartiallyFreeTokensBeforeFunctionCallStartSyntax = false;
generatedTokens = 0;
isFirstEvaluation = true;
initiallyEngagedFunctionMode = false;
lastContextWindowHistory;
lastHistoryCompressionMetadata;
restartEvaluationIterator = false;
// context shift loop
shouldContextShift = false;
canAvoidReloadingHistory = false;
contextWindowTokens = [];
stopGenerationTriggers = [];
ignoreStartText = [];
functionCallInitiallyEngaged = false;
disengageInitiallyEngagedFunctionCall = [];
userTextSuffix = undefined;
tokens = [];
// token evaluation loop
evaluationIterator;
currentIteration;
currentIterationReplacementToken;
currentToken;
currentTokens = [];
currentText = "";
currentQueuedTokenRelease;
constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
this.llamaChat = llamaChat;
this.chatWrapper = chatWrapper;
this.history = history;
this.onTextChunk = safeEventCallback(onTextChunk);
this.onToken = safeEventCallback(onToken);
this.onResponseChunk = safeEventCallback(onResponseChunk);
this.signal = signal;
this.stopOnAbortSignal = stopOnAbortSignal;
this.maxTokens = maxTokens;
this.temperature = temperature;
this.minP = minP;
this.topK = topK;
this.topP = topP;
this.seed = seed;
this.grammar = grammar;
this.trimWhitespaceSuffix = trimWhitespaceSuffix;
this.tokenBias = tokenBias;
this.evaluationPriority = evaluationPriority;
this.functions = functions;
this.onFunctionCall = safeEventCallback(onFunctionCall);
this.documentFunctionParams = documentFunctionParams;
this.maxParallelFunctionCalls = maxParallelFunctionCalls;
this.contextShift = contextShift;
this.customStopTriggers = customStopTriggers;
this.minimumOverlapPercentageToPreventContextShift = minimumOverlapPercentageToPreventContextShift;
this.functionsEnabled = (this.functions != null && Object.keys(this.functions).length > 0);
if (this.signal?.aborted)
throw this.signal.reason;
if (this.llamaChat.disposed)
throw new DisposedError();
this.noRawInResolvedHistory = !this.llamaChat.sequence.isLoadedToMemory;
this.resolvedHistory = this.noRawInResolvedHistory
? this.history.map(removeRawFromHistoryItem)
: this.history.slice();
this.resolvedContextShift = {
...defaultContextShiftOptions,
...removeNullFields(this.contextShift)
};
this.resolvedRepeatPenalty = repeatPenalty === false
? { lastTokens: 0 }
: {
...(repeatPenalty ?? {}),
lastTokens: repeatPenalty?.lastTokens ?? defaultRepeatPenaltyLastTokens
};
this.repeatPenaltyEnabled = this.resolvedRepeatPenalty.lastTokens > 0;
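// Passing `repeatPenalty: false` resolves to {lastTokens: 0} and disables the penalty;
// otherwise the last `lastTokens` generated tokens (64 by default) are penalized,
// sourced from getPenaltyTokens() below.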
this.grammarEvaluationState = this.grammar != null
? new LlamaGrammarEvaluationState({ model: this.llamaChat.model, grammar: this.grammar })
: undefined;
this.functionNameGrammar = this.functionsEnabled
? new FunctionCallNameGrammar(this.llamaChat.model._llama, this.functions, this.chatWrapper)
: undefined;
this.functionsGrammar = undefined;
this.functionsEvaluationState = undefined;
this.lastContextWindowHistory = lastEvaluationContextWindowHistory ?? this.resolvedHistory;
this.lastHistoryCompressionMetadata = this.resolvedContextShift.lastEvaluationMetadata;
if (this.customStopTriggers != null)
StopGenerationDetector.resolveStopTriggers(this.customStopTriggers, this.llamaChat.model.tokenizer)
.map((stopTrigger) => this.customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
if (this.grammar != null)
StopGenerationDetector.resolveStopTriggers(this.grammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
.map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger));
if (this.functions != null && Object.keys(this.functions).length > 0)
this.functionSyntaxStartDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
this.chatWrapper.settings.functions.call.prefix
]), this.llamaChat.model.tokenizer));
const segmentDefinitions = new Map();
for (const segmentType of allSegmentTypes) {
const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
if (segmentDefinition != null)
segmentDefinitions.set(segmentType, segmentDefinition);
}
this.segmentHandler = new SegmentHandler({
model: this.llamaChat.model,
onTextChunk: this.onTextChunk,
onToken: this.onToken,
onResponseChunk: this.onResponseChunk,
previousTokens: this.getLastTokens(),
closeAllSegments: this.chatWrapper.settings.segments?.closeAllSegments,
segmentDefinitions,
initialSegmentStack: SegmentHandler.getStackFromModelResponse(getLastModelMessageFullResponseFromChatHistory(this.resolvedHistory))
});
this.getPenaltyTokens = this.getPenaltyTokens.bind(this);
}
async dispose() {
await this.evaluationIterator?.return();
}
async [Symbol.asyncDispose]() {
await this.dispose();
}
ensureLastHistoryItemIsModel() {
if (this.resolvedHistory.at(-1)?.type !== "model")
this.resolvedHistory.push({
type: "model",
response: []
});
}
ensureLastHistoryItemIsUser() {
if (this.resolvedHistory.at(-1)?.type !== "user")
this.resolvedHistory.push({
type: "user",
text: ""
});
}
ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded() {
if (this.chatWrapper.settings.segments?.thought?.reopenAfterFunctionCalls !== true)
return;
const lastModelResponseItem = this.resolvedHistory.at(-1);
if (lastModelResponseItem == null || lastModelResponseItem.type !== "model")
return;
const lastResponse = lastModelResponseItem.response.at(-1);
if (lastResponse == null)
return;
const lastResponseIsFunctionCall = typeof lastResponse !== "string" && lastResponse.type === "functionCall";
if (!lastResponseIsFunctionCall)
return;
const currentResponseSegmentsStack = SegmentHandler.getStackFromModelResponse(lastModelResponseItem.response);
if (currentResponseSegmentsStack.includes("thought"))
return;
const hadThoughtSegments = this.resolvedHistory.some((chatItem) => {
if (chatItem.type !== "model")
return false;
return chatItem.response.some((responseItem) => {
if (typeof responseItem === "string")
return false;
return responseItem.type === "segment" && responseItem.segmentType === "thought";
});
});
if (!hadThoughtSegments)
return;
this.segmentHandler.openSegment("thought");
}
ensureNotAborted() {
if (this.signal?.aborted && (!this.stopOnAbortSignal || this.res.length === 0))
throw this.signal.reason;
if (this.llamaChat.disposed)
throw new DisposedError();
}
getPenaltyTokens() {
if (this.llamaChat.disposed)
return [];
let punishTokens = this.res.slice(-this.resolvedRepeatPenalty.lastTokens);
if (this.resolvedRepeatPenalty.punishTokensFilter != null)
punishTokens = this.resolvedRepeatPenalty.punishTokensFilter(punishTokens);
if (this.resolvedRepeatPenalty.penalizeNewLine == null || !this.resolvedRepeatPenalty.penalizeNewLine) {
const nlToken = this.llamaChat.model.tokens.nl;
if (nlToken != null)
punishTokens = punishTokens.filter((token) => token !== nlToken);
}
return punishTokens;
}
getResolvedHistoryWithCurrentModelResponse() {
return mergeGeneratedResultWithChatHistory("model", this.resolvedHistory, this.segmentHandler.getModelResponseSegments());
}
getContextWindowsHistoryWithCurrentModelResponse() {
return mergeGeneratedResultWithChatHistory("model", this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments());
}
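// Removes chat-wrapper-declared "text to ignore at the start of a response" from the
// pending tokens once it has fully matched, preserving any generation that follows it.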
removeFoundStartIgnoreTextsFromPendingTokens(forceRemove = false) {
if (!this.removedStartTextToIgnore && this.res.length === 0 && this.pendingTokens.length > 0 &&
this.ignoreStartTextDetector.hasTriggeredStops && (forceRemove || !this.ignoreStartTextDetector.hasInProgressStops)) {
this.ignoreStartTextDetector.clearInProgressStops();
this.ignoreStartTextDetector.clearTriggeredStops();
let mostExhaustiveTriggeredStops = null;
let mostExhaustiveTriggeredStopsLeftoverTokens = [];
const lastTokensForDetokenizer = resolveLastTokens([
this.contextWindowTokens,
this.ignoredStartTextTokens
]);
const pendingPartialTokens = [];
for (let i = 0; i < this.pendingTokens.length; i++) {
const currentToken = this.pendingTokens[i];
const tokens = [...pendingPartialTokens, currentToken];
const text = this.llamaChat.model.detokenize(tokens, false, lastTokensForDetokenizer);
if (pendingPartialTokens.length === 0 &&
text.endsWith(UNKNOWN_UNICODE_CHAR) &&
!this.llamaChat.model.isSpecialToken(currentToken) &&
!this.llamaChat.model.isEogToken(currentToken)) {
pendingPartialTokens.length = 0;
pushAll(pendingPartialTokens, tokens);
continue;
}
this.ignoreStartTextDetector.recordGeneration({
text: this.llamaChat.model.detokenize(tokens, false, lastTokensForDetokenizer),
tokens,
startNewChecks: i === 0,
triggerMustStartWithGeneration: true
});
pushAll(lastTokensForDetokenizer, tokens);
if (this.ignoreStartTextDetector.hasTriggeredStops) {
mostExhaustiveTriggeredStops = this.ignoreStartTextDetector.getTriggeredStops();
this.ignoreStartTextDetector.clearTriggeredStops();
mostExhaustiveTriggeredStopsLeftoverTokens = this.pendingTokens.slice(i + 1);
}
else if (!this.ignoreStartTextDetector.hasInProgressStops)
break;
}
if (mostExhaustiveTriggeredStops != null) {
const [mostExhaustiveTriggeredStop] = mostExhaustiveTriggeredStops;
if (mostExhaustiveTriggeredStop != null) {
this.ignoredStartTextTokens = mostExhaustiveTriggeredStop.stopTrigger
.map((stopTrigger) => {
if (typeof stopTrigger === "string")
return this.llamaChat.model.tokenize(stopTrigger, false, "trimLeadingSpace");
else
return [stopTrigger];
})
.flat(1);
const newPendingTokens = [
...mostExhaustiveTriggeredStop.remainingGeneration,
mostExhaustiveTriggeredStopsLeftoverTokens
]
.map((generation) => {
if (typeof generation === "string")
return this.llamaChat.model.tokenize(generation, false, "trimLeadingSpace");
else
return generation;
})
.flat(1);
this.pendingTokens.length = 0;
pushAll(this.pendingTokens, newPendingTokens);
this.removedStartTextToIgnore = true;
}
}
}
}
startTokenLoop() {
this.ensureNotAborted();
this.shouldContextShift = false;
}
getContextWindowFunctionCallsTokens() {
if (this.functionEvaluationMode === false)
return [];
else if (this.functionEvaluationMode === "prefixOrDisengage")
return [
...LlamaText(this.currentFunctionCallPreviousText).tokenize(this.llamaChat.model.tokenizer, "trimLeadingSpace"),
...this.currentFunctionCallCurrentPartTokens
];
const text = [];
if (this.chatWrapper.settings.functions?.paralle