node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
979 lines • 113 kB
JavaScript
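// Minimal usage sketch of the high-level API built on top of this module
// (illustrative; assumes a GGUF model file is available locally at the given path):
//
//   import {getLlama, LlamaChatSession} from "node-llama-cpp";
//
//   const llama = await getLlama();
//   const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
//   const context = await model.createContext();
//   const session = new LlamaChatSession({contextSequence: context.getSequence()});
//
//   console.log(await session.prompt("Hi there, how are you?"));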
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { isChatModelResponseFunctionCall, isChatModelResponseSegment, allSegmentTypes } from "../../types.js";
import { removeNullFields } from "../../utils/removeNullFields.js";
import { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
import { LlamaText, SpecialToken } from "../../utils/LlamaText.js";
import { StopGenerationDetector } from "../../utils/StopGenerationDetector.js";
import { TokenStreamRegulator } from "../../utils/TokenStreamRegulator.js";
import { maxRecentDetokenizerTokens, UNKNOWN_UNICODE_CHAR } from "../../consts.js";
import { getQueuedTokensBeforeStopTrigger } from "../../utils/getQueuedTokensBeforeStopTrigger.js";
import { resolveChatWrapper } from "../../chatWrappers/utils/resolveChatWrapper.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { pushAll } from "../../utils/pushAll.js";
import { resolveLastTokens } from "../../utils/resolveLastTokens.js";
import { LlamaSampler } from "../LlamaContext/LlamaSampler.js";
import { getChatWrapperSegmentDefinition } from "../../utils/getChatWrapperSegmentDefinition.js";
import { jsonDumps } from "../../chatWrappers/utils/jsonDumps.js";
import { eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy } from "./utils/contextShiftStrategies/eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy.js";
import { FunctionCallNameGrammar } from "./utils/FunctionCallNameGrammar.js";
import { FunctionCallParamsGrammar } from "./utils/FunctionCallParamsGrammar.js";
const defaultContextShiftOptions = {
size: (sequence) => Math.max(1, Math.floor(sequence.context.contextSize / 10)),
strategy: "eraseFirstResponseAndKeepFirstSystem",
lastEvaluationMetadata: null
};
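// With these defaults, each context shift frees a tenth of the context: e.g. for a sequence
// whose context holds 4096 tokens, `size` resolves to Math.floor(4096 / 10) = 409 tokens.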
const defaultRepeatPenaltyLastTokens = 64;
const defaultTrimWhitespaceSuffix = false;
const defaultEvaluationPriority = 5;
export class LlamaChat {
/** @internal */ _chatWrapper;
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _chatLock = {};
/** @internal */ _sequence;
onDispose = new EventRelay();
constructor({ contextSequence, chatWrapper = "auto", autoDisposeSequence = false }) {
if (contextSequence == null)
throw new Error("contextSequence cannot be null");
if (contextSequence.disposed)
throw new DisposedError();
this._sequence = contextSequence;
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._sequence.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
this._chatWrapper = chatWrapper === "auto"
? resolveChatWrapper(contextSequence.model)
: chatWrapper;
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._sequence == null)
return;
if (disposeSequence)
this._sequence.dispose();
this._sequence = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._sequence == null;
}
get chatWrapper() {
if (this._sequence == null)
throw new DisposedError();
return this._chatWrapper;
}
get sequence() {
if (this._sequence == null)
throw new DisposedError();
return this._sequence;
}
get context() {
return this.sequence.context;
}
get model() {
return this.sequence.model;
}
async generateResponse(history, options = {}) {
const { onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = options;
this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize(findLastUserMessageInChatHistory(history)?.text ?? ""));
const generateResponseState = new GenerateResponseState(this, this._chatWrapper, history, {
onTextChunk,
onToken,
onResponseChunk,
signal,
stopOnAbortSignal,
maxTokens,
temperature,
minP,
topK,
topP,
seed,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
functions,
onFunctionCall,
documentFunctionParams,
maxParallelFunctionCalls,
contextShift,
customStopTriggers,
lastEvaluationContextWindow: {
history: lastEvaluationContextWindowHistory,
minimumOverlapPercentageToPreventContextShift
}
});
if (generateResponseState.grammar != null && generateResponseState.functionsEnabled)
throw new Error("Using both grammar and functions is not supported yet");
return await withLock(this._chatLock, "evaluate", signal, async () => {
try {
generateResponseState.ensureLastHistoryItemIsModel();
generateResponseState.ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded();
const loadContextWindow = async (avoidReloadingHistory = false) => {
await generateResponseState.loadContextWindow(generateResponseState.getResolvedHistoryWithCurrentModelResponse(), generateResponseState.getContextWindowsHistoryWithCurrentModelResponse(), false, avoidReloadingHistory);
};
const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true);
while (true) {
generateResponseState.startTokenLoop();
generateResponseState.canAvoidReloadingHistory = false;
await loadContextWindow();
generateResponseState.addStopGenerationTriggersFromChatWrapper();
if (generateResponseState.generatedTokens === 0) {
generateResponseState.addIgnoreStartTextTriggersFromChatWrapper();
if (generateResponseState.functionsEnabled) {
generateResponseState.initFunctions();
}
}
if (generateResponseState.functionEvaluationMode !== false) {
const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
if (functionsCallsRes != null)
return functionsCallsRes;
await loadContextWindowForFunctionCallingLoop();
}
await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
await generateResponseState.createNewEvaluationIterator();
while (await generateResponseState.iterateEvaluation()) {
if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
generateResponseState.detectAndHandleFunctionStartSyntax();
if (generateResponseState.functionEvaluationMode !== false) {
generateResponseState.canAvoidReloadingHistory = false;
generateResponseState.releasePartiallyFreeTokensBeforeFunctionCallStart();
const functionsCallsRes = await generateResponseState.enterFunctionCallingLoop(loadContextWindowForFunctionCallingLoop);
if (functionsCallsRes != null)
return functionsCallsRes;
}
generateResponseState.recordStopGenerationEvaluation();
generateResponseState.popStreamRegulatorFreeTokens();
generateResponseState.removeFoundStartIgnoreTextsFromPendingTokens();
const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("model");
if (stopGenerationTriggerRes != null)
return stopGenerationTriggerRes;
generateResponseState.spliceIgnoreStartTextDetectedTokens();
generateResponseState.moveFreePendingTokensToRes();
}
const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("model");
if (maxTokensTriggerRes != null)
return maxTokensTriggerRes;
if (generateResponseState.updateShouldContextShift())
break;
const abortRes = generateResponseState.handleAbortTrigger("model");
if (abortRes != null)
return abortRes;
}
generateResponseState.isFirstEvaluation = false;
if (generateResponseState.shouldContextShift)
continue;
break;
}
throw new Error("The context size is too small to generate a response");
}
finally {
await generateResponseState.dispose();
}
});
}
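/*
 * Usage sketch for `generateResponse` (illustrative; assumes `contextSequence` was
 * obtained elsewhere via `context.getSequence()`):
 *
 *   const chat = new LlamaChat({contextSequence});
 *   const {response, metadata} = await chat.generateResponse([
 *       {type: "system", text: "You are a helpful assistant"},
 *       {type: "user", text: "Hi there"}
 *   ], {
 *       maxTokens: 256,
 *       onTextChunk(chunk) {
 *           process.stdout.write(chunk);
 *       }
 *   });
 */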
async loadChatAndCompleteUserMessage(history, options = {}) {
const { initialUserPrompt = "", stopOnAbortSignal = false, onTextChunk, onToken, signal, maxTokens = Math.min(256, Math.ceil(this.context.contextSize / 2)), temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, documentFunctionParams, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.8 } = {} } = options;
this.sequence.tokenPredictor?.updateInputTokens?.(this.model.tokenize((findLastModelMessageInChatHistory(history)?.response ?? [])
.map((item) => {
if (typeof item === "string")
return item;
else if (isChatModelResponseFunctionCall(item))
return null;
else if (isChatModelResponseSegment(item))
return item.text;
void item;
return null;
})
.filter((item) => item != null)
.join(" ")));
const generateResponseState = new GenerateResponseState(this, this._chatWrapper, mergeGeneratedResultWithChatHistory("user", history, [initialUserPrompt]), {
onTextChunk,
onToken,
signal,
stopOnAbortSignal,
maxTokens,
temperature,
minP,
topK,
topP,
seed,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
trimWhitespaceSuffix,
repeatPenalty,
tokenBias,
evaluationPriority,
functions,
documentFunctionParams,
contextShift,
customStopTriggers,
lastEvaluationContextWindow: {
history: mergeGeneratedResultWithChatHistory("user", lastEvaluationContextWindowHistory ?? history, [initialUserPrompt]),
minimumOverlapPercentageToPreventContextShift
}
});
return await withLock(this._chatLock, "evaluate", signal, async () => {
try {
generateResponseState.ensureLastHistoryItemIsUser();
while (true) {
generateResponseState.startTokenLoop();
const { userTextSuffix } = await generateResponseState.loadContextWindow(mergeGeneratedResultWithChatHistory("user", generateResponseState.resolvedHistory, generateResponseState.segmentHandler.getModelResponseSegments()), mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()), true);
generateResponseState.functionEvaluationMode = false;
generateResponseState.addStopGenerationTriggersFromChatWrapper();
if (userTextSuffix != null && userTextSuffix.values.length > 0)
generateResponseState.stopGenerationDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer));
await generateResponseState.alignCurrentSequenceStateWithCurrentTokens();
if (generateResponseState.maxTokens === 0) {
await generateResponseState.evaluateWithoutGeneratingNewTokens();
return {
completion: "",
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: generateResponseState.lastHistoryCompressionMetadata
},
metadata: {
stopReason: "maxTokens"
}
};
}
await generateResponseState.createNewEvaluationIterator();
while (await generateResponseState.iterateEvaluation()) {
if (!generateResponseState.holdPartialTokensForNextEvaluation()) {
generateResponseState.waitOnPartialCharactersOrWhiteSpaceTokens();
generateResponseState.recordStopGenerationEvaluation();
generateResponseState.popStreamRegulatorFreeTokens();
const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user");
if (stopGenerationTriggerRes != null)
return {
completion: stopGenerationTriggerRes.response,
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: stopGenerationTriggerRes.lastEvaluation.contextShiftMetadata
},
metadata: stopGenerationTriggerRes.metadata
};
generateResponseState.moveFreePendingTokensToRes(false);
}
const maxTokensTriggerRes = generateResponseState.handleMaxTokensTrigger("user");
if (maxTokensTriggerRes != null)
return {
completion: maxTokensTriggerRes.response,
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: maxTokensTriggerRes.lastEvaluation.contextShiftMetadata
},
metadata: maxTokensTriggerRes.metadata
};
if (generateResponseState.updateShouldContextShift())
break;
const abortRes = generateResponseState.handleAbortTrigger("user");
if (abortRes != null)
return {
completion: abortRes.response,
lastEvaluation: {
contextWindow: mergeGeneratedResultWithChatHistory("user", generateResponseState.lastContextWindowHistory, generateResponseState.segmentHandler.getContextWindowModelResponseSegments()),
contextShiftMetadata: abortRes.lastEvaluation.contextShiftMetadata
},
metadata: abortRes.metadata
};
}
generateResponseState.isFirstEvaluation = false;
if (generateResponseState.shouldContextShift)
continue;
break;
}
throw new Error("The context size is too small to generate a completion");
}
finally {
await generateResponseState.dispose();
}
});
}
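/*
 * Usage sketch for `loadChatAndCompleteUserMessage` (illustrative): given a `LlamaChat`
 * instance `chat` and a history that ends with a partial user message, this completes
 * the user's text instead of generating a model reply:
 *
 *   const {completion} = await chat.loadChatAndCompleteUserMessage(history, {
 *       initialUserPrompt: "Write a poem about ",
 *       maxTokens: 64
 *   });
 */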
}
function removeRawFromHistoryItem(historyItem) {
if (historyItem.type === "model") {
const newHistoryItem = { ...historyItem };
newHistoryItem.response = newHistoryItem.response.map((item) => {
if (typeof item === "string")
return item;
else if (isChatModelResponseFunctionCall(item))
return {
...item,
rawCall: undefined
};
else if (isChatModelResponseSegment(item))
return {
...item,
raw: undefined
};
void item;
return item;
});
return newHistoryItem;
}
return historyItem;
}
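// Used when the sequence state is not loaded to memory (see `removeRawFromHistory` in
// getContextWindow below): cached raw renderings of function calls and segments are
// dropped so the chat wrapper re-renders them from the structured history.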
async function compressHistoryToFitContextSize({ history, contextShiftSize, contextShiftStrategy, contextShiftLastEvaluationMetadata, contextSize, tokenizer, chatWrapper, functions, documentFunctionParams }) {
function checkIfHistoryFitsContext(history) {
const { contextText } = chatWrapper.generateContextState({
chatHistory: history,
availableFunctions: functions,
documentFunctionParams
});
const tokens = contextText.tokenize(tokenizer);
return tokens.length <= contextSize - contextShiftSize;
}
if (contextSize - contextShiftSize <= 0)
throw new Error(`The context size (${contextSize}) is too small to fit the context shift size (${contextShiftSize})`);
if (checkIfHistoryFitsContext(history))
return {
compressedHistory: history,
metadata: null
};
if (contextShiftStrategy instanceof Function) {
try {
const { chatHistory, metadata } = await contextShiftStrategy({
chatHistory: history,
maxTokensCount: contextSize - contextShiftSize,
tokenizer,
chatWrapper,
lastShiftMetadata: contextShiftLastEvaluationMetadata
});
if (checkIfHistoryFitsContext(chatHistory))
return {
compressedHistory: chatHistory,
metadata
};
console.warn("The provided context shift strategy did not return a history that fits the context size. " +
"Using the default strategy instead.");
}
catch (err) {
console.error("The provided context shift strategy threw an error. " +
"Using the default strategy instead.", err);
}
}
else if (contextShiftStrategy !== "eraseFirstResponseAndKeepFirstSystem")
console.warn(`Unknown context shift strategy "${contextShiftStrategy}". ` +
"Using the default strategy instead.");
const { chatHistory, metadata } = await eraseFirstResponseAndKeepFirstSystemChatContextShiftStrategy({
chatHistory: history,
maxTokensCount: contextSize - contextShiftSize,
tokenizer,
chatWrapper,
lastShiftMetadata: contextShiftLastEvaluationMetadata
});
if (!checkIfHistoryFitsContext(chatHistory))
throw new Error("The default context shift strategy did not return a history that fits the context size. " +
"This may happen due to the system prompt being too long");
return {
compressedHistory: chatHistory,
metadata
};
}
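/*
 * A custom `contextShift.strategy` may be supplied as a function with the signature
 * used above (illustrative sketch; the actual trimming logic is left abstract):
 *
 *   async function dropOldestItems({chatHistory, maxTokensCount, tokenizer, chatWrapper, lastShiftMetadata}) {
 *       const trimmedHistory = chatHistory.slice();
 *       // drop or shorten the oldest items until the rendered history fits maxTokensCount;
 *       // any bookkeeping returned as `metadata` is received back on the next shift
 *       // via `lastShiftMetadata`
 *       return {chatHistory: trimmedHistory, metadata: lastShiftMetadata};
 *   }
 */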
function getLastModelMessageFullResponseFromChatHistory(chatHistory) {
const lastModelResponseItem = chatHistory.at(-1);
if (lastModelResponseItem == null || lastModelResponseItem.type !== "model")
return [];
return lastModelResponseItem.response;
}
function getLastUserTextFromChatHistory(chatHistory) {
if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "user")
return "";
return chatHistory[chatHistory.length - 1].text;
}
function setLastUserTextInChatHistory(chatHistory, userText) {
const newChatHistory = chatHistory.slice();
if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "user")
newChatHistory.push({
type: "user",
text: ""
});
const lastUserItem = newChatHistory[newChatHistory.length - 1];
const newLastUserItem = { ...lastUserItem };
newChatHistory[newChatHistory.length - 1] = newLastUserItem;
newLastUserItem.text = userText;
return newChatHistory;
}
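// Example: merging generatedResult ["Hello", " world"] into a history whose last item is
// {type: "model", response: ["Hi"]} yields {type: "model", response: ["HiHello", " world"]}:
// adjacent string parts are concatenated and the remaining parts are appended.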
function mergeGeneratedResultWithChatHistory(itemType, chatHistory, generatedResult) {
if (generatedResult.length === 0 || (generatedResult.length === 1 && generatedResult[0] === ""))
return chatHistory;
const newChatHistory = chatHistory.slice();
if (itemType === "user") {
let lastUserItem = newChatHistory.at(-1);
if (lastUserItem?.type !== "user") {
lastUserItem = {
type: "user",
text: ""
};
newChatHistory.push(lastUserItem);
}
const newLastUserItem = { ...lastUserItem };
newChatHistory[newChatHistory.length - 1] = newLastUserItem;
newLastUserItem.text += generatedResult
.map((item) => {
if (typeof item === "string")
return item;
return item.text;
})
.join("");
return newChatHistory;
}
else {
let lastModelItem = newChatHistory.at(-1);
if (lastModelItem?.type !== "model") {
lastModelItem = {
type: "model",
response: []
};
newChatHistory.push(lastModelItem);
}
const newLastModelItem = { ...lastModelItem };
newChatHistory[newChatHistory.length - 1] = newLastModelItem;
const modelResponse = newLastModelItem.response.slice();
newLastModelItem.response = modelResponse;
const firstGeneratedResultItem = generatedResult[0];
if (firstGeneratedResultItem == null)
return newChatHistory;
const lastModelResponseItem = modelResponse.at(-1);
if (typeof firstGeneratedResultItem === "string" && typeof lastModelResponseItem === "string") {
modelResponse[modelResponse.length - 1] = lastModelResponseItem + firstGeneratedResultItem;
}
else if (typeof firstGeneratedResultItem !== "string" && isChatModelResponseSegment(firstGeneratedResultItem) &&
typeof lastModelResponseItem !== "string" && isChatModelResponseSegment(lastModelResponseItem) &&
!lastModelResponseItem.ended && lastModelResponseItem.segmentType === firstGeneratedResultItem.segmentType) {
modelResponse[modelResponse.length - 1] = {
...lastModelResponseItem,
...firstGeneratedResultItem,
text: lastModelResponseItem.text + firstGeneratedResultItem.text,
ended: firstGeneratedResultItem.ended,
raw: (lastModelResponseItem.raw != null && firstGeneratedResultItem.raw != null)
? LlamaText([
LlamaText.fromJSON(lastModelResponseItem.raw),
LlamaText.fromJSON(firstGeneratedResultItem.raw)
]).toJSON()
: undefined,
startTime: lastModelResponseItem.startTime,
endTime: firstGeneratedResultItem.endTime
};
}
else
modelResponse.push(firstGeneratedResultItem);
pushAll(modelResponse, generatedResult.slice(1));
return newChatHistory;
}
}
function findLastUserMessageInChatHistory(chatHistory) {
for (let i = chatHistory.length - 1; i >= 0; i--) {
const item = chatHistory[i];
if (item.type === "user")
return item;
}
return undefined;
}
function findLastModelMessageInChatHistory(chatHistory) {
for (let i = chatHistory.length - 1; i >= 0; i--) {
const item = chatHistory[i];
if (item.type === "model")
return item;
}
return undefined;
}
function generateContextText(endWithUserText, chatWrapper, options) {
if (endWithUserText)
return generateContextTextThatEndsWithUserText(chatWrapper, options);
return chatWrapper.generateContextState(options);
}
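// The function below renders a context that ends mid-user-message: it appends a random
// marker to the last user text, renders the full chat state, then cuts the rendered
// LlamaText at the marker, returning everything before it as `contextText` and the
// wrapper's trailing user-message syntax as `userTextSuffix`.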
function generateContextTextThatEndsWithUserText(chatWrapper, options) {
const lastUserText = getLastUserTextFromChatHistory(options.chatHistory);
const randomId = "W" + (Math.random()
.toString(36)
.slice(2)) + "W";
const { contextText, ...rest } = chatWrapper.generateContextState({
...options,
chatHistory: setLastUserTextInChatHistory(options.chatHistory, lastUserText + randomId)
});
let newContextText = contextText;
for (let i = 0; i < newContextText.values.length; i++) {
const item = newContextText.values[i];
if (typeof item !== "string")
continue;
const randomTextIndex = item.indexOf(randomId);
if (randomTextIndex < 0)
continue;
const newValue = item.slice(0, randomTextIndex);
newContextText = LlamaText([
...newContextText.values.slice(0, i),
newValue
]);
return {
contextText: newContextText,
userTextSuffix: LlamaText([
item.slice(randomTextIndex + randomId.length),
...newContextText.values.slice(i + 1)
]),
...rest
};
}
throw new Error("The random ID was not found in the context text. " +
`There might be an issue with the chat wrapper "${chatWrapper.wrapperName}" ` +
"where not all user messages are properly added to the the result LlamaText");
}
async function getContextWindow({ resolvedHistory, resolvedContextShift, lastHistoryCompressionMetadata, pendingTokensCount = 0, isFirstEvaluation, chatWrapper, lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift, sequence, minFreeContextTokens = 1, functions, documentFunctionParams, endWithUserText }) {
if (sequence == null)
throw new DisposedError();
const model = sequence.model;
const context = sequence.context;
let removeRawFromHistory = false;
if (isFirstEvaluation && lastEvaluationContextWindowHistory != null && sequence.isLoadedToMemory) {
const newContextWindow = lastEvaluationContextWindowHistory.slice();
if (endWithUserText) {
if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "user")
newContextWindow.push({
type: "user",
text: ""
});
}
else if (newContextWindow.length === 0 || newContextWindow[newContextWindow.length - 1].type !== "model")
newContextWindow.push({
type: "model",
response: []
});
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: newContextWindow,
availableFunctions: functions,
documentFunctionParams
});
const tokens = contextText.tokenize(model.tokenizer);
if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize) {
const { firstDifferentIndex } = sequence.compareContextTokens(tokens);
const existingEvaluationPercentage = firstDifferentIndex / tokens.length;
if (existingEvaluationPercentage >= minimumOverlapPercentageToPreventContextShift)
return {
history: newContextWindow,
stopGenerationTriggers,
tokens,
removeRawFromHistory,
newHistoryCompressionMetadata: lastHistoryCompressionMetadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
}
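// The fast path above reuses the previous evaluation's context window only when at least
// `minimumOverlapPercentageToPreventContextShift` of its rendered tokens already match the
// sequence state (e.g. with the default of 0.5, a 1000-token window needs its first 500
// tokens to be identical); otherwise the history is re-rendered and possibly compressed below.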
removeRawFromHistory = !sequence.isLoadedToMemory;
resolvedHistory = removeRawFromHistory
? resolvedHistory.map(removeRawFromHistoryItem)
: resolvedHistory.slice();
if (resolvedContextShift.lastEvaluationMetadata != null) {
const contextShiftSize = resolvedContextShift.size instanceof Function
? await resolvedContextShift.size(sequence)
: resolvedContextShift.size;
const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
history: resolvedHistory,
contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
contextShiftStrategy: resolvedContextShift.strategy,
contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
contextSize: context.contextSize,
tokenizer: model.tokenizer,
chatWrapper: chatWrapper,
functions,
documentFunctionParams
});
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: compressedHistory,
availableFunctions: functions,
documentFunctionParams
});
return {
history: compressedHistory,
stopGenerationTriggers,
tokens: contextText.tokenize(model.tokenizer),
removeRawFromHistory,
newHistoryCompressionMetadata: metadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
{
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: resolvedHistory,
availableFunctions: functions,
documentFunctionParams
});
const tokens = contextText.tokenize(model.tokenizer);
if (tokens.length + pendingTokensCount + minFreeContextTokens < context.contextSize)
return {
history: resolvedHistory,
stopGenerationTriggers,
tokens,
removeRawFromHistory,
newHistoryCompressionMetadata: lastHistoryCompressionMetadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
const contextShiftSize = Math.min(context.contextSize, Math.max(1, Math.floor(resolvedContextShift.size instanceof Function
? await resolvedContextShift.size(sequence)
: resolvedContextShift.size)));
const { compressedHistory, metadata } = await compressHistoryToFitContextSize({
history: resolvedHistory,
contextShiftSize: Math.max(minFreeContextTokens, Math.min(contextShiftSize, context.contextSize - pendingTokensCount)) + pendingTokensCount,
contextShiftStrategy: resolvedContextShift.strategy,
contextShiftLastEvaluationMetadata: resolvedContextShift.lastEvaluationMetadata,
contextSize: context.contextSize,
tokenizer: model.tokenizer,
chatWrapper: chatWrapper,
functions,
documentFunctionParams
});
const { contextText, stopGenerationTriggers, ignoreStartText, functionCall, userTextSuffix } = generateContextText(endWithUserText, chatWrapper, {
chatHistory: compressedHistory,
availableFunctions: functions,
documentFunctionParams
});
return {
history: compressedHistory,
stopGenerationTriggers,
tokens: contextText.tokenize(model.tokenizer),
removeRawFromHistory,
newHistoryCompressionMetadata: metadata,
ignoreStartText: ignoreStartText ?? [],
functionCallInitiallyEngaged: functionCall?.initiallyEngaged ?? false,
disengageInitiallyEngagedFunctionCall: functionCall?.disengageInitiallyEngaged ?? [],
userTextSuffix
};
}
class GenerateResponseState {
llamaChat;
chatWrapper;
history;
onTextChunk;
onToken;
onResponseChunk;
signal;
stopOnAbortSignal;
maxTokens;
temperature;
minP;
topK;
topP;
seed;
grammar;
trimWhitespaceSuffix;
tokenBias;
evaluationPriority;
functions;
onFunctionCall;
documentFunctionParams;
maxParallelFunctionCalls;
contextShift;
customStopTriggers;
minimumOverlapPercentageToPreventContextShift;
functionsEnabled;
repeatPenaltyEnabled;
resolvedContextShift;
resolvedRepeatPenalty;
grammarEvaluationState;
functionNameGrammar;
functionsGrammar;
functionsEvaluationState;
streamRegulator = new TokenStreamRegulator();
stopGenerationDetector = new StopGenerationDetector();
customStopGenerationTriggersDetector = new StopGenerationDetector();
functionSyntaxStartDetector = new StopGenerationDetector();
disengageInitiallyEngagedFunctionMode = new StopGenerationDetector();
ignoreStartTextDetector = new StopGenerationDetector();
locksToReleaseOnValidGeneration = [];
resolvedHistory;
noRawInResolvedHistory;
res = [];
pendingTokens = [];
ignoredStartTextTokens = [];
resFunctionCalls = [];
segmentHandler;
pendingPartialTokens = [];
functionEvaluationMode = false;
currentFunctionCallPreviousText = LlamaText([]);
currentFunctionCallCurrentPartTokens = [];
functionEvaluationFunctionName = "";
currentFunctionCallPreviousPartLeftoverText = "";
removedStartTextToIgnore = false;
releasedPartiallyFreeTokensBeforeFunctionCallStartSyntax = false;
generatedTokens = 0;
isFirstEvaluation = true;
initiallyEngagedFunctionMode = false;
lastContextWindowHistory;
lastHistoryCompressionMetadata;
restartEvaluationIterator = false;
// context shift loop
shouldContextShift = false;
canAvoidReloadingHistory = false;
contextWindowTokens = [];
stopGenerationTriggers = [];
ignoreStartText = [];
functionCallInitiallyEngaged = false;
disengageInitiallyEngagedFunctionCall = [];
userTextSuffix = undefined;
tokens = [];
// token evaluation loop
evaluationIterator;
currentIteration;
currentIterationReplacementToken;
currentToken;
currentTokens = [];
currentText = "";
currentQueuedTokenRelease;
constructor(llamaChat, chatWrapper, history, { onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = defaultTrimWhitespaceSuffix, repeatPenalty = {}, tokenBias, evaluationPriority = defaultEvaluationPriority, functions, onFunctionCall, documentFunctionParams, maxParallelFunctionCalls, contextShift = defaultContextShiftOptions, customStopTriggers, lastEvaluationContextWindow: { history: lastEvaluationContextWindowHistory, minimumOverlapPercentageToPreventContextShift = 0.5 } = {} } = {}) {
this.llamaChat = llamaChat;
this.chatWrapper = chatWrapper;
this.history = history;
this.onTextChunk = safeEventCallback(onTextChunk);
this.onToken = safeEventCallback(onToken);
this.onResponseChunk = safeEventCallback(onResponseChunk);
this.signal = signal;
this.stopOnAbortSignal = stopOnAbortSignal;
this.maxTokens = maxTokens;
this.temperature = temperature;
this.minP = minP;
this.topK = topK;
this.topP = topP;
this.seed = seed;
this.grammar = grammar;
this.trimWhitespaceSuffix = trimWhitespaceSuffix;
this.tokenBias = tokenBias;
this.evaluationPriority = evaluationPriority;
this.functions = functions;
this.onFunctionCall = safeEventCallback(onFunctionCall);
this.documentFunctionParams = documentFunctionParams;
this.maxParallelFunctionCalls = maxParallelFunctionCalls;
this.contextShift = contextShift;
this.customStopTriggers = customStopTriggers;
this.minimumOverlapPercentageToPreventContextShift = minimumOverlapPercentageToPreventContextShift;
this.functionsEnabled = (this.functions != null && Object.keys(this.functions).length > 0);
if (this.signal?.aborted)
throw this.signal.reason;
if (this.llamaChat.disposed)
throw new DisposedError();
this.noRawInResolvedHistory = !this.llamaChat.sequence.isLoadedToMemory;
this.resolvedHistory = this.noRawInResolvedHistory
? this.history.map(removeRawFromHistoryItem)
: this.history.slice();
this.resolvedContextShift = {
...defaultContextShiftOptions,
...removeNullFields(this.contextShift)
};
this.resolvedRepeatPenalty = repeatPenalty === false
? { lastTokens: 0 }
: {
...(repeatPenalty ?? {}),
lastTokens: repeatPenalty?.lastTokens ?? defaultRepeatPenaltyLastTokens
};
this.repeatPenaltyEnabled = this.resolvedRepeatPenalty.lastTokens > 0;
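// Passing `repeatPenalty: false` resolves to {lastTokens: 0} and disables the penalty;
// otherwise the last `lastTokens` generated tokens (64 by default) are penalized,
// sourced from getPenaltyTokens() below.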
this.grammarEvaluationState = this.grammar != null
? new LlamaGrammarEvaluationState({ model: this.llamaChat.model, grammar: this.grammar })
: undefined;
this.functionNameGrammar = this.functionsEnabled
? new FunctionCallNameGrammar(this.llamaChat.model._llama, this.functions, this.chatWrapper)
: undefined;
this.functionsGrammar = undefined;
this.functionsEvaluationState = undefined;
this.lastContextWindowHistory = lastEvaluationContextWindowHistory ?? this.resolvedHistory;
this.lastHistoryCompressionMetadata = this.resolvedContextShift.lastEvaluationMetadata;
if (this.customStopTriggers != null)
StopGenerationDetector.resolveStopTriggers(this.customStopTriggers, this.llamaChat.model.tokenizer)
.map((stopTrigger) => this.customStopGenerationTriggersDetector.addStopTrigger(stopTrigger));
if (this.grammar != null)
StopGenerationDetector.resolveStopTriggers(this.grammar.stopGenerationTriggers, this.llamaChat.model.tokenizer)
.map((stopTrigger) => this.stopGenerationDetector.addStopTrigger(stopTrigger));
if (this.functions != null && Object.keys(this.functions).length > 0)
this.functionSyntaxStartDetector.addStopTrigger(StopGenerationDetector.resolveLlamaTextTrigger(LlamaText([
this.chatWrapper.settings.functions?.parallelism?.call?.sectionPrefix ?? "",
this.chatWrapper.settings.functions.call.prefix
]), this.llamaChat.model.tokenizer));
const segmentDefinitions = new Map();
for (const segmentType of allSegmentTypes) {
const segmentDefinition = getChatWrapperSegmentDefinition(this.chatWrapper.settings, segmentType);
if (segmentDefinition != null)
segmentDefinitions.set(segmentType, segmentDefinition);
}
this.segmentHandler = new SegmentHandler({
model: this.llamaChat.model,
onTextChunk: this.onTextChunk,
onToken: this.onToken,
onResponseChunk: this.onResponseChunk,
previousTokens: this.getLastTokens(),
closeAllSegments: this.chatWrapper.settings.segments?.closeAllSegments,
segmentDefinitions,
initialSegmentStack: SegmentHandler.getStackFromModelResponse(getLastModelMessageFullResponseFromChatHistory(this.resolvedHistory))
});
this.getPenaltyTokens = this.getPenaltyTokens.bind(this);
}
async dispose() {
await this.evaluationIterator?.return();
}
async [Symbol.asyncDispose]() {
await this.dispose();
}
ensureLastHistoryItemIsModel() {
if (this.resolvedHistory.at(-1)?.type !== "model")
this.resolvedHistory.push({
type: "model",
response: []
});
}
ensureLastHistoryItemIsUser() {
if (this.resolvedHistory.at(-1)?.type !== "user")
this.resolvedHistory.push({
type: "user",
text: ""
});
}
ensureReopenedThoughtSegmentAfterFunctionCallsIfNeeded() {
if (this.chatWrapper.settings.segments?.thought?.reopenAfterFunctionCalls !== true)
return;
const lastModelResponseItem = this.resolvedHistory.at(-1);
if (lastModelResponseItem == null || lastModelResponseItem.type !== "model")
return;
const lastResponse = lastModelResponseItem.response.at(-1);
if (lastResponse == null)
return;
const lastResponseIsFunctionCall = typeof lastResponse !== "string" && lastResponse.type === "functionCall";
if (!lastResponseIsFunctionCall)
return;
const currentResponseSegmentsStack = SegmentHandler.getStackFromModelResponse(lastModelResponseItem.response);
if (currentResponseSegmentsStack.includes("thought"))
return;
const hadThoughtSegments = this.resolvedHistory.some((chatItem) => {
if (chatItem.type !== "model")
return false;
return chatItem.response.some((responseItem) => {
if (typeof responseItem === "string")
return false;
return responseItem.type === "segment" && responseItem.segmentType === "thought";
});
});
if (!hadThoughtSegments)
return;
this.segmentHandler.openSegment("thought");
}
ensureNotAborted() {
if (this.signal?.aborted && (!this.stopOnAbortSignal || this.res.length === 0))
throw this.signal.reason;
if (this.llamaChat.disposed)
throw new DisposedError();
}
getPenaltyTokens() {
if (this.llamaChat.disposed)
return [];
let punishTokens = this.res.slice(-this.resolvedRepeatPenalty.lastTokens);
if (this.resolvedRepeatPenalty.punishTokensFilter != null)
punishTokens = this.resolvedRepeatPenalty.punishTokensFilter(punishTokens);
if (this.resolvedRepeatPenalty.penalizeNewLine == null || !this.resolvedRepeatPenalty.penalizeNewLine) {
const nlToken = this.llamaChat.model.tokens.nl;
if (nlToken != null)
punishTokens = punishTokens.filter((token) => token !== nlToken);
}
return punishTokens;
}
getResolvedHistoryWithCurrentModelResponse() {
return mergeGeneratedResultWithChatHistory("model", this.resolvedHistory, this.segmentHandler.getModelResponseSegments());
}
getContextWindowsHistoryWithCurrentModelResponse() {
return mergeGeneratedResultWithChatHistory("model", this.lastContextWindowHistory, this.segmentHandler.getContextWindowModelResponseSegments());
}
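// Removes chat-wrapper-declared "text to ignore at the start of a response" from the
// pending tokens once it has fully matched, preserving any generation that follows it.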
removeFoundStartIgnoreTextsFromPendingTokens(forceRemove = false) {
if (!this.removedStartTextToIgnore && this.res.length === 0 && this.pendingTokens.length > 0 &&
this.ignoreStartTextDetector.hasTriggeredStops && (forceRemove || !this.ignoreStartTextDetector.hasInProgressStops)) {
this.ignoreStartTextDetector.clearInProgressStops();
this.ignoreStartTextDetector.clearTriggeredStops();
let mostExhaustiveTriggeredStops = null;
let mostExhaustiveTriggeredStopsLeftoverTokens = [];
const lastTokensForDetokenizer = resolveLastTokens([
this.contextWindowTokens,
this.ignoredStartTextTokens
]);
const pendingPartialTokens = [];
for (let i = 0; i < this.pendingTokens.length; i++) {
const currentToken = this.pendingTokens[i];
const tokens = [...pendingPartialTokens, currentToken];
const text = this.llamaChat.model.detokenize(tokens, false, lastTokensForDetokenizer);
if (pendingPartialTokens.length === 0 &&
text.endsWith(UNKNOWN_UNICODE_CHAR) &&
!this.llamaChat.model.isSpecialToken(currentToken) &&
!this.llamaChat.model.isEogToken(currentToken)) {
pendingPartialTokens.length = 0;
pushAll(pendingPartialTokens, tokens);
continue;
}
this.ignoreStartTextDetector.recordGeneration({
text: this.llamaChat.model.detokenize(tokens, false, lastTokensForDetokenizer),
tokens,
startNewChecks: i === 0,
triggerMustStartWithGeneration: true
});
pushAll(lastTokensForDetokenizer, tokens);
if (this.ignoreStartTextDetector.hasTriggeredStops) {
mostExhaustiveTriggeredStops = this.ignoreStartTextDetector.getTriggeredStops();
this.ignoreStartTextDetector.clearTriggeredStops();
mostExhaustiveTriggeredStopsLeftoverTokens = this.pendingTokens.slice(i + 1);
}
else if (!this.ignoreStartTextDetector.hasInProgressStops)
break;
}
if (mostExhaustiveTriggeredStops != null) {
const [mostExhaustiveTriggeredStop] = mostExhaustiveTriggeredStops;
if (mostExhaustiveTriggeredStop != null) {
this.ignoredStartTextTokens = mostExhaustiveTriggeredStop.stopTrigger
.map((stopTrigger) => {
if (typeof stopTrigger === "string")
return this.llamaChat.model.tokenize(stopTrigger, false, "trimLeadingSpace");
else
return [stopTrigger];
})
.flat(1);
const newPendingTokens = [
...mostExhaustiveTriggeredStop.remainingGeneration,
mostExhaustiveTriggeredStopsLeftoverTokens
]
.map((generation) => {
if (typeof generation === "string")
return this.llamaChat.model.tokenize(generation, false, "trimLeadingSpace");
else
return generation;
})
.flat(1);
this.pendingTokens.length = 0;
pushAll(this.pendingTokens, newPendingTokens);
this.removedStartTextToIgnore = true;
}
}
}
}
startTokenLoop() {
this.ensureNotAborted();
this.shouldContextShift = false;
}
getContextWindowFunctionCallsTokens() {
if (this.functionEvaluationMode === false)
return [];
else if (this.functionEvaluationMode === "prefixOrDisengage")
return [
...LlamaText(this.currentFunctionCallPreviousText).tokenize(this.llamaChat.model.tokenizer, "trimLeadingSpace"),
...this.currentFunctionCallCurrentPartTokens
];
const text = [];
if (this.chatWrapper.settings.functions?.paralle