node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
JavaScript
import { DisposeAggregator, DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { appendUserMessageToChatHistory } from "../../utils/appendUserMessageToChatHistory.js";
import { LlamaChat } from "../LlamaChat/LlamaChat.js";
import { wrapAbortSignal } from "../../utils/wrapAbortSignal.js";
import { safeEventCallback } from "../../utils/safeEventCallback.js";
import { LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
/**
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
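* @example
* A minimal usage sketch based on the library's documented flow (the model path is a placeholder; adjust it to your setup):
* ```javascript
* import {getLlama, LlamaChatSession} from "node-llama-cpp";
*
* const llama = await getLlama();
* const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
* const context = await model.createContext();
* const session = new LlamaChatSession({
*     contextSequence: context.getSequence()
* });
*
* const answer = await session.prompt("What is the capital of France?");
* console.log(answer);
* ```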
*/
export class LlamaChatSession {
/** @internal */ _disposeAggregator = new DisposeAggregator();
/** @internal */ _autoDisposeSequence;
/** @internal */ _contextShift;
/** @internal */ _forceAddSystemPrompt;
/** @internal */ _systemPrompt;
/** @internal */ _chatLock = {};
/** @internal */ _chatHistory;
/** @internal */ _lastEvaluation;
/** @internal */ _chat;
/** @internal */ _chatHistoryStateRef = {};
/** @internal */ _preloadAndCompleteAbortControllers = new Set();
onDispose = new EventRelay();
constructor(options) {
const { contextSequence, chatWrapper = "auto", systemPrompt, forceAddSystemPrompt = false, autoDisposeSequence = false, contextShift } = options;
if (contextSequence == null)
throw new Error("contextSequence cannot be null");
if (contextSequence.disposed)
throw new DisposedError();
this._contextShift = contextShift;
this._forceAddSystemPrompt = forceAddSystemPrompt;
this._systemPrompt = systemPrompt;
this._chat = new LlamaChat({
autoDisposeSequence,
chatWrapper,
contextSequence
});
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
this._chatHistory = this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt });
else
this._chatHistory = [];
this._autoDisposeSequence = autoDisposeSequence;
this._disposeAggregator.add(this._chat.onDispose.createListener(() => {
this.dispose();
}));
this._disposeAggregator.add(this.onDispose.dispatchEvent);
}
dispose({ disposeSequence = this._autoDisposeSequence } = {}) {
if (this._chat == null)
return;
this._chat.dispose({ disposeSequence });
this._chat = null;
this._disposeAggregator.dispose();
}
/** @hidden */
[Symbol.dispose]() {
return this.dispose();
}
get disposed() {
return this._chat == null || this._chat.disposed;
}
get chatWrapper() {
if (this._chat == null)
throw new DisposedError();
return this._chat.chatWrapper;
}
get sequence() {
if (this._chat == null)
throw new DisposedError();
return this._chat.sequence;
}
get context() {
return this.sequence.context;
}
get model() {
return this.sequence.model;
}
async prompt(prompt, options = {}) {
const { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers } = options;
const { responseText } = await this.promptWithMeta(prompt, {
// this is a workaround to allow passing both `functions` and `grammar`
functions: functions,
documentFunctionParams: documentFunctionParams,
maxParallelFunctionCalls: maxParallelFunctionCalls,
onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar,
trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers
});
return responseText;
}
/**
* @param prompt
* @param [options]
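* @example
* A hedged sketch of prompting with metadata and a JSON schema grammar; it assumes a `llama` instance (the same one that loaded the model) and an existing `session`, and the result shape follows the return statements below:
* ```javascript
* // constrain the output to a JSON schema (the grammar must be created by the same Llama instance as the model)
* const grammar = await llama.createGrammarForJsonSchema({
*     type: "object",
*     properties: {answer: {type: "string"}}
* });
*
* const {responseText, stopReason} = await session.promptWithMeta("Answer in JSON: what is 2 + 2?", {
*     grammar,
*     maxTokens: 128
* });
* console.log(stopReason, responseText);
* ```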
*/
async promptWithMeta(prompt, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal = false, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority } = {}) {
this._ensureNotDisposed();
if (grammar != null && grammar._llama !== this.model._llama)
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
this._stopAllPreloadAndPromptCompletions();
return await withLock(this._chatLock, "evaluation", signal, async () => {
this._ensureNotDisposed();
this._stopAllPreloadAndPromptCompletions();
if (this._chat == null)
throw new DisposedError();
const supportsParallelFunctionCalling = this._chat.chatWrapper.settings.functions.parallelism != null;
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
let lastEvaluation = this._lastEvaluation;
let newChatHistory = appendUserMessageToChatHistory(this._chatHistory, prompt);
let newContextWindowChatHistory = lastEvaluation?.contextWindow == null
? undefined
: appendUserMessageToChatHistory(lastEvaluation?.contextWindow, prompt);
const resolvedResponsePrefix = (responsePrefix != null && responsePrefix !== "")
? responsePrefix
: undefined;
newChatHistory.push({
type: "model",
response: resolvedResponsePrefix != null
? [resolvedResponsePrefix]
: []
});
if (newContextWindowChatHistory != null)
newContextWindowChatHistory.push({
type: "model",
response: resolvedResponsePrefix != null
? [resolvedResponsePrefix]
: []
});
if (resolvedResponsePrefix != null) {
safeEventCallback(onToken)?.(this.model.tokenize(resolvedResponsePrefix));
safeEventCallback(onTextChunk)?.(resolvedResponsePrefix);
safeEventCallback(onResponseChunk)?.({
type: undefined,
segmentType: undefined,
text: resolvedResponsePrefix,
tokens: this.model.tokenize(resolvedResponsePrefix)
});
}
try {
while (true) {
const functionCallsAndResults = [];
let canThrowFunctionCallingErrors = false;
let abortedOnFunctionCallError = false;
const initialOutputTokens = this._chat.sequence.tokenMeter.usedOutputTokens;
const { lastEvaluation: currentLastEvaluation, metadata } = await this._chat.generateResponse(newChatHistory, {
functions,
documentFunctionParams,
maxParallelFunctionCalls,
grammar: grammar, // this is a workaround to allow passing both `functions` and `grammar`
onTextChunk: safeEventCallback(onTextChunk),
onToken: safeEventCallback(onToken),
onResponseChunk: safeEventCallback(onResponseChunk),
signal: abortController.signal,
stopOnAbortSignal,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens,
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: newContextWindowChatHistory,
minimumOverlapPercentageToPreventContextShift: 0.5
},
onFunctionCall: async (functionCall) => {
functionCallsAndResults.push((async () => {
try {
const functionDefinition = functions?.[functionCall.functionName];
if (functionDefinition == null)
throw new Error(`The model tried to call function "${functionCall.functionName}" which is not defined`);
const functionCallResult = await functionDefinition.handler(functionCall.params);
return {
functionCall,
functionDefinition,
functionCallResult
};
}
catch (err) {
if (!abortController.signal.aborted) {
abortedOnFunctionCallError = true;
abortController.abort(err);
}
if (canThrowFunctionCallingErrors)
throw err;
return null;
}
})());
}
});
this._ensureNotDisposed();
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
throw abortController.signal.reason;
if (maxTokens != null)
maxTokens = Math.max(0, maxTokens - (this._chat.sequence.tokenMeter.usedOutputTokens - initialOutputTokens));
lastEvaluation = currentLastEvaluation;
newChatHistory = lastEvaluation.cleanHistory;
if (functionCallsAndResults.length > 0) {
canThrowFunctionCallingErrors = true;
const functionCallResultsPromise = Promise.all(functionCallsAndResults);
const raceEventAbortController = new AbortController();
await Promise.race([
functionCallResultsPromise,
new Promise((accept, reject) => {
abortController.signal.addEventListener("abort", () => {
if (abortedOnFunctionCallError || !stopOnAbortSignal)
reject(abortController.signal.reason);
else
accept();
}, { signal: raceEventAbortController.signal });
if (abortController.signal.aborted) {
if (abortedOnFunctionCallError || !stopOnAbortSignal)
reject(abortController.signal.reason);
else
accept();
}
})
]);
raceEventAbortController.abort();
this._ensureNotDisposed();
if (!abortController.signal.aborted) {
const functionCallResults = (await functionCallResultsPromise)
.filter((result) => result != null);
this._ensureNotDisposed();
if (abortController.signal.aborted && (abortedOnFunctionCallError || !stopOnAbortSignal))
throw abortController.signal.reason;
newContextWindowChatHistory = lastEvaluation.contextWindow;
let startNewChunk = supportsParallelFunctionCalling;
for (const { functionCall, functionDefinition, functionCallResult } of functionCallResults) {
newChatHistory = addFunctionCallToChatHistory({
chatHistory: newChatHistory,
functionName: functionCall.functionName,
functionDescription: functionDefinition.description,
callParams: functionCall.params,
callResult: functionCallResult,
rawCall: functionCall.raw,
startsNewChunk: startNewChunk
});
newContextWindowChatHistory = addFunctionCallToChatHistory({
chatHistory: newContextWindowChatHistory,
functionName: functionCall.functionName,
functionDescription: functionDefinition.description,
callParams: functionCall.params,
callResult: functionCallResult,
rawCall: functionCall.raw,
startsNewChunk: startNewChunk
});
startNewChunk = false;
}
lastEvaluation.cleanHistory = newChatHistory;
lastEvaluation.contextWindow = newContextWindowChatHistory;
if (abortController.signal.aborted && !abortedOnFunctionCallError && stopOnAbortSignal) {
metadata.stopReason = "abort";
metadata.remainingGenerationAfterStop = undefined;
}
else
continue;
}
}
this._lastEvaluation = lastEvaluation;
this._chatHistory = newChatHistory;
this._chatHistoryStateRef = {};
const lastModelResponseItem = getLastModelResponseItem(newChatHistory);
const responseText = lastModelResponseItem.response
.filter((item) => typeof item === "string")
.join("");
if (metadata.stopReason === "customStopTrigger")
return {
response: lastModelResponseItem.response,
responseText,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
response: lastModelResponseItem.response,
responseText,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
}
}
finally {
disposeAbortController();
}
});
}
/**
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
* and feel faster.
*
* > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of prompts you preload
* @param prompt - the prompt to preload
* @param [options]
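* @example
* A sketch of warming up the context while the user is still typing (`userInput` is a hypothetical variable holding the draft prompt):
* ```javascript
* // process the prompt ahead of time so its state is ready in the context sequence
* await session.preloadPrompt(userInput);
*
* // when the user submits, the response can start generating sooner
* const response = await session.prompt(userInput);
* ```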
*/
async preloadPrompt(prompt, options = {}) {
await this.completePromptWithMeta(prompt, {
...options,
maxTokens: 0
});
}
/**
* Preload a user prompt into the current context sequence state and generate a completion for it.
*
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
* > so consider limiting the length of prompts you preload.
* >
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
* @param prompt - the prompt to preload
* @param [options]
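* @example
* A sketch of suggesting a continuation for a partially typed user prompt, capped with `maxTokens`:
* ```javascript
* const completion = await session.completePrompt("Write a short poem about", {
*     maxTokens: 40
* });
* console.log("Suggested continuation:", completion);
* ```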
*/
async completePrompt(prompt, options = {}) {
const { completion } = await this.completePromptWithMeta(prompt, options);
return completion;
}
/**
* Create a smart completion engine that caches the prompt completions
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
*
* Completions are generated and cached only for the current chat session state.
* You can create a single completion engine for an entire chat session.
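* @example
* A hedged sketch of using the engine for live typing suggestions; the `complete()` method and the `onGeneration` callback come from `LlamaChatSessionPromptCompletionEngine` and may differ in your version:
* ```javascript
* const completionEngine = session.createPromptCompletionEngine({
*     // called when a new completion is generated in the background
*     onGeneration(prompt, completion) {
*         console.log("Suggestion for", JSON.stringify(prompt), "->", completion);
*     }
* });
*
* // returns whatever cached completion is currently available (possibly an empty string)
* const currentSuggestion = completionEngine.complete("Hi! How a");
* ```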
*/
createPromptCompletionEngine(options) {
return LlamaChatSessionPromptCompletionEngine._create(this, options);
}
/**
* See `completePrompt` for more information.
* @param prompt
* @param [options]
*/
async completePromptWithMeta(prompt, { maxTokens, stopOnAbortSignal = false, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix = false, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority } = {}) {
this._ensureNotDisposed();
if (grammar != null) {
if (grammar._llama == null)
throw new Error("The grammar passed to this function is not a LlamaGrammar instance.");
else if (grammar._llama !== this.model._llama)
throw new Error("The LlamaGrammar used by passed to this function was created with a different Llama instance than the one used by this sequence's model. Make sure you use the same Llama instance for both the model and the grammar.");
}
const [abortController, disposeAbortController] = wrapAbortSignal(signal);
this._preloadAndCompleteAbortControllers.add(abortController);
try {
return await withLock(this._chatLock, "evaluation", abortController.signal, async () => {
this._ensureNotDisposed();
if (this._chat == null)
throw new DisposedError();
const { completion, lastEvaluation, metadata } = await this._chat.loadChatAndCompleteUserMessage(asWithLastUserMessageRemoved(this._chatHistory), {
initialUserPrompt: prompt,
functions,
documentFunctionParams,
grammar,
onTextChunk,
onToken,
signal: abortController.signal,
stopOnAbortSignal: true,
repeatPenalty,
minP,
topK,
topP,
seed,
tokenBias,
customStopTriggers,
maxTokens,
temperature,
trimWhitespaceSuffix,
contextShift: {
...this._contextShift,
lastEvaluationMetadata: this._lastEvaluation?.contextShiftMetadata
},
evaluationPriority,
lastEvaluationContextWindow: {
history: asWithLastUserMessageRemoved(this._lastEvaluation?.contextWindow),
minimumOverlapPercentageToPreventContextShift: 0.8
}
});
this._ensureNotDisposed();
this._lastEvaluation = {
cleanHistory: this._chatHistory,
contextWindow: lastEvaluation.contextWindow,
contextShiftMetadata: lastEvaluation.contextShiftMetadata
};
if (!stopOnAbortSignal && metadata.stopReason === "abort" && abortController.signal?.aborted)
throw abortController.signal.reason;
if (metadata.stopReason === "customStopTrigger")
return {
completion: completion,
stopReason: metadata.stopReason,
customStopTrigger: metadata.customStopTrigger,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
return {
completion: completion,
stopReason: metadata.stopReason,
remainingGenerationAfterStop: metadata.remainingGenerationAfterStop
};
});
}
finally {
this._preloadAndCompleteAbortControllers.delete(abortController);
disposeAbortController();
}
}
getChatHistory() {
return structuredClone(this._chatHistory);
}
getLastEvaluationContextWindow() {
if (this._lastEvaluation == null)
return null;
return structuredClone(this._lastEvaluation?.contextWindow);
}
setChatHistory(chatHistory) {
this._chatHistory = structuredClone(chatHistory);
this._chatHistoryStateRef = {};
this._lastEvaluation = undefined;
}
/** Clear the chat history and reset it to the initial state. */
resetChatHistory() {
if (this._chat == null || this.disposed)
throw new DisposedError();
const chatWrapperSupportsSystemMessages = this._chat.chatWrapper.settings.supportsSystemMessages;
if (chatWrapperSupportsSystemMessages == null || chatWrapperSupportsSystemMessages || this._forceAddSystemPrompt)
this.setChatHistory(this._chat.chatWrapper.generateInitialChatHistory({ systemPrompt: this._systemPrompt }));
else
this.setChatHistory([]);
}
/** @internal */
_stopAllPreloadAndPromptCompletions() {
for (const abortController of this._preloadAndCompleteAbortControllers)
abortController.abort();
this._preloadAndCompleteAbortControllers.clear();
}
/** @internal */
_ensureNotDisposed() {
if (this.disposed)
throw new DisposedError();
}
}
function addFunctionCallToChatHistory({ chatHistory, functionName, functionDescription, callParams, callResult, rawCall, startsNewChunk }) {
const newChatHistory = chatHistory.slice();
if (newChatHistory.length === 0 || newChatHistory[newChatHistory.length - 1].type !== "model")
newChatHistory.push({
type: "model",
response: []
});
const lastModelResponseItem = newChatHistory[newChatHistory.length - 1];
const newLastModelResponseItem = { ...lastModelResponseItem };
newChatHistory[newChatHistory.length - 1] = newLastModelResponseItem;
const modelResponse = newLastModelResponseItem.response.slice();
newLastModelResponseItem.response = modelResponse;
const functionCall = {
type: "functionCall",
name: functionName,
description: functionDescription,
params: callParams,
result: callResult,
rawCall
};
if (startsNewChunk)
functionCall.startsNewChunk = true;
modelResponse.push(functionCall);
return newChatHistory;
}
function getLastModelResponseItem(chatHistory) {
if (chatHistory.length === 0 || chatHistory[chatHistory.length - 1].type !== "model")
throw new Error("Expected chat history to end with a model response");
return chatHistory[chatHistory.length - 1];
}
function asWithLastUserMessageRemoved(chatHistory) {
if (chatHistory == null)
return chatHistory;
const newChatHistory = chatHistory.slice();
while (newChatHistory.at(-1)?.type === "user")
newChatHistory.pop();
return newChatHistory;
}
//# sourceMappingURL=LlamaChatSession.js.map