node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { ChatHistoryItem, ChatModelFunctionCall, ChatSessionModelFunctions, Token } from "../../types.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LLamaChatContextShiftOptions, LlamaChatResponseChunk } from "../LlamaChat/LlamaChat.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaText } from "../../utils/LlamaText.js";
import { LLamaChatPromptCompletionEngineOptions, LlamaChatSessionPromptCompletionEngine } from "./utils/LlamaChatSessionPromptCompletionEngine.js";
export type LlamaChatSessionOptions = {
contextSequence: LlamaContextSequence;
/** `"auto"` is used by default */
chatWrapper?: "auto" | ChatWrapper;
systemPrompt?: string;
/**
* Add the system prompt even on models that don't support a system prompt.
*
* Each chat wrapper has its own workaround for adding a system prompt to a model that doesn't support it,
* but forcing the system prompt on unsupported models may not always work as expected.
*
* Use with caution.
*/
forceAddSystemPrompt?: boolean;
/**
* Automatically dispose the sequence when the session is disposed.
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
contextShift?: LlamaChatSessionContextShiftOptions;
};
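/*
 * Usage sketch for `LlamaChatSessionOptions` (illustrative only, not part of the declarations),
 * following the package's documented `getLlama()` loading flow; the model file path below is a placeholder.
 *
 *     import {getLlama, LlamaChatSession} from "node-llama-cpp";
 *
 *     const llama = await getLlama();
 *     const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
 *     const context = await model.createContext();
 *
 *     const session = new LlamaChatSession({
 *         contextSequence: context.getSequence(),
 *         systemPrompt: "You are a helpful assistant.",
 *         autoDisposeSequence: true // dispose the sequence together with the session
 *     });
 *
 *     console.log(await session.prompt("Hi there, how are you?"));
 */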
export type LlamaChatSessionContextShiftOptions = {
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
size?: LLamaChatContextShiftOptions["size"];
/**
* The strategy to use when deleting tokens from the context window.
*
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
*/
strategy?: LLamaChatContextShiftOptions["strategy"];
};
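/*
 * A minimal sketch (illustrative only) of configuring context shifting on a session.
 * It assumes a numeric `size` and the `context.contextSize` getter; the values are examples, not recommendations.
 *
 *     const session = new LlamaChatSession({
 *         contextSequence: context.getSequence(),
 *         contextShift: {
 *             size: Math.floor(context.contextSize / 10), // mirrors the ~10% default
 *             strategy: "eraseFirstResponseAndKeepFirstSystem"
 *         }
 *     });
 */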
export type LLamaChatPromptOptions<Functions extends ChatSessionModelFunctions | undefined = ChatSessionModelFunctions | undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
*
* Useful for streaming the generated response as it's being generated.
*
* Includes only the main response without any text segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onTextChunk?: (text: string) => void;
/**
* Called as the model generates the main response with the generated tokens.
*
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
*
* Includes only the main response without any segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onToken?: (tokens: Token[]) => void;
/**
* Called as the model generates a response with the generated text and tokens,
* including segment information (when the generated output is part of a segment).
*
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
*
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
*/
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
signal?: AbortSignal;
/**
* When a response already started being generated and then the signal is aborted,
* the generation will stop and the response will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
* Trim whitespace from the end of the generated text.
* Disabled by default.
*/
trimWhitespaceSuffix?: boolean;
/**
* Force a given text prefix to be the start of the model response, to make the model follow a certain direction.
*
* May cause some models to not use the given functions in some scenarios where they would have been used otherwise,
* so avoid using it together with function calling if you notice unexpected behavior.
*/
responsePrefix?: string;
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
repeatPenalty?: false | LlamaChatSessionRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
*/
customStopTriggers?: (LlamaText | string | (string | Token)[])[];
} & ({
grammar?: LlamaGrammar;
functions?: never;
documentFunctionParams?: never;
maxParallelFunctionCalls?: never;
} | {
grammar?: never;
functions?: Functions | ChatSessionModelFunctions;
documentFunctionParams?: boolean;
maxParallelFunctionCalls?: number;
});
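/*
 * Usage sketch for `LLamaChatPromptOptions` (illustrative only, reusing the `session` and `llama`
 * from the sketch above): streaming via `onTextChunk`, common sampling options, and the mutually
 * exclusive `grammar`/`functions` branches of the union. `defineChatSessionFunction` and
 * `llama.createGrammarForJsonSchema` are the package's documented helpers; the function and
 * JSON schema below are made-up examples.
 *
 *     import {defineChatSessionFunction} from "node-llama-cpp";
 *
 *     // Streaming with sampling options
 *     const haiku = await session.prompt("Write a haiku about alpacas", {
 *         temperature: 0.8,
 *         topK: 40,
 *         topP: 0.9,
 *         seed: 1234,
 *         maxTokens: 256,
 *         onTextChunk(chunk) {
 *             process.stdout.write(chunk);
 *         }
 *     });
 *
 *     // Function calling (cannot be combined with `grammar` in the same call)
 *     const functions = {
 *         getCurrentDate: defineChatSessionFunction({
 *             description: "Get the current date",
 *             handler() {
 *                 return new Date().toISOString();
 *             }
 *         })
 *     };
 *     const dateAnswer = await session.prompt("What's the date today?", {functions});
 *
 *     // Grammar-constrained output (cannot be combined with `functions` in the same call)
 *     const grammar = await llama.createGrammarForJsonSchema({
 *         type: "object",
 *         properties: {
 *             answer: {type: "string"}
 *         }
 *     });
 *     const jsonText = await session.prompt("Answer in JSON", {grammar});
 *     const parsed = grammar.parse(jsonText);
 */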
export type LLamaChatCompletePromptOptions = {
/**
* Generate a completion for the given user prompt up to the given number of tokens.
*
* Defaults to `256` or half the context size, whichever is smaller.
*/
maxTokens?: LLamaChatPromptOptions["maxTokens"];
/**
* When a completion already started being generated and then the given `signal` is aborted,
* the generation will stop and the completion will be returned as-is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: LLamaChatPromptOptions["stopOnAbortSignal"];
/**
* Called as the model generates a completion with the generated text chunk.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: LLamaChatPromptOptions["onTextChunk"];
/**
* Called as the model generates a completion with the generated tokens.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: LLamaChatPromptOptions["onToken"];
signal?: LLamaChatPromptOptions["signal"];
temperature?: LLamaChatPromptOptions["temperature"];
minP?: LLamaChatPromptOptions["minP"];
topK?: LLamaChatPromptOptions["topK"];
topP?: LLamaChatPromptOptions["topP"];
seed?: LLamaChatPromptOptions["seed"];
trimWhitespaceSuffix?: LLamaChatPromptOptions["trimWhitespaceSuffix"];
evaluationPriority?: LLamaChatPromptOptions["evaluationPriority"];
repeatPenalty?: LLamaChatPromptOptions["repeatPenalty"];
tokenBias?: LLamaChatPromptOptions["tokenBias"];
customStopTriggers?: LLamaChatPromptOptions["customStopTriggers"];
grammar?: LlamaGrammar;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same functions that were used for the previous prompt here.
*/
functions?: ChatSessionModelFunctions;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same value that was used for the previous prompt here.
*/
documentFunctionParams?: boolean;
};
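/*
 * A minimal sketch (illustrative only) of keeping the function documentation in the context state
 * consistent between a prompt and a later completion, to avoid a context shift.
 * `myFunctions`, `userQuestion` and `draftText` are placeholders; `myFunctions` stands for the
 * functions passed to the previous `prompt()` call.
 *
 *     await session.prompt(userQuestion, {functions: myFunctions});
 *     const completion = await session.completePrompt(draftText, {
 *         maxTokens: 128,
 *         functions: myFunctions,
 *         documentFunctionParams: true
 *     });
 */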
export type LLamaChatPreloadPromptOptions = {
signal?: LLamaChatCompletePromptOptions["signal"];
evaluationPriority?: LLamaChatCompletePromptOptions["evaluationPriority"];
functions?: LLamaChatCompletePromptOptions["functions"];
documentFunctionParams?: LLamaChatCompletePromptOptions["documentFunctionParams"];
};
export type LlamaChatSessionRepeatPenalty = {
/**
* The number of recent tokens generated by the model to which the repetition penalties are applied.
* Defaults to `64`.
*/
lastTokens?: number;
/**
 * Filter the recent tokens before the penalties are applied.
 *
 * Receives the recent tokens and returns the tokens to apply the penalties to.
 */
punishTokensFilter?: (tokens: Token[]) => Token[];
/**
* Penalize new line tokens.
* Enabled by default.
*/
penalizeNewLine?: boolean;
/**
* The relative amount to lower the probability of the tokens in `punishTokens` by.
* Defaults to `1.1`.
* Set to `1` to disable.
*/
penalty?: number;
/**
* For a token that appears `n` times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
frequencyPenalty?: number;
/**
* Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
* Disabled by default (`0`).
* Set to a value between `0` and `1` to enable.
*/
presencePenalty?: number;
};
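/*
 * A configuration sketch (illustrative only) using the repeat penalty options declared above;
 * the numbers are arbitrary examples, not tuned recommendations.
 *
 *     const story = await session.prompt("Tell me a story", {
 *         repeatPenalty: {
 *             lastTokens: 64,
 *             penalty: 1.1,
 *             frequencyPenalty: 0.2,
 *             presencePenalty: 0.2,
 *             penalizeNewLine: false
 *         }
 *     });
 */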
/**
* @see [Using `LlamaChatSession`](https://node-llama-cpp.withcat.ai/guide/chat-session) tutorial
*/
export declare class LlamaChatSession {
readonly onDispose: EventRelay<void>;
constructor(options: LlamaChatSessionOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get chatWrapper(): ChatWrapper;
get sequence(): LlamaContextSequence;
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
get model(): import("../LlamaModel/LlamaModel.js").LlamaModel;
prompt<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, options?: LLamaChatPromptOptions<Functions>): Promise<string>;
/**
* @param prompt
* @param [options]
*/
promptWithMeta<const Functions extends ChatSessionModelFunctions | undefined = undefined>(prompt: string, { functions, documentFunctionParams, maxParallelFunctionCalls, onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatPromptOptions<Functions>): Promise<{
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
responseText: string;
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
remainingGenerationAfterStop: string | Token[] | undefined;
} | {
response: (string | ChatModelFunctionCall | import("../../types.js").ChatModelSegment)[];
responseText: string;
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger" | "functionCalls";
remainingGenerationAfterStop: string | Token[] | undefined;
customStopTrigger?: undefined;
}>;
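/*
 * A minimal sketch (illustrative only) of inspecting the metadata returned by `promptWithMeta`,
 * based on the return type declared above:
 *
 *     const {responseText, stopReason, customStopTrigger} = await session.promptWithMeta("Hi there", {
 *         customStopTriggers: ["\n\n"]
 *     });
 *     if (stopReason === "customStopTrigger")
 *         console.log("Stopped on:", customStopTrigger);
 *     else
 *         console.log("Stopped due to:", stopReason, "| response:", responseText);
 */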
/**
* Preload a user prompt into the current context sequence state to make later inference of the model response begin sooner
* and feel faster.
*
* > **Note:** Preloading a long user prompt can incur context shifts, so consider limiting the length of the prompts you preload.
* @param prompt - the prompt to preload
* @param [options]
*/
preloadPrompt(prompt: string, options?: LLamaChatPreloadPromptOptions): Promise<void>;
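/*
 * A minimal sketch (illustrative only): preload the user's prompt while they are still reading
 * or typing, so the later `prompt()` call starts generating sooner.
 *
 *     const userPrompt = "Summarize the following article: ...";
 *     await session.preloadPrompt(userPrompt);
 *     // ...later, when the user confirms:
 *     const response = await session.prompt(userPrompt);
 */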
/**
* Preload a user prompt into the current context sequence state and generate a completion for it.
*
* > **Note:** Preloading a long user prompt and completing a user prompt with a high number of `maxTokens` can incur context shifts,
* > so consider limiting the length of prompts you preload.
* >
* > Also, it's recommended to limit the number of tokens generated to a reasonable amount by configuring `maxTokens`.
* @param prompt - the prompt to preload
* @param [options]
*/
completePrompt(prompt: string, options?: LLamaChatCompletePromptOptions): Promise<string>;
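/*
 * A minimal sketch (illustrative only): complete the text the user has typed so far,
 * e.g. to show an inline suggestion, while capping the generation with `maxTokens`.
 *
 *     const typedSoFar = "The three best things about alpacas are";
 *     const suggestion = await session.completePrompt(typedSoFar, {maxTokens: 48});
 *     console.log(typedSoFar + suggestion);
 */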
/**
* Create a smart completion engine that caches the prompt completions
* and reuses them when the user prompt matches the beginning of the cached prompt or completion.
*
* Completions are generated and their cache is used only for the current chat session state.
* You can create a single completion engine for an entire chat session.
*/
createPromptCompletionEngine(options?: LLamaChatPromptCompletionEngineOptions): LlamaChatSessionPromptCompletionEngine;
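/*
 * A usage sketch under stated assumptions: it assumes the returned engine exposes a `complete()`
 * method (as described in the package's prompt-completion guide) that returns the best cached
 * completion for the given text while generation continues in the background. This is illustrative,
 * not authoritative; check `LlamaChatSessionPromptCompletionEngine` for the exact API.
 *
 *     const completionEngine = session.createPromptCompletionEngine();
 *     // Call on every input change; returns the currently cached completion (possibly empty).
 *     const completion = completionEngine.complete("Hi there! How");
 */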
/**
* See `completePrompt` for more information.
* @param prompt
* @param [options]
*/
completePromptWithMeta(prompt: string, { maxTokens, stopOnAbortSignal, functions, documentFunctionParams, onTextChunk, onToken, signal, temperature, minP, topK, topP, seed, grammar, trimWhitespaceSuffix, repeatPenalty, tokenBias, customStopTriggers, evaluationPriority }?: LLamaChatCompletePromptOptions): Promise<{
completion: string;
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
remainingGenerationAfterStop: string | Token[] | undefined;
} | {
completion: string;
stopReason: "abort" | "maxTokens" | "eogToken" | "stopGenerationTrigger";
remainingGenerationAfterStop: string | Token[] | undefined;
customStopTrigger?: undefined;
}>;
getChatHistory(): ChatHistoryItem[];
getLastEvaluationContextWindow(): ChatHistoryItem[] | null;
setChatHistory(chatHistory: ChatHistoryItem[]): void;
/** Clear the chat history and reset it to the initial state. */
resetChatHistory(): void;
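/*
 * A minimal sketch (illustrative only): snapshot the chat history, restore it later,
 * or reset the session back to its initial state.
 *
 *     const snapshot = session.getChatHistory();
 *     // ...more prompts...
 *     session.setChatHistory(snapshot); // roll the session state back to the snapshot
 *     session.resetChatHistory();       // or clear it back to the initial state
 */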
}