node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model's output at the generation level.
TypeScript
import { EventRelay } from "lifecycle-utils";
import { ChatWrapper } from "../../ChatWrapper.js";
import { LlamaContextSequence } from "../LlamaContext/LlamaContext.js";
import { ChatHistoryItem, ChatModelFunctions, ChatModelSegmentType, LLamaContextualRepeatPenalty, Token, Tokenizer } from "../../types.js";
import { GbnfJsonSchemaToType } from "../../utils/gbnfJson/types.js";
import { LlamaGrammar } from "../LlamaGrammar.js";
import { LlamaText, LlamaTextJSON } from "../../utils/LlamaText.js";
import { EvaluationPriority } from "../LlamaContext/types.js";
import { TokenBias } from "../TokenBias.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
export type LlamaChatOptions = {
contextSequence: LlamaContextSequence;
/** `"auto"` is used by default */
chatWrapper?: "auto" | ChatWrapper;
/**
* Automatically dispose the sequence when the session is disposed
*
* Defaults to `false`.
*/
autoDisposeSequence?: boolean;
};
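/**
 * Example (not part of the original declarations): a minimal construction sketch for the
 * `LlamaChatOptions` above. It assumes the package's usual loading flow (`getLlama()`,
 * `llama.loadModel()`, `model.createContext()`) and a local `.gguf` model path; adapt both
 * to your setup.
 *
 * ```typescript
 * import {getLlama, LlamaChat} from "node-llama-cpp";
 *
 * const llama = await getLlama();
 * const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
 * const context = await model.createContext();
 *
 * const chat = new LlamaChat({
 *     contextSequence: context.getSequence(),
 *     autoDisposeSequence: true // dispose the sequence together with this `LlamaChat` instance
 * });
 * ```
 */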
export type LlamaChatResponseChunk = LlamaChatResponseTextChunk | LlamaChatResponseSegmentChunk;
export type LlamaChatResponseTextChunk = {
/** When `type` is `undefined`, the chunk is part of the main response and is not a segment */
type: undefined;
/**
* `segmentType` has no purpose when `type` is `undefined` (meaning that this chunk is part of the main response and is not a segment).
*/
segmentType: undefined;
/**
* The generated text chunk.
*
* Detokenized from the `tokens` property,
* but with the context of the previous generation (for better spacing of the text with some models).
*
* Prefer using this property over `tokens` when streaming the generated response as text.
*/
text: string;
/** The generated tokens */
tokens: Token[];
};
export type LlamaChatResponseSegmentChunk = {
type: "segment";
/** Segment type */
segmentType: ChatModelSegmentType;
/**
* The generated text chunk.
*
* Detokenized from the `tokens` property,
* but with the context of the previous generation (for better spacing of the text with some models).
*
* Prefer using this property over `tokens` when streaming the generated response as text.
*/
text: string;
/** The generated tokens */
tokens: Token[];
/**
* When the current chunk is the start of a segment, this field will be set.
*
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
* to signify that the segment has started.
*/
segmentStartTime?: Date;
/**
* When the current chunk is the last one of a segment (meaning the current segment has ended), this field will be set.
*
* It's possible that a chunk with no tokens and empty text will be emitted just to set this field
* to signify that the segment has ended.
*/
segmentEndTime?: Date;
};
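/**
 * Example (not part of the original declarations): a sketch of consuming both chunk shapes above
 * while streaming. It assumes a `chat` instance built as in the previous sketch and that the chat
 * history ends with a plain user message (`{type: "user", text: ...}`).
 *
 * ```typescript
 * await chat.generateResponse([
 *     {type: "user", text: "Explain recursion in one paragraph"}
 * ], {
 *     onResponseChunk(chunk) {
 *         if (chunk.type === "segment")
 *             // segment output, such as chain-of-thought ("thought") text
 *             process.stdout.write(`[${chunk.segmentType}] ${chunk.text}`);
 *         else
 *             // main response text
 *             process.stdout.write(chunk.text);
 *     }
 * });
 * ```
 */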
export type LLamaChatGenerateResponseOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Called as the model generates the main response with the generated text chunk.
*
* Useful for streaming the generated response as it's being generated.
*
* Includes only the main response without any text segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onTextChunk?: (text: string) => void;
/**
* Called as the model generates the main response with the generated tokens.
*
* Preferably, you'd want to use {@link onTextChunk `onTextChunk`} instead of this.
*
* Includes only the main response without any segments (like thoughts).
* For streaming the response with segments, use {@link onResponseChunk `onResponseChunk`}.
*/
onToken?: (tokens: Token[]) => void;
/**
* Called as the model generates a response with the generated text and tokens,
* including segment information (when the generated output is part of a segment).
*
* Useful for streaming the generated response as it's being generated, including the main response and all segments.
*
* Only use this function when you need the segmented texts, like thought segments (chain of thought text).
*/
onResponseChunk?: (chunk: LlamaChatResponseChunk) => void;
signal?: AbortSignal;
/**
* When a response already started being generated and then the signal is aborted,
* the generation will stop and the response will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
maxTokens?: number;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
*
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
*
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
*
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* Disabled by default (set to `0`).
*/
temperature?: number;
/**
* From the next token candidates, discard the percentage of tokens with the lowest probability.
* For example, if set to `0.05`, 5% of the lowest probability tokens will be discarded.
* This is useful for generating more high-quality results when using a high temperature.
* Set to a value between `0` and `1` to enable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* Disabled by default.
*/
minP?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
*/
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
*/
topP?: number;
/**
* Used to control the randomness of the generated text.
*
* Change the seed to get different results.
*
* Only relevant when using `temperature`.
*/
seed?: number;
/**
* Trim whitespace from the end of the generated text
*
* Defaults to `false`.
*/
trimWhitespaceSuffix?: boolean;
repeatPenalty?: false | LLamaContextualRepeatPenalty;
/**
* Adjust the probability of tokens being generated.
* Can be used to bias the model to generate tokens that you want it to lean towards,
* or to avoid generating tokens that you want it to avoid.
*/
tokenBias?: TokenBias | (() => TokenBias);
/**
* See the parameter `evaluationPriority` on the `LlamaContextSequence.evaluate()` function for more information.
*/
evaluationPriority?: EvaluationPriority;
contextShift?: LLamaChatContextShiftOptions;
/**
* Custom stop triggers to stop the generation of the response when any of the provided triggers are found.
*/
customStopTriggers?: readonly (LlamaText | string | readonly (string | Token)[])[];
/**
* The evaluation context window returned from the last evaluation.
* This is an optimization to utilize existing context sequence state better when possible.
*/
lastEvaluationContextWindow?: {
/** The history of the last evaluation. */
history?: ChatHistoryItem[];
/**
* Minimum overlap percentage with existing context sequence state to use the last evaluation context window.
* If the last evaluation context window is not used, a new context will be generated based on the full history,
* which will decrease the likelihood of another context shift happening so soon.
*
* A number between `0` (exclusive) and `1` (inclusive).
*/
minimumOverlapPercentageToPreventContextShift?: number;
};
} & ({
grammar?: LlamaGrammar;
functions?: never;
documentFunctionParams?: never;
maxParallelFunctionCalls?: never;
onFunctionCall?: never;
} | {
grammar?: never;
functions?: Functions | ChatModelFunctions;
documentFunctionParams?: boolean;
maxParallelFunctionCalls?: number;
onFunctionCall?: (functionCall: LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions ? Functions : ChatModelFunctions>) => void;
});
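/**
 * Example (not part of the original declarations): a hedged sketch of calling `generateResponse()`
 * with common sampling and control options from the type above. The values are illustrative, and
 * `history` is assumed to be a `ChatHistoryItem[]`. Note that `grammar` and `functions` belong to
 * mutually exclusive branches of the options type, so pass only one of them per call.
 *
 * ```typescript
 * const abortController = new AbortController();
 *
 * const {response} = await chat.generateResponse(history, {
 *     temperature: 0.8, // `0` (the default) disables sampling randomness
 *     minP: 0.05,
 *     topK: 40,
 *     topP: 0.95,
 *     seed: 1234,
 *     maxTokens: 512,
 *     signal: abortController.signal,
 *     stopOnAbortSignal: true, // return the partial response instead of throwing on abort
 *     onTextChunk(text) {
 *         process.stdout.write(text);
 *     }
 * });
 * ```
 */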
export type LLamaChatLoadAndCompleteUserMessageOptions<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* Complete the given user prompt without adding it or the completion to the returned context window.
*/
initialUserPrompt?: string;
/**
* When a completion already started being generated and then the signal is aborted,
* the generation will stop and the completion will be returned as is instead of throwing an error.
*
* Defaults to `false`.
*/
stopOnAbortSignal?: boolean;
/**
* Called as the model generates a completion with the generated text chunk.
*
* Useful for streaming the generated completion as it's being generated.
*/
onTextChunk?: LLamaChatGenerateResponseOptions<Functions>["onTextChunk"];
/**
* Called as the model generates a completion with the generated tokens.
*
* Preferably, you'd want to use `onTextChunk` instead of this.
*/
onToken?: LLamaChatGenerateResponseOptions<Functions>["onToken"];
signal?: LLamaChatGenerateResponseOptions<Functions>["signal"];
maxTokens?: LLamaChatGenerateResponseOptions<Functions>["maxTokens"];
temperature?: LLamaChatGenerateResponseOptions<Functions>["temperature"];
minP?: LLamaChatGenerateResponseOptions<Functions>["minP"];
topK?: LLamaChatGenerateResponseOptions<Functions>["topK"];
topP?: LLamaChatGenerateResponseOptions<Functions>["topP"];
seed?: LLamaChatGenerateResponseOptions<Functions>["seed"];
trimWhitespaceSuffix?: LLamaChatGenerateResponseOptions<Functions>["trimWhitespaceSuffix"];
repeatPenalty?: LLamaChatGenerateResponseOptions<Functions>["repeatPenalty"];
tokenBias?: LLamaChatGenerateResponseOptions<Functions>["tokenBias"];
evaluationPriority?: LLamaChatGenerateResponseOptions<Functions>["evaluationPriority"];
contextShift?: LLamaChatGenerateResponseOptions<Functions>["contextShift"];
customStopTriggers?: LLamaChatGenerateResponseOptions<Functions>["customStopTriggers"];
lastEvaluationContextWindow?: LLamaChatGenerateResponseOptions<Functions>["lastEvaluationContextWindow"];
grammar?: LlamaGrammar;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same functions that were used for the previous prompt here.
*/
functions?: Functions | ChatModelFunctions;
/**
* Functions are not used by the model here,
* but are used for keeping the instructions given to the model about the functions in the current context state,
* to avoid context shifts.
*
* It's best to provide the same value that was used for the previous prompt here.
*/
documentFunctionParams?: boolean;
};
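/**
 * Example (not part of the original declarations): a sketch of completing a partially typed user
 * message with `loadChatAndCompleteUserMessage()`. It assumes the `chat` and `history` values from
 * the previous sketches; the prompt text is illustrative.
 *
 * ```typescript
 * const {completion} = await chat.loadChatAndCompleteUserMessage(history, {
 *     initialUserPrompt: "Write a haiku about ",
 *     maxTokens: 40,
 *     onTextChunk(text) {
 *         process.stdout.write(text);
 *     }
 * });
 *
 * console.log("Suggested completion:", completion);
 * ```
 */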
export type LLamaChatContextShiftOptions = {
/**
* The number of tokens to delete from the context window to make space for new ones.
* Defaults to 10% of the context size.
*/
size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
/**
* The strategy to use when deleting tokens from the context window.
*
* Defaults to `"eraseFirstResponseAndKeepFirstSystem"`.
*/
strategy?: "eraseFirstResponseAndKeepFirstSystem" | ((options: {
/** Full chat history */
chatHistory: readonly ChatHistoryItem[];
/** Maximum number of tokens that the new chat history should fit under when tokenized */
maxTokensCount: number;
/** Tokenizer used to tokenize the chat history */
tokenizer: Tokenizer;
/** Chat wrapper used to generate the context state */
chatWrapper: ChatWrapper;
/**
* The metadata returned from the last context shift strategy call.
* Will be `null` on the first call.
*/
lastShiftMetadata?: object | null;
}) => {
chatHistory: ChatHistoryItem[];
metadata?: object | null;
} | Promise<{
chatHistory: ChatHistoryItem[];
metadata?: object | null;
}>);
/**
* The `contextShiftMetadata` returned from the last evaluation.
* This is an optimization to utilize the existing context state better when possible.
*/
lastEvaluationMetadata?: object | undefined | null;
};
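/**
 * Example (not part of the original declarations): a hedged sketch of overriding the context shift
 * behavior described above. The custom strategy below is intentionally naive (it drops the oldest
 * non-system item and tracks a shift counter in `metadata`); a real strategy should use the
 * provided `tokenizer` and `chatWrapper` to make sure the returned history fits under
 * `maxTokensCount`. The `size` callback also assumes the sequence exposes its context via
 * `sequence.context`.
 *
 * ```typescript
 * const contextShift: LLamaChatContextShiftOptions = {
 *     // free a quarter of the context instead of the default 10%
 *     size: (sequence) => Math.floor(sequence.context.contextSize / 4),
 *     strategy({chatHistory, lastShiftMetadata}) {
 *         const shifts = ((lastShiftMetadata as {shifts?: number} | null)?.shifts ?? 0) + 1;
 *         const oldestDroppableIndex = chatHistory.findIndex((item) => item.type !== "system");
 *
 *         return {
 *             chatHistory: chatHistory.filter((_, index) => index !== oldestDroppableIndex),
 *             metadata: {shifts}
 *         };
 *     }
 * };
 *
 * await chat.generateResponse(history, {contextShift});
 * ```
 */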
export declare class LlamaChat {
readonly onDispose: EventRelay<void>;
constructor({ contextSequence, chatWrapper, autoDisposeSequence }: LlamaChatOptions);
dispose({ disposeSequence }?: {
disposeSequence?: boolean;
}): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get chatWrapper(): ChatWrapper;
get sequence(): LlamaContextSequence;
get context(): import("../LlamaContext/LlamaContext.js").LlamaContext;
get model(): LlamaModel;
generateResponse<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatGenerateResponseOptions<Functions>): Promise<LlamaChatResponse<Functions>>;
loadChatAndCompleteUserMessage<const Functions extends ChatModelFunctions | undefined = undefined>(history: ChatHistoryItem[], options?: LLamaChatLoadAndCompleteUserMessageOptions<Functions>): Promise<LlamaChatLoadAndCompleteUserResponse>;
}
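/**
 * Example (not part of the original declarations): a sketch of cleaning up a `LlamaChat` instance.
 * `dispose()` can optionally dispose the underlying sequence as well, and the class also exposes
 * `Symbol.dispose` for explicit resource management.
 *
 * ```typescript
 * try {
 *     const {response} = await chat.generateResponse(history);
 *     console.log(response);
 * } finally {
 *     chat.dispose({disposeSequence: true});
 * }
 * ```
 */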
export type LlamaChatResponse<Functions extends ChatModelFunctions | undefined = undefined> = {
/**
* The response text only, _without_ any text segments (like thoughts).
*/
response: string;
/**
* The full response, including all text and text segments (like thoughts).
*/
fullResponse: Array<string | LlamaChatResponseSegment>;
functionCalls?: Functions extends ChatModelFunctions ? LlamaChatResponseFunctionCall<Functions>[] : never;
lastEvaluation: {
cleanHistory: ChatHistoryItem[];
contextWindow: ChatHistoryItem[];
contextShiftMetadata: any;
};
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "functionCalls" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};
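/**
 * Example (not part of the original declarations): a sketch of inspecting a `LlamaChatResponse`,
 * including the stop reason and the cleaned history. Reusing `lastEvaluation.cleanHistory` as the
 * history for the next turn is an assumption based on the field names above; verify it against the
 * package docs.
 *
 * ```typescript
 * const res = await chat.generateResponse(history, {maxTokens: 256});
 *
 * console.log("Response:", res.response);
 * console.log("Stop reason:", res.metadata.stopReason);
 *
 * if (res.metadata.stopReason === "customStopTrigger")
 *     console.log("Stopped on:", res.metadata.customStopTrigger);
 *
 * // candidate history for the next `generateResponse()` call
 * const nextHistory = res.lastEvaluation.cleanHistory;
 * ```
 */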
export type LlamaChatResponseFunctionCall<Functions extends ChatModelFunctions, FunctionCallName extends keyof Functions & string = string & keyof Functions, Params = Functions[FunctionCallName]["params"] extends undefined | null | void ? undefined : GbnfJsonSchemaToType<Functions[FunctionCallName]["params"]>> = {
functionName: FunctionCallName;
params: Params;
raw: LlamaTextJSON;
};
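/**
 * Example (not part of the original declarations): a hedged sketch of passing function definitions
 * to `generateResponse()` and reading the parsed calls back as `LlamaChatResponseFunctionCall`
 * objects. The `getWeather` function and its parameter schema are made up for illustration; check
 * the exact `ChatModelFunctions` schema shape against the package docs.
 *
 * ```typescript
 * const functions = {
 *     getWeather: {
 *         description: "Get the current weather for a city",
 *         params: {
 *             type: "object",
 *             properties: {
 *                 city: {type: "string"}
 *             }
 *         }
 *     }
 * } as const;
 *
 * const res = await chat.generateResponse(history, {
 *     functions,
 *     maxParallelFunctionCalls: 2,
 *     onFunctionCall(functionCall) {
 *         console.log("Model called", functionCall.functionName, "with", functionCall.params);
 *     }
 * });
 *
 * for (const call of res.functionCalls ?? [])
 *     console.log(call.functionName, call.params);
 * ```
 */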
export type LlamaChatResponseSegment = {
type: "segment";
segmentType: ChatModelSegmentType;
text: string;
ended: boolean;
raw: LlamaTextJSON;
startTime?: string;
endTime?: string;
};
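/**
 * Example (not part of the original declarations): a sketch of walking `fullResponse` (from
 * `LlamaChatResponse` above) to separate segments, such as thoughts, from plain response text.
 * It assumes a `res` returned by `generateResponse()`.
 *
 * ```typescript
 * for (const item of res.fullResponse) {
 *     if (typeof item === "string")
 *         console.log("text:", item);
 *     else if (item.type === "segment")
 *         console.log(`segment (${item.segmentType}, ended: ${item.ended}):`, item.text);
 * }
 * ```
 */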
export type LlamaChatLoadAndCompleteUserResponse = {
completion: string;
lastEvaluation: {
/**
* The completion and initial user prompt are not added to this context window result,
* but are loaded into the current context sequence state as tokens
*/
contextWindow: ChatHistoryItem[];
contextShiftMetadata: any;
};
metadata: {
remainingGenerationAfterStop?: string | Token[];
stopReason: "eogToken" | "stopGenerationTrigger" | "maxTokens" | "abort";
} | {
remainingGenerationAfterStop?: string | Token[];
stopReason: "customStopTrigger";
customStopTrigger: (string | Token)[];
};
};