node-llama-cpp

Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.

import { GbnfJsonSchema, GbnfJsonSchemaToType } from "./utils/gbnfJson/types.js";
import { LlamaText, BuiltinSpecialTokenValue, LlamaTextJSON } from "./utils/LlamaText.js";
import type { GgufFileInfo } from "./gguf/types/GgufFileInfoTypes.js";

export type Token = number & {
    __token: never;
};
export type Detokenizer = {
    detokenize(tokens: readonly Token[], specialTokens?: boolean, lastTokens?: readonly Token[]): string;
}["detokenize"];
export type Tokenizer = {
    tokenize(text: string, specialTokens?: boolean, options?: "trimLeadingSpace"): Token[];
    tokenize(text: BuiltinSpecialTokenValue, specialTokens: "builtin"): Token[];
}["tokenize"] & {
    readonly detokenize: Detokenizer;
    isSpecialToken(token: Token): boolean;
    isEogToken(token: Token): boolean;
};
export type ChatWrapperSettings = {
    readonly supportsSystemMessages: boolean;
    readonly functions: {
        readonly call: {
            readonly optionalPrefixSpace: boolean;
            readonly prefix: string | LlamaText;
            readonly paramsPrefix: string | LlamaText;
            readonly suffix: string | LlamaText;
            /**
             * The value to use when the function has no arguments.
             *
             * Will be stringified using `jsonDumps`.
             *
             * Defaults to `""`.
             */
            readonly emptyCallParamsPlaceholder?: object | string | number | boolean | null;
        };
        readonly result: {
            /**
             * Supported template parameters:
             * - <span v-pre>`{{functionName}}`</span>
             * - <span v-pre>`{{functionParams}}`</span>
             *
             * Template parameters can only appear in a string or a string in a `LlamaText`.
             *
             * Template parameters inside a `SpecialTokensText` inside a `LlamaText` won't be replaced.
             *
             * Example of supported values:
             * - `"text{{functionName}}text"`
             * - `LlamaText(["text{{functionName}}text"])`
             *
             * Example of unsupported values:
             * - `LlamaText([new SpecialTokensText("text{{functionName}}text")])`
             */
            readonly prefix: string | LlamaText;
            /**
             * Supported template parameters:
             * - <span v-pre>`{{functionName}}`</span>
             * - <span v-pre>`{{functionParams}}`</span>
             *
             * Template parameters can only appear in a string or a string in a `LlamaText`.
             *
             * Template parameters inside a `SpecialTokensText` inside a `LlamaText` won't be replaced.
             *
             * Example of **supported** values:
             * - `"text{{functionName}}text"`
             * - `LlamaText(["text{{functionName}}text"])`
             *
             * Example of **unsupported** values:
             * - `LlamaText([new SpecialTokensText("text{{functionName}}text")])`
             */
            readonly suffix: string | LlamaText;
        };
        /** If this field is present, parallel function calling is supported */
        readonly parallelism?: {
            readonly call: {
                readonly sectionPrefix: string | LlamaText;
                readonly betweenCalls?: string | LlamaText;
                readonly sectionSuffix?: string | LlamaText;
            };
            readonly result?: {
                readonly sectionPrefix?: string | LlamaText;
                readonly betweenResults?: string | LlamaText;
                readonly sectionSuffix?: string | LlamaText;
            };
        };
    };
    readonly segments?: {
        /** Consider all active segments to be closed when this text is detected */
        readonly closeAllSegments?: string | LlamaText;
        /**
         * After function calls, reiterate the stack of the active segments to remind the model of the context.
         *
         * Defaults to `false`.
         */
        readonly reiterateStackAfterFunctionCalls?: boolean;
        /** Chain of Thought text segment */
        readonly thought?: ChatWrapperSettingsSegment & {
            reopenAfterFunctionCalls?: boolean;
        };
        /**
         * Comment segment.
         *
         * Used by models such as gpt-oss.
         */
        readonly comment?: ChatWrapperSettingsSegment;
    };
};
export type ChatWrapperSettingsSegment = {
    readonly prefix: string | LlamaText;
    readonly suffix?: string | LlamaText;
};
export type ChatWrapperGenerateContextStateOptions = {
    chatHistory: readonly ChatHistoryItem[];
    availableFunctions?: ChatModelFunctions;
    documentFunctionParams?: boolean;
};
export type ChatWrapperCheckModelCompatibilityParams = {
    tokenizer?: Tokenizer;
    fileInfo?: GgufFileInfo;
};
export type ChatWrapperGeneratedContextState = ChatWrapperGeneratedPrefixTriggersContextState | ChatWrapperGeneratedInitiallyEngagedFunctionsContextState;
export type ChatWrapperGeneratedPrefixTriggersContextState = {
    /**
     * The rendered chat to load into the context sequence state
     */
    contextText: LlamaText;
    /**
     * Triggers to stop the generation
     */
    stopGenerationTriggers: LlamaText[];
    /**
     * When this option is set, after evaluating the `contextText`,
     * it'll look for any of the triggers to be the first generated output.
     *
     * When a trigger is matched, its type will determine the mode to enter, a segment to open,
     * or whether to continue the generation as a textual output.
     *
     * If none of the triggers are matched, the `noPrefixTrigger` will take effect.
     */
    prefixTriggers?: Array<{
        triggers: LlamaText[];
        /**
         * Enter into function calling mode.
         *
         * Entering this mode will put the function calling prefix into the context sequence state
         * and force it to choose a function to call.
         *
         * If no functions are available, this trigger will be ignored.
         */
        type: "functionCall";
        /**
         * Remove the trigger tokens and replace them with the function call prefix.
         *
         * Defaults to `true`.
         */
        replaceTrigger?: boolean;
        /**
         * Text to inject into the context sequence state when this trigger is matched.
         */
        inject?: LlamaText;
    } | {
        triggers: LlamaText[];
        /**
         * Open a segment of the specified type.
         *
         * If the budget for this segment has been exceeded, this trigger will be ignored,
         * so make sure to have a fallback for a response.
         */
        type: "segment";
        /**
         * Type of the segment to open.
         */
        segmentType: ChatModelSegmentType;
        /**
         * Text to inject into the context sequence state when this trigger is matched.
         */
        inject?: LlamaText;
    } | {
        triggers: LlamaText[];
        /**
         * Continue the generation as a textual output.
         */
        type: "response";
        /**
         * Text to inject into the context sequence state when this trigger is matched.
         */
        inject?: LlamaText;
    }>;
    /**
     * When no prefix triggers are matched or none are provided, after evaluating the `contextText`,
     * perform the action specified by this option.
     */
    noPrefixTrigger?: {
        /**
         * Enter into function calling mode.
         *
         * Entering this mode will put the function calling prefix into the context sequence state
         * and force it to choose a function to call.
         *
         * If no functions are available, this action will be ignored.
         */
        type: "functionCall";
        /**
         * Text to inject into the context sequence state when this action is performed.
         */
        inject: LlamaText;
    } | {
        /**
         * Open a segment of the specified type.
         *
         * If the budget for this segment has been exceeded, this action will be ignored.
         */
        type: "segment";
        /**
         * Type of the segment to open.
         */
        segmentType: ChatModelSegmentType;
        /**
         * Text to inject into the context sequence state when this action is performed.
         */
        inject: LlamaText;
    } | {
        /**
         * Continue the generation as a textual output.
         */
        type: "response";
        /**
         * Text to inject into the context sequence state when this action is performed.
         */
        inject: LlamaText;
    };
    /**
     * Trigger a rerender of the chat template when any of the provided triggers are matched.
     *
     * When a rerender is triggered, the chat template will be rendered again and the next trigger options will come into effect again,
     * so if no prefix triggers are required after the rerender, make sure not to provide any.
     *
     * When a rerender is triggered, the `action` will be performed.
     */
    rerender?: {
        triggers: LlamaText[];
        /**
         * Action to perform when the rerender is triggered.
         *
         * - **`"closeResponseItem"`**: Close the current segment or stop the textual response generation.
         */
        action?: "closeResponseItem";
    };
    /**
     * Whether to detect the function calling prefix syntax in the current text generation to dynamically enter into function calling mode.
     *
     * If it's only possible to enter function calling using a prefix trigger, then set this option to `false`.
     */
    detectFunctionCalls?: boolean;
    ignoreStartText?: never;
    functionCall?: never;
};
export type ChatWrapperGeneratedInitiallyEngagedFunctionsContextState = {
    contextText: LlamaText;
    stopGenerationTriggers: LlamaText[];
    ignoreStartText?: LlamaText[];
    functionCall?: {
        initiallyEngaged: boolean;
        disengageInitiallyEngaged: LlamaText[];
    };
    detectFunctionCalls?: never;
    prefixTriggers?: never;
    noPrefixTrigger?: never;
    rerender?: never;
};
export type ChatWrapperGenerateInitialHistoryOptions = {
    systemPrompt?: string;
};
export type ChatHistoryItem = ChatSystemMessage | ChatUserMessage | ChatModelResponse;
export type ChatSystemMessage = {
    type: "system";
    text: string | LlamaTextJSON;
};
export type ChatUserMessage = {
    type: "user";
    text: string;
};
export type ChatModelResponse = {
    type: "model";
    response: Array<string | ChatModelFunctionCall | ChatModelSegment>;
};
export type ChatModelFunctionCall = {
    type: "functionCall";
    name: string;
    description?: string;
    params: any;
    result: any;
    rawCall?: LlamaTextJSON;
    /**
     * Whether this function call starts a new function calling chunk.
     *
     * Relevant only when parallel function calling is supported.
     */
    startsNewChunk?: boolean;
};
export declare const allSegmentTypes: readonly ["thought", "comment"];
export type ChatModelSegmentType = "thought" | "comment";
export type ChatModelSegment = {
    type: "segment";
    segmentType: ChatModelSegmentType;
    text: string;
    ended: boolean;
    raw?: LlamaTextJSON;
    startTime?: string;
    endTime?: string;
};
export type ChatModelFunctions = {
    readonly [name: string]: {
        readonly description?: string;
        readonly params?: Readonly<GbnfJsonSchema> | undefined | null;
    };
};
export type ChatSessionModelFunctions = {
    readonly [name: string]: ChatSessionModelFunction<any>;
};
export type ChatSessionModelFunction<Params extends GbnfJsonSchema | undefined = GbnfJsonSchema | undefined> = {
    readonly description?: string;
    readonly params?: Params;
    readonly handler: (params: GbnfJsonSchemaToType<NoInfer<Params>>) => any;
};
export declare function isChatModelResponseFunctionCall(item: ChatModelResponse["response"][number] | undefined): item is ChatModelFunctionCall;
export declare function isChatModelResponseSegment(item: ChatModelResponse["response"][number] | undefined): item is ChatModelSegment;
export type LLamaContextualRepeatPenalty = {
    /**
     * The number of recent tokens generated by the model to apply repetition penalties to.
     * Defaults to `64`.
     */
    lastTokens?: number;
    punishTokensFilter?: (tokens: Token[]) => Token[];
    /**
     * Penalize new line tokens.
     * Enabled by default.
     */
    penalizeNewLine?: boolean;
    /**
     * The relative amount to lower the probability of the tokens in `punishTokens` by.
     * Defaults to `1.1`.
     * Set to `1` to disable.
     */
    penalty?: number;
    /**
     * If a token appears `n` times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    frequencyPenalty?: number;
    /**
     * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    presencePenalty?: number;
};
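
Below is a minimal usage sketch (not part of the declaration file above) showing how the `ChatSessionModelFunction` and `ChatSessionModelFunctions` types declared here can be used to define a typed function for a chat session. It assumes these types are re-exported from the package root ("node-llama-cpp"); the `getCurrentWeather` function, its parameter schema, and its stub result are hypothetical.

// Sketch only: a consumer-side example of the types declared above.
import type { ChatSessionModelFunction, ChatSessionModelFunctions } from "node-llama-cpp";

// Hypothetical parameter schema, written as a GbnfJsonSchema.
// The schema is enforced at the generation level, so the model's
// function-call arguments are guaranteed to match it.
const getCurrentWeatherParams = {
    type: "object",
    properties: {
        city: {type: "string"}
    }
} as const;

// Passing the schema type as the `Params` type parameter lets
// GbnfJsonSchemaToType derive the handler's `params` type,
// so `params.city` is typed as a string here.
const getCurrentWeather: ChatSessionModelFunction<typeof getCurrentWeatherParams> = {
    description: "Get the current weather for a given city",
    params: getCurrentWeatherParams,
    handler(params) {
        // Hypothetical stub result; a real handler would look the weather up
        return {city: params.city, temperatureCelsius: 21};
    }
};

// The map shape expected wherever ChatSessionModelFunctions is accepted.
const functions = {getCurrentWeather} satisfies ChatSessionModelFunctions;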