node-llama-cpp

Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output on the generation level.

import { PickOptions } from "../../utils/utilTypes.js";
import type { LlamaGrammarEvaluationState } from "../LlamaGrammarEvaluationState.js";
import type { TokenBias } from "../TokenBias.js";
import type { Token } from "../../types.js";
import type { LlamaContextSequence } from "./LlamaContext.js";

export type LlamaContextOptions = {
    /**
     * Number of sequences for the context.
     * Each sequence is a different "text generation process" that can run in parallel to other sequences in the same context.
     * Although a single context has multiple sequences, the sequences are separate from each other and do not share data with each other.
     * This is beneficial for performance, as multiple sequences can be evaluated in parallel (on the same batch).
     *
     * Each sequence increases the memory usage of the context.
     *
     * Defaults to `1`.
     */
    sequences?: number;
    /**
     * The number of tokens the model can see at once.
     * - **`"auto"`** - adapt to the current VRAM state and attempt to set the context size as high as possible up to the size
     * the model was trained on.
     * - **`number`** - set the context size to a specific number of tokens.
     * If there's not enough VRAM, an error will be thrown.
     * Use with caution.
     * - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attempt to set the context size as high as possible
     * up to the size the model was trained on, but at least `min` and at most `max`.
     *
     * Defaults to `"auto"`.
     */
    contextSize?: "auto" | number | {
        min?: number;
        max?: number;
    };
    /**
     * The number of tokens that can be processed at once by the GPU.
     *
     * Defaults to `512` or `contextSize` if `contextSize` is less than `512`.
     */
    batchSize?: number;
    /**
     * Flash attention is an optimization in the attention mechanism that makes inference faster and more efficient, and uses less memory.
     *
     * The support for flash attention is currently experimental and may not always work as expected.
     * Use with caution.
     *
     * This option will be ignored if flash attention is not supported by the model.
     *
     * Defaults to `false` (inherited from the model option `defaultContextFlashAttention`).
     *
     * Upon flash attention exiting the experimental status, the default value will become `true`
     * (the inherited value from the model option `defaultContextFlashAttention` will become `true`).
     */
    flashAttention?: boolean;
    /**
     * Number of threads to use to evaluate tokens.
     * Set to 0 to use the maximum number of threads supported by the current machine hardware.
     *
     * This value is considered a hint, and the actual number of threads used may be lower when other evaluations are running.
     * To ensure the minimum number of threads you want to use is always used,
     * set this to an object with a `min` property (see the `min` property description for more details).
     *
     * If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
     *
     * If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
     * otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of the `getLlama` method for more details).
     */
    threads?: number | {
        /**
         * The ideal number of threads to use for evaluations.
         *
         * If other evaluations are running, the actual number of threads may be lower than this value.
         *
         * If `maxThreads` from the Llama instance is set to `0`, this value will always be the actual number of threads used.
         *
         * If `maxThreads` from the Llama instance is set to `0`, defaults to the `.cpuMathCores` value from the Llama instance,
         * otherwise defaults to `maxThreads` from the Llama instance (see the `maxThreads` option of the `getLlama` method for more details).
         */
        ideal?: number;
        /**
         * Ensure evaluations always use at least this number of threads.
         *
         * Use with caution, since setting this value too high can lead to the context waiting too much time
         * to reserve this number of threads before the evaluation can start.
         */
        min?: number;
    };
    /**
     * Control the parallel sequences processing behavior.
     *
     * See {@link BatchingOptions} for more information.
     */
    batching?: BatchingOptions;
    /**
     * Load the provided LoRA adapters onto the context.
     * LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
     * without the need for extensive retraining from scratch.
     *
     * If a string is provided, it will be treated as a path to a single LoRA adapter file.
     */
    lora?: string | {
        adapters: Array<{
            filePath: string;
            /**
             * Defaults to `1`
             */
            scale?: number;
        }>;
        /**
         * Called with the LoRA adapters load percentage when the LoRA adapters are being loaded.
         * @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
         */
        onLoadProgress?(loadProgress: number): void;
    };
    /** An abort signal to abort the context creation */
    createSignal?: AbortSignal;
    /**
     * Ignore insufficient memory errors and continue with the context creation.
     * Can cause the process to crash if there's not enough VRAM for the new context.
     *
     * Defaults to `false`.
     */
    ignoreMemorySafetyChecks?: boolean;
    /**
     * On failed context creation, retry the creation with a smaller context size.
     *
     * Only works if `contextSize` is set to `"auto"`, left as default, or set to an object with `min` and/or `max` properties.
     *
     * Set `retries` to `false` to disable.
     */
    failedCreationRemedy?: false | {
        /**
         * The number of retries to attempt to create the context.
         *
         * Defaults to `6`.
         */
        retries?: number;
        /**
         * The percentage to decrease the context size by on each retry.
         * Should be a number between `0` and `1`.
         *
         * If a function is provided, it will be called with the current context size and should return the new context size.
         *
         * Defaults to `0.16`.
         */
        autoContextSizeShrink?: number | ((contextSize: number) => number);
    };
    /**
     * Track the inference performance of the context, so using `.printTimings()` will work.
     *
     * Defaults to `false`.
     */
    performanceTracking?: boolean;
};
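
// Usage sketch (illustrative, not part of this declaration file): creating a context with some
// of the `LlamaContextOptions` above, assuming the `getLlama`/`loadModel`/`createContext` API
// exported by node-llama-cpp and a local "model.gguf" file path (the path and the chosen option
// values are assumptions for illustration):
//
//     import {getLlama} from "node-llama-cpp";
//
//     const llama = await getLlama();
//     const model = await llama.loadModel({modelPath: "model.gguf"});
//     const context = await model.createContext({
//         sequences: 2,                          // two independent generation processes
//         contextSize: {min: 2048, max: 8192},   // adapt to available VRAM within these bounds
//         threads: {ideal: 8, min: 4},           // hint the thread count, reserve at least 4
//         failedCreationRemedy: {retries: 3}     // shrink the context size and retry on failure
//     });
//     const sequence = context.getSequence();
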
export type LlamaContextSequenceRepeatPenalty = {
    /** Tokens to lower the prediction probability of being the next predicted token */
    punishTokens: Token[] | (() => Token[]);
    /**
     * The maximum number of tokens that will be provided in the `punishTokens` array.
     *
     * This is used as a hint for a performance optimization for avoiding frequent memory deallocation and reallocation.
     *
     * Don't set this value too high, as it can allocate too much memory.
     *
     * Defaults to `64`.
     */
    maxPunishTokens?: number;
    /**
     * The relative amount to lower the probability of the tokens in `punishTokens` by.
     *
     * Defaults to `1.1`.
     * Set to `1` to disable.
     */
    penalty?: number;
    /**
     * If a token appears `n` times in the `punishTokens` array, lower its probability by `n * frequencyPenalty`.
     *
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    frequencyPenalty?: number;
    /**
     * Lower the probability of all the tokens in the `punishTokens` array by `presencePenalty`.
     *
     * Disabled by default (`0`).
     * Set to a value between `0` and `1` to enable.
     */
    presencePenalty?: number;
};
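
// Usage sketch (illustrative): a `LlamaContextSequenceRepeatPenalty` object can be passed as the
// `repeatPenalty` option of `SequenceEvaluateOptions` (defined below) to discourage the model
// from repeating recently generated tokens. The specific values are assumptions for illustration:
//
//     const recentTokens: Token[] = [];
//     const repeatPenalty: LlamaContextSequenceRepeatPenalty = {
//         punishTokens: () => recentTokens.slice(-64), // re-evaluated before each generated token
//         maxPunishTokens: 64,
//         penalty: 1.12,          // relative probability reduction for punished tokens
//         frequencyPenalty: 0.02, // scales with how often a token appears in `punishTokens`
//         presencePenalty: 0.02   // flat reduction for any punished token
//     };
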
export type BatchingOptions = {
    /**
     * The strategy used to dispatch items to be processed when there are items pending to be processed.
     * - **`"nextCycle"`** - dispatch the items on the next event loop cycle.
     * You can provide a custom function to define a custom dispatch schedule.
     *
     * Defaults to `"nextCycle"`.
     */
    dispatchSchedule?: "nextCycle" | CustomBatchingDispatchSchedule;
    /**
     * The strategy used to prioritize pending items to be processed.
     * - **`"maximumParallelism"`** - process as many different sequences in parallel as possible.
     * - **`"firstInFirstOut"`** - process items in the order they were added.
     * - **Custom prioritization function** - a custom function that prioritizes the items to be processed.
     * See the {@link CustomBatchingPrioritizationStrategy} type for more information.
     *
     * Defaults to `"maximumParallelism"`.
     */
    itemPrioritizationStrategy?: "maximumParallelism" | "firstInFirstOut" | CustomBatchingPrioritizationStrategy;
};
/**
 * A function that schedules the dispatch of the batch items.
 * Call the `dispatch` function to dispatch the items.
 */
export type CustomBatchingDispatchSchedule = (dispatch: () => void) => void;
/**
 * A function that prioritizes the batch items to be processed.
 * The function receives an array of `items` and the `size` of how many tokens can be processed in this batch.
 *
 * The function should return an array of prioritized items,
 * where the sum of `processAmount` of all the items is less than or equal to the given `size` that the function received,
 * and where the `item` of each prioritized item is the same reference to an original item in the `items` array.
 */
export type CustomBatchingPrioritizationStrategy = (options: {
    items: readonly BatchItem[];
    size: number;
}) => PrioritizedBatchItem[];
export type ContextShiftOptions = {
    size?: number | ((sequence: LlamaContextSequence) => number | Promise<number>);
    strategy?: "eraseBeginning" | ((options: {
        sequence: LlamaContextSequence;
        size: number;
    }) => ContextTokensDeleteRange[] | Promise<ContextTokensDeleteRange[]>);
};
export type ContextTokensDeleteRange = {
    start: number;
    end: number;
};
export type SequenceEvaluateOptions = {
    temperature?: number;
    minP?: number;
    topK?: number;
    topP?: number;
    /**
     * Used to control the randomness of the generated text.
     *
     * Change the seed to get different results.
     *
     * Defaults to the current epoch time.
     *
     * Only relevant when using `temperature`.
     */
    seed?: number;
    grammarEvaluationState?: LlamaGrammarEvaluationState | (() => LlamaGrammarEvaluationState | undefined);
    repeatPenalty?: LlamaContextSequenceRepeatPenalty;
    /**
     * Adjust the probability of tokens being generated.
     * Can be used to bias the model to generate tokens that you want it to lean towards,
     * or to avoid generating tokens that you want it to avoid.
     */
    tokenBias?: TokenBias | (() => TokenBias);
    /**
     * When a lot of tokens are queued for the next batch, more than the configured `batchSize`, the tokens for each sequence will be
     * evaluated based on the strategy chosen for the context.
     * By default, the `"maximumParallelism"` strategy is used, which will try to evaluate as many sequences in parallel as possible,
     * but at some point it'll have to choose which sequences to evaluate more tokens of, so it'll prioritize the sequences with the
     * highest evaluation priority.
     * A custom strategy can also be used to prioritize the sequences differently, but generally, the higher the evaluation priority
     * is, the more likely it is that more tokens will be evaluated for that sequence in the next queued batch.
     */
    evaluationPriority?: EvaluationPriority;
    /**
     * Override the sequence context shift options for this evaluation.
     *
     * See {@link ContextShiftOptions} for more information.
     */
    contextShift?: ContextShiftOptions;
    /**
     * Yield an EOG (End Of Generation) token (like EOS and EOT) when it's generated.
     * When `false`, the generation will stop when an EOG token is generated and the token won't be yielded.
     * Defaults to `false`.
     */
    yieldEogToken?: boolean;
};
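
// Usage sketch (illustrative): streaming tokens from a sequence with `SequenceEvaluateOptions`,
// assuming `model` and `sequence` from the earlier sketch, that `repeatPenalty` is the object
// defined above, and that `sequence.evaluate(tokens, options)` yields tokens asynchronously
// (as described in the package documentation):
//
//     const promptTokens = model.tokenize("The quick brown fox");
//     const generated: Token[] = [];
//
//     for await (const token of sequence.evaluate(promptTokens, {
//         temperature: 0.8,
//         topK: 40,
//         topP: 0.9,
//         seed: 1234,
//         repeatPenalty,
//         evaluationPriority: 5 // prefer this sequence when batches are contended
//     })) {
//         generated.push(token);
//         if (generated.length >= 128)
//             break;
//     }
//     console.log(model.detokenize(generated));
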
export type SequenceEvaluateMetadataOptions = {
    /**
     * Get the confidence (probability) of the selected token.
     *
     * Same as `probabilities.get(token)` from the output.
     *
     * If you need only this value, you can skip getting the full probabilities list to improve performance.
     *
     * This value might be slightly different when evaluated on different GPUs and configurations.
     */
    readonly confidence?: boolean;
    /**
     * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
     *
     * Only enable when needed, as it impacts the performance.
     *
     * Defaults to `false`.
     */
    readonly probabilities?: boolean;
};
export type SequenceEvaluateOutput<Options extends {
    readonly confidence?: boolean;
    readonly probabilities?: boolean;
} = {
    readonly confidence: true;
    readonly probabilities: true;
}> = PickOptions<{
    /**
     * The next token generated by the model and selected using the given options (such as temperature).
     */
    token: Token;
    /**
     * The confidence (probability) of the selected token.
     *
     * Same as `probabilities.get(token)`.
     *
     * If you need only this value, you can skip getting the full probabilities list to improve performance.
     *
     * This value might be slightly different when evaluated on different GPUs and configurations.
     */
    confidence: number;
    /**
     * The probabilities of the tokens from the vocabulary to be the next token.
     *
     * A probability is a number from `0` to `1`.
     *
     * The probabilities might be slightly different when evaluated on different GPUs and configurations.
     *
     * The map is sorted by the probability of the tokens from the highest to the lowest,
     * and is reflected in the order of the entries when iterating over the map.
     * Use `.entries().next().value` to get the top probability pair
     * ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
     */
    probabilities: Map<Token, number>;
}, Options & {
    token: true;
}>;
export type ControlledEvaluateInputItem = Token | [
    token: Token,
    options: {
        generateNext?: {
            /**
             * Get the full probabilities list of tokens from the vocabulary to be the next token, after applying the given options.
             *
             * Only enable when needed, as it impacts the performance.
             *
             * Defaults to `false`.
             */
            probabilities?: boolean;
            /**
             * Get the confidence (probability) of the selected token.
             *
             * Same as `next.probabilities.get(next.token)` from the output.
             *
             * If you need only this value, you can skip getting the full probabilities list to improve performance.
             *
             * This value might be slightly different when evaluated on different GPUs and configurations.
             */
            confidence?: boolean;
            /**
             * Generate the next token with the provided options using sampling.
             *
             * Setting this to `true` will generate probabilities for the next token and sample it.
             */
            token?: boolean;
            options?: {
                temperature?: number;
                minP?: number;
                topK?: number;
                topP?: number;
                /**
                 * Used to control the randomness of the generated text.
                 *
                 * Change the seed to get different results.
                 *
                 * Defaults to the current epoch time.
                 *
                 * Only relevant when using `temperature`.
                 */
                seed?: number;
                repeatPenalty?: LlamaContextSequenceRepeatPenalty;
                /**
                 * Adjust the probability of tokens being generated.
                 * Can be used to bias the model to generate tokens that you want it to lean towards,
                 * or to avoid generating tokens that you want it to avoid.
                 */
                tokenBias?: TokenBias | (() => TokenBias);
            };
        };
    }
];
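
// Usage sketch (illustrative): reading per-token metadata shaped by `SequenceEvaluateMetadataOptions`
// and `SequenceEvaluateOutput`. The method name `sequence.evaluateWithMetadata` and its exact
// signature are assumptions here; the types above only describe the option and output shapes:
//
//     for await (const output of sequence.evaluateWithMetadata(promptTokens, {
//         confidence: true,
//         probabilities: true
//     })) {
//         // `output` is a `SequenceEvaluateOutput<{confidence: true, probabilities: true}>`
//         const topPair = output.probabilities.entries().next().value;
//         console.log(output.token, output.confidence, topPair);
//     }
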
export type ControlledEvaluateIndexOutput = {
    next: {
        token?: Token | null;
        /**
         * The confidence (probability) of the selected token (the `token` field in this object).
         *
         * Same as `next.probabilities.get(next.token)`.
         *
         * If you need only this value, you can skip getting the full probabilities list to improve performance.
         *
         * This value might be slightly different when evaluated on different GPUs and configurations.
         */
        confidence?: number;
        /**
         * The probabilities of the tokens from the vocabulary to be the next token.
         *
         * A probability is a number from `0` to `1`.
         *
         * The probabilities might be slightly different when evaluated on different GPUs and configurations.
         *
         * The map is sorted by the probability of the tokens from the highest to the lowest,
         * and is reflected in the order of the entries when iterating over the map.
         * Use `.entries().next().value` to get the top probability pair
         * ([learn more](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Map/entries)).
         */
        probabilities?: Map<Token, number>;
    };
};
/**
 * 1 - low
 *
 * 5 - high
 */
export type EvaluationPriority = 1 | 2 | 3 | 4 | 5;
export type BatchItem = {
    readonly tokens: readonly Token[];
    readonly logits: readonly (true | undefined)[];
    readonly evaluationPriority: EvaluationPriority;
};
export type PrioritizedBatchItem = {
    item: BatchItem;
    processAmount: number;
};
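
// Usage sketch (illustrative): a custom `CustomBatchingPrioritizationStrategy` that processes
// items in the order they were received, capping the total `processAmount` at the batch `size`
// as the contract above requires. It could be passed as `batching: {itemPrioritizationStrategy}`
// in `LlamaContextOptions`:
//
//     const itemPrioritizationStrategy: CustomBatchingPrioritizationStrategy = ({items, size}) => {
//         const prioritized: PrioritizedBatchItem[] = [];
//         let remaining = size;
//
//         for (const item of items) {
//             if (remaining <= 0)
//                 break;
//
//             // give each item as much of the remaining batch space as it can use
//             const processAmount = Math.min(item.tokens.length, remaining);
//             prioritized.push({item, processAmount});
//             remaining -= processAmount;
//         }
//
//         return prioritized;
//     };
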