node-llama-cpp

Run AI models locally on your machine with node.js bindings for llama.cpp. Enforce a JSON schema on the model output on the generation level.

import { BuildGpu } from "../../bindings/types.js";
import { LlamaModelOptions } from "../../evaluator/LlamaModel/LlamaModel.js";
import { LlamaContextOptions } from "../../evaluator/LlamaContext/types.js";
import type { GgufInsights } from "./GgufInsights.js";
export declare const defaultTrainContextSizeForEstimationPurposes = 4096;
export declare class GgufInsightsConfigurationResolver {
    private constructor();
    get ggufInsights(): GgufInsights;
    /**
     * Resolve the best configuration for loading a model and creating a context using the current hardware.
     *
     * Specifying a `targetGpuLayers` and/or `targetContextSize` will ensure the resolved configuration matches those values,
     * but note it can lower the compatibility score if the hardware doesn't support it.
     *
     * Overriding hardware values is possible by configuring `hardwareOverrides`.
     * @param options
     * @param hardwareOverrides
     */
    resolveAndScoreConfig({ targetGpuLayers, targetContextSize, embeddingContext, flashAttention, useMmap }?: {
        targetGpuLayers?: number | "max";
        targetContextSize?: number;
        embeddingContext?: boolean;
        flashAttention?: boolean;
        useMmap?: boolean;
    }, { getVramState, getRamState, getSwapState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading }?: {
        getVramState?(): Promise<{
            total: number;
            free: number;
            unifiedSize: number;
        }>;
        getRamState?(): Promise<{
            total: number;
            free: number;
        }>;
        getSwapState?(): Promise<{
            total: number;
            free: number;
        }>;
        llamaVramPaddingSize?: number;
        llamaGpu?: BuildGpu;
        llamaSupportsGpuOffloading?: boolean;
    }): Promise<{
        /**
         * A number between `0` (inclusive) and `1` (inclusive) representing the compatibility score.
         */
        compatibilityScore: number;
        /**
         * A number starting at `0` with no upper limit representing the bonus score.
         * For each multiplier of the specified `contextSize` that the resolved context size is larger by, 1 bonus point is given.
         */
        bonusScore: number;
        /**
         * The total score, which is the sum of the compatibility and bonus scores.
         */
        totalScore: number;
        /**
         * The resolved values used to calculate the scores.
         */
        resolvedValues: {
            gpuLayers: number;
            contextSize: number;
            modelRamUsage: number;
            contextRamUsage: number;
            totalRamUsage: number;
            modelVramUsage: number;
            contextVramUsage: number;
            totalVramUsage: number;
        };
    }>;
    /**
     * Score the compatibility of the model configuration with the current GPU and VRAM state.
     * Assumes a model is loaded with the default `"auto"` configurations.
     * Scored based on the following criteria:
     * - The number of GPU layers that can be offloaded to the GPU (only if there's a GPU; if there's no GPU, then by how small the model is)
     * - Whether all layers can be offloaded to the GPU (gives additional points)
     * - Whether the resolved context size is at least as large as the specified `contextSize`
     *
     * If the resolved context size is larger than the specified context size, for each multiplier of the specified `contextSize`
     * that the resolved context size is larger by, 1 bonus point is given in the `bonusScore`.
     *
     * `maximumFittedContextSizeMultiplier` is used to improve the proportionality of the bonus score between models.
     * Set this to any value higher than `<max compared model context size> / contextSize`.
     * Defaults to `100`.
     *
     * `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models.
     * Set this to any value higher than `<max compared model resource usage> / <total available resources>`.
     * Defaults to `100`.
     *
     * `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead).
     */
    scoreModelConfigurationCompatibility({ contextSize, embeddingContext, flashAttention, maximumFittedContextSizeMultiplier, maximumUnfitConfigurationResourceMultiplier, forceStrictContextSize, forceGpuLayers, useMmap }?: {
        contextSize?: number;
        embeddingContext?: boolean;
        flashAttention?: boolean;
        maximumFittedContextSizeMultiplier?: number;
        maximumUnfitConfigurationResourceMultiplier?: number;
        /**
         * Do not resolve a context size larger than the specified `contextSize`.
         *
         * Defaults to `false`.
         */
        forceStrictContextSize?: boolean;
        forceGpuLayers?: number | "max";
        useMmap?: boolean;
    }, { getVramState, getRamState, getSwapState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading }?: {
        getVramState?(): Promise<{
            total: number;
            free: number;
            unifiedSize: number;
        }>;
        getRamState?(): Promise<{
            total: number;
            free: number;
        }>;
        getSwapState?(): Promise<{
            total: number;
            free: number;
        }>;
        llamaVramPaddingSize?: number;
        llamaGpu?: BuildGpu;
        llamaSupportsGpuOffloading?: boolean;
    }): Promise<{
        /**
         * A number between `0` (inclusive) and `1` (inclusive) representing the compatibility score.
         */
        compatibilityScore: number;
        /**
         * A number starting at `0` with no upper limit representing the bonus score.
         * For each multiplier of the specified `contextSize` that the resolved context size is larger by, 1 bonus point is given.
         */
        bonusScore: number;
        /**
         * The total score, which is the sum of the compatibility and bonus scores.
         */
        totalScore: number;
        /**
         * The resolved values used to calculate the scores.
         */
        resolvedValues: {
            gpuLayers: number;
            contextSize: number;
            modelRamUsage: number;
            contextRamUsage: number;
            totalRamUsage: number;
            modelVramUsage: number;
            contextVramUsage: number;
            totalVramUsage: number;
        };
    }>;
    resolveModelGpuLayers(gpuLayers?: LlamaModelOptions["gpuLayers"], { ignoreMemorySafetyChecks, getVramState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, useMmap }?: {
        ignoreMemorySafetyChecks?: boolean;
        getVramState?(): Promise<{
            total: number;
            free: number;
        }>;
        llamaVramPaddingSize?: number;
        llamaGpu?: BuildGpu;
        llamaSupportsGpuOffloading?: boolean;
        defaultContextFlashAttention?: boolean;
        useMmap?: boolean;
    }): Promise<number>;
    /**
     * Resolve a context size option for the given options and constraints.
     *
     * If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown.
     */
    resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], { modelGpuLayers, batchSize, modelTrainContextSize, flashAttention, getVramState, getRamState, getSwapState, llamaGpu, ignoreMemorySafetyChecks, isEmbeddingContext, sequences }: {
        modelGpuLayers: number;
        modelTrainContextSize: number;
        flashAttention?: boolean;
        batchSize?: LlamaContextOptions["batchSize"];
        sequences?: number;
        getVramState?(): Promise<{
            total: number;
            free: number;
            unifiedSize: number;
        }>;
        getRamState?(): Promise<{
            total: number;
            free: number;
        }>;
        getSwapState?(): Promise<{
            total: number;
            free: number;
        }>;
        llamaGpu?: BuildGpu;
        ignoreMemorySafetyChecks?: boolean;
        isEmbeddingContext?: boolean;
    }): Promise<number>;
}
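
Usage sketch for `resolveAndScoreConfig()`: a minimal example of how an instance of this resolver might be obtained and called. It assumes `getLlama`, `readGgufFileInfo`, and `GgufInsights` (with a static `GgufInsights.from(...)` helper and a `configurationResolver` getter) are exported by your installed version of node-llama-cpp; none of that is declared in this file, so verify against your version's API reference. The model path is hypothetical.

import { getLlama, readGgufFileInfo, GgufInsights } from "node-llama-cpp";

// Hypothetical local model file
const modelPath = "/models/my-model.Q4_K_M.gguf";

const llama = await getLlama();
const ggufFileInfo = await readGgufFileInfo(modelPath);
const insights = await GgufInsights.from(ggufFileInfo, llama); // assumed public API

// Resolve the best gpuLayers/contextSize combination for the current hardware
const { compatibilityScore, totalScore, resolvedValues } =
    await insights.configurationResolver.resolveAndScoreConfig({
        targetContextSize: 8192,
        flashAttention: true
    });

console.info(`Compatibility: ${(compatibilityScore * 100).toFixed(1)}% (total score: ${totalScore})`);
console.info(`gpuLayers: ${resolvedValues.gpuLayers}, contextSize: ${resolvedValues.contextSize}`);
console.info(`Estimated VRAM usage: ${resolvedValues.totalVramUsage} bytes`);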
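
The scoring produced by `scoreModelConfigurationCompatibility()` is designed for comparing models against each other on the same machine. Below is a sketch of ranking several local GGUF files by `totalScore`, under the same exported-API assumptions as the previous example; the file paths are hypothetical.

import { getLlama, readGgufFileInfo, GgufInsights } from "node-llama-cpp";

const modelPaths = ["/models/a.gguf", "/models/b.gguf", "/models/c.gguf"]; // hypothetical

const llama = await getLlama();
const ranked = await Promise.all(modelPaths.map(async (modelPath) => {
    const insights = await GgufInsights.from(await readGgufFileInfo(modelPath), llama);
    const { totalScore } = await insights.configurationResolver.scoreModelConfigurationCompatibility({
        contextSize: 8192,

        // keep bonus scores proportional across the compared models (see the doc comment above)
        maximumFittedContextSizeMultiplier: 100
    });
    return { modelPath, totalScore };
}));

// Highest total score first; the winner is the best fit for this machine
ranked.sort((a, b) => b.totalScore - a.totalScore);
console.info("Best fit:", ranked[0].modelPath);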
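
Because every probe in the second parameter (`getVramState`, `getRamState`, `getSwapState`, and the `llama*` values) can be overridden, the resolver can also estimate a configuration for hardware other than the machine it runs on. A sketch, continuing from the first example; the byte figures are hypothetical, and the `"cuda"` value for `llamaGpu` is an assumption (check the `BuildGpu` type in your version for the accepted values).

const GiB = 1024 ** 3;

// Score the model as if it were running on a machine with an 8 GiB discrete GPU
const simulated = await insights.configurationResolver.resolveAndScoreConfig({
    targetGpuLayers: "max"
}, {
    getVramState: async () => ({ total: 8 * GiB, free: 7 * GiB, unifiedSize: 0 }),
    getRamState: async () => ({ total: 32 * GiB, free: 24 * GiB }),
    getSwapState: async () => ({ total: 0, free: 0 }),
    llamaGpu: "cuda", // assumed BuildGpu value
    llamaSupportsGpuOffloading: true
});

console.info(`Simulated compatibility: ${(simulated.compatibilityScore * 100).toFixed(1)}%`);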