node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.

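For orientation on the tagline above, schema-enforced generation looks roughly like the sketch below. It assumes node-llama-cpp's documented v3 API (getLlama, createGrammarForJsonSchema, LlamaChatSession); the model path and schema are placeholders:

import {getLlama, LlamaChatSession} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: "./models/model.gguf" // hypothetical path
});
const context = await model.createContext();
const session = new LlamaChatSession({contextSequence: context.getSequence()});

// The grammar constrains sampling itself, so the output always matches the schema.
const grammar = await llama.createGrammarForJsonSchema({
    type: "object",
    properties: {
        answer: {type: "string"}
    }
});
const response = await session.prompt("Give me a short answer.", {grammar});
const parsed = grammar.parse(response); // schema-conforming object

The file below is the package's compiled helper for resolving a context size against available memory.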
import {minAllowedContextSizeInCalculations} from "../../../config.js";
import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js";
import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js";
import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js";

// Contexts of up to this many tokens may count free swap as usable memory.
const defaultMaxContextSizeSwapUse = 2048;

// Resolves a `contextSize` option (a number, "auto", or a {min?, max?} range)
// into a concrete context size that fits in the currently available memory.
export async function resolveContextContextSizeOption({
    contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize,
    flashAttention, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false,
    isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse
}) {
    if (contextSize == null)
        contextSize = "auto";

    if (typeof contextSize === "number") {
        const resolvedContextSize = Math.max(1, Math.floor(contextSize));

        if (ignoreMemorySafetyChecks)
            return resolvedContextSize;

        const [vramState, ramState, swapState] = await Promise.all([
            getVramState(),
            getRamState(),
            getSwapState()
        ]);
        const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({
            contextSize: resolvedContextSize,
            batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}),
            modelGpuLayers,
            sequences,
            flashAttention,
            isEmbeddingContext
        });

        // An explicit size is validated as-is: it either fits or an error is thrown.
        // On unified-memory hardware, VRAM allocations also consume RAM, so that
        // overlap is subtracted from the free RAM before the comparison.
        if (contextResourceRequirements.gpuVram > vramState.free)
            throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
        else if (contextResourceRequirements.cpuRam > (ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState)))
            throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`);

        return resolvedContextSize;
    } else if (contextSize === "auto" || typeof contextSize === "object") {
        const [vramState, ramState, swapState] = await Promise.all([
            getVramState(),
            getRamState(),
            getSwapState()
        ]);

        // "auto" searches between the minimum allowed size and the model's trained
        // context size; a {min?, max?} range is clamped to those same limits.
        const maxContextSize = contextSize === "auto"
            ? getDefaultModelContextSize({trainContextSize: modelTrainContextSize})
            : Math.min(
                contextSize.max ?? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}),
                getDefaultModelContextSize({trainContextSize: modelTrainContextSize})
            );
        const minContextSize = contextSize === "auto"
            ? minAllowedContextSizeInCalculations
            : Math.max(contextSize.min ?? minAllowedContextSizeInCalculations, minAllowedContextSizeInCalculations);

        // Coarse binary search for the largest context size that fits in memory:
        // start at the maximum and step downward, halving the step (and flipping
        // its direction) whenever the fits/doesn't-fit boundary is crossed.
        let highestCompatibleContextSize = null;
        let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4));
        for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) {
            const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({
                contextSize: testContextSize,
                batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}),
                modelGpuLayers,
                sequences,
                flashAttention,
                isEmbeddingContext
            });

            // Free swap only counts as usable memory for small contexts
            // (up to `maxContextSizeSwapUse` tokens).
            if (contextResourceRequirements.gpuVram <= vramState.free &&
                contextResourceRequirements.cpuRam <= (
                    ramState.free -
                    getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) +
                    (testContextSize <= maxContextSizeSwapUse ? swapState.free : 0)
                )
            ) {
                if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) {
                    highestCompatibleContextSize = testContextSize;

                    if (step === -1)
                        break;
                    else if (step < 0)
                        step = Math.max(1, Math.floor(-step / 2));
                }
            } else if (step > 0)
                step = -Math.max(1, Math.floor(step / 2));

            if (testContextSize === minContextSize && step === -1)
                break;

            testContextSize += step;
            if (testContextSize < minContextSize) {
                testContextSize = minContextSize;
                step = Math.max(1, Math.floor(Math.abs(step) / 2));
            } else if (testContextSize > maxContextSize) {
                testContextSize = maxContextSize;
                step = -Math.max(1, Math.floor(Math.abs(step) / 2));
            }
        }

        if (highestCompatibleContextSize != null)
            return highestCompatibleContextSize;

        if (ignoreMemorySafetyChecks)
            return minContextSize;

        // Nothing fit. Re-estimate the smallest candidate to report which
        // resource is the bottleneck.
        const minContextSizeResourceRequirements = modelFileInsights.estimateContextResourceRequirements({
            contextSize: minContextSize,
            batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: minContextSize, sequences}),
            modelGpuLayers,
            sequences,
            flashAttention,
            isEmbeddingContext
        });
        const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState);

        if (minContextSizeResourceRequirements.gpuVram > vramState.free &&
            minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage
        )
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}`);
        else if (minContextSizeResourceRequirements.gpuVram > vramState.free)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`);
        else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`);
        else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage)
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM`);
        else
            throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources`);
    }

    throw new Error(`Invalid context size: "${contextSize}"`);
}
//# sourceMappingURL=resolveContextContextSizeOption.js.map
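The three `contextSize` forms handled by this resolver correspond to what `model.createContext()` accepts at the package's public API. A minimal usage sketch, assuming the documented getLlama/loadModel API and a hypothetical model path (in practice you would create only one of these contexts at a time):

import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: "./models/model.gguf" // hypothetical path
});

// "auto": search downward from the model's trained context size for the
// largest size that fits in free VRAM/RAM.
const autoContext = await model.createContext({contextSize: "auto"});

// Explicit number: use exactly this size, or throw InsufficientMemoryError.
const fixedContext = await model.createContext({contextSize: 4096});

// {min, max} range: search within these bounds, clamped to the trained size.
const rangedContext = await model.createContext({contextSize: {min: 2048, max: 8192}});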