node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
JavaScript
import { InsufficientMemoryError } from "../../../utils/InsufficientMemoryError.js";
import { findBestOption } from "../../../utils/findBestOption.js";
import { getDefaultContextBatchSize, getDefaultModelContextSize } from "../../../evaluator/LlamaContext/LlamaContext.js";
import { minAllowedContextSizeInCalculations } from "../../../config.js";
import { scoreLevels } from "./scoreLevels.js";
// When fitting a specific context size, reserve an extra half of the llama
// VRAM padding on top of the regular padding as a safety margin
const fitContextExtraMemoryPaddingPercentage = 0.5;
// Resolve the `gpuLayers` model option ("auto", "max", a number, or a
// constraints object) into a concrete number of layers to offload to the GPU.
// Throws an `InsufficientMemoryError` when the requested configuration cannot
// fit into the available VRAM (unless memory safety checks are ignored)
export async function resolveModelGpuLayersOption(gpuLayers, { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, useMmap }) {
if (gpuLayers == null)
gpuLayers = "auto";
if (!llamaSupportsGpuOffloading)
return 0;
if (gpuLayers === "max" || typeof gpuLayers === "number") {
const resolvedGpuLayers = typeof gpuLayers === "number"
? Math.max(0, Math.min(ggufInsights.totalLayers, gpuLayers))
: ggufInsights.totalLayers;
if (ignoreMemorySafetyChecks)
return resolvedGpuLayers;
const vramState = await getVramState();
const maxLayersRequirements = getVramRequiredForGpuLayers({
gpuLayers: resolvedGpuLayers,
ggufInsights,
currentVram: vramState.free,
defaultContextFlashAttention,
useMmap
});
if (maxLayersRequirements == null)
throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings");
return resolvedGpuLayers;
}
else if (gpuLayers === "auto" || typeof gpuLayers === "object") {
if (llamaGpu === false)
return 0;
const vramState = await getVramState();
if (vramState.total === 0)
return 0;
let freeVram = vramState.free;
if (typeof gpuLayers === "object" && gpuLayers.fitContext?.contextSize != null) {
freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage;
if (freeVram < 0)
freeVram = 0;
}
const bestGpuLayersOption = getBestGpuLayersForFreeVram({
ggufInsights,
freeVram,
fitContext: typeof gpuLayers === "object"
? gpuLayers.fitContext
: undefined,
minGpuLayers: typeof gpuLayers === "object"
? gpuLayers.min
: undefined,
maxGpuLayers: typeof gpuLayers === "object"
? gpuLayers.max
: undefined,
defaultContextFlashAttention,
useMmap
});
const hasGpuLayersRequirements = typeof gpuLayers === "object" &&
(gpuLayers.min != null || gpuLayers.max != null || gpuLayers.fitContext?.contextSize != null);
if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements)
throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings");
return bestGpuLayersOption ?? 0;
}
throw new Error(`Invalid gpuLayers value: ${gpuLayers}`);
}
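// Usage sketch: this helper is internal, but the `gpuLayers` values it accepts
// mirror the public model-loading option. Assuming the usual node-llama-cpp
// entry points, resolution is triggered when loading a model, roughly like:
//
//     import { getLlama } from "node-llama-cpp";
//
//     const llama = await getLlama();
//     const model = await llama.loadModel({
//         modelPath: "path/to/model.gguf", // hypothetical path
//         gpuLayers: {min: 16, fitContext: {contextSize: 8192}} // or "auto", "max", 24
//     });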
// Search from the highest allowed layer count downward and pick the layer
// count whose layers-plus-context combination scores best for the free VRAM
function getBestGpuLayersForFreeVram({ ggufInsights, freeVram, fitContext, minGpuLayers, maxGpuLayers, defaultContextFlashAttention, useMmap }) {
return findBestOption({
*generator() {
const minLayers = Math.floor(Math.max(0, minGpuLayers ?? 0));
const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers));
for (let layers = maxLayers; layers >= minLayers; layers--) {
yield {
gpuLayers: layers
};
}
},
score(option) {
const layersRequirements = getVramRequiredForGpuLayers({
gpuLayers: option.gpuLayers,
ggufInsights,
currentVram: freeVram,
fitContext,
defaultContextFlashAttention,
useMmap
});
if (layersRequirements == null)
return null;
return scoreGpuLayersAndContextCombination({ gpuLayers: option.gpuLayers, contextSize: layersRequirements.contextSize }, {
totalGpuLayers: ggufInsights.totalLayers,
trainContextSize: getDefaultModelContextSize({ trainContextSize: ggufInsights.trainContextSize })
});
}
})?.gpuLayers ?? null;
}
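// A minimal sketch of the `findBestOption` contract this code appears to rely
// on (the real helper lives in ../../../utils/findBestOption.js; the name
// below is hypothetical): walk the generator, skip options whose `score` is
// `null`, and return the highest-scoring option, or `null` when none scored.
//
//     function findBestOptionSketch({ generator, score }) {
//         let best = null, bestScore = null;
//         for (const option of generator()) {
//             const optionScore = score(option);
//             if (optionScore == null)
//                 continue; // option doesn't fit at all
//             if (bestScore == null || optionScore > bestScore) {
//                 best = option;
//                 bestScore = optionScore;
//             }
//         }
//         return best;
//     }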
// Score a candidate combination of offloaded layers and achievable context
// size; larger contexts only earn full points when enough of the model's
// layers are offloaded, biasing "auto" toward balanced tradeoffs
function scoreGpuLayersAndContextCombination({ gpuLayers, contextSize }, { totalGpuLayers, trainContextSize }) {
function scoreGpuLayers() {
return scoreLevels(gpuLayers, [{
start: 0,
points: 4
}, {
start: 1,
points: 26
}, {
start: totalGpuLayers,
points: 14,
end: totalGpuLayers
}]);
}
function scoreContextSize() {
const gpuLayersPercentage = gpuLayers / totalGpuLayers;
return scoreLevels(contextSize, [{
start: 0,
points: 2
}, {
start: 1024,
points: 4
}, {
start: 2048,
points: gpuLayersPercentage < 0.1 ? 1 : 8
}, {
start: 4096,
points: gpuLayersPercentage < 0.3 ? 4 : 16
}, {
start: 8192,
points: gpuLayersPercentage < 0.6 ? 1 : 8,
end: Math.max(trainContextSize, 16384)
}]);
}
return scoreGpuLayers() + scoreContextSize();
}
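// Example of the context bias, read directly off the levels above: with
// totalGpuLayers = 32, offloading 16 layers (50%) makes the 4096 band worth
// 16 points (0.5 >= 0.3), while offloading only 8 layers (25%) drops it to 4,
// so "auto" prefers offloading more layers before chasing a bigger context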
// Estimate the VRAM needed for the given layer count, either for an explicitly
// requested context size (`fitContext`) or for the largest context that fits.
// Returns `null` when the configuration exceeds the available VRAM
function getVramRequiredForGpuLayers({ gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, useMmap }) {
const modelVram = ggufInsights.estimateModelResourceRequirements({
gpuLayers,
useMmap
}).gpuVram;
if (modelVram > currentVram)
return null;
if (fitContext != null && fitContext.contextSize != null) {
const contextVram = ggufInsights.estimateContextResourceRequirements({
contextSize: fitContext.contextSize,
batchSize: getDefaultContextBatchSize({ contextSize: fitContext.contextSize, sequences: 1 }),
modelGpuLayers: gpuLayers,
sequences: 1,
isEmbeddingContext: fitContext.embeddingContext ?? false,
flashAttention: defaultContextFlashAttention
}).gpuVram;
const totalVram = modelVram + contextVram;
if (totalVram > currentVram)
return null;
return {
contextSize: fitContext.contextSize,
contextVram,
totalVram
};
}
const maxContext = findMaxPossibleContextSizeForVram({
gpuLayers,
ggufInsights,
vram: currentVram - modelVram,
isEmbeddingContext: fitContext?.embeddingContext ?? false,
flashAttention: defaultContextFlashAttention
});
if (maxContext == null || modelVram + maxContext.vram > currentVram)
return null;
return {
contextSize: maxContext.contextSize,
contextVram: maxContext.vram,
totalVram: modelVram + maxContext.vram
};
}
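// Illustrative call (hypothetical numbers; `ggufInsights` would come from a
// real model file):
//
//     getVramRequiredForGpuLayers({
//         gpuLayers: 20,
//         ggufInsights,
//         currentVram: 8 * Math.pow(1024, 3), // 8GB of free VRAM
//         fitContext: {contextSize: 4096},
//         useMmap: true
//     });
//     // -> {contextSize: 4096, contextVram, totalVram} when it fits, else null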
// Scan downward from the model's train context size to the minimum allowed
// size and return the largest context whose estimated VRAM usage fits
function findMaxPossibleContextSizeForVram({ gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention }) {
const maxContextSize = getDefaultModelContextSize({ trainContextSize: ggufInsights.trainContextSize });
for (let contextSize = maxContextSize; contextSize >= minAllowedContextSizeInCalculations; contextSize--) {
const contextVram = ggufInsights.estimateContextResourceRequirements({
contextSize,
batchSize: getDefaultContextBatchSize({ contextSize, sequences: 1 }),
modelGpuLayers: gpuLayers,
sequences: 1,
isEmbeddingContext,
flashAttention
}).gpuVram;
if (contextVram <= vram)
return {
contextSize,
vram: contextVram
};
}
return null;
}
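// Illustrative call: find the largest context that fits into the VRAM left
// over after the model weights (hypothetical values):
//
//     const maxContext = findMaxPossibleContextSizeForVram({
//         gpuLayers: 20,
//         ggufInsights,
//         vram: freeVram - modelVram,
//         isEmbeddingContext: false,
//         flashAttention: false
//     });
//     // -> {contextSize, vram} for the largest size that fits, or null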
//# sourceMappingURL=resolveModelGpuLayersOption.js.map