node-llama-cpp
Run AI models locally on your machine with node.js bindings for llama.cpp. Enforce a JSON schema on the model output on the generation level
JavaScript
import { getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
import { InsufficientMemoryError } from "../../utils/InsufficientMemoryError.js";
import { resolveModelGpuLayersOption } from "./utils/resolveModelGpuLayersOption.js";
import { resolveContextContextSizeOption } from "./utils/resolveContextContextSizeOption.js";
import { scoreLevels } from "./utils/scoreLevels.js";
import { getRamUsageFromUnifiedVram } from "./utils/getRamUsageFromUnifiedVram.js";
export const defaultTrainContextSizeForEstimationPurposes = 4096;
const defaultContextSizeForUnfitContextSizeConfiguration = 2048;
export class GgufInsightsConfigurationResolver {
/** @internal */ _ggufInsights;
constructor(ggufInsights) {
this._ggufInsights = ggufInsights;
}
get ggufInsights() {
return this._ggufInsights;
}
/**
* Resolve the best configuration for loading a model and creating a context using the current hardware.
*
* Specifying a `targetGpuLayers` and/or `targetContextSize` will ensure the resolved configuration matches those values,
* but note that this can lower the compatibility score if the hardware doesn't support those values.
*
* Overriding hardware values is possible by configuring `hardwareOverrides`.
* An illustrative usage sketch appears in the comment after this method.
* @param options
* @param hardwareOverrides
*/
async resolveAndScoreConfig({
    targetGpuLayers,
    targetContextSize,
    embeddingContext = false,
    flashAttention = false,
    useMmap = this._ggufInsights._llama.supportsMmap
} = {}, {
    getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
    getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
    getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()),
    llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize,
    llamaGpu = this._ggufInsights._llama.gpu,
    llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading
} = {}) {
const compatibilityScore = await this.scoreModelConfigurationCompatibility({
flashAttention,
contextSize: targetContextSize,
embeddingContext,
forceGpuLayers: targetGpuLayers,
forceStrictContextSize: targetContextSize != null,
useMmap
}, {
getVramState,
getRamState,
getSwapState,
llamaVramPaddingSize,
llamaGpu,
llamaSupportsGpuOffloading
});
return compatibilityScore;
}
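// Illustrative usage sketch, assuming `resolver` is an instance of this class
// (e.g. obtained from a `GgufInsights` instance):
//
//   const {totalScore, resolvedValues} = await resolver.resolveAndScoreConfig({
//       targetContextSize: 8192,
//       flashAttention: true
//   });
//   console.log(totalScore, resolvedValues.gpuLayers, resolvedValues.contextSize);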
/**
* Score the compatibility of the model configuration with the current GPU and VRAM state.
* Assumes a model is loaded with the default `"auto"` configurations.
* Scored based on the following criteria:
* - The number of GPU layers that can be offloaded to the GPU (only when a GPU is available; otherwise, scored by how small the model is)
* - Whether all layers can be offloaded to the GPU (gives additional points)
* - Whether the resolved context size is at least as large as the specified `contextSize`
*
* If the resolved context size is larger than the specified context size, 1 bonus point is given in the `bonusScore`
* for each multiple of the specified `contextSize` by which the resolved context size is larger.
*
* `maximumFittedContextSizeMultiplier` is used to improve the proportionality of the bonus score between models.
* Set this to any value higher than `<max compared model context size> / contextSize`.
* Defaults to `100`.
*
* `maximumUnfitConfigurationResourceMultiplier` is used to improve the proportionality of the bonus score between unfit models.
* Set this to any value higher than `<max compared model resource usage> / <total available resources>`.
* Defaults to `100`.
*
* `contextSize` defaults to `4096` (if the model train context size is lower than this, the model train context size is used instead).
*/
async scoreModelConfigurationCompatibility({
    contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096),
    embeddingContext = false,
    flashAttention = false,
    maximumFittedContextSizeMultiplier = 100,
    maximumUnfitConfigurationResourceMultiplier = 100,
    forceStrictContextSize = false,
    forceGpuLayers,
    useMmap = this._ggufInsights._llama.supportsMmap
} = {}, {
    getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
    getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
    getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()),
    llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize,
    llamaGpu = this._ggufInsights._llama.gpu,
    llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading
} = {}) {
const [vramState, ramState, swapState] = await Promise.all([
getVramState(),
getRamState(),
getSwapState()
]);
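// Fallback GPU layer count, used as-is if resolving a memory-fitting value below throws `InsufficientMemoryError`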
let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max")
? this.ggufInsights.totalLayers
: forceGpuLayers;
let gpuLayersFitMemory = false;
try {
resolvedGpuLayers = await this.resolveModelGpuLayers(forceGpuLayers != null
? forceGpuLayers
: embeddingContext
? {
fitContext: {
embeddingContext: true,
contextSize: forceStrictContextSize
? contextSize
: undefined
}
}
: forceStrictContextSize
? { fitContext: { contextSize } }
: "auto", {
getVramState: async () => vramState,
llamaVramPaddingSize,
llamaGpu,
llamaSupportsGpuOffloading,
defaultContextFlashAttention: flashAttention,
ignoreMemorySafetyChecks: forceGpuLayers != null,
useMmap
});
gpuLayersFitMemory = true;
}
catch (err) {
if (!(err instanceof InsufficientMemoryError))
throw err;
}
const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false;
const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({
gpuLayers: resolvedGpuLayers,
useMmap
});
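// Fallback context size, used as-is if resolving a memory-fitting context size below throws `InsufficientMemoryError`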
let resolvedContextSize = Math.min(this.ggufInsights.trainContextSize ?? defaultContextSizeForUnfitContextSizeConfiguration, defaultContextSizeForUnfitContextSizeConfiguration);
let contextFitsMemory = false;
try {
resolvedContextSize = await this.resolveContextContextSize("auto", {
getVramState: async () => ({
total: vramState.total,
free: Math.max(0, vramState.free - estimatedModelResourceUsage.gpuVram),
unifiedSize: vramState.unifiedSize
}),
getRamState: async () => ({
total: ramState.total,
free: Math.max(0, ramState.free - estimatedModelResourceUsage.cpuRam -
    getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState))
}),
getSwapState: async () => ({
total: swapState.total,
free: Math.max(0, swapState.free - Math.max(0, estimatedModelResourceUsage.cpuRam -
    getRamUsageFromUnifiedVram(estimatedModelResourceUsage.gpuVram, vramState) -
    ramState.free))
}),
llamaGpu,
isEmbeddingContext: embeddingContext,
modelGpuLayers: resolvedGpuLayers,
modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes,
ignoreMemorySafetyChecks: forceStrictContextSize,
flashAttention
});
contextFitsMemory = true;
}
catch (err) {
if (!(err instanceof InsufficientMemoryError))
throw err;
}
const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({
contextSize: resolvedContextSize,
isEmbeddingContext: embeddingContext,
modelGpuLayers: resolvedGpuLayers,
flashAttention
});
const rankPoints = {
gpuLayers: 60,
allLayersAreOffloaded: 10,
contextSize: 30,
ramUsageFitsInRam: 10,
cpuOnlySmallModelSize: 70, // also defined inside `scoreModelSizeForCpuOnlyUsage`
bonusContextSize: 10
};
const gpuLayersPoints = rankPoints.gpuLayers * Math.min(1, resolvedGpuLayers / this._ggufInsights.totalLayers);
const allLayersAreOffloadedPoints = rankPoints.allLayersAreOffloaded * (resolvedGpuLayers === this._ggufInsights.totalLayers ? 1 : 0);
const contextSizePoints = contextFitsMemory
? rankPoints.contextSize * Math.min(1, resolvedContextSize / contextSize)
: 0;
const ramUsageFitsInRamPoints = rankPoints.ramUsageFitsInRam * (estimatedModelResourceUsage.cpuRam <= ramState.free
? 1
: estimatedModelResourceUsage.cpuRam <= ramState.free + swapState.free
? 0.8
: estimatedModelResourceUsage.cpuRam <= ramState.total
? 0.5
: (0.5 - Math.min(0.5, 0.5 * ((estimatedModelResourceUsage.cpuRam - ramState.total) / ramState.total))));
const bonusContextSizePoints = contextFitsMemory
? (10 * Math.min(1, (Math.max(0, resolvedContextSize - contextSize) / contextSize) / maximumFittedContextSizeMultiplier))
: 0;
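// Normalize the achieved points to a 0-1 compatibility score;
// on CPU-only setups, the GPU offload points are replaced with a model-size score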
let compatibilityScore = canUseGpu
? ((gpuLayersPoints + allLayersAreOffloadedPoints + contextSizePoints + ramUsageFitsInRamPoints) /
(rankPoints.gpuLayers + rankPoints.allLayersAreOffloaded + rankPoints.contextSize + rankPoints.ramUsageFitsInRam))
: ((contextSizePoints + ramUsageFitsInRamPoints + scoreModelSizeForCpuOnlyUsage(this._ggufInsights.modelSize)) /
(rankPoints.contextSize + rankPoints.ramUsageFitsInRam + rankPoints.cpuOnlySmallModelSize));
let bonusScore = bonusContextSizePoints / rankPoints.bonusContextSize;
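// When the configuration cannot fit the available resources, zero the compatibility score and instead
// derive a small bonus score from how far the requirements exceed what's available,
// so unfit configurations can still be ranked against each other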
if (!gpuLayersFitMemory || !contextFitsMemory ||
estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram > vramState.total ||
estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam > ramState.total + swapState.total) {
const totalVramRequirement = estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram;
const totalRamRequirement = estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam;
compatibilityScore = 0;
bonusScore = ((1 - (totalVramRequirement / (vramState.total * maximumUnfitConfigurationResourceMultiplier))) +
(1 - (totalRamRequirement / ((ramState.total + swapState.total) * maximumUnfitConfigurationResourceMultiplier)))) / 2;
}
return {
compatibilityScore,
bonusScore,
totalScore: compatibilityScore + bonusScore,
resolvedValues: {
gpuLayers: resolvedGpuLayers,
contextSize: resolvedContextSize,
modelRamUsage: estimatedModelResourceUsage.cpuRam,
contextRamUsage: estimatedContextResourceUsage.cpuRam,
totalRamUsage: estimatedModelResourceUsage.cpuRam + estimatedContextResourceUsage.cpuRam,
modelVramUsage: estimatedModelResourceUsage.gpuVram,
contextVramUsage: estimatedContextResourceUsage.gpuVram,
totalVramUsage: estimatedModelResourceUsage.gpuVram + estimatedContextResourceUsage.gpuVram
}
};
}
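/**
* Resolve a `gpuLayers` option (such as `"auto"`, `"max"`, a number of layers, or a `{fitContext: {...}}` object,
* as used in `scoreModelConfigurationCompatibility` above) into a concrete number of layers to offload to the GPU,
* based on the given VRAM state and hardware capabilities.
*/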
async resolveModelGpuLayers(gpuLayers, {
    ignoreMemorySafetyChecks = false,
    getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
    llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize,
    llamaGpu = this._ggufInsights._llama.gpu,
    llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading,
    defaultContextFlashAttention = false,
    useMmap = this._ggufInsights._llama.supportsMmap
} = {}) {
return resolveModelGpuLayersOption(gpuLayers, {
ggufInsights: this._ggufInsights,
ignoreMemorySafetyChecks,
getVramState,
llamaVramPaddingSize,
llamaGpu,
llamaSupportsGpuOffloading,
defaultContextFlashAttention,
useMmap
});
}
/**
* Resolve a context size option for the given options and constraints.
*
* If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown.
*/
async resolveContextContextSize(contextSize, {
    modelGpuLayers,
    batchSize,
    modelTrainContextSize,
    flashAttention = false,
    getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()),
    getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()),
    getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()),
    llamaGpu = this._ggufInsights._llama.gpu,
    ignoreMemorySafetyChecks = false,
    isEmbeddingContext = false,
    sequences = getDefaultContextSequences()
}) {
return await resolveContextContextSizeOption({
contextSize,
batchSize,
sequences,
modelFileInsights: this._ggufInsights,
modelGpuLayers,
modelTrainContextSize,
flashAttention,
getVramState,
getRamState,
getSwapState,
llamaGpu,
ignoreMemorySafetyChecks,
isEmbeddingContext
});
}
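// Illustrative usage sketch, assuming `resolver` is an instance of this class and
// `gpuLayers` was resolved via `resolveModelGpuLayers`:
//
//   try {
//       const contextSize = await resolver.resolveContextContextSize("auto", {
//           modelGpuLayers: gpuLayers,
//           modelTrainContextSize: resolver.ggufInsights.trainContextSize ?? 4096
//       });
//       console.log("usable context size:", contextSize);
//   } catch (err) {
//       if (!(err instanceof InsufficientMemoryError))
//           throw err;
//       console.log("no context size fits the available resources");
//   }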
/** @internal */
static _create(ggufInsights) {
return new GgufInsightsConfigurationResolver(ggufInsights);
}
}
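// Scores how well a model fits CPU-only usage based on its file size:
// models up to ~1GB receive the full 70 points, with progressively fewer points as the size grows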
function scoreModelSizeForCpuOnlyUsage(modelSize) {
const s1GB = Math.pow(1024, 3);
return 70 - scoreLevels(modelSize, [{
start: s1GB,
end: s1GB * 2.5,
points: 46
}, {
start: s1GB * 2.5,
end: s1GB * 4,
points: 17
}, {
start: s1GB * 4,
points: 7
}]);
}
//# sourceMappingURL=GgufInsightsConfigurationResolver.js.map