node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
JavaScript
import { getLlamaWithoutBackend } from "../../bindings/utils/getLlamaWithoutBackend.js";
import { getDefaultContextBatchSize, getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
import { GgufArchitectureType } from "../types/GgufMetadataTypes.js";
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
export class GgufInsights {
/** @internal */ _llama;
/** @internal */ _modelSize;
/** @internal */ _totalLayers = null;
/** @internal */ _ggufFileInfo;
/** @internal */ _configurationResolver;
constructor(ggufFileInfo, llama) {
this._llama = llama;
this._ggufFileInfo = ggufFileInfo;
this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
}
/**
* Get warnings about the model file that would affect its usage.
*
* Most of these warnings are also generated by `llama.cpp`
*/
getWarnings(modelFilePath) {
const warnings = [];
const modelFilePathText = (modelFilePath != null && modelFilePath !== "")
? ` ("${getReadablePath(modelFilePath)}")`
: "";
if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" &&
this._ggufFileInfo?.metadata?.tokenizer?.ggml?.pre == null) {
// equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'"
warnings.push(`This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` +
"This may cause incorrect tokenization and thus degrade the generation quality. " +
"Consider using a newer model or regenerating this GGUF model file");
}
return warnings;
}
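// A usage sketch for `getWarnings()` above (hypothetical file path; `insights` is an existing `GgufInsights` instance):
// const warnings = insights.getWarnings("/models/example-model.gguf");
// warnings.forEach((warning) => console.warn(warning));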
get ggufFileInfo() {
return this._ggufFileInfo;
}
get configurationResolver() {
return this._configurationResolver;
}
/** The context size the model was trained on */
get trainContextSize() {
return this._ggufFileInfo.architectureMetadata.context_length;
}
/** The size of an embedding vector the model can produce */
get embeddingVectorSize() {
return this._ggufFileInfo.architectureMetadata.embedding_length;
}
get totalLayers() {
if (this._totalLayers != null)
return this._totalLayers;
const outputLayers = 1;
this._totalLayers = this._getFileLayers() + outputLayers;
return this._totalLayers;
}
get modelSize() {
return this._modelSize;
}
get flashAttentionSupported() {
// source: `llama_new_context_with_model` in `llama.cpp`
if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
return false;
else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
return false;
else {
const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0;
const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0;
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
if (nEmbdHeadK !== nEmbdHeadV)
return false;
}
return true;
}
get hasEncoder() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.t5:
case GgufArchitectureType.t5encoder:
return true;
}
return false;
}
get hasDecoder() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.t5encoder:
return false;
}
return true;
}
get isRecurrent() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.mamba:
case GgufArchitectureType.rwkv6:
return true;
}
return false;
}
estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }) {
const { cpu, gpu } = this._getTensorResourceSplit(gpuLayers);
return {
cpuRam: calculateTensorsSize(cpu, this._llama, false),
gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
};
}
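// A usage sketch for `estimateModelResourceRequirements()` above (hypothetical values; 33 is an arbitrary GPU layer count,
// and the returned sizes are rough byte estimates of the model weights split between RAM and VRAM):
// const { cpuRam, gpuVram } = insights.estimateModelResourceRequirements({ gpuLayers: 33 });
// console.log(`weights: ~${cpuRam} bytes RAM, ~${gpuVram} bytes VRAM`);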
/**
* Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`.
* The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that.
* The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now.
*/
estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false }) {
if (sequences == null)
sequences = getDefaultContextSequences();
if (batchSize == null)
batchSize = getDefaultContextBatchSize({ contextSize, sequences });
const actualContextSize = contextSize * sequences;
const totalLayers = this.totalLayers;
const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalLayers, totalLayers));
const finalCpuLayers = totalLayers - finalGpuLayers;
const llmData = this._ggufFileInfo.architectureMetadata;
const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0;
const logitsSize = vocabularySize * batchSize;
const embedSize = isEmbeddingContext
? (llmData.embedding_length ?? 0) * batchSize
: 0;
const sizeTBytes = 8; // sizeof(size_t)
const floatBytes = 4; // sizeof(float)
const uint32TBytes = 4; // sizeof(uint32_t)
const int32TBytes = 4; // sizeof(int32_t)
// source: `llama_state_get_size` in `llama.cpp`
const sRngSize = sizeTBytes;
const sRng = 64 * 1024; // LLAMA_MAX_RNG_STATE
const sNOutputs = sizeTBytes;
const sNOutputPos = batchSize * int32TBytes;
const sLogitsSize = sizeTBytes;
const sLogits = logitsSize * floatBytes;
const sEmbeddingSize = sizeTBytes;
const sEmbedding = embedSize * floatBytes;
const sKvBufSize = sizeTBytes;
const sKvHead = uint32TBytes;
const sKvSize = uint32TBytes;
const sKvUsed = uint32TBytes;
const sKv = 2 * int32TBytes * modelGpuLayers * this._llama._consts.ggmlTensorOverhead;
const sKvCell = this._llama._consts.llamaPosSize + sizeTBytes + this._llama._consts.llamaSeqIdSize;
const kvSelfLength = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
? Math.max(1, sequences)
: actualContextSize;
const sKvCells = kvSelfLength * sKvCell;
const overheadMemory = (sRngSize +
sRng +
sNOutputs +
sNOutputPos +
sLogitsSize +
sLogits +
sEmbeddingSize +
sEmbedding +
sKvBufSize +
sKvHead +
sKvSize +
sKvUsed +
sKv +
sKvCells);
// Estimates the memory allocated by `ggml_backend_sched_reserve` in `llama_new_context_with_model` in `llama.cpp`.
// If you read this line and have better insights on how to estimate this memory, please open a PR to improve it :)
const estimateGraphOverheadMemory = () => {
const s1MB = Math.pow(1024, 2);
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
let defaultCalculationAdjustment = 0;
if (batchSize == null)
return 0;
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) {
const expertCount = this._ggufFileInfo.architectureMetadata.expert_count ?? 0;
const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
const embeddingLength = llmData.embedding_length ?? 0;
if (expertCount > 0) {
const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (actualContextSize * headCount));
}
return int32TBytes * batchSize * (embeddingLength + (actualContextSize * headCount));
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) {
if (modelGpuLayers === this.totalLayers) {
defaultCalculationAdjustment -= (s1MB * 340) * (this.trainContextSize == null
? 1
: actualContextSize / this.trainContextSize);
}
else {
defaultCalculationAdjustment -= (s1MB * 250) + ((s1MB * 50) * (this.trainContextSize == null
? 1
: actualContextSize / this.trainContextSize));
}
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma) {
// only works properly when all layers are on the GPU, which is why it's commented out:
// return int32TBytes * batchSize * ((llmData.embedding_length ?? 0));
if (modelGpuLayers === this.totalLayers) {
defaultCalculationAdjustment += (s1MB * 40) - ((s1MB * 270) * (this.trainContextSize == null
? 1
: actualContextSize / this.trainContextSize));
}
else {
defaultCalculationAdjustment += -(s1MB * 550) + ((s1MB * 150) * (this.trainContextSize == null
? 1
: Math.max(0, (1 - (actualContextSize / this.trainContextSize)))));
}
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) {
const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
return (int32TBytes * batchSize * actualContextSize * headCount) - (50 * s1MB);
// if (modelGpuLayers === this.totalLayers) {
// defaultCalculationAdjustment += -(s1MB * 20) + (
// (s1MB * 250) * (
// this.trainContextSize == null
// ? 1
// : actualContextSize / this.trainContextSize
// )
// );
// } else {
// defaultCalculationAdjustment += -(s1MB * 40) + (
// (s1MB * 300) * (
// this.trainContextSize == null
// ? 1
// : actualContextSize / this.trainContextSize
// )
// );
// }
}
const totalElements = tensorInfo.length === 0
? this.totalLayers * (((llmData.embedding_length ?? 0) +
(llmData.feed_forward_length ?? 0)) / 2)
: tensorInfo.reduce((res, tensor) => {
return res + tensor.dimensions.reduce((res, dim) => res + Number(dim), 0);
}, 0);
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) {
// magic numbers for estimation. will be improved in the future
return (totalElements * 123 * (actualContextSize / 4096)) + defaultCalculationAdjustment;
}
// magic numbers for estimation. will be improved in the future
return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment;
};
const graphOverheadMemory = (flashAttention || !includeGraphOverhead)
? 0
: estimateGraphOverheadMemory();
const usingGpu = finalGpuLayers !== 0;
const cpuRam = (!usingGpu
? (overheadMemory + graphOverheadMemory)
: 0) +
this._estimateKvMemorySizeInBytes(actualContextSize, finalCpuLayers);
const gpuVram = usingGpu
? (overheadMemory +
graphOverheadMemory +
this._estimateKvMemorySizeInBytes(actualContextSize, finalGpuLayers < totalLayers
? (finalGpuLayers + 1)
: finalGpuLayers))
: 0;
return {
cpuRam,
gpuVram
};
}
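// A usage sketch for `estimateContextResourceRequirements()` above (hypothetical values; the result is a rough
// byte estimate of the memory a context with these parameters would need, split between RAM and VRAM):
// const contextCost = insights.estimateContextResourceRequirements({ contextSize: 4096, modelGpuLayers: 33 });
// console.log(contextCost.cpuRam, contextCost.gpuVram);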
/**
* Get the split tensor resources for CPU and GPU based on the number of GPU layers
* @internal
*/
_getTensorResourceSplit(gpuLayers) {
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
const architecture = this._ggufFileInfo.metadata?.general?.architecture;
if (gpuLayers === 0) {
return {
cpu: tensorInfo,
gpu: []
};
}
const fileLayers = this._getFileLayers();
const startGpuLayer = Math.max(0, fileLayers - gpuLayers);
const gpuTensors = [];
const cpuTensors = [];
let tokenEmbedLayer;
let mainOutputLayer;
for (const singleTensorInfo of tensorInfo) {
if (isMainOutputLayer(singleTensorInfo.name))
mainOutputLayer = singleTensorInfo;
else if (isTokenEmbedLayer(singleTensorInfo.name))
tokenEmbedLayer = singleTensorInfo;
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
// loaded with `model.dev_input`, which is always set to the CPU
if (isInputLayer(singleTensorInfo.name)) {
cpuTensors.push(singleTensorInfo);
continue;
}
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
// loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
else if (isOutputLayer(singleTensorInfo.name)) {
if (gpuLayers === this.totalLayers) {
gpuTensors.push(singleTensorInfo);
continue;
}
else {
cpuTensors.push(singleTensorInfo);
continue;
}
}
const { layerNumber } = parseTensorName(singleTensorInfo.name);
if (gpuLayers !== this.totalLayers) {
if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
if (layerNumber != null && layerNumber >= startGpuLayer)
gpuTensors.push(singleTensorInfo);
else
cpuTensors.push(singleTensorInfo);
continue;
}
}
if (layerNumber == null || layerNumber >= startGpuLayer)
gpuTensors.push(singleTensorInfo);
else
cpuTensors.push(singleTensorInfo);
}
if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
gpuTensors.push(tokenEmbedLayer);
return {
cpu: cpuTensors,
gpu: gpuTensors
};
}
/** @internal */
_determineNumberOfLayersFromTensorInfo() {
const layerNumbers = new Set();
for (const singleTensorInfo of (this._ggufFileInfo.fullTensorInfo ?? [])) {
const { layerNumber } = parseTensorName(singleTensorInfo.name);
if (layerNumber != null)
layerNumbers.add(layerNumber);
}
return layerNumbers.size;
}
/** @internal */
_getFileLayers() {
return this._ggufFileInfo.architectureMetadata.block_count ?? this._determineNumberOfLayersFromTensorInfo();
}
/** @internal */
_estimateKvMemorySizeInBytes(contextSize, layers) {
// source: `llama_kv_cache_init` in `llama.cpp`
const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nHeadKv = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
const modelNEmbdKGqa = nEmbdHeadK * nHeadKv;
const ssmDConv = this._ggufFileInfo.architectureMetadata.ssm?.conv_kernel ?? 0;
const ssmDInner = this._ggufFileInfo.architectureMetadata.ssm?.inner_size ?? 0;
const modelNEmbdKS = (ssmDConv > 0 ? (ssmDConv - 1) : 0) * ssmDInner;
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
const modelNEmbdVGqa = nEmbdHeadV * nHeadKv;
const ssmDState = this._ggufFileInfo.architectureMetadata.ssm?.state_size ?? 0;
const modelNEmbdVS = ssmDState * ssmDInner;
const totalNEmbdKGqa = modelNEmbdKGqa + modelNEmbdKS;
const totalNEmbdVGqa = modelNEmbdVGqa + modelNEmbdVS;
const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
// if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`,
// this would have to depend on that value
? this._llama._consts.ggmlTypeF32Size
: this._llama._consts.ggmlTypeF16Size;
const valueTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
// if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`,
// this would have to depend on that value
? this._llama._consts.ggmlTypeF32Size
: this._llama._consts.ggmlTypeF16Size;
const keyTensorsSize = layers * totalNEmbdKGqa * contextSize * keyTypeSize;
const valueTensorsSize = layers * totalNEmbdVGqa * contextSize * valueTypeSize;
return keyTensorsSize + valueTensorsSize;
}
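// A worked example for `_estimateKvMemorySizeInBytes()` above, assuming a hypothetical non-recurrent model with
// head_count=32, head_count_kv=8, embedding_length=4096, no SSM metadata, and an F16 KV cache (2 bytes per element):
// nEmbdHeadK = nEmbdHeadV = 4096 / 32 = 128, so totalNEmbdKGqa = totalNEmbdVGqa = 128 * 8 = 1024.
// With contextSize=4096 and layers=32, the keys take 32 * 1024 * 4096 * 2 bytes = 256 MiB,
// the values take the same, and the estimated KV cache size is ~512 MiB.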
/**
* @param ggufFileInfo
* @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
* If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
* doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and it will be shared with other `GgufInsights` instances
* that need a fallback `Llama` instance.
*/
static async from(ggufFileInfo, llama) {
let resolvedLlama = llama;
if (resolvedLlama == null)
resolvedLlama = await getLlamaWithoutBackend();
return new GgufInsights(ggufFileInfo, resolvedLlama);
}
}
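// A sketch of creating a `GgufInsights` instance from consumer code (assumes the public `node-llama-cpp`
// exports `getLlama` and `readGgufFileInfo`; the model path is hypothetical):
// import { getLlama, readGgufFileInfo } from "node-llama-cpp";
//
// const llama = await getLlama();
// const ggufFileInfo = await readGgufFileInfo("/models/example-model.gguf");
// const insights = await GgufInsights.from(ggufFileInfo, llama);
// console.log(insights.totalLayers, insights.trainContextSize, insights.flashAttentionSupported);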
function parseTensorName(tensorName) {
if (tensorName == null)
return { layerNumber: undefined };
const layerTensorPrefix = "blk.";
if (!tensorName.startsWith(layerTensorPrefix))
return { layerNumber: undefined };
const dotIndex = tensorName.indexOf(".", layerTensorPrefix.length);
const layerNumberString = tensorName.slice(layerTensorPrefix.length, dotIndex < 0
? tensorName.length
: dotIndex);
const layerNumber = parseInt(layerNumberString);
if (Number.isFinite(layerNumber))
return { layerNumber };
return { layerNumber: undefined };
}
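// For example, `parseTensorName("blk.12.attn_q.weight")` returns `{ layerNumber: 12 }`,
// while a tensor without a `blk.` prefix, such as `"output_norm.weight"`, returns `{ layerNumber: undefined }`.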
function calculateTensorsSize(tensorsInfo, llama, useMmap, startFromTensorDataOffset = false) {
if (!useMmap) {
let size = 0;
for (const tensorInfo of tensorsInfo)
size += calculateTensorSize(tensorInfo, llama);
return size;
}
const fileStats = new Map();
for (const tensorInfo of tensorsInfo) {
let stats = fileStats.get(tensorInfo.filePart);
if (stats == null) {
stats = {
tensorsSize: 0
};
fileStats.set(tensorInfo.filePart, stats);
}
const tensorSize = calculateTensorSize(tensorInfo, llama);
stats.tensorsSize += tensorSize;
const startOffset = tensorInfo.offset;
const endOffset = typeof startOffset === "number"
? startOffset + tensorSize
: startOffset + BigInt(tensorSize);
if (startFromTensorDataOffset)
stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
else if (stats.startOffset == null || startOffset < stats.startOffset)
stats.startOffset = startOffset;
if (stats.endOffset == null || endOffset > stats.endOffset)
stats.endOffset = endOffset;
}
let size = 0;
for (const [, stats] of fileStats) {
const offsetSize = (stats.endOffset == null || stats.startOffset == null)
? 0
: Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
const tensorsSize = stats.tensorsSize;
size += Math.max(offsetSize, tensorsSize);
}
return size;
}
function calculateTensorSize(tensor, llama) {
const typeSize = llama._bindings.getTypeSizeForGgmlType(tensor.ggmlType);
const blockSize = llama._bindings.getBlockSizeForGgmlType(tensor.ggmlType);
const ggmlMaxDims = llama._consts.ggmlMaxDims;
if (typeSize == null || blockSize == null)
throw new Error("Invalid type or block size");
const { ne, nb } = getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims });
if (blockSize === 1) {
let totalBytes = typeSize;
for (let i = 0; i < ggmlMaxDims; i++) {
totalBytes += (ne[i] - 1) * nb[i];
}
return totalBytes;
}
else {
let totalBytes = Math.floor((ne[0] * nb[0]) / blockSize);
for (let i = 1; i < ggmlMaxDims; i++) {
totalBytes += (ne[i] - 1) * nb[i];
}
return totalBytes;
}
}
function getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims }) {
// number of elements
// source: `ggml_new_tensor_impl` in `ggml.c`
const ne = [
...tensor.dimensions,
...(Array(Math.max(0, ggmlMaxDims - tensor.dimensions.length)).fill(1))
].slice(0, ggmlMaxDims);
// number of bytes
// source: `ggml_new_tensor_impl` in `ggml.c`
const nb = [
typeSize,
Math.floor(typeSize * (ne[0] / blockSize)),
...Array(ggmlMaxDims - 2).fill(0)
];
for (let i = 2; i < ggmlMaxDims; i++) {
nb[i] = nb[i - 1] * ne[i - 1];
}
return {
ne,
nb
};
}
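// A worked example for `calculateTensorSize()` and `getTensorNeAndNb()` above, for a hypothetical unquantized
// F32 tensor (typeSize=4, blockSize=1) with dimensions [4096, 32000]: the size reduces to elementCount * typeSize,
// i.e. 4096 * 32000 * 4 = 524,288,000 bytes (~500 MiB).
// For block-quantized types, blockSize > 1 and `nb[0]` is the per-block byte size, so the first dimension is
// counted in blocks of `blockSize` elements rather than in individual elements.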
function isInputLayer(layerName) {
const [firstPart] = layerName.split(".");
if (firstPart == null)
return false;
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that are mapped
// to `LLM_TENSOR_LAYER_INPUT` in `llm_tensor_info_mapping`
switch (firstPart) {
case "token_embd":
case "token_embd_norm":
case "token_types":
case "position_embd":
return true;
}
return false;
}
function isOutputLayer(layerName) {
const [firstPart, secondPart] = layerName.split(".");
if (firstPart == null)
return false;
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that are mapped
// to `LLM_TENSOR_LAYER_OUTPUT` in `llm_tensor_info_mapping`
switch (firstPart) {
case "output":
case "output_norm":
case "cls":
return true;
}
if (secondPart == null)
return false;
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that are mapped
// to `LLM_TENSOR_LAYER_OUTPUT` in `llm_tensor_info_mapping`
switch (firstPart + "." + secondPart) {
case "cls.output":
case "dec.output_norm":
case "enc.output_norm":
return true;
}
return false;
}
function isMainOutputLayer(layerName) {
const [firstPart] = layerName.split(".");
return firstPart === "output";
}
function isTokenEmbedLayer(layerName) {
const [firstPart] = layerName.split(".");
return firstPart === "token_embd";
}
//# sourceMappingURL=GgufInsights.js.map