node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
JavaScript
import { getLlamaWithoutBackend } from "../../bindings/utils/getLlamaWithoutBackend.js";
import { getDefaultContextBatchSize, getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
import { GgufArchitectureType } from "../types/GgufMetadataTypes.js";
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
export class GgufInsights {
/** @internal */ _llama;
/** @internal */ _modelSize;
/** @internal */ _totalLayers = null;
/** @internal */ _ggufFileInfo;
/** @internal */ _configurationResolver;
constructor(ggufFileInfo, llama) {
this._llama = llama;
this._ggufFileInfo = ggufFileInfo;
this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
}
/**
* Get warnings about the model file that would affect its usage.
*
* Most of these warnings are also generated by `llama.cpp`
*/
getWarnings(modelFilePath) {
const warnings = [];
const modelFilePathText = (modelFilePath != null && modelFilePath !== "")
? ` ("${getReadablePath(modelFilePath)}")`
: "";
if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" &&
this._ggufFileInfo?.metadata?.tokenizer?.ggml?.pre == null) {
// equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'"
warnings.push(`This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` +
"This may cause incorrect tokenization and thus degrade the generation quality. " +
"Consider using a newer model or regenerating this GGUF model file");
}
return warnings;
}
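// A usage sketch for `getWarnings()` above (hypothetical file path; `insights` is an existing `GgufInsights` instance):
// const warnings = insights.getWarnings("/models/example-model.gguf");
// warnings.forEach((warning) => console.warn(warning));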
get ggufFileInfo() {
return this._ggufFileInfo;
}
get configurationResolver() {
return this._configurationResolver;
}
/** The context size the model was trained on */
get trainContextSize() {
return this._ggufFileInfo.architectureMetadata.context_length;
}
/** The size of an embedding vector the model can produce */
get embeddingVectorSize() {
return this._ggufFileInfo.architectureMetadata.embedding_length;
}
get totalLayers() {
if (this._totalLayers != null)
return this._totalLayers;
const outputLayers = 1;
this._totalLayers = this._getFileLayers() + outputLayers;
return this._totalLayers;
}
get modelSize() {
return this._modelSize;
}
get flashAttentionSupported() {
// source: `llama_new_context_with_model` in `llama.cpp`
if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
return false;
else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
return false;
else {
const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0;
const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0;
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
if (nEmbdHeadK !== nEmbdHeadV)
return false;
}
return true;
}
get hasEncoder() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.t5:
case GgufArchitectureType.t5encoder:
return true;
}
return false;
}
get hasDecoder() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.t5encoder:
return false;
}
return true;
}
get isRecurrent() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.mamba:
case GgufArchitectureType.rwkv6:
return true;
}
return false;
}
estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }) {
const { cpu, gpu } = this._getTensorResourceSplit(gpuLayers);
return {
cpuRam: calculateTensorsSize(cpu, this._llama, false),
gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
};
}
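// A usage sketch for `estimateModelResourceRequirements()` above (hypothetical values; 33 is an arbitrary GPU layer count,
// and the returned sizes are rough byte estimates of the model weights split between RAM and VRAM):
// const { cpuRam, gpuVram } = insights.estimateModelResourceRequirements({ gpuLayers: 33 });
// console.log(`weights: ~${cpuRam} bytes RAM, ~${gpuVram} bytes VRAM`);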
/**
* Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`.
* The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that.
* The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now.
*/
estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false }) {
if (sequences == null)
sequences = getDefaultContextSequences();
if (batchSize == null)
batchSize = getDefaultContextBatchSize({ contextSize, sequences });
const actualContextSize = contextSize * sequences;
const totalLayers = this.totalLayers;
const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalLayers, totalLayers));
const finalCpuLayers = totalLayers - finalGpuLayers;
const llmData = this._ggufFileInfo.architectureMetadata;
const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0;
const logitsSize = vocabularySize * batchSize;
const embedSize = isEmbeddingContext
? (llmData.embedding_length ?? 0) * batchSize
: 0;
const sizeTBytes = 8; // sizeof(size_t)
const floatBytes = 4; // sizeof(float)
const uint32TBytes = 4; // sizeof(uint32_t)
const int32TBytes = 4; // sizeof(int32_t)
// source: `llama_state_get_size` in `llama.cpp`
const sRngSize = sizeTBytes;
const sRng = 64 * 1024; // LLAMA_MAX_RNG_STATE
const sNOutputs = sizeTBytes;
const sNOutputPos = batchSize * int32TBytes;
const sLogitsSize = sizeTBytes;
const sLogits = logitsSize * floatBytes;
const sEmbeddingSize = sizeTBytes;
const sEmbedding = embedSize * floatBytes;
const sKvBufSize = sizeTBytes;
const sKvHead = uint32TBytes;
const sKvSize = uint32TBytes;
const sKvUsed = uint32TBytes;
const sKv = 2 * int32TBytes * modelGpuLayers * this._llama._consts.ggmlTensorOverhead;
const sKvCell = this._llama._consts.llamaPosSize + sizeTBytes + this._llama._consts.llamaSeqIdSize;
const kvSelfLength = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
? Math.max(1, sequences)
: actualContextSize;
const sKvCells = kvSelfLength * sKvCell;
const overheadMemory = (sRngSize +
sRng +
sNOutputs +
sNOutputPos +
sLogitsSize +
sLogits +
sEmbeddingSize +
sEmbedding +
sKvBufSize +
sKvHead +
sKvSize +
sKvUsed +
sKv +
sKvCells);
// Estimates the memory allocated by `ggml_backend_sched_reserve` in `llama_new_context_with_model` in `llama.cpp`.
// If you read this line and have better insights on how to estimate this memory, please open a PR to improve it :)
const estimateGraphOverheadMemory = () => {
const s1MB = Math.pow(1024, 2);
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
let defaultCalculationAdjustment = 0;
if (batchSize == null)
return 0;
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) {
const expertCount = this._ggufFileInfo.architectureMetadata.expert_count ?? 0;
const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
const embeddingLength = llmData.embedding_length ?? 0;
if (expertCount > 0) {
const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (actualContextSize * headCount));
}
return int32TBytes * batchSize * (embeddingLength + (actualContextSize * headCount));
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) {
if (modelGpuLayers === this.totalLayers) {
defaultCalculationAdjustment -= (s1MB * 340) * (this.trainContextSize == null
? 1
: actualContextSize / this.trainContextSize);
}
else {
defaultCalculationAdjustment -= (s1MB * 250) + ((s1MB * 50) * (this.trainContextSize == null
? 1
: actualContextSize / this.trainContextSize));
}
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma) {
// only works properly when all layers are on the GPU, which is why it's commented out:
// return int32TBytes * batchSize * ((llmData.embedding_length ?? 0));
if (modelGpuLayers === this.totalLayers) {
defaultCalculationAdjustment += (s1MB * 40) - ((s1MB * 270) * (this.trainContextSize == null
? 1
: actualContextSize / this.trainContextSize));
}
else {
defaultCalculationAdjustment += -(s1MB * 550) + ((s1MB * 150) * (this.trainContextSize == null
? 1
: Math.max(0, (1 - (actualContextSize / this.trainContextSize)))));
}
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) {
const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
return (int32TBytes * batchSize * actualContextSize * headCount) - (50 * s1MB);
// if (modelGpuLayers === this.totalLayers) {
// defaultCalculationAdjustment += -(s1MB * 20) + (
// (s1MB * 250) * (
// this.trainContextSize == null
// ? 1
// : actualContextSize / this.trainContextSize
// )
// );
// } else {
// defaultCalculationAdjustment += -(s1MB * 40) + (
// (s1MB * 300) * (
// this.trainContextSize == null
// ? 1
// : actualContextSize / this.trainContextSize
// )
// );
// }
}
const totalElements = tensorInfo.length === 0
? this.totalLayers * (((llmData.embedding_length ?? 0) +
(llmData.feed_forward_length ?? 0)) / 2)
: tensorInfo.reduce((res, tensor) => {
return res + tensor.dimensions.reduce((res, dim) => res + Number(dim), 0);
}, 0);
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) {
// magic numbers for estimation. will be improved in the future
return (totalElements * 123 * (actualContextSize / 4096)) + defaultCalculationAdjustment;
}
// magic numbers for estimation. will be improved in the future
return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment;
};
const graphOverheadMemory = (flashAttention || !includeGraphOverhead)
? 0
: estimateGraphOverheadMemory();
const usingGpu = finalGpuLayers !== 0;
const cpuRam = (!usingGpu
? (overheadMemory + graphOverheadMemory)
: 0) +
this._estimateKvMemorySizeInBytes(actualContextSize, finalCpuLayers);
const gpuVram = usingGpu
? (overheadMemory +
graphOverheadMemory +
this._estimateKvMemorySizeInBytes(actualContextSize, finalGpuLayers < totalLayers
? (finalGpuLayers + 1)
: finalGpuLayers))
: 0;
return {
cpuRam,
gpuVram
};
}
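// A usage sketch for `estimateContextResourceRequirements()` above (hypothetical values; the result is a rough
// byte estimate of the memory a context with these parameters would need, split between RAM and VRAM):
// const contextCost = insights.estimateContextResourceRequirements({ contextSize: 4096, modelGpuLayers: 33 });
// console.log(contextCost.cpuRam, contextCost.gpuVram);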
/**
* Get the split tensor resources for CPU and GPU based on the number of GPU layers
* @internal
*/
_getTensorResourceSplit(gpuLayers) {
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
const architecture = this._ggufFileInfo.metadata?.general?.architecture;
if (gpuLayers === 0) {
return {
cpu: tensorInfo,
gpu: []
};
}
const fileLayers = this._getFileLayers();
const startGpuLayer = Math.max(0, fileLayers - gpuLayers);
const gpuTensors = [];
const cpuTensors = [];
let tokenEmbedLayer;
let mainOutputLayer;
for (const singleTensorInfo of tensorInfo) {
if (isMainOutputLayer(singleTensorInfo.name))
mainOutputLayer = singleTensorInfo;
else if (isTokenEmbedLayer(singleTensorInfo.name))
tokenEmbedLayer = singleTensorInfo;
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
// loaded with `model.dev_input`, which is always set to the CPU
if (isInputLayer(singleTensorInfo.name)) {
cpuTensors.push(singleTensorInfo);
continue;
}
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
// loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
else if (isOutputLayer(singleTensorInfo.name)) {
if (gpuLayers === this.totalLayers) {
gpuTensors.push(singleTensorInfo);
continue;
}
else {
cpuTensors.push(singleTensorInfo);
continue;
}
}
const { layerNumber } = parseTensorName(singleTensorInfo.name);
if (gpuLayers !== this.totalLayers) {
if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
if (layerNumber != null && layerNumber >= startGpuLayer)
gpuTensors.push(singleTensorInfo);
else
cpuTensors.push(singleTensorInfo);
continue;
}
}
if (layerNumber == null || layerNumber >= startGpuLayer)
gpuTensors.push(singleTensorInfo);
else
cpuTensors.push(singleTensorInfo);
}
if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
gpuTensors.push(tokenEmbedLayer);
return {
cpu: cpuTensors,
gpu: gpuTensors
};
}
/** @internal */
_determineNumberOfLayersFromTensorInfo() {
const layerNumbers = new Set();
for (const singleTensorInfo of (this._ggufFileInfo.fullTensorInfo ?? [])) {
const { layerNumber } = parseTensorName(singleTensorInfo.name);
if (layerNumber != null)
layerNumbers.add(layerNumber);
}
return layerNumbers.size;
}
/** @internal */
_getFileLayers() {
return this._ggufFileInfo.architectureMetadata.block_count ?? this._determineNumberOfLayersFromTensorInfo();
}
/** @internal */
_estimateKvMemorySizeInBytes(contextSize, layers) {
// source: `llama_kv_cache_init` in `llama.cpp`
const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nHeadKv = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
const modelNEmbdKGqa = nEmbdHeadK * nHeadKv;
const ssmDConv = this._ggufFileInfo.architectureMetadata.ssm?.conv_kernel ?? 0;
const ssmDInner = this._ggufFileInfo.architectureMetadata.ssm?.inner_size ?? 0;
const modelNEmbdKS = (ssmDConv > 0 ? (ssmDConv - 1) : 0) * ssmDInner;
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
const modelNEmbdVGqa = nEmbdHeadV * nHeadKv;
const ssmDState = this._ggufFileInfo.architectureMetadata.ssm?.state_size ?? 0;
const modelNEmbdVS = ssmDState * ssmDInner;
const totalNEmbdKGqa = modelNEmbdKGqa + modelNEmbdKS;
const totalNEmbdVGqa = modelNEmbdVGqa + modelNEmbdVS;
const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
// if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`,
// this would have to depend on that value
? this._llama._consts.ggmlTypeF32Size
: this._llama._consts.ggmlTypeF16Size;
const valueTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
// if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`,
// this would have to depend on that value
? this._llama._consts.ggmlTypeF32Size
: this._llama._consts.ggmlTypeF16Size;
const keyTensorsSize = layers * totalNEmbdKGqa * contextSize * keyTypeSize;
const valueTensorsSize = layers * totalNEmbdVGqa * contextSize * valueTypeSize;
return keyTensorsSize + valueTensorsSize;
}
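// A worked example for `_estimateKvMemorySizeInBytes()` above, assuming a hypothetical non-recurrent model with
// head_count=32, head_count_kv=8, embedding_length=4096, no SSM metadata, and an F16 KV cache (2 bytes per element):
// nEmbdHeadK = nEmbdHeadV = 4096 / 32 = 128, so totalNEmbdKGqa = totalNEmbdVGqa = 128 * 8 = 1024.
// With contextSize=4096 and layers=32, the keys take 32 * 1024 * 4096 * 2 bytes = 256 MiB,
// the values take the same, and the estimated KV cache size is ~512 MiB.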
/**
* @param ggufFileInfo
* @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
* If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
* doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and it will be shared with other `GgufInsights` instances
* that need a fallback `Llama` instance.
*/
static async from(ggufFileInfo, llama) {
let resolvedLlama = llama;
if (resolvedLlama == null)
resolvedLlama = await getLlamaWithoutBackend();
return new GgufInsights(ggufFileInfo, resolvedLlama);
}
}
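// A sketch of creating a `GgufInsights` instance from consumer code (assumes the public `node-llama-cpp`
// exports `getLlama` and `readGgufFileInfo`; the model path is hypothetical):
// import { getLlama, readGgufFileInfo } from "node-llama-cpp";
//
// const llama = await getLlama();
// const ggufFileInfo = await readGgufFileInfo("/models/example-model.gguf");
// const insights = await GgufInsights.from(ggufFileInfo, llama);
// console.log(insights.totalLayers, insights.trainContextSize, insights.flashAttentionSupported);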
function parseTensorName(tensorName) {
if (tensorName == null)
return { layerNumber: undefined };
const layerTensorPrefix = "blk.";
if (!tensorName.startsWith(layerTensorPrefix))
return { layerNumber: undefined };
const dotIndex = tensorName.indexOf(".", layerTensorPrefix.length);
const layerNumberString = tensorName.slice(layerTensorPrefix.length, dotIndex < 0
? tensorName.length
: dotIndex);
const layerNumber = parseInt(layerNumberString);
if (Number.isFinite(layerNumber))
return { layerNumber };
return { layerNumber: undefined };
}
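// For example, `parseTensorName("blk.12.attn_q.weight")` returns `{ layerNumber: 12 }`,
// while a tensor without a `blk.` prefix, such as `"output_norm.weight"`, returns `{ layerNumber: undefined }`.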
function calculateTensorsSize(tensorsInfo, llama, useMmap, startFromTensorDataOffset = false) {
if (!useMmap) {
let size = 0;
for (const tensorInfo of tensorsInfo)
size += calculateTensorSize(tensorInfo, llama);
return size;
}
const fileStats = new Map();
for (const tensorInfo of tensorsInfo) {
let stats = fileStats.get(tensorInfo.filePart);
if (stats == null) {
stats = {
tensorsSize: 0
};
fileStats.set(tensorInfo.filePart, stats);
}
const tensorSize = calculateTensorSize(tensorInfo, llama);
stats.tensorsSize += tensorSize;
const startOffset = tensorInfo.offset;
const endOffset = typeof startOffset === "number"
? startOffset + tensorSize
: startOffset + BigInt(tensorSize);
if (startFromTensorDataOffset)
stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
else if (stats.startOffset == null || startOffset < stats.startOffset)
stats.startOffset = startOffset;
if (stats.endOffset == null || endOffset > stats.endOffset)
stats.endOffset = endOffset;
}
let size = 0;
for (const [, stats] of fileStats) {
const offsetSize = (stats.endOffset == null || stats.startOffset == null)
? 0
: Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
const tensorsSize = stats.tensorsSize;
size += Math.max(offsetSize, tensorsSize);
}
return size;
}
function calculateTensorSize(tensor, llama) {
const typeSize = llama._bindings.getTypeSizeForGgmlType(tensor.ggmlType);
const blockSize = llama._bindings.getBlockSizeForGgmlType(tensor.ggmlType);
const ggmlMaxDims = llama._consts.ggmlMaxDims;
if (typeSize == null || blockSize == null)
throw new Error("Invalid type or block size");
const { ne, nb } = getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims });
if (blockSize === 1) {
let totalBytes = typeSize;
for (let i = 0; i < ggmlMaxDims; i++) {
totalBytes += (ne[i] - 1) * nb[i];
}
return totalBytes;
}
else {
let totalBytes = Math.floor((ne[0] * nb[0]) / blockSize);
for (let i = 1; i < ggmlMaxDims; i++) {
totalBytes += (ne[i] - 1) * nb[i];
}
return totalBytes;
}
}
function getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims }) {
// number of elements
// source: `ggml_new_tensor_impl` in `ggml.c`
const ne = [
...tensor.dimensions,
...(Array(Math.max(0, ggmlMaxDims - tensor.dimensions.length)).fill(1))
].slice(0, ggmlMaxDims);
// number of bytes
// source: `ggml_new_tensor_impl` in `ggml.c`
const nb = [
typeSize,
Math.floor(typeSize * (ne[0] / blockSize)),
...Array(ggmlMaxDims - 2).fill(0)
];
for (let i = 2; i < ggmlMaxDims; i++) {
nb[i] = nb[i - 1] * ne[i - 1];
}
return {
ne,
nb
};
}
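// A worked example for `calculateTensorSize()` and `getTensorNeAndNb()` above, for a hypothetical unquantized
// F32 tensor (typeSize=4, blockSize=1) with dimensions [4096, 32000]: the size reduces to elementCount * typeSize,
// i.e. 4096 * 32000 * 4 = 524,288,000 bytes (~500 MiB).
// For block-quantized types, blockSize > 1 and `nb[0]` is the per-block byte size, so the first dimension is
// counted in blocks of `blockSize` elements rather than in individual elements.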
function isInputLayer(layerName) {
const [firstPart] = layerName.split(".");
if (firstPart == null)
return false;
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that are mapped
// to `LLM_TENSOR_LAYER_INPUT` in `llm_tensor_info_mapping`
switch (firstPart) {
case "token_embd":
case "token_embd_norm":
case "token_types":
case "position_embd":
return true;
}
return false;
}
function isOutputLayer(layerName) {
const [firstPart, secondPart] = layerName.split(".");
if (firstPart == null)
return false;
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that are mapped
// to `LLM_TENSOR_LAYER_OUTPUT` in `llm_tensor_info_mapping`
switch (firstPart) {
case "output":
case "output_norm":
case "cls":
return true;
}
if (secondPart == null)
return false;
// source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that are mapped
// to `LLM_TENSOR_LAYER_OUTPUT` in `llm_tensor_info_mapping`
switch (firstPart + "." + secondPart) {
case "cls.output":
case "dec.output_norm":
case "enc.output_norm":
return true;
}
return false;
}
function isMainOutputLayer(layerName) {
const [firstPart] = layerName.split(".");
return firstPart === "output";
}
function isTokenEmbedLayer(layerName) {
const [firstPart] = layerName.split(".");
return firstPart === "token_embd";
}
//# sourceMappingURL=GgufInsights.js.map