import { getLlamaWithoutBackend } from "../../bindings/utils/getLlamaWithoutBackend.js";
import { getDefaultContextBatchSize, getDefaultContextSequences } from "../../evaluator/LlamaContext/LlamaContext.js";
import { GgufArchitectureType } from "../types/GgufMetadataTypes.js";
import { getReadablePath } from "../../cli/utils/getReadablePath.js";
import { padSafeContextSize } from "../../evaluator/LlamaContext/utils/padSafeContextSize.js";
import { GgufInsightsConfigurationResolver } from "./GgufInsightsConfigurationResolver.js";
import { GgufInsightsTokens } from "./GgufInsightsTokens.js";
export class GgufInsights {
/** @internal */ _llama;
/** @internal */ _modelSize;
/** @internal */ _totalFileLayers = null;
/** @internal */ _supportsRanking;
/** @internal */ _ggufFileInfo;
/** @internal */ _configurationResolver;
/** @internal */ _tokens;
constructor(ggufFileInfo, llama) {
this._llama = llama;
this._ggufFileInfo = ggufFileInfo;
this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? [], llama, true, true);
this._configurationResolver = GgufInsightsConfigurationResolver._create(this);
this._tokens = GgufInsightsTokens._create(this);
}
/**
* Get warnings about the model file that would affect its usage.
*
* Most of these warnings are also generated by `llama.cpp`
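     *
     * A rough usage sketch (assuming `ggufFileInfo` was read from the same file that `modelPath` points to):
     * ```js
     * const insights = await GgufInsights.from(ggufFileInfo);
     * for (const warning of insights.getWarnings(modelPath))
     *     console.warn(warning);
     * ```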
*/
getWarnings(modelFilePath) {
const warnings = [];
const modelFilePathText = (modelFilePath != null && modelFilePath !== "")
? ` ("${getReadablePath(modelFilePath)}")`
: "";
if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" &&
            this._ggufFileInfo?.metadata?.tokenizer?.ggml?.pre == null) {
// equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'"
warnings.push(`This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` +
"This may cause incorrect tokenization and thus degrade the generation quality. " +
"Consider using a newer model or regenerating this GGUF model file");
}
return warnings;
}
get ggufFileInfo() {
return this._ggufFileInfo;
}
get configurationResolver() {
return this._configurationResolver;
}
get tokens() {
return this._tokens;
}
/** The context size the model was trained on */
get trainContextSize() {
return this._ggufFileInfo.architectureMetadata.context_length;
}
/** The size of an embedding vector the model can produce */
get embeddingVectorSize() {
return this._ggufFileInfo.architectureMetadata.embedding_length;
}
get totalLayers() {
const outputLayers = 1;
return this._getTotalFileLayers() + outputLayers;
}
get modelSize() {
return this._modelSize;
}
get flashAttentionSupported() {
// source: `llama_new_context_with_model` in `llama.cpp`
if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok)
return false;
else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2)
return false;
else {
const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0;
const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0;
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
if (nEmbdHeadK !== nEmbdHeadV)
return false;
}
return true;
}
get hasEncoder() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.t5:
case GgufArchitectureType.t5encoder:
return true;
}
return false;
}
get hasDecoder() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.t5encoder:
return false;
}
return true;
}
get isRecurrent() {
switch (this._ggufFileInfo.metadata?.general?.architecture) {
case GgufArchitectureType.mamba:
case GgufArchitectureType.mamba2:
case GgufArchitectureType.rwkv6:
case GgufArchitectureType.rwkv6qwen2:
case GgufArchitectureType.rwkv7:
case GgufArchitectureType.arwkv7:
return true;
}
return false;
}
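    /**
     * Whether the model can be used for reranking.
     *
     * This is a heuristic: it checks for a classification head tensor (`cls.weight` or `cls.output.weight`)
     * together with either a SEP/EOS token or a valid reranking chat template, and excludes encoder-decoder models.
     */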
get supportsRanking() {
if (this._supportsRanking != null)
return this._supportsRanking;
const layers = this._ggufFileInfo.fullTensorInfo ?? [];
for (let i = layers.length - 1; i >= 0; i--) {
const tensor = layers[i];
if (tensor == null)
continue;
if (tensor.name === "cls.weight" || tensor.name === "cls.output.weight") {
this._supportsRanking = this.tokens.sepToken != null || this.tokens.eosToken != null ||
isRankingTemplateValid(parseRankingTemplate(this._ggufFileInfo.metadata?.tokenizer?.["chat_template.rerank"]));
this._supportsRanking &&= !(this.hasEncoder && this.hasDecoder); // encoder-decoder models are not supported
return this._supportsRanking;
}
}
this._supportsRanking = false;
return this._supportsRanking;
}
/**
* The size of the SWA (Sliding Window Attention).
*
* When `undefined`, the model does not use sliding window attention.
*/
get swaSize() {
const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window;
if (slidingWindow == null || slidingWindow <= 0)
return undefined;
const trainContextSize = this.trainContextSize;
if (trainContextSize != null && slidingWindow >= trainContextSize)
return undefined;
return slidingWindow;
}
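    /**
     * Estimate the RAM and VRAM needed to load the model weights with the given number of GPU layers,
     * based on how the tensors are split between the CPU and the GPU (context memory is not included).
     *
     * A rough usage sketch (the `gpuLayers` value here is hypothetical):
     * ```js
     * const insights = await GgufInsights.from(ggufFileInfo);
     * const {cpuRam, gpuVram} = insights.estimateModelResourceRequirements({gpuLayers: 20});
     * ```
     */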
estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }) {
const { cpu, gpu } = this._getTensorResourceSplit(gpuLayers);
return {
cpuRam: calculateTensorsSize(cpu, this._llama, false),
gpuVram: calculateTensorsSize(gpu, this._llama, useMmap && gpuSupportsMmap)
};
}
/**
* Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`.
     * The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that;
     * this estimate will be made more precise in the future, but it's good enough for now.
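     *
     * A rough usage sketch (the parameter values here are hypothetical):
     * ```js
     * const insights = await GgufInsights.from(ggufFileInfo);
     * const {cpuRam, gpuVram} = insights.estimateContextResourceRequirements({
     *     contextSize: 4096,
     *     modelGpuLayers: 33,
     *     sequences: 1
     * });
     * ```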
*/
estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, swaFullCache = false }) {
if (sequences == null)
sequences = getDefaultContextSequences();
if (batchSize == null)
batchSize = getDefaultContextBatchSize({ contextSize, sequences });
const llmData = this._ggufFileInfo.architectureMetadata;
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
const slidingWindow = this.swaSize ?? 0;
const kvUnified = false;
const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize &&
(this.trainContextSize == null || slidingWindow < this.trainContextSize);
const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture);
const nonSwaPercent = swaPattern <= 1
? 1
: (1 / (swaPattern + (flashAttention ? -0.5 : -1)));
// source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp`
const kvCachePadding = 1;
const actualContextSize = kvUnified
? padSafeContextSize(sequences * contextSize, "up")
: sequences * padSafeContextSize(contextSize, "up");
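        // when SWA is used, only the non-SWA layers need KV cache cells for the full context,
        // while the SWA layers need roughly `sequences * slidingWindow + batchSize` cells,
        // so the estimated KV cache size below blends the two using `nonSwaPercent`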
const kvSize = usingSWA
? ((1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) +
nonSwaPercent * actualContextSize)
: actualContextSize;
const totalFileLayers = this._getTotalFileLayers();
const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers));
const finalCpuLayers = totalFileLayers - finalGpuLayers;
const usingGpu = finalGpuLayers !== 0;
const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0;
const embeddingSize = llmData.embedding_length ?? 0;
const floatBytes = 4; // sizeof(float)
const int32TBytes = 4; // sizeof(int32_t)
const estimateOutput = (nOutputs) => {
// source: `llama_context::output_reserve` in `llama-context.cpp`
const nOutputsMax = Math.max(batchSize, nOutputs);
const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5;
const hasLogits = isT5 || !isEmbeddingContext;
const hasEmbd = isT5 || isEmbeddingContext;
const logitsSize = hasLogits
? (vocabularySize * nOutputsMax)
: 0;
const embdSize = hasEmbd
? (embeddingSize * nOutputsMax)
: 0;
const outputBufferSize = (logitsSize + embdSize) * floatBytes;
const outputIdsArr = int32TBytes * batchSize;
return outputBufferSize + outputIdsArr;
};
const estimateGraphOverheadMemory = () => {
const s1MB = Math.pow(1024, 2);
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
const expertCount = llmData?.expert_count ?? 0;
const headCount = llmData?.attention?.head_count ?? 0;
const embeddingLength = llmData?.embedding_length ?? 0;
let defaultCalculationAdjustment = 0;
if (batchSize == null)
return 0;
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) {
if (expertCount > 0) {
const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
}
return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) {
if (modelGpuLayers === this.totalLayers) {
defaultCalculationAdjustment -= (s1MB * 340) * (this.trainContextSize == null
? 1
: kvSize / this.trainContextSize);
}
else {
defaultCalculationAdjustment -= (s1MB * 250) + ((s1MB * 50) * (this.trainContextSize == null
? 1
: kvSize / this.trainContextSize));
}
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma) {
// only works properly when all layers are on the GPU, which is why it's commented out:
// return int32TBytes * batchSize * ((llmData.embedding_length ?? 0));
if (modelGpuLayers === this.totalLayers) {
defaultCalculationAdjustment += (s1MB * 40) - ((s1MB * 270) * (this.trainContextSize == null
? 1
: kvSize / this.trainContextSize));
}
else {
defaultCalculationAdjustment += -(s1MB * 550) + ((s1MB * 150) * (this.trainContextSize == null
? 1
: Math.max(0, (1 - (kvSize / this.trainContextSize)))));
}
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) {
const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB);
// if (modelGpuLayers === this.totalLayers) {
// defaultCalculationAdjustment += -(s1MB * 20) + (
// (s1MB * 250) * (
// this.trainContextSize == null
// ? 1
// : kvSize / this.trainContextSize
// )
// );
// } else {
// defaultCalculationAdjustment += -(s1MB * 40) + (
// (s1MB * 300) * (
// this.trainContextSize == null
// ? 1
// : kvSize / this.trainContextSize
// )
// );
// }
}
else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen3) {
return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount));
}
else if (expertCount > 0) {
const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2;
return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount));
}
const totalElements = tensorInfo.length === 0
? this.totalLayers * (((llmData.embedding_length ?? 0) +
(llmData.feed_forward_length ?? 0)) / 2)
: tensorInfo.reduce((res, tensor) => {
return res + tensor.dimensions.reduce((res, dim) => res + Number(dim), 0);
}, 0);
if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) {
// magic numbers for estimation. will be improved in the future
return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment;
}
// magic numbers for estimation. will be improved in the future
return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment;
};
const gpuKVCacheSize = usingGpu
? this._estimateKvMemorySizeInBytes(kvSize, finalGpuLayers < totalFileLayers
? (finalGpuLayers + 1)
: finalGpuLayers)
: 0;
const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers);
// source: `llama_context::graph_max_nodes` in `llama-context.cpp`
const getMaxNodesMultiplier = (arch, nTokens) => {
if (arch === GgufArchitectureType.qwen3next)
return {
min: nTokens * 40,
multiplier: 32
};
return {
min: 1024,
multiplier: 8
};
};
const maxNodesMultiplier = getMaxNodesMultiplier(this._ggufFileInfo.metadata?.general?.architecture, Math.min(actualContextSize, batchSize));
const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length);
const cpuNodes = maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers));
const gpuNodes = maxNodes - cpuNodes;
const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) +
this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false);
const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) +
this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false);
const graphOverheadMemory = (flashAttention || !includeGraphOverhead)
? 0
: estimateGraphOverheadMemory();
const graphOverheadGpuSize = usingGpu
? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers))
: 0;
const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize;
const outputBufferSize = estimateOutput(sequences);
const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize;
const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize;
return {
cpuRam,
gpuVram: usingGpu
? gpuVram
: 0
};
}
/**
* Get the split tensor resources for CPU and GPU based on the number of GPU layers
* @internal
*/
_getTensorResourceSplit(gpuLayers) {
const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
const architecture = this._ggufFileInfo.metadata?.general?.architecture;
if (gpuLayers === 0) {
return {
cpu: tensorInfo,
gpu: []
};
}
const fileLayers = this._getFileLayers();
const startGpuLayer = Math.max(0, fileLayers - gpuLayers);
const gpuTensors = [];
const cpuTensors = [];
let tokenEmbedLayer;
let mainOutputLayer;
for (const singleTensorInfo of tensorInfo) {
if (isMainOutputLayer(singleTensorInfo.name))
mainOutputLayer = singleTensorInfo;
else if (isTokenEmbedLayer(singleTensorInfo.name))
tokenEmbedLayer = singleTensorInfo;
// in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_INPUT` are always
// loaded with `model.dev_input`, which is always set to the CPU
            if (isInputLayer(singleTensorInfo.name)) {
                cpuTensors.push(singleTensorInfo);
                continue;
            }
            else if (isOutputLayer(singleTensorInfo.name)) {
                // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always
                // loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU
if (gpuLayers === this.totalLayers) {
gpuTensors.push(singleTensorInfo);
continue;
}
else {
cpuTensors.push(singleTensorInfo);
continue;
}
}
const { layerNumber } = parseTensorName(singleTensorInfo.name);
if (gpuLayers !== this.totalLayers) {
if (architecture === GgufArchitectureType.qwen2 || architecture === GgufArchitectureType.gemma) {
if (layerNumber != null && layerNumber >= startGpuLayer)
gpuTensors.push(singleTensorInfo);
else
cpuTensors.push(singleTensorInfo);
continue;
}
}
if (layerNumber == null || layerNumber >= startGpuLayer)
gpuTensors.push(singleTensorInfo);
else
cpuTensors.push(singleTensorInfo);
}
if (mainOutputLayer == null && tokenEmbedLayer != null && gpuLayers === this.totalLayers && !gpuTensors.includes(tokenEmbedLayer))
gpuTensors.push(tokenEmbedLayer);
return {
cpu: cpuTensors,
gpu: gpuTensors
};
}
/** @internal */
_determineNumberOfLayersFromTensorInfo() {
const layerNumbers = new Set();
for (const singleTensorInfo of (this._ggufFileInfo.fullTensorInfo ?? [])) {
const { layerNumber } = parseTensorName(singleTensorInfo.name);
if (layerNumber != null)
layerNumbers.add(layerNumber);
}
return layerNumbers.size;
}
/** @internal */
_getFileLayers() {
return this._ggufFileInfo.architectureMetadata.block_count ?? this._determineNumberOfLayersFromTensorInfo();
}
/** @internal */
_estimateKvMemorySizeInBytes(kvSize, layers) {
// source: `llama_kv_cache_init` in `llama.cpp`
const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0;
const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0;
const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
const nHeadKv = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
const ssmDConv = this._ggufFileInfo.architectureMetadata.ssm?.conv_kernel ?? 0;
const ssmDInner = this._ggufFileInfo.architectureMetadata.ssm?.inner_size ?? 0;
const modelNEmbdKS = (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0) !== 0
? (this._ggufFileInfo.architectureMetadata.token_shift_count ?? 0) * nEmbd
: (ssmDConv > 0 ? (ssmDConv - 1) : 0) * ssmDInner;
const ssmDState = this._ggufFileInfo.architectureMetadata.ssm?.state_size ?? 0;
const modelNEmbdVS = (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0) !== 0
? nEmbd * (this._ggufFileInfo.architectureMetadata.wkv?.head_size ?? 0)
: ssmDState * ssmDInner;
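        // for regular transformer models (no SSM or WKV metadata), `modelNEmbdKS` and `modelNEmbdVS` are 0,
        // so each layer contributes `nEmbdHeadK * nHeadKv` K elements and `nEmbdHeadV * nHeadKv` V elements per KV cache cell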
let totalElementsK = 0;
let totalElementsV = 0;
for (let i = 0; i < layers; i++) {
const nHeadKvArrayItem = (typeof nHeadKv === "number")
? nHeadKv
: nHeadKv[i] !== 0
? nHeadKv[i]
: nHead;
const nEmbdKGqa = nEmbdHeadK * nHeadKvArrayItem;
const nEmbdVGqa = nEmbdHeadV * nHeadKvArrayItem;
const totalNEmbdKGqa = nEmbdKGqa + modelNEmbdKS;
const totalNEmbdVGqa = nEmbdVGqa + modelNEmbdVS;
totalElementsK += totalNEmbdKGqa * kvSize;
totalElementsV += totalNEmbdVGqa * kvSize;
}
const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
// if `type_k` of `llama_context_params` changes to be configurable in `LlamaContext`,
// this would have to depend on that value
? this._llama._consts.ggmlTypeF32Size
: this._llama._consts.ggmlTypeF16Size;
const valueTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba
// if `type_v` of `llama_context_params` changes to be configurable in `LlamaContext`,
// this would have to depend on that value
? this._llama._consts.ggmlTypeF32Size
: this._llama._consts.ggmlTypeF16Size;
return ((totalElementsK * keyTypeSize) +
(totalElementsV * valueTypeSize));
}
/** @internal */
_getTotalFileLayers() {
if (this._totalFileLayers != null)
return this._totalFileLayers;
this._totalFileLayers = this._getFileLayers();
return this._totalFileLayers;
}
/**
* @param ggufFileInfo
* @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance.
* If you don't pass a `Llama` instance, a basic `Llama` instance is created as a fallback - it's a slim instance that
     * doesn't instantiate a `llama.cpp` backend, so it won't utilize the GPU at all, and will be shared with other `GgufInsights` instances
* that need a fallback `Llama` instance.
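     *
     * A rough usage sketch (assuming `ggufFileInfo` was obtained with `readGgufFileInfo`):
     * ```js
     * const ggufFileInfo = await readGgufFileInfo("path/to/model.gguf");
     * const insights = await GgufInsights.from(ggufFileInfo);
     * console.log(insights.totalLayers, insights.trainContextSize);
     * ```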
*/
static async from(ggufFileInfo, llama) {
let resolvedLlama = llama;
if (resolvedLlama == null)
resolvedLlama = await getLlamaWithoutBackend();
return new GgufInsights(ggufFileInfo, resolvedLlama);
}
}
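// extracts the layer number from a tensor name.
// for example, `parseTensorName("blk.12.attn_q.weight")` returns `{layerNumber: 12}`,
// while a non-layer tensor name like `"output_norm.weight"` returns `{layerNumber: undefined}`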
function parseTensorName(tensorName) {
if (tensorName == null)
return { layerNumber: undefined };
const layerTensorPrefix = "blk.";
if (!tensorName.startsWith(layerTensorPrefix))
return { layerNumber: undefined };
const dotIndex = tensorName.indexOf(".", layerTensorPrefix.length);
const layerNumberString = tensorName.slice(layerTensorPrefix.length, dotIndex < 0
? tensorName.length
: dotIndex);
const layerNumber = parseInt(layerNumberString);
if (Number.isFinite(layerNumber))
return { layerNumber };
return { layerNumber: undefined };
}
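// sums the sizes of the given tensors.
// when mmap is used, the size of each file part is measured as the byte range the tensors cover in that file
// (which can be larger than the sum of the individual tensor sizes due to gaps between tensors)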
function calculateTensorsSize(tensorsInfo, llama, useMmap, startFromTensorDataOffset = false) {
if (!useMmap) {
let size = 0;
for (const tensorInfo of tensorsInfo)
size += calculateTensorSize(tensorInfo, llama);
return size;
}
const fileStats = new Map();
for (const tensorInfo of tensorsInfo) {
let stats = fileStats.get(tensorInfo.filePart);
if (stats == null) {
stats = {
tensorsSize: 0
};
fileStats.set(tensorInfo.filePart, stats);
}
const tensorSize = calculateTensorSize(tensorInfo, llama);
stats.tensorsSize += tensorSize;
const startOffset = tensorInfo.offset;
const endOffset = typeof startOffset === "number"
? startOffset + tensorSize
: startOffset + BigInt(tensorSize);
if (startFromTensorDataOffset)
stats.startOffset = Number(BigInt(tensorInfo.fileOffset) - BigInt(tensorInfo.offset));
else if (stats.startOffset == null || startOffset < stats.startOffset)
stats.startOffset = startOffset;
if (stats.endOffset == null || endOffset > stats.endOffset)
stats.endOffset = endOffset;
}
let size = 0;
for (const [, stats] of fileStats) {
const offsetSize = (stats.endOffset == null || stats.startOffset == null)
? 0
: Number(BigInt(stats.endOffset) - BigInt(stats.startOffset));
const tensorsSize = stats.tensorsSize;
size += Math.max(offsetSize, tensorsSize);
}
return size;
}
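// calculates the size of a single tensor in bytes from its ggml type (type size and block size) and its dimensions,
// similarly to `ggml_nbytes` in `ggml.c`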
function calculateTensorSize(tensor, llama) {
const typeSize = llama._bindings.getTypeSizeForGgmlType(tensor.ggmlType);
const blockSize = llama._bindings.getBlockSizeForGgmlType(tensor.ggmlType);
const ggmlMaxDims = llama._consts.ggmlMaxDims;
if (typeSize == null || blockSize == null)
throw new Error("Invalid type or block size");
const { ne, nb } = getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims });
if (blockSize === 1) {
let totalBytes = typeSize;
for (let i = 0; i < ggmlMaxDims; i++) {
totalBytes += (ne[i] - 1) * nb[i];
}
return totalBytes;
}
else {
let totalBytes = Math.floor((ne[0] * nb[0]) / blockSize);
for (let i = 1; i < ggmlMaxDims; i++) {
totalBytes += (ne[i] - 1) * nb[i];
}
return totalBytes;
}
}
function getTensorNeAndNb(tensor, { typeSize, blockSize, ggmlMaxDims }) {
// number of elements
// source: `ggml_new_tensor_impl` in `ggml.c`
const ne = [
...tensor.dimensions,
...(Array(Math.max(0, ggmlMaxDims - tensor.dimensions.length)).fill(1))
].slice(0, ggmlMaxDims);
// number of bytes
// source: `ggml_new_tensor_impl` in `ggml.c`
const nb = [
typeSize,
Math.floor(typeSize * (ne[0] / blockSize)),
...Array(ggmlMaxDims - 2).fill(0)
];
for (let i = 2; i < ggmlMaxDims; i++) {
nb[i] = nb[i - 1] * ne[i - 1];
}
return {
ne,
nb
};
}
function isInputLayer(layerName) {
const [firstPart] = layerName.split(".");
if (firstPart == null)
return false;
    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that
    // are mapped to `LLM_TENSOR_LAYER_INPUT` in `llm_tensor_info_mapping`
switch (firstPart) {
case "token_embd":
case "token_embd_norm":
case "token_types":
case "position_embd":
return true;
}
return false;
}
function isOutputLayer(layerName) {
const [firstPart, secondPart] = layerName.split(".");
if (firstPart == null)
return false;
    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that
    // are mapped to `LLM_TENSOR_LAYER_OUTPUT` in `llm_tensor_info_mapping`
switch (firstPart) {
case "output":
case "output_norm":
case "cls":
return true;
}
if (secondPart == null)
return false;
    // source: in `llama.cpp`, all tensor names from `LLM_TENSOR_NAMES` that
    // are mapped to `LLM_TENSOR_LAYER_OUTPUT` in `llm_tensor_info_mapping`
switch (firstPart + "." + secondPart) {
case "cls.output":
case "dec.output_norm":
case "enc.output_norm":
return true;
}
return false;
}
function isMainOutputLayer(layerName) {
const [firstPart] = layerName.split(".");
return firstPart === "output";
}
function isTokenEmbedLayer(layerName) {
const [firstPart] = layerName.split(".");
return firstPart === "token_embd";
}
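// rounds `value` up to the nearest multiple of `padding` (assuming `padding` is a power of 2).
// for example, `ggmlPad(5, 32) === 32` and `ggmlPad(64, 32) === 64`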
function ggmlPad(value, padding) {
return ((value + padding - 1) & ~(padding - 1));
}
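// the returned pattern is used above to estimate the fraction of layers that use full attention
// rather than SWA (roughly 1 out of every `pattern` layers)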
function getSwaPatternForArchitecture(architecture) {
// source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern`
switch (architecture) {
case GgufArchitectureType.llama4:
return 4;
case GgufArchitectureType.phi3:
return 1;
case GgufArchitectureType.gemma2:
return 2;
case GgufArchitectureType.gemma3:
return 6;
case GgufArchitectureType.gemma3n:
return 5;
case GgufArchitectureType.cohere2:
return 4;
case GgufArchitectureType.exaone4:
return 4;
case GgufArchitectureType.gptOss:
return 2;
case GgufArchitectureType.smallthinker:
return 4;
}
return 1;
}
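// converts the single-brace placeholders of a GGUF reranking template into the double-brace form
// that `isRankingTemplateValid` checks for.
// for example, `parseRankingTemplate("Query: {query} Document: {document}")`
// returns `"Query: {{query}} Document: {{document}}"`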
export function parseRankingTemplate(template) {
if (template == null)
return undefined;
return template
.replaceAll("{query}", "{{query}}")
.replaceAll("{document}", "{{document}}");
}
export function isRankingTemplateValid(template) {
return template != null && template.includes("{{query}}") && template.includes("{{document}}");
}
//# sourceMappingURL=GgufInsights.js.map