@huggingface/gguf
a GGUF parser that works on remotely hosted files
// src/transformer-llm.ts
var LLM_ARCHITECTURES = [
"llama",
"deci",
"falcon",
"grok",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"starcoder",
"refact",
"bert",
"nomic-bert",
"jina-bert-v2",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"qwen2vl",
"phi2",
"phi3",
"phimoe",
"plamo",
"codeshell",
"orion",
"internlm2",
"minicpm",
"minicpm3",
"gemma",
"gemma2",
"starcoder2",
"mamba",
"xverse",
"command-r",
"cohere2",
"dbrx",
"olmo",
"olmo2",
"olmoe",
"openelm",
"arctic",
"deepseek",
"deepseek2",
"chatglm",
"bitnet",
"t5",
"t5encoder",
"jais",
"nemotron",
"exaone",
"rwkv6",
"rwkv6qwen2",
"granite",
"granitemoe",
"chameleon",
"wavtokenizer-dec"
];
// src/types.ts
import { GGMLQuantizationType } from "@huggingface/tasks";
var GGUFValueType = /* @__PURE__ */ ((GGUFValueType2) => {
GGUFValueType2[GGUFValueType2["UINT8"] = 0] = "UINT8";
GGUFValueType2[GGUFValueType2["INT8"] = 1] = "INT8";
GGUFValueType2[GGUFValueType2["UINT16"] = 2] = "UINT16";
GGUFValueType2[GGUFValueType2["INT16"] = 3] = "INT16";
GGUFValueType2[GGUFValueType2["UINT32"] = 4] = "UINT32";
GGUFValueType2[GGUFValueType2["INT32"] = 5] = "INT32";
GGUFValueType2[GGUFValueType2["FLOAT32"] = 6] = "FLOAT32";
GGUFValueType2[GGUFValueType2["BOOL"] = 7] = "BOOL";
GGUFValueType2[GGUFValueType2["STRING"] = 8] = "STRING";
GGUFValueType2[GGUFValueType2["ARRAY"] = 9] = "ARRAY";
GGUFValueType2[GGUFValueType2["UINT64"] = 10] = "UINT64";
GGUFValueType2[GGUFValueType2["INT64"] = 11] = "INT64";
GGUFValueType2[GGUFValueType2["FLOAT64"] = 12] = "FLOAT64";
return GGUFValueType2;
})(GGUFValueType || {});
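// The pattern above is a compiled TypeScript numeric enum: it maps both name -> number and
// number -> name, and isGGUFValueType() below relies on that reverse mapping.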
var ARCHITECTURES = [...LLM_ARCHITECTURES, "rwkv", "whisper"];
// src/utils/isBackend.ts
var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
var isBackend = !isBrowser && !isWebWorker;
// src/utils/promisesQueue.ts
async function promisesQueue(factories, concurrency) {
const results = [];
const executing = /* @__PURE__ */ new Set();
let index = 0;
for (const factory of factories) {
const closureIndex = index++;
const e = factory().then((r) => {
results[closureIndex] = r;
executing.delete(e);
});
executing.add(e);
if (executing.size >= concurrency) {
await Promise.race(executing);
}
}
await Promise.all(executing);
return results;
}
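// Illustrative sketch (not part of the published bundle): promisesQueue runs at most
// `concurrency` factories at once and stores results in input order. The helper name and
// the URLs passed in are placeholders supplied by the caller.
async function exampleFetchWithQueue(urls) {
  // Each factory defers its request until the queue starts it.
  const factories = urls.map((url) => () => fetch(url).then((res) => res.text()));
  // At most 3 requests are in flight at any time.
  return promisesQueue(factories, 3);
}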
// src/quant-descriptions.ts
var GGUF_QUANT_DESCRIPTIONS = {
[GGMLQuantizationType.F32]: {
txt: "32-bit standard IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format"
},
[GGMLQuantizationType.F16]: {
txt: "16-bit standard IEEE 754 half-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format"
},
[GGMLQuantizationType.Q8_0]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[GGMLQuantizationType.Q8_1]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[GGMLQuantizationType.Q8_K]: {
txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q6_K]: {
txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q5_0]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[GGMLQuantizationType.Q5_1]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[GGMLQuantizationType.Q5_K]: {
txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q4_0]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[GGMLQuantizationType.Q4_1]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[GGMLQuantizationType.Q4_K]: {
txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q3_K]: {
txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q2_K]: {
txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.IQ4_XS]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ3_S]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ3_XXS]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ2_S]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ2_XS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ2_XXS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ1_S]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ4_NL]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/5590"
},
[GGMLQuantizationType.I8]: {
txt: "8-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[GGMLQuantizationType.I16]: {
txt: "16-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[GGMLQuantizationType.I32]: {
txt: "32-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[GGMLQuantizationType.I64]: {
txt: "64-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6062"
},
[GGMLQuantizationType.F64]: {
txt: "64-bit standard IEEE 754 double-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format"
},
[GGMLQuantizationType.IQ1_M]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6302"
},
[GGMLQuantizationType.BF16]: {
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format"
},
[GGMLQuantizationType.TQ1_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
},
[GGMLQuantizationType.TQ2_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
}
};
var QK_K = 256;
var calcBPW = (blockSize, typeSize) => {
return typeSize * 8 / blockSize;
};
var GGML_QUANT_SIZES = {
[GGMLQuantizationType.F32]: calcBPW(1, 4),
[GGMLQuantizationType.F16]: calcBPW(1, 2),
[GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
[GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
[GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
[GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
[GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
[GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
[GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
[GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
[GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
[GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
[GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
[GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
[GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
[GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
[GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
[GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
[GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
[GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
[GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
[GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
[GGMLQuantizationType.I8]: calcBPW(1, 1),
[GGMLQuantizationType.I16]: calcBPW(1, 2),
[GGMLQuantizationType.I32]: calcBPW(1, 4),
[GGMLQuantizationType.I64]: calcBPW(1, 8),
[GGMLQuantizationType.F64]: calcBPW(1, 8),
[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
[GGMLQuantizationType.BF16]: calcBPW(1, 2),
[GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
[GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64)
};
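// Illustrative sketch (not part of the published bundle): GGML_QUANT_SIZES maps each
// quantization type to bits-per-weight, so a tensor's approximate byte size is
// elementCount * bpw / 8. For example Q4_K packs 256 weights into 144 bytes (4.5 bpw).
// The helper name is hypothetical.
function estimateTensorBytes(elementCount, dtype) {
  const bitsPerWeight = GGML_QUANT_SIZES[dtype];
  return Math.ceil((elementCount * bitsPerWeight) / 8);
}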
// src/gguf.ts
import {
parseGGUFQuantLabel,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
GGUF_QUANT_ORDER,
findNearestQuantType,
GGMLFileQuantizationType
} from "@huggingface/tasks";
var RE_GGUF_FILE = /\.gguf$/;
var RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
var GGUF_DEFAULT_ALIGNMENT = 32;
var GGML_PAD = (x, n) => x + n - 1 & ~(n - 1);
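// e.g. GGML_PAD(1234, 32) === 1248: rounds an offset up to the next multiple of the alignment.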
var PARALLEL_DOWNLOADS = 20;
function parseGgufShardFilename(filename) {
const match = RE_GGUF_SHARD_FILE.exec(filename);
if (match && match.groups) {
return {
prefix: match.groups["prefix"],
shard: match.groups["shard"],
total: match.groups["total"]
};
}
return null;
}
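// Illustrative example (not part of the published bundle); the filenames are hypothetical:
//   parseGgufShardFilename("grok-1-00003-of-00009.gguf")
//   // => { prefix: "grok-1", shard: "00003", total: "00009" }
//   parseGgufShardFilename("model.gguf")
//   // => null (single-file model, not a shard)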
var isVersion = (version) => version === 1 || version === 2 || version === 3;
var ggufMagicNumber = new Uint8Array([71, 71, 85, 70]);
function isGGUFValueType(n) {
return typeof GGUFValueType[n] === "string";
}
var HTTP_CHUNK_SIZE = 2 * 10 ** 6;
var HTTP_DATA_LEEWAY = 5 * 10 ** 5;
var HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6;
var RangeView = class {
constructor(uri, params) {
this.uri = uri;
this.params = params;
this.chunk = 0;
this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
this.dataView = new DataView(this.buffer);
}
chunk;
buffer;
dataView;
get view() {
return this.dataView;
}
/**
* Fetch a new chunk from the server
*/
async fetchChunk() {
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
const buf = new Uint8Array(
await (await (this.params?.fetch ?? fetch)(this.uri, {
headers: {
...this.params?.additionalFetchHeaders ?? {},
Range: `bytes=${range[0]}-${range[1]}`
}
})).arrayBuffer()
);
this.appendBuffer(buf);
this.chunk += 1;
}
/**
* Append new data into the buffer
*/
appendBuffer(buf) {
if (ArrayBuffer.prototype.resize) {
this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
} else {
const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
const arrView = new Uint8Array(newBuffer);
arrView.set(new Uint8Array(this.buffer));
arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
this.buffer = newBuffer;
this.dataView = new DataView(this.buffer);
}
}
/**
* Check whether we need to fetch a new chunk
*/
async fetchChunkIfNeeded(offset) {
if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
await this.fetchChunk();
}
}
};
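// Illustrative sketch (not part of the published bundle): RangeView grows its backing
// buffer HTTP_CHUNK_SIZE bytes at a time via HTTP Range requests, so callers can read the
// GGUF header lazily without downloading the whole file. The helper name and URL are placeholders.
async function examplePeekVersion(url) {
  const view = new RangeView(url);
  await view.fetchChunk(); // fetches bytes 0 .. HTTP_CHUNK_SIZE - 1
  return view.view.getUint32(4, true); // GGUF version follows the 4-byte magic (little-endian assumed)
}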
var RangeViewLocalFile = class extends RangeView {
/**
* Read a new chunk from local file system.
*/
async fetchChunk() {
const { FileBlob } = await import("./FileBlob-GRW5ZULE.mjs");
const blob = await FileBlob.create(this.uri);
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
const buffer = await blob.slice(range[0], range[1]).arrayBuffer();
this.appendBuffer(new Uint8Array(buffer));
this.chunk += 1;
}
};
function readVersionedSize(view, byteOffset, version, littleEndian) {
switch (version) {
case 1: {
const n = view.getUint32(byteOffset, littleEndian);
return { value: BigInt(n), length: 4 };
}
case 2:
case 3: {
return { value: view.getBigUint64(byteOffset, littleEndian), length: 8 };
}
}
}
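// GGUF v1 stored sizes and counts as uint32; v2 and v3 widened them to uint64, hence the
// BigInt value and the variable byte length returned here.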
function readString(view, offset, version, littleEndian) {
const length = readVersionedSize(view, offset, version, littleEndian);
const off = length.length;
const value = new TextDecoder().decode(view.buffer.slice(offset + off, offset + off + Number(length.value)));
return { value, length: off + Number(length.value) };
}
function readMetadataValue(view, type, offset, version, littleEndian) {
switch (type) {
case 0 /* UINT8 */:
return { value: view.getUint8(offset), length: 1 };
case 1 /* INT8 */:
return { value: view.getInt8(offset), length: 1 };
case 2 /* UINT16 */:
return { value: view.getUint16(offset, littleEndian), length: 2 };
case 3 /* INT16 */:
return { value: view.getInt16(offset, littleEndian), length: 2 };
case 4 /* UINT32 */:
return { value: view.getUint32(offset, littleEndian), length: 4 };
case 5 /* INT32 */:
return { value: view.getInt32(offset, littleEndian), length: 4 };
case 6 /* FLOAT32 */:
return { value: view.getFloat32(offset, littleEndian), length: 4 };
case 7 /* BOOL */:
return { value: view.getUint8(offset) !== 0, length: 1 };
case 8 /* STRING */:
return readString(view, offset, version, littleEndian);
case 9 /* ARRAY */: {
const arrayType = view.getUint32(offset, littleEndian);
const arrayLength = readVersionedSize(view, offset + 4, version, littleEndian);
let length = 4 + arrayLength.length;
const arrayValues = [];
for (let i = 0; i < arrayLength.value; i++) {
const metadataValue = readMetadataValue(view, arrayType, offset + length, version, littleEndian);
arrayValues.push(metadataValue.value);
length += metadataValue.length;
}
return { value: arrayValues, length };
}
case 10 /* UINT64 */:
return { value: view.getBigUint64(offset, littleEndian), length: 8 };
case 11 /* INT64 */:
return { value: view.getBigInt64(offset, littleEndian), length: 8 };
case 12 /* FLOAT64 */:
return { value: view.getFloat64(offset, littleEndian), length: 8 };
}
}
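// Each metadata entry on disk is laid out as [string key][uint32 value type][value];
// ARRAY values (type 9) nest as [uint32 element type][versioned length][elements...],
// which is why readMetadataValue recurses above.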
async function gguf(uri, params) {
let r;
if (isBackend) {
if (uri.match(/^https?:\/\//)) {
r = new RangeView(uri, params);
} else if (params?.allowLocalFile) {
r = new RangeViewLocalFile(uri, params);
} else {
throw new Error("Access to local file is not enabled, please set allowLocalFile to true");
}
} else {
if (params?.allowLocalFile) {
throw new Error("allowLocalFile cannot be used on browser");
}
r = new RangeView(uri, params);
}
await r.fetchChunk();
const checkBuffer = (buffer, header) => {
for (let i = 0; i < header.length; i++) {
if (header[i] !== buffer[i]) {
return false;
}
}
return true;
};
if (!checkBuffer(new Uint8Array(r.view.buffer.slice(0, 4)), ggufMagicNumber)) {
throw new Error("not a valid gguf file: not starting with GGUF magic number");
}
const [littleEndian, version] = (() => {
const version2 = r.view.getUint32(4, true);
if (version2 & 65535) {
return [true, version2];
} else {
return [false, r.view.getUint32(4, false)];
}
})();
if (!isVersion(version)) {
throw new Error(`not a valid gguf file: unsupported version "${version}"`);
}
let offset = 8;
const tensorCount = readVersionedSize(r.view, offset, version, littleEndian);
offset += tensorCount.length;
const numKv = readVersionedSize(r.view, offset, version, littleEndian);
offset += numKv.length;
const metadata = {
version,
tensor_count: tensorCount.value,
kv_count: numKv.value
};
for (let i = 0; i < numKv.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const valueType = r.view.getUint32(offset, littleEndian);
offset += 4;
if (!isGGUFValueType(valueType)) {
throw new Error("Unsupported metadata type: " + valueType);
}
let valueResult;
while (!valueResult) {
try {
valueResult = readMetadataValue(r.view, valueType, offset, version, littleEndian);
} catch (err) {
if (err instanceof RangeError) {
await r.fetchChunk();
} else {
throw err;
}
}
}
offset += valueResult.length;
metadata[keyResult.value] = valueResult.value;
}
const tensorInfos = [];
for (let i = 0; i < tensorCount.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const nDims = r.view.getUint32(offset, littleEndian);
offset += 4;
const shape = [];
for (let dim = 0; dim < nDims; dim++) {
const shapeDim = readVersionedSize(r.view, offset, version, littleEndian);
shape.push(shapeDim.value);
offset += shapeDim.length;
}
const type = r.view.getUint32(offset, littleEndian);
offset += 4;
const tensorOffset = r.view.getBigUint64(offset, littleEndian);
offset += 8;
tensorInfos.push({
name: keyResult.value,
n_dims: nDims,
shape,
dtype: type,
offset: tensorOffset
});
}
const alignment = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));
if (params?.computeParametersCount) {
const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
return { metadata, tensorInfos, tensorDataOffset, parameterCount };
} else {
return { metadata, tensorInfos, tensorDataOffset };
}
}
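// Illustrative sketch (not part of the published bundle): typical usage of gguf() against a
// remotely hosted file. The helper name and URL are placeholders.
async function exampleReadHeader() {
  const url = "https://example.com/model.gguf"; // placeholder model URL
  const { metadata, tensorInfos, tensorDataOffset } = await gguf(url);
  console.log(metadata["general.architecture"]); // e.g. "llama"
  console.log(tensorInfos.length, "tensors; data section starts at byte", tensorDataOffset);
}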
async function ggufAllShards(url, params) {
const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
if (parallelDownloads < 1) {
throw new TypeError("parallelDownloads must be greater than 0");
}
const ggufShardFileInfo = parseGgufShardFilename(url);
if (ggufShardFileInfo) {
const total = parseInt(ggufShardFileInfo.total);
const prefix = ggufShardFileInfo.prefix;
const urls = [];
for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
}
const shards = await promisesQueue(
urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
parallelDownloads
);
return {
shards,
parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0)
};
} else {
const { metadata, tensorInfos, tensorDataOffset, parameterCount } = await gguf(url, {
...params,
computeParametersCount: true
});
return { shards: [{ metadata, tensorInfos, tensorDataOffset }], parameterCount };
}
}
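// Illustrative sketch (not part of the published bundle): ggufAllShards recognizes the
// "<prefix>-00001-of-000NN.gguf" naming scheme, fetches every shard header (up to
// PARALLEL_DOWNLOADS at a time) and sums the parameter counts. The helper name and URL are placeholders.
async function exampleCountParameters() {
  const { shards, parameterCount } = await ggufAllShards("https://example.com/model-00001-of-00004.gguf");
  console.log(`${shards.length} shards, ~${(parameterCount / 1e9).toFixed(1)}B parameters`);
}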
export {
GGUFValueType,
GGMLQuantizationType,
GGUF_QUANT_DESCRIPTIONS,
GGML_QUANT_SIZES,
RE_GGUF_FILE,
RE_GGUF_SHARD_FILE,
parseGgufShardFilename,
gguf,
ggufAllShards,
parseGGUFQuantLabel,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
GGUF_QUANT_ORDER,
findNearestQuantType,
GGMLFileQuantizationType
};