// @huggingface/gguf
// A GGUF parser that works on remotely hosted files.
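/*
 * Example usage (an illustrative sketch; the model URL below is hypothetical,
 * any remotely hosted .gguf file works):
 *
 *   const { gguf } = require("@huggingface/gguf");
 *
 *   async function main() {
 *     const { metadata, tensorInfos } = await gguf("https://example.com/model.gguf");
 *     console.log(metadata["general.architecture"], tensorInfos.length);
 *   }
 */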
"use strict";
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __esm = (fn, res) => function __init() {
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
};
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// src/utils/FileBlob.ts
var FileBlob_exports = {};
__export(FileBlob_exports, {
FileBlob: () => FileBlob
});
var import_node_fs, import_promises, import_node_stream, import_node_url, FileBlob;
var init_FileBlob = __esm({
"src/utils/FileBlob.ts"() {
"use strict";
import_node_fs = require("fs");
import_promises = require("fs/promises");
import_node_stream = require("stream");
import_node_url = require("url");
FileBlob = class extends Blob {
/**
* Creates a new FileBlob on the provided file.
*
* @param path Path to the file to be lazily read
*/
static async create(path) {
path = path instanceof URL ? (0, import_node_url.fileURLToPath)(path) : path;
const { size } = await (0, import_promises.stat)(path);
const fileBlob = new FileBlob(path, 0, size);
return fileBlob;
}
path;
start;
end;
constructor(path, start, end) {
super();
this.path = path;
this.start = start;
this.end = end;
}
/**
* Returns the size of the blob.
*/
get size() {
return this.end - this.start;
}
/**
* Returns a new instance of FileBlob that is a slice of the current one.
*
* The slice is inclusive of the start and exclusive of the end.
*
* The slice method does not support negative start/end.
*
* @param start beginning of the slice
* @param end end of the slice
*/
slice(start = 0, end = this.size) {
if (start < 0 || end < 0) {
throw new TypeError("Unsupported negative start/end on FileBlob.slice");
}
const slice = new FileBlob(this.path, this.start + start, Math.min(this.start + end, this.end));
return slice;
}
/**
* Reads the part of the file delimited by the FileBlob and returns it as an ArrayBuffer.
*/
async arrayBuffer() {
const slice = await this.execute((file) => file.read(Buffer.alloc(this.size), 0, this.size, this.start));
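// Note: file.read() resolves with the Buffer allocated above, so this returns a
// Node Buffer (a Uint8Array); text() below relies on its toString().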
return slice.buffer;
}
/**
* Reads the part of the file delimited by the FileBlob and returns it as a string.
*/
async text() {
const buffer = await this.arrayBuffer();
return buffer.toString("utf8");
}
/**
* Returns a stream around the part of the file delimited by the FileBlob.
*/
stream() {
return import_node_stream.Readable.toWeb((0, import_node_fs.createReadStream)(this.path, { start: this.start, end: this.end - 1 }));
}
/**
* We are opening and closing the file for each action to prevent file descriptor leaks.
*
* It is an intentional choice of developer experience over performance.
*/
async execute(action) {
const file = await (0, import_promises.open)(this.path, "r");
try {
return await action(file);
} finally {
await file.close();
}
}
};
}
});
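/*
 * FileBlob sketch (assumes a local "./model.gguf" exists):
 *
 *   const blob = await FileBlob.create("./model.gguf");
 *   const magic = await blob.slice(0, 4).arrayBuffer(); // reads only the first 4 bytes
 */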
// src/index.ts
var src_exports = {};
__export(src_exports, {
GGMLFileQuantizationType: () => import_tasks2.GGMLFileQuantizationType,
GGMLQuantizationType: () => import_tasks.GGMLQuantizationType,
GGUFValueType: () => GGUFValueType,
GGUF_QUANT_DESCRIPTIONS: () => GGUF_QUANT_DESCRIPTIONS,
GGUF_QUANT_ORDER: () => import_tasks2.GGUF_QUANT_ORDER,
GGUF_QUANT_RE: () => import_tasks2.GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL: () => import_tasks2.GGUF_QUANT_RE_GLOBAL,
RE_GGUF_FILE: () => RE_GGUF_FILE,
RE_GGUF_SHARD_FILE: () => RE_GGUF_SHARD_FILE,
findNearestQuantType: () => import_tasks2.findNearestQuantType,
gguf: () => gguf,
ggufAllShards: () => ggufAllShards,
parseGGUFQuantLabel: () => import_tasks2.parseGGUFQuantLabel,
parseGgufShardFilename: () => parseGgufShardFilename,
serializeGgufMetadata: () => serializeGgufMetadata
});
module.exports = __toCommonJS(src_exports);
// src/transformer-llm.ts
var LLM_ARCHITECTURES = [
"llama",
"deci",
"falcon",
"grok",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"starcoder",
"refact",
"bert",
"nomic-bert",
"jina-bert-v2",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"qwen2vl",
"phi2",
"phi3",
"phimoe",
"plamo",
"codeshell",
"orion",
"internlm2",
"minicpm",
"minicpm3",
"gemma",
"gemma2",
"starcoder2",
"mamba",
"xverse",
"command-r",
"cohere2",
"dbrx",
"olmo",
"olmo2",
"olmoe",
"openelm",
"arctic",
"deepseek",
"deepseek2",
"chatglm",
"bitnet",
"t5",
"t5encoder",
"jais",
"nemotron",
"exaone",
"rwkv6",
"rwkv6qwen2",
"granite",
"granitemoe",
"chameleon",
"wavtokenizer-dec"
];
// src/types.ts
var import_tasks = require("@huggingface/tasks");
var GGUFValueType = /* @__PURE__ */ ((GGUFValueType2) => {
GGUFValueType2[GGUFValueType2["UINT8"] = 0] = "UINT8";
GGUFValueType2[GGUFValueType2["INT8"] = 1] = "INT8";
GGUFValueType2[GGUFValueType2["UINT16"] = 2] = "UINT16";
GGUFValueType2[GGUFValueType2["INT16"] = 3] = "INT16";
GGUFValueType2[GGUFValueType2["UINT32"] = 4] = "UINT32";
GGUFValueType2[GGUFValueType2["INT32"] = 5] = "INT32";
GGUFValueType2[GGUFValueType2["FLOAT32"] = 6] = "FLOAT32";
GGUFValueType2[GGUFValueType2["BOOL"] = 7] = "BOOL";
GGUFValueType2[GGUFValueType2["STRING"] = 8] = "STRING";
GGUFValueType2[GGUFValueType2["ARRAY"] = 9] = "ARRAY";
GGUFValueType2[GGUFValueType2["UINT64"] = 10] = "UINT64";
GGUFValueType2[GGUFValueType2["INT64"] = 11] = "INT64";
GGUFValueType2[GGUFValueType2["FLOAT64"] = 12] = "FLOAT64";
return GGUFValueType2;
})(GGUFValueType || {});
var ARCHITECTURES = [...LLM_ARCHITECTURES, "rwkv", "whisper"];
// src/utils/isBackend.ts
var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
var isBackend = !isBrowser && !isWebWorker;
// src/utils/promisesQueue.ts
async function promisesQueue(factories, concurrency) {
const results = [];
const executing = /* @__PURE__ */ new Set();
let index = 0;
for (const factory of factories) {
const closureIndex = index++;
const e = factory().then((r) => {
results[closureIndex] = r;
executing.delete(e);
});
executing.add(e);
if (executing.size >= concurrency) {
await Promise.race(executing);
}
}
await Promise.all(executing);
return results;
}
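/*
 * promisesQueue sketch: run at most 3 of the given factories at once
 * (`urls` is a hypothetical array of strings); results keep factory order.
 *
 *   const results = await promisesQueue(urls.map((url) => () => fetch(url)), 3);
 */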
// src/quant-descriptions.ts
var GGUF_QUANT_DESCRIPTIONS = {
[import_tasks.GGMLQuantizationType.F32]: {
txt: "32-bit standard IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format"
},
[import_tasks.GGMLQuantizationType.F16]: {
txt: "16-bit standard IEEE 754 half-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format"
},
[import_tasks.GGMLQuantizationType.Q8_0]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[import_tasks.GGMLQuantizationType.Q8_1]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[import_tasks.GGMLQuantizationType.Q8_K]: {
txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q6_K]: {
txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q5_0]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[import_tasks.GGMLQuantizationType.Q5_1]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[import_tasks.GGMLQuantizationType.Q5_K]: {
txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q4_0]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[import_tasks.GGMLQuantizationType.Q4_1]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[import_tasks.GGMLQuantizationType.Q4_K]: {
txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q3_K]: {
txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q2_K]: {
txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.IQ4_XS]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ3_S]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ3_XXS]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ2_S]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ2_XS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ2_XXS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ1_S]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ4_NL]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/5590"
},
[import_tasks.GGMLQuantizationType.I8]: {
txt: "8-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[import_tasks.GGMLQuantizationType.I16]: {
txt: "16-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[import_tasks.GGMLQuantizationType.I32]: {
txt: "32-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[import_tasks.GGMLQuantizationType.I64]: {
txt: "64-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6062"
},
[import_tasks.GGMLQuantizationType.F64]: {
txt: "64-bit standard IEEE 754 double-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format"
},
[import_tasks.GGMLQuantizationType.IQ1_M]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6302"
},
[import_tasks.GGMLQuantizationType.BF16]: {
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format"
},
[import_tasks.GGMLQuantizationType.TQ1_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
},
[import_tasks.GGMLQuantizationType.TQ2_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
},
[import_tasks.GGMLQuantizationType.MXFP4]: {
txt: "4-bit Microscaling Block Floating Point.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/15091"
}
};
var QK_K = 256;
var calcBPW = (blockSize, typeSize) => {
return typeSize * 8 / blockSize;
};
var GGML_QUANT_SIZES = {
[import_tasks.GGMLQuantizationType.F32]: calcBPW(1, 4),
[import_tasks.GGMLQuantizationType.F16]: calcBPW(1, 2),
[import_tasks.GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
[import_tasks.GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
[import_tasks.GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
[import_tasks.GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
[import_tasks.GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
[import_tasks.GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
[import_tasks.GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
[import_tasks.GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
[import_tasks.GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
[import_tasks.GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
[import_tasks.GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
[import_tasks.GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
[import_tasks.GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
[import_tasks.GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
[import_tasks.GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
[import_tasks.GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
[import_tasks.GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
[import_tasks.GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
[import_tasks.GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
[import_tasks.GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
[import_tasks.GGMLQuantizationType.I8]: calcBPW(1, 1),
[import_tasks.GGMLQuantizationType.I16]: calcBPW(1, 2),
[import_tasks.GGMLQuantizationType.I32]: calcBPW(1, 4),
[import_tasks.GGMLQuantizationType.I64]: calcBPW(1, 8),
[import_tasks.GGMLQuantizationType.F64]: calcBPW(1, 8),
[import_tasks.GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
[import_tasks.GGMLQuantizationType.BF16]: calcBPW(1, 2),
[import_tasks.GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
[import_tasks.GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
[import_tasks.GGMLQuantizationType.MXFP4]: calcBPW(32, 1 + 16)
};
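// Worked example: Q4_K packs 256 weights into 2 + 2 + 256/2 + 12 = 144 bytes,
// so calcBPW(256, 144) = 144 * 8 / 256 = 4.5 bits per weight, matching the
// "4.5 bits-per-weight" figure in GGUF_QUANT_DESCRIPTIONS above.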
// src/gguf.ts
var import_tasks2 = require("@huggingface/tasks");
var RE_GGUF_FILE = /\.gguf$/;
var RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
var GGUF_DEFAULT_ALIGNMENT = 32;
var GGML_PAD = (x, n) => x + n - 1 & ~(n - 1);
var PARALLEL_DOWNLOADS = 20;
var GGUF_MAGIC_NUMBER = new Uint8Array([71, 71, 85, 70]);
function parseGgufShardFilename(filename) {
const match = RE_GGUF_SHARD_FILE.exec(filename);
if (match && match.groups) {
return {
prefix: match.groups["prefix"],
shard: match.groups["shard"],
total: match.groups["total"]
};
}
return null;
}
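/*
 * parseGgufShardFilename examples:
 *
 *   parseGgufShardFilename("grok-1-00003-of-00009.gguf");
 *   // => { prefix: "grok-1", shard: "00003", total: "00009" }
 *   parseGgufShardFilename("model.gguf"); // => null (not a shard)
 */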
var isVersion = (version) => version === 1 || version === 2 || version === 3;
function isGGUFValueType(n) {
return typeof GGUFValueType[n] === "string";
}
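// isGGUFValueType leans on the numeric enum's reverse mapping built above:
// GGUFValueType[8] === "STRING" (valid), while GGUFValueType[13] === undefined.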
var HTTP_CHUNK_SIZE = 2 * 10 ** 6;
var HTTP_DATA_LEEWAY = 5 * 10 ** 5;
var HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6;
var RangeView = class {
constructor(uri, params) {
this.uri = uri;
this.params = params;
this.chunk = 0;
this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
this.dataView = new DataView(this.buffer);
}
chunk;
buffer;
dataView;
get view() {
return this.dataView;
}
/**
* Fetches a new chunk from the server.
*/
async fetchChunk() {
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
const buf = new Uint8Array(
await (await (this.params?.fetch ?? fetch)(this.uri, {
headers: {
...this.params?.additionalFetchHeaders ?? {},
Range: `bytes=${range[0]}-${range[1]}`
}
})).arrayBuffer()
);
this.appendBuffer(buf);
this.chunk += 1;
}
/**
* Appends new data to the buffer.
*/
appendBuffer(buf) {
if (ArrayBuffer.prototype.resize) {
this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
} else {
const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
const arrView = new Uint8Array(newBuffer);
arrView.set(new Uint8Array(this.buffer));
arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
this.buffer = newBuffer;
this.dataView = new DataView(this.buffer);
}
}
/**
* Checks whether we need to fetch a new chunk.
*/
async fetchChunkIfNeeded(offset) {
if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
await this.fetchChunk();
}
}
};
var RangeViewLocalFile = class extends RangeView {
/**
* Reads a new chunk from the local file system.
*/
async fetchChunk() {
const { FileBlob: FileBlob2 } = await Promise.resolve().then(() => (init_FileBlob(), FileBlob_exports));
const blob = await FileBlob2.create(this.uri);
// Unlike an HTTP Range header (inclusive end), Blob.slice takes an exclusive end offset.
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE];
const buffer = await blob.slice(range[0], range[1]).arrayBuffer();
this.appendBuffer(new Uint8Array(buffer));
this.chunk += 1;
}
};
function readVersionedSize(view, byteOffset, version, littleEndian) {
switch (version) {
case 1: {
const n = view.getUint32(byteOffset, littleEndian);
return { value: BigInt(n), length: 4 };
}
case 2:
case 3: {
return { value: view.getBigUint64(byteOffset, littleEndian), length: 8 };
}
}
}
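// Version note: GGUF v1 stores sizes as 32-bit integers (length 4), while v2/v3
// use 64-bit integers (length 8); the value is normalized to a BigInt either way.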
function readString(view, offset, version, littleEndian) {
const length = readVersionedSize(view, offset, version, littleEndian);
const off = length.length;
const value = new TextDecoder().decode(view.buffer.slice(offset + off, offset + off + Number(length.value)));
return { value, length: off + Number(length.value) };
}
function readMetadataValue(view, type, offset, version, littleEndian) {
switch (type) {
case 0 /* UINT8 */:
return { value: view.getUint8(offset), length: 1 };
case 1 /* INT8 */:
return { value: view.getInt8(offset), length: 1 };
case 2 /* UINT16 */:
return { value: view.getUint16(offset, littleEndian), length: 2 };
case 3 /* INT16 */:
return { value: view.getInt16(offset, littleEndian), length: 2 };
case 4 /* UINT32 */:
return { value: view.getUint32(offset, littleEndian), length: 4 };
case 5 /* INT32 */:
return { value: view.getInt32(offset, littleEndian), length: 4 };
case 6 /* FLOAT32 */:
return { value: view.getFloat32(offset, littleEndian), length: 4 };
case 7 /* BOOL */:
return { value: view.getUint8(offset) !== 0, length: 1 };
case 8 /* STRING */:
return readString(view, offset, version, littleEndian);
case 9 /* ARRAY */: {
const arrayType = view.getUint32(offset, littleEndian);
const arrayLength = readVersionedSize(view, offset + 4, version, littleEndian);
let length = 4 + arrayLength.length;
const arrayValues = [];
for (let i = 0; i < arrayLength.value; i++) {
const metadataValue = readMetadataValue(view, arrayType, offset + length, version, littleEndian);
arrayValues.push(metadataValue.value);
length += metadataValue.length;
}
return { value: arrayValues, length };
}
case 10 /* UINT64 */:
return { value: view.getBigUint64(offset, littleEndian), length: 8 };
case 11 /* INT64 */:
return { value: view.getBigInt64(offset, littleEndian), length: 8 };
case 12 /* FLOAT64 */:
return { value: view.getFloat64(offset, littleEndian), length: 8 };
}
}
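/*
 * Layout example: a v3 little-endian STRING value is [uint64 length][utf-8 bytes],
 * so the 13 bytes 05 00 00 00 00 00 00 00 'h' 'e' 'l' 'l' 'o' decode as
 * { value: "hello", length: 13 }.
 */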
async function gguf(uri, params) {
let r;
if (isBackend) {
if (uri.match(/^https?:\/\//)) {
r = new RangeView(uri, params);
} else if (params?.allowLocalFile) {
r = new RangeViewLocalFile(uri, params);
} else {
throw new Error("Access to local file is not enabled, please set allowLocalFile to true");
}
} else {
if (params?.allowLocalFile) {
throw new Error("allowLocalFile cannot be used on browser");
}
r = new RangeView(uri, params);
}
await r.fetchChunk();
const checkBuffer = (buffer, header) => {
for (let i = 0; i < header.length; i++) {
if (header[i] !== buffer[i]) {
return false;
}
}
return true;
};
if (!checkBuffer(new Uint8Array(r.view.buffer.slice(0, 4)), GGUF_MAGIC_NUMBER)) {
throw new Error("not a valid gguf file: not starting with GGUF magic number");
}
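// Endianness probe: valid GGUF versions are tiny (1-3), so if a little-endian
// read of the version field has non-zero low 16 bits the file is little-endian;
// otherwise the same bytes are re-read as big-endian.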
const [littleEndian, version] = (() => {
const version2 = r.view.getUint32(4, true);
if (version2 & 65535) {
return [true, version2];
} else {
return [false, r.view.getUint32(4, false)];
}
})();
if (!isVersion(version)) {
throw new Error(`not a valid gguf file: unsupported version "${version}"`);
}
let offset = 8;
const tensorCount = readVersionedSize(r.view, offset, version, littleEndian);
offset += tensorCount.length;
const numKv = readVersionedSize(r.view, offset, version, littleEndian);
offset += numKv.length;
const metadata = {
version,
tensor_count: tensorCount.value,
kv_count: numKv.value
};
let typedMetadata;
if (params?.typedMetadata) {
typedMetadata = {
version: { value: version, type: 4 /* UINT32 */ },
tensor_count: {
value: tensorCount.value,
type: version === 1 ? 4 /* UINT32 */ : 10 /* UINT64 */
},
kv_count: {
value: numKv.value,
type: version === 1 ? 4 /* UINT32 */ : 10 /* UINT64 */
}
};
}
for (let i = 0; i < numKv.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const valueType = r.view.getUint32(offset, littleEndian);
offset += 4;
if (!isGGUFValueType(valueType)) {
throw new Error("Unsupported metadata type: " + valueType);
}
let valueResult;
while (!valueResult) {
try {
valueResult = readMetadataValue(r.view, valueType, offset, version, littleEndian);
} catch (err) {
if (err instanceof RangeError) {
await r.fetchChunk();
} else {
throw err;
}
}
}
offset += valueResult.length;
metadata[keyResult.value] = valueResult.value;
if (typedMetadata) {
const typedEntry = {
value: valueResult.value,
type: valueType
};
if (valueType === 9 /* ARRAY */) {
const arrayTypeOffset = offset - valueResult.length;
const arraySubType = r.view.getUint32(arrayTypeOffset, littleEndian);
if (isGGUFValueType(arraySubType)) {
typedEntry.subType = arraySubType;
}
}
typedMetadata[keyResult.value] = typedEntry;
}
}
const tensorInfos = [];
for (let i = 0; i < tensorCount.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const nDims = r.view.getUint32(offset, littleEndian);
offset += 4;
const shape = [];
for (let dim = 0; dim < nDims; dim++) {
const shapeDim = readVersionedSize(r.view, offset, version, littleEndian);
shape.push(shapeDim.value);
offset += shapeDim.length;
}
const type = r.view.getUint32(offset, littleEndian);
offset += 4;
const tensorOffset = r.view.getBigUint64(offset, littleEndian);
offset += 8;
tensorInfos.push({
name: keyResult.value,
n_dims: nDims,
shape,
dtype: type,
offset: tensorOffset
});
}
const alignment = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));
if (params?.computeParametersCount && params?.typedMetadata) {
const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
return {
metadata,
tensorInfos,
tensorDataOffset,
littleEndian,
parameterCount,
typedMetadata
};
} else if (params?.computeParametersCount) {
const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
return {
metadata,
tensorInfos,
tensorDataOffset,
littleEndian,
parameterCount
};
} else if (params?.typedMetadata) {
return {
metadata,
tensorInfos,
tensorDataOffset,
littleEndian,
typedMetadata
};
} else {
return { metadata, tensorInfos, tensorDataOffset, littleEndian };
}
}
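/*
 * gguf() option sketch (paths/URLs are hypothetical):
 *
 *   // Node.js only: parse a local file instead of a URL.
 *   const local = await gguf("./model.gguf", { allowLocalFile: true });
 *
 *   // Also return typed key/value entries and the total parameter count.
 *   const { typedMetadata, parameterCount } = await gguf(url, {
 *     typedMetadata: true,
 *     computeParametersCount: true,
 *   });
 */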
function writeVersionedSize(version, value, littleEndian) {
switch (version) {
case 1: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setUint32(0, Number(value), littleEndian);
return new Uint8Array(buffer);
}
case 2:
case 3: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setBigUint64(0, value, littleEndian);
return new Uint8Array(buffer);
}
}
}
function writeString(value, version, littleEndian) {
const stringBytes = new TextEncoder().encode(value);
const lengthBytes = writeVersionedSize(version, BigInt(stringBytes.length), littleEndian);
const result = new Uint8Array(lengthBytes.length + stringBytes.length);
result.set(lengthBytes, 0);
result.set(stringBytes, lengthBytes.length);
return result;
}
function writeMetadataValue(value, type, version, littleEndian, subType) {
switch (type) {
case 0 /* UINT8 */: {
const buffer = new ArrayBuffer(1);
const view = new DataView(buffer);
view.setUint8(0, value);
return new Uint8Array(buffer);
}
case 1 /* INT8 */: {
const buffer = new ArrayBuffer(1);
const view = new DataView(buffer);
view.setInt8(0, value);
return new Uint8Array(buffer);
}
case 2 /* UINT16 */: {
const buffer = new ArrayBuffer(2);
const view = new DataView(buffer);
view.setUint16(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 3 /* INT16 */: {
const buffer = new ArrayBuffer(2);
const view = new DataView(buffer);
view.setInt16(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 4 /* UINT32 */: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setUint32(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 5 /* INT32 */: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setInt32(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 6 /* FLOAT32 */: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setFloat32(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 7 /* BOOL */: {
const buffer = new ArrayBuffer(1);
const view = new DataView(buffer);
view.setUint8(0, value ? 1 : 0);
return new Uint8Array(buffer);
}
case 8 /* STRING */: {
return writeString(value, version, littleEndian);
}
case 9 /* ARRAY */: {
// subType may legitimately be 0 (UINT8), so check for undefined rather than falsiness.
if (subType === void 0) {
throw new Error("Array type requires subType to be specified");
}
const arrayValue = value;
const arrayTypeBuffer = new ArrayBuffer(4);
const arrayTypeView = new DataView(arrayTypeBuffer);
arrayTypeView.setUint32(0, subType, littleEndian);
const arrayTypeBytes = new Uint8Array(arrayTypeBuffer);
const lengthBytes = writeVersionedSize(version, BigInt(arrayValue.length), littleEndian);
const elementBytes = [];
for (const element of arrayValue) {
elementBytes.push(writeMetadataValue(element, subType, version, littleEndian));
}
const totalLength = arrayTypeBytes.length + lengthBytes.length + elementBytes.reduce((sum, bytes) => sum + bytes.length, 0);
const result = new Uint8Array(totalLength);
let offset = 0;
result.set(arrayTypeBytes, offset);
offset += arrayTypeBytes.length;
result.set(lengthBytes, offset);
offset += lengthBytes.length;
for (const bytes of elementBytes) {
result.set(bytes, offset);
offset += bytes.length;
}
return result;
}
case 10 /* UINT64 */: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setBigUint64(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 11 /* INT64 */: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setBigInt64(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 12 /* FLOAT64 */: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setFloat64(0, value, littleEndian);
return new Uint8Array(buffer);
}
default:
throw new Error(`Unsupported value type: ${type}`);
}
}
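/*
 * Serialization example: a UINT32 (type 4) of 42, little-endian, becomes the
 * 4 bytes [42, 0, 0, 0]; an ARRAY (type 9) is prefixed by a uint32 element
 * subType and a versioned element count before the packed elements.
 */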
function serializeGgufMetadata(typedMetadata, options = {}) {
const littleEndian = options.littleEndian ?? true;
const alignment = options.alignment ?? 32;
const version = typedMetadata.version.value;
const versionBuffer = new ArrayBuffer(4);
const versionView = new DataView(versionBuffer);
versionView.setUint32(0, version, littleEndian);
const versionBytes = new Uint8Array(versionBuffer);
const tensorCountBytes = writeVersionedSize(version, typedMetadata.tensor_count.value, littleEndian);
const kvEntries = Object.entries(typedMetadata).filter(
([key]) => !["version", "tensor_count", "kv_count"].includes(key)
);
const kvCount = BigInt(kvEntries.length);
const kvCountBytes = writeVersionedSize(version, kvCount, littleEndian);
const kvBytes = [];
for (const [key, entry] of kvEntries) {
const keyBytes = writeString(key, version, littleEndian);
kvBytes.push(keyBytes);
const valueTypeBuffer = new ArrayBuffer(4);
const valueTypeView = new DataView(valueTypeBuffer);
valueTypeView.setUint32(0, entry.type, littleEndian);
const valueTypeBytes = new Uint8Array(valueTypeBuffer);
kvBytes.push(valueTypeBytes);
if (entry.value === void 0) {
throw new Error(`Value for key "${key}" is undefined`);
}
const valueBytes = writeMetadataValue(
entry.value,
entry.type,
version,
littleEndian,
"subType" in entry ? entry.subType : void 0
);
kvBytes.push(valueBytes);
}
const preAlignmentSize = GGUF_MAGIC_NUMBER.length + versionBytes.length + tensorCountBytes.length + kvCountBytes.length + kvBytes.reduce((sum, bytes) => sum + bytes.length, 0);
const GGML_PAD2 = (x, n) => x + n - 1 & ~(n - 1);
const alignedSize = GGML_PAD2(preAlignmentSize, alignment);
const result = new Uint8Array(alignedSize);
let offset = 0;
result.set(GGUF_MAGIC_NUMBER, offset);
offset += GGUF_MAGIC_NUMBER.length;
result.set(versionBytes, offset);
offset += versionBytes.length;
result.set(tensorCountBytes, offset);
offset += tensorCountBytes.length;
result.set(kvCountBytes, offset);
offset += kvCountBytes.length;
for (const bytes of kvBytes) {
result.set(bytes, offset);
offset += bytes.length;
}
return result;
}
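/*
 * Round-trip sketch (hypothetical URL): read typed metadata with gguf(), then
 * re-serialize the header portion of the file.
 *
 *   const { typedMetadata } = await gguf("https://example.com/model.gguf", { typedMetadata: true });
 *   const headerBytes = serializeGgufMetadata(typedMetadata, { littleEndian: true, alignment: 32 });
 */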
async function ggufAllShards(url, params) {
const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
if (parallelDownloads < 1) {
throw new TypeError("parallelDownloads must be greater than 0");
}
const ggufShardFileInfo = parseGgufShardFilename(url);
if (ggufShardFileInfo) {
const total = parseInt(ggufShardFileInfo.total);
const prefix = ggufShardFileInfo.prefix;
const urls = [];
for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
}
const shards = await promisesQueue(
urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
parallelDownloads
);
return {
shards,
parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0)
};
} else {
const { metadata, tensorInfos, tensorDataOffset, littleEndian, parameterCount } = await gguf(url, {
...params,
computeParametersCount: true
});
return { shards: [{ metadata, tensorInfos, tensorDataOffset, littleEndian }], parameterCount };
}
}
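/*
 * ggufAllShards sketch (hypothetical URL): point at any shard; sibling
 * "-NNNNN-of-NNNNN.gguf" URLs are derived from its name and fetched in parallel.
 *
 *   const { shards, parameterCount } = await ggufAllShards(
 *     "https://example.com/model-00001-of-00004.gguf",
 *     { parallelDownloads: 4 }
 *   );
 */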
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
GGMLFileQuantizationType,
GGMLQuantizationType,
GGUFValueType,
GGUF_QUANT_DESCRIPTIONS,
GGUF_QUANT_ORDER,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
RE_GGUF_FILE,
RE_GGUF_SHARD_FILE,
findNearestQuantType,
gguf,
ggufAllShards,
parseGGUFQuantLabel,
parseGgufShardFilename,
serializeGgufMetadata
});