UNPKG

@huggingface/gguf

Version:

a GGUF parser that works on remotely hosted files

837 lines (828 loc) 33.9 kB
#!/usr/bin/env node
"use strict";

// Bundled CommonJS build of the @huggingface/gguf CLI (gguf-view).
// Bundler (esbuild-style) runtime helpers for lazy module initialization.
var __defProp = Object.defineProperty;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __esm = (fn, res) => function __init() {
  return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
};
var __export = (target, all) => {
  for (var name in all) __defProp(target, name, { get: all[name], enumerable: true });
};

// src/utils/FileBlob.ts (lazily initialized so browser bundles never touch node:fs)
var FileBlob_exports = {};
__export(FileBlob_exports, { FileBlob: () => FileBlob });
var import_node_fs, import_promises, import_node_stream, import_node_url, FileBlob;
var init_FileBlob = __esm({
  "src/utils/FileBlob.ts"() {
    "use strict";
    import_node_fs = require("fs");
    import_promises = require("fs/promises");
    import_node_stream = require("stream");
    import_node_url = require("url");
    /**
     * A lazy Blob backed by the byte range [start, end) of a local file.
     * The file is opened and closed around every read to avoid fd leaks.
     */
    FileBlob = class extends Blob {
      /**
       * Creates a new FileBlob on the provided file.
       *
       * @param path Path (string or file:// URL) to the file to be lazily read
       */
      static async create(path) {
        path = path instanceof URL ? (0, import_node_url.fileURLToPath)(path) : path;
        const { size } = await (0, import_promises.stat)(path);
        return new FileBlob(path, 0, size);
      }
      path;
      start;
      end;
      constructor(path, start, end) {
        super();
        this.path = path;
        this.start = start;
        this.end = end;
      }
      /**
       * Returns the size of the blob in bytes.
       */
      get size() {
        return this.end - this.start;
      }
      /**
       * Returns a new FileBlob that is a slice of the current one.
       *
       * The slice is inclusive of the start and exclusive of the end.
       * Negative start/end are not supported.
       *
       * @param start beginning of the slice
       * @param end end of the slice
       */
      slice(start = 0, end = this.size) {
        if (start < 0 || end < 0) {
          // BUGFIX: the error was previously constructed but never thrown,
          // silently allowing negative offsets through.
          throw new TypeError("Unsupported negative start/end on FileBlob.slice");
        }
        return new FileBlob(this.path, this.start + start, Math.min(this.start + end, this.end));
      }
      /**
       * Reads the part of the file delimited by the FileBlob.
       * NOTE: this returns the Node Buffer used for the read (a Uint8Array view),
       * not a bare ArrayBuffer — text() below relies on Buffer#toString.
       */
      async arrayBuffer() {
        const slice = await this.execute(
          (file) => file.read(Buffer.alloc(this.size), 0, this.size, this.start)
        );
        return slice.buffer;
      }
      /**
       * Reads the part of the file delimited by the FileBlob and returns it as a string.
       */
      async text() {
        const buffer = await this.arrayBuffer();
        return buffer.toString("utf8");
      }
      /**
       * Returns a web ReadableStream over the part of the file delimited by the FileBlob.
       */
      stream() {
        // createReadStream's `end` is inclusive, hence the -1.
        return import_node_stream.Readable.toWeb(
          (0, import_node_fs.createReadStream)(this.path, { start: this.start, end: this.end - 1 })
        );
      }
      /**
       * Opens the file, runs `action`, and always closes the file.
       *
       * Opening and closing for each action prevents file descriptor leaks;
       * it is an intended choice of developer experience over performance.
       */
      async execute(action) {
        const file = await (0, import_promises.open)(this.path, "r");
        try {
          return await action(file);
        } finally {
          await file.close();
        }
      }
    };
  }
});

// src/transformer-llm.ts — LLM architectures known to llama.cpp.
var LLM_ARCHITECTURES = [
  "llama", "deci", "falcon", "grok", "gpt2", "gptj", "gptneox", "mpt",
  "baichuan", "starcoder", "refact", "bert", "nomic-bert", "jina-bert-v2",
  "bloom", "stablelm", "qwen", "qwen2", "qwen2moe", "qwen2vl", "phi2", "phi3",
  "phimoe", "plamo", "codeshell", "orion", "internlm2", "minicpm", "minicpm3",
  "gemma", "gemma2", "starcoder2", "mamba", "xverse", "command-r", "cohere2",
  "dbrx", "olmo", "olmo2", "olmoe", "openelm", "arctic", "deepseek",
  "deepseek2", "chatglm", "bitnet", "t5", "t5encoder", "jais", "nemotron",
  "exaone", "rwkv6", "rwkv6qwen2", "granite", "granitemoe", "chameleon",
  "wavtokenizer-dec"
];

// src/types.ts
var import_tasks = require("@huggingface/tasks");
// GGUF metadata value-type tags, as defined by the GGUF spec (two-way enum).
var GGUFValueType = /* @__PURE__ */ ((GGUFValueType2) => {
  GGUFValueType2[GGUFValueType2["UINT8"] = 0] = "UINT8";
  GGUFValueType2[GGUFValueType2["INT8"] = 1] = "INT8";
  GGUFValueType2[GGUFValueType2["UINT16"] = 2] = "UINT16";
  GGUFValueType2[GGUFValueType2["INT16"] = 3] = "INT16";
  GGUFValueType2[GGUFValueType2["UINT32"] = 4] = "UINT32";
  GGUFValueType2[GGUFValueType2["INT32"] = 5] = "INT32";
  GGUFValueType2[GGUFValueType2["FLOAT32"] = 6] = "FLOAT32";
  GGUFValueType2[GGUFValueType2["BOOL"] = 7] = "BOOL";
  GGUFValueType2[GGUFValueType2["STRING"] = 8] = "STRING";
  GGUFValueType2[GGUFValueType2["ARRAY"] = 9] = "ARRAY";
  GGUFValueType2[GGUFValueType2["UINT64"] = 10] = "UINT64";
  GGUFValueType2[GGUFValueType2["INT64"] = 11] = "INT64";
  GGUFValueType2[GGUFValueType2["FLOAT64"] = 12] = "FLOAT64";
  return GGUFValueType2;
})(GGUFValueType || {});
var ARCHITECTURES = [...LLM_ARCHITECTURES, "rwkv", "whisper"];

// src/utils/isBackend.ts — environment detection (Node vs browser vs worker).
var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
var isBackend = !isBrowser && !isWebWorker;

// src/utils/promisesQueue.ts
/**
 * Runs promise factories with a bounded number of in-flight promises.
 * Results are stored at the index of their factory, preserving input order.
 */
async function promisesQueue(factories, concurrency) {
  const results = [];
  const executing = /* @__PURE__ */ new Set();
  let index = 0;
  for (const factory of factories) {
    const closureIndex = index++;
    const e = factory().then((r) => {
      results[closureIndex] = r;
      executing.delete(e);
    });
    executing.add(e);
    if (executing.size >= concurrency) {
      // Wait for any in-flight promise to settle before starting another.
      await Promise.race(executing);
    }
  }
  await Promise.all(executing);
  return results;
}

// src/quant-descriptions.ts — human-readable notes for each GGML quantization type.
var GGUF_QUANT_DESCRIPTIONS = {
  [import_tasks.GGMLQuantizationType.F32]: {
    txt: "32-bit standard IEEE 754 single-precision floating-point number.",
    src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format"
  },
  [import_tasks.GGMLQuantizationType.F16]: {
    txt: "16-bit standard IEEE 754 half-precision floating-point number.",
    src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format"
  },
  [import_tasks.GGMLQuantizationType.Q8_0]: {
    txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
    src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
  },
  [import_tasks.GGMLQuantizationType.Q8_1]: {
    txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
    src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
  },
  [import_tasks.GGMLQuantizationType.Q8_K]: {
    txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
    src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
  },
  [import_tasks.GGMLQuantizationType.Q6_K]: {
    txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
    src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
  },
  [import_tasks.GGMLQuantizationType.Q5_0]: {
    txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
    src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
  },
  [import_tasks.GGMLQuantizationType.Q5_1]: {
    txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
    src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
  },
  [import_tasks.GGMLQuantizationType.Q5_K]: {
    txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
    src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
  },
  [import_tasks.GGMLQuantizationType.Q4_0]: {
    txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
    src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
  },
  [import_tasks.GGMLQuantizationType.Q4_1]: {
    txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
    src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
  },
  [import_tasks.GGMLQuantizationType.Q4_K]: {
    txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
    src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
  },
  [import_tasks.GGMLQuantizationType.Q3_K]: {
    txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting. 3.4375 bits-per-weight.`,
    src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
  },
  [import_tasks.GGMLQuantizationType.Q2_K]: {
    txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weight. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.625 bits-per-weight.`,
    src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
  },
  [import_tasks.GGMLQuantizationType.IQ4_XS]: {
    txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ3_S]: {
    txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ3_XXS]: {
    txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ2_S]: {
    txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ2_XS]: {
    txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ2_XXS]: {
    txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ1_S]: {
    txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
    src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
  },
  [import_tasks.GGMLQuantizationType.IQ4_NL]: {
    txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
    src_url: "https://github.com/ggerganov/llama.cpp/pull/5590"
  },
  [import_tasks.GGMLQuantizationType.I8]: {
    txt: "8-bit fixed-width integer number.",
    src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
  },
  [import_tasks.GGMLQuantizationType.I16]: {
    txt: "16-bit fixed-width integer number.",
    src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
  },
  [import_tasks.GGMLQuantizationType.I32]: {
    txt: "32-bit fixed-width integer number.",
    src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
  },
  [import_tasks.GGMLQuantizationType.I64]: {
    txt: "64-bit fixed-width integer number.",
    src_url: "https://github.com/ggerganov/llama.cpp/pull/6062"
  },
  [import_tasks.GGMLQuantizationType.F64]: {
    txt: "64-bit standard IEEE 754 double-precision floating-point number.",
    src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format"
  },
  [import_tasks.GGMLQuantizationType.IQ1_M]: {
    txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
    src_url: "https://github.com/ggerganov/llama.cpp/pull/6302"
  },
  [import_tasks.GGMLQuantizationType.BF16]: {
    txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
    src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format"
  },
  [import_tasks.GGMLQuantizationType.TQ1_0]: {
    txt: "Ternary quantization.",
    src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
  },
  [import_tasks.GGMLQuantizationType.TQ2_0]: {
    txt: "Ternary quantization.",
    src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
  }
};
// Super-block size used by the K-quants.
var QK_K = 256;
// Bits-per-weight for a block of `blockSize` weights stored in `typeSize` bytes.
var calcBPW = (blockSize, typeSize) => {
  return typeSize * 8 / blockSize;
};
// Bits-per-weight for each GGML quantization type (mirrors ggml's type sizes).
var GGML_QUANT_SIZES = {
  [import_tasks.GGMLQuantizationType.F32]: calcBPW(1, 4),
  [import_tasks.GGMLQuantizationType.F16]: calcBPW(1, 2),
  [import_tasks.GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
  [import_tasks.GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
  [import_tasks.GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
  [import_tasks.GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
  [import_tasks.GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
  [import_tasks.GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
  [import_tasks.GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
  [import_tasks.GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
  [import_tasks.GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
  [import_tasks.GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
  [import_tasks.GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
  [import_tasks.GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
  [import_tasks.GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
  [import_tasks.GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
  [import_tasks.GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
  [import_tasks.GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
  [import_tasks.GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
  [import_tasks.GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
  [import_tasks.GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
  [import_tasks.GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
  [import_tasks.GGMLQuantizationType.I8]: calcBPW(1, 1),
  [import_tasks.GGMLQuantizationType.I16]: calcBPW(1, 2),
  [import_tasks.GGMLQuantizationType.I32]: calcBPW(1, 4),
  [import_tasks.GGMLQuantizationType.I64]: calcBPW(1, 8),
  [import_tasks.GGMLQuantizationType.F64]: calcBPW(1, 8),
  [import_tasks.GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
  [import_tasks.GGMLQuantizationType.BF16]: calcBPW(1, 2),
  [import_tasks.GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
  [import_tasks.GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64)
};

// src/gguf.ts
var import_tasks2 = require("@huggingface/tasks");
// Shard filenames look like "model-00001-of-00005.gguf".
var RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
var GGUF_DEFAULT_ALIGNMENT = 32;
// Round x up to the nearest multiple of n (n must be a power of two).
var GGML_PAD = (x, n) => x + n - 1 & ~(n - 1);
var PARALLEL_DOWNLOADS = 20;
/**
 * Parses a sharded GGUF filename into { prefix, shard, total }, or null
 * when the filename does not follow the shard naming convention.
 */
function parseGgufShardFilename(filename) {
  const match = RE_GGUF_SHARD_FILE.exec(filename);
  if (match && match.groups) {
    return {
      prefix: match.groups["prefix"],
      shard: match.groups["shard"],
      total: match.groups["total"]
    };
  }
  return null;
}
var isVersion = (version) => version === 1 || version === 2 || version === 3;
// "GGUF" in ASCII — the file magic.
var ggufMagicNumber = new Uint8Array([71, 71, 85, 70]);
function isGGUFValueType(n) {
  return typeof GGUFValueType[n] === "string";
}
var HTTP_CHUNK_SIZE = 2 * 10 ** 6;
var HTTP_DATA_LEEWAY = 5 * 10 ** 5;
var HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6;
/**
 * A growable view over a remote URI, fetched in HTTP_CHUNK_SIZE chunks
 * via HTTP Range requests and appended into one resizable ArrayBuffer.
 */
var RangeView = class {
  constructor(uri, params) {
    this.uri = uri;
    this.params = params;
    this.chunk = 0;
    this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
    this.dataView = new DataView(this.buffer);
  }
  chunk;
  buffer;
  dataView;
  get view() {
    return this.dataView;
  }
  /**
   * Fetch a new chunk from the server
   */
  async fetchChunk() {
    // HTTP Range end offsets are inclusive.
    const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
    const buf = new Uint8Array(
      await (await (this.params?.fetch ?? fetch)(this.uri, {
        headers: {
          ...this.params?.additionalFetchHeaders ?? {},
          Range: `bytes=${range[0]}-${range[1]}`
        }
      })).arrayBuffer()
    );
    this.appendBuffer(buf);
    this.chunk += 1;
  }
  /**
   * Append new data into the buffer
   */
  appendBuffer(buf) {
    if (ArrayBuffer.prototype.resize) {
      // Runtime supports in-place resizable ArrayBuffers.
      this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
      new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
    } else {
      // Fallback: allocate a bigger buffer and copy existing data over.
      const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
      const arrView = new Uint8Array(newBuffer);
      arrView.set(new Uint8Array(this.buffer));
      arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
      this.buffer = newBuffer;
      this.dataView = new DataView(this.buffer);
    }
  }
  /**
   * Fetch the next chunk when fewer than HTTP_DATA_LEEWAY bytes remain
   * readable past `offset`.
   */
  async fetchChunkIfNeeded(offset) {
    if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
      await this.fetchChunk();
    }
  }
};
/** RangeView specialization that reads chunks from the local filesystem. */
var RangeViewLocalFile = class extends RangeView {
  /**
   * Read a new chunk from local file system.
   */
  async fetchChunk() {
    const { FileBlob: FileBlob2 } = await Promise.resolve().then(() => (init_FileBlob(), FileBlob_exports));
    const blob = await FileBlob2.create(this.uri);
    const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
    // BUGFIX: `range[1]` is an inclusive end offset (mirroring the HTTP
    // Range header) but FileBlob.slice is end-exclusive; slicing to
    // `range[1]` dropped the last byte of every chunk, leaving a stale
    // zero byte at each HTTP_CHUNK_SIZE boundary of the assembled buffer.
    const buffer = await blob.slice(range[0], range[1] + 1).arrayBuffer();
    this.appendBuffer(new Uint8Array(buffer));
    this.chunk += 1;
  }
};
/**
 * Reads a GGUF size field: 32-bit in v1 files, 64-bit in v2/v3.
 * Returns the value as BigInt plus the number of bytes consumed.
 */
function readVersionedSize(view, byteOffset, version, littleEndian) {
  switch (version) {
    case 1: {
      const n = view.getUint32(byteOffset, littleEndian);
      return { value: BigInt(n), length: 4 };
    }
    case 2:
    case 3: {
      return { value: view.getBigUint64(byteOffset, littleEndian), length: 8 };
    }
  }
}
/** Reads a length-prefixed UTF-8 string; `length` is total bytes consumed. */
function readString(view, offset, version, littleEndian) {
  const length = readVersionedSize(view, offset, version, littleEndian);
  const off = length.length;
  const value = new TextDecoder().decode(view.buffer.slice(offset + off, offset + off + Number(length.value)));
  return { value, length: off + Number(length.value) };
}
/**
 * Reads one metadata value of the given GGUFValueType at `offset`,
 * recursing for ARRAY values. Returns { value, length } where length
 * is the number of bytes consumed.
 */
function readMetadataValue(view, type, offset, version, littleEndian) {
  switch (type) {
    case 0 /* UINT8 */:
      return { value: view.getUint8(offset), length: 1 };
    case 1 /* INT8 */:
      return { value: view.getInt8(offset), length: 1 };
    case 2 /* UINT16 */:
      return { value: view.getUint16(offset, littleEndian), length: 2 };
    case 3 /* INT16 */:
      return { value: view.getInt16(offset, littleEndian), length: 2 };
    case 4 /* UINT32 */:
      return { value: view.getUint32(offset, littleEndian), length: 4 };
    case 5 /* INT32 */:
      return { value: view.getInt32(offset, littleEndian), length: 4 };
    case 6 /* FLOAT32 */:
      return { value: view.getFloat32(offset, littleEndian), length: 4 };
    case 7 /* BOOL */:
      return { value: view.getUint8(offset) !== 0, length: 1 };
    case 8 /* STRING */:
      return readString(view, offset, version, littleEndian);
    case 9 /* ARRAY */: {
      const arrayType = view.getUint32(offset, littleEndian);
      const arrayLength = readVersionedSize(view, offset + 4, version, littleEndian);
      let length = 4 + arrayLength.length;
      const arrayValues = [];
      for (let i = 0; i < arrayLength.value; i++) {
        const metadataValue = readMetadataValue(view, arrayType, offset + length, version, littleEndian);
        arrayValues.push(metadataValue.value);
        length += metadataValue.length;
      }
      return { value: arrayValues, length };
    }
    case 10 /* UINT64 */:
      return { value: view.getBigUint64(offset, littleEndian), length: 8 };
    case 11 /* INT64 */:
      return { value: view.getBigInt64(offset, littleEndian), length: 8 };
    case 12 /* FLOAT64 */:
      return { value: view.getFloat64(offset, littleEndian), length: 8 };
  }
}
/**
 * Parses the header of a GGUF file (remote URL or, when allowed, a local
 * path) and returns { metadata, tensorInfos, tensorDataOffset } — plus
 * parameterCount when params.computeParametersCount is set. Only the
 * header chunks are fetched, never the tensor data itself.
 */
async function gguf(uri, params) {
  let r;
  if (isBackend) {
    if (uri.match(/^https?:\/\//)) {
      r = new RangeView(uri, params);
    } else if (params?.allowLocalFile) {
      r = new RangeViewLocalFile(uri, params);
    } else {
      throw new Error("Access to local file is not enabled, please set allowLocalFile to true");
    }
  } else {
    if (params?.allowLocalFile) {
      throw new Error("allowLocalFile cannot be used on browser");
    }
    r = new RangeView(uri, params);
  }
  await r.fetchChunk();
  const checkBuffer = (buffer, header) => {
    for (let i = 0; i < header.length; i++) {
      if (header[i] !== buffer[i]) {
        return false;
      }
    }
    return true;
  };
  if (!checkBuffer(new Uint8Array(r.view.buffer.slice(0, 4)), ggufMagicNumber)) {
    throw new Error("not a valid gguf file: not starting with GGUF magic number");
  }
  // Endianness heuristic: supported versions are small (1-3), so a
  // little-endian file has a non-zero low 16 bits when read LE; otherwise
  // re-read the version big-endian.
  const [littleEndian, version] = (() => {
    const version2 = r.view.getUint32(4, true);
    if (version2 & 65535) {
      return [true, version2];
    } else {
      return [false, r.view.getUint32(4, false)];
    }
  })();
  if (!isVersion(version)) {
    throw new Error(`not a valid gguf file: unsupported version "${version}"`);
  }
  let offset = 8;
  const tensorCount = readVersionedSize(r.view, offset, version, littleEndian);
  offset += tensorCount.length;
  const numKv = readVersionedSize(r.view, offset, version, littleEndian);
  offset += numKv.length;
  const metadata = {
    version,
    tensor_count: tensorCount.value,
    kv_count: numKv.value
  };
  for (let i = 0; i < numKv.value; i++) {
    await r.fetchChunkIfNeeded(offset);
    const keyResult = readString(r.view, offset, version, littleEndian);
    offset += keyResult.length;
    const valueType = r.view.getUint32(offset, littleEndian);
    offset += 4;
    if (!isGGUFValueType(valueType)) {
      throw new Error("Unsupported metadata type: " + valueType);
    }
    let valueResult;
    // A value may extend past the currently fetched data; on RangeError
    // fetch another chunk and retry. Non-range errors are rethrown.
    while (!valueResult) {
      try {
        valueResult = readMetadataValue(r.view, valueType, offset, version, littleEndian);
      } catch (err) {
        if (err instanceof RangeError) {
          await r.fetchChunk();
        } else {
          throw err;
        }
      }
    }
    offset += valueResult.length;
    metadata[keyResult.value] = valueResult.value;
  }
  const tensorInfos = [];
  for (let i = 0; i < tensorCount.value; i++) {
    await r.fetchChunkIfNeeded(offset);
    const keyResult = readString(r.view, offset, version, littleEndian);
    offset += keyResult.length;
    const nDims = r.view.getUint32(offset, littleEndian);
    offset += 4;
    const shape = [];
    for (let dim = 0; dim < nDims; dim++) {
      const shapeDim = readVersionedSize(r.view, offset, version, littleEndian);
      shape.push(shapeDim.value);
      offset += shapeDim.length;
    }
    const type = r.view.getUint32(offset, littleEndian);
    offset += 4;
    const tensorOffset = r.view.getBigUint64(offset, littleEndian);
    offset += 8;
    tensorInfos.push({
      name: keyResult.value,
      n_dims: nDims,
      shape,
      dtype: type,
      offset: tensorOffset
    });
  }
  // Tensor data starts at the next aligned offset after the header.
  const alignment = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
  const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));
  if (params?.computeParametersCount) {
    const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
    return { metadata, tensorInfos, tensorDataOffset, parameterCount };
  } else {
    return { metadata, tensorInfos, tensorDataOffset };
  }
}
/**
 * Parses every shard of a (possibly sharded) GGUF model in parallel and
 * returns all shard headers plus the total parameter count.
 */
async function ggufAllShards(url, params) {
  const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
  if (parallelDownloads < 1) {
    throw new TypeError("parallelDownloads must be greater than 0");
  }
  const ggufShardFileInfo = parseGgufShardFilename(url);
  if (ggufShardFileInfo) {
    const total = parseInt(ggufShardFileInfo.total);
    const prefix = ggufShardFileInfo.prefix;
    const urls = [];
    for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
      urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
    }
    const shards = await promisesQueue(
      urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
      parallelDownloads
    );
    return {
      shards,
      parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0)
    };
  } else {
    const { metadata, tensorInfos, tensorDataOffset, parameterCount } = await gguf(url, { ...params, computeParametersCount: true });
    return { shards: [{ metadata, tensorInfos, tensorDataOffset }], parameterCount };
  }
}

// src/cli.ts
// Reverse lookup: numeric dtype -> quantization-type name.
var mapDtypeToName = Object.fromEntries(Object.entries(import_tasks.GGMLQuantizationType).map(([name, value]) => [value, name]));
function showHelp(exitCode) {
  console.error("Usage: gguf-view [--help|-h] [--show-tensor] [--context|-c N] <path/to/gguf>");
  console.error("  --help, -h          Show this help message");
  console.error("  --show-tensor       Show tensor information");
  console.error("  --context, -c N     Number of tokens in context (default: 4096)");
  process.exit(exitCode);
}
/** CLI entry point: dumps metadata, a memory estimate, and optionally tensors. */
async function main() {
  let ggufPath = "";
  let showTensors = false;
  let nCtx = 4096;
  for (let i = 2; i < process.argv.length; i++) {
    if (process.argv[i] === "--help" || process.argv[i] === "-h") {
      showHelp(0);
    } else if (process.argv[i] === "--show-tensor") {
      showTensors = true;
    } else if (process.argv[i] === "--context" || process.argv[i] === "-c") {
      nCtx = Number(process.argv[++i]);
    } else {
      ggufPath = process.argv[i];
    }
  }
  if (!ggufPath.length) {
    console.error("Error: Missing path to gguf file");
    showHelp(1);
  }
  const { shards } = await ggufAllShards(ggufPath, { allowLocalFile: true });
  const { metadata, tensorInfos } = shards[0];
  // Merge tensor infos from the remaining shards into the first one's list.
  for (let i = 1; i < shards.length; i++) {
    tensorInfos.push(...shards[i].tensorInfos);
  }
  console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
  printTable(
    [
      { name: "Idx", alignRight: true },
      // { name: 'Type' }, // TODO: support this
      { name: "Count", alignRight: true },
      { name: "Value" }
    ],
    Object.entries(metadata).map(([key, value], i) => {
      const MAX_LEN = 50;
      let strVal = "";
      let count = 1;
      if (Array.isArray(value)) {
        strVal = JSON.stringify(value);
        count = value.length;
      } else if (value instanceof String || typeof value === "string") {
        strVal = JSON.stringify(value);
      } else {
        strVal = value.toString();
      }
      strVal = strVal.length > MAX_LEN ? strVal.slice(0, MAX_LEN) + "..." : strVal;
      return [(i + 1).toString(), count.toString(), `${key} = ${strVal}`];
    })
  );
  console.log();
  console.log(`* Memory usage estimation (with context length of ${nCtx} tokens)`);
  try {
    const kvUsage = calcMemoryUsage(metadata, nCtx);
    let modelWeightInBytes = 0;
    for (const tensorInfo of tensorInfos) {
      const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
      const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
      modelWeightInBytes += tensorSizeInBytes;
    }
    // Overhead heuristic: a 256-token KV baseline plus 5% of the weights.
    const overhead = calcMemoryUsage(metadata, 256).totalBytes + modelWeightInBytes * 0.05;
    const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
    printTable(
      [{ name: "Item" }, { name: "Memory usage", alignRight: true }],
      [
        ["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
        ["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
        ["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
        ["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
        ["", "---"],
        ["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"]
      ]
    );
  } catch (e) {
    console.error(`Error: ${e.message}`);
  }
  if (showTensors) {
    console.log();
    console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
    printTable(
      [
        { name: "Idx", alignRight: true },
        { name: "Num Elements", alignRight: true },
        { name: "Shape" },
        { name: "Data Type" },
        { name: "Name" }
      ],
      tensorInfos.map((tensorInfo, i) => {
        // Pad the shape out to 4 dims for uniform display.
        const shape = [1n, 1n, 1n, 1n];
        tensorInfo.shape.forEach((dim, i2) => {
          shape[i2] = dim;
        });
        return [
          (i + 1).toString(),
          shape.reduce((acc, n) => acc * n, 1n).toString(),
          shape.map((n) => n.toString().padStart(6)).join(", "),
          mapDtypeToName[tensorInfo.dtype],
          tensorInfo.name
        ];
      })
    );
  } else {
    console.log();
    console.log(`* Use --show-tensor to display tensor information`);
  }
}
/**
 * Estimates KV-cache memory for `kvSize` context tokens from GGUF metadata.
 * Returns byte totals for the K cache, V cache, and both combined.
 * Throws for architectures (mamba/rwkv) whose cache is not attention-shaped.
 */
function calcMemoryUsage(metadata, kvSize, kvTypeK = import_tasks.GGMLQuantizationType.F16, kvTypeV = import_tasks.GGMLQuantizationType.F16) {
  const arch = metadata["general.architecture"] ?? "unknown";
  const n_embd = metadata[`${arch}.embedding_length`] ?? 0;
  const n_head = metadata[`${arch}.attention.head_count`] ?? 0;
  const n_embd_head_k = metadata[`${arch}.attention.key_length`] ?? n_embd / n_head;
  const n_embd_head_v = metadata[`${arch}.attention.value_length`] ?? n_embd / n_head;
  const n_head_kv = metadata[`${arch}.attention.head_count_kv`] ?? [];
  const n_layer = metadata[`${arch}.block_count`] ?? 0;
  if (arch.startsWith("mamba") || arch.startsWith("rwkv")) {
    throw new Error(`Memory usage estimation for arch "${arch}" is not supported`);
  }
  // head_count_kv may be a scalar or a per-layer array; normalize to per-layer.
  const n_head_kv_arr = Array(n_layer).fill(n_head);
  if (Array.isArray(n_head_kv)) {
    for (let i = 0; i < n_layer; i++) {
      if (n_head_kv[i]) {
        n_head_kv_arr[i] = n_head_kv[i];
      }
    }
  } else {
    for (let i = 0; i < n_layer; i++) {
      n_head_kv_arr[i] = n_head_kv;
    }
  }
  let totalElemsK = 0;
  let totalElemsV = 0;
  for (let i = 0; i < n_layer; i++) {
    const n_embd_k_gqa = n_embd_head_k * n_head_kv_arr[i];
    const n_embd_v_gqa = n_embd_head_v * n_head_kv_arr[i];
    totalElemsK += n_embd_k_gqa * kvSize;
    totalElemsV += n_embd_v_gqa * kvSize;
  }
  return {
    totalBytesK: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8),
    totalBytesV: totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
    totalBytes: (totalElemsK + totalElemsV) * (GGML_QUANT_SIZES[kvTypeV] / 8)
  };
}
/**
 * Prints an ASCII table to stdout. `header` entries may set alignRight
 * and maxWidth; `rows` is an array of string arrays.
 */
function printTable(header, rows, leftPad = 2) {
  const leftPadStr = " ".repeat(leftPad);
  const columnWidths = header.map((h, i) => {
    const maxContentWidth = Math.max(h.name.length, ...rows.map((row) => (row[i] || "").length));
    return h.maxWidth ? Math.min(maxContentWidth, h.maxWidth) : maxContentWidth;
  });
  const headerLine = header.map((h, i) => {
    return h.name.padEnd(columnWidths[i]);
  }).join(" | ");
  console.log(leftPadStr + headerLine);
  console.log(leftPadStr + columnWidths.map((w) => "-".repeat(w)).join("-|-"));
  for (const row of rows) {
    const line = header.map((h, i) => {
      return h.alignRight ? (row[i] || "").padStart(columnWidths[i]) : (row[i] || "").padEnd(columnWidths[i]);
    }).join(" | ");
    console.log(leftPadStr + line);
  }
}
main();