// @huggingface/gguf
// A GGUF parser that works on remotely hosted files.
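/*
 * Example usage (an illustrative sketch; the model URL below is hypothetical,
 * any remotely hosted .gguf file works):
 *
 *   const { gguf } = require("@huggingface/gguf");
 *
 *   async function main() {
 *     const { metadata, tensorInfos } = await gguf("https://example.com/model.gguf");
 *     console.log(metadata["general.architecture"], tensorInfos.length);
 *   }
 */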
"use strict";
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __esm = (fn, res) => function __init() {
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
};
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// src/utils/FileBlob.ts
var FileBlob_exports = {};
__export(FileBlob_exports, {
FileBlob: () => FileBlob
});
var import_node_fs, import_promises, import_node_stream, import_node_url, FileBlob;
var init_FileBlob = __esm({
"src/utils/FileBlob.ts"() {
"use strict";
import_node_fs = require("fs");
import_promises = require("fs/promises");
import_node_stream = require("stream");
import_node_url = require("url");
FileBlob = class extends Blob {
/**
* Creates a new FileBlob on the provided file.
*
* @param path Path to the file to be lazily read
*/
static async create(path) {
path = path instanceof URL ? (0, import_node_url.fileURLToPath)(path) : path;
const { size } = await (0, import_promises.stat)(path);
const fileBlob = new FileBlob(path, 0, size);
return fileBlob;
}
path;
start;
end;
constructor(path, start, end) {
super();
this.path = path;
this.start = start;
this.end = end;
}
/**
* Returns the size of the blob.
*/
get size() {
return this.end - this.start;
}
/**
* Returns a new instance of FileBlob that is a slice of the current one.
*
* The slice is inclusive of the start and exclusive of the end.
*
* The slice method does not support negative start/end.
*
* @param start beginning of the slice
* @param end end of the slice
*/
slice(start = 0, end = this.size) {
if (start < 0 || end < 0) {
throw new TypeError("Unsupported negative start/end on FileBlob.slice");
}
const slice = new FileBlob(this.path, this.start + start, Math.min(this.start + end, this.end));
return slice;
}
/**
* Reads the part of the file delimited by the FileBlob and returns it as an ArrayBuffer.
*/
async arrayBuffer() {
const slice = await this.execute((file) => file.read(Buffer.alloc(this.size), 0, this.size, this.start));
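// Note: file.read() resolves with the Buffer allocated above, so this returns a
// Node Buffer (a Uint8Array); text() below relies on its toString().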
return slice.buffer;
}
/**
* Reads the part of the file delimited by the FileBlob and returns it as a string.
*/
async text() {
const buffer = await this.arrayBuffer();
return buffer.toString("utf8");
}
/**
* Returns a stream around the part of the file delimited by the FileBlob.
*/
stream() {
return import_node_stream.Readable.toWeb((0, import_node_fs.createReadStream)(this.path, { start: this.start, end: this.end - 1 }));
}
/**
* We are opening and closing the file for each action to prevent file descriptor leaks.
*
* It is an intentional choice of developer experience over performance.
*/
async execute(action) {
const file = await (0, import_promises.open)(this.path, "r");
try {
return await action(file);
} finally {
await file.close();
}
}
};
}
});
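/*
 * FileBlob sketch (assumes a local "./model.gguf" exists):
 *
 *   const blob = await FileBlob.create("./model.gguf");
 *   const magic = await blob.slice(0, 4).arrayBuffer(); // reads only the first 4 bytes
 */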
// src/index.ts
var src_exports = {};
__export(src_exports, {
GGMLFileQuantizationType: () => import_tasks2.GGMLFileQuantizationType,
GGMLQuantizationType: () => import_tasks.GGMLQuantizationType,
GGUFValueType: () => GGUFValueType,
GGUF_QUANT_DESCRIPTIONS: () => GGUF_QUANT_DESCRIPTIONS,
GGUF_QUANT_ORDER: () => import_tasks2.GGUF_QUANT_ORDER,
GGUF_QUANT_RE: () => import_tasks2.GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL: () => import_tasks2.GGUF_QUANT_RE_GLOBAL,
RE_GGUF_FILE: () => RE_GGUF_FILE,
RE_GGUF_SHARD_FILE: () => RE_GGUF_SHARD_FILE,
findNearestQuantType: () => import_tasks2.findNearestQuantType,
gguf: () => gguf,
ggufAllShards: () => ggufAllShards,
parseGGUFQuantLabel: () => import_tasks2.parseGGUFQuantLabel,
parseGgufShardFilename: () => parseGgufShardFilename,
serializeGgufMetadata: () => serializeGgufMetadata
});
module.exports = __toCommonJS(src_exports);
// src/transformer-llm.ts
var LLM_ARCHITECTURES = [
"llama",
"deci",
"falcon",
"grok",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"starcoder",
"refact",
"bert",
"nomic-bert",
"jina-bert-v2",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"qwen2vl",
"phi2",
"phi3",
"phimoe",
"plamo",
"codeshell",
"orion",
"internlm2",
"minicpm",
"minicpm3",
"gemma",
"gemma2",
"starcoder2",
"mamba",
"xverse",
"command-r",
"cohere2",
"dbrx",
"olmo",
"olmo2",
"olmoe",
"openelm",
"arctic",
"deepseek",
"deepseek2",
"chatglm",
"bitnet",
"t5",
"t5encoder",
"jais",
"nemotron",
"exaone",
"rwkv6",
"rwkv6qwen2",
"granite",
"granitemoe",
"chameleon",
"wavtokenizer-dec"
];
// src/types.ts
var import_tasks = require("@huggingface/tasks");
var GGUFValueType = /* @__PURE__ */ ((GGUFValueType2) => {
GGUFValueType2[GGUFValueType2["UINT8"] = 0] = "UINT8";
GGUFValueType2[GGUFValueType2["INT8"] = 1] = "INT8";
GGUFValueType2[GGUFValueType2["UINT16"] = 2] = "UINT16";
GGUFValueType2[GGUFValueType2["INT16"] = 3] = "INT16";
GGUFValueType2[GGUFValueType2["UINT32"] = 4] = "UINT32";
GGUFValueType2[GGUFValueType2["INT32"] = 5] = "INT32";
GGUFValueType2[GGUFValueType2["FLOAT32"] = 6] = "FLOAT32";
GGUFValueType2[GGUFValueType2["BOOL"] = 7] = "BOOL";
GGUFValueType2[GGUFValueType2["STRING"] = 8] = "STRING";
GGUFValueType2[GGUFValueType2["ARRAY"] = 9] = "ARRAY";
GGUFValueType2[GGUFValueType2["UINT64"] = 10] = "UINT64";
GGUFValueType2[GGUFValueType2["INT64"] = 11] = "INT64";
GGUFValueType2[GGUFValueType2["FLOAT64"] = 12] = "FLOAT64";
return GGUFValueType2;
})(GGUFValueType || {});
var ARCHITECTURES = [...LLM_ARCHITECTURES, "rwkv", "whisper"];
// src/utils/isBackend.ts
var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
var isBackend = !isBrowser && !isWebWorker;
// src/utils/promisesQueue.ts
async function promisesQueue(factories, concurrency) {
const results = [];
const executing = /* @__PURE__ */ new Set();
let index = 0;
for (const factory of factories) {
const closureIndex = index++;
const e = factory().then((r) => {
results[closureIndex] = r;
executing.delete(e);
});
executing.add(e);
if (executing.size >= concurrency) {
await Promise.race(executing);
}
}
await Promise.all(executing);
return results;
}
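/*
 * promisesQueue sketch: run at most 3 of the given factories at once
 * (`urls` is a hypothetical array of strings); results keep factory order.
 *
 *   const results = await promisesQueue(urls.map((url) => () => fetch(url)), 3);
 */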
// src/quant-descriptions.ts
var GGUF_QUANT_DESCRIPTIONS = {
[import_tasks.GGMLQuantizationType.F32]: {
txt: "32-bit standard IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format"
},
[import_tasks.GGMLQuantizationType.F16]: {
txt: "16-bit standard IEEE 754 half-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format"
},
[import_tasks.GGMLQuantizationType.Q8_0]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[import_tasks.GGMLQuantizationType.Q8_1]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[import_tasks.GGMLQuantizationType.Q8_K]: {
txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q6_K]: {
txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q5_0]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[import_tasks.GGMLQuantizationType.Q5_1]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[import_tasks.GGMLQuantizationType.Q5_K]: {
txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q4_0]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[import_tasks.GGMLQuantizationType.Q4_1]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[import_tasks.GGMLQuantizationType.Q4_K]: {
txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q3_K]: {
txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.Q2_K]: {
txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[import_tasks.GGMLQuantizationType.IQ4_XS]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ3_S]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ3_XXS]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ2_S]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ2_XS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ2_XXS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ1_S]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[import_tasks.GGMLQuantizationType.IQ4_NL]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/5590"
},
[import_tasks.GGMLQuantizationType.I8]: {
txt: "8-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[import_tasks.GGMLQuantizationType.I16]: {
txt: "16-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[import_tasks.GGMLQuantizationType.I32]: {
txt: "32-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[import_tasks.GGMLQuantizationType.I64]: {
txt: "64-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6062"
},
[import_tasks.GGMLQuantizationType.F64]: {
txt: "64-bit standard IEEE 754 double-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format"
},
[import_tasks.GGMLQuantizationType.IQ1_M]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6302"
},
[import_tasks.GGMLQuantizationType.BF16]: {
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format"
},
[import_tasks.GGMLQuantizationType.TQ1_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
},
[import_tasks.GGMLQuantizationType.TQ2_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
},
[import_tasks.GGMLQuantizationType.MXFP4]: {
txt: "4-bit Microscaling Block Floating Point.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/15091"
}
};
var QK_K = 256;
var calcBPW = (blockSize, typeSize) => {
return typeSize * 8 / blockSize;
};
var GGML_QUANT_SIZES = {
[import_tasks.GGMLQuantizationType.F32]: calcBPW(1, 4),
[import_tasks.GGMLQuantizationType.F16]: calcBPW(1, 2),
[import_tasks.GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
[import_tasks.GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
[import_tasks.GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
[import_tasks.GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
[import_tasks.GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
[import_tasks.GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
[import_tasks.GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
[import_tasks.GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
[import_tasks.GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
[import_tasks.GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
[import_tasks.GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
[import_tasks.GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
[import_tasks.GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
[import_tasks.GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
[import_tasks.GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
[import_tasks.GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
[import_tasks.GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
[import_tasks.GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
[import_tasks.GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
[import_tasks.GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
[import_tasks.GGMLQuantizationType.I8]: calcBPW(1, 1),
[import_tasks.GGMLQuantizationType.I16]: calcBPW(1, 2),
[import_tasks.GGMLQuantizationType.I32]: calcBPW(1, 4),
[import_tasks.GGMLQuantizationType.I64]: calcBPW(1, 8),
[import_tasks.GGMLQuantizationType.F64]: calcBPW(1, 8),
[import_tasks.GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
[import_tasks.GGMLQuantizationType.BF16]: calcBPW(1, 2),
[import_tasks.GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
[import_tasks.GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
[import_tasks.GGMLQuantizationType.MXFP4]: calcBPW(32, 1 + 16)
};
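// Worked example: Q4_K packs 256 weights into 2 + 2 + 256/2 + 12 = 144 bytes,
// so calcBPW(256, 144) = 144 * 8 / 256 = 4.5 bits per weight, matching the
// "4.5 bits-per-weight" figure in GGUF_QUANT_DESCRIPTIONS above.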
// src/gguf.ts
var import_tasks2 = require("@huggingface/tasks");
var RE_GGUF_FILE = /\.gguf$/;
var RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
var GGUF_DEFAULT_ALIGNMENT = 32;
var GGML_PAD = (x, n) => x + n - 1 & ~(n - 1);
var PARALLEL_DOWNLOADS = 20;
var GGUF_MAGIC_NUMBER = new Uint8Array([71, 71, 85, 70]);
function parseGgufShardFilename(filename) {
const match = RE_GGUF_SHARD_FILE.exec(filename);
if (match && match.groups) {
return {
prefix: match.groups["prefix"],
shard: match.groups["shard"],
total: match.groups["total"]
};
}
return null;
}
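/*
 * parseGgufShardFilename examples:
 *
 *   parseGgufShardFilename("grok-1-00003-of-00009.gguf");
 *   // => { prefix: "grok-1", shard: "00003", total: "00009" }
 *   parseGgufShardFilename("model.gguf"); // => null (not a shard)
 */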
var isVersion = (version) => version === 1 || version === 2 || version === 3;
function isGGUFValueType(n) {
return typeof GGUFValueType[n] === "string";
}
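// isGGUFValueType leans on the numeric enum's reverse mapping built above:
// GGUFValueType[8] === "STRING" (valid), while GGUFValueType[13] === undefined.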
var HTTP_CHUNK_SIZE = 2 * 10 ** 6;
var HTTP_DATA_LEEWAY = 5 * 10 ** 5;
var HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6;
var RangeView = class {
constructor(uri, params) {
this.uri = uri;
this.params = params;
this.chunk = 0;
this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
this.dataView = new DataView(this.buffer);
}
chunk;
buffer;
dataView;
get view() {
return this.dataView;
}
/**
* Fetches a new chunk from the server.
*/
async fetchChunk() {
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
const buf = new Uint8Array(
await (await (this.params?.fetch ?? fetch)(this.uri, {
headers: {
...this.params?.additionalFetchHeaders ?? {},
Range: `bytes=${range[0]}-${range[1]}`
}
})).arrayBuffer()
);
this.appendBuffer(buf);
this.chunk += 1;
}
/**
* Appends new data to the buffer.
*/
appendBuffer(buf) {
if (ArrayBuffer.prototype.resize) {
this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
} else {
const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
const arrView = new Uint8Array(newBuffer);
arrView.set(new Uint8Array(this.buffer));
arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
this.buffer = newBuffer;
this.dataView = new DataView(this.buffer);
}
}
/**
* Checks whether we need to fetch a new chunk.
*/
async fetchChunkIfNeeded(offset) {
if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
await this.fetchChunk();
}
}
};
var RangeViewLocalFile = class extends RangeView {
/**
* Reads a new chunk from the local file system.
*/
async fetchChunk() {
const { FileBlob: FileBlob2 } = await Promise.resolve().then(() => (init_FileBlob(), FileBlob_exports));
const blob = await FileBlob2.create(this.uri);
// Unlike an HTTP Range header (inclusive end), Blob.slice takes an exclusive end offset.
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE];
const buffer = await blob.slice(range[0], range[1]).arrayBuffer();
this.appendBuffer(new Uint8Array(buffer));
this.chunk += 1;
}
};
function readVersionedSize(view, byteOffset, version, littleEndian) {
switch (version) {
case 1: {
const n = view.getUint32(byteOffset, littleEndian);
return { value: BigInt(n), length: 4 };
}
case 2:
case 3: {
return { value: view.getBigUint64(byteOffset, littleEndian), length: 8 };
}
}
}
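// Version note: GGUF v1 stores sizes as 32-bit integers (length 4), while v2/v3
// use 64-bit integers (length 8); the value is normalized to a BigInt either way.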
function readString(view, offset, version, littleEndian) {
const length = readVersionedSize(view, offset, version, littleEndian);
const off = length.length;
const value = new TextDecoder().decode(view.buffer.slice(offset + off, offset + off + Number(length.value)));
return { value, length: off + Number(length.value) };
}
function readMetadataValue(view, type, offset, version, littleEndian) {
switch (type) {
case 0 /* UINT8 */:
return { value: view.getUint8(offset), length: 1 };
case 1 /* INT8 */:
return { value: view.getInt8(offset), length: 1 };
case 2 /* UINT16 */:
return { value: view.getUint16(offset, littleEndian), length: 2 };
case 3 /* INT16 */:
return { value: view.getInt16(offset, littleEndian), length: 2 };
case 4 /* UINT32 */:
return { value: view.getUint32(offset, littleEndian), length: 4 };
case 5 /* INT32 */:
return { value: view.getInt32(offset, littleEndian), length: 4 };
case 6 /* FLOAT32 */:
return { value: view.getFloat32(offset, littleEndian), length: 4 };
case 7 /* BOOL */:
return { value: view.getUint8(offset) !== 0, length: 1 };
case 8 /* STRING */:
return readString(view, offset, version, littleEndian);
case 9 /* ARRAY */: {
const arrayType = view.getUint32(offset, littleEndian);
const arrayLength = readVersionedSize(view, offset + 4, version, littleEndian);
let length = 4 + arrayLength.length;
const arrayValues = [];
for (let i = 0; i < arrayLength.value; i++) {
const metadataValue = readMetadataValue(view, arrayType, offset + length, version, littleEndian);
arrayValues.push(metadataValue.value);
length += metadataValue.length;
}
return { value: arrayValues, length };
}
case 10 /* UINT64 */:
return { value: view.getBigUint64(offset, littleEndian), length: 8 };
case 11 /* INT64 */:
return { value: view.getBigInt64(offset, littleEndian), length: 8 };
case 12 /* FLOAT64 */:
return { value: view.getFloat64(offset, littleEndian), length: 8 };
}
}
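/*
 * Layout example: a v3 little-endian STRING value is [uint64 length][utf-8 bytes],
 * so the 13 bytes 05 00 00 00 00 00 00 00 'h' 'e' 'l' 'l' 'o' decode as
 * { value: "hello", length: 13 }.
 */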
async function gguf(uri, params) {
let r;
if (isBackend) {
if (uri.match(/^https?:\/\//)) {
r = new RangeView(uri, params);
} else if (params?.allowLocalFile) {
r = new RangeViewLocalFile(uri, params);
} else {
throw new Error("Access to local file is not enabled, please set allowLocalFile to true");
}
} else {
if (params?.allowLocalFile) {
throw new Error("allowLocalFile cannot be used on browser");
}
r = new RangeView(uri, params);
}
await r.fetchChunk();
const checkBuffer = (buffer, header) => {
for (let i = 0; i < header.length; i++) {
if (header[i] !== buffer[i]) {
return false;
}
}
return true;
};
if (!checkBuffer(new Uint8Array(r.view.buffer.slice(0, 4)), GGUF_MAGIC_NUMBER)) {
throw new Error("not a valid gguf file: not starting with GGUF magic number");
}
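// Endianness probe: valid GGUF versions are tiny (1-3), so if a little-endian
// read of the version field has non-zero low 16 bits the file is little-endian;
// otherwise the same bytes are re-read as big-endian.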
const [littleEndian, version] = (() => {
const version2 = r.view.getUint32(4, true);
if (version2 & 65535) {
return [true, version2];
} else {
return [false, r.view.getUint32(4, false)];
}
})();
if (!isVersion(version)) {
throw new Error(`not a valid gguf file: unsupported version "${version}"`);
}
let offset = 8;
const tensorCount = readVersionedSize(r.view, offset, version, littleEndian);
offset += tensorCount.length;
const numKv = readVersionedSize(r.view, offset, version, littleEndian);
offset += numKv.length;
const metadata = {
version,
tensor_count: tensorCount.value,
kv_count: numKv.value
};
let typedMetadata;
if (params?.typedMetadata) {
typedMetadata = {
version: { value: version, type: 4 /* UINT32 */ },
tensor_count: {
value: tensorCount.value,
type: version === 1 ? 4 /* UINT32 */ : 10 /* UINT64 */
},
kv_count: {
value: numKv.value,
type: version === 1 ? 4 /* UINT32 */ : 10 /* UINT64 */
}
};
}
for (let i = 0; i < numKv.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const valueType = r.view.getUint32(offset, littleEndian);
offset += 4;
if (!isGGUFValueType(valueType)) {
throw new Error("Unsupported metadata type: " + valueType);
}
let valueResult;
while (!valueResult) {
try {
valueResult = readMetadataValue(r.view, valueType, offset, version, littleEndian);
} catch (err) {
if (err instanceof RangeError) {
await r.fetchChunk();
} else {
throw err;
}
}
}
offset += valueResult.length;
metadata[keyResult.value] = valueResult.value;
if (typedMetadata) {
const typedEntry = {
value: valueResult.value,
type: valueType
};
if (valueType === 9 /* ARRAY */) {
const arrayTypeOffset = offset - valueResult.length;
const arraySubType = r.view.getUint32(arrayTypeOffset, littleEndian);
if (isGGUFValueType(arraySubType)) {
typedEntry.subType = arraySubType;
}
}
typedMetadata[keyResult.value] = typedEntry;
}
}
const tensorInfos = [];
for (let i = 0; i < tensorCount.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const nDims = r.view.getUint32(offset, littleEndian);
offset += 4;
const shape = [];
for (let dim = 0; dim < nDims; dim++) {
const shapeDim = readVersionedSize(r.view, offset, version, littleEndian);
shape.push(shapeDim.value);
offset += shapeDim.length;
}
const type = r.view.getUint32(offset, littleEndian);
offset += 4;
const tensorOffset = r.view.getBigUint64(offset, littleEndian);
offset += 8;
tensorInfos.push({
name: keyResult.value,
n_dims: nDims,
shape,
dtype: type,
offset: tensorOffset
});
}
const alignment = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));
if (params?.computeParametersCount && params?.typedMetadata) {
const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
return {
metadata,
tensorInfos,
tensorDataOffset,
littleEndian,
parameterCount,
typedMetadata
};
} else if (params?.computeParametersCount) {
const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
return {
metadata,
tensorInfos,
tensorDataOffset,
littleEndian,
parameterCount
};
} else if (params?.typedMetadata) {
return {
metadata,
tensorInfos,
tensorDataOffset,
littleEndian,
typedMetadata
};
} else {
return { metadata, tensorInfos, tensorDataOffset, littleEndian };
}
}
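/*
 * gguf() option sketch (paths/URLs are hypothetical):
 *
 *   // Node.js only: parse a local file instead of a URL.
 *   const local = await gguf("./model.gguf", { allowLocalFile: true });
 *
 *   // Also return typed key/value entries and the total parameter count.
 *   const { typedMetadata, parameterCount } = await gguf(url, {
 *     typedMetadata: true,
 *     computeParametersCount: true,
 *   });
 */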
function writeVersionedSize(version, value, littleEndian) {
switch (version) {
case 1: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setUint32(0, Number(value), littleEndian);
return new Uint8Array(buffer);
}
case 2:
case 3: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setBigUint64(0, value, littleEndian);
return new Uint8Array(buffer);
}
}
}
function writeString(value, version, littleEndian) {
const stringBytes = new TextEncoder().encode(value);
const lengthBytes = writeVersionedSize(version, BigInt(stringBytes.length), littleEndian);
const result = new Uint8Array(lengthBytes.length + stringBytes.length);
result.set(lengthBytes, 0);
result.set(stringBytes, lengthBytes.length);
return result;
}
function writeMetadataValue(value, type, version, littleEndian, subType) {
switch (type) {
case 0 /* UINT8 */: {
const buffer = new ArrayBuffer(1);
const view = new DataView(buffer);
view.setUint8(0, value);
return new Uint8Array(buffer);
}
case 1 /* INT8 */: {
const buffer = new ArrayBuffer(1);
const view = new DataView(buffer);
view.setInt8(0, value);
return new Uint8Array(buffer);
}
case 2 /* UINT16 */: {
const buffer = new ArrayBuffer(2);
const view = new DataView(buffer);
view.setUint16(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 3 /* INT16 */: {
const buffer = new ArrayBuffer(2);
const view = new DataView(buffer);
view.setInt16(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 4 /* UINT32 */: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setUint32(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 5 /* INT32 */: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setInt32(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 6 /* FLOAT32 */: {
const buffer = new ArrayBuffer(4);
const view = new DataView(buffer);
view.setFloat32(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 7 /* BOOL */: {
const buffer = new ArrayBuffer(1);
const view = new DataView(buffer);
view.setUint8(0, value ? 1 : 0);
return new Uint8Array(buffer);
}
case 8 /* STRING */: {
return writeString(value, version, littleEndian);
}
case 9 /* ARRAY */: {
// subType may legitimately be 0 (UINT8), so check for undefined rather than falsiness.
if (subType === void 0) {
throw new Error("Array type requires subType to be specified");
}
const arrayValue = value;
const arrayTypeBuffer = new ArrayBuffer(4);
const arrayTypeView = new DataView(arrayTypeBuffer);
arrayTypeView.setUint32(0, subType, littleEndian);
const arrayTypeBytes = new Uint8Array(arrayTypeBuffer);
const lengthBytes = writeVersionedSize(version, BigInt(arrayValue.length), littleEndian);
const elementBytes = [];
for (const element of arrayValue) {
elementBytes.push(writeMetadataValue(element, subType, version, littleEndian));
}
const totalLength = arrayTypeBytes.length + lengthBytes.length + elementBytes.reduce((sum, bytes) => sum + bytes.length, 0);
const result = new Uint8Array(totalLength);
let offset = 0;
result.set(arrayTypeBytes, offset);
offset += arrayTypeBytes.length;
result.set(lengthBytes, offset);
offset += lengthBytes.length;
for (const bytes of elementBytes) {
result.set(bytes, offset);
offset += bytes.length;
}
return result;
}
case 10 /* UINT64 */: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setBigUint64(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 11 /* INT64 */: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setBigInt64(0, value, littleEndian);
return new Uint8Array(buffer);
}
case 12 /* FLOAT64 */: {
const buffer = new ArrayBuffer(8);
const view = new DataView(buffer);
view.setFloat64(0, value, littleEndian);
return new Uint8Array(buffer);
}
default:
throw new Error(`Unsupported value type: ${type}`);
}
}
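/*
 * Serialization example: a UINT32 (type 4) of 42, little-endian, becomes the
 * 4 bytes [42, 0, 0, 0]; an ARRAY (type 9) is prefixed by a uint32 element
 * subType and a versioned element count before the packed elements.
 */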
function serializeGgufMetadata(typedMetadata, options = {}) {
const littleEndian = options.littleEndian ?? true;
const alignment = options.alignment ?? 32;
const version = typedMetadata.version.value;
const versionBuffer = new ArrayBuffer(4);
const versionView = new DataView(versionBuffer);
versionView.setUint32(0, version, littleEndian);
const versionBytes = new Uint8Array(versionBuffer);
const tensorCountBytes = writeVersionedSize(version, typedMetadata.tensor_count.value, littleEndian);
const kvEntries = Object.entries(typedMetadata).filter(
([key]) => !["version", "tensor_count", "kv_count"].includes(key)
);
const kvCount = BigInt(kvEntries.length);
const kvCountBytes = writeVersionedSize(version, kvCount, littleEndian);
const kvBytes = [];
for (const [key, entry] of kvEntries) {
const keyBytes = writeString(key, version, littleEndian);
kvBytes.push(keyBytes);
const valueTypeBuffer = new ArrayBuffer(4);
const valueTypeView = new DataView(valueTypeBuffer);
valueTypeView.setUint32(0, entry.type, littleEndian);
const valueTypeBytes = new Uint8Array(valueTypeBuffer);
kvBytes.push(valueTypeBytes);
if (entry.value === void 0) {
throw new Error(`Value for key "${key}" is undefined`);
}
const valueBytes = writeMetadataValue(
entry.value,
entry.type,
version,
littleEndian,
"subType" in entry ? entry.subType : void 0
);
kvBytes.push(valueBytes);
}
const preAlignmentSize = GGUF_MAGIC_NUMBER.length + versionBytes.length + tensorCountBytes.length + kvCountBytes.length + kvBytes.reduce((sum, bytes) => sum + bytes.length, 0);
const GGML_PAD2 = (x, n) => x + n - 1 & ~(n - 1);
const alignedSize = GGML_PAD2(preAlignmentSize, alignment);
const result = new Uint8Array(alignedSize);
let offset = 0;
result.set(GGUF_MAGIC_NUMBER, offset);
offset += GGUF_MAGIC_NUMBER.length;
result.set(versionBytes, offset);
offset += versionBytes.length;
result.set(tensorCountBytes, offset);
offset += tensorCountBytes.length;
result.set(kvCountBytes, offset);
offset += kvCountBytes.length;
for (const bytes of kvBytes) {
result.set(bytes, offset);
offset += bytes.length;
}
return result;
}
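/*
 * Round-trip sketch (hypothetical URL): read typed metadata with gguf(), then
 * re-serialize the header portion of the file.
 *
 *   const { typedMetadata } = await gguf("https://example.com/model.gguf", { typedMetadata: true });
 *   const headerBytes = serializeGgufMetadata(typedMetadata, { littleEndian: true, alignment: 32 });
 */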
async function ggufAllShards(url, params) {
const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
if (parallelDownloads < 1) {
throw new TypeError("parallelDownloads must be greater than 0");
}
const ggufShardFileInfo = parseGgufShardFilename(url);
if (ggufShardFileInfo) {
const total = parseInt(ggufShardFileInfo.total);
const prefix = ggufShardFileInfo.prefix;
const urls = [];
for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
}
const shards = await promisesQueue(
urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
parallelDownloads
);
return {
shards,
parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0)
};
} else {
const { metadata, tensorInfos, tensorDataOffset, littleEndian, parameterCount } = await gguf(url, {
...params,
computeParametersCount: true
});
return { shards: [{ metadata, tensorInfos, tensorDataOffset, littleEndian }], parameterCount };
}
}
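/*
 * ggufAllShards sketch (hypothetical URL): point at any shard; sibling
 * "-NNNNN-of-NNNNN.gguf" URLs are derived from its name and fetched in parallel.
 *
 *   const { shards, parameterCount } = await ggufAllShards(
 *     "https://example.com/model-00001-of-00004.gguf",
 *     { parallelDownloads: 4 }
 *   );
 */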
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
GGMLFileQuantizationType,
GGMLQuantizationType,
GGUFValueType,
GGUF_QUANT_DESCRIPTIONS,
GGUF_QUANT_ORDER,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
RE_GGUF_FILE,
RE_GGUF_SHARD_FILE,
findNearestQuantType,
gguf,
ggufAllShards,
parseGGUFQuantLabel,
parseGgufShardFilename,
serializeGgufMetadata
});