@huggingface/gguf
a GGUF parser that works on remotely hosted files
// src/transformer-llm.ts
var LLM_ARCHITECTURES = [
"llama",
"deci",
"falcon",
"grok",
"gpt2",
"gptj",
"gptneox",
"mpt",
"baichuan",
"starcoder",
"refact",
"bert",
"nomic-bert",
"jina-bert-v2",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"qwen2vl",
"phi2",
"phi3",
"phimoe",
"plamo",
"codeshell",
"orion",
"internlm2",
"minicpm",
"minicpm3",
"gemma",
"gemma2",
"starcoder2",
"mamba",
"xverse",
"command-r",
"cohere2",
"dbrx",
"olmo",
"olmo2",
"olmoe",
"openelm",
"arctic",
"deepseek",
"deepseek2",
"chatglm",
"bitnet",
"t5",
"t5encoder",
"jais",
"nemotron",
"exaone",
"rwkv6",
"rwkv6qwen2",
"granite",
"granitemoe",
"chameleon",
"wavtokenizer-dec"
];
// src/types.ts
import { GGMLQuantizationType } from "@huggingface/tasks";
var GGUFValueType = /* @__PURE__ */ ((GGUFValueType2) => {
GGUFValueType2[GGUFValueType2["UINT8"] = 0] = "UINT8";
GGUFValueType2[GGUFValueType2["INT8"] = 1] = "INT8";
GGUFValueType2[GGUFValueType2["UINT16"] = 2] = "UINT16";
GGUFValueType2[GGUFValueType2["INT16"] = 3] = "INT16";
GGUFValueType2[GGUFValueType2["UINT32"] = 4] = "UINT32";
GGUFValueType2[GGUFValueType2["INT32"] = 5] = "INT32";
GGUFValueType2[GGUFValueType2["FLOAT32"] = 6] = "FLOAT32";
GGUFValueType2[GGUFValueType2["BOOL"] = 7] = "BOOL";
GGUFValueType2[GGUFValueType2["STRING"] = 8] = "STRING";
GGUFValueType2[GGUFValueType2["ARRAY"] = 9] = "ARRAY";
GGUFValueType2[GGUFValueType2["UINT64"] = 10] = "UINT64";
GGUFValueType2[GGUFValueType2["INT64"] = 11] = "INT64";
GGUFValueType2[GGUFValueType2["FLOAT64"] = 12] = "FLOAT64";
return GGUFValueType2;
})(GGUFValueType || {});
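// The pattern above is a compiled TypeScript numeric enum: it maps both name -> number and
// number -> name, and isGGUFValueType() below relies on that reverse mapping.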
var ARCHITECTURES = [...LLM_ARCHITECTURES, "rwkv", "whisper"];
// src/utils/isBackend.ts
var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
var isBackend = !isBrowser && !isWebWorker;
// src/utils/promisesQueue.ts
async function promisesQueue(factories, concurrency) {
const results = [];
const executing = /* @__PURE__ */ new Set();
let index = 0;
for (const factory of factories) {
const closureIndex = index++;
const e = factory().then((r) => {
results[closureIndex] = r;
executing.delete(e);
});
executing.add(e);
if (executing.size >= concurrency) {
await Promise.race(executing);
}
}
await Promise.all(executing);
return results;
}
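// Illustrative sketch (not part of the published bundle): promisesQueue runs at most
// `concurrency` factories at once and stores results in input order. The helper name and
// the URLs passed in are placeholders supplied by the caller.
async function exampleFetchWithQueue(urls) {
  // Each factory defers its request until the queue starts it.
  const factories = urls.map((url) => () => fetch(url).then((res) => res.text()));
  // At most 3 requests are in flight at any time.
  return promisesQueue(factories, 3);
}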
// src/quant-descriptions.ts
var GGUF_QUANT_DESCRIPTIONS = {
[GGMLQuantizationType.F32]: {
txt: "32-bit standard IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format"
},
[GGMLQuantizationType.F16]: {
txt: "16-bit standard IEEE 754 half-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format"
},
[GGMLQuantizationType.Q8_0]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[GGMLQuantizationType.Q8_1]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[GGMLQuantizationType.Q8_K]: {
txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q6_K]: {
txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q5_0]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[GGMLQuantizationType.Q5_1]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[GGMLQuantizationType.Q5_K]: {
txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q4_0]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249"
},
[GGMLQuantizationType.Q4_1]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290"
},
[GGMLQuantizationType.Q4_K]: {
txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q3_K]: {
txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.Q2_K]: {
txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305"
},
[GGMLQuantizationType.IQ4_XS]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ3_S]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ3_XXS]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ2_S]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ2_XS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ2_XXS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ1_S]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
src_url: "https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70"
},
[GGMLQuantizationType.IQ4_NL]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/5590"
},
[GGMLQuantizationType.I8]: {
txt: "8-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[GGMLQuantizationType.I16]: {
txt: "16-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[GGMLQuantizationType.I32]: {
txt: "32-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045"
},
[GGMLQuantizationType.I64]: {
txt: "64-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6062"
},
[GGMLQuantizationType.F64]: {
txt: "64-bit standard IEEE 754 double-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format"
},
[GGMLQuantizationType.IQ1_M]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6302"
},
[GGMLQuantizationType.BF16]: {
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format"
},
[GGMLQuantizationType.TQ1_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
},
[GGMLQuantizationType.TQ2_0]: {
txt: "Ternary quantization.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151"
}
};
var QK_K = 256;
var calcBPW = (blockSize, typeSize) => {
return typeSize * 8 / blockSize;
};
var GGML_QUANT_SIZES = {
[GGMLQuantizationType.F32]: calcBPW(1, 4),
[GGMLQuantizationType.F16]: calcBPW(1, 2),
[GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
[GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
[GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
[GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
[GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
[GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
[GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
[GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
[GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
[GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
[GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
[GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
[GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
[GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
[GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
[GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
[GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
[GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
[GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
[GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
[GGMLQuantizationType.I8]: calcBPW(1, 1),
[GGMLQuantizationType.I16]: calcBPW(1, 2),
[GGMLQuantizationType.I32]: calcBPW(1, 4),
[GGMLQuantizationType.I64]: calcBPW(1, 8),
[GGMLQuantizationType.F64]: calcBPW(1, 8),
[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
[GGMLQuantizationType.BF16]: calcBPW(1, 2),
[GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
[GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64)
};
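// Illustrative sketch (not part of the published bundle): GGML_QUANT_SIZES maps each
// quantization type to bits-per-weight, so a tensor's approximate byte size is
// elementCount * bpw / 8. For example Q4_K packs 256 weights into 144 bytes (4.5 bpw).
// The helper name is hypothetical.
function estimateTensorBytes(elementCount, dtype) {
  const bitsPerWeight = GGML_QUANT_SIZES[dtype];
  return Math.ceil((elementCount * bitsPerWeight) / 8);
}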
// src/gguf.ts
import {
parseGGUFQuantLabel,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
GGUF_QUANT_ORDER,
findNearestQuantType,
GGMLFileQuantizationType
} from "@huggingface/tasks";
var RE_GGUF_FILE = /\.gguf$/;
var RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
var GGUF_DEFAULT_ALIGNMENT = 32;
var GGML_PAD = (x, n) => x + n - 1 & ~(n - 1);
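// e.g. GGML_PAD(1234, 32) === 1248: rounds an offset up to the next multiple of the alignment.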
var PARALLEL_DOWNLOADS = 20;
function parseGgufShardFilename(filename) {
const match = RE_GGUF_SHARD_FILE.exec(filename);
if (match && match.groups) {
return {
prefix: match.groups["prefix"],
shard: match.groups["shard"],
total: match.groups["total"]
};
}
return null;
}
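// Illustrative example (not part of the published bundle); the filenames are hypothetical:
//   parseGgufShardFilename("grok-1-00003-of-00009.gguf")
//   // => { prefix: "grok-1", shard: "00003", total: "00009" }
//   parseGgufShardFilename("model.gguf")
//   // => null (single-file model, not a shard)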
var isVersion = (version) => version === 1 || version === 2 || version === 3;
var ggufMagicNumber = new Uint8Array([71, 71, 85, 70]);
function isGGUFValueType(n) {
return typeof GGUFValueType[n] === "string";
}
var HTTP_CHUNK_SIZE = 2 * 10 ** 6;
var HTTP_DATA_LEEWAY = 5 * 10 ** 5;
var HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6;
var RangeView = class {
constructor(uri, params) {
this.uri = uri;
this.params = params;
this.chunk = 0;
this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
this.dataView = new DataView(this.buffer);
}
chunk;
buffer;
dataView;
get view() {
return this.dataView;
}
/**
* Fetch a new chunk from the server
*/
async fetchChunk() {
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
const buf = new Uint8Array(
await (await (this.params?.fetch ?? fetch)(this.uri, {
headers: {
...this.params?.additionalFetchHeaders ?? {},
Range: `bytes=${range[0]}-${range[1]}`
}
})).arrayBuffer()
);
this.appendBuffer(buf);
this.chunk += 1;
}
/**
* Append new data into the buffer
*/
appendBuffer(buf) {
if (ArrayBuffer.prototype.resize) {
this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
} else {
const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
const arrView = new Uint8Array(newBuffer);
arrView.set(new Uint8Array(this.buffer));
arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
this.buffer = newBuffer;
this.dataView = new DataView(this.buffer);
}
}
/**
* Check whether we need to fetch a new chunk
*/
async fetchChunkIfNeeded(offset) {
if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
await this.fetchChunk();
}
}
};
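// Illustrative sketch (not part of the published bundle): RangeView grows its backing
// buffer HTTP_CHUNK_SIZE bytes at a time via HTTP Range requests, so callers can read the
// GGUF header lazily without downloading the whole file. The helper name and URL are placeholders.
async function examplePeekVersion(url) {
  const view = new RangeView(url);
  await view.fetchChunk(); // fetches bytes 0 .. HTTP_CHUNK_SIZE - 1
  return view.view.getUint32(4, true); // GGUF version follows the 4-byte magic (little-endian assumed)
}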
var RangeViewLocalFile = class extends RangeView {
/**
* Read a new chunk from local file system.
*/
async fetchChunk() {
const { FileBlob } = await import("./FileBlob-GRW5ZULE.mjs");
const blob = await FileBlob.create(this.uri);
const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
const buffer = await blob.slice(range[0], range[1]).arrayBuffer();
this.appendBuffer(new Uint8Array(buffer));
this.chunk += 1;
}
};
function readVersionedSize(view, byteOffset, version, littleEndian) {
switch (version) {
case 1: {
const n = view.getUint32(byteOffset, littleEndian);
return { value: BigInt(n), length: 4 };
}
case 2:
case 3: {
return { value: view.getBigUint64(byteOffset, littleEndian), length: 8 };
}
}
}
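// GGUF v1 stored sizes and counts as uint32; v2 and v3 widened them to uint64, hence the
// BigInt value and the variable byte length returned here.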
function readString(view, offset, version, littleEndian) {
const length = readVersionedSize(view, offset, version, littleEndian);
const off = length.length;
const value = new TextDecoder().decode(view.buffer.slice(offset + off, offset + off + Number(length.value)));
return { value, length: off + Number(length.value) };
}
function readMetadataValue(view, type, offset, version, littleEndian) {
switch (type) {
case 0 /* UINT8 */:
return { value: view.getUint8(offset), length: 1 };
case 1 /* INT8 */:
return { value: view.getInt8(offset), length: 1 };
case 2 /* UINT16 */:
return { value: view.getUint16(offset, littleEndian), length: 2 };
case 3 /* INT16 */:
return { value: view.getInt16(offset, littleEndian), length: 2 };
case 4 /* UINT32 */:
return { value: view.getUint32(offset, littleEndian), length: 4 };
case 5 /* INT32 */:
return { value: view.getInt32(offset, littleEndian), length: 4 };
case 6 /* FLOAT32 */:
return { value: view.getFloat32(offset, littleEndian), length: 4 };
case 7 /* BOOL */:
return { value: view.getUint8(offset) !== 0, length: 1 };
case 8 /* STRING */:
return readString(view, offset, version, littleEndian);
case 9 /* ARRAY */: {
const arrayType = view.getUint32(offset, littleEndian);
const arrayLength = readVersionedSize(view, offset + 4, version, littleEndian);
let length = 4 + arrayLength.length;
const arrayValues = [];
for (let i = 0; i < arrayLength.value; i++) {
const metadataValue = readMetadataValue(view, arrayType, offset + length, version, littleEndian);
arrayValues.push(metadataValue.value);
length += metadataValue.length;
}
return { value: arrayValues, length };
}
case 10 /* UINT64 */:
return { value: view.getBigUint64(offset, littleEndian), length: 8 };
case 11 /* INT64 */:
return { value: view.getBigInt64(offset, littleEndian), length: 8 };
case 12 /* FLOAT64 */:
return { value: view.getFloat64(offset, littleEndian), length: 8 };
}
}
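// Each metadata entry on disk is laid out as [string key][uint32 value type][value];
// ARRAY values (type 9) nest as [uint32 element type][versioned length][elements...],
// which is why readMetadataValue recurses above.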
async function gguf(uri, params) {
let r;
if (isBackend) {
if (uri.match(/^https?:\/\//)) {
r = new RangeView(uri, params);
} else if (params?.allowLocalFile) {
r = new RangeViewLocalFile(uri, params);
} else {
throw new Error("Access to local file is not enabled, please set allowLocalFile to true");
}
} else {
if (params?.allowLocalFile) {
throw new Error("allowLocalFile cannot be used on browser");
}
r = new RangeView(uri, params);
}
await r.fetchChunk();
const checkBuffer = (buffer, header) => {
for (let i = 0; i < header.length; i++) {
if (header[i] !== buffer[i]) {
return false;
}
}
return true;
};
if (!checkBuffer(new Uint8Array(r.view.buffer.slice(0, 4)), ggufMagicNumber)) {
throw new Error("not a valid gguf file: not starting with GGUF magic number");
}
const [littleEndian, version] = (() => {
const version2 = r.view.getUint32(4, true);
if (version2 & 65535) {
return [true, version2];
} else {
return [false, r.view.getUint32(4, false)];
}
})();
if (!isVersion(version)) {
throw new Error(`not a valid gguf file: unsupported version "${version}"`);
}
let offset = 8;
const tensorCount = readVersionedSize(r.view, offset, version, littleEndian);
offset += tensorCount.length;
const numKv = readVersionedSize(r.view, offset, version, littleEndian);
offset += numKv.length;
const metadata = {
version,
tensor_count: tensorCount.value,
kv_count: numKv.value
};
for (let i = 0; i < numKv.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const valueType = r.view.getUint32(offset, littleEndian);
offset += 4;
if (!isGGUFValueType(valueType)) {
throw new Error("Unsupported metadata type: " + valueType);
}
let valueResult;
while (!valueResult) {
try {
valueResult = readMetadataValue(r.view, valueType, offset, version, littleEndian);
} catch (err) {
if (err instanceof RangeError) {
await r.fetchChunk();
} else {
throw err;
}
}
}
offset += valueResult.length;
metadata[keyResult.value] = valueResult.value;
}
const tensorInfos = [];
for (let i = 0; i < tensorCount.value; i++) {
await r.fetchChunkIfNeeded(offset);
const keyResult = readString(r.view, offset, version, littleEndian);
offset += keyResult.length;
const nDims = r.view.getUint32(offset, littleEndian);
offset += 4;
const shape = [];
for (let dim = 0; dim < nDims; dim++) {
const shapeDim = readVersionedSize(r.view, offset, version, littleEndian);
shape.push(shapeDim.value);
offset += shapeDim.length;
}
const type = r.view.getUint32(offset, littleEndian);
offset += 4;
const tensorOffset = r.view.getBigUint64(offset, littleEndian);
offset += 8;
tensorInfos.push({
name: keyResult.value,
n_dims: nDims,
shape,
dtype: type,
offset: tensorOffset
});
}
const alignment = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));
if (params?.computeParametersCount) {
const parameterCount = tensorInfos.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1)).reduce((acc, val) => acc + val, 0);
return { metadata, tensorInfos, tensorDataOffset, parameterCount };
} else {
return { metadata, tensorInfos, tensorDataOffset };
}
}
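// Illustrative sketch (not part of the published bundle): typical usage of gguf() against a
// remotely hosted file. The helper name and URL are placeholders.
async function exampleReadHeader() {
  const url = "https://example.com/model.gguf"; // placeholder model URL
  const { metadata, tensorInfos, tensorDataOffset } = await gguf(url);
  console.log(metadata["general.architecture"]); // e.g. "llama"
  console.log(tensorInfos.length, "tensors; data section starts at byte", tensorDataOffset);
}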
async function ggufAllShards(url, params) {
const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
if (parallelDownloads < 1) {
throw new TypeError("parallelDownloads must be greater than 0");
}
const ggufShardFileInfo = parseGgufShardFilename(url);
if (ggufShardFileInfo) {
const total = parseInt(ggufShardFileInfo.total);
const prefix = ggufShardFileInfo.prefix;
const urls = [];
for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
}
const shards = await promisesQueue(
urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
parallelDownloads
);
return {
shards,
parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0)
};
} else {
const { metadata, tensorInfos, tensorDataOffset, parameterCount } = await gguf(url, {
...params,
computeParametersCount: true
});
return { shards: [{ metadata, tensorInfos, tensorDataOffset }], parameterCount };
}
}
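// Illustrative sketch (not part of the published bundle): ggufAllShards recognizes the
// "<prefix>-00001-of-000NN.gguf" naming scheme, fetches every shard header (up to
// PARALLEL_DOWNLOADS at a time) and sums the parameter counts. The helper name and URL are placeholders.
async function exampleCountParameters() {
  const { shards, parameterCount } = await ggufAllShards("https://example.com/model-00001-of-00004.gguf");
  console.log(`${shards.length} shards, ~${(parameterCount / 1e9).toFixed(1)}B parameters`);
}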
export {
GGUFValueType,
GGMLQuantizationType,
GGUF_QUANT_DESCRIPTIONS,
GGML_QUANT_SIZES,
RE_GGUF_FILE,
RE_GGUF_SHARD_FILE,
parseGgufShardFilename,
gguf,
ggufAllShards,
parseGGUFQuantLabel,
GGUF_QUANT_RE,
GGUF_QUANT_RE_GLOBAL,
GGUF_QUANT_ORDER,
findNearestQuantType,
GGMLFileQuantizationType
};