// @huggingface/gguf: a GGUF parser that works on remotely hosted files
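//
// A minimal usage sketch of the API exercised by these tests (the URL below is
// a placeholder, not a real checkpoint):
//
//   const { metadata, tensorInfos } = await gguf("https://huggingface.co/org/repo/resolve/main/model.gguf");
//   console.log(metadata["general.architecture"], tensorInfos.length);
//
// Local paths are opt-in via `gguf(path, { allowLocalFile: true })`, and sharded
// checkpoints can be aggregated with `ggufAllShards(url)`, as the tests below show.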
import { beforeAll, describe, expect, it } from "vitest";
import type { GGUFParseOutput } from "./gguf";
import {
GGMLFileQuantizationType,
GGMLQuantizationType,
gguf,
ggufAllShards,
parseGgufShardFilename,
parseGGUFQuantLabel,
GGUF_QUANT_ORDER,
findNearestQuantType,
} from "./gguf";
import fs from "node:fs";
const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/191239b/llama-2-7b-chat.Q2_K.gguf";
const URL_MISTRAL_7B =
"https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/3a6fbf4/mistral-7b-instruct-v0.2.Q5_K_M.gguf";
const URL_GEMMA_2B = "https://huggingface.co/lmstudio-ai/gemma-2b-it-GGUF/resolve/a0b140b/gemma-2b-it-q4_k_m.gguf";
const URL_BIG_ENDIAN =
"https://huggingface.co/ggml-org/models/resolve/1213976/bert-bge-small/ggml-model-f16-big-endian.gguf";
const URL_V1 =
"https://huggingface.co/tmadge/testing/resolve/66c078028d1ff92d7a9264a1590bc61ba6437933/tinyllamas-stories-260k-f32.gguf";
const URL_SHARDED_GROK =
"https://huggingface.co/Arki05/Grok-1-GGUF/resolve/ecafa8d8eca9b8cd75d11a0d08d3a6199dc5a068/grok-1-IQ3_XS-split-00001-of-00009.gguf";
const URL_BIG_METADATA = "https://huggingface.co/ngxson/test_gguf_models/resolve/main/gguf_test_big_metadata.gguf";
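// Note: most URLs above are pinned to a specific revision (`resolve/<sha>`) so the
// expected metadata and tensor layout in the assertions below stay stable over time.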
describe("gguf", () => {
beforeAll(async () => {
// download a gguf file for the "parse a local file" test and save it to the .cache directory
if (!fs.existsSync(".cache")) {
fs.mkdirSync(".cache");
}
if (!fs.existsSync(".cache/model.gguf")) {
const res = await fetch(URL_BIG_METADATA);
const arrayBuf = await res.arrayBuffer();
fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf));
}
}, 30_000);
it("should parse a llama2 7b", async () => {
const { metadata, tensorInfos } = await gguf(URL_LLAMA);
/// metadata
expect(metadata).toMatchObject({
// partial list; we don't match exhaustively (the tokenizer metadata alone is quite large)
version: 2,
tensor_count: 291n,
kv_count: 19n,
"general.architecture": "llama",
"general.file_type": GGMLFileQuantizationType.Q2_K,
"general.name": "LLaMA v2",
"general.quantization_version": 2,
"llama.attention.head_count": 32,
"llama.attention.head_count_kv": 32,
"llama.attention.layer_norm_rms_epsilon": 9.999999974752427e-7,
"llama.block_count": 32,
"llama.context_length": 4096,
"llama.embedding_length": 4096,
"llama.feed_forward_length": 11008,
"llama.rope.dimension_count": 128,
});
expect(metadata["tokenizer.ggml.model"]);
if (metadata["tokenizer.ggml.model"]) {
const tokens = metadata["tokenizer.ggml.tokens"];
if (!Array.isArray(tokens)) {
throw new Error("tokenizer.ggml.tokens is not an array");
}
expect(tokens.slice(0, 10)).toEqual([
"<unk>",
"<s>",
"</s>",
"<0x00>",
"<0x01>",
"<0x02>",
"<0x03>",
"<0x04>",
"<0x05>",
"<0x06>",
]);
}
/// Tensor infos
/// By convention we test the first and last tensor.
expect(tensorInfos.length).toEqual(291);
expect(tensorInfos[0]).toMatchObject({
name: "token_embd.weight",
shape: [4096n, 32000n],
dtype: GGMLQuantizationType.Q2_K,
});
expect(tensorInfos[tensorInfos.length - 1]).toMatchObject({
name: "output_norm.weight",
shape: [4096n],
dtype: GGMLQuantizationType.F32,
});
});
it("should parse a mistral 7b", async () => {
const { metadata, tensorInfos } = await gguf(URL_MISTRAL_7B);
/// metadata
expect(metadata).toMatchObject({
version: 3,
tensor_count: 291n,
kv_count: 24n,
"general.architecture": "llama",
"general.file_type": GGMLFileQuantizationType.Q5_K_M,
"general.name": "mistralai_mistral-7b-instruct-v0.2",
"general.quantization_version": 2,
"llama.attention.head_count": 32,
"llama.attention.head_count_kv": 8,
"llama.attention.layer_norm_rms_epsilon": 0.000009999999747378752,
"llama.block_count": 32,
"llama.context_length": 32768,
"llama.embedding_length": 4096,
"llama.feed_forward_length": 14336,
"llama.rope.dimension_count": 128,
});
/// Tensor infos
expect(tensorInfos.length).toEqual(291);
expect(tensorInfos[0]).toMatchObject({
name: "token_embd.weight",
shape: [4096n, 32000n],
dtype: GGMLQuantizationType.Q5_K,
});
expect(tensorInfos[tensorInfos.length - 1]).toMatchObject({
name: "output.weight",
shape: [4096n, 32000n],
dtype: GGMLQuantizationType.Q6_K,
});
});
it("should parse a gemma 2b", async () => {
const { metadata, tensorInfos } = await gguf(URL_GEMMA_2B);
/// metadata
expect(metadata).toMatchObject({
version: 3,
tensor_count: 164n,
kv_count: 21n,
"general.architecture": "gemma",
"general.file_type": GGMLFileQuantizationType.Q4_K_M,
"general.name": "gemma-2b-it",
"general.quantization_version": 2,
"gemma.attention.head_count": 8,
"gemma.attention.head_count_kv": 1,
"gemma.attention.layer_norm_rms_epsilon": 9.999999974752427e-7,
"gemma.block_count": 18,
"gemma.context_length": 8192,
"gemma.embedding_length": 2048,
"gemma.feed_forward_length": 16384,
});
/// Tensor infos
expect(tensorInfos.length).toEqual(164);
expect(tensorInfos[0]).toMatchObject({
name: "token_embd.weight",
shape: [2048n, 256128n],
dtype: GGMLQuantizationType.Q4_K,
});
expect(tensorInfos[tensorInfos.length - 1]).toMatchObject({
name: "blk.9.ffn_norm.weight",
shape: [2048n],
dtype: GGMLQuantizationType.F32,
});
});
it("should parse a big-endian file", async () => {
const { metadata, tensorInfos } = await gguf(URL_BIG_ENDIAN);
/// metadata
expect(metadata).toMatchObject({
version: 3,
tensor_count: 197n,
kv_count: 23n,
"general.architecture": "bert",
"general.file_type": GGMLFileQuantizationType.F16,
"general.name": "bge-small-en-v1.5",
"bert.attention.causal": false,
"bert.attention.head_count": 12,
"bert.attention.layer_norm_epsilon": 9.999999960041972e-13,
"bert.block_count": 12,
"bert.context_length": 512,
"bert.embedding_length": 384,
"bert.feed_forward_length": 1536,
"bert.pooling_type": 2,
});
/// Tensor infos
expect(tensorInfos.length).toEqual(197);
expect(tensorInfos[0]).toMatchObject({
name: "token_embd_norm.bias",
shape: [384n],
dtype: GGMLQuantizationType.F32,
});
expect(tensorInfos[tensorInfos.length - 1]).toMatchObject({
name: "blk.9.ffn_down.weight",
shape: [1536n, 384n],
dtype: GGMLQuantizationType.F16,
});
});
it("should parse a v1 file", async () => {
const { metadata, tensorInfos } = await gguf(URL_V1);
/// metadata
expect(metadata).toMatchObject({
version: 1,
tensor_count: 48n,
kv_count: 18n,
"general.architecture": "llama",
"general.name": "tinyllamas-stories-260k",
"llama.attention.head_count": 8,
"llama.attention.head_count_kv": 4,
"llama.attention.layer_norm_rms_epsilon": 0.000009999999747378752,
"llama.block_count": 5,
"llama.context_length": 512,
"llama.embedding_length": 64,
"llama.feed_forward_length": 172,
"llama.rope.dimension_count": 8,
"llama.tensor_data_layout": "Meta AI original pth",
"tokenizer.ggml.bos_token_id": 1,
"tokenizer.ggml.eos_token_id": 2,
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.padding_token_id": 0,
});
/// Tensor infos
expect(tensorInfos.length).toEqual(48);
expect(tensorInfos[0]).toMatchObject({
name: "token_embd.weight",
shape: [64n, 512n],
dtype: GGMLQuantizationType.F32,
});
expect(tensorInfos[tensorInfos.length - 1]).toMatchObject({
name: "output.weight",
shape: [64n, 512n],
dtype: GGMLQuantizationType.F32,
});
});
it("should parse a local file", async () => {
const parsedGguf = await gguf(".cache/model.gguf", { allowLocalFile: true });
const { metadata } = parsedGguf as GGUFParseOutput<{ strict: false }>; // model uses custom architecture metadata, so use the non-strict typing
expect(metadata["dummy.1"]).toBeDefined(); // first metadata in the list
expect(metadata["dummy.32767"]).toBeDefined(); // last metadata in the list
});
it("should detect sharded gguf filename", async () => {
const ggufPath = "grok-1/grok-1-q4_0-00003-of-00009.gguf"; // https://huggingface.co/ggml-org/models/blob/fcf344adb9686474c70e74dd5e55465e9e6176ef/grok-1/grok-1-q4_0-00003-of-00009.gguf
const ggufShardFileInfo = parseGgufShardFilename(ggufPath);
expect(ggufShardFileInfo?.prefix).toEqual("grok-1/grok-1-q4_0");
expect(ggufShardFileInfo?.shard).toEqual("00003");
expect(ggufShardFileInfo?.total).toEqual("00009");
});
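// Shard filenames follow the `<prefix>-<NNNNN>-of-<MMMMM>.gguf` convention seen in the
// path above; parseGgufShardFilename splits it into prefix, shard index, and total.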
it("should get param count for llama2 7b", async () => {
const { parameterCount } = await gguf(URL_LLAMA, { computeParametersCount: true });
expect(parameterCount).toEqual(6_738_415_616); // 7B
});
it("should get param count for sharded gguf", async () => {
const { parameterCount } = await ggufAllShards(URL_SHARDED_GROK);
expect(parameterCount).toEqual(316_490_127_360); // 316B
});
it("parse quant label", async () => {
expect(parseGGUFQuantLabel("Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf")).toEqual("Q4_K_M");
expect(parseGGUFQuantLabel("subdir/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf")).toEqual("Q4_K_M");
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q2_K.gguf")).toEqual("Q2_K");
expect(parseGGUFQuantLabel("Codestral-22B-v0.1.gguf")).toEqual(undefined);
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-F32-Q2_K.gguf")).toEqual("Q2_K"); // gguf name with two quant labels [F32, Q2_K]
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual("IQ3_XS");
expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q4_0_4_4.gguf")).toEqual("Q4_0"); // TODO: investigate Q4_0_4_4
});
it("calculate tensor data offset", async () => {
const { tensorDataOffset } = await gguf(URL_LLAMA);
expect(tensorDataOffset).toEqual(741056n);
});
// Quantization handler
it("should have GGUF_QUANT_ORDER in sync with GGMLQuantizationType enum", () => {
const enumValues = Object.values(GGMLQuantizationType).filter((value) => typeof value === "number") as number[];
const checkValues = new Set(GGUF_QUANT_ORDER);
for (const value of enumValues) {
expect(checkValues).toContain(value);
}
});
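// Keeping GGUF_QUANT_ORDER exhaustive matters because findNearestQuantType (tested
// below) presumably ranks candidates by their position in that order; a quant type
// missing from the order could never be matched.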
it("should find the nearest quant", () => {
const quant = GGMLFileQuantizationType.IQ2_M;
const availableQuants = [
GGMLFileQuantizationType.Q2_K,
GGMLFileQuantizationType.Q4_K_M,
GGMLFileQuantizationType.Q8_0,
];
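// IQ2_M itself is not in the available list, so the closest available quant
// (here Q2_K, presumably the nearest entry in GGUF_QUANT_ORDER) should be returned.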
const nearestQuant = findNearestQuantType(quant, availableQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q2_K);
});
it("should find the nearest quant (vision model)", () => {
const visionQuants = [GGMLFileQuantizationType.Q8_0, GGMLFileQuantizationType.F16, GGMLFileQuantizationType.BF16];
let nearestQuant;
// text = Q4_K_M
nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q4_K_M, visionQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
// text = Q8_0
nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q8_0, visionQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
// text = F16
nearestQuant = findNearestQuantType(GGMLFileQuantizationType.F16, visionQuants);
expect(nearestQuant).toEqual(GGMLFileQuantizationType.F16);
});
});