// @huggingface/gguf: a GGUF parser that works on remotely hosted files
import { GGMLQuantizationType } from "./types";
export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string; src_url?: string }> = {
[GGMLQuantizationType.F32]: {
txt: "32-bit standard IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Single-precision_floating-point_format",
},
[GGMLQuantizationType.F16]: {
txt: "16-bit standard IEEE 754 half-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Half-precision_floating-point_format",
},
[GGMLQuantizationType.Q8_0]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
},
[GGMLQuantizationType.Q8_1]: {
txt: "8-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
},
[GGMLQuantizationType.Q8_K]: {
txt: `8-bit quantization (q). Each block has 256 weights. Only used for quantizing intermediate results. All 2-6 bit dot products are implemented for this quantization type. Weight formula: w = q * block_scale.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
},
[GGMLQuantizationType.Q6_K]: {
txt: `6-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(8-bit), resulting in 6.5625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
},
[GGMLQuantizationType.Q5_0]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
},
[GGMLQuantizationType.Q5_1]: {
txt: "5-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
},
[GGMLQuantizationType.Q5_K]: {
txt: `5-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 5.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
},
[GGMLQuantizationType.Q4_0]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557654249",
},
[GGMLQuantizationType.Q4_1]: {
txt: "4-bit round-to-nearest quantization (q). Each block has 32 weights. Weight formula: w = q * block_scale + block_minimum. Legacy quantization method (not used widely as of today).",
src_url: "https://github.com/huggingface/huggingface.js/pull/615#discussion_r1557682290",
},
[GGMLQuantizationType.Q4_K]: {
txt: `4-bit quantization (q). Super-blocks with 8 blocks, each block has 32 weights. Weight formula: w = q * block_scale(6-bit) + block_min(6-bit), resulting in 4.5 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
},
[GGMLQuantizationType.Q3_K]: {
txt: `3-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(6-bit), resulting in 3.4375 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
},
[GGMLQuantizationType.Q2_K]: {
txt: `2-bit quantization (q). Super-blocks with 16 blocks, each block has 16 weights. Weight formula: w = q * block_scale(4-bit) + block_min(4-bit), resulting in 2.625 bits-per-weight.`,
src_url: "https://github.com/ggerganov/llama.cpp/pull/1684#issue-1739619305",
},
[GGMLQuantizationType.IQ4_XS]: {
txt: "4-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 4.25 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ3_S]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.44 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ3_XXS]: {
txt: "3-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 3.06 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ2_S]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.5 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ2_XS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.31 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ2_XXS]: {
txt: "2-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 2.06 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ1_S]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.56 bits-per-weight.",
src_url:
"https://huggingface.co/CISCai/OpenCodeInterpreter-DS-6.7B-SOTA-GGUF/blob/main/README.md?code=true#L59-L70",
},
[GGMLQuantizationType.IQ4_NL]: {
txt: "4-bit non-linear quantization (q). Each block has 32 weights; each 4-bit value indexes a fixed non-linear lookup table, scaled per block, resulting in 4.5 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/5590",
},
[GGMLQuantizationType.I8]: {
txt: "8-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
},
[GGMLQuantizationType.I16]: {
txt: "16-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
},
[GGMLQuantizationType.I32]: {
txt: "32-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6045",
},
[GGMLQuantizationType.I64]: {
txt: "64-bit fixed-width integer number.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6062",
},
[GGMLQuantizationType.F64]: {
txt: "64-bit standard IEEE 754 double-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Double-precision_floating-point_format",
},
[GGMLQuantizationType.IQ1_M]: {
txt: "1-bit quantization (q). Super-blocks with 256 weights. Weight w is obtained using super_block_scale & importance matrix, resulting in 1.75 bits-per-weight.",
src_url: "https://github.com/ggerganov/llama.cpp/pull/6302",
},
[GGMLQuantizationType.BF16]: {
txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
},
[GGMLQuantizationType.TQ1_0]: {
txt: "Ternary quantization (values in {-1, 0, 1}). Each block has 256 weights, resulting in 1.6875 bits-per-weight.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
},
[GGMLQuantizationType.TQ2_0]: {
txt: "Ternary quantization (values in {-1, 0, 1}). Each block has 256 weights, resulting in 2.0625 bits-per-weight.",
src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
},
};
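
// Illustrative sketch, not part of the upstream file: the legacy formula
// "w = q * block_scale" described above, spelled out for a single Q8_0 block.
// Per GGML_QUANT_SIZES below, a Q8_0 block stores 32 weights in 34 bytes: a
// little-endian fp16 scale followed by 32 signed 8-bit quants. `decodeFloat16`
// is a hypothetical helper defined here only for the sketch.
function dequantizeQ8_0Block(block: Uint8Array): Float32Array {
	const view = new DataView(block.buffer, block.byteOffset, block.byteLength);
	const blockScale = decodeFloat16(view.getUint16(0, true)); // fp16 block_scale
	const quants = new Int8Array(block.buffer, block.byteOffset + 2, 32); // 32 x int8 q
	const weights = new Float32Array(32);
	for (let i = 0; i < 32; i++) {
		weights[i] = quants[i] * blockScale; // w = q * block_scale
	}
	return weights;
}

// Minimal IEEE 754 half-precision decoder used by the sketch above.
function decodeFloat16(bits: number): number {
	const sign = bits & 0x8000 ? -1 : 1;
	const exponent = (bits >> 10) & 0x1f;
	const fraction = bits & 0x03ff;
	if (exponent === 0) return sign * fraction * 2 ** -24; // subnormal
	if (exponent === 0x1f) return fraction ? NaN : sign * Infinity;
	return sign * (1 + fraction / 1024) * 2 ** (exponent - 15);
}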
const QK_K = 256; // number of weights in a "K" super-block
// Convert a block layout to bits-per-weight: `blockSize` is the number of weights
// per block, `typeSize` the number of bytes used to store one block
// (e.g. Q8_0: 2 bytes of fp16 scale + 32 int8 quants = 34 bytes for 32 weights -> 8.5 bpw).
const calcBPW = (blockSize: number, typeSize: number) => {
return (typeSize * 8) / blockSize;
};
// copied from https://github.com/ggml-org/llama.cpp/tree/master/gguf-py/gguf/constants.py
// map quantization type to element size in bits per weight (example: Q4_K -> 4.5 bpw)
export const GGML_QUANT_SIZES = {
[GGMLQuantizationType.F32]: calcBPW(1, 4),
[GGMLQuantizationType.F16]: calcBPW(1, 2),
[GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
[GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
[GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
[GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
[GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
[GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
[GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
[GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
[GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
[GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
[GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
[GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
[GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
[GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
[GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
[GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
[GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
[GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
[GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
[GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
[GGMLQuantizationType.I8]: calcBPW(1, 1),
[GGMLQuantizationType.I16]: calcBPW(1, 2),
[GGMLQuantizationType.I32]: calcBPW(1, 4),
[GGMLQuantizationType.I64]: calcBPW(1, 8),
[GGMLQuantizationType.F64]: calcBPW(1, 8),
[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
[GGMLQuantizationType.BF16]: calcBPW(1, 2),
[GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
[GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
};
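
// Illustrative sketch, not part of the upstream file: with the bits-per-weight
// table above, a tensor's serialized payload is roughly elementCount * bpw / 8
// bytes. For example, a 4096 x 4096 Q4_K tensor takes
// 4096 * 4096 * 4.5 / 8 = 9,437,184 bytes (9 MiB). The helper below is
// hypothetical and not an export of this package.
function estimateTensorBytes(dtype: keyof typeof GGML_QUANT_SIZES, elementCount: number): number {
	return (elementCount * GGML_QUANT_SIZES[dtype]) / 8;
}

// Hypothetical usage with this package's `gguf()` parser (kept as a comment to
// avoid a circular import inside the package itself); `tensorInfos` entries
// carry a `dtype` and a bigint `shape`:
//
//   import { gguf } from "@huggingface/gguf";
//   const { tensorInfos } = await gguf("https://huggingface.co/.../model.gguf");
//   const totalBytes = tensorInfos.reduce(
//     (sum, t) => sum + estimateTensorBytes(t.dtype, t.shape.reduce((n, d) => n * Number(d), 1)),
//     0,
//   );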