@huggingface/gguf
Version:
a GGUF parser that works on remotely hosted files
179 lines (177 loc) • 6.63 kB
JavaScript
import {
GGMLQuantizationType,
GGML_QUANT_SIZES,
ggufAllShards
} from "./chunk-V6DSHS26.mjs";
// src/cli.ts
// Reverse lookup for the quantization enum: enum value -> enum member name.
// Used to print a readable data type for each tensor.
const mapDtypeToName = Object.fromEntries(
  Object.keys(GGMLQuantizationType).map((typeName) => [GGMLQuantizationType[typeName], typeName])
);
/**
 * Print the command-line usage text to stderr and terminate the process.
 *
 * @param {number} exitCode - Exit status passed to `process.exit`.
 */
function showHelp(exitCode) {
  const helpLines = [
    "Usage: gguf-view [--help|-h] [--show-tensor] [--context|-c N] <path/to/gguf>",
    " --help, -h Show this help message",
    " --show-tensor Show tensor information",
    " --context, -c N Number of tokens in context (default: 4096)"
  ];
  for (const helpLine of helpLines) {
    console.error(helpLine);
  }
  process.exit(exitCode);
}
/**
 * CLI entry point: parse `process.argv`, load every shard of the GGUF file,
 * then print the metadata key/value table, a memory usage estimate and
 * (with `--show-tensor`) the tensor table.
 *
 * Exits through `showHelp` when the path is missing or `--context` is invalid.
 *
 * @returns {Promise<void>} Resolves once all output has been printed; rejects
 *   if loading the file fails.
 */
async function main() {
  // JSON.stringify throws a TypeError on BigInt. Tensor shapes in this file are
  // BigInt, so metadata values are presumably able to be BigInt as well; render
  // them as plain numbers when that is lossless, otherwise as decimal strings.
  const stringifyForDisplay = (value) =>
    JSON.stringify(value, (_key, item) => {
      if (typeof item !== "bigint") {
        return item;
      }
      const asNumber = Number(item);
      return Number.isSafeInteger(asNumber) ? asNumber : item.toString();
    });

  let ggufPath = "";
  let showTensors = false;
  let nCtx = 4096;
  for (let i = 2; i < process.argv.length; i++) {
    const arg = process.argv[i];
    if (arg === "--help" || arg === "-h") {
      showHelp(0);
    } else if (arg === "--show-tensor") {
      showTensors = true;
    } else if (arg === "--context" || arg === "-c") {
      const rawValue = process.argv[++i];
      nCtx = Number(rawValue);
      // Reject a missing, empty, non-numeric, fractional or negative value
      // instead of silently producing "NaN GB" estimates further down.
      if (rawValue === undefined || rawValue.trim() === "" || !Number.isInteger(nCtx) || nCtx < 0) {
        console.error(`Error: Invalid value for ${arg}: ${rawValue ?? "(missing)"}`);
        showHelp(1);
      }
    } else {
      ggufPath = arg;
    }
  }
  if (!ggufPath.length) {
    console.error("Error: Missing path to gguf file");
    showHelp(1);
  }

  const { shards } = await ggufAllShards(ggufPath, {
    allowLocalFile: true
  });
  // Metadata is taken from the first shard; tensors are gathered from all
  // shards into a fresh array so the parser's own arrays are not mutated.
  const { metadata } = shards[0];
  const tensorInfos = shards.flatMap((shard) => shard.tensorInfos);

  console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
  printTable(
    [
      { name: "Idx", alignRight: true },
      // { name: 'Type' }, // TODO: support this
      { name: "Count", alignRight: true },
      { name: "Value" }
    ],
    Object.entries(metadata).map(([key, value], i) => {
      const MAX_LEN = 50;
      let strVal = "";
      let count = 1;
      if (Array.isArray(value)) {
        strVal = stringifyForDisplay(value);
        count = value.length;
      } else if (value instanceof String || typeof value === "string") {
        strVal = stringifyForDisplay(value);
      } else {
        // String() also copes with null/undefined, where .toString() would throw.
        strVal = String(value);
      }
      strVal = strVal.length > MAX_LEN ? strVal.slice(0, MAX_LEN) + "..." : strVal;
      return [(i + 1).toString(), count.toString(), `${key} = ${strVal}`];
    })
  );

  console.log();
  console.log(`* Memory usage estimation (with context length of ${nCtx} tokens)`);
  try {
    const kvUsage = calcMemoryUsage(metadata, nCtx);
    let modelWeightInBytes = 0;
    for (const tensorInfo of tensorInfos) {
      // Shape dimensions are BigInt; multiply first, convert to Number once.
      const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
      // GGML_QUANT_SIZES is divided by 8 here, i.e. treated as bits per element.
      const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
      modelWeightInBytes += tensorSizeInBytes;
    }
    // Heuristic overhead: the KV estimate for 256 tokens plus 5% of the weights.
    const overhead = calcMemoryUsage(metadata, 256).totalBytes + modelWeightInBytes * 0.05;
    const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
    printTable(
      [{ name: "Item" }, { name: "Memory usage", alignRight: true }],
      [
        ["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
        ["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
        ["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
        ["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
        ["", "---"],
        ["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"]
      ]
    );
  } catch (e) {
    // Estimation is best-effort (unsupported architectures throw); report the
    // reason and keep going so the tensor dump is still available.
    console.error(`Error: ${e.message}`);
  }

  if (showTensors) {
    console.log();
    console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
    printTable(
      [
        { name: "Idx", alignRight: true },
        { name: "Num Elements", alignRight: true },
        { name: "Shape" },
        { name: "Data Type" },
        { name: "Name" }
      ],
      tensorInfos.map((tensorInfo, i) => {
        // Pad the shape to four dimensions with 1s so every row lines up.
        const shape = [1n, 1n, 1n, 1n];
        tensorInfo.shape.forEach((dim, i2) => {
          shape[i2] = dim;
        });
        return [
          (i + 1).toString(),
          shape.reduce((acc, n) => acc * n, 1n).toString(),
          shape.map((n) => n.toString().padStart(6)).join(", "),
          mapDtypeToName[tensorInfo.dtype],
          tensorInfo.name
        ];
      })
    );
  } else {
    console.log();
    console.log(`* Use --show-tensor to display tensor information`);
  }
}
/**
 * Estimate the size of the attention K and V caches from GGUF metadata.
 *
 * Reads `general.architecture` and the `<arch>.embedding_length`,
 * `<arch>.attention.head_count`, `<arch>.attention.key_length`,
 * `<arch>.attention.value_length`, `<arch>.attention.head_count_kv` and
 * `<arch>.block_count` keys. Missing keys fall back to 0, or to
 * `embedding_length / head_count` for the key/value lengths.
 *
 * @param {object} metadata - GGUF metadata key/value map.
 * @param {number} kvSize - Number of cache slots (tokens) to account for.
 * @param {number} [kvTypeK] - Quantization type of the K cache (default F16).
 * @param {number} [kvTypeV] - Quantization type of the V cache (default F16).
 * @returns {{totalBytesK: number, totalBytesV: number, totalBytes: number}}
 *   Byte sizes of the K cache, the V cache, and their sum.
 * @throws {Error} If the architecture name starts with "mamba" or "rwkv".
 */
function calcMemoryUsage(metadata, kvSize, kvTypeK = GGMLQuantizationType.F16, kvTypeV = GGMLQuantizationType.F16) {
  const arch = metadata["general.architecture"] ?? "unknown";
  if (arch.startsWith("mamba") || arch.startsWith("rwkv")) {
    throw new Error(`Memory usage estimation for arch "${arch}" is not supported`);
  }
  const n_embd = metadata[`${arch}.embedding_length`] ?? 0;
  const n_head = metadata[`${arch}.attention.head_count`] ?? 0;
  const n_embd_head_k = metadata[`${arch}.attention.key_length`] ?? n_embd / n_head;
  const n_embd_head_v = metadata[`${arch}.attention.value_length`] ?? n_embd / n_head;
  const n_head_kv = metadata[`${arch}.attention.head_count_kv`] ?? [];
  const n_layer = metadata[`${arch}.block_count`] ?? 0;

  // head_count_kv is either one value for every layer or a per-layer array.
  // Only entries that are actually absent fall back to head_count: an explicit
  // 0 is kept (as the scalar form already did) so such a layer adds no cache.
  const n_head_kv_arr = Array(n_layer).fill(n_head);
  for (let i = 0; i < n_layer; i++) {
    n_head_kv_arr[i] = Array.isArray(n_head_kv) ? (n_head_kv[i] ?? n_head) : n_head_kv;
  }

  let totalElemsK = 0;
  let totalElemsV = 0;
  for (const headsInLayer of n_head_kv_arr) {
    totalElemsK += n_embd_head_k * headsInLayer * kvSize;
    totalElemsV += n_embd_head_v * headsInLayer * kvSize;
  }

  // GGML_QUANT_SIZES is divided by 8 here, i.e. treated as bits per element.
  const totalBytesK = totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8);
  const totalBytesV = totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8);
  return {
    totalBytesK,
    totalBytesV,
    // Sum the two caches separately: K and V may use different element sizes.
    totalBytes: totalBytesK + totalBytesV
  };
}
/**
 * Print a plain-text table to stdout: a header row, a dashed rule, then one
 * line per data row.
 *
 * @param {Array<{name: string, alignRight?: boolean, maxWidth?: number}>} header
 *   Column descriptors. `alignRight` right-aligns the data cells (header cells
 *   are always left-aligned); `maxWidth` caps the computed column width.
 * @param {string[][]} rows - Cell text per row; missing cells print as empty.
 * @param {number} [leftPad=2] - Number of spaces printed before every line.
 */
function printTable(header, rows, leftPad = 2) {
  const indent = " ".repeat(leftPad);
  const cellAt = (row, col) => row[col] || "";

  // A column is as wide as its widest cell (or its title), capped by maxWidth.
  const widths = header.map((column, col) => {
    let widest = column.name.length;
    for (const row of rows) {
      widest = Math.max(widest, cellAt(row, col).length);
    }
    if (!column.maxWidth) {
      return widest;
    }
    return Math.min(widest, column.maxWidth);
  });

  const titles = header.map((column, col) => column.name.padEnd(widths[col]));
  console.log(indent + titles.join(" | "));

  const rules = widths.map((width) => "-".repeat(width));
  console.log(indent + rules.join("-|-"));

  rows.forEach((row) => {
    const cells = header.map((column, col) => {
      const text = cellAt(row, col);
      return column.alignRight ? text.padStart(widths[col]) : text.padEnd(widths[col]);
    });
    console.log(indent + cells.join(" | "));
  });
}
// Run the CLI. The promise is handled explicitly so a failure (e.g. the file
// cannot be loaded) prints a short message and always yields a non-zero exit
// status instead of relying on the runtime's unhandled-rejection behaviour.
main().catch((err) => {
  console.error(`Error: ${err instanceof Error ? err.message : String(err)}`);
  process.exitCode = 1;
});