
@huggingface/gguf

A GGUF parser that works on remotely hosted files

import type { MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
import { GGUFValueType } from "./types";
import { isBackend } from "./utils/isBackend";
import { promisesQueue } from "./utils/promisesQueue";

export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types";
export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";
export {
  parseGGUFQuantLabel,
  GGUF_QUANT_RE,
  GGUF_QUANT_RE_GLOBAL,
  GGUF_QUANT_ORDER,
  findNearestQuantType,
  GGMLFileQuantizationType,
} from "@huggingface/tasks";

export const RE_GGUF_FILE = /\.gguf$/;
export const RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
const GGUF_DEFAULT_ALIGNMENT = 32; // defined in ggml.h
const GGML_PAD = (x: number, n: number) => (x + n - 1) & ~(n - 1); // defined in ggml.h
const PARALLEL_DOWNLOADS = 20;

export interface GgufShardFileInfo {
  prefix: string;
  shard: string;
  total: string;
}

export function parseGgufShardFilename(filename: string): GgufShardFileInfo | null {
  const match = RE_GGUF_SHARD_FILE.exec(filename);
  if (match && match.groups) {
    return {
      prefix: match.groups["prefix"],
      shard: match.groups["shard"],
      total: match.groups["total"],
    };
  }
  return null;
}
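// Illustrative sketch (not part of the library): how shard filenames map to
// GgufShardFileInfo via RE_GGUF_SHARD_FILE. The filenames below are hypothetical.
//
//   parseGgufShardFilename("grok-1-00001-of-00009.gguf");
//   // -> { prefix: "grok-1", shard: "00001", total: "00009" }
//
//   parseGgufShardFilename("model.gguf");
//   // -> null (not a sharded filename)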
const isVersion = (version: number): version is Version => version === 1 || version === 2 || version === 3;

/**
 * Must be `GGUF` at the byte level: `0x47` `0x47` `0x55` `0x46`.
 * Your executor might do little-endian byte order, so it might be
 * checking for 0x46554747 and letting the endianness cancel out.
 * Consider being *very* explicit about the byte order here.
 */
const ggufMagicNumber = new Uint8Array([0x47, 0x47, 0x55, 0x46]); /// "GGUF"

function isGGUFValueType(n: number): n is GGUFValueType {
  return typeof GGUFValueType[n] === "string";
}

const HTTP_CHUNK_SIZE = 2 * 10 ** 6; /// 2MB
const HTTP_DATA_LEEWAY = 5 * 10 ** 5; /// 500kb
const HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6; /// 50MB

/**
 * Internal stateful instance to fetch ranges of HTTP data when needed
 */
class RangeView {
  protected chunk: number;
  private buffer: ArrayBuffer;
  private dataView: DataView;

  get view(): DataView {
    return this.dataView;
  }

  constructor(
    public uri: string,
    private params?: {
      /**
       * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
       */
      fetch?: typeof fetch;
      additionalFetchHeaders?: Record<string, string>;
    }
  ) {
    this.chunk = 0;
    /// TODO(fix typing)
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore
    this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
    this.dataView = new DataView(this.buffer);
  }

  /**
   * Fetch a new chunk from the server
   */
  async fetchChunk() {
    const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
    const buf = new Uint8Array(
      await (
        await (this.params?.fetch ?? fetch)(this.uri, {
          headers: {
            ...(this.params?.additionalFetchHeaders ?? {}),
            Range: `bytes=${range[0]}-${range[1]}`,
          },
        })
      ).arrayBuffer()
    );
    this.appendBuffer(buf);
    this.chunk += 1;
  }

  /**
   * Append new data into the buffer
   */
  appendBuffer(buf: Uint8Array) {
    /// TODO(fix typing)
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore
    if (ArrayBuffer.prototype.resize) {
      /// TODO(fix typing)
      // eslint-disable-next-line @typescript-eslint/ban-ts-comment
      // @ts-ignore
      this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
      new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
    } else {
      // If the browser does not support ArrayBuffer.resize, we fall back to this polyfill version
      /// TODO(fix typing)
      // eslint-disable-next-line @typescript-eslint/ban-ts-comment
      // @ts-ignore
      const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
      const arrView = new Uint8Array(newBuffer);
      arrView.set(new Uint8Array(this.buffer));
      arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
      this.buffer = newBuffer;
      this.dataView = new DataView(this.buffer);
    }
  }

  /**
   * Check whether we need to fetch a new chunk
   */
  async fetchChunkIfNeeded(offset: number) {
    if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
      await this.fetchChunk();
    }
  }
}

/**
 * Internal stateful instance to read ranges of a local file when needed.
 * Only usable with the Node.js FS API.
 */
class RangeViewLocalFile extends RangeView {
  /**
   * Read a new chunk from the local file system.
   */
  override async fetchChunk(): Promise<void> {
    const { FileBlob } = await import("./utils/FileBlob");
    const blob = await FileBlob.create(this.uri);
    const range = [this.chunk * HTTP_CHUNK_SIZE, (this.chunk + 1) * HTTP_CHUNK_SIZE - 1];
    const buffer = await blob.slice(range[0], range[1]).arrayBuffer();
    this.appendBuffer(new Uint8Array(buffer));
    this.chunk += 1;
  }
}

interface Slice<T> {
  value: T;
  length: number;
}

/**
 * Note: A good article about binary data in JS: https://javascript.info/arraybuffer-binary-arrays
 */
function readVersionedSize(view: DataView, byteOffset: number, version: Version, littleEndian: boolean): Slice<bigint> {
  switch (version) {
    case 1: {
      const n = view.getUint32(byteOffset, littleEndian);
      return { value: BigInt(n), length: 4 };
    }
    case 2:
    case 3: {
      return { value: view.getBigUint64(byteOffset, littleEndian), length: 8 };
    }
  }
}

function readString(view: DataView, offset: number, version: Version, littleEndian: boolean): Slice<string> {
  const length = readVersionedSize(view, offset, version, littleEndian);
  const off = length.length;
  const value = new TextDecoder().decode(view.buffer.slice(offset + off, offset + off + Number(length.value)));
  return { value, length: off + Number(length.value) };
}
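// Illustrative sketch (not part of the library): in GGUF v2/v3 a string is a
// little-endian uint64 byte length followed by the UTF-8 bytes, so "hi"
// occupies 10 bytes. Assuming a hand-built buffer:
//
//   const bytes = new Uint8Array([2, 0, 0, 0, 0, 0, 0, 0, 0x68, 0x69]);
//   readString(new DataView(bytes.buffer), 0, 3, true);
//   // -> { value: "hi", length: 10 }  (8-byte length prefix + 2 payload bytes)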
function readMetadataValue(
  view: DataView,
  type: GGUFValueType,
  offset: number,
  version: Version,
  littleEndian: boolean
): Slice<MetadataValue> {
  switch (type) {
    case GGUFValueType.UINT8:
      return { value: view.getUint8(offset), length: 1 };
    case GGUFValueType.INT8:
      return { value: view.getInt8(offset), length: 1 };
    case GGUFValueType.UINT16:
      return { value: view.getUint16(offset, littleEndian), length: 2 };
    case GGUFValueType.INT16:
      return { value: view.getInt16(offset, littleEndian), length: 2 };
    case GGUFValueType.UINT32:
      return { value: view.getUint32(offset, littleEndian), length: 4 };
    case GGUFValueType.INT32:
      return { value: view.getInt32(offset, littleEndian), length: 4 };
    case GGUFValueType.FLOAT32:
      return { value: view.getFloat32(offset, littleEndian), length: 4 };
    case GGUFValueType.BOOL:
      return { value: view.getUint8(offset) !== 0, length: 1 };
    case GGUFValueType.STRING:
      return readString(view, offset, version, littleEndian);
    case GGUFValueType.ARRAY: {
      const arrayType = view.getUint32(offset, littleEndian);
      const arrayLength = readVersionedSize(view, offset + 4, version, littleEndian);
      let length = 4 + arrayLength.length;
      const arrayValues: MetadataValue[] = [];
      for (let i = 0; i < arrayLength.value; i++) {
        const metadataValue = readMetadataValue(view, arrayType, offset + length, version, littleEndian);
        arrayValues.push(metadataValue.value);
        length += metadataValue.length;
      }
      return { value: arrayValues, length };
    }
    case GGUFValueType.UINT64:
      return { value: view.getBigUint64(offset, littleEndian), length: 8 };
    case GGUFValueType.INT64:
      return { value: view.getBigInt64(offset, littleEndian), length: 8 };
    case GGUFValueType.FLOAT64:
      return { value: view.getFloat64(offset, littleEndian), length: 8 };
  }
}

export async function gguf(
  uri: string,
  params: {
    /**
     * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
     */
    fetch?: typeof fetch;
    additionalFetchHeaders?: Record<string, string>;
    computeParametersCount: true;
    allowLocalFile?: boolean;
  }
): Promise<GGUFParseOutput & { parameterCount: number }>;
export async function gguf(
  uri: string,
  params?: {
    /**
     * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
     */
    fetch?: typeof fetch;
    additionalFetchHeaders?: Record<string, string>;
    allowLocalFile?: boolean;
  }
): Promise<GGUFParseOutput>;
export async function gguf(
  uri: string,
  params?: {
    /**
     * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
     */
    fetch?: typeof fetch;
    additionalFetchHeaders?: Record<string, string>;
    computeParametersCount?: boolean;
    allowLocalFile?: boolean;
  }
): Promise<GGUFParseOutput & { parameterCount?: number }> {
  let r: RangeView;
  if (isBackend) {
    /// On the backend, we switch between remote/local file based on the protocol
    if (uri.match(/^https?:\/\//)) {
      r = new RangeView(uri, params);
    } else if (params?.allowLocalFile) {
      r = new RangeViewLocalFile(uri, params);
    } else {
      throw new Error("Access to local file is not enabled, please set allowLocalFile to true");
    }
  } else {
    /// On the frontend, we only allow remote files
    if (params?.allowLocalFile) {
      throw new Error("allowLocalFile cannot be used on browser");
    }
    r = new RangeView(uri, params);
  }
  await r.fetchChunk();

  const checkBuffer = (buffer: Uint8Array, header: Uint8Array) => {
    for (let i = 0; i < header.length; i++) {
      if (header[i] !== buffer[i]) {
        return false;
      }
    }
    return true;
  };

  if (!checkBuffer(new Uint8Array(r.view.buffer.slice(0, 4)), ggufMagicNumber)) {
    throw new Error("not a valid gguf file: not starting with GGUF magic number");
  }

  const [littleEndian, version] = (() => {
    /// https://github.com/ggerganov/llama.cpp/issues/3957
    /// Assume this code is always running on a little-endian machine,
    /// but we want to be able to parse files of either endianness
    const version = r.view.getUint32(4, true);
    if (version & 65535) {
      return [true, version];
    } else {
      return [false, r.view.getUint32(4, false)];
    }
  })();

  if (!isVersion(version)) {
    throw new Error(`not a valid gguf file: unsupported version "${version}"`);
  }
  // initial offset after header
  let offset = 8;
  const tensorCount = readVersionedSize(r.view, offset, version, littleEndian);
  offset += tensorCount.length;
  const numKv = readVersionedSize(r.view, offset, version, littleEndian);
  offset += numKv.length;
  const metadata: GGUFMetadata<{ strict: false }> = {
    version,
    tensor_count: tensorCount.value,
    kv_count: numKv.value,
  };

  for (let i = 0; i < numKv.value; i++) {
    await r.fetchChunkIfNeeded(offset);

    // read key
    const keyResult = readString(r.view, offset, version, littleEndian);
    offset += keyResult.length;

    // read value type
    const valueType = r.view.getUint32(offset, littleEndian);
    offset += 4;

    if (!isGGUFValueType(valueType)) {
      throw new Error("Unsupported metadata type: " + valueType);
    }

    let valueResult: ReturnType<typeof readMetadataValue> | undefined;
    while (!valueResult) {
      try {
        // read value
        valueResult = readMetadataValue(r.view, valueType, offset, version, littleEndian);
      } catch (err) {
        if (err instanceof RangeError) {
          await r.fetchChunk();
        } else {
          throw err;
        }
      }
    }
    offset += valueResult.length;
    metadata[keyResult.value] = valueResult.value;
  }

  const tensorInfos: GGUFTensorInfo[] = [];

  for (let i = 0; i < tensorCount.value; i++) {
    await r.fetchChunkIfNeeded(offset);

    // read tensor name
    const keyResult = readString(r.view, offset, version, littleEndian);
    offset += keyResult.length;

    const nDims = r.view.getUint32(offset, littleEndian);
    offset += 4;

    const shape: bigint[] = [];
    for (let dim = 0; dim < nDims; dim++) {
      const shapeDim = readVersionedSize(r.view, offset, version, littleEndian);
      shape.push(shapeDim.value);
      offset += shapeDim.length;
    }

    const type = r.view.getUint32(offset, littleEndian);
    offset += 4;
    const tensorOffset = r.view.getBigUint64(offset, littleEndian);
    offset += 8;

    tensorInfos.push({
      name: keyResult.value,
      n_dims: nDims,
      shape,
      dtype: type,
      offset: tensorOffset,
    });
  }

  // calculate absolute offset of tensor data
  const alignment: number = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
  const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));

  if (params?.computeParametersCount) {
    const parameterCount = tensorInfos
      .map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1))
      .reduce((acc, val) => acc + val, 0);

    return { metadata, tensorInfos, tensorDataOffset, parameterCount };
  } else {
    return { metadata, tensorInfos, tensorDataOffset };
  }
}
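// Illustrative usage sketch (not part of this file): parsing the header of a
// remotely hosted GGUF file over HTTP range requests. The checkpoint URL is
// just an example of a GGUF file hosted on the Hugging Face Hub.
//
//   import { gguf } from "@huggingface/gguf";
//
//   const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf";
//   const { metadata, tensorInfos } = await gguf(URL_LLAMA);
//   // metadata["general.architecture"], metadata["general.name"], ...
//   // tensorInfos: [{ name, n_dims, shape, dtype, offset }, ...]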
export async function ggufAllShards(
  url: string,
  params?: {
    /**
     * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
     */
    fetch?: typeof fetch;
    additionalFetchHeaders?: Record<string, string>;
    parallelDownloads?: number;
    allowLocalFile?: boolean;
  }
): Promise<{ shards: GGUFParseOutput[]; parameterCount: number }> {
  const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
  if (parallelDownloads < 1) {
    throw new TypeError("parallelDownloads must be greater than 0");
  }
  const ggufShardFileInfo = parseGgufShardFilename(url);
  if (ggufShardFileInfo) {
    const total = parseInt(ggufShardFileInfo.total);
    const prefix = ggufShardFileInfo.prefix;

    const urls: string[] = [];
    for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
      urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
    }

    const shards = await promisesQueue(
      urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
      parallelDownloads
    );
    return {
      shards,
      parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0),
    };
  } else {
    const { metadata, tensorInfos, tensorDataOffset, parameterCount } = await gguf(url, {
      ...params,
      computeParametersCount: true,
    });
    return { shards: [{ metadata, tensorInfos, tensorDataOffset }], parameterCount };
  }
}
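// Illustrative usage sketch (not part of this file); the URL below is
// hypothetical. Given the URL of any one shard, ggufAllShards derives the
// sibling URLs from the NNNNN-of-NNNNN pattern, parses them with bounded
// parallelism, and sums the per-shard parameter counts:
//
//   const { shards, parameterCount } = await ggufAllShards(
//     "https://example.com/grok-1-00001-of-00009.gguf"
//   );
//   // shards.length === 9; each entry is a GGUFParseOutput
//   // parameterCount is the total across all shards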