// Source: UNPKG — @huggingface/tasks
// Version: (unspecified on the scraped page)
// 204 lines (186 loc), 5.94 kB
// This list is copied from gguf/types.ts, but with all types available (for backward compatibility).
// NOT to be confused with GGMLQuantizationType: a file quantization can contain multiple
// GGMLQuantizationType values. For example, a Q4_K_M model can contain Q4_K and Q6_K tensors.
export enum GGMLFileQuantizationType {
	F32 = 0,
	F16 = 1,
	Q4_0 = 2,
	Q4_1 = 3,
	Q4_1_SOME_F16 = 4,
	Q4_2 = 5,
	Q4_3 = 6,
	Q8_0 = 7,
	Q5_0 = 8,
	Q5_1 = 9,
	Q2_K = 10,
	Q3_K_S = 11,
	Q3_K_M = 12,
	Q3_K_L = 13,
	Q4_K_S = 14,
	Q4_K_M = 15,
	Q5_K_S = 16,
	Q5_K_M = 17,
	Q6_K = 18,
	IQ2_XXS = 19,
	IQ2_XS = 20,
	Q2_K_S = 21,
	IQ3_XS = 22,
	IQ3_XXS = 23,
	IQ1_S = 24,
	IQ4_NL = 25,
	IQ3_S = 26,
	IQ3_M = 27,
	IQ2_S = 28,
	IQ2_M = 29,
	IQ4_XS = 30,
	IQ1_M = 31,
	BF16 = 32,
	Q4_0_4_4 = 33,
	Q4_0_4_8 = 34,
	Q4_0_8_8 = 35,
	TQ1_0 = 36,
	TQ2_0 = 37,
	MXFP4_MOE = 38,
	// custom quants used by unsloth
	// they are not officially a scheme enum value in GGUF, but only here for naming
	Q2_K_XL = 1000,
	Q3_K_XL = 1001,
	Q4_K_XL = 1002,
	Q5_K_XL = 1003,
	Q6_K_XL = 1004,
	Q8_K_XL = 1005,
}

// All quantization names. Numeric enums are reverse-mapped, so Object.values yields both the
// numeric values and the string keys — keep only the string names.
const ggufQuants = Object.values(GGMLFileQuantizationType).filter((v): v is string => typeof v === "string");

// Matches a quantization name, optionally followed by an "_<UPPERCASE>" size-variation suffix.
// NOTE(review): alternatives are tried in enum-declaration order, so when one name is a prefix of
// another (e.g. Q4_0 vs Q4_0_4_4) the shorter, earlier one wins the match — confirm this is intended.
export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");

/**
 * Extracts the quantization label (e.g. "Q4_K_M") from a filename.
 * Matching is effectively case-insensitive: the filename is upper-cased before matching.
 *
 * @param fname - the file name to scan
 * @returns the matched quant substring, or undefined when none is found
 */
export function parseGGUFQuantLabel(fname: string): string | undefined {
	const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there are multiple quant substrings in a name, we prefer the last one
	return quantLabel;
}

// order of quantization, from biggest to smallest
// this list must be in sync with the order in GGMLFileQuantizationType
// the gguf.spec.ts tests are used to verify that the order is correct
export const GGUF_QUANT_ORDER: GGMLFileQuantizationType[] = [
	GGMLFileQuantizationType.F32,
	GGMLFileQuantizationType.BF16,
	GGMLFileQuantizationType.F16,
	GGMLFileQuantizationType.Q8_K_XL,
	GGMLFileQuantizationType.Q8_0,
	// 6-bit quantizations
	GGMLFileQuantizationType.Q6_K_XL,
	GGMLFileQuantizationType.Q6_K,
	// 5-bit quantizations
	GGMLFileQuantizationType.Q5_K_XL,
	GGMLFileQuantizationType.Q5_K_M,
	GGMLFileQuantizationType.Q5_K_S,
	GGMLFileQuantizationType.Q5_0,
	GGMLFileQuantizationType.Q5_1,
	// 4-bit quantizations
	GGMLFileQuantizationType.Q4_K_XL,
	GGMLFileQuantizationType.Q4_K_M,
	GGMLFileQuantizationType.Q4_K_S,
	GGMLFileQuantizationType.IQ4_NL,
	GGMLFileQuantizationType.IQ4_XS,
	GGMLFileQuantizationType.Q4_0_4_4,
	GGMLFileQuantizationType.Q4_0_4_8,
	GGMLFileQuantizationType.Q4_0_8_8,
	GGMLFileQuantizationType.Q4_1_SOME_F16,
	GGMLFileQuantizationType.Q4_0,
	GGMLFileQuantizationType.Q4_1,
	GGMLFileQuantizationType.Q4_2,
	GGMLFileQuantizationType.Q4_3,
	GGMLFileQuantizationType.MXFP4_MOE,
	// 3-bit quantizations
	GGMLFileQuantizationType.Q3_K_XL,
	GGMLFileQuantizationType.Q3_K_L,
	GGMLFileQuantizationType.Q3_K_M,
	GGMLFileQuantizationType.Q3_K_S,
	GGMLFileQuantizationType.IQ3_M,
	GGMLFileQuantizationType.IQ3_S,
	GGMLFileQuantizationType.IQ3_XS,
	GGMLFileQuantizationType.IQ3_XXS,
	// 2-bit quantizations
	GGMLFileQuantizationType.Q2_K_XL,
	GGMLFileQuantizationType.Q2_K,
	GGMLFileQuantizationType.Q2_K_S,
	GGMLFileQuantizationType.IQ2_M,
	GGMLFileQuantizationType.IQ2_S,
	GGMLFileQuantizationType.IQ2_XS,
	GGMLFileQuantizationType.IQ2_XXS,
	// 1-bit quantizations
	GGMLFileQuantizationType.IQ1_S,
	GGMLFileQuantizationType.IQ1_M,
	GGMLFileQuantizationType.TQ1_0,
	GGMLFileQuantizationType.TQ2_0,
];

// This function finds the nearest quantization type that is less than or equal to the given quantization type.
// It returns undefined if no such quantization type is found.
export function findNearestQuantType( quant: GGMLFileQuantizationType, availableQuants: GGMLFileQuantizationType[] ): GGMLFileQuantizationType | undefined { // Create a map for quick index lookup from the defined order const orderMap = new Map<GGMLFileQuantizationType, number>(); GGUF_QUANT_ORDER.forEach((q, index) => { orderMap.set(q, index); }); const targetIndex = orderMap.get(quant) ?? 0; // the 0 case should never happen // Filter the available quantizations to include only those defined in the order map, // then sort them according to the GGUF_QUANT_ORDER (from largest/index 0 to smallest/highest index). const sortedAvailable = availableQuants .filter((q) => orderMap.has(q)) .sort((a, b) => (orderMap.get(a) ?? Infinity) - (orderMap.get(b) ?? Infinity)); // If no valid quantizations are available after filtering if (sortedAvailable.length === 0) { return undefined; } // Iterate through the sorted available quantizations (largest to smallest). // Find the first one whose order index is >= the target index. // This means finding the largest quantization that is smaller than or equal to the target. for (const availableQuant of sortedAvailable) { // We know the key exists due to the filter above. const availableIndex = orderMap.get(availableQuant) ?? 0; if (availableIndex >= targetIndex) { return availableQuant; } } // If the loop completes, it means all available quantizations are larger (have a smaller index) // than the target quantization. In this case, return the "smallest" available quantization, // which is the last element in the sorted list (highest index among available). 
return sortedAvailable[sortedAvailable.length - 1]; } // This list is only used to calculate the size of the model, NOT to be confused with the quantization FILE type export enum GGMLQuantizationType { F32 = 0, F16 = 1, Q4_0 = 2, Q4_1 = 3, Q5_0 = 6, Q5_1 = 7, Q8_0 = 8, Q8_1 = 9, Q2_K = 10, Q3_K = 11, Q4_K = 12, Q5_K = 13, Q6_K = 14, Q8_K = 15, IQ2_XXS = 16, IQ2_XS = 17, IQ3_XXS = 18, IQ1_S = 19, IQ4_NL = 20, IQ3_S = 21, IQ2_S = 22, IQ4_XS = 23, I8 = 24, I16 = 25, I32 = 26, I64 = 27, F64 = 28, IQ1_M = 29, BF16 = 30, TQ1_0 = 34, TQ2_0 = 35, MXFP4 = 39, }