UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

github.com/ruvnet/claude-flow

ruvnet/claude-flow

425 lines • 19.2 kB

JavaScript

/**
 * ruvLLM GGUF Inference Engine -- Pure Node.js GGUF Model Interface
 *
 * Provides:
 * 1. GGUF binary header parsing (metadata without loading weights)
 * 2. Model loading abstraction (node-llama-cpp when available, metadata-only fallback)
 * 3. Token generation interface with async iterator streaming
 * 4. KV-cache persistence to RVF-compatible binary format
 *
 * Zero external dependencies. node-llama-cpp is an optional peer.
 *
 * @module @claude-flow/cli/appliance/gguf-engine
 */
import { open, readFile, writeFile, stat as fsStat } from 'node:fs/promises';
import { createHash } from 'node:crypto';
import { basename } from 'node:path';

// ── GGUF Metadata Value Types ───────────────────────────────
/** Numeric type tag that precedes every GGUF metadata value (compiled enum with reverse mapping). */
var GgufValueType;
(function (GgufValueType) {
    GgufValueType[GgufValueType["UINT8"] = 0] = "UINT8";
    GgufValueType[GgufValueType["INT8"] = 1] = "INT8";
    GgufValueType[GgufValueType["UINT16"] = 2] = "UINT16";
    GgufValueType[GgufValueType["INT16"] = 3] = "INT16";
    GgufValueType[GgufValueType["UINT32"] = 4] = "UINT32";
    GgufValueType[GgufValueType["INT32"] = 5] = "INT32";
    GgufValueType[GgufValueType["FLOAT32"] = 6] = "FLOAT32";
    GgufValueType[GgufValueType["BOOL"] = 7] = "BOOL";
    GgufValueType[GgufValueType["STRING"] = 8] = "STRING";
    GgufValueType[GgufValueType["ARRAY"] = 9] = "ARRAY";
    GgufValueType[GgufValueType["UINT64"] = 10] = "UINT64";
    GgufValueType[GgufValueType["INT64"] = 11] = "INT64";
    GgufValueType[GgufValueType["FLOAT64"] = 12] = "FLOAT64";
})(GgufValueType || (GgufValueType = {}));

const GGUF_MAGIC = 0x46554747; // "GGUF" in little-endian
const RVKV_MAGIC = 0x564B5652; // "RVKV" in little-endian
const RVKV_VERSION = 1;
/** Fixed GGUF header: magic u32 + version u32 + tensor count u64 + kv count u64. */
const GGUF_FIXED_HEADER_BYTES = 24;
/** Only this many leading bytes of a model file are read when parsing metadata. */
const HEADER_READ_WINDOW = 256 * 1024;

// ── Internal Buffer Reader ──────────────────────────────────
/**
 * Stateful cursor over a Buffer for sequential little-endian binary reads.
 * Every read advances `offset` by the width of the value; reads past the end
 * of the buffer throw (Node's Buffer read methods raise a RangeError).
 */
class BufferReader {
    buf;
    offset = 0;
    constructor(buf) {
        this.buf = buf;
    }
    /** Number of unread bytes left after the cursor. */
    get remaining() {
        return this.buf.length - this.offset;
    }
    readU8() {
        const v = this.buf.readUInt8(this.offset);
        this.offset += 1;
        return v;
    }
    readI8() {
        const v = this.buf.readInt8(this.offset);
        this.offset += 1;
        return v;
    }
    readU16() {
        const v = this.buf.readUInt16LE(this.offset);
        this.offset += 2;
        return v;
    }
    readI16() {
        const v = this.buf.readInt16LE(this.offset);
        this.offset += 2;
        return v;
    }
    readU32() {
        const v = this.buf.readUInt32LE(this.offset);
        this.offset += 4;
        return v;
    }
    readI32() {
        const v = this.buf.readInt32LE(this.offset);
        this.offset += 4;
        return v;
    }
    readF32() {
        const v = this.buf.readFloatLE(this.offset);
        this.offset += 4;
        return v;
    }
    readF64() {
        const v = this.buf.readDoubleLE(this.offset);
        this.offset += 8;
        return v;
    }
    /** @returns {bigint} */
    readU64() {
        const v = this.buf.readBigUInt64LE(this.offset);
        this.offset += 8;
        return v;
    }
    /** @returns {bigint} */
    readI64() {
        const v = this.buf.readBigInt64LE(this.offset);
        this.offset += 8;
        return v;
    }
    /** Safe for values up to 2^53. Real GGUF files never exceed this for tensor/kv counts. */
    readU64AsNumber() {
        return Number(this.readU64());
    }
    readBool() {
        return this.readU8() !== 0;
    }
    /**
     * GGUF string: [length u64 LE][utf-8 bytes].
     * @throws {Error} when the declared length runs past the end of the buffer.
     */
    readString() {
        const len = this.readU64AsNumber();
        if (len === 0)
            return '';
        if (len > this.remaining)
            throw new Error(`String length ${len} exceeds remaining buffer`);
        const s = this.buf.toString('utf-8', this.offset, this.offset + len);
        this.offset += len;
        return s;
    }
}

// ── GGUF Value Reading ──────────────────────────────────────
/**
 * Read a typed scalar from the buffer (shared by value and array-element readers).
 * 64-bit integers are narrowed to Number.
 * @param {BufferReader} reader
 * @param {number} t - a GgufValueType tag
 * @returns {number|boolean|string|undefined} `undefined` when `t` is not a scalar type
 *   (including ARRAY), so callers can raise a context-specific error.
 */
function readScalar(reader, t) {
    switch (t) {
        case GgufValueType.UINT8: return reader.readU8();
        case GgufValueType.INT8: return reader.readI8();
        case GgufValueType.UINT16: return reader.readU16();
        case GgufValueType.INT16: return reader.readI16();
        case GgufValueType.UINT32: return reader.readU32();
        case GgufValueType.INT32: return reader.readI32();
        case GgufValueType.FLOAT32: return reader.readF32();
        case GgufValueType.BOOL: return reader.readBool();
        case GgufValueType.STRING: return reader.readString();
        case GgufValueType.UINT64: return Number(reader.readU64());
        case GgufValueType.INT64: return Number(reader.readI64());
        case GgufValueType.FLOAT64: return reader.readF64();
        default: return undefined;
    }
}

/**
 * Read a single GGUF typed value (scalar or array) from the buffer.
 * Layout: [type u32] then either the scalar payload, or for arrays
 * [element type u32][count u64][elements...]. Arrays of arrays are rejected.
 * @throws {Error} on an unknown value type or array element type.
 */
function readGgufValue(reader) {
    const valueType = reader.readU32();
    if (valueType === GgufValueType.ARRAY) {
        const elemType = reader.readU32();
        const len = reader.readU64AsNumber();
        const arr = [];
        for (let i = 0; i < len; i++) {
            const v = readScalar(reader, elemType);
            if (v === undefined)
                throw new Error(`Unknown GGUF array element type: ${elemType}`);
            arr.push(v);
        }
        return arr;
    }
    const v = readScalar(reader, valueType);
    if (v === undefined)
        throw new Error(`Unknown GGUF value type: ${valueType}`);
    return v;
}

// ── GGUF Header Parsing ─────────────────────────────────────
/**
 * Parse the header and metadata from a GGUF file without loading tensors.
 * Reads only the first 256 KB of the file.
 *
 * @param {string} path - path to the .gguf file
 * @returns {Promise<object>} parsed header fields plus the raw `metadata` map
 * @throws {Error} when the file is too small, has a bad magic, or an unsupported version
 */
export async function parseGgufHeader(path) {
    const fileInfo = await fsStat(path);
    const readSize = Math.min(fileInfo.size, HEADER_READ_WINDOW);
    const fh = await open(path, 'r');
    try {
        const buf = Buffer.alloc(readSize);
        // A read may return fewer bytes than requested; parse only what was
        // actually read so zero padding is never mistaken for metadata.
        const { bytesRead } = await fh.read(buf, 0, readSize, 0);
        return parseGgufBuffer(buf.subarray(0, bytesRead), fileInfo.size, path);
    }
    finally {
        await fh.close();
    }
}

/**
 * Parse a GGUF header from an in-memory window of the file.
 * Metadata parsing stops quietly at the first entry that cannot be read in
 * full, because the window may end before the metadata section does.
 *
 * @param {Buffer} buf - leading bytes of the file
 * @param {number} fileSize - total file size, reported back as `fileSize`
 * @param {string} filePath - used only to guess the quantization from the file name
 */
function parseGgufBuffer(buf, fileSize, filePath) {
    if (buf.length < GGUF_FIXED_HEADER_BYTES) {
        throw new Error(`GGUF file too small: ${buf.length} bytes (fixed header is ${GGUF_FIXED_HEADER_BYTES} bytes)`);
    }
    const reader = new BufferReader(buf);
    const magic = reader.readU32();
    if (magic !== GGUF_MAGIC) {
        throw new Error(`Invalid GGUF magic: 0x${magic.toString(16)} (expected 0x${GGUF_MAGIC.toString(16)})`);
    }
    const version = reader.readU32();
    if (version < 2 || version > 3) {
        throw new Error(`Unsupported GGUF version: ${version} (expected 2 or 3)`);
    }
    const tensorCount = reader.readU64AsNumber();
    const kvCount = reader.readU64AsNumber();
    const metadata = {};
    for (let i = 0; i < kvCount; i++) {
        // Smallest possible entry: key length u64 (8) + value type u32 (4).
        if (reader.remaining < 12)
            break;
        try {
            const key = reader.readString();
            metadata[key] = readGgufValue(reader);
        }
        catch {
            break; // reached end of read window
        }
    }
    const arch = asString(metadata['general.architecture']);
    const pfx = arch || 'llama'; // fallback prefix for well-known keys
    return {
        magic: 'GGUF',
        version,
        tensorCount,
        kvCount,
        architecture: arch,
        name: asString(metadata['general.name']),
        contextLength: asNumber(metadata[`${pfx}.context_length`]),
        embeddingLength: asNumber(metadata[`${pfx}.embedding_length`]),
        blockCount: asNumber(metadata[`${pfx}.block_count`]),
        vocabSize: inferVocabSize(metadata),
        quantization: inferQuantFromMetadata(metadata, filePath),
        fileSize,
        metadata,
    };
}

// ── Metadata Helpers ────────────────────────────────────────
/** Return `v` when it is a string, otherwise `undefined`. */
function asString(v) {
    return typeof v === 'string' ? v : undefined;
}

/** Return `v` when it is a number, otherwise `undefined`. */
function asNumber(v) {
    return typeof v === 'number' ? v : undefined;
}

/** File-name patterns tried in order; the first match names the quantization. */
const QUANT_RE = [
    [/q2_k/i, 'Q2_K'], [/q3_k_s/i, 'Q3_K_S'], [/q3_k_m/i, 'Q3_K_M'], [/q3_k_l/i, 'Q3_K_L'],
    [/q4_k_s/i, 'Q4_K_S'], [/q4_k_m/i, 'Q4_K_M'], [/q4_0/i, 'Q4_0'], [/q4_1/i, 'Q4_1'],
    [/q5_k_s/i, 'Q5_K_S'], [/q5_k_m/i, 'Q5_K_M'], [/q5_0/i, 'Q5_0'], [/q5_1/i, 'Q5_1'],
    [/q6_k/i, 'Q6_K'], [/q8_0/i, 'Q8_0'], [/f16/i, 'F16'], [/f32/i, 'F32'],
];

/**
 * Describe the model's quantization.
 * Prefers the numeric `general.file_type` metadata (reported as `file_type_<n>`),
 * then falls back to matching the file name, then to 'unknown'.
 */
function inferQuantFromMetadata(meta, filePath) {
    const ft = meta['general.file_type'];
    if (typeof ft === 'number')
        return `file_type_${ft}`;
    const name = basename(filePath);
    for (const [re, label] of QUANT_RE)
        if (re.test(name))
            return label;
    return 'unknown';
}

/** Vocabulary size: length of the token list if present, else `tokenizer.ggml.vocab_size`. */
function inferVocabSize(meta) {
    const tokens = meta['tokenizer.ggml.tokens'];
    if (Array.isArray(tokens))
        return tokens.length;
    return asNumber(meta['tokenizer.ggml.vocab_size']);
}

// ── Resource Helpers ────────────────────────────────────────
/**
 * Best-effort disposal of an optional node-llama-cpp resource.
 * Does nothing when the value is nullish or has no `dispose` member; disposal
 * errors are deliberately ignored because callers are already on a cleanup path.
 */
async function disposeQuietly(resource) {
    if (!resource?.dispose)
        return;
    try {
        await resource.dispose();
    }
    catch { /* ignore */ }
}

/**
 * Token count for a finished generation: the sequence's `tokenCount` when it
 * exposes one, otherwise the ~4 chars per token heuristic.
 */
function countTokens(sequence, text) {
    const estimate = Math.ceil(text.length / 4); // ~4 chars per token heuristic
    try {
        return sequence.tokenCount ?? estimate;
    }
    catch {
        return estimate;
    }
}

// ── GGUF Engine ─────────────────────────────────────────────
/**
 * Loads GGUF models and generates text through node-llama-cpp when that
 * optional peer can be imported; otherwise operates in metadata-only mode.
 * Also owns a small in-memory key/value cache that can be persisted to disk.
 */
export class GgufEngine {
    config;
    llamaCpp = null;
    llamaModel = null;
    llamaContext = null;
    loadedModels = new Map();
    activeModelPath = null;
    kvCache = new Map();
    /**
     * @param {object} [config]
     * @param {number} [config.contextSize=4096]
     * @param {number} [config.maxTokens=512]
     * @param {number} [config.temperature=0.7]
     * @param {string} [config.kvCachePath=''] - default path for KV cache persistence
     * @param {boolean} [config.verbose=false] - log progress and swallowed errors
     */
    constructor(config = {}) {
        this.config = {
            contextSize: config.contextSize ?? 4096,
            maxTokens: config.maxTokens ?? 512,
            temperature: config.temperature ?? 0.7,
            kvCachePath: config.kvCachePath ?? '',
            verbose: config.verbose ?? false,
        };
    }
    /** Probe for node-llama-cpp availability. */
    async initialize() {
        this.llamaCpp = await this.tryLoadLlamaCpp();
        if (this.config.verbose) {
            console.log(`[gguf-engine] node-llama-cpp: ${this.llamaCpp ? 'available' : 'not found (metadata-only mode)'}`);
        }
    }
    /**
     * Parse GGUF header and optionally load the model for inference.
     * The parsed metadata is recorded and the path becomes the active model
     * even when node-llama-cpp is missing or fails to load the weights.
     *
     * @param {string} path
     * @returns {Promise<object>} the parsed header (see parseGgufHeader)
     */
    async loadModel(path) {
        const meta = await parseGgufHeader(path);
        this.loadedModels.set(path, meta);
        this.activeModelPath = path;
        if (this.llamaCpp) {
            // Free whatever was loaded before; overwriting the references
            // would otherwise leave the old model and context undisposed.
            await this.releaseLlamaResources();
            try {
                const { getLlama } = this.llamaCpp;
                const llama = await getLlama();
                this.llamaModel = await llama.loadModel({ modelPath: path });
                this.llamaContext = await this.llamaModel.createContext({ contextSize: this.config.contextSize });
                if (this.config.verbose)
                    console.log(`[gguf-engine] Model loaded: ${basename(path)}`);
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] node-llama-cpp load failed:', err);
                // The model may have loaded before context creation failed.
                await this.releaseLlamaResources();
            }
        }
        return meta;
    }
    /**
     * Generate text. Delegates to node-llama-cpp or returns a metadata-only stub.
     *
     * @param {object} request
     * @param {string} request.prompt
     * @param {string} [request.model] - only affects the reported model name / metadata lookup
     * @param {number} [request.maxTokens]
     * @param {number} [request.temperature]
     * @param {string[]} [request.stopSequences]
     * @returns {Promise<{text: string, model: string, tokensUsed: number, latencyMs: number, metadataOnly: boolean}>}
     */
    async generate(request) {
        const start = performance.now();
        const modelPath = request.model ?? this.activeModelPath;
        const modelName = modelPath ? basename(modelPath) : 'none';
        if (this.llamaContext && this.llamaModel) {
            let sequence;
            try {
                // One sequence per call: it backs the session, supplies the
                // token count, and is released in `finally`.
                sequence = this.llamaContext.getSequence();
                const session = new this.llamaCpp.LlamaChatSession({
                    contextSequence: sequence,
                });
                const text = await session.prompt(request.prompt, {
                    maxTokens: request.maxTokens ?? this.config.maxTokens,
                    temperature: request.temperature ?? this.config.temperature,
                    stopGenerationTrigger: request.stopSequences
                        ? request.stopSequences.map((s) => new this.llamaCpp.LlamaText([s]))
                        : undefined,
                });
                return {
                    text,
                    model: modelName,
                    tokensUsed: countTokens(sequence, text),
                    latencyMs: performance.now() - start,
                    metadataOnly: false,
                };
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] Generation failed:', err);
            }
            finally {
                // NOTE(review): relies on sequences exposing dispose(), as in
                // node-llama-cpp v3; skipped when the member is absent.
                await disposeQuietly(sequence);
            }
        }
        // Metadata-only fallback
        const meta = modelPath ? this.loadedModels.get(modelPath) : undefined;
        return {
            text: meta
                ? `[metadata-only] Model: ${meta.name ?? modelName}, arch: ${meta.architecture ?? 'unknown'}, ctx: ${meta.contextLength ?? 'unknown'}`
                : '[metadata-only] No model loaded',
            model: modelName,
            tokensUsed: 0,
            latencyMs: performance.now() - start,
            metadataOnly: true,
        };
    }
    /**
     * Stream tokens via async iterator. Falls back to yielding full response.
     *
     * When `promptWithMeta` returns an async iterable its chunks are forwarded.
     * When it returns a promise instead, the promise is awaited and its text
     * is yielded as one chunk, so the generation is neither abandoned nor run
     * twice. Any failure on the llama path falls back to `generate()`.
     *
     * @param {object} request - same shape as for `generate`
     * @yields {string}
     */
    async *stream(request) {
        if (this.llamaContext && this.llamaModel) {
            let sequence;
            try {
                sequence = this.llamaContext.getSequence();
                const session = new this.llamaCpp.LlamaChatSession({
                    contextSequence: sequence,
                });
                const pending = session.promptWithMeta(request.prompt, {
                    maxTokens: request.maxTokens ?? this.config.maxTokens,
                    temperature: request.temperature ?? this.config.temperature,
                });
                if (pending && typeof pending[Symbol.asyncIterator] === 'function') {
                    for await (const chunk of pending) {
                        if (typeof chunk === 'string')
                            yield chunk;
                        else if (chunk?.text)
                            yield chunk.text;
                    }
                    return;
                }
                // NOTE(review): node-llama-cpp v3 resolves to an object with
                // `responseText`; `text` and bare strings are accepted too.
                const result = await pending;
                const text = typeof result === 'string' ? result : (result?.responseText ?? result?.text);
                if (typeof text === 'string') {
                    yield text;
                    return;
                }
            }
            catch { /* fall through to single-chunk fallback */ }
            finally {
                // Runs before the fallback below, so generate() can obtain a sequence.
                await disposeQuietly(sequence);
            }
        }
        const response = await this.generate(request);
        yield response.text;
    }
    /**
     * Persist the KV cache to an RVF-compatible binary file.
     * Format: RVKV magic | version u32 | model SHA-256 (32B) | entry count u32
     *         entries: [key_len u32, val_len u32, key, val] | footer SHA-256 (32B)
     * The "model" hash is the SHA-256 of the active model path string (or
     * 'no-model'); the footer hash covers the entry bytes only.
     *
     * @param {string} [outputPath] - defaults to `config.kvCachePath`
     * @throws {Error} when no path is available
     */
    async persistKvCache(outputPath) {
        const path = outputPath || this.config.kvCachePath;
        if (!path)
            throw new Error('No KV cache output path specified');
        const modelHash = createHash('sha256').update(this.activeModelPath ?? 'no-model').digest();
        const entryBufs = [];
        for (const [key, value] of this.kvCache) {
            const keyBuf = Buffer.from(key, 'utf-8');
            const hdr = Buffer.alloc(8);
            hdr.writeUInt32LE(keyBuf.length, 0);
            hdr.writeUInt32LE(value.length, 4);
            entryBufs.push(hdr, keyBuf, value);
        }
        const entryData = Buffer.concat(entryBufs);
        const footer = createHash('sha256').update(entryData).digest();
        const header = Buffer.alloc(44);
        header.writeUInt32LE(RVKV_MAGIC, 0);
        header.writeUInt32LE(RVKV_VERSION, 4);
        modelHash.copy(header, 8);
        header.writeUInt32LE(this.kvCache.size, 40);
        await writeFile(path, Buffer.concat([header, entryData, footer]));
        if (this.config.verbose)
            console.log(`[gguf-engine] KV cache persisted: ${this.kvCache.size} entries`);
    }
    /**
     * Restore KV cache from an RVF-compatible binary file.
     * The in-memory cache is replaced only after the footer hash verifies.
     *
     * @param {string} inputPath
     * @throws {Error} on a short/truncated file, bad magic, unsupported version,
     *   missing footer, or hash mismatch
     */
    async loadKvCache(inputPath) {
        const data = await readFile(inputPath);
        if (data.length < 44)
            throw new Error('KV cache file too small');
        const magic = data.readUInt32LE(0);
        if (magic !== RVKV_MAGIC)
            throw new Error(`Invalid KV cache magic: 0x${magic.toString(16)}`);
        const version = data.readUInt32LE(4);
        if (version !== RVKV_VERSION)
            throw new Error(`Unsupported KV cache version: ${version}`);
        const entryCount = data.readUInt32LE(40);
        let offset = 44;
        const entries = new Map();
        for (let i = 0; i < entryCount; i++) {
            if (offset + 8 > data.length)
                throw new Error('KV cache file truncated');
            const keyLen = data.readUInt32LE(offset);
            const valLen = data.readUInt32LE(offset + 4);
            offset += 8;
            if (offset + keyLen + valLen > data.length)
                throw new Error('KV cache file truncated');
            // Copy the value so it does not keep the whole file buffer alive.
            entries.set(data.toString('utf-8', offset, offset + keyLen), Buffer.from(data.subarray(offset + keyLen, offset + keyLen + valLen)));
            offset += keyLen + valLen;
        }
        // Verify footer hash (mandatory)
        if (offset + 32 > data.length) {
            throw new Error('KV cache file missing SHA256 footer');
        }
        const stored = data.subarray(offset, offset + 32);
        const computed = createHash('sha256').update(data.subarray(44, offset)).digest();
        if (!stored.equals(computed))
            throw new Error('KV cache integrity check failed: hash mismatch');
        this.kvCache = entries;
        if (this.config.verbose)
            console.log(`[gguf-engine] KV cache loaded: ${entries.size} entries`);
    }
    /** Return metadata for all loaded models. */
    getLoadedModels() {
        return Array.from(this.loadedModels.values());
    }
    /** Store a key-value pair in the in-memory KV cache. */
    setKvEntry(key, value) {
        this.kvCache.set(key, value);
    }
    /** Retrieve a key-value pair from the in-memory KV cache. */
    getKvEntry(key) {
        return this.kvCache.get(key);
    }
    /** Release resources, unload models, and optionally persist the KV cache. */
    async shutdown() {
        if (this.config.kvCachePath && this.kvCache.size > 0) {
            try {
                await this.persistKvCache(this.config.kvCachePath);
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] KV persist failed:', err);
            }
        }
        await this.releaseLlamaResources();
        this.activeModelPath = null;
        this.loadedModels.clear();
        this.kvCache.clear();
        if (this.config.verbose)
            console.log('[gguf-engine] Shutdown complete');
    }
    // ── Private ───────────────────────────────────────────────
    /** Dispose the current context, then the model, and clear both references. */
    async releaseLlamaResources() {
        const context = this.llamaContext;
        const model = this.llamaModel;
        this.llamaContext = null;
        this.llamaModel = null;
        await disposeQuietly(context);
        await disposeQuietly(model);
    }
    /** Import the optional peer; resolves to `null` when it cannot be imported. */
    async tryLoadLlamaCpp() {
        // @ts-ignore -- optional peer dependency, may not be installed
        try {
            return await import('node-llama-cpp');
        }
        catch {
            return null;
        }
    }
}
//# sourceMappingURL=gguf-engine.js.map