node-llama-cpp

Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
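
As a rough illustration of that schema-enforcement claim, here is a minimal usage sketch before the module source below. It assumes the package's documented v3 entry points (getLlama, LlamaChatSession, session.prompt with a grammar option, and grammar.parse); createGrammarForJsonSchema itself is defined on the Llama class in the source that follows. The model path and the schema are purely illustrative, not part of this file.

import { getLlama, LlamaChatSession } from "node-llama-cpp";

// Illustrative model path; any local GGUF model file would do.
const llama = await getLlama();
const model = await llama.loadModel({ modelPath: "path/to/model.gguf" });
const context = await model.createContext();
const session = new LlamaChatSession({ contextSequence: context.getSequence() });

// createGrammarForJsonSchema compiles the schema into a grammar that
// constrains token sampling while the model generates its answer.
const grammar = await llama.createGrammarForJsonSchema({
    type: "object",
    properties: {
        name: { type: "string" },
        rating: { type: "number" }
    }
});

const response = await session.prompt("Review this restaurant in one line", { grammar });
console.log(grammar.parse(response)); // object that conforms to the schema

Because the grammar is applied during sampling, the model cannot emit tokens that leave the schema, rather than having its output validated after the fact.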

import os from "os";
import path from "path";
import chalk from "chalk";
import { DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { getConsoleLogPrefix } from "../utils/getConsoleLogPrefix.js";
import { LlamaModel } from "../evaluator/LlamaModel/LlamaModel.js";
import { DisposeGuard } from "../utils/DisposeGuard.js";
import { LlamaJsonSchemaGrammar } from "../evaluator/LlamaJsonSchemaGrammar.js";
import { LlamaGrammar } from "../evaluator/LlamaGrammar.js";
import { ThreadsSplitter } from "../utils/ThreadsSplitter.js";
import { getLlamaClasses } from "../utils/getLlamaClasses.js";
import { LlamaLocks, LlamaLogLevel, LlamaLogLevelGreaterThanOrEqual } from "./types.js";
import { MemoryOrchestrator } from "./utils/MemoryOrchestrator.js";
export const LlamaLogLevelToAddonLogLevel = new Map([
    [LlamaLogLevel.disabled, 0],
    [LlamaLogLevel.fatal, 1],
    [LlamaLogLevel.error, 2],
    [LlamaLogLevel.warn, 3],
    [LlamaLogLevel.info, 4],
    [LlamaLogLevel.log, 5],
    [LlamaLogLevel.debug, 6]
]);
const addonLogLevelToLlamaLogLevel = new Map([...LlamaLogLevelToAddonLogLevel.entries()].map(([key, value]) => [value, key]));
const defaultLogLevel = 5;
const defaultCPUMinThreadSplitterThreads = 4;
export class Llama {
    /** @internal */ _bindings;
    /** @internal */ _backendDisposeGuard = new DisposeGuard();
    /** @internal */ _memoryLock = {};
    /** @internal */ _consts;
    /** @internal */ _vramOrchestrator;
    /** @internal */ _vramPadding;
    /** @internal */ _ramOrchestrator;
    /** @internal */ _ramPadding;
    /** @internal */ _swapOrchestrator;
    /** @internal */ _debug;
    /** @internal */ _threadsSplitter;
    /** @internal */ _gpu;
    /** @internal */ _buildType;
    /** @internal */ _cmakeOptions;
    /** @internal */ _supportsGpuOffloading;
    /** @internal */ _supportsMmap;
    /** @internal */ _gpuSupportsMmap;
    /** @internal */ _supportsMlock;
    /** @internal */ _mathCores;
    /** @internal */ _llamaCppRelease;
    /** @internal */ _logger;
    /** @internal */ _logLevel;
    /** @internal */ _pendingLog = null;
    /** @internal */ _pendingLogLevel = null;
    /** @internal */ _logDispatchQueuedMicrotasks = 0;
    /** @internal */ _previousLog = null;
    /** @internal */ _previousLogLevel = null;
    /** @internal */ _nextLogNeedNewLine = false;
    /** @internal */ _disposed = false;
    _classes;
    onDispose = new EventRelay();
    constructor({ bindings, bindingPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, buildGpu, maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator }) {
        this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
        this._onAddonLog = this._onAddonLog.bind(this);
        this._bindings = bindings;
        this._debug = debug;
        this._logLevel = this._debug ? LlamaLogLevel.debug : (logLevel ?? LlamaLogLevel.debug);
        if (!this._debug) {
            this._bindings.setLogger(this._onAddonLog);
            this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
        }
        bindings.loadBackends();
        const loadedGpu = bindings.getGpuType();
        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
            bindings.loadBackends(path.dirname(bindingPath));
        bindings.ensureGpuDeviceIsSupported();
        this._gpu = bindings.getGpuType() ?? false;
        this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
        this._supportsMmap = bindings.getSupportsMmap();
        this._gpuSupportsMmap = bindings.getGpuSupportsMmap();
        this._supportsMlock = bindings.getSupportsMlock();
        this._mathCores = bindings.getMathCores();
        this._consts = bindings.getConsts();
        this._vramOrchestrator = vramOrchestrator;
        this._vramPadding = vramPadding;
        this._ramOrchestrator = ramOrchestrator;
        this._ramPadding = ramPadding;
        this._swapOrchestrator = swapOrchestrator;
        this._threadsSplitter = new ThreadsSplitter(maxThreads ?? (this._gpu === false ? Math.max(defaultCPUMinThreadSplitterThreads, this._mathCores) : 0));
        this._logger = logger;
        this._buildType = buildType;
        this._cmakeOptions = Object.freeze({ ...cmakeOptions });
        this._llamaCppRelease = Object.freeze({
            repo: llamaCppRelease.repo,
            release: llamaCppRelease.release
        });
        this._onExit = this._onExit.bind(this);
        process.on("exit", this._onExit);
    }
    async dispose() {
        if (this._disposed)
            return;
        this._disposed = true;
        this.onDispose.dispatchEvent();
        await this._backendDisposeGuard.acquireDisposeLock();
        await this._bindings.dispose();
    }
    /** @hidden */
    async [Symbol.asyncDispose]() {
        await this.dispose();
    }
    get disposed() {
        return this._disposed;
    }
    get classes() {
        if (this._classes == null)
            this._classes = getLlamaClasses();
        return this._classes;
    }
    get gpu() {
        return this._gpu;
    }
    get supportsGpuOffloading() {
        return this._supportsGpuOffloading;
    }
    get supportsMmap() {
        return this._supportsMmap;
    }
    get gpuSupportsMmap() {
        return this._gpuSupportsMmap;
    }
    get supportsMlock() {
        return this._supportsMlock;
    }
    /** The number of CPU cores that are useful for math */
    get cpuMathCores() {
        return this._mathCores;
    }
    /**
     * The maximum number of threads that can be used by the Llama instance.
     *
     * If set to `0`, the Llama instance will have no limit on the number of threads.
     *
     * See the `maxThreads` option of `getLlama` for more information.
     */
    get maxThreads() {
        return this._threadsSplitter.maxThreads;
    }
    set maxThreads(value) {
        this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
    }
    get logLevel() {
        return this._logLevel;
    }
    set logLevel(value) {
        this._ensureNotDisposed();
        if (value === this._logLevel || this._debug)
            return;
        this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(value) ?? defaultLogLevel);
        this._logLevel = value;
    }
    get logger() {
        return this._logger;
    }
    set logger(value) {
        this._logger = value;
        if (value !== Llama.defaultConsoleLogger)
            this._nextLogNeedNewLine = false;
    }
    get buildType() {
        return this._buildType;
    }
    get cmakeOptions() {
        return this._cmakeOptions;
    }
    get llamaCppRelease() {
        return this._llamaCppRelease;
    }
    get systemInfo() {
        this._ensureNotDisposed();
        return this._bindings.systemInfo();
    }
    /**
     * VRAM padding used for memory size calculations, as these calculations are not always accurate.
     * This is set by default to ensure stability, but can be configured when you call `getLlama`.
     *
     * See `vramPadding` on `getLlama` for more information.
     */
    get vramPaddingSize() {
        return this._vramPadding.size;
    }
    /**
     * The total amount of VRAM that is currently being used.
     *
     * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU.
     * On SoC devices, this is usually the same as `total`.
     */
    async getVramState() {
        this._ensureNotDisposed();
        const { total, used, unifiedSize } = this._bindings.getGpuVramInfo();
        return {
            total,
            used,
            free: Math.max(0, total - used),
            unifiedSize
        };
    }
    /**
     * Get the state of the swap memory.
     *
     * **`maxSize`** - The maximum size of the swap memory that the system can allocate.
     * If the swap size is dynamic (like on macOS), this will be `Infinity`.
     *
     * **`allocated`** - The total size allocated by the system for swap memory.
     *
     * **`used`** - The amount of swap memory that is currently being used from the `allocated` size.
     *
     * On Windows, this will return the info for the page file.
     */
    async getSwapState() {
        this._ensureNotDisposed();
        const { total, maxSize, free } = this._bindings.getSwapInfo();
        return {
            maxSize: maxSize === -1 ? Infinity : maxSize,
            allocated: total,
            used: total - free
        };
    }
    async getGpuDeviceNames() {
        this._ensureNotDisposed();
        const { deviceNames } = this._bindings.getGpuDeviceInfo();
        return deviceNames;
    }
    async loadModel(options) {
        this._ensureNotDisposed();
        return await withLock(this._memoryLock, LlamaLocks.loadToMemory, options.loadSignal, async () => {
            this._ensureNotDisposed();
            const preventDisposalHandle = this._backendDisposeGuard.createPreventDisposalHandle();
            try {
                return await LlamaModel._create(options, { _llama: this });
            }
            finally {
                preventDisposalHandle.dispose();
            }
        });
    }
    /* eslint-disable @stylistic/max-len */
    /**
     * @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
     * @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
     */
    async createGrammarForJsonSchema(schema) {
        return new LlamaJsonSchemaGrammar(this, schema);
    }
    /* eslint-enable @stylistic/max-len */
    async getGrammarFor(type) {
        return await LlamaGrammar.getFor(this, type);
    }
    /**
     * @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
     */
    async createGrammar(options) {
        return new LlamaGrammar(this, options);
    }
    /** @internal */
    async _init() {
        await this._bindings.init();
    }
    /**
     * Log messages related to the Llama instance
     * @internal
     */
    _log(level, message) {
        this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n");
    }
    /** @internal */
    _onAddonLog(level, message) {
        const llamaLogLevel = addonLogLevelToLlamaLogLevel.get(level) ?? LlamaLogLevel.fatal;
        if (this._pendingLog != null && this._pendingLogLevel != null && this._pendingLogLevel != llamaLogLevel) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
        const sourceMessage = (this._pendingLog ?? "") + message;
        const lastNewLineIndex = sourceMessage.lastIndexOf("\n");
        const currentLog = lastNewLineIndex < 0
            ? sourceMessage
            : sourceMessage.slice(0, lastNewLineIndex);
        const nextLog = lastNewLineIndex < 0
            ? ""
            : sourceMessage.slice(lastNewLineIndex + 1);
        if (currentLog !== "")
            this._callLogger(llamaLogLevel, currentLog);
        if (nextLog !== "") {
            this._pendingLog = nextLog;
            this._pendingLogLevel = llamaLogLevel;
            queueMicrotask(this._dispatchPendingLogMicrotask);
            this._logDispatchQueuedMicrotasks++;
        }
        else
            this._pendingLog = null;
    }
    /** @internal */
    _dispatchPendingLogMicrotask() {
        this._logDispatchQueuedMicrotasks--;
        if (this._logDispatchQueuedMicrotasks !== 0)
            return;
        if (this._pendingLog != null && this._pendingLogLevel != null) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
    }
    /** @internal */
    _callLogger(level, message) {
        // llama.cpp uses dots to indicate progress, so we don't want to print them as different lines,
        // and instead, append to the same log line
        if (logMessageIsOnlyDots(message) && this._logger === Llama.defaultConsoleLogger) {
            if (logMessageIsOnlyDots(this._previousLog) && level === this._previousLogLevel) {
                process.stdout.write(message);
            }
            else {
                this._nextLogNeedNewLine = true;
                process.stdout.write(prefixAndColorMessage(message, getColorForLogLevel(level)));
            }
        }
        else {
            if (this._nextLogNeedNewLine) {
                process.stdout.write("\n");
                this._nextLogNeedNewLine = false;
            }
            try {
                const transformedLogLevel = getTransformedLogLevel(level, message);
                if (LlamaLogLevelGreaterThanOrEqual(transformedLogLevel, this._logLevel))
                    this._logger(transformedLogLevel, message);
            }
            catch (err) {
                // the native addon code calls this function, so there's no use to throw an error here
            }
        }
        this._previousLog = message;
        this._previousLogLevel = level;
    }
    /** @internal */
    _onExit() {
        if (this._pendingLog != null && this._pendingLogLevel != null) {
            this._callLogger(this._pendingLogLevel, this._pendingLog);
            this._pendingLog = null;
        }
    }
    /** @internal */
    _ensureNotDisposed() {
        if (this._disposed)
            throw new DisposedError();
    }
    /** @internal */
    static async _create({ bindings, bindingPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug }) {
        const vramOrchestrator = new MemoryOrchestrator(() => {
            const { total, used, unifiedSize } = bindings.getGpuVramInfo();
            return {
                total,
                free: Math.max(0, total - used),
                unifiedSize
            };
        });
        const ramOrchestrator = new MemoryOrchestrator(() => {
            const used = process.memoryUsage().rss;
            const total = os.totalmem();
            return {
                total,
                free: Math.max(0, total - used),
                unifiedSize: total
            };
        });
        const swapOrchestrator = new MemoryOrchestrator(() => {
            const { total, maxSize, free } = bindings.getSwapInfo();
            const used = total - free;
            if (maxSize === -1)
                return {
                    total: Infinity,
                    free: Infinity,
                    unifiedSize: Infinity
                };
            return {
                total: maxSize,
                free: maxSize - used,
                unifiedSize: maxSize
            };
        });
        let resolvedRamPadding;
        if (ramPadding instanceof Function)
            resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total));
        else
            resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding);
        const llama = new Llama({
            bindings,
            bindingPath,
            buildType,
            cmakeOptions: buildMetadata.buildOptions.customCmakeOptions,
            llamaCppRelease: {
                repo: buildMetadata.buildOptions.llamaCpp.repo,
                release: buildMetadata.buildOptions.llamaCpp.release
            },
            logLevel,
            logger,
            debug,
            buildGpu: buildMetadata.buildOptions.gpu,
            vramOrchestrator,
            maxThreads,
            vramPadding: vramOrchestrator.reserveMemory(0),
            ramOrchestrator,
            ramPadding: resolvedRamPadding,
            swapOrchestrator
        });
        if (llama.gpu === false || vramPadding === 0) {
            // do nothing since `llama._vramPadding` is already set to 0
        }
        else if (vramPadding instanceof Function) {
            const currentVramPadding = llama._vramPadding;
            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total));
            currentVramPadding.dispose();
        }
        else {
            const currentVramPadding = llama._vramPadding;
            llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding);
            currentVramPadding.dispose();
        }
        if (!skipLlamaInit)
            await llama._init();
        return llama;
    }
    static defaultConsoleLogger(level, message) {
        switch (level) {
            case LlamaLogLevel.disabled:
                break;
            case LlamaLogLevel.fatal:
                // we don't use console.error here because it prints the stack trace
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.error:
                // we don't use console.error here because it prints the stack trace
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.warn:
                console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.info:
                console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.log:
                console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            case LlamaLogLevel.debug:
                console.debug(prefixAndColorMessage(message, getColorForLogLevel(level)));
                break;
            default:
                void level;
                console.warn(getConsoleLogPrefix() + getColorForLogLevel(LlamaLogLevel.warn)(`Unknown log level: ${level}`));
                console.log(prefixAndColorMessage(message, getColorForLogLevel(level)));
        }
    }
}
function getColorForLogLevel(level) {
    switch (level) {
        case LlamaLogLevel.disabled:
            return chalk.whiteBright;
        case LlamaLogLevel.fatal:
            return chalk.redBright;
        case LlamaLogLevel.error:
            return chalk.red;
        case LlamaLogLevel.warn:
            return chalk.yellow;
        case LlamaLogLevel.info:
            return chalk.whiteBright;
        case LlamaLogLevel.log:
            return chalk.white;
        case LlamaLogLevel.debug:
            return chalk.gray;
        default:
            void level;
            return chalk.whiteBright;
    }
}
function prefixAndColorMessage(message, color) {
    return getConsoleLogPrefix() + (message
        .split("\n")
        .map((line) => color(line))
        .join("\n" + getConsoleLogPrefix()));
}
function logMessageIsOnlyDots(message) {
    if (message == null)
        return false;
    for (let i = 0; i < message.length; i++) {
        if (message[i] !== ".")
            return false;
    }
    return true;
}
function getTransformedLogLevel(level, message) {
    if (level === LlamaLogLevel.warn && message.endsWith("the full capacity of the model will not be utilized"))
        return LlamaLogLevel.info;
    else if (level === LlamaLogLevel.warn && message.startsWith("ggml_metal_init: skipping kernel_") && message.endsWith("(not supported)"))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.warn && message.startsWith("ggml_cuda_init: GGML_CUDA_FORCE_") && message.endsWith(" no"))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.info && message.startsWith("load_backend: loaded "))
        return LlamaLogLevel.log;
    else if (level === LlamaLogLevel.warn && message.startsWith("make_cpu_buft_list: disabling extra buffer types"))
        return LlamaLogLevel.info;
    return level;
}
//# sourceMappingURL=Llama.js.map
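
The class above also exposes several introspection APIs (getVramState, getSwapState, cpuMathCores, the maxThreads and logger setters). The sketch below shows how they might be used on a Llama instance; it assumes getLlama and LlamaLogLevel are the package's public exports (the JSDoc above refers to getLlama), and the result shapes follow the return statements in the source.

import { getLlama, LlamaLogLevel } from "node-llama-cpp";

// getLlama() resolves to a Llama instance like the class above.
const llama = await getLlama();

console.log("GPU backend:", llama.gpu); // a GPU type string, or false for CPU-only builds
console.log("Math cores:", llama.cpuMathCores);

// Shapes match getVramState() and getSwapState() in the class above.
const vram = await llama.getVramState();
console.log("VRAM used/total:", vram.used, "/", vram.total, "unified:", vram.unifiedSize);

const swap = await llama.getSwapState();
console.log("Swap used/allocated:", swap.used, "/", swap.allocated, "max:", swap.maxSize);

// The setter clamps to a non-negative integer; 0 means "no thread limit".
llama.maxThreads = 4;

// Replace the default console logger with a custom one.
llama.logger = (level, message) => {
    if (level !== LlamaLogLevel.debug)
        console.log(`[llama:${level}]`, message);
};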