node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp, and enforce a JSON schema on the model output at the generation level.
JavaScript
import os from "os";
import path from "path";
import chalk from "chalk";
import { DisposedError, EventRelay, withLock } from "lifecycle-utils";
import { getConsoleLogPrefix } from "../utils/getConsoleLogPrefix.js";
import { LlamaModel } from "../evaluator/LlamaModel/LlamaModel.js";
import { DisposeGuard } from "../utils/DisposeGuard.js";
import { LlamaJsonSchemaGrammar } from "../evaluator/LlamaJsonSchemaGrammar.js";
import { LlamaGrammar } from "../evaluator/LlamaGrammar.js";
import { ThreadsSplitter } from "../utils/ThreadsSplitter.js";
import { getLlamaClasses } from "../utils/getLlamaClasses.js";
import { LlamaLocks, LlamaLogLevel, LlamaLogLevelGreaterThanOrEqual } from "./types.js";
import { MemoryOrchestrator } from "./utils/MemoryOrchestrator.js";
export const LlamaLogLevelToAddonLogLevel = new Map([
[LlamaLogLevel.disabled, 0],
[LlamaLogLevel.fatal, 1],
[LlamaLogLevel.error, 2],
[LlamaLogLevel.warn, 3],
[LlamaLogLevel.info, 4],
[LlamaLogLevel.log, 5],
[LlamaLogLevel.debug, 6]
]);
const addonLogLevelToLlamaLogLevel = new Map([...LlamaLogLevelToAddonLogLevel.entries()].map(([key, value]) => [value, key]));
const defaultLogLevel = 5;
const defaultCPUMinThreadSplitterThreads = 4;
export class Llama {
/** @internal */ _bindings;
/** @internal */ _backendDisposeGuard = new DisposeGuard();
/** @internal */ _memoryLock = {};
/** @internal */ _consts;
/** @internal */ _vramOrchestrator;
/** @internal */ _vramPadding;
/** @internal */ _ramOrchestrator;
/** @internal */ _ramPadding;
/** @internal */ _swapOrchestrator;
/** @internal */ _debug;
/** @internal */ _threadsSplitter;
/** @internal */ _gpu;
/** @internal */ _buildType;
/** @internal */ _cmakeOptions;
/** @internal */ _supportsGpuOffloading;
/** @internal */ _supportsMmap;
/** @internal */ _gpuSupportsMmap;
/** @internal */ _supportsMlock;
/** @internal */ _mathCores;
/** @internal */ _llamaCppRelease;
/** @internal */ _logger;
/** @internal */ _logLevel;
/** @internal */ _pendingLog = null;
/** @internal */ _pendingLogLevel = null;
/** @internal */ _logDispatchQueuedMicrotasks = 0;
/** @internal */ _previousLog = null;
/** @internal */ _previousLogLevel = null;
/** @internal */ _nextLogNeedNewLine = false;
/** @internal */ _disposed = false;
_classes;
onDispose = new EventRelay();
constructor({ bindings, bindingPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, buildGpu, maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator }) {
this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
this._onAddonLog = this._onAddonLog.bind(this);
this._bindings = bindings;
this._debug = debug;
this._logLevel = this._debug
? LlamaLogLevel.debug
: (logLevel ?? LlamaLogLevel.debug);
if (!this._debug) {
this._bindings.setLogger(this._onAddonLog);
this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
}
bindings.loadBackends();
const loadedGpu = bindings.getGpuType();
if (loadedGpu == null || (loadedGpu === false && buildGpu !== false))
bindings.loadBackends(path.dirname(bindingPath));
bindings.ensureGpuDeviceIsSupported();
this._gpu = bindings.getGpuType() ?? false;
this._supportsGpuOffloading = bindings.getSupportsGpuOffloading();
this._supportsMmap = bindings.getSupportsMmap();
this._gpuSupportsMmap = bindings.getGpuSupportsMmap();
this._supportsMlock = bindings.getSupportsMlock();
this._mathCores = bindings.getMathCores();
this._consts = bindings.getConsts();
this._vramOrchestrator = vramOrchestrator;
this._vramPadding = vramPadding;
this._ramOrchestrator = ramOrchestrator;
this._ramPadding = ramPadding;
this._swapOrchestrator = swapOrchestrator;
this._threadsSplitter = new ThreadsSplitter(maxThreads ?? (this._gpu === false
? Math.max(defaultCPUMinThreadSplitterThreads, this._mathCores)
: 0));
this._logger = logger;
this._buildType = buildType;
this._cmakeOptions = Object.freeze({ ...cmakeOptions });
this._llamaCppRelease = Object.freeze({
repo: llamaCppRelease.repo,
release: llamaCppRelease.release
});
this._onExit = this._onExit.bind(this);
process.on("exit", this._onExit);
}
async dispose() {
if (this._disposed)
return;
this._disposed = true;
this.onDispose.dispatchEvent();
await this._backendDisposeGuard.acquireDisposeLock();
await this._bindings.dispose();
}
/** @hidden */
async [Symbol.asyncDispose]() {
await this.dispose();
}
get disposed() {
return this._disposed;
}
get classes() {
if (this._classes == null)
this._classes = getLlamaClasses();
return this._classes;
}
get gpu() {
return this._gpu;
}
get supportsGpuOffloading() {
return this._supportsGpuOffloading;
}
get supportsMmap() {
return this._supportsMmap;
}
get gpuSupportsMmap() {
return this._gpuSupportsMmap;
}
get supportsMlock() {
return this._supportsMlock;
}
/** The number of CPU cores that are useful for math */
get cpuMathCores() {
return this._mathCores;
}
/**
* The maximum number of threads that can be used by the Llama instance.
*
* If set to `0`, the Llama instance will have no limit on the number of threads.
*
* See the `maxThreads` option of `getLlama` for more information.
*/
get maxThreads() {
return this._threadsSplitter.maxThreads;
}
set maxThreads(value) {
this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
}
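// A minimal usage sketch for the thread limit above; `getLlama` and its `maxThreads`
// option are assumed from the package entry point rather than from this compiled file:
//
//   import { getLlama } from "node-llama-cpp";
//
//   const llama = await getLlama({ maxThreads: 4 }); // illustrative limit
//   console.log(llama.maxThreads, llama.cpuMathCores);
//   llama.maxThreads = 0; // `0` removes the limit, per the setter above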
get logLevel() {
return this._logLevel;
}
set logLevel(value) {
this._ensureNotDisposed();
if (value === this._logLevel || this._debug)
return;
this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(value) ?? defaultLogLevel);
this._logLevel = value;
}
get logger() {
return this._logger;
}
set logger(value) {
this._logger = value;
if (value !== Llama.defaultConsoleLogger)
this._nextLogNeedNewLine = false;
}
get buildType() {
return this._buildType;
}
get cmakeOptions() {
return this._cmakeOptions;
}
get llamaCppRelease() {
return this._llamaCppRelease;
}
get systemInfo() {
this._ensureNotDisposed();
return this._bindings.systemInfo();
}
/**
* VRAM padding used for memory size calculations, as these calculations are not always accurate.
* This is set by default to ensure stability, but can be configured when you call `getLlama`.
*
* See `vramPadding` on `getLlama` for more information.
*/
get vramPaddingSize() {
return this._vramPadding.size;
}
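// Sketch of configuring the padding described above (assumption: the `vramPadding` option of
// `getLlama` accepts a byte count or a function of total VRAM; the fraction is illustrative):
//
//   const llama = await getLlama({ vramPadding: (totalVram) => Math.floor(totalVram * 0.05) });
//   console.log(llama.vramPaddingSize);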
/**
 * The current VRAM state: the `total`, `used`, and `free` amounts of VRAM.
 *
 * `unifiedSize` represents the amount of VRAM that is shared between the CPU and GPU.
 * On SoC devices, this is usually the same as `total`.
 */
async getVramState() {
this._ensureNotDisposed();
const { total, used, unifiedSize } = this._bindings.getGpuVramInfo();
return {
total,
used,
free: Math.max(0, total - used),
unifiedSize
};
}
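// Example of reading the state returned above (all values are in bytes):
//
//   const { total, used, free, unifiedSize } = await llama.getVramState();
//   console.log(`VRAM: ${used}/${total} bytes used, ${free} free (unified: ${unifiedSize})`);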
/**
* Get the state of the swap memory.
*
* **`maxSize`** - The maximum size of the swap memory that the system can allocate.
* If the swap size is dynamic (like on macOS), this will be `Infinity`.
*
* **`allocated`** - The total size allocated by the system for swap memory.
*
* **`used`** - The amount of swap memory that is currently being used from the `allocated` size.
*
* On Windows, this will return the info for the page file.
*/
async getSwapState() {
this._ensureNotDisposed();
const { total, maxSize, free } = this._bindings.getSwapInfo();
return {
maxSize: maxSize === -1
? Infinity
: maxSize,
allocated: total,
used: total - free
};
}
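// Example of reading the swap state returned above; `maxSize` may be `Infinity` on systems
// with a dynamically sized swap (such as macOS), as noted in the doc comment:
//
//   const { maxSize, allocated, used } = await llama.getSwapState();
//   console.log(`Swap: ${used}/${allocated} bytes used (max: ${maxSize})`);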
async getGpuDeviceNames() {
this._ensureNotDisposed();
const { deviceNames } = this._bindings.getGpuDeviceInfo();
return deviceNames;
}
async loadModel(options) {
this._ensureNotDisposed();
return await withLock(this._memoryLock, LlamaLocks.loadToMemory, options.loadSignal, async () => {
this._ensureNotDisposed();
const preventDisposalHandle = this._backendDisposeGuard.createPreventDisposalHandle();
try {
return await LlamaModel._create(options, { _llama: this });
}
finally {
preventDisposalHandle.dispose();
}
});
}
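// Model loading sketch (assumptions: `getLlama` comes from the package entry point and the
// model path is a placeholder for a local GGUF file):
//
//   const llama = await getLlama();
//   const model = await llama.loadModel({ modelPath: "path/to/model.gguf" });
//   const context = await model.createContext();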
/* eslint-disable @stylistic/max-len */
/**
* @see [Using a JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#json-schema) tutorial
* @see [Reducing Hallucinations When Using JSON Schema Grammar](https://node-llama-cpp.withcat.ai/guide/grammar#reducing-json-schema-hallucinations) tutorial
*/
async createGrammarForJsonSchema(schema) {
return new LlamaJsonSchemaGrammar(this, schema);
}
/* eslint-enable @stylistic/max-len */
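// Sketch of constraining output with the JSON schema grammar created above (assumption:
// `session` is a `LlamaChatSession` created elsewhere; the schema is illustrative):
//
//   const grammar = await llama.createGrammarForJsonSchema({
//       type: "object",
//       properties: { answer: { type: "string" } }
//   });
//   const response = await session.prompt("Answer in JSON", { grammar });
//   const parsed = grammar.parse(response); // parsed according to the schema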
async getGrammarFor(type) {
return await LlamaGrammar.getFor(this, type);
}
/**
* @see [Using Grammar](https://node-llama-cpp.withcat.ai/guide/grammar) tutorial
*/
async createGrammar(options) {
return new LlamaGrammar(this, options);
}
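// Sketch of the other two grammar entry points (assumptions: "json" is one of the bundled
// grammar names accepted by `getGrammarFor`, and the GBNF string is illustrative):
//
//   const jsonGrammar = await llama.getGrammarFor("json");
//   const customGrammar = await llama.createGrammar({
//       grammar: 'root ::= "yes" | "no"'
//   });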
/** @internal */
async _init() {
await this._bindings.init();
}
/**
* Log messages related to the Llama instance
* @internal
*/
_log(level, message) {
this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n");
}
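// The addon can emit partial lines; `_onAddonLog` buffers everything after the last newline
// and flushes it on a queued microtask (or on process exit) so split messages log as one line.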
/** @internal */
_onAddonLog(level, message) {
const llamaLogLevel = addonLogLevelToLlamaLogLevel.get(level) ?? LlamaLogLevel.fatal;
if (this._pendingLog != null && this._pendingLogLevel != null && this._pendingLogLevel != llamaLogLevel) {
this._callLogger(this._pendingLogLevel, this._pendingLog);
this._pendingLog = null;
}
const sourceMessage = (this._pendingLog ?? "") + message;
const lastNewLineIndex = sourceMessage.lastIndexOf("\n");
const currentLog = lastNewLineIndex < 0
? sourceMessage
: sourceMessage.slice(0, lastNewLineIndex);
const nextLog = lastNewLineIndex < 0
? ""
: sourceMessage.slice(lastNewLineIndex + 1);
if (currentLog !== "")
this._callLogger(llamaLogLevel, currentLog);
if (nextLog !== "") {
this._pendingLog = nextLog;
this._pendingLogLevel = llamaLogLevel;
queueMicrotask(this._dispatchPendingLogMicrotask);
this._logDispatchQueuedMicrotasks++;
}
else
this._pendingLog = null;
}
/** @internal */
_dispatchPendingLogMicrotask() {
this._logDispatchQueuedMicrotasks--;
if (this._logDispatchQueuedMicrotasks !== 0)
return;
if (this._pendingLog != null && this._pendingLogLevel != null) {
this._callLogger(this._pendingLogLevel, this._pendingLog);
this._pendingLog = null;
}
}
/** @internal */
_callLogger(level, message) {
// llama.cpp uses dots to indicate progress, so instead of printing each dot as a separate line,
// we append them to the same log line
if (logMessageIsOnlyDots(message) && this._logger === Llama.defaultConsoleLogger) {
if (logMessageIsOnlyDots(this._previousLog) && level === this._previousLogLevel) {
process.stdout.write(message);
}
else {
this._nextLogNeedNewLine = true;
process.stdout.write(prefixAndColorMessage(message, getColorForLogLevel(level)));
}
}
else {
if (this._nextLogNeedNewLine) {
process.stdout.write("\n");
this._nextLogNeedNewLine = false;
}
try {
const transformedLogLevel = getTransformedLogLevel(level, message);
if (LlamaLogLevelGreaterThanOrEqual(transformedLogLevel, this._logLevel))
this._logger(transformedLogLevel, message);
}
catch (err) {
// the native addon code calls this function, so there's no point in throwing an error here
}
}
this._previousLog = message;
this._previousLogLevel = level;
}
/** @internal */
_onExit() {
if (this._pendingLog != null && this._pendingLogLevel != null) {
this._callLogger(this._pendingLogLevel, this._pendingLog);
this._pendingLog = null;
}
}
/** @internal */
_ensureNotDisposed() {
if (this._disposed)
throw new DisposedError();
}
/** @internal */
static async _create({ bindings, bindingPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, debug }) {
const vramOrchestrator = new MemoryOrchestrator(() => {
const { total, used, unifiedSize } = bindings.getGpuVramInfo();
return {
total,
free: Math.max(0, total - used),
unifiedSize
};
});
const ramOrchestrator = new MemoryOrchestrator(() => {
const used = process.memoryUsage().rss;
const total = os.totalmem();
return {
total,
free: Math.max(0, total - used),
unifiedSize: total
};
});
const swapOrchestrator = new MemoryOrchestrator(() => {
const { total, maxSize, free } = bindings.getSwapInfo();
const used = total - free;
if (maxSize === -1)
return {
total: Infinity,
free: Infinity,
unifiedSize: Infinity
};
return {
total: maxSize,
free: maxSize - used,
unifiedSize: maxSize
};
});
let resolvedRamPadding;
if (ramPadding instanceof Function)
resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total));
else
resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding);
const llama = new Llama({
bindings,
bindingPath,
buildType,
cmakeOptions: buildMetadata.buildOptions.customCmakeOptions,
llamaCppRelease: {
repo: buildMetadata.buildOptions.llamaCpp.repo,
release: buildMetadata.buildOptions.llamaCpp.release
},
logLevel,
logger,
debug,
buildGpu: buildMetadata.buildOptions.gpu,
vramOrchestrator,
maxThreads,
vramPadding: vramOrchestrator.reserveMemory(0),
ramOrchestrator,
ramPadding: resolvedRamPadding,
swapOrchestrator
});
if (llama.gpu === false || vramPadding === 0) {
// do nothing since `llama._vramPadding` is already set to 0
}
else if (vramPadding instanceof Function) {
const currentVramPadding = llama._vramPadding;
llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total));
currentVramPadding.dispose();
}
else {
const currentVramPadding = llama._vramPadding;
llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding);
currentVramPadding.dispose();
}
if (!skipLlamaInit)
await llama._init();
return llama;
}
static defaultConsoleLogger(level, message) {
switch (level) {
case LlamaLogLevel.disabled:
break;
case LlamaLogLevel.fatal:
// we don't use console.error here because it prints the stack trace
console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
break;
case LlamaLogLevel.error:
// we don't use console.error here because it prints the stack trace
console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
break;
case LlamaLogLevel.warn:
console.warn(prefixAndColorMessage(message, getColorForLogLevel(level)));
break;
case LlamaLogLevel.info:
console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
break;
case LlamaLogLevel.log:
console.info(prefixAndColorMessage(message, getColorForLogLevel(level)));
break;
case LlamaLogLevel.debug:
console.debug(prefixAndColorMessage(message, getColorForLogLevel(level)));
break;
default:
void level;
console.warn(getConsoleLogPrefix() + getColorForLogLevel(LlamaLogLevel.warn)(`Unknown log level: ${level}`));
console.log(prefixAndColorMessage(message, getColorForLogLevel(level)));
}
}
}
function getColorForLogLevel(level) {
switch (level) {
case LlamaLogLevel.disabled: return chalk.whiteBright;
case LlamaLogLevel.fatal: return chalk.redBright;
case LlamaLogLevel.error: return chalk.red;
case LlamaLogLevel.warn: return chalk.yellow;
case LlamaLogLevel.info: return chalk.whiteBright;
case LlamaLogLevel.log: return chalk.white;
case LlamaLogLevel.debug: return chalk.gray;
default:
void level;
return chalk.whiteBright;
}
}
function prefixAndColorMessage(message, color) {
return getConsoleLogPrefix() + (message
.split("\n")
.map((line) => color(line))
.join("\n" + getConsoleLogPrefix()));
}
function logMessageIsOnlyDots(message) {
if (message == null)
return false;
for (let i = 0; i < message.length; i++) {
if (message[i] !== ".")
return false;
}
return true;
}
function getTransformedLogLevel(level, message) {
if (level === LlamaLogLevel.warn && message.endsWith("the full capacity of the model will not be utilized"))
return LlamaLogLevel.info;
else if (level === LlamaLogLevel.warn && message.startsWith("ggml_metal_init: skipping kernel_") && message.endsWith("(not supported)"))
return LlamaLogLevel.log;
else if (level === LlamaLogLevel.warn && message.startsWith("ggml_cuda_init: GGML_CUDA_FORCE_") && message.endsWith(" no"))
return LlamaLogLevel.log;
else if (level === LlamaLogLevel.info && message.startsWith("load_backend: loaded "))
return LlamaLogLevel.log;
else if (level === LlamaLogLevel.warn && message.startsWith("make_cpu_buft_list: disabling extra buffer types"))
return LlamaLogLevel.info;
return level;
}
//# sourceMappingURL=Llama.js.map