node-llama-cpp

Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.

import { LlamaGpuType, LlamaLogLevel } from "./types.js";
import { Llama } from "./Llama.js";

export type LlamaOptions = {
    /**
     * The compute layer implementation type to use for llama.cpp.
     * - **`"auto"`**: Automatically detect and use the best GPU available (Metal on macOS, and CUDA or Vulkan on Windows and Linux)
     * - **`"metal"`**: Use Metal.
     *   Only supported on macOS.
     *   Enabled by default on Apple Silicon Macs.
     * - **`"cuda"`**: Use CUDA.
     * - **`"vulkan"`**: Use Vulkan.
     * - **`false`**: Disable any GPU support and only use the CPU.
     *
     * `"auto"` by default.
     */
    gpu?: "auto" | LlamaGpuType | {
        type: "auto";
        exclude?: LlamaGpuType[];
    };
    /**
     * Set the minimum log level for llama.cpp.
     * Defaults to `"warn"`.
     */
    logLevel?: LlamaLogLevel;
    /**
     * Set a custom logger for llama.cpp logs.
     */
    logger?: (level: LlamaLogLevel, message: string) => void;
    /**
     * Set what build method to use.
     * - **`"auto"`**: If a local build is found, use it.
     *   Otherwise, if a prebuilt binary is found, use it.
     *   Otherwise, build from source.
     * - **`"never"`**: If a local build is found, use it.
     *   Otherwise, if a prebuilt binary is found, use it.
     *   Otherwise, throw a `NoBinaryFoundError` error.
     * - **`"forceRebuild"`**: Always build from source.
     *   Be cautious with this option, as it will cause the build to fail on Windows when the binaries are in use by another process.
     * - **`"try"`**: If a local build is found, use it.
     *   Otherwise, try to build from source and use the resulting binary.
     *   If building from source fails, use a prebuilt binary if found.
     *
     * When running from inside an Asar archive in Electron, building from source is not possible, so it'll never build from source.
     * To allow building from source in Electron apps, make sure you ship `node-llama-cpp` as an unpacked module.
     *
     * Defaults to `"auto"`.
     * On Electron, defaults to `"never"`.
     */
    build?: "auto" | "never" | "forceRebuild" | "try";
    /**
     * Set custom CMake options for llama.cpp.
     */
    cmakeOptions?: Record<string, string>;
    /**
     * When a prebuilt binary is found, only use it if it was built with the same build options as the ones specified in `buildOptions`.
     * Disabled by default.
     */
    existingPrebuiltBinaryMustMatchBuildOptions?: boolean;
    /**
     * Use prebuilt binaries if they match the build options.
     * Enabled by default.
     */
    usePrebuiltBinaries?: boolean;
    /**
     * Print binary compilation progress logs.
     * Enabled by default.
     */
    progressLogs?: boolean;
    /**
     * Don't download llama.cpp source if it's not found.
     * When set to `true`, and llama.cpp source is not found, a `NoBinaryFoundError` error will be thrown.
     * Disabled by default.
     */
    skipDownload?: boolean;
    /**
     * The maximum number of threads to use for the Llama instance.
     *
     * Set to `0` to have no thread limit.
     *
     * When not using a GPU, defaults to the number of CPU cores that are useful for math (`.cpuMathCores`), or `4`, whichever is higher.
     *
     * When using a GPU, there's no limit by default.
     */
    maxThreads?: number;
    /**
     * Pad the available VRAM for the memory size calculations, as these calculations are not always accurate.
     * Recommended to ensure stability.
     * This only affects the calculations of `"auto"` in function options and is not reflected in the `getVramState` function.
     *
     * Defaults to `6%` of the total VRAM or 1GB, whichever is lower.
     * Set to `0` to disable.
     */
    vramPadding?: number | ((totalVram: number) => number);
    /**
     * Pad the available RAM for the memory size calculations, as these calculations are not always accurate.
     * Recommended to ensure stability.
     *
     * Defaults to `25%` of the total RAM or 6GB (1GB on Linux), whichever is lower.
     * Set to `0` to disable.
     *
     * > Since the OS also needs RAM to function, the default value can get up to 6GB on Windows and macOS, and 1GB on Linux.
     */
    ramPadding?: number | ((totalRam: number) => number);
    /**
     * Enable debug mode to find issues with llama.cpp.
     * Makes logs print directly to the console from `llama.cpp` and not through the provided logger.
     *
     * Defaults to `false`.
     *
     * The default can be set using the `NODE_LLAMA_CPP_DEBUG` environment variable.
     */
    debug?: boolean;
};
export type LastBuildOptions = {
    /**
     * Set the minimum log level for llama.cpp.
     * Defaults to `"warn"`.
     */
    logLevel?: LlamaLogLevel;
    /**
     * Set a custom logger for llama.cpp logs.
     */
    logger?: (level: LlamaLogLevel, message: string) => void;
    /**
     * If a local build is not found, use prebuilt binaries.
     * Enabled by default.
     */
    usePrebuiltBinaries?: boolean;
    /**
     * If a local build is not found, and prebuilt binaries are not found, when building from source,
     * print binary compilation progress logs.
     * Enabled by default.
     */
    progressLogs?: boolean;
    /**
     * If a local build is not found, and prebuilt binaries are not found, don't download llama.cpp source if it's not found.
     * When set to `true`, and llama.cpp source is needed but is not found, a `NoBinaryFoundError` error will be thrown.
     * Disabled by default.
     */
    skipDownload?: boolean;
    /**
     * The maximum number of threads to use for the Llama instance.
     *
     * Set to `0` to have no thread limit.
     *
     * When not using a GPU, defaults to the number of CPU cores that are useful for math (`.cpuMathCores`), or `4`, whichever is higher.
     *
     * When using a GPU, there's no limit by default.
     */
    maxThreads?: number;
    /**
     * Pad the available VRAM for the memory size calculations, as these calculations are not always accurate.
     * Recommended to ensure stability.
     * This only affects the calculations of `"auto"` in function options and is not reflected in the `getVramState` function.
     *
     * Defaults to `6%` of the total VRAM or 1GB, whichever is lower.
     * Set to `0` to disable.
     */
    vramPadding?: number | ((totalVram: number) => number);
    /**
     * Pad the available RAM for the memory size calculations, as these calculations are not always accurate.
     * Recommended to ensure stability.
     *
     * Defaults to `25%` of the total RAM or 6GB (1GB on Linux), whichever is lower.
     * Set to `0` to disable.
     *
     * > Since the OS also needs RAM to function, the default value can get up to 6GB on Windows and macOS, and 1GB on Linux.
     */
    ramPadding?: number | ((totalRam: number) => number);
    /**
     * Enable debug mode to find issues with llama.cpp.
     * Makes logs print directly to the console from `llama.cpp` and not through the provided logger.
     *
     * Defaults to `false`.
     *
     * The default can be set using the `NODE_LLAMA_CPP_DEBUG` environment variable.
     */
    debug?: boolean;
};
export declare const getLlamaFunctionName = "getLlama";
export declare const defaultLlamaVramPadding: (totalVram: number) => number;
export declare const defaultLlamaRamPadding: (totalRam: number) => number;
/**
 * Get a `llama.cpp` binding.
 *
 * Defaults to using a local binary built using the `source download` or `source build` CLI commands if one exists;
 * otherwise, uses a prebuilt binary, and falls back to building from source if a prebuilt binary is not found.
 *
 * Pass `"lastBuild"` to default to the last successful build created
 * using the `source download` or `source build` CLI commands if one exists.
 *
 * The difference between using `"lastBuild"` and not using it is that `"lastBuild"` will use the binary built using a CLI command
 * with the configuration used to build that binary (like its GPU type),
 * while not using `"lastBuild"` will only attempt to use a binary that complies with the given options.
 *
 * For example, if your machine supports both CUDA and Vulkan, and you run the `source download --gpu vulkan` command,
 * calling `getLlama("lastBuild")` will return the binary you built with Vulkan,
 * while calling `getLlama()` will return a binding from a prebuilt binary with CUDA,
 * since CUDA is preferable on systems that support it.
 *
 * However, if your machine supports CUDA, and you run the `source download --gpu cuda` command,
 * calling `getLlama("lastBuild")` will return the binary you built with CUDA,
 * and calling `getLlama()` will also return that same binary you built with CUDA.
 *
 * You should prefer to use `getLlama()` without `"lastBuild"` unless you have a specific reason to use the last build.
 */
export declare function getLlama(options?: LlamaOptions): Promise<Llama>;
export declare function getLlama(type: "lastBuild", lastBuildOptions?: LastBuildOptions): Promise<Llama>;
export declare function getLlamaForOptions({ gpu, logLevel, logger, build, cmakeOptions, existingPrebuiltBinaryMustMatchBuildOptions, usePrebuiltBinaries, progressLogs, skipDownload, maxThreads, vramPadding, ramPadding, debug }: LlamaOptions, { updateLastBuildInfoOnCompile, skipLlamaInit, pipeBinaryTestErrorLogs }?: {
    updateLastBuildInfoOnCompile?: boolean;
    skipLlamaInit?: boolean;
    pipeBinaryTestErrorLogs?: boolean;
}): Promise<Llama>;
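
As a rough illustration of how these options fit together, here is a minimal sketch of calling `getLlama()` with a few of the `LlamaOptions` fields declared above. It assumes the package's public entry point (`"node-llama-cpp"`) re-exports `getLlama` and `LlamaLogLevel`, that the enum has a `warn` member (inferred from the documented `"warn"` default), and that `totalVram` is reported in bytes; the chosen values are illustrative, not recommendations.

import { getLlama, LlamaLogLevel } from "node-llama-cpp";

// Resolve a llama.cpp binding in an ES module (top-level await):
// use any available GPU except CUDA, never build from source,
// and keep llama.cpp logs at "warn" and above.
const llama = await getLlama({
    gpu: {
        type: "auto",
        exclude: ["cuda"] // skip CUDA even on machines that support it
    },
    logLevel: LlamaLogLevel.warn,
    logger(level, message) {
        console.error(`[llama.cpp] ${level}: ${message}`);
    },
    build: "never", // never build from source; throws NoBinaryFoundError if no usable binary is found
    maxThreads: 0, // no thread limit
    // Mirror the documented default: 6% of total VRAM or 1GB, whichever is lower
    // (assuming totalVram is measured in bytes).
    vramPadding: (totalVram) => Math.min(totalVram * 0.06, 1024 ** 3)
});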
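
And a sketch of the `"lastBuild"` overload, which reuses whatever binary (and GPU configuration) the last successful `source download` or `source build` CLI run produced, with `LastBuildOptions` only controlling logging and fallback behavior; again, the option values are illustrative.

import { getLlama } from "node-llama-cpp";

// After running e.g. the `source download --gpu vulkan` CLI command,
// this returns the Vulkan binary that command built, regardless of
// which GPU getLlama() would otherwise prefer on this machine.
const llama = await getLlama("lastBuild", {
    usePrebuiltBinaries: true, // if no local build is found, fall back to prebuilt binaries
    skipDownload: true // as a last resort, fail instead of downloading the llama.cpp source
});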