node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
import { EventRelay } from "lifecycle-utils";
import { Token, Tokenizer } from "../../types.js";
import { ModelTypeDescription } from "../../bindings/AddonTypes.js";
import { LlamaVocabularyType } from "../../bindings/types.js";
import { GgufFileInfo } from "../../gguf/types/GgufFileInfoTypes.js";
import { GgufInsights } from "../../gguf/insights/GgufInsights.js";
import { LlamaContextOptions } from "../LlamaContext/types.js";
import { LlamaContext } from "../LlamaContext/LlamaContext.js";
import { LlamaEmbeddingContext, LlamaEmbeddingContextOptions } from "../LlamaEmbeddingContext.js";
import { GgufMetadata } from "../../gguf/types/GgufMetadataTypes.js";
import { OverridesObject } from "../../utils/OverridesObject.js";
import { LlamaRankingContext, LlamaRankingContextOptions } from "../LlamaRankingContext.js";
import { TokenAttributes } from "./utils/TokenAttributes.js";
import type { Llama } from "../../bindings/Llama.js";
import type { BuiltinSpecialTokenValue } from "../../utils/LlamaText.js";
export type LlamaModelOptions = {
/** path to the model on the filesystem */
modelPath: string;
/**
* Number of layers to store in VRAM.
* - **`"auto"`** - adapt to the current VRAM state and try to fit as many layers as possible in it.
* Takes into account the VRAM required to create a context with a `contextSize` set to `"auto"`.
* - **`"max"`** - store all layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
* - **`number`** - store the specified number of layers in VRAM. If there's not enough VRAM, an error will be thrown. Use with caution.
* - **`{min?: number, max?: number, fitContext?: {contextSize: number}}`** - adapt to the current VRAM state and try to fit as
* many layers as possible in it, but at least `min` and at most `max` layers. Set `fitContext` to the parameters of a context you
* intend to create with the model, so they are taken into account in the calculations and enough memory is left for such a context.
*
* If GPU support is disabled, this option will be set to `0` automatically.
*
* Defaults to `"auto"`.
*/
gpuLayers?: "auto" | "max" | number | {
min?: number;
max?: number;
fitContext?: {
contextSize?: number;
/**
* Defaults to `false`.
*/
embeddingContext?: boolean;
};
};
/**
* Only load the vocabulary, not weight tensors.
*
* Useful when you only need the model's tokenizer and don't intend to run evaluation with it.
*
* Defaults to `false`.
*/
vocabOnly?: boolean;
/**
* Use mmap (memory-mapped file) to load the model.
*
* Using mmap allows the OS to load the model tensors directly from the file on the filesystem,
* and makes it easier for the system to manage memory.
*
* When using mmap, you might notice a delay the first time you actually use the model,
* which is caused by the OS itself loading the model into memory.
*
* Defaults to `true` if the current system supports it.
*/
useMmap?: boolean;
/**
* Force the system to keep the model in RAM/VRAM.
* Use with caution as this can crash your system if the available resources are insufficient.
*/
useMlock?: boolean;
/**
* Check for tensor validity before actually loading the model.
* Using it increases the time it takes to load the model.
*
* Defaults to `false`.
*/
checkTensors?: boolean;
/**
* Enable flash attention by default for contexts created with this model.
* Only works with models that support flash attention.
*
* Flash attention is an optimization in the attention mechanism that makes inference faster and more efficient, and uses less memory.
*
* The support for flash attention is currently experimental and may not always work as expected.
* Use with caution.
*
* This option will be ignored if flash attention is not supported by the model.
*
* Enabling this affects the calculations of default values for the model and contexts created with it
* as flash attention reduces the amount of memory required,
* which allows for more layers to be offloaded to the GPU and for context sizes to be bigger.
*
* Defaults to `false`.
*
* Once flash attention is no longer experimental, the default value will become `true`.
*/
defaultContextFlashAttention?: boolean;
/**
* Called with the load percentage when the model is being loaded.
* @param loadProgress - a number between 0 (exclusive) and 1 (inclusive).
*/
onLoadProgress?(loadProgress: number): void;
/** An abort signal to abort the model load */
loadSignal?: AbortSignal;
/**
* Ignore insufficient memory errors and continue with the model load.
* Can cause the process to crash if there's not enough VRAM to fit the model.
*
* Defaults to `false`.
*/
ignoreMemorySafetyChecks?: boolean;
/**
* Metadata overrides to load the model with.
*
* > **Note:** Most metadata value overrides aren't supported, and overriding them will have no effect on `llama.cpp`.
* > Only use this for metadata values that `llama.cpp` explicitly documents as overridable,
* > and only when this is crucial, as this is not guaranteed to always work as expected.
*/
metadataOverrides?: OverridesObject<GgufMetadata, number | bigint | boolean | string>;
};
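// Example (illustrative sketch, not part of the declarations): loading a model with
// `LlamaModelOptions`. This assumes the package's `getLlama()` / `llama.loadModel()`
// entry points; the model path below is hypothetical - point it at a real GGUF file.
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: "./models/example-model.gguf", // hypothetical path
    gpuLayers: "auto", // fit as many layers as possible into the available VRAM
    onLoadProgress(loadProgress) {
        console.log(`Load progress: ${Math.round(loadProgress * 100)}%`);
    }
});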
export declare class LlamaModel {
readonly tokenizer: Tokenizer;
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get llama(): Llama;
get tokens(): LlamaModelTokens;
get filename(): string | undefined;
get fileInfo(): GgufFileInfo;
get fileInsights(): GgufInsights;
/**
* Number of layers offloaded to the GPU.
* If GPU support is disabled, this will always be `0`.
*/
get gpuLayers(): number;
/**
* Total model size in memory in bytes.
*
* When using mmap, actual memory usage may be higher than this value due to `llama.cpp`'s performance optimizations.
*/
get size(): number;
get flashAttentionSupported(): boolean;
get defaultContextFlashAttention(): boolean;
/**
* Transform text into tokens that can be fed to the model
* @param text - the text to tokenize
* @param [specialTokens] - if set to `true`, text that corresponds to special tokens will be tokenized to those tokens.
* For example, `<s>` will be tokenized to the BOS token if `specialTokens` is set to `true`,
* otherwise it will be tokenized to tokens that correspond to the plaintext `<s>` string.
* @param [options] - additional options for tokenization.
* If set to `"trimLeadingSpace"`, a leading space will be trimmed from the tokenized output if the output has an
* additional space at the beginning.
*/
tokenize(text: string, specialTokens?: boolean, options?: "trimLeadingSpace"): Token[];
tokenize(text: BuiltinSpecialTokenValue, specialTokens: "builtin"): Token[];
/**
* Transform tokens into text
* @param tokens - the tokens to detokenize.
* @param [specialTokens] - if set to `true`, special tokens will be detokenized to their corresponding token text representation.
*
* Recommended for debugging purposes only.
*
* > **Note:** there may be additional spaces around special tokens that were not present in the original text - this is not a bug,
* > this is [how the tokenizer is supposed to work](https://github.com/ggml-org/llama.cpp/pull/7697#issuecomment-2144003246).
*
* Defaults to `false`.
* @param [lastTokens] - the last few tokens that preceded the tokens to detokenize.
* If provided, the last few tokens will be used to determine whether a space has to be added before the current tokens or not,
* and to apply other detokenizer-specific heuristics to provide the correct text continuation to the existing tokens.
*
* Using it may have no effect with some models, but it is still recommended.
*/
detokenize(tokens: readonly Token[], specialTokens?: boolean, lastTokens?: readonly Token[]): string;
getTokenAttributes(token: Token): TokenAttributes;
/** Check whether the given token is a special token (a control-type token or a token with no normal text representation) */
isSpecialToken(token: Token | undefined): boolean;
iterateAllTokens(): Generator<Token, void, unknown>;
/** Check whether the given token is an EOG (End Of Generation) token, like EOS or EOT. */
isEogToken(token: Token | undefined): boolean;
createContext(options?: LlamaContextOptions): Promise<LlamaContext>;
/**
* @see [Using Embedding](https://node-llama-cpp.withcat.ai/guide/embedding) tutorial
*/
createEmbeddingContext(options?: LlamaEmbeddingContextOptions): Promise<LlamaEmbeddingContext>;
/**
* @see [Reranking Documents](https://node-llama-cpp.withcat.ai/guide/embedding#reranking) tutorial
*/
createRankingContext(options?: LlamaRankingContextOptions): Promise<LlamaRankingContext>;
/**
* Get warnings about the model file that would affect its usage.
*
* These warnings include all the warnings generated by `GgufInsights`, but are more comprehensive.
*/
getWarnings(): string[];
/** @hidden `ModelTypeDescription` type alias is too long in the documentation */
get typeDescription(): ModelTypeDescription;
/** The context size the model was trained on */
get trainContextSize(): number;
/** The size of an embedding vector the model can produce */
get embeddingVectorSize(): number;
get vocabularyType(): LlamaVocabularyType;
}
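// Example (illustrative sketch): tokenizing and detokenizing text and creating a context,
// reusing the `model` instance from the loading sketch above. The context size is a
// hypothetical value - omit it to let the library pick one automatically.
const tokens = model.tokenize("Hello world!");
console.log(model.detokenize(tokens)); // prints the original text
console.log(tokens.filter((token) => model.isSpecialToken(token))); // any special tokens in the output

const context = await model.createContext({contextSize: 4096});
// ... evaluate tokens using a sequence obtained from the context ...
await context.dispose();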
export declare class LlamaModelTokens {
private constructor();
/**
* @returns The infill tokens of this model.
*/
get infill(): LlamaModelInfillTokens;
/**
* @returns The BOS (Beginning Of Sequence) token.
*/
get bos(): Token | null;
/**
* @returns The EOS (End Of Sequence) token.
*/
get eos(): Token | null;
/**
* @returns The EOT (End Of Turn) token.
*/
get eot(): Token | null;
/**
* @returns The SEP (Sentence Separator) token.
*/
get sep(): Token | null;
/**
* @returns The NL (New Line) token.
*/
get nl(): Token | null;
/**
* @returns The BOS (Beginning Of Sequence) token text representation.
*/
get bosString(): string | null;
/**
* @returns The EOS (End Of Sequence) token text representation.
*/
get eosString(): string | null;
/**
* @returns The EOT (End Of Turn) token text representation.
*/
get eotString(): string | null;
/**
* @returns The SEP (Sentence Separator) token text representation.
*/
get sepString(): string | null;
/**
* @returns The NL (New Line) token text representation.
*/
get nlString(): string | null;
/**
* @returns Whether we should prepend a BOS (Beginning Of Sequence) token for evaluations with this model.
*/
get shouldPrependBosToken(): boolean;
/**
* @returns Whether we should append an EOS (End Of Sequence) token for evaluations with this model.
*/
get shouldAppendEosToken(): boolean;
}
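// Example (illustrative sketch): inspecting the model's special tokens through
// `model.tokens`, again reusing the `model` from the loading sketch above.
if (model.tokens.shouldPrependBosToken && model.tokens.bos != null)
    console.log(`BOS token: ${model.tokens.bosString} (${model.tokens.bos})`);

console.log("EOS token text:", model.tokens.eosString ?? "<none>");
console.log("Is EOS an EOG token:", model.isEogToken(model.tokens.eos ?? undefined));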
export declare class LlamaModelInfillTokens {
private constructor();
/**
* @returns The token that marks the beginning of the infill prefix.
*/
get prefix(): Token | null;
/**
* @returns The token that marks the beginning of the infill middle.
*/
get middle(): Token | null;
/**
* @returns The token that marks the beginning of the infill suffix.
*/
get suffix(): Token | null;
/**
* @returns The text representation of the token that marks the beginning of the infill prefix.
*/
get prefixString(): string | null;
/**
* @returns The text representation of the token that marks the beginning of the infill middle.
*/
get middleString(): string | null;
/**
* @returns The text representation of the token that marks the beginning of the infill suffix.
*/
get suffixString(): string | null;
}
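// Example (illustrative sketch): building a fill-in-the-middle (infill) prompt from the
// infill tokens, when the model exposes them. The prefix/suffix/middle ordering below
// follows the common FIM convention and is an assumption - the exact layout expected
// by a given model may differ.
const {prefix, suffix, middle} = model.tokens.infill;

if (prefix != null && suffix != null && middle != null) {
    const fimTokens = [
        prefix, ...model.tokenize("function add(a: number, b: number) {\n    return "),
        suffix, ...model.tokenize(";\n}"),
        middle
    ];
    // feed `fimTokens` into a context sequence and generate tokens to fill in the middle
}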