node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
TypeScript
import { EventRelay } from "lifecycle-utils";
import { Token } from "../../types.js";
import { TokenMeter } from "../TokenMeter.js";
import { LlamaModel } from "../LlamaModel/LlamaModel.js";
import { ContextShiftOptions, ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, SequenceEvaluateMetadataOptions, SequenceEvaluateOptions, SequenceEvaluateOutput } from "./types.js";
import { TokenPredictor } from "./TokenPredictor.js";
export declare class LlamaContext {
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): Promise<void>;
/** @hidden */
[Symbol.asyncDispose](): Promise<void>;
get disposed(): boolean;
get model(): LlamaModel;
get contextSize(): number;
get batchSize(): number;
get flashAttention(): boolean;
/**
* The actual size of the state in the memory in bytes.
* This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context.
*/
get stateSize(): number;
/** The number of threads currently used to evaluate tokens */
get currentThreads(): number;
/**
* The number of threads that are preferred to be used to evaluate tokens.
*
* The actual number of threads used may be lower when other evaluations are running in parallel.
*/
get idealThreads(): number;
getAllocatedContextSize(): number;
get totalSequences(): number;
get sequencesLeft(): number;
/**
* Before calling this method, check `sequencesLeft` to make sure there are sequences available.
* When no sequences are left, this method will throw an error.
*/
getSequence(options?: {
contextShift?: ContextShiftOptions;
/**
* Token predictor to use for the sequence.
* Don't share the same token predictor between multiple sequences.
*
* Using a token predictor doesn't affect the generation output itself -
* it only allows for greater parallelization of the token evaluation to speed up the generation.
*
* > **Note:** if a token predictor is too resource intensive,
* > it can slow down the generation process due to the overhead of running the predictor.
* >
* > Testing the effectiveness of a token predictor on the target machine is recommended before using it in production.
*
* Automatically disposed when disposing the sequence.
* @see [Using Token Predictors](https://node-llama-cpp.withcat.ai/guide/token-prediction)
*/
tokenPredictor?: TokenPredictor;
}): LlamaContextSequence;
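/*
 * Usage sketch (added for illustration; not part of the original declarations).
 * Shows checking `sequencesLeft` before acquiring a sequence, as described above.
 * The model path is hypothetical.
 *
 *   import {getLlama} from "node-llama-cpp";
 *
 *   const llama = await getLlama();
 *   const model = await llama.loadModel({modelPath: "path/to/model.gguf"});
 *   const context = await model.createContext();
 *
 *   if (context.sequencesLeft === 0)
 *       throw new Error("No sequences left in this context");
 *
 *   const sequence = context.getSequence();
 */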
dispatchPendingBatch(): void;
/**
* Print the timings of token evaluation since the last print for this context.
*
* Requires the `performanceTracking` option to be enabled.
*
* > **Note:** it prints on the `LlamaLogLevel.info` level, so if you set the level of your `Llama` instance higher than that,
* > it won't print anything.
*/
printTimings(): Promise<void>;
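/*
 * Sketch (added for illustration), assuming `performanceTracking` is a context
 * creation option as the note above implies:
 *
 *   const context = await model.createContext({performanceTracking: true});
 *   const sequence = context.getSequence();
 *   // ...evaluate some tokens on the sequence...
 *   await context.printTimings();
 */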
}
export declare class LlamaContextSequence {
readonly onDispose: EventRelay<void>;
private constructor();
dispose(): void;
/** @hidden */
[Symbol.dispose](): void;
get disposed(): boolean;
get context(): LlamaContext;
get model(): LlamaModel;
/** The maximum number of tokens that the sequence state can hold */
get contextSize(): number;
/** The index where the next evaluated token will be placed in the context */
get nextTokenIndex(): number;
/** The current context state tokens */
get contextTokens(): Token[];
get tokenMeter(): TokenMeter;
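/*
 * Sketch (added for illustration) of inspecting the sequence state using the
 * getters above:
 *
 *   console.log("used tokens:", sequence.nextTokenIndex, "of", sequence.contextSize);
 *   console.log("state text:", sequence.model.detokenize(sequence.contextTokens));
 */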
/**
* The token predictor used when creating this sequence.
*/
get tokenPredictor(): TokenPredictor | undefined;
/**
* Statistics of token predictions using the sequence's `tokenPredictor`.
*
* The statistics change only when token prediction is used in this sequence.
*
* `validated` + `refuted` = total number of evaluated predictions.
*
* Prefer using `validated` and `refuted` to evaluate the effectiveness of token prediction.
*/
get tokenPredictions(): {
/** Number of token predictions that were actually used (tokens that were validated and then consumed) */
used: number;
/** Number of token predictions that were not used (tokens that were validated and were not consumed) */
unused: number;
/** Number of token predictions that were validated successfully */
validated: number;
/** Number of token predictions that were refuted */
refuted: number;
};
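/*
 * Sketch (added for illustration): estimating predictor effectiveness from these
 * statistics, using the note above that `validated + refuted` is the total number
 * of evaluated predictions.
 *
 *   const {validated, refuted} = sequence.tokenPredictions;
 *   const total = validated + refuted;
 *   const hitRate = total === 0 ? 0 : validated / total;
 *   console.log(`prediction hit rate: ${(hitRate * 100).toFixed(1)}%`);
 */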
get isLoadedToMemory(): boolean;
compareContextTokens(tokens: Token[]): {
firstDifferentIndex: number;
};
/**
* Erase parts of the context state to align it with the given tokens.
*
* If the given tokens do not align with the current context state, the context state will be erased to align with the given tokens.
*
* To find the first different token index between the context state and the given tokens, check the `nextTokenIndex` property after calling this method.
*
* If `allowShift` is `true` (the default), shifting tokens may happen to align the context state with the given tokens,
* which incurs token evaluation of the shifted tokens.
*/
adaptStateToTokens(tokens: Token[], allowShift?: boolean): Promise<void>;
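/*
 * Sketch (added for illustration): reusing an existing state for a new prompt by
 * keeping only the matching prefix. The prompt text is hypothetical.
 *
 *   const newTokens = sequence.model.tokenize("Hello there, how are you?");
 *   const {firstDifferentIndex} = sequence.compareContextTokens(newTokens);
 *   console.log("shared prefix length:", firstDifferentIndex);
 *
 *   // erase the non-matching part of the state instead of re-evaluating everything
 *   await sequence.adaptStateToTokens(newTokens, true);
 */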
/**
* Clear the history of the sequence.
* If `prependBos` was enabled, the BOS token will be prepended to the sequence again.
*/
clearHistory(): Promise<void>;
/**
* Erase context tokens in the provided ranges to free up space for new tokens to be generated.
* The start of each range is inclusive, and the end of each range is exclusive.
* For example, the range `{start: 0, end: 1}` will remove the token at the `0` index only.
*/
eraseContextTokenRanges(ranges: ContextTokensDeleteRange[]): Promise<void>;
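/*
 * Sketch (added for illustration): since the end of each range is exclusive,
 * this removes only the tokens at indexes 0 and 1.
 *
 *   await sequence.eraseContextTokenRanges([{start: 0, end: 2}]);
 */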
/**
* Evaluate the provided tokens into the context sequence, and continue generating new tokens on iterator iterations.
*
* This method uses the token predictor (when provided) to generate new tokens faster.
*/
evaluate(tokens: Token[], options?: SequenceEvaluateOptions): AsyncGenerator<Token, void, void | Token | Token[]>;
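/*
 * Usage sketch (added for illustration; the sampling option shown is an
 * assumption about `SequenceEvaluateOptions`):
 *
 *   const promptTokens = sequence.model.tokenize("The quick brown fox");
 *   const generated: Token[] = [];
 *
 *   for await (const token of sequence.evaluate(promptTokens, {temperature: 0.8})) {
 *       generated.push(token);
 *       if (generated.length >= 32)
 *           break; // stop requesting new tokens
 *   }
 *
 *   console.log(sequence.model.detokenize(generated));
 */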
/**
* Like {@link evaluate `.evaluate(...)`}, but with additional metadata for each generated token.
*
* Configure the additional metadata options to choose which metadata to include.
*/
evaluateWithMetadata<const Metadata extends SequenceEvaluateMetadataOptions>(tokens: Token[], metadata: Metadata, options?: SequenceEvaluateOptions): AsyncGenerator<SequenceEvaluateOutput<Metadata>, void, void | Token | Token[]>;
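/*
 * Sketch (added for illustration; the metadata key used here is an assumption
 * about `SequenceEvaluateMetadataOptions`):
 *
 *   const tokens = sequence.model.tokenize("Once upon a time");
 *   for await (const item of sequence.evaluateWithMetadata(tokens, {confidence: true})) {
 *       console.log("token:", item.token, "confidence:", item.confidence);
 *       break; // stop after the first generated token
 *   }
 */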
/**
* Evaluate the provided tokens into the context sequence without generating new tokens.
*/
evaluateWithoutGeneratingNewTokens(tokens: Token[], options?: {
/**
* When more tokens are queued for the next batch than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which tries to evaluate as many sequences in parallel as possible,
* but at some point it has to choose which sequences get more of their tokens evaluated, so it prioritizes the sequences with the
* highest evaluation priority.
* A custom strategy can also be used to prioritize the sequences differently, but generally, the higher the evaluation priority,
* the more likely it is that more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/** Override the sequence context shift options for this evaluation */
contextShift?: ContextShiftOptions;
}): Promise<void>;
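/*
 * Sketch (added for illustration): preloading a prompt prefix into the state
 * without generating anything, so a later `evaluate(...)` call can continue from it.
 *
 *   const prefixTokens = sequence.model.tokenize("You are a helpful assistant.\n");
 *   await sequence.evaluateWithoutGeneratingNewTokens(prefixTokens);
 */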
/**
* Evaluate the provided tokens into the context sequence with custom options for each token.
*
* This method allows for more precise control of the generation process.
*
* A next token will be generated for a given input token only if one of the `generateNext` options is used for it.
*
* To generate more tokens after this method finishes,
* use it again with token(s) you selected to add to the context from the previous evaluation.
*
* This method doesn't use the token predictor (when provided) since it cannot predict which tokens are actually needed.
* Use the `evaluate` method when you need to use token prediction.
* @returns An array aligned with the input array: for each input token that produced an output, the output item is at the same index.
* Indexes that have no output are left empty (`undefined`) in the output array.
*
* It's recommended to iterate from `0` up to the length of the input array to check the results in the output array.
*/
controlledEvaluate(input: ControlledEvaluateInputItem[], options?: {
/**
* When more tokens are queued for the next batch than the configured `batchSize`, the tokens for each sequence will be
* evaluated based on the strategy chosen for the context.
* By default, the `"maximumParallelism"` strategy is used, which tries to evaluate as many sequences in parallel as possible,
* but at some point it has to choose which sequences get more of their tokens evaluated, so it prioritizes the sequences with the
* highest evaluation priority.
* A custom strategy can also be used to prioritize the sequences differently, but generally, the higher the evaluation priority,
* the more likely it is that more tokens will be evaluated for that sequence in the next queued batch.
*/
evaluationPriority?: EvaluationPriority;
/** Override the sequence context shift options for this evaluation */
contextShift?: ContextShiftOptions;
/** Called on each token result after it's generated */
onTokenResult?(inputTokenIndex: number, result: ControlledEvaluateIndexOutput): void;
}): Promise<Array<undefined | ControlledEvaluateIndexOutput>>;
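/*
 * Sketch (added for illustration; the exact shape of `ControlledEvaluateInputItem`
 * and of the per-index output logged below is an assumption based on the types above):
 *
 *   const tokens = sequence.model.tokenize("The quick brown fox");
 *   const results = await sequence.controlledEvaluate([
 *       tokens[0],
 *       tokens[1],
 *       [tokens[2], {generateNext: {token: true}}]
 *   ]);
 *
 *   // iterate from 0 up to the input length, as recommended above
 *   for (let i = 0; i < tokens.length; i++) {
 *       if (results[i] != null)
 *           console.log(i, results[i]);
 *   }
 */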
}
export declare function getDefaultContextBatchSize({ contextSize, sequences }: {
contextSize: number;
sequences: number;
}): number;
export declare function getDefaultContextSequences(): number;
export declare function getDefaultModelContextSize({ trainContextSize }: {
trainContextSize?: number;
}): number;