node-llama-cpp
Run AI models locally on your machine with Node.js bindings for llama.cpp. Enforce a JSON schema on the model output at the generation level.
import { Token } from "../../../types.js";
import { SequenceEvaluateOptions } from "../types.js";
import { LlamaContextSequence } from "../LlamaContext.js";
import { TokenPredictor } from "../TokenPredictor.js";
/**
* Predicts the next tokens by evaluating the current state of the target sequence
* on a draft sequence from a smaller and faster draft model.
* @see [Using Token Predictors: Draft Model Token Predictor](https://node-llama-cpp.withcat.ai/guide/token-prediction#draft-model)
*/
export declare class DraftSequenceTokenPredictor extends TokenPredictor {
constructor(draftSequence: LlamaContextSequence, options?: {
/**
* The minimum number of tokens to draft.
*
* Defaults to `0`.
*/
minTokens?: number;
/**
* The maximum number of tokens to draft.
*
* Defaults to `16`.
*/
maxTokens?: number;
/**
* Evaluation options default to the values used by the target sequence.
*
* You can override any of these options for the draft prediction here.
*/
evaluateOptions?: Pick<SequenceEvaluateOptions, "temperature" | "minP" | "topK" | "topP" | "seed" | "repeatPenalty" | "tokenBias" | "evaluationPriority" | "contextShift">;
/**
* The minimum token confidence (the probability the model assigns to the generated token) required to consider the token a prediction.
* When a generated token's confidence is lower than this value, the prediction process will stop until all the predicted tokens
* are exhausted (either by a token that was not predicted being pushed, or by all the generated predictions being consumed).
*
* A number between `0` and `1` representing the minimum probability of the token to be generated.
*
* Set to `0` to disable.
*
* Defaults to `0.6`.
*/
minConfidence?: number;
});
get draftSequence(): LlamaContextSequence;
get minTokens(): number;
get maxTokens(): number;
get minConfidence(): number | undefined;
reset({ targetSequence, stateTokens, evaluateOptions }: {
targetSequence: LlamaContextSequence;
stateTokens: Token[];
evaluateOptions: Readonly<SequenceEvaluateOptions>;
}): Promise<void>;
pushTokens(tokens: Token[]): void;
predictTokens(): Token[] | Promise<Token[]>;
stop(untilPredictionsExhausted?: boolean): void;
dispose(): void;
}
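// Usage example (not part of the declaration above): a minimal sketch of speculative
// decoding with a draft model, based on the guide linked in the class JSDoc.
// The model paths are placeholders, and attaching the predictor through
// `context.getSequence({tokenPredictor})` is assumed from that guide rather than
// confirmed by this file.
import {getLlama, LlamaChatSession, DraftSequenceTokenPredictor} from "node-llama-cpp";

const llama = await getLlama();

// A small, fast model drafts candidate tokens; the larger target model verifies them
const draftModel = await llama.loadModel({modelPath: "path/to/small-draft-model.gguf"});
const model = await llama.loadModel({modelPath: "path/to/target-model.gguf"});

const draftContext = await draftModel.createContext();
const context = await model.createContext();

const draftSequence = draftContext.getSequence();
const sequence = context.getSequence({
    tokenPredictor: new DraftSequenceTokenPredictor(draftSequence, {
        minTokens: 0, // matches the default
        maxTokens: 16, // matches the default
        minConfidence: 0.6 // stop drafting once the draft model's own confidence drops below 0.6 (the default)
    })
});

const session = new LlamaChatSession({contextSequence: sequence});
const answer = await session.prompt("Summarize speculative decoding in one sentence.");
console.log(answer);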