@mlc-ai/web-llm
Hardware accelerated language model chats on browsers
import { AppConfig, ChatOptions } from "./config";
import { ChatCompletionRequest, ChatCompletionRequestBase, ChatCompletionRequestStreaming, ChatCompletionRequestNonStreaming, ChatCompletion, ChatCompletionChunk, CompletionCreateParams, Completion, CompletionCreateParamsBase, CompletionCreateParamsStreaming, CompletionCreateParamsNonStreaming, EmbeddingCreateParams, CreateEmbeddingResponse } from "./openai_api_protocols/index";
import * as API from "./openai_api_protocols/index";
/**
* Report emitted during initialization.
*/
export interface InitProgressReport {
progress: number;
timeElapsed: number;
text: string;
}
/**
* Callback used to report initialization progress.
*/
export type InitProgressCallback = (report: InitProgressReport) => void;
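/*
* Example (illustrative sketch): a progress callback a UI might register via
* `setInitProgressCallback()`. Here `engine` is assumed to be an `MLCEngineInterface`
* instance created elsewhere; it is not declared in this file.
*
*   const reportProgress: InitProgressCallback = (report: InitProgressReport) => {
*     // progress is assumed to be a fraction in [0, 1]; text is a human-readable status line.
*     console.log(`loading: ${(report.progress * 100).toFixed(0)}% - ${report.text}`);
*   };
*   engine.setInitProgressCallback(reportProgress);
*/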
/**
* A stateful logit processor used to post-process logits after forwarding the input and before
* sampling the next token. If used with `GenerationConfig.logit_bias`, logit_bias is applied after
* `processLogits()` is called.
*/
export interface LogitProcessor {
/**
* Process logits after forward() and before sampling. Called implicitly; runs on the CPU.
* @param logits The logits right after forward().
* @returns The processed logits.
*/
processLogits: (logits: Float32Array) => Float32Array;
/**
* Use the sampled token to update the LogitProcessor's internal state. Called implicitly
* right after the next token is sampled/committed.
* @param token Token sampled from the processed logits.
*/
processSampledToken: (token: number) => void;
/**
* Called in `MLCEngine.resetChat()`; can be used to clear internal state.
*/
resetState: () => void;
}
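/*
* Example (illustrative sketch): a minimal stateful LogitProcessor that suppresses tokens already
* sampled in the current generation. How a processor is registered with the engine is not shown
* in this file; the class below only illustrates the interface contract.
*
*   class NoRepeatProcessor implements LogitProcessor {
*     private seen = new Set<number>();
*
*     processLogits(logits: Float32Array): Float32Array {
*       // Push already-sampled tokens to -Infinity so they cannot be sampled again.
*       for (const token of this.seen) {
*         logits[token] = Number.NEGATIVE_INFINITY;
*       }
*       return logits;
*     }
*
*     processSampledToken(token: number): void {
*       this.seen.add(token);
*     }
*
*     resetState(): void {
*       this.seen.clear();
*     }
*   }
*/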
/**
* Common interface of MLCEngine that UI can interact with
*/
export interface MLCEngineInterface {
/**
* An object that exposes chat-related APIs.
*/
chat: API.Chat;
/**
* An object that exposes text completion APIs.
*/
completions: API.Completions;
/**
* An object that exposes embeddings APIs.
*/
embeddings: API.Embeddings;
/**
* Set an initialization progress callback function
* which reports the progress of model loading.
*
* This function can be useful for implementing a UI that
* updates while the model is loading.
*
* @param initProgressCallback The callback function
*/
setInitProgressCallback: (initProgressCallback: InitProgressCallback) => void;
/**
* @returns The current initialization progress callback function.
*/
getInitProgressCallback: () => InitProgressCallback | undefined;
/**
* Setter for the engine's appConfig.
*/
setAppConfig: (appConfig: AppConfig) => void;
/**
* Reload the chat with a new model.
*
* @param modelId model_id of the model to load, either string or string[]. When multiple models
* are provided, we load all models sequentially. Each modelId needs to either be in
* `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.
* @param chatOpts Extra options that optionally override the `mlc-chat-config.json` of `modelId`.
* If `modelId` is an array, `chatOpts` must be an array of the same length; chatOpts[i] will be
* used for modelId[i].
* @returns A promise that resolves when reload finishes.
* @throws Throws an error when the device is lost (mostly due to OOM); users should re-call
* reload(), potentially with a smaller model or a smaller context window size.
* @note This is an async function.
*/
reload: (modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]) => Promise<void>;
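/*
* Example (illustrative sketch): loading a single model, then two models with per-model overrides.
* `engine` is an MLCEngineInterface instance; the model ids are illustrative and must exist in
* `webllm.prebuiltAppConfig` or in the engine's `appConfig`. The `temperature` override is assumed
* to be a valid `ChatOptions` field.
*
*   await engine.reload("Llama-3.1-8B-Instruct-q4f32_1-MLC");
*   await engine.reload(
*     ["Llama-3.1-8B-Instruct-q4f32_1-MLC", "Phi-3.5-mini-instruct-q4f16_1-MLC"],
*     [{ temperature: 0.7 }, { temperature: 0.0 }],
*   );
*/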
/**
* OpenAI-style API. Generate a chat completion response for the given conversation and
* configuration. Use `engine.chat.completions.create()` to invoke this API.
*
* @param request An OpenAI-style ChatCompletion request.
*
* @note The API is purely functional in behavior: a previous request does not
* affect the current request's result. Thus, for multi-round chatting, users are responsible for
* maintaining the chat history. With that being said, as an implicit internal optimization, if we
* detect that the user is performing multi-round chatting, we will preserve the KV cache and only
* prefill the new tokens.
* @note Requests sent to the same modelId will block until all previous requests finish.
* @note For more, see https://platform.openai.com/docs/api-reference/chat
*/
chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
chatCompletion(request: ChatCompletionRequest): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
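/*
* Example (illustrative sketch): invoking chat completion through `engine.chat.completions.create()`,
* first non-streaming, then streaming. The request/response shapes follow the OpenAI chat API; the
* exact fields available are defined by ChatCompletionRequest and ChatCompletion(Chunk).
*
*   const reply = await engine.chat.completions.create({
*     messages: [{ role: "user", content: "Summarize WebGPU in one sentence." }],
*   });
*   console.log(reply.choices[0].message.content);
*
*   const chunks = await engine.chat.completions.create({
*     stream: true,
*     messages: [{ role: "user", content: "Count from 1 to 5." }],
*   });
*   for await (const chunk of chunks) {
*     console.log(chunk.choices[0]?.delta?.content ?? "");
*   }
*/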
/**
* OpenAI-style API. Generates a text completion from a CompletionCreateParams request, without
* applying a chat template.
* Use `engine.completions.create()` to invoke this API.
*
* @param request An OpenAI-style Completion request.
*
* @note Requests sent to the same modelId will block until all previous requests finish.
* @note For more, see https://platform.openai.com/docs/api-reference/completions
*/
completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
completion(request: CompletionCreateParams): Promise<AsyncIterable<Completion> | Completion>;
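/*
* Example (illustrative sketch): raw text completion through `engine.completions.create()`, with
* no chat template applied. The `prompt` and `max_tokens` fields follow the OpenAI completions
* API and are assumed to be accepted by CompletionCreateParams.
*
*   const completion = await engine.completions.create({
*     prompt: "The three primary colors are",
*     max_tokens: 32,
*   });
*   console.log(completion.choices[0].text);
*/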
/**
* OpenAI-style API. Creates an embedding vector representing the input text.
* Use `engine.embeddings.create()` to invoke this API.
*
* @param request An OpenAI-style Embeddings request.
*
* @note Requests sent to the same modelId will block until all previous requests finish.
* @note For more, see https://platform.openai.com/docs/api-reference/embeddings/create
*/
embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
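/*
* Example (illustrative sketch): creating embeddings through `engine.embeddings.create()`. The
* `input` field and `data[i].embedding` shape follow the OpenAI embeddings API; an embedding
* model is assumed to have been loaded via reload().
*
*   const res = await engine.embeddings.create({
*     input: ["hardware accelerated LLMs", "in the browser"],
*   });
*   console.log(res.data[0].embedding.length, res.data[1].embedding.length);
*/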
/**
* @returns A text summarizing the runtime stats.
* @param modelId Only required when multiple models are loaded.
* @note This is an async function.
*/
runtimeStatsText: (modelId?: string) => Promise<string>;
/**
* Interrupt the generation process if it is running.
*/
interruptGenerate: () => void;
/**
* Explicitly unload the currently loaded model(s) and release the related resources. Waits until
* the WebGPU device finishes all submitted work and then destroys the device.
* @note This is an asynchronous function.
*/
unload: () => Promise<void>;
/**
* Reset the current chat session by clearing all memory.
* @param keepStats If true, do not reset the statistics.
* @param modelId Only required when multiple models are loaded.
*/
resetChat: (keepStats?: boolean, modelId?: string) => Promise<void>;
/**
* Get the response generated so far.
* @param modelId Only required when multiple models are loaded.
* @returns The current output message.
*/
getMessage: (modelId?: string) => Promise<string>;
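/*
* Example (illustrative sketch): interrupting a long-running generation, reading the partial
* output, and then clearing the session. How the pending request settles after interruption is
* not specified in this file; error handling is omitted for brevity.
*
*   setTimeout(() => engine.interruptGenerate(), 2000); // stop after ~2 seconds
*   await engine.chat.completions.create({
*     messages: [{ role: "user", content: "Write a very long story." }],
*   });
*   console.log(await engine.getMessage()); // the partially generated response
*   await engine.resetChat();
*/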
/**
* Returns the device's maxStorageBufferBindingSize, which can be used to guess whether the device
* has limited resources (e.g. an Android phone).
*/
getMaxStorageBufferBindingSize(): Promise<number>;
/**
* Returns the device's GPU vendor (e.g. arm, qualcomm, apple) if available; otherwise returns
* an empty string.
*/
getGPUVendor(): Promise<string>;
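/*
* Example (illustrative sketch): probing device capabilities to decide whether to pick a smaller
* model on resource-limited hardware. The 128 MB threshold is an arbitrary illustration, not a
* recommendation from this library.
*
*   const maxBinding = await engine.getMaxStorageBufferBindingSize();
*   const vendor = await engine.getGPUVendor();
*   const lowResource = maxBinding < 128 * 1024 * 1024;
*   console.log(`vendor=${vendor || "unknown"}, lowResource=${lowResource}`);
*/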
/**
* Forward the given input tokens to the model, then sample the next token.
*
* This function has side effects as the model will update its KV cache.
*
* @param inputIds The input tokens.
* @param isPrefill True if prefill, false if decode; only used for statistics.
* @param modelId Only required when multiple models are loaded.
* @returns Next token sampled.
* @note This is an async function.
*/
forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
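/*
* Example (illustrative sketch): low-level decoding with `forwardTokensAndSample()`. The token ids
* below are placeholders; real ids come from the model's tokenizer, which is not exposed in this
* file.
*
*   const promptIds = [1, 15043, 3186];                                // hypothetical token ids
*   let next = await engine.forwardTokensAndSample(promptIds, true);   // prefill
*   for (let i = 0; i < 8; i++) {
*     next = await engine.forwardTokensAndSample([next], false);       // decode one token per step
*   }
*/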
/**
* Set the MLCEngine logging output level.
*
* @param logLevel The new log level
*/
setLogLevel(logLevel: LogLevel): void;
}
export declare const LOG_LEVELS: {
TRACE: number;
DEBUG: number;
INFO: number;
WARN: number;
ERROR: number;
SILENT: number;
};
export type LogLevel = keyof typeof LOG_LEVELS;
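/*
* Example (illustrative sketch): raising the log level while debugging, then silencing logs.
*
*   engine.setLogLevel("DEBUG");
*   // ... inspect a session ...
*   engine.setLogLevel("SILENT");
*/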
//# sourceMappingURL=types.d.ts.map