@mlc-ai/web-llm (types.d.ts)

Hardware-accelerated language model chat in the browser

import { AppConfig, ChatOptions } from "./config";
import {
  ChatCompletionRequest,
  ChatCompletionRequestBase,
  ChatCompletionRequestStreaming,
  ChatCompletionRequestNonStreaming,
  ChatCompletion,
  ChatCompletionChunk,
  CompletionCreateParams,
  Completion,
  CompletionCreateParamsBase,
  CompletionCreateParamsStreaming,
  CompletionCreateParamsNonStreaming,
  EmbeddingCreateParams,
  CreateEmbeddingResponse,
} from "./openai_api_protocols/index";
import * as API from "./openai_api_protocols/index";
/**
 * Report during initialization.
 */
export interface InitProgressReport {
  progress: number;
  timeElapsed: number;
  text: string;
}
/**
 * Callback used to report the initialization process.
 */
export type InitProgressCallback = (report: InitProgressReport) => void;
/**
 * A stateful logitProcessor used to post-process logits after forwarding the input and before
 * sampling the next token. If used with `GenerationConfig.logit_bias`, logit_bias is applied
 * after `processLogits()` is called.
 */
export interface LogitProcessor {
  /**
   * Process logits after forward() and before sampling implicitly; happens on the CPU.
   * @param logits The logits right after forward().
   * Returns the processed logits.
   */
  processLogits: (logits: Float32Array) => Float32Array;
  /**
   * Use the sampled token to update the LogitProcessor's internal state. Called implicitly
   * right after the next token is sampled/committed.
   * @param token Token sampled from the processed logits.
   */
  processSampledToken: (token: number) => void;
  /**
   * Called in `MLCEngine.resetChat()`. Can clear internal state.
   */
  resetState: () => void;
}
/**
 * Common interface of MLCEngine that a UI can interact with.
 */
export interface MLCEngineInterface {
  /**
   * An object that exposes chat-related APIs.
   */
  chat: API.Chat;
  /**
   * An object that exposes text completion APIs.
   */
  completions: API.Completions;
  /**
   * An object that exposes embeddings APIs.
   */
  embeddings: API.Embeddings;
  /**
   * Set an initialization progress callback function
   * which reports the progress of model loading.
   *
   * This function can be useful for implementing a UI that
   * updates as the model loads.
   *
   * @param initProgressCallback The callback function
   */
  setInitProgressCallback: (initProgressCallback: InitProgressCallback) => void;
  /**
   * @returns The current initialization progress callback function.
   */
  getInitProgressCallback: () => InitProgressCallback | undefined;
  /**
   * Setter for the engine's appConfig.
   */
  setAppConfig: (appConfig: AppConfig) => void;
  /**
   * Reload the chat with a new model.
   *
   * @param modelId model_id of the model to load, either string or string[]. When multiple
   * models are provided, we load all models sequentially. Each modelId needs to either be in
   * `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.
   * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of
   * `modelId`. Its size needs to match that of `modelId`; chatOpts[i] will be used for
   * modelId[i].
   * @returns A promise that resolves when reload finishes.
   * @throws Throws an error when the device is lost (mostly due to OOM); users should re-call
   * reload(), potentially with a smaller model or smaller context window size.
   * @note This is an async function.
   */
  reload: (modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]) => Promise<void>;
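  // --- Illustrative usage sketch (not part of the original declarations) ---
  // Assuming an `engine: MLCEngineInterface` obtained from the package's engine factory
  // (engine construction is outside this file), loading a model with progress reporting
  // might look like this; the model id below is hypothetical:
  //
  //   engine.setInitProgressCallback((report: InitProgressReport) => {
  //     // report.progress is a fraction; report.text is a human-readable status.
  //     console.log(`${Math.round(report.progress * 100)}% - ${report.text}`);
  //   });
  //   await engine.reload("Llama-3.1-8B-Instruct-q4f32_1-MLC");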
  /**
   * OpenAI-style API. Generate a chat completion response for the given conversation and
   * configuration. Use `engine.chat.completions.create()` to invoke this API.
   *
   * @param request An OpenAI-style ChatCompletion request.
   *
   * @note The API is completely functional in behavior. That is, a previous request does not
   * affect the current request's result. Thus, for multi-round chatting, users are responsible
   * for maintaining the chat history. That said, as an implicit internal optimization, if we
   * detect that the user is performing multi-round chatting, we will preserve the KV cache and
   * only prefill the new tokens.
   * @note For requests sent to the same modelId, will block until all previous requests finish.
   * @note For more, see https://platform.openai.com/docs/api-reference/chat
   */
  chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
  chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
  chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
  chatCompletion(request: ChatCompletionRequest): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
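  // --- Illustrative usage sketch (not part of the original declarations) ---
  // A streaming chat completion through the OpenAI-style surface mentioned above
  // (`engine.chat.completions.create()`); passing `stream: true` selects the
  // AsyncIterable<ChatCompletionChunk> overload. The chunk shape follows the OpenAI
  // chat schema:
  //
  //   const chunks = await engine.chat.completions.create({
  //     stream: true,
  //     messages: [{ role: "user", content: "Explain WebGPU in one sentence." }],
  //   });
  //   let reply = "";
  //   for await (const chunk of chunks) {
  //     reply += chunk.choices[0]?.delta?.content ?? "";
  //   }
  //   console.log(reply);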
  /**
   * OpenAI-style API. Completes a CompletionCreateParams, a text completion with no chat
   * template. Use `engine.completions.create()` to invoke this API.
   *
   * @param request An OpenAI-style Completion request.
   *
   * @note For requests sent to the same modelId, will block until all previous requests finish.
   * @note For more, see https://platform.openai.com/docs/api-reference/completions
   */
  completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
  completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
  completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
  completion(request: CompletionCreateParams): Promise<AsyncIterable<Completion> | Completion>;
  /**
   * OpenAI-style API. Creates an embedding vector representing the input text.
   * Use `engine.embeddings.create()` to invoke this API.
   *
   * @param request An OpenAI-style Embeddings request.
   *
   * @note For requests sent to the same modelId, will block until all previous requests finish.
   * @note For more, see https://platform.openai.com/docs/api-reference/embeddings/create
   */
  embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
  /**
   * @returns A text summarizing the runtime stats.
   * @param modelId Only required when multiple models are loaded.
   * @note This is an async function.
   */
  runtimeStatsText: (modelId?: string) => Promise<string>;
  /**
   * Interrupt the generation process if it is already running.
   */
  interruptGenerate: () => void;
  /**
   * Explicitly unload the currently loaded model(s) and release the related resources. Waits
   * until the WebGPU device finishes all submitted work and destroys itself.
   * @note This is an asynchronous function.
   */
  unload: () => Promise<void>;
  /**
   * Reset the current chat session by clearing all memory.
   * @param keepStats If true, do not reset the statistics.
   * @param modelId Only required when multiple models are loaded.
   */
  resetChat: (keepStats?: boolean, modelId?: string) => Promise<void>;
  /**
   * Get the currently generated response.
   * @param modelId Only required when multiple models are loaded.
   * @returns The current output message.
   */
  getMessage: (modelId?: string) => Promise<string>;
  /**
   * Returns the device's maxStorageBufferBindingSize; can be used to guess whether the device
   * has limited resources like an Android phone.
   */
  getMaxStorageBufferBindingSize(): Promise<number>;
  /**
   * Returns the device's GPU vendor (e.g. arm, qualcomm, apple) if available. Otherwise returns
   * an empty string.
   */
  getGPUVendor(): Promise<string>;
  /**
   * Forward the given input tokens to the model, then sample the next token.
   *
   * This function has side effects as the model will update its KV cache.
   *
   * @param inputIds The input tokens.
   * @param isPrefill True if prefill, false if decode; only used for statistics.
   * @param modelId Only required when multiple models are loaded.
   * @returns The next sampled token.
   * @note This is an async function.
   */
  forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
  /**
   * Set the MLCEngine logging output level.
   *
   * @param logLevel The new log level.
   */
  setLogLevel(logLevel: LogLevel): void;
}
export declare const LOG_LEVELS: {
  TRACE: number;
  DEBUG: number;
  INFO: number;
  WARN: number;
  ERROR: number;
  SILENT: number;
};
export type LogLevel = keyof typeof LOG_LEVELS;
//# sourceMappingURL=types.d.ts.map
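// --- Illustrative sketch (not part of the original declarations) ---
// A minimal stateful LogitProcessor showing how the three callbacks cooperate: it applies
// a flat penalty (an arbitrary value chosen for illustration) to every token sampled so
// far. How a processor is registered with the engine is outside this file.
//
//   class RepetitionPenaltyProcessor implements LogitProcessor {
//     private seen = new Set<number>();
//
//     processLogits(logits: Float32Array): Float32Array {
//       // Runs on the CPU after forward() and before sampling.
//       for (const token of this.seen) {
//         logits[token] -= 5.0; // hypothetical flat penalty
//       }
//       return logits;
//     }
//
//     processSampledToken(token: number): void {
//       // Called right after a token is sampled/committed.
//       this.seen.add(token);
//     }
//
//     resetState(): void {
//       // Called from MLCEngine.resetChat().
//       this.seen.clear();
//     }
//   }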