@mlc-ai/web-llm
Hardware accelerated language model chats on browsers
TypeScript
import { ChatConfig, ChatOptions, AppConfig, GenerationConfig, MLCEngineConfig } from "./config";
import { LLMChatPipeline } from "./llm_chat";
import { ChatCompletionRequest, ChatCompletion, ChatCompletionChunk, ChatCompletionRequestNonStreaming, ChatCompletionRequestStreaming, ChatCompletionRequestBase, CompletionCreateParamsNonStreaming, CompletionCreateParamsStreaming, CompletionCreateParamsBase, CompletionCreateParams, Completion, EmbeddingCreateParams, CreateEmbeddingResponse } from "./openai_api_protocols/index";
import * as API from "./openai_api_protocols/index";
import { InitProgressCallback, MLCEngineInterface, LogitProcessor, LogLevel } from "./types";
/**
* Creates `MLCEngine`, and loads `modelId` onto WebGPU.
*
* Equivalent to `new webllm.MLCEngine().reload(...)`.
*
* @param modelId model_id of the model to load, either string or string[]. When multiple models
* are provided, we load all models sequentially. Each modelId needs to either be in
* `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.
* @param engineConfig Optionally configures the engine, see `webllm.MLCEngineConfig`.
* @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.
* If arrays are given, their sizes must match: chatOpts[i] will be used for modelId[i].
* @returns An initialized `WebLLM.MLCEngine` with `modelId` loaded.
* @throws Throws an error when the device is lost (mostly due to OOM); users should re-call `CreateMLCEngine()`,
* potentially with a smaller model or a smaller context window size.
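*
* A minimal end-to-end sketch; the model id below is illustrative and assumed to be present in
* `webllm.prebuiltAppConfig`:
* @example
* import * as webllm from "@mlc-ai/web-llm";
*
* // Download/compile the model and load it onto WebGPU, reporting progress along the way.
* const engine = await webllm.CreateMLCEngine(
*   "Llama-3.1-8B-Instruct-q4f32_1-MLC", // assumption: any prebuilt model id works here
*   { initProgressCallback: (p) => console.log(p.text) },
* );
* // OpenAI-style chat completion against the loaded model.
* const reply = await engine.chat.completions.create({
*   messages: [{ role: "user", content: "What is WebGPU?" }],
* });
* console.log(reply.choices[0].message.content);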
*/
export declare function CreateMLCEngine(modelId: string | string[], engineConfig?: MLCEngineConfig, chatOpts?: ChatOptions | ChatOptions[]): Promise<MLCEngine>;
/**
* The main interface of MLCEngine, which loads a model and performs tasks.
*
* You can either initialize one with `webllm.CreateMLCEngine(modelId)`, or
* `new webllm.MLCEngine().reload(modelId)`.
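*
* A minimal sketch of the second path; the model id is illustrative:
* @example
* const engine = new webllm.MLCEngine({
*   initProgressCallback: (p) => console.log(p.text),
* });
* await engine.reload("Llama-3.1-8B-Instruct-q4f32_1-MLC");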
*/
export declare class MLCEngine implements MLCEngineInterface {
/** For chat.completions.create() */
chat: API.Chat;
/** For completions.create() */
completions: API.Completions;
/** For embeddings.create() */
embeddings: API.Embeddings;
/** Maps each loaded model's modelId to its pipeline */
private loadedModelIdToPipeline;
/** Maps each loaded model's modelId to its chatConfig */
private loadedModelIdToChatConfig;
/** Maps each loaded model's modelId to its modelType */
private loadedModelIdToModelType;
/** Maps each loaded model's modelId to a lock. Ensures
* each model only processes one request at a time.
*/
private loadedModelIdToLock;
private logger;
private logitProcessorRegistry?;
private initProgressCallback?;
private appConfig;
private interruptSignal;
private deviceLostIsError;
private reloadController;
constructor(engineConfig?: MLCEngineConfig);
setAppConfig(appConfig: AppConfig): void;
setInitProgressCallback(initProgressCallback?: InitProgressCallback): void;
getInitProgressCallback(): InitProgressCallback | undefined;
setLogitProcessorRegistry(logitProcessorRegistry?: Map<string, LogitProcessor>): void;
/**
* Set MLCEngine logging output level
*
* @param logLevel The new log level
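*
* A one-line sketch, assuming "INFO" is one of the `LogLevel` literals:
* @example
* engine.setLogLevel("INFO"); // assumption: "INFO" is a valid LogLevel value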
*/
setLogLevel(logLevel: LogLevel): void;
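/**
* Loads `modelId` (and optional `chatOpts`) onto WebGPU, replacing any previously loaded
* models. Parameters mirror those of `CreateMLCEngine()` above.
*
* A hedged sketch; the model id and the temperature override are illustrative:
* @example
* await engine.reload("Llama-3.1-8B-Instruct-q4f32_1-MLC", { temperature: 0.7 });
*/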
reload(modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]): Promise<void>;
private reloadInternal;
unload(): Promise<void>;
private _generate;
/**
* Similar to `_generate()`, but instead of using a callback, we return an async iterable.
*/
asyncGenerate(request: ChatCompletionRequestStreaming, model: string, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig, timeReceived: number): AsyncGenerator<ChatCompletionChunk, void, void>;
asyncGenerate(request: CompletionCreateParamsStreaming, model: string, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig, timeReceived: number): AsyncGenerator<Completion, void, void>;
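/**
* Interrupts the ongoing generation, if any.
*
* A hedged sketch of stopping a streaming chat completion early (assumes a model is loaded):
* @example
* const chunks = await engine.chatCompletion({
*   messages: [{ role: "user", content: "Write a very long story." }],
*   stream: true,
* });
* for await (const chunk of chunks) {
*   console.log(chunk.choices[0]?.delta?.content ?? "");
*   await engine.interruptGenerate(); // stop after the first chunk; the stream then ends
* }
*/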
interruptGenerate(): Promise<void>;
/**
* Completes a single ChatCompletionRequest.
*
* @param request An OpenAI-style ChatCompletion request.
*
* @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
* `decode()`. This is important as it determines the behavior of various fields including `seed`.
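*
* A hedged streaming sketch (assumes a chat model has been loaded via `reload()`):
* @example
* const chunks = await engine.chatCompletion({
*   messages: [{ role: "user", content: "Tell me a short joke." }],
*   stream: true,
* });
* let reply = "";
* for await (const chunk of chunks) {
*   reply += chunk.choices[0]?.delta?.content ?? "";
* }
* console.log(reply);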
*/
chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
/**
* Completes a single CompletionCreateParams, a text completion with no chat template.
*
* @param request An OpenAI-style Completion request.
*
* @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
* `decode()`. This is important as it determines the behavior of various fields including `seed`.
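*
* A hedged sketch; the request fields follow the OpenAI completions schema and the prompt is
* illustrative:
* @example
* const res = await engine.completion({
*   prompt: "The capital of France is",
*   max_tokens: 16,
* });
* console.log(res.choices[0].text);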
*/
completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;
completion(request: CompletionCreateParamsStreaming): Promise<AsyncIterable<Completion>>;
completion(request: CompletionCreateParamsBase): Promise<AsyncIterable<Completion> | Completion>;
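/**
* Computes embeddings for a single EmbeddingCreateParams, following the OpenAI embeddings
* schema. Also reachable via `embeddings.create()`.
*
* A hedged sketch; it assumes an embedding model is already loaded, and `model` may need to be
* specified when multiple models are loaded:
* @example
* const res = await engine.embedding({
*   input: ["Hello world", "WebLLM runs in the browser"],
* });
* console.log(res.data[0].embedding.length);
*/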
embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;
getMaxStorageBufferBindingSize(): Promise<number>;
getGPUVendor(): Promise<string>;
private getLLMStates;
private getEmbeddingStates;
/**
* Returns the model, its LLMChatPipeline, and ChatConfig to use. Throws an error when it is
* unclear which model to load. Ensures all loadedModelIdToXXX maps contain an entry for the
* selected modelId.
* @param requestName The type of request or API to load the model for. Needed for error throwing.
* @param modelType The type of model, determining what type of pipeline to expect.
* @param modelId Model the user specified to load via the request. Required when multiple
* models are loaded.
*/
private getModelStates;
forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean, modelId?: string): Promise<number>;
/**
* Get the current generated response.
*
* @returns The current output message.
*/
getMessage(modelId?: string): Promise<string>;
runtimeStatsText(modelId?: string): Promise<string>;
resetChat(keepStats?: boolean, modelId?: string): Promise<void>;
/**
* Run a prefill step with a given input.
*
* If `input` is a ChatCompletionRequest, we treat `input.messages[-1]` as the usual user input.
* We then convert `input.messages[:-1]` to a `Conversation` object, representing the conversation
* history.
*
* If the new `Conversation` object matches the currently loaded one, we are performing
* multi-round chatting, so we do not reset and hence reuse the KV cache. Otherwise, we reset
* everything, treating the request as something completely new.
*
* @param input The OpenAI-style prompt to prefill.
* @param pipeline The loaded pipeline, hence model, to carry out this prefill.
* @param chatConfig The chat config to use for this model.
* @param genConfig Generation config.
*/
prefill(input: ChatCompletionRequest | CompletionCreateParams, pipeline: LLMChatPipeline, chatConfig: ChatConfig, genConfig: GenerationConfig): Promise<void>;
/**
* Run a decode step to decode the next token.
*/
decode(pipeline: LLMChatPipeline, genConfig?: GenerationConfig): Promise<void>;
}
//# sourceMappingURL=engine.d.ts.map