@mlc-ai/web-llm
Hardware accelerated language model chats on browsers
/**
* The input to the OpenAI API, directly adapted from openai-node with small tweaks:
* https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts
*
* Copyright 2024 OpenAI
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { MLCEngineInterface } from "../types";
import { ModelType } from "../config";
export declare class Chat {
private engine;
completions: Completions;
constructor(engine: MLCEngineInterface);
}
export declare class Completions {
private engine;
constructor(engine: MLCEngineInterface);
create(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
create(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
create(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
}
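/**
 * Example usage of `Completions.create()` (a minimal sketch; the model id below is an
 * assumption and must exist in `webllm.prebuiltAppConfig` or your own `appConfig`):
 *
 * ```ts
 * import { CreateMLCEngine } from "@mlc-ai/web-llm";
 *
 * const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC");
 *
 * // Non-streaming: resolves to a single ChatCompletion.
 * const reply = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "Hello!" }],
 * });
 * console.log(reply.choices[0].message.content);
 *
 * // Streaming: resolves to an AsyncIterable<ChatCompletionChunk>.
 * const chunks = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "Hello!" }],
 *   stream: true,
 *   stream_options: { include_usage: true },
 * });
 * let text = "";
 * for await (const chunk of chunks) {
 *   text += chunk.choices[0]?.delta.content ?? "";
 *   if (chunk.usage) console.log(chunk.usage); // only on the last chunk when include_usage is set
 * }
 * ```
 */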
/**
* OpenAI chat completion request protocol.
*
* API reference: https://platform.openai.com/docs/api-reference/chat/create
* Followed: https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts
*
* @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.
*/
export interface ChatCompletionRequestBase {
/**
* A list of messages comprising the conversation so far.
*/
messages: Array<ChatCompletionMessageParam>;
/**
* If set, partial message deltas will be sent; the stream is terminated by an empty chunk.
*/
stream?: boolean | null;
/**
* Options for streaming response. Only set this when you set `stream: true`.
*/
stream_options?: ChatCompletionStreamOptions | null;
/**
* How many chat completion choices to generate for each input message.
*/
n?: number | null;
/**
* Number between -2.0 and 2.0. Positive values penalize new tokens based on their
* existing frequency in the text so far, decreasing the model's likelihood to
* repeat the same line verbatim.
*
* [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
*/
frequency_penalty?: number | null;
/**
* Number between -2.0 and 2.0. Positive values penalize new tokens based on
* whether they appear in the text so far, increasing the model's likelihood to
* talk about new topics.
*
* [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)
*/
presence_penalty?: number | null;
/**
* The maximum number of [tokens](/tokenizer) that can be generated in the chat
* completion.
*
* The total length of input tokens and generated tokens is limited by the model's
* context length.
*/
max_tokens?: number | null;
/**
* Sequences where the API will stop generating further tokens.
*/
stop?: string | null | Array<string>;
/**
* What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
* make the output more random, while lower values like 0.2 will make it more
* focused and deterministic.
*/
temperature?: number | null;
/**
* An alternative to sampling with temperature, called nucleus sampling, where the
* model considers the results of the tokens with top_p probability mass. So 0.1
* means only the tokens comprising the top 10% probability mass are considered.
*/
top_p?: number | null;
/**
* Modify the likelihood of specified tokens appearing in the completion.
*
* Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)
* to an associated bias value from -100 to 100. Typically, you can inspect the model's
* `tokenizer.json` to see which token ID maps to which string. Mathematically, the bias is added to the
* logits generated by the model prior to sampling. The exact effect will vary per model, but
* values between -1 and 1 should decrease or increase likelihood of selection; values like -100
* or 100 should result in a ban or exclusive selection of the relevant token.
*
* As an example, you can pass `{"16230": -100}` to prevent the `Hello` token from being
* generated in Mistral-7B-Instruct-v0.2, according to the mapping in
* https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.
*
* @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.
* @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after
* `LogitProcessor.processLogits()` is called.
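*
* @example
* A minimal sketch; `engine` is an already-created `MLCEngine`, and the token id is the
* Mistral-7B-Instruct-v0.2 `Hello` token from the mapping above:
* ```ts
* const reply = await engine.chat.completions.create({
*   messages: [{ role: "user", content: "Greet me." }],
*   logit_bias: { "16230": -100 }, // discourage the `Hello` token
* });
* ```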
*/
logit_bias?: Record<string, number> | null;
/**
* Whether to return log probabilities of the output tokens or not.
*
* If true, returns the log probabilities of each output token returned in the `content` of
* `message`.
*/
logprobs?: boolean | null;
/**
* An integer between 0 and 5 specifying the number of most likely tokens to return
* at each token position, each with an associated log probability. `logprobs` must
* be set to `true` if this parameter is used.
*/
top_logprobs?: number | null;
/**
* If specified, our system will make a best effort to sample deterministically, such that
* repeated requests with the same `seed` and parameters should return the same result.
*
* @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you
* would still get different content for each `Choice`. But if two requests with `n = 2` are
* processed with the same seed, the two results should be identical (though the two choices
* within each request still differ).
*/
seed?: number | null;
/**
* Controls which (if any) function is called by the model. `none` means the model
* will not call a function and instead generates a message. `auto` means the model
* can pick between generating a message or calling a function. Specifying a
* particular function via
* `{"type": "function", "function": {"name": "my_function"}}` forces the model to
* call that function.
*
* `none` is the default when no functions are present. `auto` is the default if
* functions are present.
*/
tool_choice?: ChatCompletionToolChoiceOption;
/**
* A list of tools the model may call. Currently, only functions are supported as a
* tool. Use this to provide a list of functions the model may generate JSON inputs
* for.
*
* The corresponding reply would populate the `tool_calls` field. If used with streaming,
* the last chunk would contain the `tool_calls` field, while the intermediate chunks would
* contain the raw string.
*
* If the generation terminates with a finish reason other than "stop" (i.e. "length" or "abort"),
* no `tool_calls` will be returned; the user can still get the raw string output.
*/
tools?: Array<ChatCompletionTool>;
/**
* An object specifying the format that the model must output.
*
* Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
* message the model generates is valid JSON.
*
* **Important:** when using JSON mode, you **must** also instruct the model to
* produce JSON yourself via a system or user message. Without this, the model may
* generate an unending stream of whitespace until the generation reaches the token
* limit, resulting in a long-running and seemingly "stuck" request. Also note that
* the message content may be partially cut off if `finish_reason="length"`, which
* indicates the generation exceeded `max_tokens` or the conversation exceeded the
* max context length.
*/
response_format?: ResponseFormat;
/**
* If true, will ignore the stop strings and stop tokens and generate until max_tokens is hit.
* If unset, will be treated as false.
*/
ignore_eos?: boolean;
/**
* ID of the model to use. This equals `ModelRecord.model_id`, which needs to either be in
* `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.
*
* @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.
* @note If only one model is loaded in the engine, this field is optional. If multiple models
* are loaded, this is required.
*/
model?: string | null;
}
export interface ChatCompletionRequestNonStreaming extends ChatCompletionRequestBase {
/**
* If set, partial message deltas will be sent; the stream is terminated by an empty chunk.
*/
stream?: false | null;
}
export interface ChatCompletionRequestStreaming extends ChatCompletionRequestBase {
/**
* If set, partial message deltas will be sent; the stream is terminated by an empty chunk.
*/
stream: true;
}
export type ChatCompletionRequest = ChatCompletionRequestNonStreaming | ChatCompletionRequestStreaming;
/**
* Represents a chat completion response returned by model, based on the provided input.
*/
export interface ChatCompletion {
/**
* A unique identifier for the chat completion.
*/
id: string;
/**
* A list of chat completion choices. Can contain more than one element if `n` is greater than 1.
*/
choices: Array<ChatCompletion.Choice>;
/**
* The model used for the chat completion.
*/
model: string;
/**
* The object type, which is always `chat.completion`.
*/
object: "chat.completion";
/**
* The Unix timestamp (in seconds) of when the chat completion was created.
*
*/
created: number;
/**
* Usage statistics for the completion request.
*
* @note If we detect that the user is performing multi-round chatting, only the new portion of the
* prompt is counted toward prompt_tokens. If `n > 1`, the generation usage of all choices is combined.
*/
usage?: CompletionUsage;
/**
* This fingerprint represents the backend configuration that the model runs with.
*
* Can be used in conjunction with the `seed` request parameter to understand when
* backend changes have been made that might impact determinism.
*
* @note Not supported yet.
*/
system_fingerprint?: string;
}
/**
* Represents a streamed chunk of a chat completion response returned by model,
* based on the provided input.
*/
export interface ChatCompletionChunk {
/**
* A unique identifier for the chat completion. Each chunk has the same ID.
*/
id: string;
/**
* A list of chat completion choices. Can contain more than one element if `n` is
* greater than 1. Can also be empty for the last chunk if you set
* `stream_options: {"include_usage": true}`.
*/
choices: Array<ChatCompletionChunk.Choice>;
/**
* The Unix timestamp (in seconds) of when the chat completion was created. Each
* chunk has the same timestamp.
*/
created: number;
/**
* The model used to generate the completion.
*/
model: string;
/**
* The object type, which is always `chat.completion.chunk`.
*/
object: "chat.completion.chunk";
/**
* This fingerprint represents the backend configuration that the model runs with.
* Can be used in conjunction with the `seed` request parameter to understand when
* backend changes have been made that might impact determinism.
*
* @note Not supported yet.
*/
system_fingerprint?: string;
/**
* An optional field that will only be present when you set
* `stream_options: {"include_usage": true}` in your request. When present, it
* contains a null value except for the last chunk which contains the token usage
* statistics for the entire request.
*/
usage?: CompletionUsage;
}
export declare const ChatCompletionRequestUnsupportedFields: Array<string>;
/**
* Post-initializes the request and verifies whether its input is valid. This function may therefore
* throw an error or update the request in place.
* @param request User's input request.
* @param currentModelId The current model loaded that will perform this request.
* @param currentModelType The type of the model loaded, which decides what requests can be handled.
*/
export declare function postInitAndCheckFields(request: ChatCompletionRequest, currentModelId: string, currentModelType: ModelType): void;
export type ChatCompletionContentPart = ChatCompletionContentPartText | ChatCompletionContentPartImage;
export interface ChatCompletionContentPartText {
/**
* The text content.
*/
text: string;
/**
* The type of the content part.
*/
type: "text";
}
export declare namespace ChatCompletionContentPartImage {
interface ImageURL {
/**
* Either a URL of the image or the base64 encoded image data.
*/
url: string;
/**
* Specifies the detail level of the image.
*/
detail?: "auto" | "low" | "high";
}
}
export interface ChatCompletionContentPartImage {
image_url: ChatCompletionContentPartImage.ImageURL;
/**
* The type of the content part.
*/
type: "image_url";
}
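/**
 * Example of a user message mixing text and image content parts (a sketch; this assumes a
 * vision-capable model is loaded in `engine`, and the image URL is illustrative):
 *
 * ```ts
 * const reply = await engine.chat.completions.create({
 *   messages: [
 *     {
 *       role: "user",
 *       content: [
 *         { type: "text", text: "What is in this picture?" },
 *         { type: "image_url", image_url: { url: "https://example.com/cat.png" } },
 *       ],
 *     },
 *   ],
 * });
 * ```
 */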
export interface ChatCompletionMessageToolCall {
/**
* The ID of the tool call. In WebLLM, it is used as the index of the tool call among all
* the tool calls in this request's generation.
*/
id: string;
/**
* The function that the model called.
*/
function: ChatCompletionMessageToolCall.Function;
/**
* The type of the tool. Currently, only `function` is supported.
*/
type: "function";
}
export declare namespace ChatCompletionMessageToolCall {
/**
* The function that the model called.
*/
interface Function {
/**
* The arguments to call the function with, as generated by the model in JSON
* format.
*/
arguments: string;
/**
* The name of the function to call.
*/
name: string;
}
}
/**
* The role of the author of a message
*/
export type ChatCompletionRole = "system" | "user" | "assistant" | "tool" | "function";
/**
* Options for streaming response. Only set this when you set `stream: true`.
*/
export interface ChatCompletionStreamOptions {
/**
* If set, an additional chunk will be streamed after the last empty chunk.
* The `usage` field on this chunk shows the token usage statistics for the entire
* request, and the `choices` field will always be an empty array. All other chunks
* will also include a `usage` field, but with a null value.
*/
include_usage?: boolean;
}
export interface ChatCompletionSystemMessageParam {
/**
* The contents of the system message.
*/
content: string;
/**
* The role of the messages author, in this case `system`.
*/
role: "system";
}
export interface ChatCompletionUserMessageParam {
/**
* The contents of the user message.
*/
content: string | Array<ChatCompletionContentPart>;
/**
* The role of the messages author, in this case `user`.
*/
role: "user";
/**
* An optional name for the participant. Provides the model information to
* differentiate between participants of the same role.
*
* @note This is experimental, as models typically have predefined names for the user.
*/
name?: string;
}
export interface ChatCompletionAssistantMessageParam {
/**
* The role of the messages author, in this case `assistant`.
*/
role: "assistant";
/**
* The contents of the assistant message. Required unless `tool_calls` is specified.
*/
content?: string | null;
/**
* An optional name for the participant. Provides the model information to
* differentiate between participants of the same role.
*
* @note This is experimental, as models typically have predefined names for the user.
*/
name?: string;
/**
* The tool calls generated by the model, such as function calls.
*/
tool_calls?: Array<ChatCompletionMessageToolCall>;
}
export interface ChatCompletionToolMessageParam {
/**
* The contents of the tool message.
*/
content: string;
/**
* The role of the messages author, in this case `tool`.
*/
role: "tool";
/**
* Tool call that this message is responding to.
*/
tool_call_id: string;
}
export type ChatCompletionMessageParam = ChatCompletionSystemMessageParam | ChatCompletionUserMessageParam | ChatCompletionAssistantMessageParam | ChatCompletionToolMessageParam;
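/**
 * Example of a multi-round `messages` array combining the message param types above
 * (a sketch; assumes `ChatCompletionMessageParam` is re-exported from the package root and
 * `engine` is an already-created `MLCEngine`):
 *
 * ```ts
 * import type { ChatCompletionMessageParam } from "@mlc-ai/web-llm";
 *
 * const messages: ChatCompletionMessageParam[] = [
 *   { role: "system", content: "You are a terse assistant." },
 *   { role: "user", content: "Name one planet." },
 *   { role: "assistant", content: "Mars." },
 *   { role: "user", content: "Name another." },
 * ];
 * const reply = await engine.chat.completions.create({ messages });
 * ```
 */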
/**
* The parameters the function accepts, described as a JSON Schema object. See the
* [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)
* for examples, and the
* [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for
* documentation about the format.
*
* Omitting `parameters` defines a function with an empty parameter list.
*/
export type FunctionParameters = Record<string, unknown>;
export interface FunctionDefinition {
/**
* The name of the function to be called. Must be a-z, A-Z, 0-9, or contain
* underscores and dashes, with a maximum length of 64.
*/
name: string;
/**
* A description of what the function does, used by the model to choose when and
* how to call the function.
*/
description?: string;
/**
* The parameters the function accepts, described as a JSON Schema object. See the
* [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)
* for examples, and the
* [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for
* documentation about the format.
*
* Omitting `parameters` defines a function with an empty parameter list.
*/
parameters?: FunctionParameters;
}
export interface ChatCompletionTool {
function: FunctionDefinition;
/**
* The type of the tool. Currently, only `function` is supported.
*/
type: "function";
}
/**
* Specifies a tool the model should use. Use to force the model to call a specific
* function.
*/
export interface ChatCompletionNamedToolChoice {
function: ChatCompletionNamedToolChoice.Function;
/**
* The type of the tool. Currently, only `function` is supported.
*/
type: "function";
}
export declare namespace ChatCompletionNamedToolChoice {
interface Function {
/**
* The name of the function to call.
*/
name: string;
}
}
/**
* Controls which (if any) function is called by the model. `none` means the model
* will not call a function and instead generates a message. `auto` means the model
* can pick between generating a message or calling a function. Specifying a
* particular function via
* `{"type": "function", "function": {"name": "my_function"}}` forces the model to
* call that function.
*
* `none` is the default when no functions are present. `auto` is the default if
* functions are present.
*/
export type ChatCompletionToolChoiceOption = "none" | "auto" | ChatCompletionNamedToolChoice;
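/**
 * Example of function calling with `tools` and `tool_choice` (a sketch; the weather tool and
 * its JSON-schema parameters are hypothetical, and tool calling requires a model that supports
 * it, e.g. a Hermes-2-Pro model from `webllm.prebuiltAppConfig`):
 *
 * ```ts
 * const reply = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "What's the weather in Tokyo?" }],
 *   tools: [
 *     {
 *       type: "function",
 *       function: {
 *         name: "get_weather",
 *         description: "Get the current weather for a city.",
 *         parameters: {
 *           type: "object",
 *           properties: { city: { type: "string" } },
 *           required: ["city"],
 *         },
 *       },
 *     },
 *   ],
 *   tool_choice: "auto",
 * });
 *
 * const call = reply.choices[0].message.tool_calls?.[0];
 * if (call) {
 *   const args = JSON.parse(call.function.arguments); // e.g. { city: "Tokyo" }
 *   console.log(call.function.name, args);
 * }
 * ```
 */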
export interface TopLogprob {
/**
* The token.
*/
token: string;
/**
* A list of integers representing the UTF-8 bytes representation of the token.
* Useful in instances where characters are represented by multiple tokens and
* their byte representations must be combined to generate the correct text
* representation. Can be `null` if there is no bytes representation for the token.
*
* @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.
* For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.
*/
bytes: Array<number> | null;
/**
* The log probability of this token.
*/
logprob: number;
}
export interface ChatCompletionTokenLogprob {
/**
* The token.
*/
token: string;
/**
* A list of integers representing the UTF-8 bytes representation of the token.
* Useful in instances where characters are represented by multiple tokens and
* their byte representations must be combined to generate the correct text
* representation. Can be `null` if there is no bytes representation for the token.
*
* @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.
* For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.
*/
bytes: Array<number> | null;
/**
* The log probability of this token.
*/
logprob: number;
/**
* List of the most likely tokens and their log probability, at this token
* position. In rare cases, there may be fewer than the number of requested
* `top_logprobs` returned.
*/
top_logprobs: Array<TopLogprob>;
}
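/**
 * Example of requesting and reading log probabilities (a sketch; `engine` is an
 * already-created `MLCEngine`):
 *
 * ```ts
 * const reply = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "Hi" }],
 *   logprobs: true,
 *   top_logprobs: 2,
 * });
 *
 * const decoder = new TextDecoder();
 * for (const tok of reply.choices[0].logprobs?.content ?? []) {
 *   // `bytes` can be decoded back to text with TextDecoder, as noted above.
 *   const text = tok.bytes ? decoder.decode(new Uint8Array(tok.bytes)) : tok.token;
 *   console.log(text, tok.logprob, tok.top_logprobs);
 * }
 * ```
 */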
/**
* A chat completion message generated by the model.
*/
export interface ChatCompletionMessage {
/**
* The contents of the message.
*/
content: string | null;
/**
* The role of the author of this message.
*/
role: "assistant";
/**
* The tool calls generated by the model, such as function calls.
*/
tool_calls?: Array<ChatCompletionMessageToolCall>;
}
/**
* Usage statistics for the completion request.
*/
export interface CompletionUsage {
/**
* Number of tokens in the generated completion.
*/
completion_tokens: number;
/**
* Number of tokens in the prompt.
*
* @note If we detect that the user is performing multi-round chatting, only the new portion of the
* prompt is counted toward prompt_tokens.
*/
prompt_tokens: number;
/**
* Total number of tokens used in the request (prompt + completion).
*/
total_tokens: number;
/**
* Fields specific to WebLLM, not present in OpenAI.
*/
extra: {
/**
* Total seconds spent on this request, from receiving the request to finishing generating the response.
*/
e2e_latency_s: number;
/**
* Number of tokens per second for prefilling.
*/
prefill_tokens_per_s: number;
/**
* Number of tokens per second for autoregressive decoding.
*/
decode_tokens_per_s: number;
/**
* Seconds spent generating the first token after receiving the request. Mainly consists of
* prefilling overhead. If n > 1, it is the sum over all choices.
*/
time_to_first_token_s: number;
/**
* Seconds in between generated tokens. Mainly contains decoding overhead. If n > 1, it
* is the average over all choices.
*/
time_per_output_token_s: number;
/**
* Seconds spent initializing the grammar matcher for structured output. If n > 1, it
* is the sum over all choices.
*/
grammar_init_s?: number;
/**
* Per-token seconds the grammar matcher spent creating the bitmask and accepting tokens for
* structured output. If n > 1, it is the average over all choices.
*/
grammar_per_token_s?: number;
};
}
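/**
 * Example of reading usage statistics, including the WebLLM-specific `extra` field (a sketch;
 * `engine` is an already-created `MLCEngine`):
 *
 * ```ts
 * const reply = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "Hello!" }],
 * });
 *
 * const usage = reply.usage;
 * if (usage) {
 *   console.log(`prompt: ${usage.prompt_tokens}, completion: ${usage.completion_tokens}`);
 *   console.log(`decode speed: ${usage.extra.decode_tokens_per_s.toFixed(1)} tok/s`);
 * }
 * ```
 */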
/**
* The reason the model stopped generating tokens. This will be `stop` if the model
* hit a natural stop point or a provided stop sequence, `length` if the maximum
* number of tokens specified in the request was reached or the context_window_size would
* be exceeded, `tool_calls` if the model called a tool, or `abort` if the user manually stops the
* generation.
*/
export type ChatCompletionFinishReason = "stop" | "length" | "tool_calls" | "abort";
export declare namespace ChatCompletion {
interface Choice {
/**
* The reason the model stopped generating tokens. This will be `stop` if the model
* hit a natural stop point or a provided stop sequence, `length` if the maximum
* number of tokens specified in the request was reached, `tool_calls` if the
* model called a tool, or `abort` if the user manually stops the generation.
*/
finish_reason: ChatCompletionFinishReason;
/**
* The index of the choice in the list of choices.
*/
index: number;
/**
* Log probability information for the choice.
*/
logprobs: Choice.Logprobs | null;
/**
* A chat completion message generated by the model.
*/
message: ChatCompletionMessage;
}
namespace Choice {
/**
* Log probability information for the choice.
*/
interface Logprobs {
/**
* A list of message content tokens with log probability information.
*/
content: Array<ChatCompletionTokenLogprob> | null;
}
}
}
export declare namespace ChatCompletionChunk {
interface Choice {
/**
* A chat completion delta generated by streamed model responses.
*/
delta: Choice.Delta;
/**
* The reason the model stopped generating tokens. This will be `stop` if the model
* hit a natural stop point or a provided stop sequence, `length` if the maximum
* number of tokens specified in the request was reached, `tool_calls` if the
* model called a tool, or `abort` if the user manually stops the generation.
*/
finish_reason: ChatCompletionFinishReason | null;
/**
* The index of the choice in the list of choices.
*/
index: number;
/**
* Log probability information for the choice.
*/
logprobs?: Choice.Logprobs | null;
}
namespace Choice {
/**
* A chat completion delta generated by streamed model responses.
*/
interface Delta {
/**
* The contents of the chunk message.
*/
content?: string | null;
/**
* The role of the author of this message.
*/
role?: "system" | "user" | "assistant" | "tool";
tool_calls?: Array<Delta.ToolCall>;
}
namespace Delta {
interface ToolCall {
/**
* The index of the tool call among all the tool calls in this request's generation.
*/
index: number;
/**
* The ID of the tool call. Not used in WebLLM.
*/
id?: string;
function?: ToolCall.Function;
/**
* The type of the tool. Currently, only `function` is supported.
*/
type?: "function";
}
namespace ToolCall {
interface Function {
/**
* The arguments to call the function with, as generated by the model in JSON
* format. Note that the model does not always generate valid JSON, and may
* hallucinate parameters not defined by your function schema. Validate the
* arguments in your code before calling your function.
*/
arguments?: string;
/**
* The name of the function to call.
*/
name?: string;
}
}
}
/**
* Log probability information for the choice.
*/
interface Logprobs {
/**
* A list of message content tokens with log probability information.
*/
content: Array<ChatCompletionTokenLogprob> | null;
}
}
}
/**
* An object specifying the format that the model must output.
*
* Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the
* message the model generates is valid JSON.
*
* Setting to `{ "type": "grammar" }` requires you to also specify the `grammar` field, which
* is a BNFGrammar string.
*
* Setting `schema` specifies the output format of the JSON object, such as the properties to include.
*
* **Important:** when using JSON mode, you **must** also instruct the model to produce JSON
* following the schema (if specified) yourself via a system or user message. Without this,
* the model may generate an unending stream of whitespace until the generation reaches the token
* limit, resulting in a long-running and seemingly "stuck" request. Also note that
* the message content may be partially cut off if `finish_reason="length"`, which
* indicates the generation exceeded `max_tokens` or the conversation exceeded the
* max context length.
*/
export interface ResponseFormat {
/**
* Must be one of `text`, `json_object`, or `grammar`.
*/
type?: "text" | "json_object" | "grammar";
/**
* A JSON schema string that specifies the format of the output JSON object. `type` needs to be `json_object`.
*/
schema?: string;
/**
* An EBNF-formatted string. Must be specified when, and only when, `type` is `grammar`.
* The grammar will be normalized (simplified) by default.
* EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note:
* 1. Use # as the comment mark
* 2. Use C-style unicode escape sequence \u01AB, \U000001AB, \xAB
* 3. A-B (match A and not match B) is not supported yet
* 4. Lookahead assertion can be added at the end of a rule to speed up matching. E.g.
* ```
* main ::= "ab" a [a-z]
* a ::= "cd" (=[a-z])
* ```
* The assertion (=[a-z]) means a must be followed by [a-z].
*/
grammar?: string;
}
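/**
 * Example uses of `ResponseFormat` via `response_format` (a sketch; the schema and grammar
 * strings are illustrative, and `engine` is an already-created `MLCEngine`):
 *
 * ```ts
 * // JSON mode constrained by a JSON schema string. Remember to also instruct the model to
 * // produce JSON via a system or user message.
 * const schema = JSON.stringify({
 *   type: "object",
 *   properties: { name: { type: "string" }, age: { type: "integer" } },
 *   required: ["name", "age"],
 * });
 * const jsonReply = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "Describe a fictional person in JSON." }],
 *   response_format: { type: "json_object", schema },
 * });
 *
 * // Grammar mode with an EBNF string, here restricting the answer to "yes" or "no".
 * const grammarReply = await engine.chat.completions.create({
 *   messages: [{ role: "user", content: "Is the sky blue? Answer yes or no." }],
 *   response_format: { type: "grammar", grammar: 'main ::= "yes" | "no"' },
 * });
 * ```
 */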
//# sourceMappingURL=chat_completion.d.ts.map