UNPKG

@mlc-ai/web-llm

Version:

Hardware accelerated language model chats on browsers

796 lines 31.3 kB
/** * The input to OpenAI API, directly adopted from openai-node with small tweaks: * https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts * * Copyright 2024 OpenAI * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import { MLCEngineInterface } from "../types"; import { ModelType } from "../config"; export declare class Chat { private engine; completions: Completions; constructor(engine: MLCEngineInterface); } export declare class Completions { private engine; constructor(engine: MLCEngineInterface); create(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>; create(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>; create(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>; } /** * OpenAI chat completion request protocol. * * API reference: https://platform.openai.com/docs/api-reference/chat/create * Followed: https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts * * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API. */ export interface ChatCompletionRequestBase { /** * A list of messages comprising the conversation so far. */ messages: Array<ChatCompletionMessageParam>; /** * If set, partial message deltas will be sent. It will be terminated by an empty chunk. */ stream?: boolean | null; /** * Options for streaming response. Only set this when you set `stream: true`. */ stream_options?: ChatCompletionStreamOptions | null; /** * How many chat completion choices to generate for each input message. */ n?: number | null; /** * Number between -2.0 and 2.0. Positive values penalize new tokens based on their * existing frequency in the text so far, decreasing the model's likelihood to * repeat the same line verbatim. * * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details) */ frequency_penalty?: number | null; /** * Number between -2.0 and 2.0. Positive values penalize new tokens based on * whether they appear in the text so far, increasing the model's likelihood to * talk about new topics. * * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details) */ presence_penalty?: number | null; /** * The maximum number of [tokens](/tokenizer) that can be generated in the chat * completion. * * The total length of input tokens and generated tokens is limited by the model's * context length. */ max_tokens?: number | null; /** * Sequences where the API will stop generating further tokens. */ stop?: string | null | Array<string>; /** * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will * make the output more random, while lower values like 0.2 will make it more * focused and deterministic. */ temperature?: number | null; /** * An alternative to sampling with temperature, called nucleus sampling, where the * model considers the results of the tokens with top_p probability mass. So 0.1 * means only the tokens comprising the top 10% probability mass are considered. */ top_p?: number | null; /** * Modify the likelihood of specified tokens appearing in the completion. * * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model) * to an associated bias value from -100 to 100. Typically, you can see `tokenizer.json` of the * model to see which token ID maps to what string. Mathematically, the bias is added to the * logits generated by the model prior to sampling. The exact effect will vary per model, but * values between -1 and 1 should decrease or increase likelihood of selection; values like -100 * or 100 should result in a ban or exclusive selection of the relevant token. * * As an example, you can pass `{"16230": -100}` to prevent the `Hello` token from being * generated in Mistral-7B-Instruct-v0.2, according to the mapping in * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json. * * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`. * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after * `LogitProcessor.processLogits()` is called. */ logit_bias?: Record<string, number> | null; /** * Whether to return log probabilities of the output tokens or not. * * If true, returns the log probabilities of each output token returned in the `content` of * `message`. */ logprobs?: boolean | null; /** * An integer between 0 and 5 specifying the number of most likely tokens to return * at each token position, each with an associated log probability. `logprobs` must * be set to `true` if this parameter is used. */ top_logprobs?: number | null; /** * If specified, our system will make a best effort to sample deterministically, such that * repeated requests with the same `seed` and parameters should return the same result. * * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you * would still get different content for each `Choice`. But if two requests with `n = 2` are * processed with the same seed, the two results should be the same (two choices are different). */ seed?: number | null; /** * Controls which (if any) function is called by the model. `none` means the model * will not call a function and instead generates a message. `auto` means the model * can pick between generating a message or calling a function. Specifying a * particular function via * `{"type": "function", "function": {"name": "my_function"}}` forces the model to * call that function. * * `none` is the default when no functions are present. `auto` is the default if * functions are present. */ tool_choice?: ChatCompletionToolChoiceOption; /** * A list of tools the model may call. Currently, only functions are supported as a * tool. Use this to provide a list of functions the model may generate JSON inputs * for. * * The corresponding reply would populate the `tool_calls` field. If used with streaming, * the last chunk would contain the `tool_calls` field, while the intermediate chunks would * contain the raw string. * * If the generation terminates due to FinishReason other than "stop" (i.e. "length" or "abort"), * then no `tool_calls` will be returned. User can still get the raw string output. */ tools?: Array<ChatCompletionTool>; /** * An object specifying the format that the model must output. * * Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the * message the model generates is valid JSON. * * **Important:** when using JSON mode, you **must** also instruct the model to * produce JSON yourself via a system or user message. Without this, the model may * generate an unending stream of whitespace until the generation reaches the token * limit, resulting in a long-running and seemingly "stuck" request. Also note that * the message content may be partially cut off if `finish_reason="length"`, which * indicates the generation exceeded `max_tokens` or the conversation exceeded the * max context length. */ response_format?: ResponseFormat; /** * If true, will ignore stop string and stop token and generate until max_tokens hit. * If unset, will treat as false. */ ignore_eos?: boolean; /** * ID of the model to use. This equals to `ModelRecord.model_id`, which needs to either be in * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`. * * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time. * @note If only one model is loaded in the engine, this field is optional. If multiple models * are loaded, this is required. */ model?: string | null; } export interface ChatCompletionRequestNonStreaming extends ChatCompletionRequestBase { /** * If set, partial message deltas will be sent. It will be terminated by an empty chunk. */ stream?: false | null; } export interface ChatCompletionRequestStreaming extends ChatCompletionRequestBase { /** * If set, partial message deltas will be sent. It will be terminated by an empty chunk. */ stream: true; } export type ChatCompletionRequest = ChatCompletionRequestNonStreaming | ChatCompletionRequestStreaming; /** * Represents a chat completion response returned by model, based on the provided input. */ export interface ChatCompletion { /** * A unique identifier for the chat completion. */ id: string; /** * A list of chat completion choices. Can be more than one if `n` is greater than 1. */ choices: Array<ChatCompletion.Choice>; /** * The model used for the chat completion. */ model: string; /** * The object type, which is always `chat.completion`. */ object: "chat.completion"; /** * The Unix timestamp (in seconds) of when the chat completion was created. * */ created: number; /** * Usage statistics for the completion request. * * @note If we detect user is performing multi-round chatting, only the new portion of the * prompt is counted for prompt_tokens. If `n > 1`, all choices' generation usages combined. */ usage?: CompletionUsage; /** * This fingerprint represents the backend configuration that the model runs with. * * Can be used in conjunction with the `seed` request parameter to understand when * backend changes have been made that might impact determinism. * * @note Not supported yet. */ system_fingerprint?: string; } /** * Represents a streamed chunk of a chat completion response returned by model, * based on the provided input. */ export interface ChatCompletionChunk { /** * A unique identifier for the chat completion. Each chunk has the same ID. */ id: string; /** * A list of chat completion choices. Can contain more than one elements if `n` is * greater than 1. Can also be empty for the last chunk if you set * `stream_options: {"include_usage": true}`. */ choices: Array<ChatCompletionChunk.Choice>; /** * The Unix timestamp (in seconds) of when the chat completion was created. Each * chunk has the same timestamp. */ created: number; /** * The model to generate the completion. */ model: string; /** * The object type, which is always `chat.completion.chunk`. */ object: "chat.completion.chunk"; /** * This fingerprint represents the backend configuration that the model runs with. * Can be used in conjunction with the `seed` request parameter to understand when * backend changes have been made that might impact determinism. * * @note Not supported yet. */ system_fingerprint?: string; /** * An optional field that will only be present when you set * `stream_options: {"include_usage": true}` in your request. When present, it * contains a null value except for the last chunk which contains the token usage * statistics for the entire request. */ usage?: CompletionUsage; } export declare const ChatCompletionRequestUnsupportedFields: Array<string>; /** * Post init and verify whether the input of the request is valid. Thus, this function can throw * error or in-place update request. * @param request User's input request. * @param currentModelId The current model loaded that will perform this request. * @param currentModelType The type of the model loaded, decide what requests can be handled. */ export declare function postInitAndCheckFields(request: ChatCompletionRequest, currentModelId: string, currentModelType: ModelType): void; export type ChatCompletionContentPart = ChatCompletionContentPartText | ChatCompletionContentPartImage; export interface ChatCompletionContentPartText { /** * The text content. */ text: string; /** * The type of the content part. */ type: "text"; } export declare namespace ChatCompletionContentPartImage { interface ImageURL { /** * Either a URL of the image or the base64 encoded image data. */ url: string; /** * Specifies the detail level of the image. */ detail?: "auto" | "low" | "high"; } } export interface ChatCompletionContentPartImage { image_url: ChatCompletionContentPartImage.ImageURL; /** * The type of the content part. */ type: "image_url"; } export interface ChatCompletionMessageToolCall { /** * The ID of the tool call. In WebLLM, it is used as the index of the tool call among all * the tools calls in this request generation. */ id: string; /** * The function that the model called. */ function: ChatCompletionMessageToolCall.Function; /** * The type of the tool. Currently, only `function` is supported. */ type: "function"; } export declare namespace ChatCompletionMessageToolCall { /** * The function that the model called. */ interface Function { /** * The arguments to call the function with, as generated by the model in JSON * format. */ arguments: string; /** * The name of the function to call. */ name: string; } } /** * The role of the author of a message */ export type ChatCompletionRole = "system" | "user" | "assistant" | "tool" | "function"; /** * Options for streaming response. Only set this when you set `stream: true`. */ export interface ChatCompletionStreamOptions { /** * If set, an additional chunk will be streamed after the last empty chunk. * The `usage` field on this chunk shows the token usage statistics for the entire * request, and the `choices` field will always be an empty array. All other chunks * will also include a `usage` field, but with a null value. */ include_usage?: boolean; } export interface ChatCompletionSystemMessageParam { /** * The contents of the system message. */ content: string; /** * The role of the messages author, in this case `system`. */ role: "system"; } export interface ChatCompletionUserMessageParam { /** * The contents of the user message. */ content: string | Array<ChatCompletionContentPart>; /** * The role of the messages author, in this case `user`. */ role: "user"; /** * An optional name for the participant. Provides the model information to * differentiate between participants of the same role. * * @note This is experimental, as models typically have predefined names for the user. */ name?: string; } export interface ChatCompletionAssistantMessageParam { /** * The role of the messages author, in this case `assistant`. */ role: "assistant"; /** * The contents of the assistant message. Required unless `tool_calls` is specified. */ content?: string | null; /** * An optional name for the participant. Provides the model information to * differentiate between participants of the same role. * * @note This is experimental, as models typically have predefined names for the user. */ name?: string; /** * The tool calls generated by the model, such as function calls. */ tool_calls?: Array<ChatCompletionMessageToolCall>; } export interface ChatCompletionToolMessageParam { /** * The contents of the tool message. */ content: string; /** * The role of the messages author, in this case `tool`. */ role: "tool"; /** * Tool call that this message is responding to. */ tool_call_id: string; } export type ChatCompletionMessageParam = ChatCompletionSystemMessageParam | ChatCompletionUserMessageParam | ChatCompletionAssistantMessageParam | ChatCompletionToolMessageParam; /** * The parameters the functions accepts, described as a JSON Schema object. See the * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling) * for examples, and the * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for * documentation about the format. * * Omitting `parameters` defines a function with an empty parameter list. */ export type FunctionParameters = Record<string, unknown>; export interface FunctionDefinition { /** * The name of the function to be called. Must be a-z, A-Z, 0-9, or contain * underscores and dashes, with a maximum length of 64. */ name: string; /** * A description of what the function does, used by the model to choose when and * how to call the function. */ description?: string; /** * The parameters the functions accepts, described as a JSON Schema object. See the * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling) * for examples, and the * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for * documentation about the format. * * Omitting `parameters` defines a function with an empty parameter list. */ parameters?: FunctionParameters; } export interface ChatCompletionTool { function: FunctionDefinition; /** * The type of the tool. Currently, only `function` is supported. */ type: "function"; } /** * Specifies a tool the model should use. Use to force the model to call a specific * function. */ export interface ChatCompletionNamedToolChoice { function: ChatCompletionNamedToolChoice.Function; /** * The type of the tool. Currently, only `function` is supported. */ type: "function"; } export declare namespace ChatCompletionNamedToolChoice { interface Function { /** * The name of the function to call. */ name: string; } } /** * Controls which (if any) function is called by the model. `none` means the model * will not call a function and instead generates a message. `auto` means the model * can pick between generating a message or calling a function. Specifying a * particular function via * `{"type": "function", "function": {"name": "my_function"}}` forces the model to * call that function. * * `none` is the default when no functions are present. `auto` is the default if * functions are present. */ export type ChatCompletionToolChoiceOption = "none" | "auto" | ChatCompletionNamedToolChoice; export interface TopLogprob { /** * The token. */ token: string; /** * A list of integers representing the UTF-8 bytes representation of the token. * Useful in instances where characters are represented by multiple tokens and * their byte representations must be combined to generate the correct text * representation. Can be `null` if there is no bytes representation for the token. * * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`. * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode. */ bytes: Array<number> | null; /** * The log probability of this token. */ logprob: number; } export interface ChatCompletionTokenLogprob { /** * The token. */ token: string; /** * A list of integers representing the UTF-8 bytes representation of the token. * Useful in instances where characters are represented by multiple tokens and * their byte representations must be combined to generate the correct text * representation. Can be `null` if there is no bytes representation for the token. * * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`. * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode. */ bytes: Array<number> | null; /** * The log probability of this token. */ logprob: number; /** * List of the most likely tokens and their log probability, at this token * position. In rare cases, there may be fewer than the number of requested * `top_logprobs` returned. */ top_logprobs: Array<TopLogprob>; } /** * A chat completion message generated by the model. */ export interface ChatCompletionMessage { /** * The contents of the message. */ content: string | null; /** * The role of the author of this message. */ role: "assistant"; /** * The tool calls generated by the model, such as function calls. */ tool_calls?: Array<ChatCompletionMessageToolCall>; } /** * Usage statistics for the completion request. */ export interface CompletionUsage { /** * Number of tokens in the generated completion. */ completion_tokens: number; /** * Number of tokens in the prompt. * * @note If we detect user is performing multi-round chatting, only the new portion of the * prompt is counted for prompt_tokens. */ prompt_tokens: number; /** * Total number of tokens used in the request (prompt + completion). */ total_tokens: number; /** * Fields specific to WebLLM, not present in OpenAI. */ extra: { /** * Total seconds spent on this request, from receiving the request, to generating the response. */ e2e_latency_s: number; /** * Number of tokens per second for prefilling. */ prefill_tokens_per_s: number; /** * Number of tokens per second for autoregressive decoding. */ decode_tokens_per_s: number; /** * Seconds spent to generate the first token since receiving the request. Mainly contains * prefilling overhead. If n > 1, it is the sum over all choices. */ time_to_first_token_s: number; /** * Seconds in between generated tokens. Mainly contains decoding overhead. If n > 1, it * is the average over all choices. */ time_per_output_token_s: number; /** * Seconds spent on initializing grammar matcher for structured output. If n > 1, it * is the sum over all choices. */ grammar_init_s?: number; /** * Seconds per-token that grammar matcher spent on creating bitmask and accepting token for * structured output. If n > 1, it is the average over all choices. */ grammar_per_token_s?: number; }; } /** * The reason the model stopped generating tokens. This will be `stop` if the model * hit a natural stop point or a provided stop sequence, `length` if the maximum * number of tokens specified in the request was reached or the context_window_size will * be exceeded, `tool_calls` if the model called a tool, or `abort` if user manually stops the * generation. */ export type ChatCompletionFinishReason = "stop" | "length" | "tool_calls" | "abort"; export declare namespace ChatCompletion { interface Choice { /** * The reason the model stopped generating tokens. This will be `stop` if the model * hit a natural stop point or a provided stop sequence, `length` if the maximum * number of tokens specified in the request was reached, `tool_calls` if the * model called a tool, or `abort` if user manually stops the generation. */ finish_reason: ChatCompletionFinishReason; /** * The index of the choice in the list of choices. */ index: number; /** * Log probability information for the choice. */ logprobs: Choice.Logprobs | null; /** * A chat completion message generated by the model. */ message: ChatCompletionMessage; } namespace Choice { /** * Log probability information for the choice. */ interface Logprobs { /** * A list of message content tokens with log probability information. */ content: Array<ChatCompletionTokenLogprob> | null; } } } export declare namespace ChatCompletionChunk { interface Choice { /** * A chat completion delta generated by streamed model responses. */ delta: Choice.Delta; /** * The reason the model stopped generating tokens. This will be `stop` if the model * hit a natural stop point or a provided stop sequence, `length` if the maximum * number of tokens specified in the request was reached, `tool_calls` if the * model called a tool, or `abort` if user manually stops the generation. */ finish_reason: ChatCompletionFinishReason | null; /** * The index of the choice in the list of choices. */ index: number; /** * Log probability information for the choice. */ logprobs?: Choice.Logprobs | null; } namespace Choice { /** * A chat completion delta generated by streamed model responses. */ interface Delta { /** * The contents of the chunk message. */ content?: string | null; /** * The role of the author of this message. */ role?: "system" | "user" | "assistant" | "tool"; tool_calls?: Array<Delta.ToolCall>; } namespace Delta { interface ToolCall { /** * The index of the tool call among all the tools calls in this request generation. */ index: number; /** * The ID of the tool call. Not used in WebLLM. */ id?: string; function?: ToolCall.Function; /** * The type of the tool. Currently, only `function` is supported. */ type?: "function"; } namespace ToolCall { interface Function { /** * The arguments to call the function with, as generated by the model in JSON * format. Note that the model does not always generate valid JSON, and may * hallucinate parameters not defined by your function schema. Validate the * arguments in your code before calling your function. */ arguments?: string; /** * The name of the function to call. */ name?: string; } } } /** * Log probability information for the choice. */ interface Logprobs { /** * A list of message content tokens with log probability information. */ content: Array<ChatCompletionTokenLogprob> | null; } } } /** * An object specifying the format that the model must output. * * Setting to `{ "type": "json_object" }` enables JSON mode, which guarantees the * message the model generates is valid JSON. * * Setting to `{ "type": "grammar" }` requires you to also specify the `grammar` field, which * is a BNFGrammar string. * * Setting `schema` specifies the output format of the json object such as properties to include. * * **Important:** when using JSON mode, you **must** also instruct the model to produce JSON * following the schema (if specified) yourself via a system or user message. Without this, * the model may generate an unending stream of whitespace until the generation reaches the token * limit, resulting in a long-running and seemingly "stuck" request. Also note that * the message content may be partially cut off if `finish_reason="length"`, which * indicates the generation exceeded `max_tokens` or the conversation exceeded the * max context length. */ export interface ResponseFormat { /** * Must be one of `text`, `json_object`, or `grammar`. */ type?: "text" | "json_object" | "grammar"; /** * A schema string in the format of the schema of a JSON file. `type` needs to be `json_object`. */ schema?: string; /** * An EBNF-formatted string. Needs to be specified when, and only specified when, * `type` is `grammar`. The grammar will be normalized (simplified) by default. * EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note: 1. Use # as the comment mark 2. Use C-style unicode escape sequence \u01AB, \U000001AB, \xAB 3. A-B (match A and not match B) is not supported yet 4. Lookahead assertion can be added at the end of a rule to speed up matching. E.g. ``` main ::= "ab" a [a-z] a ::= "cd" (=[a-z]) ``` The assertion (=[a-z]) means a must be followed by [a-z]. */ grammar?: string; } //# sourceMappingURL=chat_completion.d.ts.map