/** Util methods. */
import { Tokenizer } from "@mlc-ai/web-tokenizers";
import { AppConfig, ModelRecord } from "./config";
import { ChatCompletionChunk, ChatCompletionContentPartImage, ChatCompletionMessageToolCall } from "./openai_api_protocols/index";
/**
 * Given `p_prob` of size (vocabSize,), which is a probability distribution after calling
 * `applySoftmaxWithTemperature()`, sample the `top_logprobs` most probable tokens.
*
* @param num_top_probs: `top_logprobs` from ChatCompletionRequest
* @param p_prob: `logitsOnCPUArray`, being a distribution after `applySoftmaxWithTemperature()`.
*
 * Follows the implementation of `ComputeTopProbsImpl()` from [https://github.com/mlc-ai/mlc-llm/blob/
 * 5b8c529e9704abd09b0432da6dcb4b013fdf43b1/cpp/serve/sampler/cpu_sampler.cc].
*
* @returns Arrays of (tokenID, prob) pairs, ranked from highest prob to least.
*/
export declare function getTopProbs(num_top_probs: number, p_prob: Float32Array): Array<[number, number]>;
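/*
 * Example usage (a minimal sketch; the toy vocabulary and probabilities below are made up
 * for illustration and chosen to be exactly representable in float32):
 *
 *   // Suppose the softmaxed logits over a 4-token vocabulary are:
 *   const probs = new Float32Array([0.125, 0.5, 0.125, 0.25]);
 *   // Ask for the 2 most probable tokens.
 *   const topProbs = getTopProbs(2, probs);
 *   // topProbs is ranked from highest to lowest probability:
 *   //   [[1, 0.5], [3, 0.25]]
 */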
/**
* Get the token table in the form of a string list of tokens, ordered by their token id.
* @param tokenizer A loaded tokenizer.
* @note The size of the table (i.e. tokenizer.getVocabSize()) may be smaller than the `vocab_size`
* in config.json (length of logits), see https://github.com/QwenLM/Qwen2/issues/147 and
* https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/discussions/47.
*/
export declare function getTokenTableFromTokenizer(tokenizer: Tokenizer): string[];
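/*
 * Example (a minimal sketch; `tokenizer` is assumed to be an already-loaded Tokenizer and
 * `vocabSize` the `vocab_size` from config.json, i.e. the length of the logits; the padding
 * strategy shown is an illustration for consumers, not what the engine does internally):
 *
 *   const tokenTable = getTokenTableFromTokenizer(tokenizer);
 *   // The table may be shorter than the logits; pad with placeholders if an entry per
 *   // logit index is needed.
 *   const padded = tokenTable.concat(
 *     Array.from({ length: vocabSize - tokenTable.length }, () => ""),
 *   );
 */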
/**
 * Postprocess ModelRecord.model so that it ends with the suffix "/resolve/main/" unless another suffix is specified.
* e.g. https://huggingface.co/mlc-ai/OpenHermes-2.5-Mistral-7B-q4f16_1-MLC/resolve/main/
* @return the href of the final URL.
*/
export declare function cleanModelUrl(modelUrl: string): string;
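/*
 * A minimal sketch of the idea (not the actual implementation): append "/resolve/main/"
 * unless the URL already ends with a "/", then normalize via the URL constructor.
 *
 *   function cleanModelUrlSketch(modelUrl: string): string {
 *     const withSuffix = modelUrl.endsWith("/") ? modelUrl : modelUrl + "/resolve/main/";
 *     return new URL(withSuffix).href;
 *   }
 *
 *   cleanModelUrlSketch("https://huggingface.co/mlc-ai/OpenHermes-2.5-Mistral-7B-q4f16_1-MLC");
 *   // -> "https://huggingface.co/mlc-ai/OpenHermes-2.5-Mistral-7B-q4f16_1-MLC/resolve/main/"
 */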
/**
 * JSON schema used to prompt the model for function calling; directly copied from the official guide.
 * This represents a single function call.
*/
export declare const officialHermes2FunctionCallSchema = "{\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}";
/**
 * JSON schema for a list of such function calls. Used to specify the response format, since the
 * output is expected to be a list of such function calls.
*/
export declare const officialHermes2FunctionCallSchemaArray: string;
/**
* Full system prompt for Hermes-2-Pro function calling.
*/
export declare const hermes2FunctionCallingSystemPrompt: string;
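/*
 * Sketch of how these constants might be combined when prompting a Hermes-2-Pro model for
 * manual function calling (assumes an `engine` exposing an OpenAI-style
 * `chat.completions.create()` and a `response_format` that accepts a JSON schema string;
 * treat this as illustrative, not the engine's internal code path):
 *
 *   const reply = await engine.chat.completions.create({
 *     messages: [
 *       { role: "system", content: hermes2FunctionCallingSystemPrompt },
 *       { role: "user", content: "What is the weather in Tokyo?" },
 *     ],
 *     response_format: {
 *       type: "json_object",
 *       schema: officialHermes2FunctionCallSchemaArray,
 *     },
 *   });
 */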
/**
 * Given a string outputMessage, parse it as JSON and return an array of tool calls.
 *
 * Expects outputMessage to be a valid JSON string encoding an array of function calls, each with
 * fields `arguments` and `name`.
*/
export declare function getToolCallFromOutputMessage(outputMessage: string, isStreaming: false): Array<ChatCompletionMessageToolCall>;
export declare function getToolCallFromOutputMessage(outputMessage: string, isStreaming: true): Array<ChatCompletionChunk.Choice.Delta.ToolCall>;
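/*
 * A minimal sketch of the non-streaming parsing path (not the actual implementation; the
 * `id` scheme shown here is an assumption):
 *
 *   function getToolCallsSketch(outputMessage: string): Array<ChatCompletionMessageToolCall> {
 *     const calls = JSON.parse(outputMessage) as Array<{ name: string; arguments: object }>;
 *     return calls.map((call, index) => ({
 *       id: `call_${index}`,
 *       type: "function",
 *       function: { name: call.name, arguments: JSON.stringify(call.arguments) },
 *     }));
 *   }
 */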
export declare function findModelRecord(modelId: string, appConfig: AppConfig): ModelRecord;
/**
 * Return the model to use given the loaded model ids and the requested model. Throws an error
 * when it is unclear which model to load.
 * @param loadedModelIds Models currently loaded in the engine.
 * @param requestModel Model the user specified to load via the request. Required when multiple
 * models are loaded.
 * @param requestName The type of request or API to load the model for. Needed for error messages.
*/
export declare function getModelIdToUse(loadedModelIds: string[], requestModel: string | undefined | null, requestName: string): string;
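/*
 * The selection logic, sketched (not the exact implementation or error messages):
 *
 *   function getModelIdToUseSketch(
 *     loadedModelIds: string[],
 *     requestModel: string | undefined | null,
 *     requestName: string,
 *   ): string {
 *     if (requestModel) {
 *       if (!loadedModelIds.includes(requestModel)) {
 *         throw new Error(`${requestName}: model ${requestModel} is not loaded.`);
 *       }
 *       return requestModel;
 *     }
 *     if (loadedModelIds.length === 1) {
 *       return loadedModelIds[0];
 *     }
 *     // Zero or multiple models loaded and no `model` specified: unclear which to use.
 *     throw new Error(
 *       `${requestName}: specify \`model\` when ${loadedModelIds.length} models are loaded.`,
 *     );
 *   }
 */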
/**
* TODO: Consider if this is the best strategy (though aligned with mlc-llm). We currently greedily
* try to fill up prefillChunkSize. Consider the example with 2048 prefill chunk size:
 * const inputData = [
 *   image1, // 1921
 *   rangeArr(0, 2048),
 *   image2,
 * ];
 * Current approach results in chunks:
 *   [image1, rangeArr(0, 127)],
 *   [rangeArr(127, 2048)],
 *   [image2],
 * This means 4 embedding kernels and 3 prefill kernels.
 * While the optimal chunking may be:
 *   [image1],
 *   [rangeArr(0, 2048)],
 *   [image2],
 * This results in 3 embedding kernels and 3 prefill kernels.
* However, greedy strategy is more intuitive and probably more generalizable.
*/
/**
 * Chunk the inputData such that each chunk's total input length does not exceed the prefill
 * chunk size.
* @returns [the data chunks, the input length of each chunk]
* @note precondition: if inputData has image in it, then prefillChunkSize >= IMAGE_EMBED_SIZE.
*/
export declare function getChunkedPrefillInputData(inputData: Array<Array<number> | ImageURL>, prefillChunkSize: number): [Array<Array<number> | ImageURL>[], Array<number>];
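/*
 * A sketch of the greedy chunking described above (not the actual implementation): text token
 * arrays may be split across chunks, while an image always contributes IMAGE_EMBED_SIZE to a
 * chunk and is never split.
 *
 *   function chunkPrefillInputSketch(
 *     inputData: Array<Array<number> | ImageURL>,
 *     prefillChunkSize: number,
 *   ): [Array<Array<number> | ImageURL>[], Array<number>] {
 *     const chunks: Array<Array<number> | ImageURL>[] = [];
 *     const chunkLens: number[] = [];
 *     let curChunk: Array<Array<number> | ImageURL> = [];
 *     let curLen = 0;
 *     const flush = () => {
 *       if (curChunk.length > 0) {
 *         chunks.push(curChunk);
 *         chunkLens.push(curLen);
 *         curChunk = [];
 *         curLen = 0;
 *       }
 *     };
 *     for (const data of inputData) {
 *       if (Array.isArray(data)) {
 *         // Text tokens: split greedily so each chunk stays within prefillChunkSize.
 *         let tokens = data;
 *         while (tokens.length > 0) {
 *           const room = prefillChunkSize - curLen;
 *           if (room === 0) { flush(); continue; }
 *           const take = Math.min(room, tokens.length);
 *           curChunk.push(tokens.slice(0, take));
 *           curLen += take;
 *           tokens = tokens.slice(take);
 *         }
 *       } else {
 *         // Image: occupies IMAGE_EMBED_SIZE and cannot be split across chunks.
 *         if (curLen + IMAGE_EMBED_SIZE > prefillChunkSize) flush();
 *         curChunk.push(data);
 *         curLen += IMAGE_EMBED_SIZE;
 *       }
 *     }
 *     flush();
 *     return [chunks, chunkLens];
 *   }
 */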
/**
* A lock implemented using Promise.
*
 * References:
* - https://jackpordi.com/posts/locks-in-js-because-why-not
* - https://www.linkedin.com/pulse/asynchronous-locking-using-promises-javascript-abdul-ahad-o7smf/
*/
export declare class CustomLock {
private acquired;
private readonly queue;
acquire(): Promise<void>;
release(): Promise<void>;
}
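/*
 * A minimal sketch of such a promise-based lock (not necessarily the exact implementation):
 *
 *   class PromiseLockSketch {
 *     private acquired = false;
 *     private readonly queue: Array<() => void> = [];
 *
 *     async acquire(): Promise<void> {
 *       if (!this.acquired) {
 *         this.acquired = true;
 *         return;
 *       }
 *       // Park the caller until release() resolves its promise.
 *       return new Promise<void>((resolve) => this.queue.push(resolve));
 *     }
 *
 *     async release(): Promise<void> {
 *       const next = this.queue.shift();
 *       if (next !== undefined) {
 *         next(); // hand the lock directly to the next waiter; `acquired` stays true
 *       } else {
 *         this.acquired = false;
 *       }
 *     }
 *   }
 *
 *   // Typical usage: serialize concurrent async calls.
 *   //   await lock.acquire();
 *   //   try { ... critical section ... } finally { await lock.release(); }
 */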
type ImageURL = ChatCompletionContentPartImage.ImageURL;
export declare const IMAGE_EMBED_SIZE = 1921;
/**
* Given a url, get the image data. The url can either start with `http` or `data:image`.
*/
export declare function getImageDataFromURL(url: string): Promise<ImageData>;
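/*
 * A browser-only sketch of the idea (not the actual implementation): fetch() handles both
 * http(s) and data:image URLs, and an OffscreenCanvas rasterizes the decoded image.
 *
 *   async function imageDataFromUrlSketch(url: string): Promise<ImageData> {
 *     const blob = await (await fetch(url)).blob();
 *     const bitmap = await createImageBitmap(blob);
 *     const canvas = new OffscreenCanvas(bitmap.width, bitmap.height);
 *     const ctx = canvas.getContext("2d")!;
 *     ctx.drawImage(bitmap, 0, 0);
 *     return ctx.getImageData(0, 0, bitmap.width, bitmap.height);
 *   }
 */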
/**
 * Given an ImageData, return the RGB array as a Uint8ClampedArray. Note that ImageData.data
 * is RGBA, so we skip every fourth (alpha) element of the data. The order goes by rows from the
 * top-left pixel to the bottom-right, in RGB order.
*/
export declare function getRGBArrayFromImageData(imageData: ImageData): Uint8ClampedArray;
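/*
 * A sketch of the RGBA-to-RGB conversion (illustrative, not the exact implementation):
 *
 *   function rgbFromImageDataSketch(imageData: ImageData): Uint8ClampedArray {
 *     const rgba = imageData.data; // row-major, 4 bytes per pixel: R, G, B, A
 *     const rgb = new Uint8ClampedArray((rgba.length / 4) * 3);
 *     for (let src = 0, dst = 0; src < rgba.length; src += 4) {
 *       rgb[dst++] = rgba[src];     // R
 *       rgb[dst++] = rgba[src + 1]; // G
 *       rgb[dst++] = rgba[src + 2]; // B (alpha at src + 3 is dropped)
 *     }
 *     return rgb;
 *   }
 */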
export {};
//# sourceMappingURL=support.d.ts.map