// @mlc-ai/web-llm
// Hardware accelerated language model chats on browsers
// llm_chat.d.ts
import * as tvmjs from "@mlc-ai/web-runtime";
import { Tokenizer } from "@mlc-ai/web-tokenizers";
import { ChatConfig, GenerationConfig, Role } from "./config";
import { Conversation } from "./conversation";
import { LogitProcessor } from "./types";
import { ChatCompletionFinishReason, ChatCompletionTokenLogprob } from "./openai_api_protocols/index";
export declare class LLMChatPipeline {
    private config;
    private tokenizer;
    private tvm;
    private device;
    private vm;
    private prefill;
    private decoding;
    private image_embed;
    private embed;
    private fapplyBitmask;
    private fclearKVCaches;
    private fKVCacheAddSequence;
    private fKVCacheRemoveSequence;
    private fKVCacheBeginForward;
    private fKVCacheEndForward;
    private fKVCacheEnableSlidingWindowForSeq;
    private params;
    private kvCache;
    private logitsOnCPU?;
    private filledKVCacheLength;
    private bosTokenId;
    private contextWindowSize;
    private slidingWindowSize;
    private attentionSinkSize;
    private prefillChunkSize;
    private resetStatsPerPrefill;
    private stopStr;
    private stopTokens;
    private outputMessage;
    private outputIds;
    private stopTriggered;
    private finishReason;
    private appearedTokensFreq;
    private conversation;
    private tokenLogprobArray;
    private decodingTotalTime;
    private decodingTotalTokens;
    private prefillTotalTime;
    private prefillTotalTokens;
    private curRoundDecodingTotalTokens;
    private curRoundPrefillTotalTokens;
    private curRoundDecodingTotalTime;
    private curRoundPrefillTotalTime;
    private logitProcessor?;
    private grammarMatcher?;
    private schemaOrGrammarStr?;
    private xgTokenizerInfo?;
    private grammarCompiler?;
    private bitmaskSize;
    private fullVocabSize;
    private token_postproc_method;
    private prepend_space_in_encode;
    private curRoundGrammarInitTotalTime;
    private curRoundGrammarPerTokenTotalTime;
    constructor(tvm: tvmjs.Instance, tokenizer: Tokenizer, config: ChatConfig, logitProcessor?: LogitProcessor);
    dispose(): void;
    /**
     * Get the current message.
     */
    getMessage(): string;
    /**
     * Reset the runtime statistics.
     */
    resetRuntimeStats(): void;
    /**
     * Reset the chat history.
     */
    resetChat(keepStats?: boolean): void;
    /**
     * Reset the KV cache.
     */
    resetKVCache(): void;
    /**
     * @returns Whether stop is triggered.
     */
    stopped(): boolean;
    /**
     * @returns Finish reason; undefined if generation has not started or has not stopped yet.
     */
    getFinishReason(): ChatCompletionFinishReason | undefined;
    /**
     * @returns tokenLogprobArray for the current round of autoregressive generation.
     * Updated upon each sampled token, cleared upon each prefillStep().
     */
    getTokenLogprobArray(): Array<ChatCompletionTokenLogprob>;
    /**
     * @returns The number of tokens decoded for a single request or a single choice in the request.
     */
    getCurRoundDecodingTotalTokens(): number;
    /**
     * @returns The number of tokens prefilled for a single request or a single choice in the request.
     */
    getCurRoundPrefillTotalTokens(): number;
    /**
     * @returns The time spent on decoding for a single request or a single choice in the request.
     */
    getCurRoundDecodingTotalTime(): number;
    /**
     * @returns The time spent on prefill for a single request or a single choice in the request.
     */
    getCurRoundPrefillTotalTime(): number;
    /**
     * @returns The time (seconds) spent on initializing the grammar matcher for a single request.
     */
    getCurRoundGrammarInitTotalTime(): number;
    /**
     * @returns The total time (seconds) spent on creating the bitmask and accepting tokens with the
     * grammar matcher for all the generated tokens in a single request.
     */
    getCurRoundGrammarPerTokenTotalTime(): number;
    /**
     * @returns Runtime stats information.
     */
    runtimeStatsText(): string;
    /**
     * @returns Runtime stats information, starting from the last prefill performed.
     */
    curRoundRuntimeStatsText(): string;
    /**
     * @returns Prefill tokens per second, starting from the last prefill performed.
     */
    getCurRoundPrefillTokensPerSec(): number;
    /**
     * @returns Decoding tokens per second, starting from the last prefill performed.
     */
    getCurRoundDecodingTokensPerSec(): number;
    /**
     * Set the seed for the RNG `this.tvm.rng`.
     */
    setSeed(seed: number): void;
    /**
     * @returns The conversation object (not a deep copy).
     */
    getConversationObject(): Conversation;
    /**
     * Set this.conversation to a new conversation object.
     */
    setConversation(newConv: Conversation): void;
    asyncLoadWebGPUPipelines(): Promise<void>;
    /**
     * Generate the first token given the input prompt.
     */
    prefillStep(
        inp: string,
        msgRole: Role, // either user or tool
        inp_role_str?: string,
        genConfig?: GenerationConfig,
    ): Promise<void>;
    decodeStep(genConfig?: GenerationConfig): Promise<void>;
    /**
     * Manually trigger stop if it has not already stopped.
     */
    triggerStop(): void;
    /**
     * Add a generated token and check for stop.
     *
     * @param nextToken The next token.
     * @param genConfig Configs that override `this.config` for this round of generation.
     */
    private processNextToken;
    /**
     * Given input tokens, return their embeddings by calling the embed kernel.
     *
     * @note Precondition: inputTokens.length <= prefillChunkSize, since we take care of
     * chunking in `getChunkedPrefillInputData()`.
     */
    private getTokensEmbeddings;
    /**
     * Embed an image input.
     */
    private getImageEmbeddings;
    /**
     * Embed and forward input data, which can be either an array of tokens or an image.
     * This will increment `this.filledKVCacheLength`.
     *
     * @param inputData Data to embed and forward.
     * @param inputDataLen Length of this inputData; should be smaller than the prefill chunk size.
     * @returns The logits returned by this forward as tvmjs.NDArray on GPU.
     *
     * @note Precondition: inputData's data length is smaller than the prefill chunk size.
     */
    private embedAndForward;
    private updateLogitsOnCPU;
    private sampleTokenFromLogits;
    /**
     * Return an array that is a mixture of token arrays and imageURLs (which cannot be
     * represented as tokens). Also return the number of tokens this represents.
     *
     * We first convert the Conversation into a prompt array to be prefilled. Then we encode the
     * text parts, leaving the imageURLs as they are.
     * Example prompts:
     * [
     *   "<|system|>\nSome system prompt\n",
     *   [
     *     "<|user|>\n",
     *     imageURL1,
     *     "\n",
     *     imageURL2,
     *     "\n",
     *     "Some user input<|end|>\n"
     *   ],
     * ]
     *
     * Expected output:
     * [
     *   token array for "<|system|>\nSome system prompt\n<|user|>\n",
     *   imageUrl1,
     *   token array for "\n",
     *   imageUrl2,
     *   token array for "\nSome user input<|end|>\n"
     * ]
     */
    private getInputData;
    forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean): Promise<number>;
    /**
     * Based on `sampledToken` and `this.logitsOnCPU`, which becomes a distribution after
     * calling `this.tvm.applySoftmaxWithTemperature()`, generate `ChatCompletionTokenLogprob` and
     * update `this.tokenLogprobArray`.
     *
     * @param sampledToken The token ID sampled.
     * @param top_logprobs Number of top tokens to include; `top_logprobs` in `ChatCompletionRequest`.
     *
     * @return The `ChatCompletionTokenLogprob` for this single autoregressive step.
     */
    private getTokenLogprob;
    /**
     * Synchronize the device.
     */
    sync(): Promise<void>;
    evaluate(): Promise<void>;
}
//# sourceMappingURL=llm_chat.d.ts.map
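
// ---------------------------------------------------------------------------
// Usage sketch: a minimal autoregressive loop over the API declared above,
// driving prefillStep()/decodeStep() until stopped() reports a stop condition.
// This is an illustrative sketch, not package code: it assumes an
// already-constructed LLMChatPipeline, that this file resolves as
// "./llm_chat", and that `Role.user` is a member of the Role enum (suggested
// by the "either user or tool" note on prefillStep()).
import { LLMChatPipeline } from "./llm_chat";
import { Role } from "./config";

export async function generateOnce(
    pipeline: LLMChatPipeline,
    prompt: string,
): Promise<string> {
    // Prefill consumes the prompt and samples the first token.
    await pipeline.prefillStep(prompt, Role.user);
    // Decode one token per step until a stop token/string or length limit triggers.
    while (!pipeline.stopped()) {
        await pipeline.decodeStep();
    }
    // getFinishReason() reports why generation ended (e.g. stop vs. length).
    console.log("finish reason:", pipeline.getFinishReason());
    // getMessage() returns the text accumulated for the current round.
    return pipeline.getMessage();
}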
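
// Usage sketch: reading the per-round runtime statistics right after a
// generation round, before the next prefillStep() starts a new round (the
// `resetStatsPerPrefill` flag suggests per-round stats are reset on prefill).
// Units follow the doc comments above: times in seconds, counts in tokens.
import { LLMChatPipeline } from "./llm_chat";

export function logRoundStats(pipeline: LLMChatPipeline): void {
    // Human-readable summary for the last round.
    console.log(pipeline.curRoundRuntimeStatsText());
    // Or assemble the numbers manually from the individual getters.
    console.log(
        `prefill: ${pipeline.getCurRoundPrefillTotalTokens()} tokens in ` +
            `${pipeline.getCurRoundPrefillTotalTime()} s ` +
            `(${pipeline.getCurRoundPrefillTokensPerSec().toFixed(1)} tok/s)`,
    );
    console.log(
        `decode: ${pipeline.getCurRoundDecodingTotalTokens()} tokens in ` +
            `${pipeline.getCurRoundDecodingTotalTime()} s ` +
            `(${pipeline.getCurRoundDecodingTokensPerSec().toFixed(1)} tok/s)`,
    );
}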
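
// Usage sketch: inspecting per-token logprobs. Per the doc comment on
// getTokenLogprobArray(), the array grows by one entry per sampled token and
// is cleared on each prefillStep(), so it must be read before starting the
// next round. The `token`/`logprob` field names assume the OpenAI-style shape
// that ChatCompletionTokenLogprob mirrors.
import { LLMChatPipeline } from "./llm_chat";
import { ChatCompletionTokenLogprob } from "./openai_api_protocols/index";

export function dumpLogprobs(pipeline: LLMChatPipeline): void {
    const entries: Array<ChatCompletionTokenLogprob> = pipeline.getTokenLogprobArray();
    for (const entry of entries) {
        console.log(entry.token, entry.logprob);
    }
}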
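
// Usage sketch: reproducible sampling. setSeed() seeds `this.tvm.rng` per its
// doc comment, so the assumption here is that two runs over the same prompt
// with the same seed sample identical tokens; resetChat() clears the history
// between the two runs.
import { LLMChatPipeline } from "./llm_chat";
import { Role } from "./config";

async function runSeeded(
    pipeline: LLMChatPipeline,
    prompt: string,
    seed: number,
): Promise<string> {
    pipeline.setSeed(seed);
    await pipeline.prefillStep(prompt, Role.user);
    while (!pipeline.stopped()) {
        await pipeline.decodeStep();
    }
    return pipeline.getMessage();
}

export async function checkReproducible(
    pipeline: LLMChatPipeline,
    prompt: string,
): Promise<void> {
    const first = await runSeeded(pipeline, prompt, 42);
    pipeline.resetChat(); // drop the first round before replaying
    const second = await runSeeded(pipeline, prompt, 42);
    console.assert(first === second, "same seed should reproduce the same output");
}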
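
// Usage sketch: swapping conversation state. getConversationObject() returns
// the live object rather than a deep copy (per its doc comment), so branching
// a chat means installing a different Conversation via setConversation()
// instead of mutating the returned handle. How a replacement Conversation is
// constructed is out of scope here.
import { LLMChatPipeline } from "./llm_chat";
import { Conversation } from "./conversation";

export function swapConversation(
    pipeline: LLMChatPipeline,
    next: Conversation,
): Conversation {
    const previous = pipeline.getConversationObject(); // live handle, not a copy
    pipeline.setConversation(next);
    return previous;
}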