// cui-llama.rn
import type { TurboModule } from 'react-native'
import { TurboModuleRegistry } from 'react-native'

export type NativeEmbeddingParams = {
  embd_normalize?: number
}

export type NativeContextParams = {
  model: string
  /**
   * Chat template to override the default one from the model.
   */
  chat_template?: string

  is_model_asset?: boolean
  use_progress_callback?: boolean

  n_ctx?: number
  n_batch?: number
  n_ubatch?: number
  n_threads?: number

  /**
   * Number of layers to store in VRAM (Currently only for iOS)
   */
  n_gpu_layers?: number
  /**
   * Skip GPU devices (iOS only)
   */
  no_gpu_devices?: boolean

  /**
   * Enable flash attention, only recommended on GPU devices (Experimental in llama.cpp)
   */
  flash_attn?: boolean

  /**
   * KV cache data type for the K (Experimental in llama.cpp)
   */
  cache_type_k?: string
  /**
   * KV cache data type for the V (Experimental in llama.cpp)
   */
  cache_type_v?: string

  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean

  /**
   * Single LoRA adapter path
   */
  lora?: string
  /**
   * Single LoRA adapter scale
   */
  lora_scaled?: number
  /**
   * LoRA adapter list
   */
  lora_list?: Array<{ path: string; scaled?: number }>

  rope_freq_base?: number
  rope_freq_scale?: number

  pooling_type?: number

  /**
   * Enable context shifting to handle prompts larger than the context size
   */
  ctx_shift?: boolean

  // Embedding params
  embedding?: boolean
  embd_normalize?: number
}
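// Example (illustrative sketch, not part of the spec): a minimal
// NativeContextParams value as a caller might build it. The model path and
// the specific numbers below are hypothetical placeholders, not defaults.
const exampleContextParams: NativeContextParams = {
  model: '/path/to/model.gguf', // hypothetical local file path
  n_ctx: 2048,
  n_batch: 512,
  n_gpu_layers: 99, // iOS only, per the comment above
  use_mlock: true,
  ctx_shift: true,
}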
export type NativeCompletionParams = {
  prompt: string
  n_threads?: number
  /**
   * Enable Jinja. Default: true if supported by the model
   */
  jinja?: boolean
  /**
   * JSON schema to convert to grammar for structured JSON output.
   * It will be overridden by grammar if both are set.
   */
  json_schema?: string
  /**
   * Set grammar for grammar-based sampling. Default: no grammar
   */
  grammar?: string
  /**
   * Lazy grammar sampling, triggered by grammar_triggers. Default: false
   */
  grammar_lazy?: boolean
  /**
   * Enable thinking if jinja is enabled. Default: true
   */
  enable_thinking?: boolean
  /**
   * Force thinking to be open. Default: false
   */
  thinking_forced_open?: boolean
  /**
   * Lazy grammar triggers. Default: []
   */
  grammar_triggers?: Array<{
    type: number
    value: string
    token: number
  }>
  preserved_tokens?: Array<string>
  chat_format?: number
  reasoning_format?: string
  /**
   * Paths to image files to process before generating text.
   * When provided, the images will be processed and added to the context.
   * Requires multimodal support to be enabled via initMultimodal.
   */
  media_paths?: Array<string>
  /**
   * Specify a JSON array of stopping strings.
   * These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
   */
  stop?: Array<string>
  /**
   * Set the maximum number of tokens to predict when generating text.
   * **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
   * When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
   */
  n_predict?: number
  /**
   * If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings.
   * Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings.
   * Default: `0`
   */
  n_probs?: number
  /**
   * Limit the next token selection to the K most probable tokens. Default: `40`
   */
  top_k?: number
  /**
   * Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
   */
  top_p?: number
  /**
   * The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
   */
  min_p?: number
  /**
   * Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
   */
  xtc_probability?: number
  /**
   * Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
   */
  xtc_threshold?: number
  /**
   * Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
   */
  typical_p?: number
  /**
   * Adjust the randomness of the generated text. Default: `0.8`
   */
  temperature?: number
  /**
   * Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
   */
  penalty_last_n?: number
  /**
   * Control the repetition of token sequences in the generated text. Default: `1.0`
   */
  penalty_repeat?: number
  /**
   * Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
   */
  penalty_freq?: number
  /**
   * Repeat alpha presence penalty. Default: `0.0`, which is disabled.
   */
  penalty_present?: number
  /**
   * Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
   */
  mirostat?: number
  /**
   * Set the Mirostat target entropy, parameter tau. Default: `5.0`
   */
  mirostat_tau?: number
  /**
   * Set the Mirostat learning rate, parameter eta. Default: `0.1`
   */
  mirostat_eta?: number
  /**
   * Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
   */
  dry_multiplier?: number
  /**
   * Set the DRY repetition penalty base value. Default: `1.75`
   */
  dry_base?: number
  /**
   * Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
   */
  dry_allowed_length?: number
  /**
   * How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
   */
  dry_penalty_last_n?: number
  /**
   * Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
   */
  dry_sequence_breakers?: Array<string>
  /**
   * Top n sigma sampling as described in the academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641. Default: `-1.0` (Disabled)
   */
  top_n_sigma?: number
  /**
   * Ignore end of stream token and continue generating. Default: `false`
   */
  ignore_eos?: boolean
  /**
   * Modify the likelihood of a token appearing in the generated text completion.
   * For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
   * Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
   * e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does.
   * Default: `[]`
   */
  logit_bias?: Array<Array<number>>
  /**
   * Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
   */
  seed?: number
  /**
   * Guide tokens for the completion.
   * Help prevent hallucinations by forcing the TTS to use the correct words.
   * Default: `[]`
   */
  guide_tokens?: Array<number>

  emit_partial_completion: boolean
}
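// Example (illustrative sketch, not part of the spec): completion params that
// combine the prompt with a few of the sampling fields documented above.
// The prompt and values are hypothetical; prompt and emit_partial_completion
// are the only required fields.
const exampleCompletionParams: NativeCompletionParams = {
  prompt: 'User: Hello\nAssistant:',
  n_predict: 128,
  temperature: 0.8,
  top_k: 40,
  top_p: 0.95,
  min_p: 0.05,
  stop: ['User:'],
  emit_partial_completion: true,
}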
export type NativeCompletionTokenProbItem = {
  tok_str: string
  prob: number
}

export type NativeCompletionTokenProb = {
  content: string
  probs: Array<NativeCompletionTokenProbItem>
}

export type NativeCompletionResultTimings = {
  prompt_n: number
  prompt_ms: number
  prompt_per_token_ms: number
  prompt_per_second: number
  predicted_n: number
  predicted_ms: number
  predicted_per_token_ms: number
  predicted_per_second: number
}

export type NativeCompletionResult = {
  /**
   * Original text (Ignored reasoning_content / tool_calls)
   */
  text: string
  /**
   * Reasoning content (parsed for reasoning model)
   */
  reasoning_content: string
  /**
   * Tool calls
   */
  tool_calls: Array<{
    type: 'function'
    function: {
      name: string
      arguments: string
    }
    id?: string
  }>
  /**
   * Content text (Filtered text by reasoning_content / tool_calls)
   */
  content: string

  tokens_predicted: number
  tokens_evaluated: number
  truncated: boolean
  stopped_eos: boolean
  stopped_word: string
  stopped_limit: number
  stopping_word: string
  tokens_cached: number
  timings: NativeCompletionResultTimings

  completion_probabilities?: Array<NativeCompletionTokenProb>

  audio_tokens?: Array<number>
}

export type NativeTokenizeResult = {
  tokens: Array<number>
  /**
   * Whether the tokenization contains images
   */
  has_images: boolean
  /**
   * Bitmap hashes of the images
   */
  bitmap_hashes: Array<number>
  /**
   * Chunk positions of the text and images
   */
  chunk_pos: Array<number>
  /**
   * Chunk positions of the images
   */
  chunk_pos_images: Array<number>
}

export type NativeEmbeddingResult = {
  embedding: Array<number>
}

export type NativeLlamaContext = {
  contextId: number
  model: {
    desc: string
    size: number
    nEmbd: number
    nParams: number
    chatTemplates: {
      llamaChat: boolean // Chat template in llama-chat.cpp
      minja: {
        // Chat template supported by minja.hpp
        default: boolean
        defaultCaps: {
          tools: boolean
          toolCalls: boolean
          toolResponses: boolean
          systemRole: boolean
          parallelToolCalls: boolean
          toolCallId: boolean
        }
        toolUse: boolean
        toolUseCaps: {
          tools: boolean
          toolCalls: boolean
          toolResponses: boolean
          systemRole: boolean
          parallelToolCalls: boolean
          toolCallId: boolean
        }
      }
    }
    metadata: Object
    isChatTemplateSupported: boolean // Deprecated
  }
  /**
   * Loaded library name for Android
   */
  androidLib?: string
  gpu: boolean
  reasonNoGPU: string
}

export type NativeSessionLoadResult = {
  tokens_loaded: number
  prompt: string
}

export type NativeLlamaMessagePart = {
  type: 'text'
  text: string
}

export type NativeLlamaChatMessage = {
  role: string
  content: string | Array<NativeLlamaMessagePart>
}

export type NativeCPUFeatures = {
  armv8: boolean
  i8mm: boolean
  dotprod: boolean
}

export type FormattedChatResult = {
  type: 'jinja' | 'llama-chat'
  prompt: string
  has_media: boolean
  media_paths?: Array<string>
}

export type JinjaFormattedChatResult = FormattedChatResult & {
  chat_format?: number
  grammar?: string
  grammar_lazy?: boolean
  grammar_triggers?: Array<{
    type: number
    value: string
    token: number
  }>
  thinking_forced_open?: boolean
  preserved_tokens?: Array<string>
  additional_stops?: Array<string>
}

export type NativeImageProcessingResult = {
  success: boolean
  prompt: string
  error?: string
}

export type NativeRerankParams = {
  normalize?: number
}

export type NativeRerankResult = {
  score: number
  index: number
}
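// Example (illustrative sketch, not part of the spec): reading a
// NativeCompletionResult, separating the parsed reasoning and tool calls from
// the final content and reporting generation speed from the timings.
function summarizeCompletion(result: NativeCompletionResult): string {
  const toolNames = result.tool_calls.map((call) => call.function.name)
  const tokensPerSecond = result.timings.predicted_per_second.toFixed(1)
  return [
    `content: ${result.content}`,
    `reasoning: ${result.reasoning_content.length} chars`,
    `tool calls: ${toolNames.join(', ') || 'none'}`,
    `speed: ${tokensPerSecond} tok/s`,
  ].join('\n')
}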
export interface Spec extends TurboModule {
  toggleNativeLog(enabled: boolean): Promise<void>
  setContextLimit(limit: number): Promise<void>

  modelInfo(path: string, skip?: string[]): Promise<Object>
  initContext(
    contextId: number,
    params: NativeContextParams,
  ): Promise<NativeLlamaContext>

  getFormattedChat(
    contextId: number,
    messages: string,
    chatTemplate?: string,
    params?: {
      jinja?: boolean
      json_schema?: string
      tools?: string
      parallel_tool_calls?: string
      tool_choice?: string
      enable_thinking?: boolean
    },
  ): Promise<JinjaFormattedChatResult | string>
  loadSession(
    contextId: number,
    filepath: string,
  ): Promise<NativeSessionLoadResult>
  saveSession(
    contextId: number,
    filepath: string,
    size: number,
  ): Promise<number>
  completion(
    contextId: number,
    params: NativeCompletionParams,
  ): Promise<NativeCompletionResult>
  stopCompletion(contextId: number): Promise<void>
  tokenizeAsync(
    contextId: number,
    text: string,
    imagePaths?: Array<string>,
  ): Promise<NativeTokenizeResult>
  tokenizeSync(
    contextId: number,
    text: string,
    imagePaths?: Array<string>,
  ): NativeTokenizeResult
  getCpuFeatures(): Promise<NativeCPUFeatures>
  detokenize(contextId: number, tokens: number[]): Promise<string>
  embedding(
    contextId: number,
    text: string,
    params: NativeEmbeddingParams,
  ): Promise<NativeEmbeddingResult>
  rerank(
    contextId: number,
    query: string,
    documents: Array<string>,
    params?: NativeRerankParams,
  ): Promise<Array<NativeRerankResult>>
  bench(
    contextId: number,
    pp: number,
    tg: number,
    pl: number,
    nr: number,
  ): Promise<string>

  applyLoraAdapters(
    contextId: number,
    loraAdapters: Array<{ path: string; scaled?: number }>,
  ): Promise<void>
  removeLoraAdapters(contextId: number): Promise<void>
  getLoadedLoraAdapters(
    contextId: number,
  ): Promise<Array<{ path: string; scaled?: number }>>

  // Multimodal methods
  initMultimodal(
    contextId: number,
    params: {
      path: string
      use_gpu: boolean
    },
  ): Promise<boolean>
  isMultimodalEnabled(contextId: number): Promise<boolean>
  getMultimodalSupport(contextId: number): Promise<{
    vision: boolean
    audio: boolean
  }>
  releaseMultimodal(contextId: number): Promise<void>

  // TTS methods
  initVocoder(contextId: number, vocoderModelPath: string): Promise<boolean>
  isVocoderEnabled(contextId: number): Promise<boolean>
  getFormattedAudioCompletion(
    contextId: number,
    speakerJsonStr: string,
    textToSpeak: string,
  ): Promise<string>
  getAudioCompletionGuideTokens(
    contextId: number,
    textToSpeak: string,
  ): Promise<Array<number>>
  decodeAudioTokens(contextId: number, tokens: number[]): Promise<Array<number>>
  releaseVocoder(contextId: number): Promise<void>

  releaseContext(contextId: number): Promise<void>

  releaseAllContexts(): Promise<void>
}

export default TurboModuleRegistry.get<Spec>('RNLlama') as Spec
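// Example (illustrative sketch, not part of the spec): how application code
// could drive the module defined above. The context id and model path are
// hypothetical placeholders, and error handling is omitted for brevity.
async function exampleRun(llama: Spec): Promise<string> {
  const contextId = 1 // hypothetical id chosen by the caller
  await llama.initContext(contextId, { model: '/path/to/model.gguf' })
  const result = await llama.completion(contextId, {
    prompt: 'Write a haiku about autumn.',
    n_predict: 64,
    emit_partial_completion: false,
  })
  await llama.releaseContext(contextId)
  return result.text
}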