cui-llama.rn
Fork of llama.rn for ChatterUI
import type { TurboModule } from 'react-native'
import { TurboModuleRegistry } from 'react-native'
export type NativeEmbeddingParams = {
embd_normalize?: number
}
export type NativeContextParams = {
model: string
is_model_asset?: boolean
use_progress_callback?: boolean
n_ctx?: number
n_batch?: number
n_threads?: number
n_gpu_layers?: number
/**
* Enable flash attention; only recommended on GPU devices (experimental in llama.cpp)
*/
flash_attn?: boolean
/**
* KV cache data type for the K (Experimental in llama.cpp)
*/
cache_type_k?: number
/**
* KV cache data type for the V (Experimental in llama.cpp)
*/
cache_type_v?: number
use_mlock?: boolean
use_mmap?: boolean
vocab_only?: boolean
lora?: string // path to a LoRA adapter
lora_scaled?: number
rope_freq_base?: number
rope_freq_scale?: number
pooling_type?: number
// Embedding params
embedding?: boolean
embd_normalize?: number
}
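// A minimal sketch (not part of the spec): a typical NativeContextParams object
// for loading a local GGUF model. The file path and tuning values below are
// illustrative assumptions, kept as a comment so the module spec stays unchanged.
//
// const contextParams: NativeContextParams = {
//   model: '/path/to/model.gguf', // hypothetical model path
//   n_ctx: 4096,       // context window size in tokens
//   n_batch: 512,      // prompt-processing batch size
//   n_threads: 4,
//   n_gpu_layers: 0,   // 0 = CPU only
//   use_mmap: true,
//   use_mlock: false,
// }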
export type NativeCompletionParams = {
prompt: string
n_threads?: number
/**
* Set grammar for grammar-based sampling. Default: no grammar
*/
grammar?: string
/**
* Specify a JSON array of stopping strings.
* These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
*/
stop?: Array<string>
/**
* Set the maximum number of tokens to predict when generating text.
* **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
* When `0`, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
*/
n_predict?: number
/**
* If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings.
* Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings.
* Default: `0`
*/
n_probs?: number
/**
* Limit the next token selection to the K most probable tokens. Default: `40`
*/
top_k?: number
/**
* Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
*/
top_p?: number
/**
* The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
*/
min_p?: number
/**
* Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
*/
xtc_probability?: number
/**
* Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
*/
xtc_threshold?: number
/**
* Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
*/
typical_p?: number
/**
* Adjust the randomness of the generated text. Default: `0.8`
*/
temperature?: number
/**
* Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
*/
penalty_last_n?: number
/**
* Control the repetition of token sequences in the generated text. Default: `1.0`
*/
penalty_repeat?: number
/**
* Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
*/
penalty_freq?: number
/**
* Repeat alpha presence penalty. Default: `0.0`, which is disabled.
*/
penalty_present?: number
// /**
//  * Penalize newline tokens when applying the repeat penalty. Default: `false`
//  */
// penalize_nl?: boolean
/**
* Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
*/
mirostat?: number
/**
* Set the Mirostat target entropy, parameter tau. Default: `5.0`
*/
mirostat_tau?: number
/**
* Set the Mirostat learning rate, parameter eta. Default: `0.1`
*/
mirostat_eta?: number
/**
* Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
*/
dry_multiplier?: number
/**
* Set the DRY repetition penalty base value. Default: `1.75`
*/
dry_base?: number
/**
* Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
*/
dry_allowed_length?: number
/**
* How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
*/
dry_penalty_last_n?: number
/**
* Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
*/
dry_sequence_breakers?: Array<string>
/**
* Ignore end of stream token and continue generating. Default: `false`
*/
ignore_eos?: boolean
/**
* Modify the likelihood of a token appearing in the generated text completion.
* For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
* Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
* e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does.
* Default: `[]`
*/
logit_bias?: Array<Array<number>>
/**
* Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
*/
seed?: number
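/**
 * Emit partial completion results (streamed tokens) while generating.
 */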
emit_partial_completion: boolean
}
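// A minimal sketch (not part of the spec): completion parameters combining the
// sampling options documented above. All values are illustrative assumptions.
//
// const completionParams: NativeCompletionParams = {
//   prompt: 'Q: What is the capital of France?\nA:',
//   stop: ['\nQ:'],
//   n_predict: 128,
//   temperature: 0.8,
//   top_k: 40,
//   top_p: 0.95,
//   min_p: 0.05,
//   penalty_repeat: 1.0,
//   seed: -1,                      // -1 = random seed
//   emit_partial_completion: true, // stream partial tokens as they are generated
// }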
export type NativeCompletionTokenProbItem = {
tok_str: string
prob: number
}
export type NativeCompletionTokenProb = {
content: string
probs: Array<NativeCompletionTokenProbItem>
}
export type NativeCompletionResultTimings = {
prompt_n: number
prompt_ms: number
prompt_per_token_ms: number
prompt_per_second: number
predicted_n: number
predicted_ms: number
predicted_per_token_ms: number
predicted_per_second: number
}
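// A small sketch (assumption, not part of the spec): formatting the timings
// reported by the native side into a log line. predicted_per_second is the
// generation throughput in tokens per second.
//
// const formatTimings = (t: NativeCompletionResultTimings): string =>
//   `prompt: ${t.prompt_n} tok in ${t.prompt_ms.toFixed(0)} ms ` +
//   `(${t.prompt_per_second.toFixed(1)} tok/s), ` +
//   `gen: ${t.predicted_n} tok in ${t.predicted_ms.toFixed(0)} ms ` +
//   `(${t.predicted_per_second.toFixed(1)} tok/s)`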
export type NativeCompletionResult = {
text: string
tokens_predicted: number
tokens_evaluated: number
truncated: boolean
stopped_eos: boolean
stopped_word: string
stopped_limit: number
stopping_word: string
tokens_cached: number
timings: NativeCompletionResultTimings
completion_probabilities?: Array<NativeCompletionTokenProb>
}
export type NativeTokenizeResult = {
tokens: Array<number>
}
export type NativeEmbeddingResult = {
embedding: Array<number>
}
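// A sketch (assumption, not part of the spec): cosine similarity between two
// NativeEmbeddingResult vectors, e.g. for ranking results produced by the
// embedding() method declared below.
//
// const cosineSimilarity = (a: NativeEmbeddingResult, b: NativeEmbeddingResult): number => {
//   let dot = 0
//   let normA = 0
//   let normB = 0
//   for (let i = 0; i < a.embedding.length; i += 1) {
//     dot += a.embedding[i] * b.embedding[i]
//     normA += a.embedding[i] * a.embedding[i]
//     normB += b.embedding[i] * b.embedding[i]
//   }
//   return dot / (Math.sqrt(normA) * Math.sqrt(normB))
// }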
export type NativeLlamaContext = {
contextId: number
gpu: boolean
reasonNoGPU: string
model: Object
}
export type NativeSessionLoadResult = {
tokens_loaded: number
prompt: string
}
export type NativeLlamaChatMessage = {
role: string
content: string
}
export type NativeCPUFeatures = {
armv8: boolean
i8mm: boolean
dotprod: boolean
}
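// A sketch (assumption, not the library's logic): an app could use getCpuFeatures()
// to pick a weight repacking/quantization strategy, e.g. preferring i8mm-optimized
// formats when available. The quant names below are illustrative only.
//
// const suggestQuant = (cpu: NativeCPUFeatures): string =>
//   cpu.i8mm && cpu.dotprod ? 'Q4_0_4_8' : cpu.dotprod ? 'Q4_0_4_4' : 'Q4_0'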
export interface Spec extends TurboModule {
setContextLimit(limit: number): Promise<void>
modelInfo(path: string, skip?: string[]): Promise<Object>
initContext(contextId: number, params: NativeContextParams): Promise<NativeLlamaContext>
loadSession(
contextId: number,
filepath: string,
): Promise<NativeSessionLoadResult>
saveSession(
contextId: number,
filepath: string,
size: number,
): Promise<number>
completion(
contextId: number,
params: NativeCompletionParams,
): Promise<NativeCompletionResult>
stopCompletion(contextId: number): Promise<void>
tokenizeAsync(contextId: number, text: string): Promise<NativeTokenizeResult>
tokenizeSync(contextId: number, text: string): NativeTokenizeResult
getCpuFeatures(): Promise<NativeCPUFeatures>
getFormattedChat(
contextId: number,
messages: NativeLlamaChatMessage[],
chatTemplate?: string,
): Promise<string>
detokenize(contextId: number, tokens: number[]): Promise<string>
embedding(
contextId: number,
text: string,
params: NativeEmbeddingParams,
): Promise<NativeEmbeddingResult>
bench(
contextId: number,
pp: number,
tg: number,
pl: number,
nr: number,
): Promise<string>
releaseContext(contextId: number): Promise<void>
releaseAllContexts(): Promise<void>
}
export default TurboModuleRegistry.get<Spec>('RNLlama') as Spec
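// An end-to-end usage sketch (assumptions: the import path, context id handling,
// and parameter values are illustrative only):
//
// import RNLlama from './NativeRNLlama' // hypothetical import path
//
// const run = async (): Promise<void> => {
//   const contextId = 1
//   await RNLlama.initContext(contextId, { model: '/path/to/model.gguf', n_ctx: 2048 })
//   const prompt = await RNLlama.getFormattedChat(contextId, [
//     { role: 'user', content: 'Hello!' },
//   ])
//   const result = await RNLlama.completion(contextId, {
//     prompt,
//     n_predict: 64,
//     emit_partial_completion: false,
//   })
//   console.log(result.text, result.timings.predicted_per_second, 'tok/s')
//   await RNLlama.releaseContext(contextId)
// }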