cui-llama.rn
Fork of llama.rn for ChatterUI
import type { TurboModule } from 'react-native';
export type NativeEmbeddingParams = {
embd_normalize?: number;
};
export type NativeContextParams = {
model: string;
is_model_asset?: boolean;
use_progress_callback?: boolean;
n_ctx?: number;
n_batch?: number;
n_threads?: number;
n_gpu_layers?: number;
/**
* Enable flash attention; only recommended on GPU devices (experimental in llama.cpp)
*/
flash_attn?: boolean;
/**
* KV cache data type for the K tensor (experimental in llama.cpp)
*/
cache_type_k?: number;
/**
* KV cache data type for the V tensor (experimental in llama.cpp)
*/
cache_type_v?: number;
use_mlock?: boolean;
use_mmap?: boolean;
vocab_only?: boolean;
lora?: string;
lora_scaled?: number;
rope_freq_base?: number;
rope_freq_scale?: number;
pooling_type?: number;
embedding?: boolean;
embd_normalize?: number;
};
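/*
 * Illustrative sketch (not part of the module): a typical NativeContextParams
 * object for loading a local GGUF model. The file path and numeric values are
 * assumptions chosen for this example, not library defaults.
 *
 *   const contextParams: NativeContextParams = {
 *     model: '/data/local/tmp/model.gguf', // assumed path
 *     n_ctx: 4096,
 *     n_batch: 512,
 *     n_threads: 4,
 *     n_gpu_layers: 0, // raise on devices where GPU offload is available
 *     use_mlock: true,
 *     use_mmap: true,
 *   }
 */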
export type NativeCompletionParams = {
prompt: string;
n_threads?: number;
/**
* Set grammar for grammar-based sampling. Default: no grammar
*/
grammar?: string;
/**
* Specify a JSON array of stopping strings.
* These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
*/
stop?: Array<string>;
/**
* Set the maximum number of tokens to predict when generating text.
* **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
* When `0`, no tokens will be generated, but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
*/
n_predict?: number;
/**
* If greater than 0, the response also contains the probabilities of the top N tokens for each generated token, given the sampling settings.
* Note that for temperature < 0, tokens are sampled greedily, but token probabilities are still calculated via a simple softmax over the logits, without considering any other sampler settings.
* Default: `0`
*/
n_probs?: number;
/**
* Limit the next token selection to the K most probable tokens. Default: `40`
*/
top_k?: number;
/**
* Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
*/
top_p?: number;
/**
* The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
*/
min_p?: number;
/**
* Set the chance for token removal via the XTC sampler. Default: `0.0`, which is disabled.
*/
xtc_probability?: number;
/**
* Set the minimum probability threshold for tokens to be removed via the XTC sampler. Default: `0.1` (a value > `0.5` disables XTC)
*/
xtc_threshold?: number;
/**
* Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
*/
typical_p?: number;
/**
* Adjust the randomness of the generated text. Default: `0.8`
*/
temperature?: number;
/**
* Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is the context size.
*/
penalty_last_n?: number;
/**
* Control the repetition of token sequences in the generated text. Default: `1.0`
*/
penalty_repeat?: number;
/**
* Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
*/
penalty_freq?: number;
/**
* Repeat alpha presence penalty. Default: `0.0`, which is disabled.
*/
penalty_present?: number;
/**
* Penalize newline tokens when applying the repeat penalty. Default: `false`
*/
penalize_nl?: boolean;
/**
* Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
*/
mirostat?: number;
/**
* Set the Mirostat target entropy, parameter tau. Default: `5.0`
*/
mirostat_tau?: number;
/**
* Set the Mirostat learning rate, parameter eta. Default: `0.1`
*/
mirostat_eta?: number;
/**
* Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
*/
dry_multiplier?: number;
/**
* Set the DRY repetition penalty base value. Default: `1.75`
*/
dry_base?: number;
/**
* Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
*/
dry_allowed_length?: number;
/**
* How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
*/
dry_penalty_last_n?: number;
/**
* Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
*/
dry_sequence_breakers?: Array<string>;
/**
* Ignore end of stream token and continue generating. Default: `false`
*/
ignore_eos?: boolean;
/**
* Modify the likelihood of a token appearing in the generated text completion.
* For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token `Hello`, or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
* Setting the value to `false`, as in `"logit_bias": [[15043,false]]`, ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
* e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like `presence_penalty` does.
* Default: `[]`
*/
logit_bias?: Array<Array<number>>;
/**
* Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
*/
seed?: number;
emit_partial_completion: boolean;
};
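/*
 * Illustrative sketch: a completion request built from the parameters above,
 * mostly using the documented defaults. The prompt and stop strings are
 * assumptions for this example only.
 *
 *   const completionParams: NativeCompletionParams = {
 *     prompt: 'User: Hello!\nAssistant:', // assumed prompt format
 *     n_predict: 256,
 *     temperature: 0.8,
 *     top_k: 40,
 *     top_p: 0.95,
 *     min_p: 0.05,
 *     stop: ['User:'],
 *     emit_partial_completion: true,
 *   }
 */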
export type NativeCompletionTokenProbItem = {
tok_str: string;
prob: number;
};
export type NativeCompletionTokenProb = {
content: string;
probs: Array<NativeCompletionTokenProbItem>;
};
export type NativeCompletionResultTimings = {
prompt_n: number;
prompt_ms: number;
prompt_per_token_ms: number;
prompt_per_second: number;
predicted_n: number;
predicted_ms: number;
predicted_per_token_ms: number;
predicted_per_second: number;
};
export type NativeCompletionResult = {
text: string;
tokens_predicted: number;
tokens_evaluated: number;
truncated: boolean;
stopped_eos: boolean;
stopped_word: string;
stopped_limit: number;
stopping_word: string;
tokens_cached: number;
timings: NativeCompletionResultTimings;
completion_probabilities?: Array<NativeCompletionTokenProb>;
};
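/*
 * Illustrative sketch: reading a NativeCompletionResult. The `result`
 * variable is assumed to come from Spec.completion() declared below.
 *
 *   const throughput = result.timings.predicted_per_second
 *   console.log(
 *     `${result.tokens_predicted} tokens predicted, ` +
 *     `${result.tokens_evaluated} prompt tokens evaluated, ` +
 *     `${throughput} tokens/s`
 *   )
 */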
export type NativeTokenizeResult = {
tokens: Array<number>;
};
export type NativeEmbeddingResult = {
embedding: Array<number>;
};
export type NativeLlamaContext = {
contextId: number;
gpu: boolean;
reasonNoGPU: string;
model: Object;
};
export type NativeSessionLoadResult = {
tokens_loaded: number;
prompt: string;
};
export type NativeLlamaChatMessage = {
role: string;
content: string;
};
export type NativeCPUFeatures = {
armv8: boolean;
i8mm: boolean;
dotprod: boolean;
};
export interface Spec extends TurboModule {
setContextLimit(limit: number): Promise<void>;
modelInfo(path: string, skip?: string[]): Promise<Object>;
initContext(contextId: number, params: NativeContextParams): Promise<NativeLlamaContext>;
loadSession(contextId: number, filepath: string): Promise<NativeSessionLoadResult>;
saveSession(contextId: number, filepath: string, size: number): Promise<number>;
completion(contextId: number, params: NativeCompletionParams): Promise<NativeCompletionResult>;
stopCompletion(contextId: number): Promise<void>;
tokenizeAsync(contextId: number, text: string): Promise<NativeTokenizeResult>;
tokenizeSync(contextId: number, text: string): NativeTokenizeResult;
getCpuFeatures(): Promise<NativeCPUFeatures>;
getFormattedChat(contextId: number, messages: NativeLlamaChatMessage[], chatTemplate?: string): Promise<string>;
detokenize(contextId: number, tokens: number[]): Promise<string>;
embedding(contextId: number, text: string, params: NativeEmbeddingParams): Promise<NativeEmbeddingResult>;
bench(contextId: number, pp: number, tg: number, pl: number, nr: number): Promise<string>;
releaseContext(contextId: number): Promise<void>;
releaseAllContexts(): Promise<void>;
}
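/*
 * Illustrative usage sketch, assuming the default export is imported as
 * `RNLlama` and that context id management is handled by the caller
 * (the id `1` below is an arbitrary assumption):
 *
 *   const ctx = await RNLlama.initContext(1, { model: '/data/local/tmp/model.gguf' })
 *   console.log(ctx.gpu ? 'GPU enabled' : `CPU only: ${ctx.reasonNoGPU}`)
 *   const result = await RNLlama.completion(1, {
 *     prompt: 'Hello',
 *     n_predict: 64,
 *     emit_partial_completion: false,
 *   })
 *   console.log(result.text)
 *   await RNLlama.releaseContext(1)
 */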
declare const _default: Spec;
export default _default;
//# sourceMappingURL=NativeRNLlama.d.ts.map