cui-llama.rn
Fork of llama.rn for ChatterUI
import type { TurboModule } from 'react-native';
export type NativeEmbeddingParams = {
embd_normalize?: number;
};
export type NativeContextParams = {
model: string;
is_model_asset?: boolean;
use_progress_callback?: boolean;
n_ctx?: number;
n_batch?: number;
n_threads?: number;
n_gpu_layers?: number;
/**
* Enable flash attention; only recommended on GPU devices (experimental in llama.cpp)
*/
flash_attn?: boolean;
/**
* KV cache data type for the K tensor (experimental in llama.cpp)
*/
cache_type_k?: number;
/**
* KV cache data type for the V tensor (experimental in llama.cpp)
*/
cache_type_v?: number;
use_mlock?: boolean;
use_mmap?: boolean;
vocab_only?: boolean;
lora?: string;
lora_scaled?: number;
rope_freq_base?: number;
rope_freq_scale?: number;
pooling_type?: number;
embedding?: boolean;
embd_normalize?: number;
};
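/*
 * Illustrative sketch (not part of the module): a typical NativeContextParams
 * object for loading a local GGUF model. The file path and numeric values are
 * assumptions chosen for this example, not library defaults.
 *
 *   const contextParams: NativeContextParams = {
 *     model: '/data/local/tmp/model.gguf', // assumed path
 *     n_ctx: 4096,
 *     n_batch: 512,
 *     n_threads: 4,
 *     n_gpu_layers: 0, // raise on devices where GPU offload is available
 *     use_mlock: true,
 *     use_mmap: true,
 *   }
 */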
export type NativeCompletionParams = {
prompt: string;
n_threads?: number;
/**
* Set grammar for grammar-based sampling. Default: no grammar
*/
grammar?: string;
/**
* Specify a JSON array of stopping strings.
* These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
*/
stop?: Array<string>;
/**
* Set the maximum number of tokens to predict when generating text.
* **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
* When `0`, no tokens will be generated, but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
*/
n_predict?: number;
/**
* If greater than 0, the response also contains the probabilities of the top N tokens for each generated token, given the sampling settings.
* Note that for temperature < 0, tokens are sampled greedily, but token probabilities are still calculated via a simple softmax over the logits, without considering any other sampler settings.
* Default: `0`
*/
n_probs?: number;
/**
* Limit the next token selection to the K most probable tokens. Default: `40`
*/
top_k?: number;
/**
* Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
*/
top_p?: number;
/**
* The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
*/
min_p?: number;
/**
* Set the chance for token removal via the XTC sampler. Default: `0.0`, which is disabled.
*/
xtc_probability?: number;
/**
* Set the minimum probability threshold for tokens to be removed via the XTC sampler. Default: `0.1` (a value > `0.5` disables XTC)
*/
xtc_threshold?: number;
/**
* Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
*/
typical_p?: number;
/**
* Adjust the randomness of the generated text. Default: `0.8`
*/
temperature?: number;
/**
* Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is the context size.
*/
penalty_last_n?: number;
/**
* Control the repetition of token sequences in the generated text. Default: `1.0`
*/
penalty_repeat?: number;
/**
* Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
*/
penalty_freq?: number;
/**
* Repeat alpha presence penalty. Default: `0.0`, which is disabled.
*/
penalty_present?: number;
/**
* Penalize newline tokens when applying the repeat penalty. Default: `false`
*/
penalize_nl?: boolean;
/**
* Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
*/
mirostat?: number;
/**
* Set the Mirostat target entropy, parameter tau. Default: `5.0`
*/
mirostat_tau?: number;
/**
* Set the Mirostat learning rate, parameter eta. Default: `0.1`
*/
mirostat_eta?: number;
/**
* Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
*/
dry_multiplier?: number;
/**
* Set the DRY repetition penalty base value. Default: `1.75`
*/
dry_base?: number;
/**
* Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
*/
dry_allowed_length?: number;
/**
* How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
*/
dry_penalty_last_n?: number;
/**
* Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
*/
dry_sequence_breakers?: Array<string>;
/**
* Ignore end of stream token and continue generating. Default: `false`
*/
ignore_eos?: boolean;
/**
* Modify the likelihood of a token appearing in the generated text completion.
* For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token `Hello`, or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
* Setting the value to `false`, as in `"logit_bias": [[15043,false]]`, ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
* e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like `presence_penalty` does.
* Default: `[]`
*/
logit_bias?: Array<Array<number>>;
/**
* Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
*/
seed?: number;
emit_partial_completion: boolean;
};
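/*
 * Illustrative sketch: a completion request built from the parameters above,
 * mostly using the documented defaults. The prompt and stop strings are
 * assumptions for this example only.
 *
 *   const completionParams: NativeCompletionParams = {
 *     prompt: 'User: Hello!\nAssistant:', // assumed prompt format
 *     n_predict: 256,
 *     temperature: 0.8,
 *     top_k: 40,
 *     top_p: 0.95,
 *     min_p: 0.05,
 *     stop: ['User:'],
 *     emit_partial_completion: true,
 *   }
 */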
export type NativeCompletionTokenProbItem = {
tok_str: string;
prob: number;
};
export type NativeCompletionTokenProb = {
content: string;
probs: Array<NativeCompletionTokenProbItem>;
};
export type NativeCompletionResultTimings = {
prompt_n: number;
prompt_ms: number;
prompt_per_token_ms: number;
prompt_per_second: number;
predicted_n: number;
predicted_ms: number;
predicted_per_token_ms: number;
predicted_per_second: number;
};
export type NativeCompletionResult = {
text: string;
tokens_predicted: number;
tokens_evaluated: number;
truncated: boolean;
stopped_eos: boolean;
stopped_word: string;
stopped_limit: number;
stopping_word: string;
tokens_cached: number;
timings: NativeCompletionResultTimings;
completion_probabilities?: Array<NativeCompletionTokenProb>;
};
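/*
 * Illustrative sketch: reading a NativeCompletionResult. The `result`
 * variable is assumed to come from Spec.completion() declared below.
 *
 *   const throughput = result.timings.predicted_per_second
 *   console.log(
 *     `${result.tokens_predicted} tokens predicted, ` +
 *     `${result.tokens_evaluated} prompt tokens evaluated, ` +
 *     `${throughput} tokens/s`
 *   )
 */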
export type NativeTokenizeResult = {
tokens: Array<number>;
};
export type NativeEmbeddingResult = {
embedding: Array<number>;
};
export type NativeLlamaContext = {
contextId: number;
gpu: boolean;
reasonNoGPU: string;
model: Object;
};
export type NativeSessionLoadResult = {
tokens_loaded: number;
prompt: string;
};
export type NativeLlamaChatMessage = {
role: string;
content: string;
};
export type NativeCPUFeatures = {
armv8: boolean;
i8mm: boolean;
dotprod: boolean;
};
export interface Spec extends TurboModule {
setContextLimit(limit: number): Promise<void>;
modelInfo(path: string, skip?: string[]): Promise<Object>;
initContext(contextId: number, params: NativeContextParams): Promise<NativeLlamaContext>;
loadSession(contextId: number, filepath: string): Promise<NativeSessionLoadResult>;
saveSession(contextId: number, filepath: string, size: number): Promise<number>;
completion(contextId: number, params: NativeCompletionParams): Promise<NativeCompletionResult>;
stopCompletion(contextId: number): Promise<void>;
tokenizeAsync(contextId: number, text: string): Promise<NativeTokenizeResult>;
tokenizeSync(contextId: number, text: string): NativeTokenizeResult;
getCpuFeatures(): Promise<NativeCPUFeatures>;
getFormattedChat(contextId: number, messages: NativeLlamaChatMessage[], chatTemplate?: string): Promise<string>;
detokenize(contextId: number, tokens: number[]): Promise<string>;
embedding(contextId: number, text: string, params: NativeEmbeddingParams): Promise<NativeEmbeddingResult>;
bench(contextId: number, pp: number, tg: number, pl: number, nr: number): Promise<string>;
releaseContext(contextId: number): Promise<void>;
releaseAllContexts(): Promise<void>;
}
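/*
 * Illustrative usage sketch, assuming the default export is imported as
 * `RNLlama` and that context id management is handled by the caller
 * (the id `1` below is an arbitrary assumption):
 *
 *   const ctx = await RNLlama.initContext(1, { model: '/data/local/tmp/model.gguf' })
 *   console.log(ctx.gpu ? 'GPU enabled' : `CPU only: ${ctx.reasonNoGPU}`)
 *   const result = await RNLlama.completion(1, {
 *     prompt: 'Hello',
 *     n_predict: 64,
 *     emit_partial_completion: false,
 *   })
 *   console.log(result.text)
 *   await RNLlama.releaseContext(1)
 */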
declare const _default: Spec;
export default _default;
//# sourceMappingURL=NativeRNLlama.d.ts.map