cui-llama.rn
Fork of llama.rn for ChatterUI
import type { TurboModule } from 'react-native';
export type NativeEmbeddingParams = {
embd_normalize?: number;
};
export type NativeContextParams = {
model: string;
/**
* Chat template to override the default one from the model.
*/
chat_template?: string;
is_model_asset?: boolean;
use_progress_callback?: boolean;
n_ctx?: number;
n_batch?: number;
n_ubatch?: number;
n_threads?: number;
/**
* Number of layers to store in VRAM (Currently only for iOS)
*/
n_gpu_layers?: number;
/**
* Skip GPU devices (iOS only)
*/
no_gpu_devices?: boolean;
/**
* Enable flash attention; only recommended on GPU devices (experimental in llama.cpp)
*/
flash_attn?: boolean;
/**
* KV cache data type for the K (Experimental in llama.cpp)
*/
cache_type_k?: string;
/**
* KV cache data type for the V (Experimental in llama.cpp)
*/
cache_type_v?: string;
use_mlock?: boolean;
use_mmap?: boolean;
vocab_only?: boolean;
/**
* Single LoRA adapter path
*/
lora?: string;
/**
* Single LoRA adapter scale
*/
lora_scaled?: number;
/**
* LoRA adapter list
*/
lora_list?: Array<{
path: string;
scaled?: number;
}>;
rope_freq_base?: number;
rope_freq_scale?: number;
pooling_type?: number;
/**
* Enable context shifting to handle prompts larger than context size
*/
ctx_shift?: boolean;
embedding?: boolean;
embd_normalize?: number;
};
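/**
 * A minimal sketch of a NativeContextParams object for a typical on-device
 * chat setup. The model path and the concrete values below are illustrative
 * assumptions, not recommendations baked into this module:
 *
 *   const contextParams: NativeContextParams = {
 *     model: '/data/models/model.gguf', // hypothetical path
 *     n_ctx: 4096,
 *     n_batch: 512,
 *     n_threads: 4,
 *     n_gpu_layers: 99, // iOS only, see comment above
 *     ctx_shift: true, // allow prompts larger than the context size
 *     use_mmap: true,
 *   }
 */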
export type NativeCompletionParams = {
prompt: string;
n_threads?: number;
/**
* Enable Jinja. Default: true if supported by the model
*/
jinja?: boolean;
/**
* JSON schema to convert into a grammar for structured JSON output.
* It will be overridden by `grammar` if both are set.
*/
json_schema?: string;
/**
* Set grammar for grammar-based sampling. Default: no grammar
*/
grammar?: string;
/**
* Lazy grammar sampling, triggered by `grammar_triggers`. Default: `false`
*/
grammar_lazy?: boolean;
/**
* Enable thinking if jinja is enabled. Default: true
*/
enable_thinking?: boolean;
/**
* Force thinking to be open. Default: false
*/
thinking_forced_open?: boolean;
/**
* Lazy grammar triggers. Default: []
*/
grammar_triggers?: Array<{
type: number;
value: string;
token: number;
}>;
preserved_tokens?: Array<string>;
chat_format?: number;
reasoning_format?: string;
/**
* Paths to media files (e.g. images) to process before generating text.
* When provided, the media will be processed and added to the context.
* Requires multimodal support to be enabled via initMultimodal.
*/
media_paths?: Array<string>;
/**
* Specify a JSON array of stopping strings.
* These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
*/
stop?: Array<string>;
/**
* Set the maximum number of tokens to predict when generating text.
* **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
* When `0`, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
*/
n_predict?: number;
/**
* If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings.
* Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings.
* Default: `0`
*/
n_probs?: number;
/**
* Limit the next token selection to the K most probable tokens. Default: `40`
*/
top_k?: number;
/**
* Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
*/
top_p?: number;
/**
* The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
*/
min_p?: number;
/**
* Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
*/
xtc_probability?: number;
/**
* Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
*/
xtc_threshold?: number;
/**
* Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
*/
typical_p?: number;
/**
* Adjust the randomness of the generated text. Default: `0.8`
*/
temperature?: number;
/**
* Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
*/
penalty_last_n?: number;
/**
* Control the repetition of token sequences in the generated text. Default: `1.0`
*/
penalty_repeat?: number;
/**
* Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
*/
penalty_freq?: number;
/**
* Repeat alpha presence penalty. Default: `0.0`, which is disabled.
*/
penalty_present?: number;
/**
* Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
*/
mirostat?: number;
/**
* Set the Mirostat target entropy, parameter tau. Default: `5.0`
*/
mirostat_tau?: number;
/**
* Set the Mirostat learning rate, parameter eta. Default: `0.1`
*/
mirostat_eta?: number;
/**
* Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
*/
dry_multiplier?: number;
/**
* Set the DRY repetition penalty base value. Default: `1.75`
*/
dry_base?: number;
/**
* Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
*/
dry_allowed_length?: number;
/**
* How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
*/
dry_penalty_last_n?: number;
/**
* Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
*/
dry_sequence_breakers?: Array<string>;
/**
* Top-n sigma sampling as described in the paper "Top-nσ: Not All Logits Are You Need" (https://arxiv.org/pdf/2411.07641). Default: `-1.0` (disabled)
*/
top_n_sigma?: number;
/**
* Ignore end of stream token and continue generating. Default: `false`
*/
ignore_eos?: boolean;
/**
* Modify the likelihood of a token appearing in the generated text completion.
* For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
* Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
* e.g.`[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does.
* Default: `[]`
*/
logit_bias?: Array<Array<number>>;
/**
* Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
*/
seed?: number;
/**
* Guide tokens for the completion.
* Helps prevent hallucinations by forcing the TTS model to use the correct words.
* Default: `[]`
*/
guide_tokens?: Array<number>;
emit_partial_completion: boolean;
};
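/**
 * A minimal sketch of a NativeCompletionParams object using the documented
 * defaults as a starting point; the prompt, stop strings, and the token id in
 * logit_bias are illustrative assumptions only:
 *
 *   const completionParams: NativeCompletionParams = {
 *     prompt: 'User: Hello!\nAssistant:',
 *     n_predict: 256,
 *     temperature: 0.8,
 *     top_k: 40,
 *     top_p: 0.95,
 *     min_p: 0.05,
 *     stop: ['User:'],
 *     logit_bias: [[15043, 1.0]], // nudge a specific token id upward, see logit_bias above
 *     emit_partial_completion: true, // stream partial tokens as they are generated
 *   }
 */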
export type NativeCompletionTokenProbItem = {
tok_str: string;
prob: number;
};
export type NativeCompletionTokenProb = {
content: string;
probs: Array<NativeCompletionTokenProbItem>;
};
export type NativeCompletionResultTimings = {
prompt_n: number;
prompt_ms: number;
prompt_per_token_ms: number;
prompt_per_second: number;
predicted_n: number;
predicted_ms: number;
predicted_per_token_ms: number;
predicted_per_second: number;
};
export type NativeCompletionResult = {
/**
* Original generated text (reasoning_content / tool_calls are not stripped out)
*/
text: string;
/**
* Reasoning content (parsed for reasoning models)
*/
reasoning_content: string;
/**
* Tool calls
*/
tool_calls: Array<{
type: 'function';
function: {
name: string;
arguments: string;
};
id?: string;
}>;
/**
* Content text (text with reasoning_content / tool_calls filtered out)
*/
content: string;
tokens_predicted: number;
tokens_evaluated: number;
truncated: boolean;
stopped_eos: boolean;
stopped_word: string;
stopped_limit: number;
stopping_word: string;
tokens_cached: number;
timings: NativeCompletionResultTimings;
completion_probabilities?: Array<NativeCompletionTokenProb>;
audio_tokens?: Array<number>;
};
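/**
 * A small sketch of consuming a NativeCompletionResult: `content` is the
 * filtered reply, `text` keeps the raw output, and `timings` already reports
 * per-second rates, so no extra arithmetic is needed:
 *
 *   const logResult = (res: NativeCompletionResult) => {
 *     console.log('reply:', res.content)
 *     if (res.reasoning_content) console.log('reasoning:', res.reasoning_content)
 *     console.log(`decode speed: ${res.timings.predicted_per_second.toFixed(1)} tok/s`)
 *   }
 */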
export type NativeTokenizeResult = {
tokens: Array<number>;
/**
* Whether the tokenization contains images
*/
has_images: boolean;
/**
* Bitmap hashes of the images
*/
bitmap_hashes: Array<number>;
/**
* Chunk positions of the text and images
*/
chunk_pos: Array<number>;
/**
* Chunk positions of the images
*/
chunk_pos_images: Array<number>;
};
export type NativeEmbeddingResult = {
embedding: Array<number>;
};
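/**
 * A sketch of comparing two NativeEmbeddingResult vectors with cosine
 * similarity. Assuming embd_normalize follows the llama.cpp convention where
 * `2` means Euclidean normalization, normalized vectors make the dot product
 * alone sufficient; the general form below does not rely on that:
 *
 *   const cosineSimilarity = (a: Array<number>, b: Array<number>): number => {
 *     let dot = 0
 *     let normA = 0
 *     let normB = 0
 *     for (let i = 0; i < a.length; i += 1) {
 *       dot += a[i] * b[i]
 *       normA += a[i] * a[i]
 *       normB += b[i] * b[i]
 *     }
 *     return dot / (Math.sqrt(normA) * Math.sqrt(normB))
 *   }
 */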
export type NativeLlamaContext = {
contextId: number;
model: {
desc: string;
size: number;
nEmbd: number;
nParams: number;
chatTemplates: {
llamaChat: boolean;
minja: {
default: boolean;
defaultCaps: {
tools: boolean;
toolCalls: boolean;
toolResponses: boolean;
systemRole: boolean;
parallelToolCalls: boolean;
toolCallId: boolean;
};
toolUse: boolean;
toolUseCaps: {
tools: boolean;
toolCalls: boolean;
toolResponses: boolean;
systemRole: boolean;
parallelToolCalls: boolean;
toolCallId: boolean;
};
};
};
metadata: Object;
isChatTemplateSupported: boolean;
};
/**
* Loaded library name for Android
*/
androidLib?: string;
gpu: boolean;
reasonNoGPU: string;
};
export type NativeSessionLoadResult = {
tokens_loaded: number;
prompt: string;
};
export type NativeLlamaMessagePart = {
type: 'text';
text: string;
};
export type NativeLlamaChatMessage = {
role: string;
content: string | Array<NativeLlamaMessagePart>;
};
export type NativeCPUFeatures = {
armv8: boolean;
i8mm: boolean;
dotprod: boolean;
};
export type FormattedChatResult = {
type: 'jinja' | 'llama-chat';
prompt: string;
has_media: boolean;
media_paths?: Array<string>;
};
export type JinjaFormattedChatResult = FormattedChatResult & {
chat_format?: number;
grammar?: string;
grammar_lazy?: boolean;
grammar_triggers?: Array<{
type: number;
value: string;
token: number;
}>;
thinking_forced_open?: boolean;
preserved_tokens?: Array<string>;
additional_stops?: Array<string>;
};
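/**
 * A sketch of forwarding a JinjaFormattedChatResult into a completion call.
 * The field names line up with NativeCompletionParams, which is the assumed
 * intent of this type; the helper name is illustrative:
 *
 *   const toCompletionParams = (
 *     chat: JinjaFormattedChatResult,
 *     base: Omit<NativeCompletionParams, 'prompt'>,
 *   ): NativeCompletionParams => ({
 *     ...base,
 *     prompt: chat.prompt,
 *     chat_format: chat.chat_format,
 *     grammar: chat.grammar,
 *     grammar_lazy: chat.grammar_lazy,
 *     grammar_triggers: chat.grammar_triggers,
 *     thinking_forced_open: chat.thinking_forced_open,
 *     preserved_tokens: chat.preserved_tokens,
 *     stop: [...(base.stop ?? []), ...(chat.additional_stops ?? [])],
 *   })
 */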
export type NativeImageProcessingResult = {
success: boolean;
prompt: string;
error?: string;
};
export type NativeRerankParams = {
normalize?: number;
};
export type NativeRerankResult = {
score: number;
index: number;
};
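/**
 * A sketch of mapping rerank output back to the source documents: each
 * NativeRerankResult carries the index of the document it scored, so results
 * can be sorted by score and joined with the original array:
 *
 *   const rankDocuments = (results: Array<NativeRerankResult>, documents: Array<string>) =>
 *     [...results]
 *       .sort((a, b) => b.score - a.score)
 *       .map((r) => ({ document: documents[r.index], score: r.score }))
 */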
export interface Spec extends TurboModule {
toggleNativeLog(enabled: boolean): Promise<void>;
setContextLimit(limit: number): Promise<void>;
modelInfo(path: string, skip?: string[]): Promise<Object>;
initContext(contextId: number, params: NativeContextParams): Promise<NativeLlamaContext>;
getFormattedChat(contextId: number, messages: string, chatTemplate?: string, params?: {
jinja?: boolean;
json_schema?: string;
tools?: string;
parallel_tool_calls?: string;
tool_choice?: string;
enable_thinking?: boolean;
}): Promise<JinjaFormattedChatResult | string>;
loadSession(contextId: number, filepath: string): Promise<NativeSessionLoadResult>;
saveSession(contextId: number, filepath: string, size: number): Promise<number>;
completion(contextId: number, params: NativeCompletionParams): Promise<NativeCompletionResult>;
stopCompletion(contextId: number): Promise<void>;
tokenizeAsync(contextId: number, text: string, imagePaths?: Array<string>): Promise<NativeTokenizeResult>;
tokenizeSync(contextId: number, text: string, imagePaths?: Array<string>): NativeTokenizeResult;
getCpuFeatures(): Promise<NativeCPUFeatures>;
detokenize(contextId: number, tokens: number[]): Promise<string>;
embedding(contextId: number, text: string, params: NativeEmbeddingParams): Promise<NativeEmbeddingResult>;
rerank(contextId: number, query: string, documents: Array<string>, params?: NativeRerankParams): Promise<Array<NativeRerankResult>>;
bench(contextId: number, pp: number, tg: number, pl: number, nr: number): Promise<string>;
applyLoraAdapters(contextId: number, loraAdapters: Array<{
path: string;
scaled?: number;
}>): Promise<void>;
removeLoraAdapters(contextId: number): Promise<void>;
getLoadedLoraAdapters(contextId: number): Promise<Array<{
path: string;
scaled?: number;
}>>;
initMultimodal(contextId: number, params: {
path: string;
use_gpu: boolean;
}): Promise<boolean>;
isMultimodalEnabled(contextId: number): Promise<boolean>;
getMultimodalSupport(contextId: number): Promise<{
vision: boolean;
audio: boolean;
}>;
releaseMultimodal(contextId: number): Promise<void>;
initVocoder(contextId: number, vocoderModelPath: string): Promise<boolean>;
isVocoderEnabled(contextId: number): Promise<boolean>;
getFormattedAudioCompletion(contextId: number, speakerJsonStr: string, textToSpeak: string): Promise<string>;
getAudioCompletionGuideTokens(contextId: number, textToSpeak: string): Promise<Array<number>>;
decodeAudioTokens(contextId: number, tokens: number[]): Promise<Array<number>>;
releaseVocoder(contextId: number): Promise<void>;
releaseContext(contextId: number): Promise<void>;
releaseAllContexts(): Promise<void>;
}
declare const _default: Spec;
export default _default;
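/**
 * A minimal end-to-end usage sketch of the module exported above. The import
 * path, context id, and model path are illustrative assumptions:
 *
 *   import RNLlama from 'cui-llama.rn/lib/module/NativeRNLlama' // hypothetical path
 *
 *   const contextId = 1
 *   await RNLlama.initContext(contextId, {
 *     model: '/data/models/model.gguf', // hypothetical path
 *     n_ctx: 4096,
 *   })
 *
 *   const formatted = await RNLlama.getFormattedChat(
 *     contextId,
 *     JSON.stringify([{ role: 'user', content: 'Hello!' }]),
 *     undefined,
 *     { jinja: true },
 *   )
 *   const prompt = typeof formatted === 'string' ? formatted : formatted.prompt
 *
 *   const result = await RNLlama.completion(contextId, {
 *     prompt,
 *     n_predict: 128,
 *     emit_partial_completion: false,
 *   })
 *   console.log(result.content)
 *
 *   await RNLlama.releaseContext(contextId)
 */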
//# sourceMappingURL=NativeRNLlama.d.ts.map