/**
 * cui-llama.rn
 * Fork of llama.rn for ChatterUI
 */
import type { TurboModule } from 'react-native'
import { TurboModuleRegistry } from 'react-native'
export type NativeEmbeddingParams = {
embd_normalize?: number
}
export type NativeContextParams = {
model: string
/**
* Chat template to override the default one from the model.
*/
chat_template?: string
is_model_asset?: boolean
use_progress_callback?: boolean
n_ctx?: number
n_batch?: number
n_ubatch?: number
n_threads?: number
/**
* Number of layers to store in VRAM (Currently only for iOS)
*/
n_gpu_layers?: number
/**
* Skip GPU devices (iOS only)
*/
no_gpu_devices?: boolean
/**
* Enable flash attention; only recommended on GPU devices (experimental in llama.cpp)
*/
flash_attn?: boolean
/**
* KV cache data type for the keys (experimental in llama.cpp)
*/
cache_type_k?: string
/**
* KV cache data type for the values (experimental in llama.cpp)
*/
cache_type_v?: string
use_mlock?: boolean
use_mmap?: boolean
vocab_only?: boolean
/**
* Single LoRA adapter path
*/
lora?: string
/**
* Single LoRA adapter scale
*/
lora_scaled?: number
/**
* LoRA adapter list
*/
lora_list?: Array<{ path: string; scaled?: number }>
rope_freq_base?: number
rope_freq_scale?: number
pooling_type?: number
/**
* Enable context shifting to handle prompts larger than context size
*/
ctx_shift?: boolean
// Embedding params
embedding?: boolean
embd_normalize?: number
}
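/*
 * Usage sketch (illustrative only, not part of the generated spec): a minimal
 * NativeContextParams value as it might be passed to initContext. The model
 * path and tuning values below are assumptions, not defaults of this module.
 *
 *   const contextParams: NativeContextParams = {
 *     model: '/data/models/model-q4_k_m.gguf', // hypothetical path
 *     n_ctx: 4096,
 *     n_batch: 512,
 *     n_threads: 4,
 *     n_gpu_layers: 99, // offload as many layers as possible (per the doc above, iOS only)
 *     use_mlock: true,
 *     ctx_shift: true,
 *   }
 */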
export type NativeCompletionParams = {
prompt: string
n_threads?: number
/**
* Enable Jinja. Default: true if supported by the model
*/
jinja?: boolean
/**
* JSON schema to convert into a grammar for structured JSON output.
* It will be overridden by grammar if both are set.
*/
json_schema?: string
/**
* Set grammar for grammar-based sampling. Default: no grammar
*/
grammar?: string
/**
* Lazy grammar sampling, triggered by grammar_triggers. Default: false
*/
grammar_lazy?: boolean
/**
* Enable thinking if jinja is enabled. Default: true
*/
enable_thinking?: boolean
/**
* Force thinking to be open. Default: false
*/
thinking_forced_open?: boolean
/**
* Lazy grammar triggers. Default: []
*/
grammar_triggers?: Array<{
type: number
value: string
token: number
}>
preserved_tokens?: Array<string>
chat_format?: number
reasoning_format?: string
/**
* Paths to media files (e.g. images) to process before generating text.
* When provided, the media is processed and added to the context.
* Requires multimodal support to be enabled via initMultimodal.
*/
media_paths?: Array<string>
/**
* Specify a JSON array of stopping strings.
* These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
*/
stop?: Array<string>
/**
* Set the maximum number of tokens to predict when generating text.
* **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
* When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
*/
n_predict?: number
/**
* If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings.
* Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings.
* Default: `0`
*/
n_probs?: number
/**
* Limit the next token selection to the K most probable tokens. Default: `40`
*/
top_k?: number
/**
* Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
*/
top_p?: number
/**
* The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
*/
min_p?: number
/**
* Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
*/
xtc_probability?: number
/**
* Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
*/
xtc_threshold?: number
/**
* Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
*/
typical_p?: number
/**
* Adjust the randomness of the generated text. Default: `0.8`
*/
temperature?: number
/**
* Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
*/
penalty_last_n?: number
/**
* Control the repetition of token sequences in the generated text. Default: `1.0`
*/
penalty_repeat?: number
/**
* Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
*/
penalty_freq?: number
/**
* Repeat alpha presence penalty. Default: `0.0`, which is disabled.
*/
penalty_present?: number
/**
* Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
*/
mirostat?: number
/**
* Set the Mirostat target entropy, parameter tau. Default: `5.0`
*/
mirostat_tau?: number
/**
* Set the Mirostat learning rate, parameter eta. Default: `0.1`
*/
mirostat_eta?: number
/**
* Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
*/
dry_multiplier?: number
/**
* Set the DRY repetition penalty base value. Default: `1.75`
*/
dry_base?: number
/**
* Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
*/
dry_allowed_length?: number
/**
* How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
*/
dry_penalty_last_n?: number
/**
* Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
*/
dry_sequence_breakers?: Array<string>
/**
* Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641. Default: `-1.0` (Disabled)
*/
top_n_sigma?: number
/**
* Ignore end of stream token and continue generating. Default: `false`
*/
ignore_eos?: boolean
/**
* Modify the likelihood of a token appearing in the generated text completion.
* For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
* Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
* e.g.`[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does.
* Default: `[]`
*/
logit_bias?: Array<Array<number>>
/**
* Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
*/
seed?: number
/**
* Guide tokens for the completion.
* Helps prevent hallucinations by forcing the TTS model to use the correct words.
* Default: `[]`
*/
guide_tokens?: Array<number>
/**
* Whether to emit partial completion results (streamed tokens) while generating.
*/
emit_partial_completion: boolean
}
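/*
 * Usage sketch (illustrative only, not part of the generated spec): a
 * completion request with common sampling settings and a stop sequence.
 * The prompt format and the concrete values are assumptions; only the field
 * names come from this spec.
 *
 *   const completionParams: NativeCompletionParams = {
 *     prompt: 'User: Write a haiku about the sea.\nAssistant:',
 *     n_predict: 128,
 *     temperature: 0.8,
 *     top_k: 40,
 *     top_p: 0.95,
 *     min_p: 0.05,
 *     penalty_repeat: 1.1,
 *     stop: ['User:'],
 *     emit_partial_completion: true, // stream partial results to JS
 *   }
 */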
export type NativeCompletionTokenProbItem = {
tok_str: string
prob: number
}
export type NativeCompletionTokenProb = {
content: string
probs: Array<NativeCompletionTokenProbItem>
}
export type NativeCompletionResultTimings = {
prompt_n: number
prompt_ms: number
prompt_per_token_ms: number
prompt_per_second: number
predicted_n: number
predicted_ms: number
predicted_per_token_ms: number
predicted_per_second: number
}
export type NativeCompletionResult = {
/**
* Original generated text (reasoning_content / tool_calls are not stripped)
*/
text: string
/**
* Reasoning content (parsed for reasoning models)
*/
reasoning_content: string
/**
* Tool calls
*/
tool_calls: Array<{
type: 'function'
function: {
name: string
arguments: string
}
id?: string
}>
/**
* Content text (generated text with reasoning_content / tool_calls filtered out)
*/
content: string
tokens_predicted: number
tokens_evaluated: number
truncated: boolean
stopped_eos: boolean
stopped_word: string
stopped_limit: number
stopping_word: string
tokens_cached: number
timings: NativeCompletionResultTimings
completion_probabilities?: Array<NativeCompletionTokenProb>
audio_tokens?: Array<number>
}
export type NativeTokenizeResult = {
tokens: Array<number>
/**
* Whether the tokenization contains images
*/
has_images: boolean
/**
* Bitmap hashes of the images
*/
bitmap_hashes: Array<number>
/**
* Chunk positions of the text and images
*/
chunk_pos: Array<number>
/**
* Chunk positions of the images
*/
chunk_pos_images: Array<number>
}
export type NativeEmbeddingResult = {
embedding: Array<number>
}
export type NativeLlamaContext = {
contextId: number
model: {
desc: string
size: number
nEmbd: number
nParams: number
chatTemplates: {
llamaChat: boolean // Chat template in llama-chat.cpp
minja: {
// Chat template supported by minja.hpp
default: boolean
defaultCaps: {
tools: boolean
toolCalls: boolean
toolResponses: boolean
systemRole: boolean
parallelToolCalls: boolean
toolCallId: boolean
}
toolUse: boolean
toolUseCaps: {
tools: boolean
toolCalls: boolean
toolResponses: boolean
systemRole: boolean
parallelToolCalls: boolean
toolCallId: boolean
}
}
}
metadata: Object
isChatTemplateSupported: boolean // Deprecated
}
/**
* Loaded library name for Android
*/
androidLib?: string
gpu: boolean
reasonNoGPU: string
}
export type NativeSessionLoadResult = {
tokens_loaded: number
prompt: string
}
export type NativeLlamaMessagePart = {
type: 'text'
text: string
}
export type NativeLlamaChatMessage = {
role: string
content: string | Array<NativeLlamaMessagePart>
}
export type NativeCPUFeatures = {
armv8: boolean
i8mm: boolean
dotprod: boolean
}
export type FormattedChatResult = {
type: 'jinja' | 'llama-chat'
prompt: string
has_media: boolean
media_paths?: Array<string>
}
export type JinjaFormattedChatResult = FormattedChatResult & {
chat_format?: number
grammar?: string
grammar_lazy?: boolean
grammar_triggers?: Array<{
type: number
value: string
token: number
}>
thinking_forced_open?: boolean
preserved_tokens?: Array<string>
additional_stops?: Array<string>
}
export type NativeImageProcessingResult = {
success: boolean
prompt: string
error?: string
}
export type NativeRerankParams = {
normalize?: number
}
export type NativeRerankResult = {
score: number
index: number
}
export interface Spec extends TurboModule {
toggleNativeLog(enabled: boolean): Promise<void>
setContextLimit(limit: number): Promise<void>
modelInfo(path: string, skip?: string[]): Promise<Object>
initContext(
contextId: number,
params: NativeContextParams,
): Promise<NativeLlamaContext>
getFormattedChat(
contextId: number,
messages: string,
chatTemplate?: string,
params?: {
jinja?: boolean
json_schema?: string
tools?: string
parallel_tool_calls?: string
tool_choice?: string
enable_thinking?: boolean
},
): Promise<JinjaFormattedChatResult | string>
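/*
 * Usage sketch (illustrative only, not part of the spec) for getFormattedChat
 * above. `RNLlama` denotes this module's default export; the messages and
 * contextId are examples. Note that `messages` is passed as a JSON string.
 *
 *   const messages = JSON.stringify([
 *     { role: 'system', content: 'You are a helpful assistant.' },
 *     { role: 'user', content: 'Hello!' },
 *   ])
 *   const formatted = await RNLlama.getFormattedChat(contextId, messages, undefined, {
 *     jinja: true,
 *   })
 *   const prompt =
 *     typeof formatted === 'string' ? formatted : formatted.prompt
 */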
loadSession(
contextId: number,
filepath: string,
): Promise<NativeSessionLoadResult>
saveSession(
contextId: number,
filepath: string,
size: number,
): Promise<number>
completion(
contextId: number,
params: NativeCompletionParams,
): Promise<NativeCompletionResult>
stopCompletion(contextId: number): Promise<void>
tokenizeAsync(contextId: number, text: string, imagePaths?: Array<string>): Promise<NativeTokenizeResult>
tokenizeSync(contextId: number, text: string, imagePaths?: Array<string>): NativeTokenizeResult
getCpuFeatures(): Promise<NativeCPUFeatures>
detokenize(contextId: number, tokens: number[]): Promise<string>
embedding(
contextId: number,
text: string,
params: NativeEmbeddingParams,
): Promise<NativeEmbeddingResult>
rerank(
contextId: number,
query: string,
documents: Array<string>,
params?: NativeRerankParams,
): Promise<Array<NativeRerankResult>>
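/*
 * Usage sketch (illustrative only, not part of the spec) for the embedding and
 * rerank methods above. Assumptions: the context was initialized with
 * `embedding: true`, rerank needs a reranking-capable model, and
 * `embd_normalize: 2` follows the llama.cpp convention for Euclidean
 * normalization. `RNLlama` denotes this module's default export.
 *
 *   const { embedding } = await RNLlama.embedding(contextId, 'hello world', {
 *     embd_normalize: 2,
 *   })
 *   const ranked = await RNLlama.rerank(contextId, 'What is the capital of France?', [
 *     'Paris is the capital of France.',
 *     'Berlin is the capital of Germany.',
 *   ])
 *   // ranked: [{ score, index }, ...] referring back into the documents array
 */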
bench(
contextId: number,
pp: number,
tg: number,
pl: number,
nr: number,
): Promise<string>
applyLoraAdapters(
contextId: number,
loraAdapters: Array<{ path: string; scaled?: number }>,
): Promise<void>
removeLoraAdapters(contextId: number): Promise<void>
getLoadedLoraAdapters(
contextId: number,
): Promise<Array<{ path: string; scaled?: number }>>
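/*
 * Usage sketch (illustrative only, not part of the spec) for the LoRA methods
 * above: hot-swapping adapters on an existing context. The adapter path and
 * scale are examples. `RNLlama` denotes this module's default export.
 *
 *   await RNLlama.applyLoraAdapters(contextId, [
 *     { path: '/data/adapters/style.gguf', scaled: 1.0 }, // hypothetical path
 *   ])
 *   const active = await RNLlama.getLoadedLoraAdapters(contextId)
 *   await RNLlama.removeLoraAdapters(contextId)
 */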
// Multimodal methods
initMultimodal(
contextId: number,
params: {
path: string
use_gpu: boolean
},
): Promise<boolean>
isMultimodalEnabled(
contextId: number,
): Promise<boolean>
getMultimodalSupport(
contextId: number,
): Promise<{
vision: boolean
audio: boolean
}>
releaseMultimodal(
contextId: number,
): Promise<void>
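/*
 * Usage sketch (illustrative only, not part of the spec) for the multimodal
 * methods above: enabling multimodal support with a projector model, then
 * passing an image to a completion via `media_paths`. Paths and the prompt
 * are assumptions. `RNLlama` denotes this module's default export.
 *
 *   const ok = await RNLlama.initMultimodal(contextId, {
 *     path: '/data/models/mmproj.gguf', // hypothetical projector path
 *     use_gpu: true,
 *   })
 *   const { vision } = await RNLlama.getMultimodalSupport(contextId)
 *   if (ok && vision) {
 *     await RNLlama.completion(contextId, {
 *       prompt: 'Describe this image.',
 *       media_paths: ['/data/images/photo.jpg'],
 *       emit_partial_completion: false,
 *     })
 *   }
 */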
// TTS methods
initVocoder(contextId: number, vocoderModelPath: string): Promise<boolean>
isVocoderEnabled(contextId: number): Promise<boolean>
getFormattedAudioCompletion(contextId: number, speakerJsonStr: string, textToSpeak: string): Promise<string>
getAudioCompletionGuideTokens(contextId: number, textToSpeak: string): Promise<Array<number>>
decodeAudioTokens(contextId: number, tokens: number[]): Promise<Array<number>>
releaseVocoder(contextId: number): Promise<void>
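/*
 * Usage sketch (illustrative only, not part of the spec) for the TTS methods
 * above. Assumptions: the vocoder path and speaker JSON are placeholders, the
 * string returned by getFormattedAudioCompletion is used directly as the
 * completion prompt, and decodeAudioTokens yields raw audio samples.
 * `RNLlama` denotes this module's default export.
 *
 *   await RNLlama.initVocoder(ttsContextId, '/data/models/vocoder.gguf')
 *   const prompt = await RNLlama.getFormattedAudioCompletion(
 *     ttsContextId,
 *     speakerJsonStr,
 *     'Hello from ChatterUI',
 *   )
 *   const guideTokens = await RNLlama.getAudioCompletionGuideTokens(
 *     ttsContextId,
 *     'Hello from ChatterUI',
 *   )
 *   const result = await RNLlama.completion(ttsContextId, {
 *     prompt,
 *     guide_tokens: guideTokens,
 *     emit_partial_completion: false,
 *   })
 *   const samples = await RNLlama.decodeAudioTokens(
 *     ttsContextId,
 *     result.audio_tokens ?? [],
 *   )
 */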
releaseContext(contextId: number): Promise<void>
releaseAllContexts(): Promise<void>
}
export default TurboModuleRegistry.get<Spec>('RNLlama') as Spec
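/*
 * End-to-end usage sketch (illustrative only, not part of the spec), assuming
 * the default export above is imported elsewhere as `RNLlama`. The module
 * path, context id, and model path are hypothetical.
 *
 *   import RNLlama from './NativeRNLlama'
 *
 *   const contextId = 1 // caller-chosen id
 *   await RNLlama.initContext(contextId, { model: '/data/models/model.gguf' })
 *   const result = await RNLlama.completion(contextId, {
 *     prompt: 'Hello',
 *     n_predict: 32,
 *     emit_partial_completion: false,
 *   })
 *   console.log(result.content)
 *   await RNLlama.releaseContext(contextId)
 */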