/*
 * llama-cpp-capacitor
 *
 * A native Capacitor plugin that embeds llama.cpp directly into mobile apps,
 * enabling offline AI inference with a chat-first API design. Supports both
 * simple text generation and advanced chat conversations with system prompts,
 * multimodal processing, TTS, and LoRA adapters.
 */
export interface NativeEmbeddingParams {
embd_normalize?: number;
}
export interface NativeContextParams {
model: string;
/**
* Chat template to override the default one from the model.
*/
chat_template?: string;
is_model_asset?: boolean;
use_progress_callback?: boolean;
n_ctx?: number;
n_batch?: number;
n_ubatch?: number;
n_threads?: number;
/**
* Path to draft model for speculative decoding (mobile optimization)
*/
draft_model?: string;
/**
* Number of tokens to predict speculatively (default: 3 for mobile)
*/
speculative_samples?: number;
/**
* Enable mobile-optimized speculative decoding
*/
mobile_speculative?: boolean;
/**
* Number of layers to store in VRAM (Currently only for iOS)
*/
n_gpu_layers?: number;
/**
* Skip GPU devices (iOS only)
*/
no_gpu_devices?: boolean;
/**
* Enable flash attention; only recommended on GPU devices (experimental in llama.cpp)
*/
flash_attn?: boolean;
/**
* KV cache data type for K (experimental in llama.cpp)
*/
cache_type_k?: string;
/**
* KV cache data type for V (experimental in llama.cpp)
*/
cache_type_v?: string;
use_mlock?: boolean;
use_mmap?: boolean;
vocab_only?: boolean;
/**
* Single LoRA adapter path
*/
lora?: string;
/**
* Single LoRA adapter scale
*/
lora_scaled?: number;
/**
* LoRA adapter list
*/
lora_list?: Array<{
path: string;
scaled?: number;
}>;
rope_freq_base?: number;
rope_freq_scale?: number;
pooling_type?: number;
/**
* Enable context shifting to handle prompts larger than context size
*/
ctx_shift?: boolean;
/**
* Use a unified buffer across the input sequences when computing the attention.
* Try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix.
*/
kv_unified?: boolean;
/**
* Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
*/
swa_full?: boolean;
/**
* Number of layers to keep MoE weights on CPU
*/
n_cpu_moe?: number;
embedding?: boolean;
embd_normalize?: number;
}
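/*
 * Example (illustrative sketch, not from the package docs): a typical
 * mobile-friendly NativeContextParams value to pass to initContext.
 * The model path is hypothetical.
 *
 *   const ctxParams: NativeContextParams = {
 *     model: 'models/llama-3.2-1b-instruct-q4_k_m.gguf', // hypothetical path
 *     n_ctx: 2048,
 *     n_batch: 512,
 *     n_threads: 4,
 *     n_gpu_layers: 99, // iOS only
 *     use_mlock: true,
 *     ctx_shift: true,
 *   };
 */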
export interface NativeCompletionParams {
prompt: string;
n_threads?: number;
/**
* Enable Jinja. Default: true if supported by the model
*/
jinja?: boolean;
/**
* JSON schema to convert to a grammar for structured JSON output.
* It will be overridden by grammar if both are set.
*/
json_schema?: string;
/**
* Set grammar for grammar-based sampling (GBNF format). Default: no grammar
* This will override json_schema if both are provided.
*/
grammar?: string;
/**
* Lazy grammar sampling, triggered by grammar_triggers. Default: false
*/
grammar_lazy?: boolean;
/**
* Enable thinking if jinja is enabled. Default: true
*/
enable_thinking?: boolean;
/**
* Force thinking to be open. Default: false
*/
thinking_forced_open?: boolean;
/**
* Lazy grammar triggers. Default: []
*/
grammar_triggers?: Array<{
type: number;
value: string;
token: number;
}>;
preserved_tokens?: Array<string>;
chat_format?: number;
reasoning_format?: string;
/**
* Paths to media files (e.g. images) to process before generating text.
* When provided, the media will be processed and added to the context.
* Requires multimodal support to be enabled via initMultimodal.
*/
media_paths?: Array<string>;
/**
* Specify a JSON array of stopping strings.
* These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
*/
stop?: Array<string>;
/**
* Set the maximum number of tokens to predict when generating text.
* **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
* When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
*/
n_predict?: number;
/**
* If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings.
* Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings.
* Default: `0`
*/
n_probs?: number;
/**
* Limit the next token selection to the K most probable tokens. Default: `40`
*/
top_k?: number;
/**
* Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
*/
top_p?: number;
/**
* The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
*/
min_p?: number;
/**
* Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
*/
xtc_probability?: number;
/**
* Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
*/
xtc_threshold?: number;
/**
* Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
*/
typical_p?: number;
/**
* Adjust the randomness of the generated text. Default: `0.8`
*/
temperature?: number;
/**
* Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
*/
penalty_last_n?: number;
/**
* Control the repetition of token sequences in the generated text. Default: `1.0`
*/
penalty_repeat?: number;
/**
* Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
*/
penalty_freq?: number;
/**
* Repeat alpha presence penalty. Default: `0.0`, which is disabled.
*/
penalty_present?: number;
/**
* Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
*/
mirostat?: number;
/**
* Set the Mirostat target entropy, parameter tau. Default: `5.0`
*/
mirostat_tau?: number;
/**
* Set the Mirostat learning rate, parameter eta. Default: `0.1`
*/
mirostat_eta?: number;
/**
* Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
*/
dry_multiplier?: number;
/**
* Set the DRY repetition penalty base value. Default: `1.75`
*/
dry_base?: number;
/**
* Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
*/
dry_allowed_length?: number;
/**
* How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
*/
dry_penalty_last_n?: number;
/**
* Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
*/
dry_sequence_breakers?: Array<string>;
/**
* Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641. Default: `-1.0` (Disabled)
*/
top_n_sigma?: number;
/**
* Ignore end of stream token and continue generating. Default: `false`
*/
ignore_eos?: boolean;
/**
* Modify the likelihood of a token appearing in the generated text completion.
* For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
* Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
* e.g.`[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does.
* Default: `[]`
*/
logit_bias?: Array<Array<number>>;
/**
* Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
*/
seed?: number;
/**
* Guide tokens for the completion.
* Helps prevent hallucinations by forcing the TTS model to use the correct words.
* Default: `[]`
*/
guide_tokens?: Array<number>;
emit_partial_completion: boolean;
}
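/*
 * Example (illustrative sketch): a minimal NativeCompletionParams value for
 * plain text generation. Note that emit_partial_completion is required.
 *
 *   const completionParams: NativeCompletionParams = {
 *     prompt: 'Q: Name the planets in the solar system. A:',
 *     n_predict: 128,
 *     temperature: 0.7,
 *     top_k: 40,
 *     top_p: 0.95,
 *     stop: ['</s>', 'Q:'],
 *     emit_partial_completion: true,
 *   };
 */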
export interface NativeCompletionTokenProbItem {
tok_str: string;
prob: number;
}
export interface NativeCompletionTokenProb {
content: string;
probs: Array<NativeCompletionTokenProbItem>;
}
export interface NativeCompletionResultTimings {
prompt_n: number;
prompt_ms: number;
prompt_per_token_ms: number;
prompt_per_second: number;
predicted_n: number;
predicted_ms: number;
predicted_per_token_ms: number;
predicted_per_second: number;
}
export interface NativeCompletionResult {
/**
* Original generated text (reasoning_content / tool_calls are not parsed out)
*/
text: string;
/**
* Reasoning content (parsed for reasoning model)
*/
reasoning_content: string;
/**
* Tool calls (parsed from response)
*/
tool_calls: Array<{
type: 'function';
function: {
name: string;
arguments: string;
};
id?: string;
}>;
/**
* Content text (text with reasoning_content / tool_calls filtered out)
*/
content: string;
chat_format: number;
tokens_predicted: number;
tokens_evaluated: number;
truncated: boolean;
stopped_eos: boolean;
stopped_word: string;
stopped_limit: number;
stopping_word: string;
context_full: boolean;
interrupted: boolean;
tokens_cached: number;
timings: NativeCompletionResultTimings;
completion_probabilities?: Array<NativeCompletionTokenProb>;
audio_tokens?: Array<number>;
}
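/*
 * Example (illustrative sketch): consuming a NativeCompletionResult. The
 * `arguments` field of each tool call is a JSON-encoded string per the type
 * above, so it is parsed here before use.
 *
 *   function logResult(result: NativeCompletionResult): void {
 *     console.log('content:', result.content);
 *     for (const call of result.tool_calls) {
 *       const args = JSON.parse(call.function.arguments);
 *       console.log('tool call:', call.function.name, args);
 *     }
 *     console.log(`${result.timings.predicted_per_second.toFixed(1)} tokens/s`);
 *   }
 */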
export interface NativeTokenizeResult {
tokens: Array<number>;
/**
* Whether the tokenization contains images
*/
has_images: boolean;
/**
* Bitmap hashes of the images
*/
bitmap_hashes: Array<number>;
/**
* Chunk positions of the text and images
*/
chunk_pos: Array<number>;
/**
* Chunk positions of the images
*/
chunk_pos_images: Array<number>;
}
export interface NativeEmbeddingResult {
embedding: Array<number>;
}
export interface NativeLlamaContext {
contextId: number;
model: {
desc: string;
size: number;
nEmbd: number;
nParams: number;
chatTemplates: {
llamaChat: boolean;
minja: {
default: boolean;
defaultCaps: {
tools: boolean;
toolCalls: boolean;
toolResponses: boolean;
systemRole: boolean;
parallelToolCalls: boolean;
toolCallId: boolean;
};
toolUse: boolean;
toolUseCaps: {
tools: boolean;
toolCalls: boolean;
toolResponses: boolean;
systemRole: boolean;
parallelToolCalls: boolean;
toolCallId: boolean;
};
};
};
metadata: Object;
isChatTemplateSupported: boolean;
};
/**
* Loaded library name for Android
*/
androidLib?: string;
gpu: boolean;
reasonNoGPU: string;
}
export interface NativeSessionLoadResult {
tokens_loaded: number;
prompt: string;
}
export interface NativeLlamaMessagePart {
type: 'text';
text: string;
}
export interface NativeLlamaChatMessage {
role: string;
content: string | Array<NativeLlamaMessagePart>;
}
export interface FormattedChatResult {
type: 'jinja' | 'llama-chat';
prompt: string;
has_media: boolean;
media_paths?: Array<string>;
}
export interface JinjaFormattedChatResult extends FormattedChatResult {
chat_format?: number;
grammar?: string;
grammar_lazy?: boolean;
grammar_triggers?: Array<{
type: number;
value: string;
token: number;
}>;
thinking_forced_open?: boolean;
preserved_tokens?: Array<string>;
additional_stops?: Array<string>;
}
export interface NativeImageProcessingResult {
success: boolean;
prompt: string;
error?: string;
}
export interface NativeRerankParams {
normalize?: number;
}
export interface NativeRerankResult {
score: number;
index: number;
}
export interface LlamaCppMessagePart {
type: string;
text?: string;
image_url?: {
url?: string;
};
input_audio?: {
format: string;
data?: string;
url?: string;
};
}
export interface LlamaCppOAICompatibleMessage {
role: string;
content?: string | LlamaCppMessagePart[];
}
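/*
 * Example (illustrative sketch): an OpenAI-compatible message list mixing a
 * plain text message with a multimodal part. The image path is hypothetical
 * and assumes multimodal support has been enabled via initMultimodal.
 *
 *   const messages: LlamaCppOAICompatibleMessage[] = [
 *     { role: 'system', content: 'You are a helpful assistant.' },
 *     {
 *       role: 'user',
 *       content: [
 *         { type: 'text', text: 'What is in this picture?' },
 *         { type: 'image_url', image_url: { url: 'file:///data/photo.jpg' } },
 *       ],
 *     },
 *   ];
 */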
export interface ToolCall {
type: 'function';
id?: string;
function: {
name: string;
arguments: string;
};
}
export interface TokenData {
token: string;
completion_probabilities?: Array<NativeCompletionTokenProb>;
content?: string;
reasoning_content?: string;
tool_calls?: Array<ToolCall>;
accumulated_text?: string;
}
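/*
 * Example (illustrative sketch): receiving streamed TokenData via the plugin's
 * addListener method. The event name 'onToken' is a placeholder assumption
 * (the actual event name is not shown in this file), and `LlamaCpp` refers to
 * a plugin instance obtained elsewhere (see the registerPlugin sketch at the
 * end of this file).
 *
 *   let text = '';
 *   await LlamaCpp.addListener('onToken', (data: TokenData) => {
 *     text += data.token;
 *   });
 */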
export interface ContextParams extends Omit<NativeContextParams, 'cache_type_k' | 'cache_type_v' | 'pooling_type'> {
cache_type_k?: 'f16' | 'f32' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1';
cache_type_v?: 'f16' | 'f32' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1';
pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank';
}
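/*
 * Example (illustrative sketch): ContextParams narrows the native string
 * fields to typed unions, e.g. for an embedding context with an f16 KV cache.
 * The model path is hypothetical.
 *
 *   const embedCtxParams: ContextParams = {
 *     model: 'models/nomic-embed-text-v1.5-q8_0.gguf', // hypothetical path
 *     embedding: true,
 *     pooling_type: 'mean',
 *     cache_type_k: 'f16',
 *     cache_type_v: 'f16',
 *   };
 */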
export interface EmbeddingParams extends NativeEmbeddingParams {
}
export interface RerankParams {
normalize?: number;
}
export interface RerankResult {
score: number;
index: number;
document?: string;
}
export interface CompletionResponseFormat {
type: 'text' | 'json_object' | 'json_schema';
json_schema?: {
strict?: boolean;
schema: object;
};
schema?: object;
}
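/*
 * Example (illustrative sketch): requesting structured JSON output via
 * response_format with a JSON schema.
 *
 *   const responseFormat: CompletionResponseFormat = {
 *     type: 'json_schema',
 *     json_schema: {
 *       strict: true,
 *       schema: {
 *         type: 'object',
 *         properties: { answer: { type: 'string' } },
 *         required: ['answer'],
 *       },
 *     },
 *   };
 */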
export interface CompletionBaseParams {
prompt?: string;
messages?: LlamaCppOAICompatibleMessage[];
chatTemplate?: string;
chat_template?: string;
jinja?: boolean;
tools?: object;
parallel_tool_calls?: object;
tool_choice?: string;
response_format?: CompletionResponseFormat;
media_paths?: string[];
add_generation_prompt?: boolean;
now?: string | number;
chat_template_kwargs?: Record<string, string>;
/**
* Prefill text to be used for chat parsing (generation prompt + content).
* Used when the last assistant message is intended as a prefill.
*/
prefill_text?: string;
}
export interface CompletionParams extends Omit<NativeCompletionParams, 'emit_partial_completion' | 'prompt'> {
prompt?: string;
messages?: LlamaCppOAICompatibleMessage[];
chatTemplate?: string;
chat_template?: string;
jinja?: boolean;
/**
* GBNF grammar for structured output. Takes precedence over json_schema.
*/
grammar?: string;
tools?: object;
parallel_tool_calls?: object;
tool_choice?: string;
response_format?: CompletionResponseFormat;
media_paths?: string[];
add_generation_prompt?: boolean;
now?: string | number;
chat_template_kwargs?: Record<string, string>;
/**
* Prefill text to be used for chat parsing (generation prompt + content).
* Used when the last assistant message is intended as a prefill.
*/
prefill_text?: string;
}
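/*
 * Example (illustrative sketch): chat-style CompletionParams with a tool
 * definition. The get_weather tool is hypothetical, and the `tools` shape
 * shown here follows the OpenAI-style function schema commonly used with
 * llama.cpp chat templates; check the package docs for the exact shape.
 *
 *   const chatParams: CompletionParams = {
 *     messages: [
 *       { role: 'system', content: 'You are a weather bot.' },
 *       { role: 'user', content: 'What is the weather in Tokyo?' },
 *     ],
 *     jinja: true,
 *     tools: [
 *       {
 *         type: 'function',
 *         function: {
 *           name: 'get_weather',
 *           parameters: {
 *             type: 'object',
 *             properties: { city: { type: 'string' } },
 *             required: ['city'],
 *           },
 *         },
 *       },
 *     ],
 *     tool_choice: 'auto',
 *     n_predict: 256,
 *   };
 */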
export interface BenchResult {
modelDesc: string;
modelSize: number;
modelNParams: number;
ppAvg: number;
ppStd: number;
tgAvg: number;
tgStd: number;
}
export interface LlamaCppPlugin {
toggleNativeLog(options: {
enabled: boolean;
}): Promise<void>;
setContextLimit(options: {
limit: number;
}): Promise<void>;
modelInfo(options: {
path: string;
skip?: string[];
}): Promise<Object>;
initContext(options: {
contextId: number;
params: NativeContextParams;
}): Promise<NativeLlamaContext>;
releaseContext(options: {
contextId: number;
}): Promise<void>;
releaseAllContexts(): Promise<void>;
getFormattedChat(options: {
contextId: number;
messages: string;
chatTemplate?: string;
params?: {
jinja?: boolean;
json_schema?: string;
tools?: string;
parallel_tool_calls?: string;
tool_choice?: string;
enable_thinking?: boolean;
add_generation_prompt?: boolean;
now?: string;
chat_template_kwargs?: string;
};
}): Promise<JinjaFormattedChatResult | string>;
completion(options: {
contextId: number;
params: NativeCompletionParams;
}): Promise<NativeCompletionResult>;
chat(options: {
contextId: number;
messages: LlamaCppOAICompatibleMessage[];
system?: string;
chatTemplate?: string;
params?: Omit<NativeCompletionParams, 'prompt' | 'messages'>;
}): Promise<NativeCompletionResult>;
chatWithSystem(options: {
contextId: number;
system: string;
message: string;
params?: Omit<NativeCompletionParams, 'prompt' | 'messages'>;
}): Promise<NativeCompletionResult>;
generateText(options: {
contextId: number;
prompt: string;
params?: Omit<NativeCompletionParams, 'prompt' | 'messages'>;
}): Promise<NativeCompletionResult>;
stopCompletion(options: {
contextId: number;
}): Promise<void>;
loadSession(options: {
contextId: number;
filepath: string;
}): Promise<NativeSessionLoadResult>;
saveSession(options: {
contextId: number;
filepath: string;
size: number;
}): Promise<number>;
tokenize(options: {
contextId: number;
text: string;
imagePaths?: Array<string>;
}): Promise<NativeTokenizeResult>;
detokenize(options: {
contextId: number;
tokens: number[];
}): Promise<string>;
embedding(options: {
contextId: number;
text: string;
params: NativeEmbeddingParams;
}): Promise<NativeEmbeddingResult>;
rerank(options: {
contextId: number;
query: string;
documents: Array<string>;
params?: NativeRerankParams;
}): Promise<Array<NativeRerankResult>>;
bench(options: {
contextId: number;
pp: number;
tg: number;
pl: number;
nr: number;
}): Promise<string>;
applyLoraAdapters(options: {
contextId: number;
loraAdapters: Array<{
path: string;
scaled?: number;
}>;
}): Promise<void>;
removeLoraAdapters(options: {
contextId: number;
}): Promise<void>;
getLoadedLoraAdapters(options: {
contextId: number;
}): Promise<Array<{
path: string;
scaled?: number;
}>>;
initMultimodal(options: {
contextId: number;
params: {
path: string;
use_gpu: boolean;
};
}): Promise<boolean>;
isMultimodalEnabled(options: {
contextId: number;
}): Promise<boolean>;
getMultimodalSupport(options: {
contextId: number;
}): Promise<{
vision: boolean;
audio: boolean;
}>;
releaseMultimodal(options: {
contextId: number;
}): Promise<void>;
initVocoder(options: {
contextId: number;
params: {
path: string;
n_batch?: number;
};
}): Promise<boolean>;
isVocoderEnabled(options: {
contextId: number;
}): Promise<boolean>;
getFormattedAudioCompletion(options: {
contextId: number;
speakerJsonStr: string;
textToSpeak: string;
}): Promise<{
prompt: string;
grammar?: string;
}>;
getAudioCompletionGuideTokens(options: {
contextId: number;
textToSpeak: string;
}): Promise<Array<number>>;
decodeAudioTokens(options: {
contextId: number;
tokens: number[];
}): Promise<Array<number>>;
releaseVocoder(options: {
contextId: number;
}): Promise<void>;
downloadModel(options: {
url: string;
filename: string;
}): Promise<string>;
getDownloadProgress(options: {
url: string;
}): Promise<{
progress: number;
completed: boolean;
failed: boolean;
errorMessage?: string;
localPath?: string;
downloadedBytes: number;
totalBytes: number;
}>;
cancelDownload(options: {
url: string;
}): Promise<boolean>;
getAvailableModels(): Promise<Array<{
name: string;
path: string;
size: number;
}>>;
convertJsonSchemaToGrammar(options: {
schema: string;
}): Promise<string>;
addListener(eventName: string, listenerFunc: (data: any) => void): Promise<void>;
removeAllListeners(eventName: string): Promise<void>;
}
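/*
 * Example (illustrative sketch): minimal end-to-end usage of LlamaCppPlugin.
 * The registration name 'LlamaCpp' and the model path are assumptions; the
 * package's own entry point may export a ready-made instance or a
 * higher-level wrapper instead.
 *
 *   import { registerPlugin } from '@capacitor/core';
 *
 *   const LlamaCpp = registerPlugin<LlamaCppPlugin>('LlamaCpp');
 *
 *   async function run(): Promise<void> {
 *     const ctx = await LlamaCpp.initContext({
 *       contextId: 1,
 *       params: { model: 'models/llama-3.2-1b-instruct-q4_k_m.gguf', n_ctx: 2048 },
 *     });
 *
 *     const result = await LlamaCpp.chat({
 *       contextId: ctx.contextId,
 *       messages: [{ role: 'user', content: 'Hello!' }],
 *     });
 *     console.log(result.content);
 *
 *     await LlamaCpp.releaseContext({ contextId: ctx.contextId });
 *   }
 */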