llama-cpp-capacitor

A native Capacitor plugin that embeds llama.cpp directly into mobile apps, enabling offline AI inference with a chat-first API. It supports both simple text generation and advanced chat conversations with system prompts, as well as multimodal input, TTS, and LoRA adapters.
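
For orientation, a minimal usage sketch of the chat-first API in TypeScript follows. The model path is a placeholder, and the shapes of the streamed token objects and init progress values are whatever the native layer reports; treat it as illustrative rather than canonical.

import { initLlama, addNativeLogListener, toggleNativeLog } from 'llama-cpp-capacitor';

// Minimal chat sketch. The model path below is a placeholder: the GGUF file
// must already exist on the device (copied or downloaded beforehand).
async function quickChat() {
  await toggleNativeLog(true); // forward native llama.cpp logs to JS listeners
  const logSub = addNativeLogListener((level, text) => console.log(`[llama ${level}] ${text}`));

  const context = await initLlama(
    { model: 'file:///path/to/model.gguf' },
    (progress) => console.log('init progress:', progress), // progress value as reported by the native layer
  );

  // Chat-first API: pass OpenAI-style messages instead of a raw prompt.
  const result = await context.completion(
    {
      messages: [
        { role: 'system', content: 'You are a concise assistant.' },
        { role: 'user', content: 'Summarize what llama.cpp does in one sentence.' },
      ],
    },
    (token) => console.log('streamed token:', token), // partial results while generating
  );

  console.log('completion result:', result);

  logSub.remove();
  await context.release();
}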

592 lines (587 loc) 27.2 kB
var capacitorLlamaCpp = (function (exports, tslib, core) { 'use strict'; var _a, _b, _c; // Constants const LLAMACPP_MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'; // Event names const EVENT_ON_INIT_CONTEXT_PROGRESS = '@LlamaCpp_onInitContextProgress'; const EVENT_ON_TOKEN = '@LlamaCpp_onToken'; const EVENT_ON_NATIVE_LOG = '@LlamaCpp_onNativeLog'; // Register the plugin const LlamaCpp = core.registerPlugin('LlamaCpp'); // Log listeners management const logListeners = []; // Set up native log listener LlamaCpp.addListener(EVENT_ON_NATIVE_LOG, (evt) => { logListeners.forEach((listener) => listener(evt.level, evt.text)); }); // Trigger unset to use default log callback (_c = (_b = (_a = LlamaCpp === null || LlamaCpp === void 0 ? void 0 : LlamaCpp.toggleNativeLog) === null || _a === void 0 ? void 0 : _a.call(LlamaCpp, { enabled: false })) === null || _b === void 0 ? void 0 : _b.catch) === null || _c === void 0 ? void 0 : _c.call(_b, () => { }); const RNLLAMA_MTMD_DEFAULT_MEDIA_MARKER = LLAMACPP_MTMD_DEFAULT_MEDIA_MARKER; const validCacheTypes = [ 'f16', 'f32', 'bf16', 'q8_0', 'q4_0', 'q4_1', 'iq4_nl', 'q5_0', 'q5_1', ]; const getJsonSchema = (responseFormat) => { var _a; if ((responseFormat === null || responseFormat === void 0 ? void 0 : responseFormat.type) === 'json_schema') { return (_a = responseFormat.json_schema) === null || _a === void 0 ? void 0 : _a.schema; } if ((responseFormat === null || responseFormat === void 0 ? void 0 : responseFormat.type) === 'json_object') { return responseFormat.schema || {}; } return null; }; // Utility function to convert JSON schema to GBNF grammar const jsonSchemaToGrammar = async (schema) => { // This will call the native method to convert JSON schema to GBNF // For now, we'll return a basic implementation try { const result = await LlamaCpp.convertJsonSchemaToGrammar({ schema: JSON.stringify(schema) }); return result; } catch (error) { console.warn('Failed to convert JSON schema to GBNF, using fallback:', error); // Fallback for basic object structure return `root ::= "{" ws object_content ws "}" object_content ::= string_field ("," ws string_field)* string_field ::= "\\"" [a-zA-Z_][a-zA-Z0-9_]* "\\"" ws ":" ws value value ::= string | number | boolean | "null" string ::= "\\"" [^"]* "\\"" number ::= "-"? [0-9]+ ("." [0-9]+)? boolean ::= "true" | "false" ws ::= [ \\t\\n]*`; } }; class LlamaContext { constructor({ contextId, gpu, reasonNoGPU, model }) { this.gpu = false; this.reasonNoGPU = ''; this.id = contextId; this.gpu = gpu; this.reasonNoGPU = reasonNoGPU; this.model = model; } /** * Load cached prompt & completion state from a file. */ async loadSession(filepath) { let path = filepath; if (path.startsWith('file://')) path = path.slice(7); return LlamaCpp.loadSession({ contextId: this.id, filepath: path }); } /** * Save current cached prompt & completion state to a file. */ async saveSession(filepath, options) { return LlamaCpp.saveSession({ contextId: this.id, filepath, size: (options === null || options === void 0 ? void 0 : options.tokenSize) || -1 }); } isLlamaChatSupported() { return !!this.model.chatTemplates.llamaChat; } isJinjaSupported() { const { minja } = this.model.chatTemplates; return !!(minja === null || minja === void 0 ? void 0 : minja.toolUse) || !!(minja === null || minja === void 0 ? 
void 0 : minja.default); } async getFormattedChat(messages, template, params) { var _a; const mediaPaths = []; const chat = messages.map((msg) => { if (Array.isArray(msg.content)) { const content = msg.content.map((part) => { var _a; // Handle multimodal content if (part.type === 'image_url') { let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || ''; if (path === null || path === void 0 ? void 0 : path.startsWith('file://')) path = path.slice(7); mediaPaths.push(path); return { type: 'text', text: RNLLAMA_MTMD_DEFAULT_MEDIA_MARKER, }; } else if (part.type === 'input_audio') { const { input_audio: audio } = part; if (!audio) throw new Error('input_audio is required'); const { format } = audio; if (format != 'wav' && format != 'mp3') { throw new Error(`Unsupported audio format: ${format}`); } if (audio.url) { const path = audio.url.replace(/file:\/\//, ''); mediaPaths.push(path); } else if (audio.data) { mediaPaths.push(audio.data); } return { type: 'text', text: RNLLAMA_MTMD_DEFAULT_MEDIA_MARKER, }; } return part; }); return Object.assign(Object.assign({}, msg), { content }); } return msg; }); const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja); let tmpl; if (template) tmpl = template; // Force replace if provided const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format); const result = await LlamaCpp.getFormattedChat({ contextId: this.id, messages: JSON.stringify(chat), chatTemplate: tmpl, params: { jinja: useJinja, json_schema: jsonSchema ? JSON.stringify(jsonSchema) : undefined, tools: (params === null || params === void 0 ? void 0 : params.tools) ? JSON.stringify(params.tools) : undefined, parallel_tool_calls: (params === null || params === void 0 ? void 0 : params.parallel_tool_calls) ? JSON.stringify(params.parallel_tool_calls) : undefined, tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice, enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true, add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt, now: typeof (params === null || params === void 0 ? void 0 : params.now) === 'number' ? params.now.toString() : params === null || params === void 0 ? void 0 : params.now, chat_template_kwargs: (params === null || params === void 0 ? void 0 : params.chat_template_kwargs) ? JSON.stringify(Object.entries(params.chat_template_kwargs).reduce((acc, [key, value]) => { acc[key] = JSON.stringify(value); // Each value is a stringified JSON object return acc; }, {})) : undefined, }, }); if (!useJinja) { return { type: 'llama-chat', prompt: result, has_media: mediaPaths.length > 0, media_paths: mediaPaths, }; } const jinjaResult = result; jinjaResult.type = 'jinja'; jinjaResult.has_media = mediaPaths.length > 0; jinjaResult.media_paths = mediaPaths; return jinjaResult; } /** * Generate a completion based on the provided parameters * @param params Completion parameters including prompt or messages * @param callback Optional callback for token-by-token streaming * @returns Promise resolving to the completion result * * Note: For multimodal support, you can include an media_paths parameter. * This will process the images and add them to the context before generating text. * Multimodal support must be enabled via initMultimodal() first. 
*/ async completion(params, callback) { const nativeParams = Object.assign(Object.assign({}, params), { prompt: params.prompt || '', emit_partial_completion: !!callback }); if (params.messages) { const formattedResult = await this.getFormattedChat(params.messages, params.chat_template || params.chatTemplate, { jinja: params.jinja, tools: params.tools, parallel_tool_calls: params.parallel_tool_calls, tool_choice: params.tool_choice, enable_thinking: params.enable_thinking, add_generation_prompt: params.add_generation_prompt, now: params.now, chat_template_kwargs: params.chat_template_kwargs, }); if (formattedResult.type === 'jinja') { const jinjaResult = formattedResult; nativeParams.prompt = jinjaResult.prompt || ''; if (typeof jinjaResult.chat_format === 'number') nativeParams.chat_format = jinjaResult.chat_format; if (jinjaResult.grammar) nativeParams.grammar = jinjaResult.grammar; if (typeof jinjaResult.grammar_lazy === 'boolean') nativeParams.grammar_lazy = jinjaResult.grammar_lazy; if (jinjaResult.grammar_triggers) nativeParams.grammar_triggers = jinjaResult.grammar_triggers; if (jinjaResult.preserved_tokens) nativeParams.preserved_tokens = jinjaResult.preserved_tokens; if (jinjaResult.additional_stops) { if (!nativeParams.stop) nativeParams.stop = []; nativeParams.stop.push(...jinjaResult.additional_stops); } if (jinjaResult.has_media) { nativeParams.media_paths = jinjaResult.media_paths; } } else if (formattedResult.type === 'llama-chat') { const llamaChatResult = formattedResult; nativeParams.prompt = llamaChatResult.prompt || ''; if (llamaChatResult.has_media) { nativeParams.media_paths = llamaChatResult.media_paths; } } } else { nativeParams.prompt = params.prompt || ''; } // If media_paths were explicitly provided or extracted from messages, use them if (!nativeParams.media_paths && params.media_paths) { nativeParams.media_paths = params.media_paths; } // Handle structured output and grammar if (params.grammar) { // Direct GBNF grammar takes precedence nativeParams.grammar = params.grammar; } else if (nativeParams.response_format && !nativeParams.grammar) { const jsonSchema = getJsonSchema(params.response_format); if (jsonSchema) { // Try to convert JSON schema to GBNF grammar try { nativeParams.grammar = await jsonSchemaToGrammar(jsonSchema); } catch (error) { console.warn('Failed to convert JSON schema to grammar, falling back to json_schema parameter:', error); nativeParams.json_schema = JSON.stringify(jsonSchema); } } } let tokenListener = callback && LlamaCpp.addListener(EVENT_ON_TOKEN, (evt) => { const { contextId, tokenResult } = evt; if (contextId !== this.id) return; callback(tokenResult); }); if (!nativeParams.prompt) throw new Error('Prompt is required'); const promise = LlamaCpp.completion({ contextId: this.id, params: nativeParams }); return promise .then((completionResult) => { tokenListener === null || tokenListener === void 0 ? void 0 : tokenListener.remove(); tokenListener = null; return completionResult; }) .catch((err) => { tokenListener === null || tokenListener === void 0 ? 
void 0 : tokenListener.remove(); tokenListener = null; throw err; }); } stopCompletion() { return LlamaCpp.stopCompletion({ contextId: this.id }); } /** * Tokenize text or text with images * @param text Text to tokenize * @param params.media_paths Array of image paths to tokenize (if multimodal is enabled) * @returns Promise resolving to the tokenize result */ tokenize(text, { media_paths: mediaPaths, } = {}) { return LlamaCpp.tokenize({ contextId: this.id, text, imagePaths: mediaPaths }); } detokenize(tokens) { return LlamaCpp.detokenize({ contextId: this.id, tokens }); } embedding(text, params) { return LlamaCpp.embedding({ contextId: this.id, text, params: params || {} }); } /** * Rerank documents based on relevance to a query * @param query The query text to rank documents against * @param documents Array of document texts to rank * @param params Optional reranking parameters * @returns Promise resolving to an array of ranking results with scores and indices */ async rerank(query, documents, params) { const results = await LlamaCpp.rerank({ contextId: this.id, query, documents, params: params || {} }); // Sort by score descending and add document text if requested return results .map((result) => (Object.assign(Object.assign({}, result), { document: documents[result.index] }))) .sort((a, b) => b.score - a.score); } async bench(pp, tg, pl, nr) { const result = await LlamaCpp.bench({ contextId: this.id, pp, tg, pl, nr }); const [modelDesc, modelSize, modelNParams, ppAvg, ppStd, tgAvg, tgStd] = JSON.parse(result); return { modelDesc, modelSize, modelNParams, ppAvg, ppStd, tgAvg, tgStd, }; } async applyLoraAdapters(loraList) { let loraAdapters = []; if (loraList) loraAdapters = loraList.map((l) => ({ path: l.path.replace(/file:\/\//, ''), scaled: l.scaled, })); return LlamaCpp.applyLoraAdapters({ contextId: this.id, loraAdapters }); } async removeLoraAdapters() { return LlamaCpp.removeLoraAdapters({ contextId: this.id }); } async getLoadedLoraAdapters() { return LlamaCpp.getLoadedLoraAdapters({ contextId: this.id }); } /** * Initialize multimodal support with a mmproj file * @param params Parameters for multimodal support * @param params.path Path to the multimodal projector file * @param params.use_gpu Whether to use GPU * @returns Promise resolving to true if initialization was successful */ async initMultimodal({ path, use_gpu: useGpu, }) { if (path.startsWith('file://')) path = path.slice(7); return LlamaCpp.initMultimodal({ contextId: this.id, params: { path, use_gpu: useGpu !== null && useGpu !== void 0 ? 
useGpu : true, }, }); } /** * Check if multimodal support is enabled * @returns Promise resolving to true if multimodal is enabled */ async isMultimodalEnabled() { return await LlamaCpp.isMultimodalEnabled({ contextId: this.id }); } /** * Check multimodal support * @returns Promise resolving to an object with vision and audio support */ async getMultimodalSupport() { return await LlamaCpp.getMultimodalSupport({ contextId: this.id }); } /** * Release multimodal support * @returns Promise resolving to void */ async releaseMultimodal() { return await LlamaCpp.releaseMultimodal({ contextId: this.id }); } /** * Initialize TTS support with a vocoder model * @param params Parameters for TTS support * @param params.path Path to the vocoder model * @param params.n_batch Batch size for the vocoder model * @returns Promise resolving to true if initialization was successful */ async initVocoder({ path, n_batch: nBatch }) { if (path.startsWith('file://')) path = path.slice(7); return await LlamaCpp.initVocoder({ contextId: this.id, params: { path, n_batch: nBatch } }); } /** * Check if TTS support is enabled * @returns Promise resolving to true if TTS is enabled */ async isVocoderEnabled() { return await LlamaCpp.isVocoderEnabled({ contextId: this.id }); } /** * Get a formatted audio completion prompt * @param speakerJsonStr JSON string representing the speaker * @param textToSpeak Text to speak * @returns Promise resolving to the formatted audio completion result with prompt and grammar */ async getFormattedAudioCompletion(speaker, textToSpeak) { return await LlamaCpp.getFormattedAudioCompletion({ contextId: this.id, speakerJsonStr: speaker ? JSON.stringify(speaker) : '', textToSpeak, }); } /** * Get guide tokens for audio completion * @param textToSpeak Text to speak * @returns Promise resolving to the guide tokens */ async getAudioCompletionGuideTokens(textToSpeak) { return await LlamaCpp.getAudioCompletionGuideTokens({ contextId: this.id, textToSpeak }); } /** * Decode audio tokens * @param tokens Array of audio tokens * @returns Promise resolving to the decoded audio tokens */ async decodeAudioTokens(tokens) { return await LlamaCpp.decodeAudioTokens({ contextId: this.id, tokens }); } /** * Release TTS support * @returns Promise resolving to void */ async releaseVocoder() { return await LlamaCpp.releaseVocoder({ contextId: this.id }); } async release() { return LlamaCpp.releaseContext({ contextId: this.id }); } } async function toggleNativeLog(enabled) { return LlamaCpp.toggleNativeLog({ enabled }); } function addNativeLogListener(listener) { logListeners.push(listener); return { remove: () => { logListeners.splice(logListeners.indexOf(listener), 1); }, }; } async function setContextLimit(limit) { return LlamaCpp.setContextLimit({ limit }); } let contextIdCounter = 0; const contextIdRandom = () => process.env.NODE_ENV === 'test' ? 
0 : Math.floor(Math.random() * 100000); const modelInfoSkip = [ // Large fields 'tokenizer.ggml.tokens', 'tokenizer.ggml.token_type', 'tokenizer.ggml.merges', 'tokenizer.ggml.scores', ]; async function loadLlamaModelInfo(model) { let path = model; if (path.startsWith('file://')) path = path.slice(7); return LlamaCpp.modelInfo({ path, skip: modelInfoSkip }); } const poolTypeMap = { // -1 is unspecified as undefined none: 0, mean: 1, cls: 2, last: 3, rank: 4, }; async function initLlama(_a, onProgress) { var { model, is_model_asset: isModelAsset, pooling_type: poolingType, lora, lora_list: loraList } = _a, rest = tslib.__rest(_a, ["model", "is_model_asset", "pooling_type", "lora", "lora_list"]); let path = model; if (path.startsWith('file://')) path = path.slice(7); let loraPath = lora; if (loraPath === null || loraPath === void 0 ? void 0 : loraPath.startsWith('file://')) loraPath = loraPath.slice(7); let loraAdapters = []; if (loraList) loraAdapters = loraList.map((l) => ({ path: l.path.replace(/file:\/\//, ''), scaled: l.scaled, })); const contextId = contextIdCounter + contextIdRandom(); contextIdCounter += 1; let removeProgressListener = null; if (onProgress) { removeProgressListener = LlamaCpp.addListener(EVENT_ON_INIT_CONTEXT_PROGRESS, (evt) => { if (evt.contextId !== contextId) return; onProgress(evt.progress); }); } const poolType = poolTypeMap[poolingType]; if (rest.cache_type_k && !validCacheTypes.includes(rest.cache_type_k)) { console.warn(`[LlamaCpp] initLlama: Invalid cache K type: ${rest.cache_type_k}, falling back to f16`); delete rest.cache_type_k; } if (rest.cache_type_v && !validCacheTypes.includes(rest.cache_type_v)) { console.warn(`[LlamaCpp] initLlama: Invalid cache V type: ${rest.cache_type_v}, falling back to f16`); delete rest.cache_type_v; } // Log speculative decoding configuration if enabled if (rest.draft_model) { console.log(`🚀 Initializing with speculative decoding: - Main model: ${path} - Draft model: ${rest.draft_model} - Speculative samples: ${rest.speculative_samples || 3} - Mobile optimization: ${rest.mobile_speculative !== false ? 'enabled' : 'disabled'}`); } const { gpu, reasonNoGPU, model: modelDetails, androidLib, } = await LlamaCpp.initContext({ contextId, params: Object.assign({ model: path, is_model_asset: !!isModelAsset, use_progress_callback: !!onProgress, pooling_type: poolType, lora: loraPath, lora_list: loraAdapters }, rest), }).catch((err) => { removeProgressListener === null || removeProgressListener === void 0 ? void 0 : removeProgressListener.remove(); throw err; }); removeProgressListener === null || removeProgressListener === void 0 ? 
void 0 : removeProgressListener.remove(); return new LlamaContext({ contextId, gpu, reasonNoGPU, model: modelDetails, androidLib, }); } async function releaseAllLlama() { return LlamaCpp.releaseAllContexts(); } // Model download and management functions async function downloadModel(url, filename) { return LlamaCpp.downloadModel({ url, filename }); } async function getDownloadProgress(url) { return LlamaCpp.getDownloadProgress({ url }); } async function cancelDownload(url) { return LlamaCpp.cancelDownload({ url }); } async function getAvailableModels() { return LlamaCpp.getAvailableModels(); } /** * Convert a JSON schema to GBNF grammar format * @param schema JSON schema object * @returns Promise resolving to GBNF grammar string */ async function convertJsonSchemaToGrammar(schema) { return jsonSchemaToGrammar(schema); } const BuildInfo = { number: '1.0.0', commit: 'capacitor-llama-cpp', }; exports.BuildInfo = BuildInfo; exports.LLAMACPP_MTMD_DEFAULT_MEDIA_MARKER = LLAMACPP_MTMD_DEFAULT_MEDIA_MARKER; exports.LlamaContext = LlamaContext; exports.LlamaCpp = LlamaCpp; exports.RNLLAMA_MTMD_DEFAULT_MEDIA_MARKER = RNLLAMA_MTMD_DEFAULT_MEDIA_MARKER; exports.addNativeLogListener = addNativeLogListener; exports.cancelDownload = cancelDownload; exports.convertJsonSchemaToGrammar = convertJsonSchemaToGrammar; exports.downloadModel = downloadModel; exports.getAvailableModels = getAvailableModels; exports.getDownloadProgress = getDownloadProgress; exports.initLlama = initLlama; exports.loadLlamaModelInfo = loadLlamaModelInfo; exports.releaseAllLlama = releaseAllLlama; exports.setContextLimit = setContextLimit; exports.toggleNativeLog = toggleNativeLog; return exports; })({}, tslib, capacitorExports); //# sourceMappingURL=plugin.js.map
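
The bundled source above also exposes multimodal input (initMultimodal plus image_url message parts) and structured output (response_format converted to a GBNF grammar when possible). A rough sketch of those paths, assuming a vision-capable model, its mmproj projector file, and an image are already on the device at placeholder paths:

import { initLlama } from 'llama-cpp-capacitor';

// Sketch of multimodal input combined with schema-constrained output.
// All file paths are placeholders.
async function describeImageAsJson() {
  const context = await initLlama({ model: 'file:///path/to/vision-model.gguf' });

  // Attach the multimodal projector before sending image content.
  await context.initMultimodal({ path: 'file:///path/to/mmproj.gguf', use_gpu: true });
  console.log('multimodal support:', await context.getMultimodalSupport());

  const result = await context.completion({
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: 'Describe this image as JSON with "caption" and "objects" fields.' },
          { type: 'image_url', image_url: { url: 'file:///path/to/photo.jpg' } },
        ],
      },
    ],
    // The plugin converts this JSON schema to a GBNF grammar internally when it can,
    // otherwise it falls back to passing the schema through to the native layer.
    response_format: {
      type: 'json_schema',
      json_schema: {
        schema: {
          type: 'object',
          properties: {
            caption: { type: 'string' },
            objects: { type: 'array', items: { type: 'string' } },
          },
          required: ['caption', 'objects'],
        },
      },
    },
  });

  console.log('structured output:', result);

  await context.releaseMultimodal();
  await context.release();
}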