UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

283 lines 13.4 kB
import { createOpenAI } from "@ai-sdk/openai";
import { stepCountIs, streamText } from "ai";
import { BaseProvider } from "../core/baseProvider.js";
import { DEFAULT_MAX_STEPS } from "../core/constants.js";
import { streamAnalyticsCollector } from "../core/streamAnalytics.js";
import { isNeuroLink } from "../neurolink.js";
import { createProxyFetch } from "../proxy/proxyFetch.js";
import { createLoggingFetch } from "../utils/loggingFetch.js";
import { tracers, ATTR, withClientStreamSpan } from "../telemetry/index.js";
import { NetworkError, ProviderError } from "../types/index.js";
import { logger } from "../utils/logger.js";
import { composeAbortSignals, createTimeoutController, TimeoutError, } from "../utils/timeout.js";
import { emitToolEndFromStepFinish } from "../utils/toolEndEmitter.js";
import { resolveToolChoice } from "../utils/toolChoice.js";
import { toAnalyticsStreamResult } from "./providerTypeUtils.js";

const LLAMACPP_DEFAULT_BASE_URL = "http://localhost:8080/v1";
const LLAMACPP_PLACEHOLDER_KEY = "llamacpp";
const FALLBACK_MODEL = "loaded-model";
// Upper bound on the /models discovery probe so a hung server cannot stall
// provider initialization.
const DISCOVERY_TIMEOUT_MS = 5000;
// validateConfiguration() retry policy.
const VALIDATION_ATTEMPTS = 3;
const VALIDATION_BACKOFF_MS = 500;
const VALIDATION_PROBE_TIMEOUT_MS = 2000;

/**
 * Resolve the llama-server base URL from the environment.
 * @returns {string} LLAMACPP_BASE_URL when set and non-empty, otherwise the
 *   localhost default.
 */
const getLlamaCppBaseURL = () => {
    return process.env.LLAMACPP_BASE_URL || LLAMACPP_DEFAULT_BASE_URL;
};

/**
 * Build the bearer auth header for direct (non-AI-SDK) requests.
 * @param {string} apiKey - Configured API key (may be the placeholder).
 * @returns {{Authorization: string} | undefined} Header object, or undefined
 *   when the key is empty or is the built-in placeholder.
 */
const buildAuthHeaders = (apiKey) => {
    return apiKey && apiKey !== LLAMACPP_PLACEHOLDER_KEY
        ? { Authorization: `Bearer ${apiKey}` }
        : undefined;
};

/**
 * Model name to report in telemetry, analytics and configuration.
 * @param {{modelName?: string, discoveredModel?: string}} provider
 * @returns {string} The first non-empty of modelName, discoveredModel,
 *   FALLBACK_MODEL.
 */
const reportedModelName = (provider) => {
    return provider.modelName || provider.discoveredModel || FALLBACK_MODEL;
};

/**
 * llama.cpp Provider
 * Wraps a llama-server process (https://github.com/ggerganov/llama.cpp) that
 * exposes an OpenAI-compatible API at http://localhost:8080/v1 by default.
 * llama-server hosts ONE model loaded at startup; /v1/models returns just that.
 */
export class LlamaCppProvider extends BaseProvider {
    model;
    // Caller-supplied model name — never overwritten by discovery, so a
    // FALLBACK_MODEL miss can't poison the explicit-vs-discover branch on
    // subsequent calls.
    requestedModelName;
    baseURL;
    apiKey;
    discoveredModel;
    llamaCppClient;

    /**
     * @param {string} [modelName] - Explicit model id; empty/undefined triggers
     *   discovery via /models on first use.
     * @param {unknown} [sdk] - Forwarded to BaseProvider only when isNeuroLink()
     *   accepts it.
     * @param {unknown} [_region] - Unused.
     * @param {{baseURL?: string, apiKey?: string}} [credentials] - Overrides
     *   for the environment-derived base URL and API key.
     */
    constructor(modelName, sdk, _region, credentials) {
        const validatedNeurolink = isNeuroLink(sdk) ? sdk : undefined;
        super(modelName, "llamacpp", validatedNeurolink);
        this.requestedModelName = modelName;
        this.baseURL = credentials?.baseURL ?? getLlamaCppBaseURL();
        // llama-server doesn't authenticate, but the AI SDK's createOpenAI() requires
        // an apiKey. Allow override via credentials/env for users who run llama-server
        // behind an auth-proxying reverse-proxy.
        this.apiKey =
            credentials?.apiKey ??
                process.env.LLAMACPP_API_KEY ??
                LLAMACPP_PLACEHOLDER_KEY;
        this.llamaCppClient = createOpenAI({
            baseURL: this.baseURL,
            apiKey: this.apiKey,
            fetch: createLoggingFetch("llamacpp"),
        });
        logger.debug("llama.cpp Provider initialized", {
            modelName: this.modelName,
            providerName: this.providerName,
            baseURL: this.baseURL,
        });
    }

    /**
     * List the model ids advertised by `${baseURL}/models`.
     * @param {AbortSignal} [callerSignal] - Optional caller abort signal,
     *   composed with a fixed discovery timeout.
     * @returns {Promise<string[]>} Non-empty string ids from the response.
     * @throws {Error} On a non-2xx response or a payload without a `data` array;
     *   fetch/abort errors propagate unchanged.
     */
    async getAvailableModels(callerSignal) {
        const url = `${this.baseURL.replace(/\/$/, "")}/models`;
        // Use the proxy-aware fetch + bearer auth so users running llama-server
        // behind an auth-proxying reverse-proxy can still discover the model.
        // Compose the caller's request signal (per-request timeout / abort) with
        // a fixed discovery cap so cancellation propagates AND a hung server
        // can't stall provider initialization.
        const proxyFetch = createProxyFetch();
        const discoveryTimeout = AbortSignal.timeout(DISCOVERY_TIMEOUT_MS);
        const composedSignal = callerSignal
            ? AbortSignal.any([callerSignal, discoveryTimeout])
            : discoveryTimeout;
        const response = await proxyFetch(url, {
            headers: buildAuthHeaders(this.apiKey),
            signal: composedSignal,
        });
        if (!response.ok) {
            throw new Error(`llama-server /v1/models returned ${response.status}: ${response.statusText}`);
        }
        const payload = await response.json();
        const entries = payload?.data;
        // Guard the shape explicitly: a proxy error page or a non-OpenAI server
        // would otherwise surface as an opaque "cannot read properties" TypeError.
        if (!Array.isArray(entries)) {
            throw new Error("llama-server /v1/models returned an unexpected payload (missing `data` array)");
        }
        // Drop entries without a usable id so callers never receive `undefined`
        // as a model name.
        return entries
            .map((entry) => entry?.id)
            .filter((id) => typeof id === "string" && id !== "");
    }

    /**
     * Resolve (and, on success, memoize) the AI SDK chat model.
     * Uses the caller-supplied model name when present; otherwise probes
     * /models and falls back to FALLBACK_MODEL when discovery yields nothing
     * or fails.
     * @param {AbortSignal} [signal] - Forwarded to the discovery probe.
     * @returns {Promise<unknown>} The model returned by llamaCppClient.chat().
     */
    async getAISDKModel(signal) {
        if (this.model) {
            return this.model;
        }
        let modelToUse;
        let discoverySucceeded = false;
        // Use requestedModelName, not this.modelName — refreshHandlersForModel()
        // mutates this.modelName, so on a retry after a discovery miss the
        // FALLBACK_MODEL would look like an explicit user choice. See lmStudio.ts.
        const explicit = this.requestedModelName;
        if (explicit && explicit.trim() !== "") {
            modelToUse = explicit;
            discoverySucceeded = true; // explicit user choice — treat as success
        }
        else {
            try {
                const models = await this.getAvailableModels(signal);
                if (models.length > 0) {
                    this.discoveredModel = models[0];
                    modelToUse = this.discoveredModel;
                    discoverySucceeded = true;
                    logger.info(`llama.cpp loaded model: ${modelToUse}`);
                }
                else {
                    modelToUse = FALLBACK_MODEL;
                }
            }
            catch (error) {
                logger.warn(`llama.cpp model discovery failed: ${error instanceof Error ? error.message : String(error)}`);
                modelToUse = FALLBACK_MODEL;
            }
        }
        // Persist resolved model on the instance and rebuild the composed
        // handlers (TelemetryHandler, MessageBuilder, etc.) so pricing /
        // telemetry / span attributes report the discovered model name. Plain
        // assignment to `this.modelName` is not enough — handlers cached the
        // pre-discovery value at construction time.
        this.refreshHandlersForModel(modelToUse);
        // .chat() — llama-server exposes /v1/chat/completions, not /v1/responses
        const resolvedModel = this.llamaCppClient.chat(modelToUse);
        // Only memoize on success — see lmStudio.ts for the same rationale: a
        // discovery miss should let the next call retry instead of being stuck
        // on FALLBACK_MODEL until the provider instance is recreated.
        if (discoverySucceeded) {
            this.model = resolvedModel;
        }
        return resolvedModel;
    }

    /**
     * Stream entry point: resolves the model, then runs executeStreamInner()
     * inside a client stream span.
     * @param {object} options - Stream options (abortSignal, tools, etc.).
     * @param {unknown} [_analysisSchema] - Unused.
     * @returns {Promise<object>} Result of executeStreamInner() with its
     *   `stream` replaced by the span-wrapped stream.
     */
    async executeStream(options, _analysisSchema) {
        // Resolve the llama.cpp model BEFORE opening the span so OTEL
        // attributes, MessageBuilder, and downstream image/tool adapters all see
        // the discovered model id rather than the empty pre-discovery placeholder.
        // Pass the caller's abort signal so user cancellation / per-request
        // timeouts are honored during the discovery probe.
        await this.getAISDKModel(options.abortSignal);
        return withClientStreamSpan({
            name: "neurolink.provider.stream",
            tracer: tracers.provider,
            attributes: {
                [ATTR.GEN_AI_SYSTEM]: "llamacpp",
                [ATTR.GEN_AI_MODEL]: reportedModelName(this),
                [ATTR.GEN_AI_OPERATION]: "stream",
                [ATTR.NL_STREAM_MODE]: true,
            },
        }, async () => this.executeStreamInner(options), (r) => r.stream, (r, wrapped) => ({ ...r, stream: wrapped }));
    }

    /**
     * Build messages/tools and start the streamText() call.
     * @param {object} options - Stream options.
     * @returns {Promise<{stream: unknown, provider: string, model: string,
     *   analytics: unknown, metadata: {startTime: number, streamId: string}}>}
     * @throws Whatever handleProviderError() maps the underlying error to.
     */
    async executeStreamInner(options) {
        this.validateStreamOptions(options);
        const startTime = Date.now();
        const timeout = this.getTimeout(options);
        const timeoutController = createTimeoutController(timeout, this.providerName, "stream");
        try {
            const shouldUseTools = !options.disableTools && this.supportsTools();
            const tools = shouldUseTools
                ? options.tools || (await this.getAllTools())
                : {};
            // Resolve the AI SDK model BEFORE building messages so message/image
            // adapters see the same handlers/model that streamText will use. See
            // lmStudio.ts for the same rationale.
            const model = await this.getAISDKModelWithMiddleware(options);
            const messages = await this.buildMessagesForStream(options);
            const result = await streamText({
                model,
                messages,
                temperature: options.temperature,
                maxOutputTokens: options.maxTokens,
                tools,
                stopWhen: stepCountIs(options.maxSteps || DEFAULT_MAX_STEPS),
                toolChoice: resolveToolChoice(options, tools, shouldUseTools),
                abortSignal: composeAbortSignals(options.abortSignal, timeoutController?.controller.signal),
                experimental_telemetry: this.telemetryHandler.getTelemetryConfig(options),
                experimental_repairToolCall: this.getToolCallRepairFn(options),
                onStepFinish: ({ toolCalls, toolResults }) => {
                    emitToolEndFromStepFinish(this.neurolink?.getEventEmitter(), toolResults);
                    // Storage is fire-and-forget; a failure is logged, not thrown.
                    this.handleToolExecutionStorage(toolCalls, toolResults, options, new Date()).catch((error) => {
                        logger.warn("[LlamaCppProvider] Failed to store tool executions", {
                            provider: this.providerName,
                            error: error instanceof Error ? error.message : String(error),
                        });
                    });
                },
            });
            // NOTE(review): cleanup runs as soon as streamText() returns, which
            // may be before the stream has been consumed — confirm against
            // createTimeoutController() whether the timeout is meant to bound
            // the whole stream.
            timeoutController?.cleanup();
            const transformedStream = this.createTextStream(result);
            const analyticsPromise = streamAnalyticsCollector.createAnalytics(this.providerName, reportedModelName(this), toAnalyticsStreamResult(result), Date.now() - startTime, {
                requestId: `llamacpp-stream-${Date.now()}`,
                streamingMode: true,
            });
            return {
                stream: transformedStream,
                provider: this.providerName,
                model: reportedModelName(this),
                analytics: analyticsPromise,
                metadata: { startTime, streamId: `llamacpp-${Date.now()}` },
            };
        }
        catch (error) {
            timeoutController?.cleanup();
            throw this.handleProviderError(error);
        }
    }

    /** @returns {string} The provider name held by this instance. */
    getProviderName() {
        return this.providerName;
    }

    /** @returns {string} LLAMACPP_MODEL from the environment, or "" when unset. */
    getDefaultModel() {
        return process.env.LLAMACPP_MODEL || "";
    }

    /**
     * Map a raw error to a NetworkError / ProviderError with llama.cpp-specific
     * guidance.
     * @param {unknown} error - The caught value.
     * @returns {NetworkError | ProviderError}
     */
    formatProviderError(error) {
        if (error instanceof TimeoutError) {
            return new NetworkError(`Request timed out: ${error.message}`, "llamacpp");
        }
        const errorRecord = error;
        const message = typeof errorRecord?.message === "string"
            ? errorRecord.message
            : "Unknown error";
        const cause = errorRecord?.cause ?? {};
        const code = errorRecord?.code ?? cause?.code;
        const isUnreachable = code === "ECONNREFUSED" ||
            message.includes("ECONNREFUSED") ||
            message.includes("Failed to fetch") ||
            message.includes("fetch failed");
        if (isUnreachable) {
            return new NetworkError(`llama.cpp server not reachable at ${this.baseURL}. Start it with: ./llama-server -m model.gguf --port 8080`, "llamacpp");
        }
        // Prefer a structured status code when the error carries one; fall back
        // to a word-bounded match so unrelated numbers such as "4000" or
        // "1400ms" in the message are not mistaken for an HTTP 400.
        const statusCode = errorRecord?.statusCode ?? cause?.statusCode;
        if (statusCode === 400 || /\b400\b/.test(message)) {
            return new ProviderError("llama.cpp rejected the request. Common cause: model doesn't support tools (start llama-server with --jinja for tool support).", "llamacpp");
        }
        return new ProviderError(`llama.cpp error: ${message}`, "llamacpp");
    }

    /**
     * Probe the server's /health and /models endpoints.
     * @returns {Promise<boolean>} true as soon as either endpoint answers 2xx,
     *   false after all attempts fail. Never throws for probe failures.
     */
    async validateConfiguration() {
        // Retry up to 3x with 500ms backoff. llama-server can be briefly unresponsive
        // under load (CPU inference saturates the event loop). Use the proxy-aware
        // fetch + bearer auth header so reverse-proxied setups still validate.
        const healthURL = this.baseURL.replace(/\/v1\/?$/, "/health");
        const modelsURL = `${this.baseURL.replace(/\/$/, "")}/models`;
        const proxyFetch = createProxyFetch();
        const headers = buildAuthHeaders(this.apiKey);
        for (let attempt = 0; attempt < VALIDATION_ATTEMPTS; attempt++) {
            for (const probeURL of [healthURL, modelsURL]) {
                try {
                    const probe = await proxyFetch(probeURL, {
                        headers,
                        signal: AbortSignal.timeout(VALIDATION_PROBE_TIMEOUT_MS),
                    });
                    if (probe.ok) {
                        return true;
                    }
                }
                catch (error) {
                    // Best-effort probe: record the failure and try the next
                    // endpoint / attempt instead of throwing.
                    logger.debug("llama.cpp validation probe failed", {
                        url: probeURL,
                        attempt: attempt + 1,
                        error: error instanceof Error ? error.message : String(error),
                    });
                }
            }
            // Back off only between attempts — sleeping after the final one
            // would just delay the negative result.
            if (attempt < VALIDATION_ATTEMPTS - 1) {
                await new Promise((resolve) => setTimeout(resolve, VALIDATION_BACKOFF_MS));
            }
        }
        return false;
    }

    /**
     * @returns {{provider: string, model: string, defaultModel: string,
     *   baseURL: string}} Snapshot of the provider's effective configuration.
     */
    getConfiguration() {
        return {
            provider: this.providerName,
            model: reportedModelName(this),
            defaultModel: this.getDefaultModel(),
            baseURL: this.baseURL,
        };
    }
}
export default LlamaCppProvider;
//# sourceMappingURL=llamaCpp.js.map