@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
283 lines • 13.4 kB
JavaScript
import { createOpenAI } from "@ai-sdk/openai";
import { stepCountIs, streamText } from "ai";
import { BaseProvider } from "../core/baseProvider.js";
import { DEFAULT_MAX_STEPS } from "../core/constants.js";
import { streamAnalyticsCollector } from "../core/streamAnalytics.js";
import { isNeuroLink } from "../neurolink.js";
import { createProxyFetch } from "../proxy/proxyFetch.js";
import { createLoggingFetch } from "../utils/loggingFetch.js";
import { tracers, ATTR, withClientStreamSpan } from "../telemetry/index.js";
import { NetworkError, ProviderError } from "../types/index.js";
import { logger } from "../utils/logger.js";
import { composeAbortSignals, createTimeoutController, TimeoutError, } from "../utils/timeout.js";
import { emitToolEndFromStepFinish } from "../utils/toolEndEmitter.js";
import { resolveToolChoice } from "../utils/toolChoice.js";
import { toAnalyticsStreamResult } from "./providerTypeUtils.js";
// Base URL used when neither `credentials.baseURL` nor the LLAMACPP_BASE_URL
// environment variable supplies one.
const LLAMACPP_DEFAULT_BASE_URL = "http://localhost:8080/v1";
// Stand-in API key used when no real key is configured. Requests built in this
// file omit the Authorization header whenever the key equals this placeholder.
const LLAMACPP_PLACEHOLDER_KEY = "llamacpp";
// Model id used when discovery via /models fails or returns no models.
const FALLBACK_MODEL = "loaded-model";
/**
 * Resolve the llama-server base URL.
 *
 * @returns {string} The LLAMACPP_BASE_URL environment variable when it is set
 *   to a non-empty value, otherwise the built-in default base URL.
 */
const getLlamaCppBaseURL = () => {
    const envBaseURL = process.env.LLAMACPP_BASE_URL;
    if (envBaseURL) {
        return envBaseURL;
    }
    return LLAMACPP_DEFAULT_BASE_URL;
};
/**
* llama.cpp Provider
* Wraps a llama-server process (https://github.com/ggerganov/llama.cpp) that
* exposes an OpenAI-compatible API at http://localhost:8080/v1 by default.
* llama-server hosts ONE model loaded at startup; /v1/models returns just that.
*/
export class LlamaCppProvider extends BaseProvider {
    // Memoized AI SDK chat model. Only populated once a model name has been
    // resolved successfully (explicit choice or successful discovery).
    model;
    // Caller-supplied model name — never overwritten by discovery, so a
    // FALLBACK_MODEL miss can't poison the explicit-vs-discover branch on
    // subsequent calls.
    requestedModelName;
    // Base URL of the OpenAI-compatible endpoint (credentials > env > default).
    baseURL;
    // API key handed to the AI SDK client; may be the placeholder key.
    apiKey;
    // First model id returned by a successful /models discovery, if any.
    discoveredModel;
    // AI SDK OpenAI-compatible client bound to `baseURL` / `apiKey`.
    llamaCppClient;
    /**
     * @param modelName - Explicit model name; empty/undefined triggers discovery.
     * @param sdk - Optional NeuroLink instance; ignored unless `isNeuroLink(sdk)`.
     * @param _region - Unused.
     * @param credentials - Optional `{ baseURL, apiKey }` overrides.
     */
    constructor(modelName, sdk, _region, credentials) {
        const validatedNeurolink = isNeuroLink(sdk) ? sdk : undefined;
        super(modelName, "llamacpp", validatedNeurolink);
        this.requestedModelName = modelName;
        this.baseURL = credentials?.baseURL ?? getLlamaCppBaseURL();
        // llama-server doesn't authenticate, but the AI SDK's createOpenAI() requires
        // an apiKey. Allow override via credentials/env for users who run llama-server
        // behind an auth-proxying reverse-proxy.
        this.apiKey =
            credentials?.apiKey ??
                process.env.LLAMACPP_API_KEY ??
                LLAMACPP_PLACEHOLDER_KEY;
        this.llamaCppClient = createOpenAI({
            baseURL: this.baseURL,
            apiKey: this.apiKey,
            fetch: createLoggingFetch("llamacpp"),
        });
        logger.debug("llama.cpp Provider initialized", {
            modelName: this.modelName,
            providerName: this.providerName,
            baseURL: this.baseURL,
        });
    }
    /**
     * Query `<baseURL>/models` and return the ids of the models it lists.
     *
     * @param callerSignal - Optional abort signal; combined with a fixed 5s cap.
     * @returns {Promise<string[]>} Non-empty string model ids from the response.
     * @throws {Error} When the response is not OK or the payload has no `data` array.
     */
    async getAvailableModels(callerSignal) {
        const url = `${this.baseURL.replace(/\/$/, "")}/models`;
        // Use the proxy-aware fetch + bearer auth so users running llama-server
        // behind an auth-proxying reverse-proxy can still discover the model.
        // Compose the caller's request signal (per-request timeout / abort) with
        // a fixed 5s discovery cap so cancellation propagates AND a hung server
        // can't stall provider initialization.
        const proxyFetch = createProxyFetch();
        const discoveryTimeout = AbortSignal.timeout(5000);
        const composedSignal = callerSignal
            ? AbortSignal.any([callerSignal, discoveryTimeout])
            : discoveryTimeout;
        const response = await proxyFetch(url, {
            headers: this.apiKey && this.apiKey !== LLAMACPP_PLACEHOLDER_KEY
                ? { Authorization: `Bearer ${this.apiKey}` }
                : undefined,
            signal: composedSignal,
        });
        if (!response.ok) {
            throw new Error(`llama-server /v1/models returned ${response.status}: ${response.statusText}`);
        }
        const payload = await response.json();
        // Fail with a descriptive message instead of an opaque TypeError when the
        // endpoint answers 200 with something that is not an OpenAI-style list.
        if (!Array.isArray(payload?.data)) {
            throw new Error("llama-server /v1/models returned an unexpected payload (expected a `data` array)");
        }
        // Drop entries without a usable id so callers never receive `undefined`
        // as a model name.
        return payload.data
            .map((entry) => entry?.id)
            .filter((id) => typeof id === "string" && id !== "");
    }
    /**
     * Resolve the AI SDK chat model, discovering the model name when the caller
     * did not supply one.
     *
     * @param signal - Optional abort signal forwarded to model discovery.
     * @returns The chat model for the explicit, discovered, or fallback name.
     */
    async getAISDKModel(signal) {
        if (this.model) {
            return this.model;
        }
        let modelToUse;
        let discoverySucceeded = false;
        // Use requestedModelName, not this.modelName — refreshHandlersForModel()
        // mutates this.modelName, so on a retry after a discovery miss the
        // FALLBACK_MODEL would look like an explicit user choice. See lmStudio.ts.
        const explicit = this.requestedModelName;
        if (explicit && explicit.trim() !== "") {
            modelToUse = explicit;
            discoverySucceeded = true; // explicit user choice — treat as success
        }
        else {
            try {
                const models = await this.getAvailableModels(signal);
                if (models.length > 0) {
                    this.discoveredModel = models[0];
                    modelToUse = this.discoveredModel;
                    discoverySucceeded = true;
                    logger.info(`llama.cpp loaded model: ${modelToUse}`);
                }
                else {
                    modelToUse = FALLBACK_MODEL;
                }
            }
            catch (error) {
                // Best-effort discovery: log and continue with the fallback name.
                logger.warn(`llama.cpp model discovery failed: ${error instanceof Error ? error.message : String(error)}`);
                modelToUse = FALLBACK_MODEL;
            }
        }
        // Persist resolved model on the instance and rebuild the composed
        // handlers (TelemetryHandler, MessageBuilder, etc.) so pricing /
        // telemetry / span attributes report the discovered model name. Plain
        // assignment to `this.modelName` is not enough — handlers cached the
        // pre-discovery value at construction time.
        this.refreshHandlersForModel(modelToUse);
        // .chat() — llama-server exposes /v1/chat/completions, not /v1/responses
        const resolvedModel = this.llamaCppClient.chat(modelToUse);
        // Only memoize on success — see lmStudio.ts for the same rationale: a
        // discovery miss should let the next call retry instead of being stuck
        // on FALLBACK_MODEL until the provider instance is recreated.
        if (discoverySucceeded) {
            this.model = resolvedModel;
        }
        return resolvedModel;
    }
    /**
     * Run a streaming request inside a provider stream span.
     *
     * @param options - Stream options; `options.abortSignal` is honored during
     *   model discovery.
     * @param _analysisSchema - Unused.
     * @returns The result of `executeStreamInner` with its stream wrapped by the span helper.
     */
    async executeStream(options, _analysisSchema) {
        // Resolve the llama.cpp model BEFORE opening the span so OTEL
        // attributes, MessageBuilder, and downstream image/tool adapters all see
        // the discovered model id rather than the empty pre-discovery placeholder.
        // Pass the caller's abort signal so user cancellation / per-request
        // timeouts are honored during the discovery probe.
        await this.getAISDKModel(options.abortSignal);
        return withClientStreamSpan({
            name: "neurolink.provider.stream",
            tracer: tracers.provider,
            attributes: {
                [ATTR.GEN_AI_SYSTEM]: "llamacpp",
                [ATTR.GEN_AI_MODEL]: this.modelName || this.discoveredModel || FALLBACK_MODEL,
                [ATTR.GEN_AI_OPERATION]: "stream",
                [ATTR.NL_STREAM_MODE]: true,
            },
        }, async () => this.executeStreamInner(options), (r) => r.stream, (r, wrapped) => ({ ...r, stream: wrapped }));
    }
    /**
     * Build tools, model and messages, start `streamText`, and package the result.
     *
     * @param options - Stream options (tools, temperature, maxTokens, maxSteps, abortSignal, ...).
     * @returns `{ stream, provider, model, analytics, metadata }`.
     * @throws The value produced by `handleProviderError` for any failure.
     */
    async executeStreamInner(options) {
        this.validateStreamOptions(options);
        const startTime = Date.now();
        const timeout = this.getTimeout(options);
        const timeoutController = createTimeoutController(timeout, this.providerName, "stream");
        try {
            const shouldUseTools = !options.disableTools && this.supportsTools();
            const tools = shouldUseTools
                ? options.tools || (await this.getAllTools())
                : {};
            // Resolve the AI SDK model BEFORE building messages so message/image
            // adapters see the same handlers/model that streamText will use. See
            // lmStudio.ts for the same rationale.
            const model = await this.getAISDKModelWithMiddleware(options);
            const messages = await this.buildMessagesForStream(options);
            const result = await streamText({
                model,
                messages,
                temperature: options.temperature,
                maxOutputTokens: options.maxTokens,
                tools,
                stopWhen: stepCountIs(options.maxSteps || DEFAULT_MAX_STEPS),
                toolChoice: resolveToolChoice(options, tools, shouldUseTools),
                abortSignal: composeAbortSignals(options.abortSignal, timeoutController?.controller.signal),
                experimental_telemetry: this.telemetryHandler.getTelemetryConfig(options),
                experimental_repairToolCall: this.getToolCallRepairFn(options),
                onStepFinish: ({ toolCalls, toolResults }) => {
                    emitToolEndFromStepFinish(this.neurolink?.getEventEmitter(), toolResults);
                    // Storage is fire-and-forget; failures are logged, never thrown.
                    this.handleToolExecutionStorage(toolCalls, toolResults, options, new Date()).catch((error) => {
                        logger.warn("[LlamaCppProvider] Failed to store tool executions", {
                            provider: this.providerName,
                            error: error instanceof Error ? error.message : String(error),
                        });
                    });
                },
            });
            const transformedStream = this.createTextStream(result);
            const analyticsPromise = streamAnalyticsCollector.createAnalytics(this.providerName, this.modelName || this.discoveredModel || FALLBACK_MODEL, toAnalyticsStreamResult(result), Date.now() - startTime, {
                requestId: `llamacpp-stream-${Date.now()}`,
                streamingMode: true,
            });
            return {
                stream: transformedStream,
                provider: this.providerName,
                model: this.modelName || this.discoveredModel || FALLBACK_MODEL,
                analytics: analyticsPromise,
                metadata: { startTime, streamId: `llamacpp-${Date.now()}` },
            };
        }
        catch (error) {
            throw this.handleProviderError(error);
        }
        finally {
            // Single cleanup point for both the success and the failure path.
            // NOTE(review): cleanup runs as soon as this method returns, i.e. before
            // the returned stream is consumed — presumably the timeout is meant to
            // cover stream setup only; confirm against createTimeoutController.
            timeoutController?.cleanup();
        }
    }
    /** @returns The provider name held by this instance. */
    getProviderName() {
        return this.providerName;
    }
    /** @returns {string} The LLAMACPP_MODEL environment variable, or "" when unset/empty. */
    getDefaultModel() {
        return process.env.LLAMACPP_MODEL || "";
    }
    /**
     * Translate a raw failure into a NetworkError or ProviderError.
     *
     * @param error - Any thrown value.
     * @returns NetworkError for timeouts / unreachable server, ProviderError otherwise.
     */
    formatProviderError(error) {
        if (error instanceof TimeoutError) {
            return new NetworkError(`Request timed out: ${error.message}`, "llamacpp");
        }
        const errorRecord = error;
        const message = typeof errorRecord?.message === "string"
            ? errorRecord.message
            : "Unknown error";
        const cause = errorRecord?.cause ?? {};
        const code = (errorRecord?.code ?? cause?.code);
        if (code === "ECONNREFUSED" ||
            message.includes("ECONNREFUSED") ||
            message.includes("Failed to fetch") ||
            message.includes("fetch failed")) {
            return new NetworkError(`llama.cpp server not reachable at ${this.baseURL}. ` +
                "Start it with: ./llama-server -m model.gguf --port 8080", "llamacpp");
        }
        // Prefer a structured HTTP status when the error carries one. Only when no
        // status is available fall back to the message, and then require "400" as
        // a standalone token so text such as "4000 tokens" is not misclassified.
        const statusCode = errorRecord?.statusCode ?? cause?.statusCode;
        const isBadRequest = statusCode == null
            ? /\b400\b/.test(message)
            : statusCode === 400;
        if (isBadRequest) {
            return new ProviderError("llama.cpp rejected the request. Common cause: model doesn't support tools (start llama-server with --jinja for tool support).", "llamacpp");
        }
        return new ProviderError(`llama.cpp error: ${message}`, "llamacpp");
    }
    /**
     * Probe the server's health and models endpoints.
     *
     * @returns {Promise<boolean>} true as soon as either probe answers OK,
     *   false after three failed attempts.
     */
    async validateConfiguration() {
        // Retry up to 3x with 500ms backoff. llama-server can be briefly unresponsive
        // under load (CPU inference saturates the event loop). Use the proxy-aware
        // fetch + bearer auth header so reverse-proxied setups still validate.
        const maxAttempts = 3;
        const backoffMs = 500;
        const healthURL = this.baseURL.replace(/\/v1\/?$/, "/health");
        const modelsURL = `${this.baseURL.replace(/\/$/, "")}/models`;
        const proxyFetch = createProxyFetch();
        const headers = this.apiKey && this.apiKey !== LLAMACPP_PLACEHOLDER_KEY
            ? { Authorization: `Bearer ${this.apiKey}` }
            : undefined;
        for (let attempt = 0; attempt < maxAttempts; attempt++) {
            // Try /health first, then /models; either one answering OK is enough.
            for (const probeURL of [healthURL, modelsURL]) {
                try {
                    const probe = await proxyFetch(probeURL, {
                        headers,
                        signal: AbortSignal.timeout(2000),
                    });
                    if (probe.ok) {
                        return true;
                    }
                }
                catch {
                    /* deliberate best-effort: a failed probe just moves on to the next one */
                }
            }
            // Back off only when another attempt follows; sleeping after the final
            // attempt would merely delay the negative result.
            if (attempt < maxAttempts - 1) {
                await new Promise((resolve) => setTimeout(resolve, backoffMs));
            }
        }
        return false;
    }
    /** @returns Snapshot of provider name, effective model, default model and base URL. */
    getConfiguration() {
        return {
            provider: this.providerName,
            model: this.modelName || this.discoveredModel || FALLBACK_MODEL,
            defaultModel: this.getDefaultModel(),
            baseURL: this.baseURL,
        };
    }
}
export default LlamaCppProvider;
//# sourceMappingURL=llamaCpp.js.map