UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

372 lines (371 loc) 14.4 kB
/** * Google Gemini Live Voice API Handler * * Implementation of bidirectional voice communication using Gemini's Live API. * * @module voice/providers/GeminiLive */ import { logger } from "../../utils/logger.js"; import { RealtimeError } from "../errors.js"; import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js"; /** * Google Gemini Live Voice API Handler * * Implements bidirectional voice communication with Gemini's Live API. * * @see https://ai.google.dev/gemini-api/docs/live */ export class GeminiLive extends BaseRealtimeHandler { name = "gemini-live"; apiKey; ws = null; audioChunkIndex = 0; pendingFunctionCalls = new Map(); constructor(apiKey) { super(); // Accept GOOGLE_AI_API_KEY / GEMINI_API_KEY as aliases — `.env.example` // documents those as the canonical Google credentials, so insisting on // GOOGLE_API_KEY here was a setup footgun (Copilot review). const resolvedKey = (apiKey ?? process.env.GOOGLE_API_KEY ?? process.env.GOOGLE_AI_API_KEY ?? process.env.GEMINI_API_KEY ?? "").trim(); this.apiKey = resolvedKey.length > 0 ? resolvedKey : null; } isConfigured() { return this.apiKey !== null; } getSupportedFormats() { return ["opus", "wav"]; } async connect(config) { if (!this.apiKey) { throw RealtimeError.providerNotConfigured("gemini-live"); } if (this.isConnected()) { throw RealtimeError.sessionAlreadyActive("gemini-live"); } this.emitStateChange("connecting"); try { // Import WebSocket const { default: WebSocket } = await import("ws"); // Determine model const model = config.model ?? "gemini-2.5-flash-native-audio-preview-09-2025"; // Connect to Gemini Live API const wsUrl = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${this.apiKey}`; this.ws = new WebSocket(wsUrl); // Issue 9: capture a local reference so the closure below doesn't need // a non-null assertion on `this.ws`. The local `ws` survives even if a // timeout nulls `this.ws` — that's intentional, the closure should // still be able to detach its own listeners on the same socket. const ws = this.ws; // Wait for connection await new Promise((resolve, reject) => { const openHandler = () => { clearTimeout(timeout); ws.off("error", errorHandler); resolve(); }; const errorHandler = (err) => { clearTimeout(timeout); ws.off("open", openHandler); reject(err); }; const timeout = setTimeout(() => { // C1: close the half-opened socket and detach temp listeners so we // don't leak the WebSocket or its closures on connection timeout. // NEW7: removing the temp listeners also prevents accumulation // across reconnect attempts (they'd otherwise hang forever and // silently call reject() on a settled promise). ws.off("open", openHandler); ws.off("error", errorHandler); ws.terminate(); this.ws = null; reject(new Error("Connection timeout")); }, config.timeout ?? 30000); ws.on("open", openHandler); ws.on("error", errorHandler); }); this.ws.on("close", () => { this.emitStateChange("disconnected"); this.session = null; }); this.ws.on("error", (err) => { this.emitError(err); }); // Send setup message await this.sendSetup(config, model); // Wait for setup complete BEFORE attaching the permanent message handler, // otherwise early audio/text data arriving during setup race window is // dispatched to handleMessage before consumers register their handlers. await this.waitForSetupComplete(); // Set up message handler — only after setup complete. this.ws.on("message", (data) => { this.handleMessage(data); }); // Generate session ID const sessionId = `gemini-${Date.now()}`; // Create session object this.session = this.createSession(sessionId, config); this.emitStateChange("connected"); logger.info(`[GeminiLiveHandler] Connected to session: ${sessionId}`); return this.session; } catch (err) { this.emitStateChange("error"); const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error"); throw RealtimeError.connectionFailed(errorMessage, "gemini-live", err instanceof Error ? err : undefined); } } async disconnect() { if (!this.ws) { return; } this.emitStateChange("disconnecting"); try { this.ws.close(); this.ws = null; this.session = null; this.audioChunkIndex = 0; this.pendingFunctionCalls.clear(); this.emitStateChange("disconnected"); logger.info("[GeminiLiveHandler] Disconnected"); } catch (err) { const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error"); throw RealtimeError.protocolError(`Disconnect failed: ${errorMessage}`, "gemini-live", err instanceof Error ? err : undefined); } } async sendAudio(audio) { if (!this.ws || !this.isConnected()) { throw RealtimeError.sessionNotActive("gemini-live"); } const audioBuffer = Buffer.isBuffer(audio) ? audio : audio.data; // Send audio as realtime input const message = { realtimeInput: { mediaChunks: [ { mimeType: "audio/pcm;rate=16000", data: audioBuffer.toString("base64"), }, ], }, }; this.ws.send(JSON.stringify(message)); } async sendText(text) { if (!this.ws || !this.isConnected()) { throw RealtimeError.sessionNotActive("gemini-live"); } // Send text as client content const message = { clientContent: { turns: [ { role: "user", parts: [{ text }], }, ], turnComplete: true, }, }; this.ws.send(JSON.stringify(message)); } async triggerResponse() { // Gemini automatically generates responses based on VAD // This is a no-op for Gemini Live } async cancelResponse() { // Gemini doesn't have explicit cancel, but we can send empty content // to interrupt if (this.ws && this.isConnected()) { const message = { clientContent: { turns: [], turnComplete: true, }, }; this.ws.send(JSON.stringify(message)); } } /** * Send setup message with configuration */ async sendSetup(config, model) { if (!this.ws) { return; } // Issue 9: build the inner `setup` object as a named local so the // optional fields below can be assigned without non-null assertions on // `setupMessage.setup`. const setup = { model: `models/${model}`, generationConfig: { responseModalities: ["AUDIO", "TEXT"], speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: config.voice ?? "Puck", }, }, }, }, }; // Add system instruction if (config.systemPrompt) { setup.systemInstruction = { parts: [{ text: config.systemPrompt }], }; } // Add tools if (config.tools && config.tools.length > 0) { setup.tools = [ { functionDeclarations: config.tools.map((tool) => ({ name: tool.name, description: tool.description, parameters: tool.parameters, })), }, ]; } const setupMessage = { setup }; this.ws.send(JSON.stringify(setupMessage)); } /** * Wait for setup complete message */ waitForSetupComplete() { return new Promise((resolve, reject) => { const handler = (data) => { try { const response = JSON.parse(data.toString()); if (response.setupComplete) { clearTimeout(timeout); this.ws?.off("message", handler); resolve(); } } catch { // Ignore parse errors } }; const timeout = setTimeout(() => { // M2: detach the message handler before rejecting so future Gemini // messages don't invoke a dangling handler for the connection lifetime. this.ws?.off("message", handler); reject(new Error("Timeout waiting for setup complete")); }, 10000); this.ws?.on("message", handler); }); } /** * Handle incoming WebSocket messages */ handleMessage(data) { try { const response = JSON.parse(data.toString()); if (response.serverContent) { const content = response.serverContent; // Handle model turn if (content.modelTurn?.parts) { for (const part of content.modelTurn.parts) { // Handle text if (part.text) { this.emitText(part.text, content.turnComplete ?? false); } // Handle audio if (part.inlineData) { const audioData = Buffer.from(part.inlineData.data, "base64"); this.emitAudio({ data: audioData, index: this.audioChunkIndex++, isFinal: content.turnComplete ?? false, format: this.parseAudioFormat(part.inlineData.mimeType), sampleRate: 24000, }); } } } // Handle turn complete if (content.turnComplete) { this.emitTurnEnd(); this.audioChunkIndex = 0; } // Handle interruption if (content.interrupted) { this.emitTurnEnd(); this.audioChunkIndex = 0; } } // Handle tool calls if (response.toolCall?.functionCalls) { for (const call of response.toolCall.functionCalls) { this.pendingFunctionCalls.set(call.id, call.name); this.handleFunctionCall(call.id, call.name, call.args); } } // Handle tool call cancellation if (response.toolCallCancellation?.ids) { for (const id of response.toolCallCancellation.ids) { this.pendingFunctionCalls.delete(id); } } } catch (err) { logger.warn(`[GeminiLiveHandler] Failed to parse message: ${err instanceof Error ? err.message : String(err)}`); } } /** * Parse audio format from MIME type */ parseAudioFormat(mimeType) { if (mimeType.includes("opus")) { return "opus"; } if (mimeType.includes("wav") || mimeType.includes("pcm")) { return "wav"; } if (mimeType.includes("mp3") || mimeType.includes("mpeg")) { return "mp3"; } return "opus"; } /** * Handle function call from model */ async handleFunctionCall(callId, name, args) { try { const result = await this.emitFunctionCall(name, args); // Send function response if (this.ws && this.isConnected()) { const responseMessage = { toolResponse: { functionResponses: [ { id: callId, name, response: { result }, }, ], }, }; this.ws.send(JSON.stringify(responseMessage)); this.pendingFunctionCalls.delete(callId); } } catch (err) { const error = err instanceof Error ? err : new Error(String(err || "Function call failed")); logger.error(`[GeminiLiveHandler] Function call failed: ${error.message}`); this.emitError(error); // Clean up the pending entry on the error path too — the success // branch deletes at line ~439, but without this delete the Map grows // unbounded over a long session with intermittently-failing tools. this.pendingFunctionCalls.delete(callId); } } }