UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

412 lines 16.8 kB
/**
 * OpenAI Realtime Voice API Handler
 *
 * Implementation of bidirectional voice communication using OpenAI's Realtime API.
 *
 * @module voice/providers/OpenAIRealtime
 */
import { logger } from "../../utils/logger.js";
import { RealtimeError } from "../errors.js";
import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";

const PROVIDER_NAME = "openai-realtime";
const LOG_PREFIX = "[OpenAIRealtimeHandler]";
const REALTIME_ENDPOINT = "wss://api.openai.com/v1/realtime";
const DEFAULT_MODEL = "gpt-4o-realtime-preview-2024-12-17";
const DEFAULT_CONNECT_TIMEOUT_MS = 30000;
const DEFAULT_SESSION_TIMEOUT_MS = 10000;
// The session is configured with pcm16 in both directions; emitted chunks are
// tagged with this format and sample rate.
const AUDIO_FORMAT = "pcm16";
const OUTPUT_SAMPLE_RATE = 24000;

/**
 * Render a thrown value as a log-friendly message.
 *
 * @param {unknown} err - Value caught from a `catch` clause or rejection.
 * @returns {string} `err.message` for Error instances, `String(err)` otherwise.
 */
function describeError(err) {
    return err instanceof Error ? err.message : String(err);
}

/**
 * Base64-encode an audio payload for `input_audio_buffer.append`.
 *
 * Accepts a Buffer, a bare typed array / ArrayBuffer, or an object carrying
 * the bytes in `.data`. Typed arrays are wrapped over their underlying bytes
 * because `Uint8Array.prototype.toString("base64")` ignores its argument and
 * `Buffer.from(int16Array)` would truncate every sample to one byte.
 *
 * @param {Buffer|ArrayBufferView|ArrayBuffer|{data: *}} audio - Audio input.
 * @returns {string} Base64 text.
 */
function encodeAudioBase64(audio) {
    const isRawBinary = ArrayBuffer.isView(audio) || audio instanceof ArrayBuffer;
    const payload = isRawBinary ? audio : audio.data;
    if (Buffer.isBuffer(payload)) {
        return payload.toString("base64");
    }
    if (ArrayBuffer.isView(payload)) {
        return Buffer.from(payload.buffer, payload.byteOffset, payload.byteLength).toString("base64");
    }
    if (payload instanceof ArrayBuffer) {
        return Buffer.from(payload).toString("base64");
    }
    // Anything else keeps the previous behaviour (e.g. a pre-encoded string
    // is passed through unchanged by String.prototype.toString).
    return payload.toString("base64");
}

/**
 * OpenAI Realtime API Handler
 *
 * Implements bidirectional voice communication with OpenAI's Realtime API
 * over a WebSocket opened with the `ws` package.
 *
 * @see https://platform.openai.com/docs/api-reference/realtime
 */
export class OpenAIRealtime extends BaseRealtimeHandler {
    name = "openai-realtime";
    apiKey;
    ws = null;
    audioChunkIndex = 0;
    /**
     * True while audio has been appended since this handler last sent
     * `input_audio_buffer.commit`. Used to avoid committing an empty buffer.
     */
    hasUncommittedAudio = false;

    /**
     * @param {string} [apiKey] - API key; falls back to `OPENAI_API_KEY`.
     */
    constructor(apiKey) {
        super();
        // Match the trim+null-coerce pattern used by sibling providers
        // (OpenAITTS/AzureSTT/AzureTTS/ElevenLabsTTS/GeminiLive/OpenAISTT/GoogleSTT)
        // so empty/whitespace `OPENAI_API_KEY=""` surfaces as PROVIDER_NOT_CONFIGURED
        // instead of a downstream 401, and `isConfigured()` agrees with `connect()`.
        const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }

    /**
     * @returns {boolean} Whether a non-empty API key was resolved.
     */
    isConfigured() {
        return this.apiKey !== null;
    }

    /**
     * @returns {string[]} Audio formats this handler emits and accepts.
     */
    getSupportedFormats() {
        // Session uses pcm16 for both input_audio_format and output_audio_format,
        // and audio-delta/audio-done chunks are tagged as `format: "pcm16"`. The
        // advertised list must match what's actually emitted.
        return [AUDIO_FORMAT];
    }

    /**
     * Open the WebSocket, configure the session and wait for `session.created`.
     *
     * @param {object} config - Session options read here: `model`, `timeout`,
     *   plus everything consumed by `sendSessionUpdate`.
     * @returns {Promise<*>} The session object built by `createSession`.
     * @throws RealtimeError providerNotConfigured / sessionAlreadyActive /
     *   connectionFailed.
     */
    async connect(config) {
        if (!this.apiKey) {
            throw RealtimeError.providerNotConfigured(PROVIDER_NAME);
        }
        if (this.isConnected()) {
            throw RealtimeError.sessionAlreadyActive(PROVIDER_NAME);
        }
        this.emitStateChange("connecting");
        let ws = null;
        try {
            const { default: WebSocket } = await import("ws");
            // Build the URL through URL/searchParams so the model name is
            // always query-encoded.
            const endpoint = new URL(REALTIME_ENDPOINT);
            endpoint.searchParams.set("model", config.model ?? DEFAULT_MODEL);
            ws = new WebSocket(endpoint.href, {
                headers: {
                    Authorization: `Bearer ${this.apiKey}`,
                    "OpenAI-Beta": "realtime=v1",
                },
            });
            this.ws = ws;
            await this.waitForOpen(ws, config.timeout ?? DEFAULT_CONNECT_TIMEOUT_MS);
            // Every long-lived listener is bound to this specific socket so a
            // late event from a previous connection cannot touch a newer one.
            const socket = ws;
            socket.on("message", (data) => {
                if (this.ws === socket) {
                    this.handleMessage(data);
                }
            });
            socket.on("close", () => {
                this.handleSocketClose(socket);
            });
            socket.on("error", (err) => {
                this.emitError(err);
            });
            // Send session update with configuration
            await this.sendSessionUpdate(config);
            // Wait for session.created event
            const sessionId = await this.waitForSessionCreated();
            this.session = this.createSession(sessionId, config);
            this.emitStateChange("connected");
            logger.info(`${LOG_PREFIX} Connected to session: ${sessionId}`);
            return this.session;
        }
        catch (err) {
            // Do not leave a half-open socket behind on failure.
            this.releaseSocket(ws);
            this.emitStateChange("error");
            const errorMessage = err instanceof Error
                ? err.message
                : String(err || "Unknown error");
            throw RealtimeError.connectionFailed(errorMessage, PROVIDER_NAME, err instanceof Error ? err : undefined);
        }
    }

    /**
     * Close the socket and reset per-connection state. No-op without a socket.
     *
     * @returns {Promise<void>}
     * @throws RealtimeError protocolError when closing fails.
     */
    async disconnect() {
        const ws = this.ws;
        if (!ws) {
            return;
        }
        this.emitStateChange("disconnecting");
        try {
            ws.close();
            // Clearing `this.ws` first makes the socket's own "close" event a
            // no-op, so "disconnected" is emitted exactly once from here.
            this.ws = null;
            this.session = null;
            this.audioChunkIndex = 0;
            this.hasUncommittedAudio = false;
            this.emitStateChange("disconnected");
            logger.info(`${LOG_PREFIX} Disconnected`);
        }
        catch (err) {
            const errorMessage = err instanceof Error
                ? err.message
                : String(err || "Unknown error");
            throw RealtimeError.protocolError(`Disconnect failed: ${errorMessage}`, PROVIDER_NAME, err instanceof Error ? err : undefined);
        }
    }

    /**
     * Append audio to the server-side input buffer.
     *
     * @param {Buffer|ArrayBufferView|ArrayBuffer|{data: *}} audio - Audio bytes
     *   (assumed to be pcm16, matching the session's input_audio_format).
     * @returns {Promise<void>}
     * @throws RealtimeError sessionNotActive when not connected.
     */
    async sendAudio(audio) {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive(PROVIDER_NAME);
        }
        const encoded = encodeAudioBase64(audio);
        this.ws.send(JSON.stringify({
            type: "input_audio_buffer.append",
            audio: encoded,
        }));
        if (encoded.length > 0) {
            this.hasUncommittedAudio = true;
        }
    }

    /**
     * Add a user text message to the conversation and request a response.
     *
     * @param {string} text - Message text.
     * @returns {Promise<void>}
     * @throws RealtimeError sessionNotActive when not connected.
     */
    async sendText(text) {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive(PROVIDER_NAME);
        }
        this.ws.send(JSON.stringify({
            type: "conversation.item.create",
            item: {
                type: "message",
                role: "user",
                content: [
                    {
                        type: "input_text",
                        text,
                    },
                ],
            },
        }));
        await this.triggerResponse();
    }

    /**
     * Commit any audio appended through this handler, then request a response.
     *
     * The commit is skipped when nothing was appended since the last commit
     * sent from here: the Realtime API reports an error for a commit on an
     * empty input buffer, which previously surfaced on every text message and
     * function-call result.
     *
     * @returns {Promise<void>}
     * @throws RealtimeError sessionNotActive when not connected.
     */
    async triggerResponse() {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive(PROVIDER_NAME);
        }
        if (this.hasUncommittedAudio) {
            this.ws.send(JSON.stringify({
                type: "input_audio_buffer.commit",
            }));
            this.hasUncommittedAudio = false;
        }
        this.ws.send(JSON.stringify({
            type: "response.create",
        }));
    }

    /**
     * Ask the server to cancel the in-flight response. No-op when not connected.
     *
     * @returns {Promise<void>}
     */
    async cancelResponse() {
        if (!this.ws || !this.isConnected()) {
            return;
        }
        this.ws.send(JSON.stringify({
            type: "response.cancel",
        }));
    }

    /**
     * Send session update with configuration
     *
     * @param {object} config - Reads `voice`, `turnDetection`, `vadThreshold`,
     *   `systemPrompt` and `tools`.
     * @returns {Promise<void>}
     */
    async sendSessionUpdate(config) {
        if (!this.ws) {
            return;
        }
        const sessionConfig = {
            modalities: ["text", "audio"],
            input_audio_format: AUDIO_FORMAT,
            output_audio_format: AUDIO_FORMAT,
            input_audio_transcription: {
                model: "whisper-1",
            },
        };
        if (config.voice) {
            sessionConfig.voice = config.voice;
        }
        if (config.turnDetection) {
            sessionConfig.turn_detection = {
                type: config.turnDetection,
                threshold: config.vadThreshold ?? 0.5,
                prefix_padding_ms: 300,
                silence_duration_ms: 500,
            };
        }
        if (config.systemPrompt) {
            sessionConfig.instructions = config.systemPrompt;
        }
        if (config.tools && config.tools.length > 0) {
            sessionConfig.tools = config.tools.map((tool) => ({
                type: "function",
                name: tool.name,
                description: tool.description,
                parameters: tool.parameters,
            }));
        }
        this.ws.send(JSON.stringify({
            type: "session.update",
            session: sessionConfig,
        }));
    }

    /**
     * Resolve once `ws` emits "open"; reject on "error" or after `timeoutMs`.
     * The temporary listeners are detached however the wait ends.
     *
     * @param {*} ws - Socket that is still connecting.
     * @param {number} timeoutMs - Maximum wait in milliseconds.
     * @returns {Promise<void>}
     */
    waitForOpen(ws, timeoutMs) {
        return new Promise((resolve, reject) => {
            const settle = (finish, value) => {
                clearTimeout(timer);
                ws.off("open", onOpen);
                ws.off("error", onError);
                finish(value);
            };
            const onOpen = () => settle(resolve);
            const onError = (err) => settle(reject, err);
            const timer = setTimeout(() => settle(reject, new Error("Connection timeout")), timeoutMs);
            ws.on("open", onOpen);
            ws.on("error", onError);
        });
    }

    /**
     * Wait for session.created event
     *
     * Rejects on a server `error` event, on socket close, on a
     * `session.created` event without a session id, or after `timeoutMs`.
     *
     * @param {number} [timeoutMs=10000] - Maximum wait in milliseconds.
     * @returns {Promise<string>} The session id.
     */
    waitForSessionCreated(timeoutMs = DEFAULT_SESSION_TIMEOUT_MS) {
        // Capture the socket so cleanup always targets the one we subscribed to.
        const ws = this.ws;
        return new Promise((resolve, reject) => {
            if (!ws) {
                reject(new Error("WebSocket is not available"));
                return;
            }
            const settle = (finish, value) => {
                clearTimeout(timer);
                ws.off("message", onMessage);
                ws.off("close", onClose);
                finish(value);
            };
            const onMessage = (data) => {
                let event;
                try {
                    event = JSON.parse(data.toString());
                }
                catch {
                    // Not JSON: keep waiting for the next message.
                    return;
                }
                if (event?.type === "session.created") {
                    const sessionId = event.session?.id;
                    if (typeof sessionId === "string" && sessionId.length > 0) {
                        settle(resolve, sessionId);
                    }
                    else {
                        // Previously this case removed the timer and listener
                        // and then never settled.
                        settle(reject, new Error("session.created event did not include a session id"));
                    }
                }
                else if (event?.type === "error") {
                    settle(reject, new Error(event.error?.message ?? "Unknown error"));
                }
            };
            const onClose = () => {
                settle(reject, new Error("Connection closed before session.created"));
            };
            const timer = setTimeout(() => {
                settle(reject, new Error("Timeout waiting for session.created"));
            }, timeoutMs);
            ws.on("message", onMessage);
            ws.on("close", onClose);
        });
    }

    /**
     * React to a socket closing on its own. Ignored for sockets this handler
     * has already replaced or released.
     *
     * @param {*} ws - The socket that emitted "close".
     */
    handleSocketClose(ws) {
        if (this.ws !== ws) {
            return;
        }
        this.ws = null;
        this.audioChunkIndex = 0;
        this.hasUncommittedAudio = false;
        this.emitStateChange("disconnected");
        this.session = null;
    }

    /**
     * Detach and forcibly close a socket after a failed connect.
     *
     * @param {*} ws - Socket to release; may be null.
     */
    releaseSocket(ws) {
        if (!ws) {
            return;
        }
        if (this.ws === ws) {
            this.ws = null;
            this.hasUncommittedAudio = false;
        }
        try {
            ws.removeAllListeners();
            // Aborting a handshake makes `ws` emit "error"; without a listener
            // that would be thrown as an uncaught exception.
            ws.on("error", () => { });
            ws.terminate();
        }
        catch (cleanupErr) {
            logger.debug(`${LOG_PREFIX} Socket cleanup failed: ${describeError(cleanupErr)}`);
        }
    }

    /**
     * Handle incoming WebSocket messages
     *
     * @param {*} data - Raw message payload; parsed as JSON text.
     */
    handleMessage(data) {
        let event;
        try {
            event = JSON.parse(data.toString());
        }
        catch (err) {
            logger.warn(`${LOG_PREFIX} Failed to parse message: ${describeError(err)}`);
            return;
        }
        if (event === null || typeof event !== "object") {
            logger.warn(`${LOG_PREFIX} Failed to parse message: payload is not a JSON object`);
            return;
        }
        try {
            this.dispatchEvent(event);
        }
        catch (err) {
            // A throwing listener must not escape the socket's "message" event.
            logger.warn(`${LOG_PREFIX} Failed to handle ${String(event.type)} event: ${describeError(err)}`);
        }
    }

    /**
     * Route one parsed server event to the matching emitter.
     *
     * @param {object} event - Parsed server event.
     */
    dispatchEvent(event) {
        switch (event.type) {
            case "response.audio.delta": {
                this.emitAudio({
                    data: Buffer.from(event.delta, "base64"),
                    index: this.audioChunkIndex++,
                    isFinal: false,
                    // M7: session is configured with output_audio_format "pcm16",
                    // so OpenAI sends raw 16-bit PCM, not WAV-headered bytes.
                    // Tagging as "pcm16" prevents downstream consumers (e.g.
                    // calculateWavDuration) from mis-parsing the buffer as RIFF/WAV.
                    format: AUDIO_FORMAT,
                    sampleRate: OUTPUT_SAMPLE_RATE,
                });
                break;
            }
            case "response.audio.done": {
                // Audio stream complete: an empty final chunk marks the end.
                this.emitAudio({
                    data: Buffer.alloc(0),
                    index: this.audioChunkIndex++,
                    isFinal: true,
                    format: AUDIO_FORMAT,
                    sampleRate: OUTPUT_SAMPLE_RATE,
                });
                break;
            }
            case "response.audio_transcript.delta": {
                if (event.delta) {
                    this.emitText(event.delta, false);
                }
                break;
            }
            case "response.audio_transcript.done": {
                if (event.transcript) {
                    this.emitText(event.transcript, true);
                }
                break;
            }
            case "conversation.item.input_audio_transcription.completed": {
                if (event.transcript) {
                    this.emitTranscript(event.transcript, true);
                }
                break;
            }
            case "response.function_call_arguments.done": {
                this.dispatchFunctionCall(event);
                break;
            }
            case "response.done": {
                this.emitTurnEnd();
                this.audioChunkIndex = 0;
                break;
            }
            case "input_audio_buffer.speech_started": {
                this.emitTurnStart();
                break;
            }
            case "error": {
                this.emitError(new Error(event.error?.message ?? "Unknown error"));
                break;
            }
            default:
                // Log unhandled events at debug level
                logger.debug(`${LOG_PREFIX} Unhandled event: ${event.type}`);
        }
    }

    /**
     * Parse the arguments of a completed function call and start handling it.
     *
     * @param {object} event - `response.function_call_arguments.done` event.
     */
    dispatchFunctionCall(event) {
        const { name, call_id: callId, arguments: rawArguments } = event;
        if (!name || !callId || !rawArguments) {
            return;
        }
        let args;
        try {
            args = JSON.parse(rawArguments);
        }
        catch {
            logger.warn(`${LOG_PREFIX} Failed to parse function arguments`);
            // Answer the call anyway so the turn can continue, mirroring the
            // error output sent when the function itself fails.
            void this.sendFunctionCallOutput(callId, {
                error: "Function arguments were not valid JSON",
            }).catch((err) => {
                logger.error(`${LOG_PREFIX} Failed to send error result for ${callId}: ${describeError(err)}`);
            });
            return;
        }
        // NEW6: defense-in-depth. handleFunctionCall already wraps its body in
        // try/catch; this outer .catch guards against any future un-caught
        // path becoming an unhandled rejection. Issue 5.
        void this.handleFunctionCall(name, args, callId).catch((err) => {
            logger.error(`${LOG_PREFIX} handleFunctionCall failed: ${describeError(err)}`);
        });
    }

    /**
     * Send a `function_call_output` item for `callId` and request a response.
     * Does nothing when the handler is not connected.
     *
     * @param {string} callId - Call id taken from the function-call event.
     * @param {*} output - JSON-serialisable result; `undefined` is sent as null.
     * @returns {Promise<boolean>} Whether the output was sent.
     */
    async sendFunctionCallOutput(callId, output) {
        if (!this.ws || !this.isConnected()) {
            return false;
        }
        this.ws.send(JSON.stringify({
            type: "conversation.item.create",
            item: {
                type: "function_call_output",
                call_id: callId,
                // JSON.stringify(undefined) is undefined, which would drop the
                // `output` field from the message altogether.
                output: JSON.stringify(output ?? null),
            },
        }));
        await this.triggerResponse();
        return true;
    }

    /**
     * Handle function call from model
     *
     * @param {string} name - Function name.
     * @param {*} args - Parsed arguments.
     * @param {string} callId - Call id to answer.
     * @returns {Promise<void>} Never rejects; failures are logged.
     */
    async handleFunctionCall(name, args, callId) {
        try {
            const result = await this.emitFunctionCall(name, args);
            await this.sendFunctionCallOutput(callId, result);
        }
        catch (err) {
            const errMessage = describeError(err);
            logger.error(`${LOG_PREFIX} Function call failed: ${errMessage}`);
            // M6: send a function_call_output with the error so OpenAI Realtime
            // can resume the turn instead of leaving the call unanswered.
            try {
                await this.sendFunctionCallOutput(callId, { error: errMessage });
            }
            catch (sendErr) {
                logger.error(`${LOG_PREFIX} Failed to send error result for ${callId}: ${describeError(sendErr)}`);
            }
        }
    }
}
//# sourceMappingURL=OpenAIRealtime.js.map