@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…
412 lines • 16.8 kB
JavaScript
/**
* OpenAI Realtime Voice API Handler
*
* Implementation of bidirectional voice communication using OpenAI's Realtime API.
*
* @module voice/providers/OpenAIRealtime
*/
import { logger } from "../../utils/logger.js";
import { RealtimeError } from "../errors.js";
import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";
/**
* OpenAI Realtime API Handler
*
* Implements bidirectional voice communication with OpenAI's Realtime API.
*
* @see https://platform.openai.com/docs/api-reference/realtime
*/
export class OpenAIRealtime extends BaseRealtimeHandler {
    /** Provider identifier; the same literal is passed to every RealtimeError factory below. */
    name = "openai-realtime";
    /** Trimmed API key, or null when no non-empty key was supplied or found in the environment. */
    apiKey;
    /** WebSocket currently owned by this handler, or null when none is open or being opened. */
    ws = null;
    /** Index given to the next emitted audio chunk; reset when a response finishes. */
    audioChunkIndex = 0;

    /**
     * @param {string} [apiKey] - Explicit API key. Falls back to `OPENAI_API_KEY`.
     */
    constructor(apiKey) {
        super();
        // Trim and null-coerce so an empty/whitespace key (e.g. `OPENAI_API_KEY=""`)
        // is reported as "not configured" by both isConfigured() and connect(),
        // instead of surfacing later as an authentication failure.
        const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }

    /**
     * @returns {boolean} True when a non-empty API key is available.
     */
    isConfigured() {
        return this.apiKey !== null;
    }

    /**
     * @returns {string[]} Audio formats this handler emits. The session is
     * configured with pcm16 for input and output, and every emitted chunk is
     * tagged "pcm16", so that is the only advertised format.
     */
    getSupportedFormats() {
        return ["pcm16"];
    }

    /**
     * Open a realtime session: connect the WebSocket, push the session
     * configuration, and wait for the server's `session.created` event.
     *
     * @param {object} config - Session options read here: `model`, `timeout`,
     *   plus everything consumed by sendSessionUpdate().
     * @returns {Promise<object>} The session created by `createSession`.
     * @throws RealtimeError.providerNotConfigured when no API key is set.
     * @throws RealtimeError.sessionAlreadyActive when already connected.
     * @throws RealtimeError.connectionFailed for any failure while connecting.
     */
    async connect(config) {
        if (!this.apiKey) {
            throw RealtimeError.providerNotConfigured("openai-realtime");
        }
        if (this.isConnected()) {
            throw RealtimeError.sessionAlreadyActive("openai-realtime");
        }
        this.emitStateChange("connecting");
        let ws = null;
        try {
            const { default: WebSocket } = await import("ws");
            const model = config.model ?? "gpt-4o-realtime-preview-2024-12-17";
            // Escape the model name so unusual characters cannot corrupt the query string.
            const wsUrl = `wss://api.openai.com/v1/realtime?model=${encodeURIComponent(model)}`;
            ws = new WebSocket(wsUrl, {
                headers: {
                    Authorization: `Bearer ${this.apiKey}`,
                    "OpenAI-Beta": "realtime=v1",
                },
            });
            this.ws = ws;
            await this.#waitForOpen(ws, config.timeout ?? 30000);
            // Every long-lived listener is bound to this specific socket and
            // ignores events once the handler has moved on to another socket
            // (or to none). Without the identity check, the late `close` of a
            // previous connection would wipe the session of a newer one.
            const socket = ws;
            socket.on("message", (data) => {
                if (this.ws === socket) {
                    this.handleMessage(data);
                }
            });
            socket.on("close", () => {
                if (this.ws !== socket) {
                    return;
                }
                this.ws = null;
                this.audioChunkIndex = 0;
                this.emitStateChange("disconnected");
                this.session = null;
            });
            socket.on("error", (err) => {
                if (this.ws === socket) {
                    this.emitError(err);
                }
            });
            await this.sendSessionUpdate(config);
            const sessionId = await this.waitForSessionCreated();
            this.session = this.createSession(sessionId, config);
            this.emitStateChange("connected");
            logger.info(`[OpenAIRealtimeHandler] Connected to session: ${sessionId}`);
            return this.session;
        }
        catch (err) {
            // Release the half-open socket; otherwise a failed connect leaves
            // an open connection and a stale `this.ws` behind.
            if (ws) {
                this.#discardSocket(ws);
            }
            this.session = null;
            this.emitStateChange("error");
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            throw RealtimeError.connectionFailed(errorMessage, "openai-realtime", err instanceof Error ? err : undefined);
        }
    }

    /**
     * Close the current connection, if any, and reset per-session state.
     *
     * @returns {Promise<void>}
     * @throws RealtimeError.protocolError when closing the socket fails.
     */
    async disconnect() {
        const ws = this.ws;
        if (!ws) {
            return;
        }
        this.emitStateChange("disconnecting");
        // Drop our references before closing: the socket's own `close` listener
        // then recognises itself as stale, so "disconnected" is emitted once,
        // and the handler is reusable even if close() throws.
        this.ws = null;
        this.session = null;
        this.audioChunkIndex = 0;
        try {
            try {
                ws.close();
            }
            finally {
                this.emitStateChange("disconnected");
            }
            logger.info("[OpenAIRealtimeHandler] Disconnected");
        }
        catch (err) {
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            throw RealtimeError.protocolError(`Disconnect failed: ${errorMessage}`, "openai-realtime", err instanceof Error ? err : undefined);
        }
    }

    /**
     * Append audio to the server-side input buffer.
     *
     * @param {Buffer|ArrayBufferView|ArrayBuffer|{data: *}} audio - Raw audio
     *   bytes, or an object carrying them in `data`.
     * @returns {Promise<void>}
     * @throws RealtimeError.sessionNotActive when not connected.
     */
    async sendAudio(audio) {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive("openai-realtime");
        }
        const event = {
            type: "input_audio_buffer.append",
            audio: OpenAIRealtime.#encodeAudio(audio),
        };
        this.ws.send(JSON.stringify(event));
    }

    /**
     * Send a user text message and ask the model to respond.
     *
     * @param {string} text - Message text.
     * @returns {Promise<void>}
     * @throws RealtimeError.sessionNotActive when not connected.
     */
    async sendText(text) {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive("openai-realtime");
        }
        const event = {
            type: "conversation.item.create",
            item: {
                type: "message",
                role: "user",
                content: [
                    {
                        type: "input_text",
                        text,
                    },
                ],
            },
        };
        this.ws.send(JSON.stringify(event));
        // Only request a response. A text turn has no buffered audio, and
        // committing an empty input audio buffer makes the server answer with
        // an error event.
        this.#requestResponse();
    }

    /**
     * Manually end the audio turn: commit the buffered input audio and ask
     * the model to respond.
     *
     * @returns {Promise<void>}
     * @throws RealtimeError.sessionNotActive when not connected.
     */
    async triggerResponse() {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive("openai-realtime");
        }
        this.ws.send(JSON.stringify({
            type: "input_audio_buffer.commit",
        }));
        this.#requestResponse();
    }

    /**
     * Ask the server to cancel the in-progress response. Does nothing when
     * not connected.
     *
     * @returns {Promise<void>}
     */
    async cancelResponse() {
        if (!this.ws || !this.isConnected()) {
            return;
        }
        this.ws.send(JSON.stringify({
            type: "response.cancel",
        }));
    }

    /**
     * Send the `session.update` event built from `config`. Does nothing when
     * there is no socket.
     *
     * @param {object} config - Options read here: `voice`, `turnDetection`,
     *   `vadThreshold`, `systemPrompt`, `tools`.
     * @returns {Promise<void>}
     */
    async sendSessionUpdate(config) {
        if (!this.ws) {
            return;
        }
        const sessionConfig = {
            modalities: ["text", "audio"],
            input_audio_format: "pcm16",
            output_audio_format: "pcm16",
            input_audio_transcription: {
                model: "whisper-1",
            },
        };
        if (config.voice) {
            sessionConfig.voice = config.voice;
        }
        if (config.turnDetection) {
            sessionConfig.turn_detection = {
                type: config.turnDetection,
                threshold: config.vadThreshold ?? 0.5,
                prefix_padding_ms: 300,
                silence_duration_ms: 500,
            };
        }
        if (config.systemPrompt) {
            sessionConfig.instructions = config.systemPrompt;
        }
        if (config.tools && config.tools.length > 0) {
            sessionConfig.tools = config.tools.map(({ name, description, parameters }) => ({
                type: "function",
                name,
                description,
                parameters,
            }));
        }
        this.ws.send(JSON.stringify({
            type: "session.update",
            session: sessionConfig,
        }));
    }

    /**
     * Wait for the server's `session.created` event on the current socket.
     *
     * The promise always settles and always detaches its listeners: it
     * resolves with the session id, or rejects on a server `error` event, a
     * `session.created` event without an id, the socket closing, or a
     * 10 second timeout.
     *
     * @returns {Promise<string>} The session id.
     */
    waitForSessionCreated() {
        // Bind to the socket that exists now, so cleanup still reaches it if
        // `this.ws` is replaced or cleared while we wait.
        const ws = this.ws;
        return new Promise((resolve, reject) => {
            if (!ws) {
                reject(new Error("No WebSocket available while waiting for session.created"));
                return;
            }
            const settle = (finish, value) => {
                clearTimeout(timer);
                ws.off("message", onMessage);
                ws.off("close", onClose);
                finish(value);
            };
            const onMessage = (data) => {
                let event;
                try {
                    event = JSON.parse(data.toString());
                }
                catch {
                    // Frames that are not JSON are irrelevant while waiting.
                    return;
                }
                // Keep event inspection outside the try above: a malformed
                // event must reject the promise, not be mistaken for a parse
                // error after the timer has already been cleared.
                if (event?.type === "session.created") {
                    const sessionId = event.session?.id;
                    if (sessionId == null) {
                        settle(reject, new Error("session.created event did not include a session id"));
                    }
                    else {
                        settle(resolve, sessionId);
                    }
                }
                else if (event?.type === "error") {
                    settle(reject, new Error(event.error?.message ??
                        "Unknown error"));
                }
            };
            const onClose = () => {
                settle(reject, new Error("Connection closed before session.created"));
            };
            const timer = setTimeout(() => {
                settle(reject, new Error("Timeout waiting for session.created"));
            }, 10000);
            ws.on("message", onMessage);
            ws.on("close", onClose);
        });
    }

    /**
     * Parse one incoming WebSocket frame and dispatch it. Never throws:
     * parse failures and dispatch failures are logged separately.
     *
     * @param {*} data - Frame payload; `data.toString()` must yield JSON text.
     */
    handleMessage(data) {
        let event;
        try {
            event = JSON.parse(data.toString());
        }
        catch (err) {
            logger.warn(`[OpenAIRealtimeHandler] Failed to parse message: ${err instanceof Error ? err.message : String(err)}`);
            return;
        }
        try {
            this.#dispatchEvent(event);
        }
        catch (err) {
            logger.warn(`[OpenAIRealtimeHandler] Failed to handle event ${String(event?.type)}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }

    /**
     * Run a function call requested by the model and send its result back.
     * If the call (or sending its result) fails, an error payload is sent
     * instead so the model is not left waiting for an output.
     *
     * @param {string} name - Function name.
     * @param {*} args - Parsed call arguments.
     * @param {string} callId - Call id to echo in the output item.
     * @returns {Promise<void>}
     */
    async handleFunctionCall(name, args, callId) {
        try {
            const result = await this.emitFunctionCall(name, args);
            this.#sendFunctionOutput(callId, result);
        }
        catch (err) {
            const errMessage = err instanceof Error ? err.message : String(err);
            logger.error(`[OpenAIRealtimeHandler] Function call failed: ${errMessage}`);
            try {
                this.#sendFunctionOutput(callId, { error: errMessage });
            }
            catch (sendErr) {
                logger.error(`[OpenAIRealtimeHandler] Failed to send error result for ${callId}: ${sendErr instanceof Error ? sendErr.message : String(sendErr)}`);
            }
        }
    }

    /** Route one parsed server event to the matching emitter. */
    #dispatchEvent(event) {
        switch (event?.type) {
            case "response.audio.delta":
                this.#emitAudioChunk(Buffer.from(event.delta, "base64"), false);
                break;
            case "response.audio.done":
                // Empty final chunk marks the end of the audio stream.
                this.#emitAudioChunk(Buffer.alloc(0), true);
                break;
            case "response.audio_transcript.delta":
                if (event.delta) {
                    this.emitText(event.delta, false);
                }
                break;
            case "response.audio_transcript.done":
                if (event.transcript) {
                    this.emitText(event.transcript, true);
                }
                break;
            case "conversation.item.input_audio_transcription.completed":
                if (event.transcript) {
                    this.emitTranscript(event.transcript, true);
                }
                break;
            case "response.function_call_arguments.done":
                this.#startFunctionCall(event);
                break;
            case "response.done":
                this.emitTurnEnd();
                this.audioChunkIndex = 0;
                break;
            case "input_audio_buffer.speech_started":
                this.emitTurnStart();
                break;
            case "error":
                this.emitError(new Error(event.error?.message ?? "Unknown error"));
                break;
            default:
                logger.debug(`[OpenAIRealtimeHandler] Unhandled event: ${event?.type}`);
        }
    }

    /**
     * Parse the arguments of a completed function call and start it in the
     * background. Events without a name or call id are ignored.
     */
    #startFunctionCall(event) {
        const { name, call_id: callId, arguments: rawArgs } = event;
        if (!name || !callId) {
            return;
        }
        let args;
        try {
            // A call without arguments is treated as a call with an empty
            // argument object rather than being dropped.
            args = JSON.parse(rawArgs || "{}");
        }
        catch {
            logger.warn("[OpenAIRealtimeHandler] Failed to parse function arguments");
            // Answer the call with an error so the model is not left waiting
            // for an output that will never arrive.
            try {
                this.#sendFunctionOutput(callId, { error: "Failed to parse function arguments" });
            }
            catch (sendErr) {
                logger.error(`[OpenAIRealtimeHandler] Failed to send error result for ${callId}: ${sendErr instanceof Error ? sendErr.message : String(sendErr)}`);
            }
            return;
        }
        // handleFunctionCall catches its own failures; the extra .catch keeps
        // any unexpected rejection from becoming an unhandled rejection.
        void this.handleFunctionCall(name, args, callId).catch((err) => {
            logger.error(`[OpenAIRealtimeHandler] handleFunctionCall failed: ${err instanceof Error ? err.message : String(err)}`);
        });
    }

    /** Emit one audio chunk tagged as raw 16-bit PCM at 24 kHz, matching the session's output format. */
    #emitAudioChunk(data, isFinal) {
        this.emitAudio({
            data,
            index: this.audioChunkIndex++,
            isFinal,
            // "pcm16" (not "wav"): the bytes carry no RIFF/WAV header.
            format: "pcm16",
            sampleRate: 24000,
        });
    }

    /** Send `response.create` on the current socket without touching the audio buffer. */
    #requestResponse() {
        this.ws.send(JSON.stringify({
            type: "response.create",
        }));
    }

    /**
     * Send a `function_call_output` item followed by `response.create`.
     * Returns false (sending nothing) when not connected.
     */
    #sendFunctionOutput(callId, payload) {
        if (!this.ws || !this.isConnected()) {
            return false;
        }
        this.ws.send(JSON.stringify({
            type: "conversation.item.create",
            item: {
                type: "function_call_output",
                call_id: callId,
                // JSON.stringify(undefined) is undefined, which would drop the
                // `output` field entirely; send JSON null instead.
                output: JSON.stringify(payload ?? null),
            },
        }));
        this.#requestResponse();
        return true;
    }

    /** Resolve when `ws` opens; reject on its first error or after `timeoutMs`. Listeners are always removed. */
    #waitForOpen(ws, timeoutMs) {
        return new Promise((resolve, reject) => {
            const cleanup = () => {
                clearTimeout(timer);
                ws.off("open", onOpen);
                ws.off("error", onError);
            };
            const onOpen = () => {
                cleanup();
                resolve();
            };
            const onError = (err) => {
                cleanup();
                reject(err);
            };
            const timer = setTimeout(() => {
                cleanup();
                reject(new Error("Connection timeout"));
            }, timeoutMs);
            ws.on("open", onOpen);
            ws.on("error", onError);
        });
    }

    /** Detach from `ws` and force it closed; used when connecting fails part-way. */
    #discardSocket(ws) {
        if (this.ws === ws) {
            this.ws = null;
        }
        try {
            ws.removeAllListeners();
            // Aborting a socket can still emit "error"; an emitter with no
            // "error" listener would throw, so keep a no-op one attached.
            ws.on("error", () => { });
            if (typeof ws.terminate === "function") {
                ws.terminate();
            }
            else {
                ws.close();
            }
        }
        catch (closeErr) {
            logger.debug(`[OpenAIRealtimeHandler] Failed to release socket: ${closeErr instanceof Error ? closeErr.message : String(closeErr)}`);
        }
    }

    /**
     * Base64-encode the audio payload. Typed arrays and ArrayBuffers are
     * encoded from their underlying bytes; calling `toString("base64")` on
     * them directly would not produce base64.
     */
    static #encodeAudio(audio) {
        const payload = ArrayBuffer.isView(audio) || audio instanceof ArrayBuffer ? audio : audio.data;
        if (Buffer.isBuffer(payload)) {
            return payload.toString("base64");
        }
        if (ArrayBuffer.isView(payload)) {
            return Buffer.from(payload.buffer, payload.byteOffset, payload.byteLength).toString("base64");
        }
        if (payload instanceof ArrayBuffer) {
            return Buffer.from(payload).toString("base64");
        }
        // Any other payload keeps the previous behaviour.
        return payload.toString("base64");
    }
}
//# sourceMappingURL=OpenAIRealtime.js.map