@openai/agents-realtime
Version:
The OpenAI Agents SDK is a lightweight yet powerful framework for building multi-agent workflows. This package contains the logic for building realtime voice agents on the server or in the browser.
527 lines • 18.6 kB
JavaScript
import { z } from 'zod';
// Event schemas mirroring the structures defined in the OpenAI SDK typings.
// Most fields from the interfaces in `realtime.d.ts` are captured here to
// provide better runtime validation when parsing events from the server.
export const realtimeResponse = z.object({
id: z.string().optional().nullable(),
conversation_id: z.string().optional().nullable(),
max_output_tokens: z.number().or(z.literal('inf')).optional().nullable(),
metadata: z.record(z.string(), z.any()).optional().nullable(),
// GA rename: modalities -> output_modalities
output_modalities: z.array(z.string()).optional().nullable(),
object: z.literal('realtime.response').optional().nullable(),
output: z.array(z.any()).optional().nullable(),
// GA grouping: audio.output.{format,voice}
audio: z
.object({
output: z
.object({
format: z.any().optional().nullable(),
voice: z.string().optional().nullable(),
})
.optional()
.nullable(),
})
.optional()
.nullable(),
status: z
.enum(['completed', 'incomplete', 'failed', 'cancelled', 'in_progress'])
.optional()
.nullable(),
status_details: z.record(z.string(), z.any()).optional().nullable(),
usage: z
.object({
input_tokens: z.number().optional(),
input_token_details: z.record(z.string(), z.any()).optional().nullable(),
output_tokens: z.number().optional(),
output_token_details: z.record(z.string(), z.any()).optional().nullable(),
})
.optional()
.nullable(),
});
// Basic content schema used by ConversationItem.
export const conversationItemContentSchema = z.object({
id: z.string().optional(),
audio: z.string().nullable().optional(),
text: z.string().nullable().optional(),
transcript: z.string().nullable().optional(),
type: z.union([
z.literal('input_text'),
z.literal('input_audio'),
z.literal('item_reference'),
z.literal('output_text'),
z.literal('output_audio'),
]),
});
export const conversationItemSchema = z
.object({
id: z.string().optional(),
arguments: z.string().optional(),
call_id: z.string().optional(),
content: z.array(conversationItemContentSchema).optional(),
name: z.string().optional(),
output: z.string().nullable().optional(),
role: z.enum(['user', 'assistant', 'system']).optional(),
status: z.enum(['completed', 'incomplete', 'in_progress']).optional(),
type: z
.enum([
'message',
'function_call',
'function_call_output',
'mcp_list_tools',
'mcp_tool_call',
'mcp_call',
'mcp_approval_request',
'mcp_approval_response',
])
.optional(),
approval_request_id: z.string().nullable().optional(),
approve: z.boolean().nullable().optional(),
reason: z.string().nullable().optional(),
server_label: z.string().optional(),
error: z.any().nullable().optional(),
tools: z
.array(z
.object({
name: z.string(),
description: z.string(),
input_schema: z.record(z.any()).optional(),
})
.passthrough())
.optional(),
})
.passthrough();
export const conversationCreatedEventSchema = z.object({
type: z.literal('conversation.created'),
event_id: z.string(),
conversation: z.object({
id: z.string().optional(),
object: z.literal('realtime.conversation').optional(),
}),
});
// GA rename: conversation.item.created -> conversation.item.added
export const conversationItemAddedEventSchema = z.object({
type: z.literal('conversation.item.added'),
event_id: z.string(),
item: conversationItemSchema,
previous_item_id: z.string().nullable().optional(),
});
// GA addition: conversation.item.done
export const conversationItemDoneEventSchema = z.object({
type: z.literal('conversation.item.done'),
event_id: z.string(),
item: conversationItemSchema,
previous_item_id: z.string().nullable().optional(),
});
export const conversationItemDeletedEventSchema = z.object({
type: z.literal('conversation.item.deleted'),
event_id: z.string(),
item_id: z.string(),
});
export const conversationItemInputAudioTranscriptionCompletedEventSchema = z.object({
type: z.literal('conversation.item.input_audio_transcription.completed'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
transcript: z.string(),
logprobs: z.array(z.any()).nullable().optional(),
});
export const conversationItemInputAudioTranscriptionDeltaEventSchema = z.object({
type: z.literal('conversation.item.input_audio_transcription.delta'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number().optional(),
delta: z.string().optional(),
logprobs: z.array(z.any()).nullable().optional(),
});
export const conversationItemInputAudioTranscriptionFailedEventSchema = z.object({
type: z.literal('conversation.item.input_audio_transcription.failed'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
error: z.object({
code: z.string().optional(),
message: z.string().optional(),
param: z.string().optional(),
type: z.string().optional(),
}),
});
export const conversationItemRetrievedEventSchema = z.object({
type: z.literal('conversation.item.retrieved'),
event_id: z.string(),
item: conversationItemSchema,
});
export const conversationItemTruncatedEventSchema = z.object({
type: z.literal('conversation.item.truncated'),
event_id: z.string(),
item_id: z.string(),
audio_end_ms: z.number(),
content_index: z.number(),
});
export const conversationItemCreateEventSchema = z.object({
type: z.literal('conversation.item.create'),
item: conversationItemSchema,
event_id: z.string().optional(),
previous_item_id: z.string().nullable().optional(),
});
export const conversationItemDeleteEventSchema = z.object({
type: z.literal('conversation.item.delete'),
item_id: z.string(),
event_id: z.string().optional(),
});
export const conversationItemRetrieveEventSchema = z.object({
type: z.literal('conversation.item.retrieve'),
item_id: z.string(),
event_id: z.string().optional(),
});
export const conversationItemTruncateEventSchema = z.object({
type: z.literal('conversation.item.truncate'),
item_id: z.string(),
audio_end_ms: z.number(),
content_index: z.number(),
event_id: z.string().optional(),
});
export const errorEventSchema = z.object({
type: z.literal('error'),
event_id: z.string().optional(),
error: z.any().optional(),
});
export const inputAudioBufferClearedEventSchema = z.object({
type: z.literal('input_audio_buffer.cleared'),
event_id: z.string(),
});
export const inputAudioBufferAppendEventSchema = z.object({
type: z.literal('input_audio_buffer.append'),
audio: z.string(),
event_id: z.string().optional(),
});
export const inputAudioBufferClearEventSchema = z.object({
type: z.literal('input_audio_buffer.clear'),
event_id: z.string().optional(),
});
export const inputAudioBufferCommitEventSchema = z.object({
type: z.literal('input_audio_buffer.commit'),
event_id: z.string().optional(),
});
export const inputAudioBufferCommittedEventSchema = z.object({
type: z.literal('input_audio_buffer.committed'),
event_id: z.string(),
item_id: z.string(),
previous_item_id: z.string().nullable().optional(),
});
export const inputAudioBufferSpeechStartedEventSchema = z.object({
type: z.literal('input_audio_buffer.speech_started'),
event_id: z.string(),
item_id: z.string(),
audio_start_ms: z.number(),
});
export const inputAudioBufferSpeechStoppedEventSchema = z.object({
type: z.literal('input_audio_buffer.speech_stopped'),
event_id: z.string(),
item_id: z.string(),
audio_end_ms: z.number(),
});
export const outputAudioBufferStartedEventSchema = z
.object({
type: z.literal('output_audio_buffer.started'),
event_id: z.string(),
})
.passthrough();
export const outputAudioBufferStoppedEventSchema = z
.object({
type: z.literal('output_audio_buffer.stopped'),
event_id: z.string(),
})
.passthrough();
export const outputAudioBufferClearedEventSchema = z.object({
type: z.literal('output_audio_buffer.cleared'),
event_id: z.string(),
});
export const rateLimitsUpdatedEventSchema = z.object({
type: z.literal('rate_limits.updated'),
event_id: z.string(),
rate_limits: z.array(z.object({
limit: z.number().optional(),
name: z.enum(['requests', 'tokens']).optional(),
remaining: z.number().optional(),
reset_seconds: z.number().optional(),
})),
});
export const responseAudioDeltaEventSchema = z.object({
type: z.literal('response.output_audio.delta'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
delta: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const responseAudioDoneEventSchema = z.object({
type: z.literal('response.output_audio.done'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
output_index: z.number(),
response_id: z.string(),
});
export const responseAudioTranscriptDeltaEventSchema = z.object({
type: z.literal('response.output_audio_transcript.delta'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
delta: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const responseAudioTranscriptDoneEventSchema = z.object({
// GA may introduce response.output_audio_transcript.done
type: z.literal('response.output_audio_transcript.done'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
transcript: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const responseContentPartAddedEventSchema = z.object({
type: z.literal('response.content_part.added'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
output_index: z.number(),
response_id: z.string(),
part: z.object({
audio: z.string().optional(),
text: z.string().optional(),
transcript: z.string().optional(),
type: z.enum(['text', 'audio']).optional(),
}),
});
export const responseContentPartDoneEventSchema = z.object({
type: z.literal('response.content_part.done'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
output_index: z.number(),
response_id: z.string(),
part: z.object({
audio: z.string().optional(),
text: z.string().optional(),
transcript: z.string().optional(),
type: z.enum(['text', 'audio']).optional(),
}),
});
export const responseCreatedEventSchema = z.object({
type: z.literal('response.created'),
event_id: z.string(),
response: realtimeResponse,
});
export const responseDoneEventSchema = z.object({
type: z.literal('response.done'),
event_id: z.string(),
response: realtimeResponse,
});
export const responseFunctionCallArgumentsDeltaEventSchema = z.object({
type: z.literal('response.function_call_arguments.delta'),
event_id: z.string(),
item_id: z.string(),
call_id: z.string(),
delta: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const responseFunctionCallArgumentsDoneEventSchema = z.object({
type: z.literal('response.function_call_arguments.done'),
event_id: z.string(),
item_id: z.string(),
call_id: z.string(),
arguments: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const responseOutputItemAddedEventSchema = z.object({
type: z.literal('response.output_item.added'),
event_id: z.string(),
item: conversationItemSchema,
output_index: z.number(),
response_id: z.string(),
});
export const responseOutputItemDoneEventSchema = z.object({
type: z.literal('response.output_item.done'),
event_id: z.string(),
item: conversationItemSchema,
output_index: z.number(),
response_id: z.string(),
});
export const responseTextDeltaEventSchema = z.object({
type: z.literal('response.output_text.delta'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
delta: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const responseTextDoneEventSchema = z.object({
// No rename specified for done; keep response.text.done
type: z.literal('response.output_text.done'),
event_id: z.string(),
item_id: z.string(),
content_index: z.number(),
text: z.string(),
output_index: z.number(),
response_id: z.string(),
});
export const sessionCreatedEventSchema = z.object({
type: z.literal('session.created'),
event_id: z.string(),
session: z.any(),
});
export const sessionUpdatedEventSchema = z.object({
type: z.literal('session.updated'),
event_id: z.string(),
session: z.any(),
});
export const responseCancelEventSchema = z.object({
type: z.literal('response.cancel'),
event_id: z.string().optional(),
response_id: z.string().optional(),
});
export const responseCreateEventSchema = z.object({
type: z.literal('response.create'),
event_id: z.string().optional(),
response: z.any().optional(),
});
export const sessionUpdateEventSchema = z.object({
type: z.literal('session.update'),
event_id: z.string().optional(),
session: z.any(),
});
export const mcpListToolsInProgressEventSchema = z.object({
type: z.literal('mcp_list_tools.in_progress'),
event_id: z.string().optional(),
item_id: z.string().optional(),
});
export const mcpListToolsCompletedEventSchema = z.object({
type: z.literal('mcp_list_tools.completed'),
event_id: z.string().optional(),
item_id: z.string().optional(),
});
export const responseMcpCallArgumentsDeltaEventSchema = z.object({
type: z.literal('response.mcp_call_arguments.delta'),
event_id: z.string(),
response_id: z.string(),
item_id: z.string(),
output_index: z.number(),
delta: z.string(),
obfuscation: z.string(),
});
export const responseMcpCallArgumentsDoneEventSchema = z.object({
type: z.literal('response.mcp_call_arguments.done'),
event_id: z.string(),
response_id: z.string(),
item_id: z.string(),
output_index: z.number(),
arguments: z.string(),
});
export const responseMcpCallInProgressEventSchema = z.object({
type: z.literal('response.mcp_call.in_progress'),
event_id: z.string(),
output_index: z.number(),
item_id: z.string(),
});
export const responseMcpCallCompletedEventSchema = z.object({
type: z.literal('response.mcp_call.completed'),
event_id: z.string(),
output_index: z.number(),
item_id: z.string(),
});
export const mcpListToolsFailedEventSchema = z.object({
type: z.literal('mcp_list_tools.failed'),
event_id: z.string().optional(),
item_id: z.string().optional(),
});
/**
* This schema is used if an event is unknown to the client. The Realtime API might introduce
* new events at some point and we should handle them gracefully by treating them as generic events
* only requiring a type and an optional event_id.
*/
export const genericEventSchema = z
.object({
type: z.string(),
event_id: z.string().optional().nullable(),
})
.passthrough();
export const realtimeServerEventSchema = z.discriminatedUnion('type', [
conversationCreatedEventSchema,
conversationItemAddedEventSchema,
conversationItemDoneEventSchema,
conversationItemDeletedEventSchema,
conversationItemInputAudioTranscriptionCompletedEventSchema,
conversationItemInputAudioTranscriptionDeltaEventSchema,
conversationItemInputAudioTranscriptionFailedEventSchema,
conversationItemRetrievedEventSchema,
conversationItemTruncatedEventSchema,
errorEventSchema,
inputAudioBufferClearedEventSchema,
inputAudioBufferCommittedEventSchema,
inputAudioBufferSpeechStartedEventSchema,
inputAudioBufferSpeechStoppedEventSchema,
outputAudioBufferStartedEventSchema,
outputAudioBufferStoppedEventSchema,
outputAudioBufferClearedEventSchema,
rateLimitsUpdatedEventSchema,
responseAudioDeltaEventSchema,
responseAudioDoneEventSchema,
responseAudioTranscriptDeltaEventSchema,
responseAudioTranscriptDoneEventSchema,
responseContentPartAddedEventSchema,
responseContentPartDoneEventSchema,
responseCreatedEventSchema,
responseDoneEventSchema,
responseFunctionCallArgumentsDeltaEventSchema,
responseFunctionCallArgumentsDoneEventSchema,
responseOutputItemAddedEventSchema,
responseOutputItemDoneEventSchema,
responseTextDeltaEventSchema,
responseTextDoneEventSchema,
sessionCreatedEventSchema,
sessionUpdatedEventSchema,
mcpListToolsInProgressEventSchema,
mcpListToolsCompletedEventSchema,
mcpListToolsFailedEventSchema,
responseMcpCallArgumentsDeltaEventSchema,
responseMcpCallArgumentsDoneEventSchema,
responseMcpCallInProgressEventSchema,
responseMcpCallCompletedEventSchema,
]);
export const realtimeClientEventSchema = z.discriminatedUnion('type', [
conversationItemCreateEventSchema,
conversationItemDeleteEventSchema,
conversationItemRetrieveEventSchema,
conversationItemTruncateEventSchema,
inputAudioBufferAppendEventSchema,
inputAudioBufferClearEventSchema,
inputAudioBufferCommitEventSchema,
responseCancelEventSchema,
responseCreateEventSchema,
sessionUpdateEventSchema,
]);
/**
* Parses a realtime event from the server. If the event is unknown to the client, it will be treated as a generic event.
* @param event - The event to parse.
* @returns The parsed event or null if the event is unknown to the client.
*/
export function parseRealtimeEvent(event) {
const raw = JSON.parse(event.data.toString());
const parsed = realtimeServerEventSchema.safeParse(raw);
if (!parsed.success) {
const genericParsed = genericEventSchema.safeParse(raw);
if (genericParsed.success) {
return { data: genericParsed.data, isGeneric: true };
}
return { data: null, isGeneric: true };
}
return { data: parsed.data, isGeneric: false };
}
//# sourceMappingURL=openaiRealtimeEvents.mjs.map