@openai/agents-realtime
The OpenAI Agents SDK is a lightweight yet powerful framework for building multi-agent workflows. This package contains the logic for building realtime voice agents on the server or in the browser.
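For orientation before the source below, here is a minimal usage sketch. It assumes the package's `RealtimeAgent` and `RealtimeSession` exports and a placeholder key; adapt the key handling to your own setup.

import { RealtimeAgent, RealtimeSession } from '@openai/agents-realtime';

// Hypothetical minimal setup for a browser voice agent.
const agent = new RealtimeAgent({
  name: 'Assistant',
  instructions: 'You are a helpful voice assistant.',
});

const session = new RealtimeSession(agent);
// In the browser, connect with a client ephemeral key minted by your server.
await session.connect({ apiKey: '<ephemeral-client-key>' });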
JavaScript
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAIRealtimeBase = exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG = exports.DEFAULT_OPENAI_REALTIME_MODEL = void 0;
const agents_core_1 = require("@openai/agents-core");
const clientMessages_1 = require("./clientMessages.js");
const items_1 = require("./items.js");
const logger_1 = __importDefault(require("./logger.js"));
const openaiRealtimeEvents_1 = require("./openaiRealtimeEvents.js");
const utils_1 = require("./utils.js");
const utils_2 = require("@openai/agents-core/utils");
/**
* The default model that is used during the connection if no model is provided.
*/
exports.DEFAULT_OPENAI_REALTIME_MODEL = 'gpt-realtime';
/**
* The default session config that gets sent over during session connection unless overridden
* by the user.
*/
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG = {
outputModalities: ['audio'],
audio: {
input: {
format: { type: 'audio/pcm', rate: 24000 },
transcription: { model: 'gpt-4o-mini-transcribe' },
turnDetection: { type: 'semantic_vad' },
noiseReduction: null,
},
output: {
format: { type: 'audio/pcm', rate: 24000 },
speed: 1,
},
},
};
class OpenAIRealtimeBase extends utils_2.EventEmitterDelegate {
#model;
#apiKey;
#tracingConfig = null;
#rawSessionConfig = null;
eventEmitter = new agents_core_1.RuntimeEventEmitter();
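/**
 * @param options - Transport options. `model` defaults to
 * `DEFAULT_OPENAI_REALTIME_MODEL`; `apiKey` may be a string or an async
 * function that returns the key (resolved in `_getApiKey`).
 */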
constructor(options = {}) {
super();
this.#model = options.model ?? exports.DEFAULT_OPENAI_REALTIME_MODEL;
this.#apiKey = options.apiKey;
}
/**
* The current model that is being used by the transport layer.
*/
get currentModel() {
return this.#model;
}
/**
* Sets the model used by the transport layer.
* **Note**: The model cannot be changed mid-conversation.
*/
set currentModel(model) {
this.#model = model;
}
get _rawSessionConfig() {
return this.#rawSessionConfig ?? null;
}
async _getApiKey(options) {
const apiKey = options.apiKey ?? this.#apiKey;
if (typeof apiKey === 'function') {
return await apiKey();
}
return apiKey;
}
_onMessage(event) {
const { data: parsed, isGeneric } = (0, openaiRealtimeEvents_1.parseRealtimeEvent)(event);
if (parsed === null) {
return;
}
this.emit('*', parsed);
if (isGeneric) {
return;
}
if (parsed.type === 'error') {
this.emit('error', { type: 'error', error: parsed });
}
else {
this.emit(parsed.type, parsed);
}
if (parsed.type === 'response.created') {
this.emit('turn_started', {
type: 'response_started',
providerData: {
...parsed,
},
});
return;
}
if (parsed.type === 'session.updated') {
this.#rawSessionConfig = parsed.session;
}
if (parsed.type === 'response.done') {
const response = openaiRealtimeEvents_1.responseDoneEventSchema.safeParse(parsed);
if (!response.success) {
logger_1.default.error('Error parsing response done event', response.error);
return;
}
const inputTokens = response.data.response.usage?.input_tokens ?? 0;
const outputTokens = response.data.response.usage?.output_tokens ?? 0;
const totalTokens = inputTokens + outputTokens;
const usage = new agents_core_1.Usage({
inputTokens,
inputTokensDetails: response.data.response.usage?.input_token_details ?? {},
outputTokens,
outputTokensDetails: response.data.response.usage?.output_token_details ?? {},
totalTokens,
});
this.emit('usage_update', usage);
this.emit('turn_done', {
type: 'response_done',
response: {
id: response.data.response.id ?? '',
output: response.data.response.output ?? [],
usage: {
inputTokens,
inputTokensDetails: response.data.response.usage?.input_token_details ?? {},
outputTokens,
outputTokensDetails: response.data.response.usage?.output_token_details ?? {},
totalTokens,
},
},
});
return;
}
if (parsed.type === 'response.output_audio.done') {
this.emit('audio_done');
return;
}
if (parsed.type === 'conversation.item.deleted') {
this.emit('item_deleted', {
itemId: parsed.item_id,
});
return;
}
if (parsed.type === 'conversation.item.input_audio_transcription.completed' ||
parsed.type === 'conversation.item.truncated') {
// Rather than tracking partials and rebuilding the item ourselves, we retrieve
// it from the server, which triggers the `conversation.item.retrieved` event handled below.
this.sendEvent({
type: 'conversation.item.retrieve',
item_id: parsed.item_id,
});
return;
}
if (parsed.type === 'conversation.item.input_audio_transcription.delta' ||
parsed.type === 'response.output_text.delta' ||
parsed.type === 'response.output_audio_transcript.delta' ||
parsed.type === 'response.function_call_arguments.delta') {
if (parsed.type === 'response.output_audio_transcript.delta') {
this.emit('audio_transcript_delta', {
type: 'transcript_delta',
delta: parsed.delta,
itemId: parsed.item_id,
responseId: parsed.response_id,
});
}
// no support for partial transcripts yet.
return;
}
if (parsed.type === 'conversation.item.added' ||
parsed.type === 'conversation.item.done' ||
parsed.type === 'conversation.item.retrieved') {
// Handle MCP list tools items (only act when done to ensure tools are present)
if (parsed.item.type === 'mcp_list_tools' &&
parsed.type === 'conversation.item.done') {
const serverLabel = parsed.item.server_label ?? '';
const tools = (parsed.item.tools ?? []);
try {
this.emit('mcp_tools_listed', {
serverLabel,
tools,
});
}
catch (err) {
logger_1.default.error('Error emitting mcp_tools_listed', err, parsed.item);
}
// We do not add this item to history; it's a transport-level side-channel.
return;
}
if (parsed.item.type === 'message') {
const previousItemId = parsed.type === 'conversation.item.added' ||
parsed.type === 'conversation.item.done'
? parsed.previous_item_id
: null;
const item = items_1.realtimeMessageItemSchema.parse({
itemId: parsed.item.id,
previousItemId,
type: parsed.item.type,
role: parsed.item.role,
content: parsed.item.content,
status: parsed.item.status,
});
this.emit('item_update', item);
return;
}
if (parsed.item.type === 'mcp_approval_request' &&
parsed.type === 'conversation.item.done') {
const item = parsed.item;
const mcpApprovalRequest = items_1.realtimeMcpCallApprovalRequestItem.parse({
itemId: item.id,
type: item.type,
serverLabel: item.server_label,
name: item.name,
arguments: JSON.parse(item.arguments || '{}'),
approved: item.approved,
});
this.emit('item_update', mcpApprovalRequest);
this.emit('mcp_approval_request', mcpApprovalRequest);
return;
}
if (parsed.item.type === 'mcp_tool_call' ||
parsed.item.type === 'mcp_call') {
const status = parsed.type === 'conversation.item.done'
? 'completed'
: 'in_progress';
const mcpCall = items_1.realtimeMcpCallItem.parse({
itemId: parsed.item.id,
type: parsed.item.type,
status,
arguments: parsed.item.arguments,
name: parsed.item.name,
output: parsed.item.output,
});
this.emit('item_update', mcpCall);
if (parsed.type === 'conversation.item.done') {
this.emit('mcp_tool_call_completed', mcpCall);
}
return;
}
}
if (parsed.type === 'response.mcp_call.in_progress') {
const item = parsed;
this.sendEvent({
type: 'conversation.item.retrieve',
item_id: item.item_id,
});
return;
}
if (parsed.type === 'mcp_list_tools.in_progress') {
const item = parsed;
if (item.item_id) {
this.sendEvent({
type: 'conversation.item.retrieve',
item_id: item.item_id,
});
}
return;
}
if (parsed.type === 'response.output_item.done' ||
parsed.type === 'response.output_item.added') {
const item = parsed.item;
if (item.type === 'function_call' && item.status === 'completed') {
const toolCall = items_1.realtimeToolCallItem.parse({
itemId: item.id,
type: item.type,
status: 'in_progress', // shown as in_progress in the UI; it only completes once the output arrives
arguments: item.arguments,
name: item.name,
output: null,
});
this.emit('item_update', toolCall);
this.emit('function_call', {
id: item.id,
type: 'function_call',
callId: item.call_id ?? '',
arguments: item.arguments ?? '',
name: item.name ?? '',
});
return;
}
if (item.type === 'mcp_tool_call' || item.type === 'mcp_call') {
const mcpCall = items_1.realtimeMcpCallItem.parse({
itemId: item.id,
type: item.type,
status: parsed.type === 'response.output_item.done'
? 'completed'
: 'in_progress', // shown as in_progress in the UI; it only completes once the output arrives
arguments: item.arguments,
name: item.name,
output: item.output,
});
this.emit('item_update', mcpCall);
return;
}
if (item.type === 'message') {
const realtimeItem = items_1.realtimeMessageItemSchema.parse({
itemId: parsed.item.id,
type: parsed.item.type,
role: parsed.item.role,
content: parsed.item.content,
status: parsed.type === 'response.output_item.done'
? (item.status ?? 'completed')
: (item.status ?? 'in_progress'),
});
this.emit('item_update', realtimeItem);
return;
}
}
}
_onError(error) {
this.emit('error', {
type: 'error',
error,
});
}
_onOpen() {
this.emit('connected');
}
_onClose() {
this.emit('disconnected');
}
/**
* Send a message to the Realtime API. This will create a new item in the conversation and
* trigger a response.
*
* @param message - The message to send.
* @param otherEventData - Additional event data to send.
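*
* @example
* // A sketch on a connected transport subclass instance (`transport` is
* // hypothetical; this base class does not implement the connection itself).
* transport.sendMessage('What is on my calendar today?');
* // Create the item without triggering a model response:
* transport.sendMessage('Some extra context', {}, { triggerResponse: false });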
*/
sendMessage(message, otherEventData, { triggerResponse = true } = {}) {
const content = typeof message === 'string'
? [
{
type: 'input_text',
text: message,
},
]
: message.content.map((content) => {
if (content.type === 'input_image') {
return {
type: 'input_image',
image_url: content.image,
...(content.providerData ?? {}),
};
}
return content;
});
this.sendEvent({
type: 'conversation.item.create',
item: {
type: 'message',
role: 'user',
content,
},
...otherEventData,
});
if (triggerResponse) {
this.sendEvent({
type: 'response.create',
});
}
}
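/**
 * Send an image to the Realtime API as a user message.
 *
 * @param image - The image to send, as a data/image URL string.
 * @param options - Pass `{ triggerResponse: false }` to add the image without triggering a response.
 */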
addImage(image, { triggerResponse = true } = {}) {
this.sendMessage({
type: 'message',
role: 'user',
content: [{ type: 'input_image', image }],
}, {}, { triggerResponse });
}
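/**
 * Merge a partial session config with the SDK defaults into the snake_case
 * session payload the Realtime API expects. Tracing is handled separately in
 * `_updateTracingConfig` and deliberately omitted here.
 */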
_getMergedSessionConfig(config) {
const newConfig = (0, clientMessages_1.toNewSessionConfig)(config);
const sessionData = {
type: 'realtime',
instructions: newConfig.instructions,
model: newConfig.model ?? this.#model,
output_modalities: newConfig.outputModalities ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.outputModalities,
audio: {
input: {
format: newConfig.audio?.input?.format ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.format,
noise_reduction: newConfig.audio?.input?.noiseReduction ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.noiseReduction,
transcription: newConfig.audio?.input?.transcription ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.transcription,
turn_detection: OpenAIRealtimeBase.buildTurnDetectionConfig(newConfig.audio?.input?.turnDetection) ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.input?.turnDetection,
},
output: {
format: newConfig.audio?.output?.format ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.output?.format,
voice: newConfig.audio?.output?.voice ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.output?.voice,
speed: newConfig.audio?.output?.speed ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.audio?.output?.speed,
},
},
tool_choice: newConfig.toolChoice ??
exports.DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.toolChoice,
// Tracing is deliberately not set here so that it is not overridden on every
// session.update, which could lead to errors.
...(newConfig.providerData ?? {}),
};
if (newConfig.prompt) {
sessionData.prompt = {
id: newConfig.prompt.promptId,
version: newConfig.prompt.version,
variables: newConfig.prompt.variables,
};
}
if (newConfig.tools && newConfig.tools.length > 0) {
sessionData.tools = newConfig.tools.map((tool) => ({
...tool,
strict: undefined,
}));
}
return sessionData;
}
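/**
 * Normalize a turn detection config, accepting both camelCase and snake_case
 * keys and emitting the snake_case form the API expects. Undefined entries are
 * stripped; an empty result becomes `undefined`.
 *
 * @example
 * // Both spellings normalize to the same payload:
 * // buildTurnDetectionConfig({ type: 'server_vad', silenceDurationMs: 500 })
 * // => { type: 'server_vad', silence_duration_ms: 500 }
 */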
static buildTurnDetectionConfig(c) {
if (typeof c === 'undefined') {
return undefined;
}
const { type, createResponse, create_response, eagerness, interruptResponse, interrupt_response,
prefixPaddingMs, prefix_padding_ms, silenceDurationMs, silence_duration_ms, threshold,
idleTimeoutMs, idle_timeout_ms, ...rest } = c;
const config = {
type,
create_response: createResponse ? createResponse : create_response,
eagerness,
interrupt_response: interruptResponse
? interruptResponse
: interrupt_response,
prefix_padding_ms: prefixPaddingMs ? prefixPaddingMs : prefix_padding_ms,
silence_duration_ms: silenceDurationMs
? silenceDurationMs
: silence_duration_ms,
idle_timeout_ms: idleTimeoutMs ? idleTimeoutMs : idle_timeout_ms,
threshold,
...rest,
};
// Remove undefined values from the config
Object.keys(config).forEach((key) => {
if (config[key] === undefined)
delete config[key];
});
return Object.keys(config).length > 0 ? config : undefined;
}
/**
* Sets the internal tracing config. This is used to track the tracing config that has been set
* during the session.create event.
*/
set _tracingConfig(tracingConfig) {
this.#tracingConfig = tracingConfig;
}
/**
* Sets the tracing config for the session. This will send the tracing config to the Realtime API.
*
* @param tracingConfig - The tracing config to set. 'auto' is not supported here, as the SDK will always configure a workflow name unless one already exists.
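*
* @example
* // Shape of a non-'auto' tracing config (field names taken from the checks below):
* // { workflow_name: 'My voice workflow', group_id: 'group_123', metadata: { env: 'dev' } }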
*/
_updateTracingConfig(tracingConfig) {
if (typeof this.#tracingConfig === 'undefined') {
// treating it as default value
this.#tracingConfig = null;
}
if (tracingConfig === 'auto') {
// turn on tracing in auto mode
this.sendEvent({
type: 'session.update',
session: {
type: 'realtime',
tracing: 'auto',
},
});
return;
}
if (this.#tracingConfig !== null &&
typeof this.#tracingConfig !== 'string' &&
typeof tracingConfig !== 'string') {
// tracing is already set, we can't change it
logger_1.default.warn('Tracing config is already set, skipping setting it again. This likely happens when you already set a tracing config on session creation.');
return;
}
if (tracingConfig === null) {
logger_1.default.debug('Disabling tracing for this session. It cannot be turned on for this session from this point on.');
this.sendEvent({
type: 'session.update',
session: {
type: 'realtime',
tracing: null,
},
});
return;
}
if (this.#tracingConfig === null ||
typeof this.#tracingConfig === 'string') {
// tracing is currently not set so we can set it to the new value
this.sendEvent({
type: 'session.update',
session: {
type: 'realtime',
tracing: tracingConfig,
},
});
return;
}
if (tracingConfig?.group_id !== this.#tracingConfig?.group_id ||
tracingConfig?.metadata !== this.#tracingConfig?.metadata ||
tracingConfig?.workflow_name !== this.#tracingConfig?.workflow_name) {
logger_1.default.warn('Mismatch in tracing config. Ignoring the new tracing config. This likely happens when you already set a tracing config on session creation. Current tracing config: %s, new tracing config: %s', JSON.stringify(this.#tracingConfig), JSON.stringify(tracingConfig));
return;
}
this.sendEvent({
type: 'session.update',
session: {
type: 'realtime',
tracing: tracingConfig,
},
});
}
/**
* Updates the session config. This will merge it with the default session config
* values and send the result to the Realtime API.
*
* @param config - The session config to update.
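*
* @example
* // A sketch; `transport` stands in for a connected transport instance.
* transport.updateSessionConfig({
*   instructions: 'Answer concisely.',
*   audio: { output: { speed: 1.2 } },
* });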
*/
updateSessionConfig(config) {
const sessionData = this._getMergedSessionConfig(config);
this.sendEvent({
type: 'session.update',
session: sessionData,
});
}
/**
* Send the output of a function call to the Realtime API.
*
* @param toolCall - The tool call to send the output for.
* @param output - The output of the function call.
* @param startResponse - Whether to start a new response after sending the output.
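*
* @example
* // A sketch: `toolCall` is the payload previously received from the
* // `function_call` event emitted by this transport.
* transport.sendFunctionCallOutput(toolCall, JSON.stringify({ temperature: '72F' }), true);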
*/
sendFunctionCallOutput(toolCall, output, startResponse = true) {
this.sendEvent({
type: 'conversation.item.create',
item: {
type: 'function_call_output',
output,
call_id: toolCall.callId,
},
});
try {
const item = items_1.realtimeToolCallItem.parse({
itemId: toolCall.id,
previousItemId: toolCall.previousItemId,
type: 'function_call',
status: 'completed',
arguments: toolCall.arguments,
name: toolCall.name,
output,
});
this.emit('item_update', item);
}
catch (error) {
logger_1.default.error('Error parsing tool call item', error, toolCall);
}
if (startResponse) {
this.sendEvent({
type: 'response.create',
});
}
}
/**
* Send an audio buffer to the Realtime API. If `{ commit: true }` is passed, the audio buffer
* will be committed and the model will start processing it. This is necessary if you have
* disabled turn detection / voice activity detection (VAD).
*
* @param audio - The audio buffer to send.
* @param options - The options for the audio buffer.
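*
* @example
* // A sketch: stream 24kHz PCM16 chunks; commit manually when VAD is disabled.
* transport.sendAudio(pcmChunk); // pcmChunk: ArrayBuffer
* transport.sendAudio(finalChunk, { commit: true });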
*/
sendAudio(audio, { commit = false } = {}) {
this.sendEvent({
type: 'input_audio_buffer.append',
audio: (0, utils_1.arrayBufferToBase64)(audio),
});
if (commit) {
this.sendEvent({
type: 'input_audio_buffer.commit',
});
}
}
/**
* Reset the history of the conversation. This will create a diff between the old and new history
* and send the necessary events to the Realtime API to update the history.
*
* @param oldHistory - The old history of the conversation.
* @param newHistory - The new history of the conversation.
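*
* @example
* // A sketch: remove the last item by passing a trimmed copy of the history.
* transport.resetHistory(currentHistory, currentHistory.slice(0, -1));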
*/
resetHistory(oldHistory, newHistory) {
const { removals, additions, updates } = (0, utils_1.diffRealtimeHistory)(oldHistory, newHistory);
const removalIds = new Set(removals.map((item) => item.itemId));
// there is no update event for items, so we remove and re-add anything that changed
for (const update of updates) {
removalIds.add(update.itemId);
}
if (removalIds.size > 0) {
for (const itemId of removalIds) {
this.sendEvent({
type: 'conversation.item.delete',
item_id: itemId,
});
}
}
const additionsAndUpdates = [...additions, ...updates];
for (const addition of additionsAndUpdates) {
if (addition.type === 'message') {
const itemEntry = {
type: 'message',
role: addition.role,
content: addition.content,
id: addition.itemId,
};
if (addition.role !== 'system' && addition.status) {
itemEntry.status = addition.status;
}
this.sendEvent({
type: 'conversation.item.create',
item: itemEntry,
});
}
else if (addition.type === 'function_call') {
logger_1.default.warn('Function calls cannot be manually added or updated at the moment. Ignoring.');
}
}
}
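/**
 * Respond to an MCP approval request by creating an `mcp_approval_response`
 * item directly after the request in the conversation.
 *
 * @param approvalRequest - The approval request item being answered.
 * @param approved - Whether the tool call was approved.
 */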
sendMcpResponse(approvalRequest, approved) {
this.sendEvent({
type: 'conversation.item.create',
previous_item_id: approvalRequest.itemId,
item: {
type: 'mcp_approval_response',
approval_request_id: approvalRequest.itemId,
approve: approved,
},
});
}
}
exports.OpenAIRealtimeBase = OpenAIRealtimeBase;
//# sourceMappingURL=openaiRealtimeBase.js.map