@openai/agents-realtime
Version:
The OpenAI Agents SDK is a lightweight yet powerful framework for building multi-agent workflows. This package contains the logic for building realtime voice agents on the server or in the browser.
669 lines • 29.4 kB
JavaScript
import { getTransferMessage, ModelBehaviorError, OutputGuardrailTripwireTriggered, RunContext, RunToolApprovalItem, } from '@openai/agents-core';
import { RuntimeEventEmitter } from '@openai/agents-core/_shims';
import { isZodObject, toSmartString } from '@openai/agents-core/utils';
import { defineRealtimeOutputGuardrail, getRealtimeGuardrailFeedbackMessage, getRealtimeGuardrailSettings, } from "./guardrail.mjs";
import { DEFAULT_OPENAI_REALTIME_SESSION_CONFIG, } from "./openaiRealtimeBase.mjs";
import { OpenAIRealtimeWebRTC } from "./openaiRealtimeWebRtc.mjs";
import { OpenAIRealtimeWebSocket } from "./openaiRealtimeWebsocket.mjs";
import { approvalItemToRealtimeApprovalItem, getLastTextFromAudioOutputMessage, hasWebRTCSupport, realtimeApprovalItemToApprovalItem, updateRealtimeHistory, } from "./utils.mjs";
import logger from "./logger.mjs";
import { isBackgroundResult, isValidRealtimeTool, toRealtimeToolDefinition, } from "./tool.mjs";
function cloneDefaultSessionConfig() {
    // Serialize and re-parse the default config so each session starts from its
    // own mutable deep copy instead of sharing the module-level constant.
    const serialized = JSON.stringify(DEFAULT_OPENAI_REALTIME_SESSION_CONFIG);
    return JSON.parse(serialized);
}
/**
* A `RealtimeSession` is the cornerstone of building Voice Agents. It's the equivalent of a
* Runner in text-based agents except that it automatically handles multiple turns by maintaining a
* connection with the underlying transport layer.
*
* The session handles managing the local history copy, executes tools, runs output guardrails, and
* facilitates handoffs.
*
* The actual audio handling and generation of model responses is handled by the underlying
* transport layer. By default, if you are using a browser with WebRTC support, the session will
* automatically use the WebRTC version of the OpenAI Realtime API. On the server or if you pass
* `websocket` as the transport layer, the session will establish a connection using WebSockets.
*
* In the case of WebRTC, in the browser, the transport layer will also automatically configure the
* microphone and audio output to be used by the session.
*
* You can also create a transport layer instance yourself and pass it in to have more control over
* the configuration or even extend the existing ones. Check out the `TwilioRealtimeTransportLayer`
* for an example of how to create a custom transport layer.
*
* @example
* ```ts
* const agent = new RealtimeAgent({
* name: 'my-agent',
* instructions: 'You are a helpful assistant that can answer questions and help with tasks.',
* })
*
* const session = new RealtimeSession(agent);
* session.connect({
* apiKey: 'your-api-key',
* });
* ```
*/
export class RealtimeSession extends RuntimeEventEmitter {
    /** The agent the session was started with, before any handoffs. */
    initialAgent;
    /** The options this session was constructed with. */
    options;
    // Underlying transport layer (WebRTC, WebSocket, or a user-supplied instance).
    #transport;
    // The agent currently driving the conversation; changes on handoff.
    #currentAgent;
    // Realtime tool definitions (agent tools + handoff tools) for the current
    // agent, or `undefined` when the agent defines neither tools nor handoffs.
    #currentTools;
    // RunContext shared with tools and guardrails; its `history` mirrors #history.
    #context;
    #outputGuardrails = [];
    #outputGuardrailSettings;
    // Accumulated transcript text per output item id, used for debounced
    // guardrail runs while the model is still producing output.
    #transcribedTextDeltas = {};
    #history = [];
    // Whether raw audio data should be retained in history items.
    #shouldIncludeAudioData;
    // Response ids that already tripped a guardrail, so overlapping guardrail
    // runs for the same response interrupt at most once.
    #interruptedByGuardrail = {};
    #audioStarted = false;
    // Tracks all MCP tools fetched per server label (from mcp_list_tools results).
    #allMcpToolsByServer = new Map();
    // Tracks currently available MCP tools based on the active agent's configured server_labels.
    #availableMcpTools = [];
    // Keeps track of the last full session config we sent (camelCase keys) so that
    // subsequent updates (e.g. during agent handoffs) preserve properties that are
    // not explicitly recalculated here (such as inputAudioFormat, outputAudioFormat,
    // modalities, speed, toolChoice, turnDetection, etc.). Without this, updating
    // the agent would drop audio format overrides (e.g. g711_ulaw) and revert to
    // transport defaults causing issues for integrations like Twilio.
    #lastSessionConfig = cloneDefaultSessionConfig();
    #automaticallyTriggerResponseForMcpToolCalls = true;
    /**
     * @param initialAgent - The agent the session starts with.
     * @param options - Session options (transport, guardrails, config, context, ...).
     */
    constructor(initialAgent, options = {}) {
        super();
        this.initialAgent = initialAgent;
        this.options = options;
        // Pick the transport: WebRTC when supported (or explicitly requested),
        // WebSockets otherwise, unless a custom transport instance was provided.
        if ((typeof options.transport === 'undefined' && hasWebRTCSupport()) ||
            options.transport === 'webrtc') {
            this.#transport = new OpenAIRealtimeWebRTC();
        }
        else if (options.transport === 'websocket' ||
            typeof options.transport === 'undefined') {
            this.#transport = new OpenAIRealtimeWebSocket();
        }
        else {
            this.#transport = options.transport;
        }
        this.#currentAgent = initialAgent;
        this.#context = new RunContext({
            ...(options.context ?? {}),
            history: this.#history,
        });
        this.#outputGuardrails = (options.outputGuardrails ?? []).map(defineRealtimeOutputGuardrail);
        this.#outputGuardrailSettings = getRealtimeGuardrailSettings(options.outputGuardrailSettings ?? {});
        this.#shouldIncludeAudioData = options.historyStoreAudio ?? false;
        this.#automaticallyTriggerResponseForMcpToolCalls =
            options.automaticallyTriggerResponseForMcpToolCalls ?? true;
    }
    /**
     * The transport layer used by the session.
     */
    get transport() {
        return this.#transport;
    }
    /**
     * The current agent in the session.
     */
    get currentAgent() {
        return this.#currentAgent;
    }
    /**
     * The current usage of the session.
     */
    get usage() {
        return this.#context.usage;
    }
    /**
     * The current context of the session.
     */
    get context() {
        return this.#context;
    }
    /**
     * Whether the session is muted. Might be `null` if the underlying transport layer does not
     * support muting.
     */
    get muted() {
        return this.#transport.muted;
    }
    /**
     * The history of the session.
     */
    get history() {
        return this.#history;
    }
    /**
     * The MCP tools currently available to the active agent.
     */
    get availableMcpTools() {
        return this.#availableMcpTools;
    }
    /**
     * Makes `agent` the active agent and recomputes the tool set (agent tools,
     * handoff tools, and the derived MCP tool availability).
     */
    async #setCurrentAgent(agent) {
        this.#currentAgent = agent;
        const handoffs = await this.#currentAgent.getEnabledHandoffs(this.#context);
        const handoffTools = handoffs.map((handoff) => handoff.getHandoffAsFunctionTool());
        const allTools = (await this.#currentAgent.getAllTools(this.#context))
            .filter(isValidRealtimeTool)
            .map(toRealtimeToolDefinition);
        const hasToolsDefined = typeof this.#currentAgent.tools !== 'undefined' ||
            typeof this.#currentAgent.mcpServers !== 'undefined';
        const hasHandoffsDefined = handoffs.length > 0;
        // Keep `undefined` (rather than an empty list) when the agent declares
        // nothing, so we don't clobber transport-side tool configuration.
        this.#currentTools =
            hasToolsDefined || hasHandoffsDefined
                ? [...allTools, ...handoffTools]
                : undefined;
        // Recompute currently available MCP tools based on the new agent's active server labels.
        this.#updateAvailableMcpTools();
    }
    /**
     * Builds the full session config for the current agent, layering the last
     * config we sent, the user's `options.config`, and `additionalConfig`, then
     * overwriting the dynamic fields (instructions, voice, model, tools, tracing,
     * prompt). Also updates the `#lastSessionConfig` cache.
     */
    async #getSessionConfig(additionalConfig = {}) {
        const instructions = await this.#currentAgent.getSystemPrompt(this.#context);
        // Tracing: disabled -> null; workflowName set -> explicit config; else 'auto'.
        const tracingConfig = this.options
            .tracingDisabled
            ? null
            : this.options.workflowName
                ? {
                    workflow_name: this.options.workflowName,
                }
                : 'auto';
        if (tracingConfig !== null && tracingConfig !== 'auto') {
            if (this.options.groupId) {
                tracingConfig.group_id = this.options.groupId;
            }
            if (this.options.traceMetadata) {
                tracingConfig.metadata = this.options.traceMetadata;
            }
        }
        else if (this.options.groupId || this.options.traceMetadata) {
            logger.warn('In order to set traceMetadata or a groupId you need to specify a workflowName.');
        }
        // Start from any previously-sent config (so we preserve values like audio formats)
        // and the original options.config provided by the user. Preference order:
        // 1. Last session config we sent (#lastSessionConfig)
        // 2. Original options.config
        // 3. Additional config passed into this invocation (explicit overrides)
        // Finally we overwrite dynamic fields (instructions, voice, model, tools, tracing)
        // to ensure they always reflect the current agent & runtime state.
        const base = {
            ...(this.#lastSessionConfig ?? {}),
            ...(this.options.config ?? {}),
            ...(additionalConfig ?? {}),
        };
        // Note: Certain fields cannot be updated after the session begins, such as voice and model
        const fullConfig = {
            ...base,
            instructions,
            voice: this.#currentAgent.voice,
            model: this.options.model,
            tools: this.#currentTools,
            tracing: tracingConfig,
            prompt: typeof this.#currentAgent.prompt === 'function'
                ? await this.#currentAgent.prompt(this.#context, this.#currentAgent)
                : this.#currentAgent.prompt,
        };
        // Update our cache so subsequent updates inherit the full set including any
        // dynamic fields we just overwrote.
        this.#lastSessionConfig = fullConfig;
        return fullConfig;
    }
    /**
     * Manually switch the session to a new agent (emits the same handoff events
     * as a model-initiated handoff) and push the updated session config.
     * @param newAgent - The agent to switch to.
     * @returns The new agent.
     */
    async updateAgent(newAgent) {
        this.#currentAgent.emit('agent_handoff', this.#context, newAgent);
        this.emit('agent_handoff', this.#context, this.#currentAgent, newAgent);
        await this.#setCurrentAgent(newAgent);
        await this.#transport.updateSessionConfig(await this.#getSessionConfig());
        return newAgent;
    }
    /**
     * Executes a model-initiated handoff: resolves the target agent, updates the
     * session config, and reports the transfer back as the tool call's output.
     */
    async #handleHandoff(toolCall, handoff) {
        const newAgent = (await handoff.onInvokeHandoff(this.#context, toolCall.arguments));
        this.#currentAgent.emit('agent_handoff', this.#context, newAgent);
        this.emit('agent_handoff', this.#context, this.#currentAgent, newAgent);
        // update session with new agent
        await this.#setCurrentAgent(newAgent);
        await this.#transport.updateSessionConfig(await this.#getSessionConfig());
        const output = getTransferMessage(newAgent);
        this.#transport.sendFunctionCallOutput(toolCall, output, true);
        return newAgent;
    }
    /**
     * Runs a function tool call: parses arguments, enforces the tool's approval
     * policy (rejecting, requesting approval, or proceeding), invokes the tool,
     * and sends its output back through the transport. Background results do not
     * trigger a new model response.
     */
    async #handleFunctionToolCall(toolCall, tool) {
        this.#context.context.history = JSON.parse(JSON.stringify(this.#history)); // deep copy of the history
        let parsedArgs = toolCall.arguments;
        if (tool.parameters) {
            if (isZodObject(tool.parameters)) {
                parsedArgs = tool.parameters.parse(parsedArgs);
            }
            else {
                parsedArgs = JSON.parse(parsedArgs);
            }
        }
        const needsApproval = await tool.needsApproval(this.#context, parsedArgs, toolCall.callId);
        if (needsApproval) {
            const approval = this.context.isToolApproved({
                toolName: tool.name,
                callId: toolCall.callId,
            });
            if (approval === false) {
                // Explicitly rejected: still emit start/end events so observers see
                // the full lifecycle, but report the refusal instead of running.
                this.emit('agent_tool_start', this.#context, this.#currentAgent, tool, {
                    toolCall,
                });
                this.#currentAgent.emit('agent_tool_start', this.#context, tool, {
                    toolCall,
                });
                const result = 'Tool execution was not approved.';
                this.#transport.sendFunctionCallOutput(toolCall, result, true);
                this.emit('agent_tool_end', this.#context, this.#currentAgent, tool, result, { toolCall });
                this.#currentAgent.emit('agent_tool_end', this.#context, tool, result, {
                    toolCall,
                });
                return;
            }
            else if (typeof approval === 'undefined') {
                // No decision yet: surface an approval request and wait for
                // approve()/reject() to re-enter this method.
                this.emit('tool_approval_requested', this.#context, this.#currentAgent, {
                    type: 'function_approval',
                    tool,
                    approvalItem: new RunToolApprovalItem(toolCall, this.#currentAgent),
                });
                return;
            }
        }
        this.emit('agent_tool_start', this.#context, this.#currentAgent, tool, {
            toolCall,
        });
        this.#currentAgent.emit('agent_tool_start', this.#context, tool, {
            toolCall,
        });
        this.#context.context.history = JSON.parse(JSON.stringify(this.#history)); // deep copy of the history
        const result = await tool.invoke(this.#context, toolCall.arguments, {
            toolCall,
        });
        let stringResult;
        if (isBackgroundResult(result)) {
            // Don't generate a new response, just send the result
            stringResult = toSmartString(result.content);
            this.#transport.sendFunctionCallOutput(toolCall, stringResult, false);
        }
        else {
            stringResult = toSmartString(result);
            this.#transport.sendFunctionCallOutput(toolCall, stringResult, true);
        }
        this.emit('agent_tool_end', this.#context, this.#currentAgent, tool, stringResult, { toolCall });
        this.#currentAgent.emit('agent_tool_end', this.#context, tool, stringResult, { toolCall });
    }
    /**
     * Dispatches a function call from the model to either a handoff or a
     * function tool of the current agent.
     * @throws ModelBehaviorError if the model called a tool the agent does not have.
     */
    async #handleFunctionCall(toolCall) {
        const enabledHandoffs = await this.#currentAgent.getEnabledHandoffs(this.#context);
        const handoffMap = new Map(enabledHandoffs.map((handoff) => [handoff.toolName, handoff]));
        const allTools = await this.#currentAgent.getAllTools(this.#context);
        const functionToolMap = new Map(allTools.map((tool) => [tool.name, tool]));
        const possibleHandoff = handoffMap.get(toolCall.name);
        if (possibleHandoff) {
            await this.#handleHandoff(toolCall, possibleHandoff);
        }
        else {
            const functionTool = functionToolMap.get(toolCall.name);
            if (functionTool && functionTool.type === 'function') {
                await this.#handleFunctionToolCall(toolCall, functionTool);
            }
            else {
                throw new ModelBehaviorError(`Tool ${toolCall.name} not found`);
            }
        }
    }
    /**
     * Runs all output guardrails against `output`. On the first tripwire for a
     * given response it emits `guardrail_tripped`, interrupts the model, and
     * sends the guardrail's feedback message.
     */
    async #runOutputGuardrails(output, responseId, itemId) {
        if (this.#outputGuardrails.length === 0) {
            return;
        }
        const guardrailArgs = {
            agent: this.#currentAgent,
            agentOutput: output,
            context: this.#context,
        };
        const results = await Promise.all(this.#outputGuardrails.map((guardrail) => guardrail.run(guardrailArgs)));
        const firstTripwireTriggered = results.find((result) => result.output.tripwireTriggered);
        if (firstTripwireTriggered) {
            // this ensures that if one guardrail already trips and we are in the middle of another
            // guardrail run, we don't trip again
            if (this.#interruptedByGuardrail[responseId]) {
                return;
            }
            this.#interruptedByGuardrail[responseId] = true;
            const error = new OutputGuardrailTripwireTriggered(`Output guardrail triggered: ${JSON.stringify(firstTripwireTriggered.output.outputInfo)}`, firstTripwireTriggered);
            this.emit('guardrail_tripped', this.#context, this.#currentAgent, error, {
                itemId,
            });
            this.interrupt();
            const feedbackText = getRealtimeGuardrailFeedbackMessage(firstTripwireTriggered);
            this.sendMessage(feedbackText);
            return;
        }
    }
    /**
     * Wires all transport events into session behavior (history maintenance,
     * tool execution, guardrail runs, and re-emitted session events).
     * NOTE(review): this is invoked from connect(); calling connect() more than
     * once on the same session would register duplicate listeners — confirm
     * sessions are intended to be single-connect.
     */
    #setEventListeners() {
        this.#transport.on('*', (event) => {
            this.emit('transport_event', event);
            // Handle completed user transcription events
            if (event.type === 'conversation.item.input_audio_transcription.completed') {
                try {
                    const completedEvent = event;
                    this.#history = updateRealtimeHistory(this.#history, completedEvent, this.#shouldIncludeAudioData);
                    this.#context.context.history = this.#history;
                    this.emit('history_updated', this.#history);
                }
                catch (err) {
                    this.emit('error', {
                        type: 'error',
                        error: err,
                    });
                }
            }
        });
        this.#transport.on('mcp_tools_listed', ({ serverLabel, tools }) => {
            try {
                this.#allMcpToolsByServer.set(serverLabel, tools ?? []);
                this.#updateAvailableMcpTools();
            }
            catch (err) {
                this.emit('error', { type: 'error', error: err });
            }
        });
        this.#transport.on('audio', (event) => {
            // Emit audio_start exactly once per contiguous stretch of audio.
            if (!this.#audioStarted) {
                this.#audioStarted = true;
                this.emit('audio_start', this.#context, this.#currentAgent);
            }
            this.emit('audio', event);
        });
        this.#transport.on('turn_started', () => {
            this.#audioStarted = false;
            this.emit('agent_start', this.#context, this.#currentAgent);
            this.#currentAgent.emit('agent_start', this.#context, this.#currentAgent);
        });
        this.#transport.on('turn_done', (event) => {
            const item = event.response.output[event.response.output.length - 1];
            const textOutput = getLastTextFromAudioOutputMessage(item) ?? '';
            const itemId = item?.id ?? '';
            this.emit('agent_end', this.#context, this.#currentAgent, textOutput);
            this.#currentAgent.emit('agent_end', this.#context, textOutput);
            // Guardrails run asynchronously; route failures through the session's
            // error event instead of leaving an unhandled promise rejection.
            this.#runOutputGuardrails(textOutput, event.response.id, itemId).catch((error) => {
                this.emit('error', {
                    type: 'error',
                    error,
                });
            });
        });
        this.#transport.on('audio_done', () => {
            if (this.#audioStarted) {
                this.#audioStarted = false;
            }
            this.emit('audio_stopped', this.#context, this.#currentAgent);
        });
        // Debounce bookkeeping for incremental guardrail runs over the transcript.
        let lastRunIndex = 0;
        let lastItemId;
        this.#transport.on('audio_transcript_delta', (event) => {
            try {
                const delta = event.delta;
                const itemId = event.itemId;
                const responseId = event.responseId;
                // A new output item resets the debounce counters.
                if (lastItemId !== itemId) {
                    lastItemId = itemId;
                    lastRunIndex = 0;
                }
                const currentText = this.#transcribedTextDeltas[itemId] ?? '';
                const newText = currentText + delta;
                this.#transcribedTextDeltas[itemId] = newText;
                // A non-positive debounce length disables incremental guardrail runs
                // (and guards the division below against a zero divisor); guardrails
                // still run when the turn completes.
                if (this.#outputGuardrailSettings.debounceTextLength <= 0) {
                    return;
                }
                const newRunIndex = Math.floor(newText.length / this.#outputGuardrailSettings.debounceTextLength);
                if (newRunIndex > lastRunIndex) {
                    lastRunIndex = newRunIndex;
                    // We don't cancel existing runs because we want the first one to fail to fail
                    // The transport layer should upon failure handle the interruption and stop the model
                    // from generating further
                    this.#runOutputGuardrails(newText, responseId, itemId).catch((error) => {
                        // The surrounding try/catch only sees synchronous throws; handle
                        // async guardrail failures here.
                        this.emit('error', {
                            type: 'error',
                            error,
                        });
                    });
                }
            }
            catch (err) {
                this.emit('error', {
                    type: 'error',
                    error: err,
                });
            }
        });
        this.#transport.on('item_update', (event) => {
            try {
                const isNew = !this.#history.some((item) => item.itemId === event.itemId);
                this.#history = updateRealtimeHistory(this.#history, event, this.#shouldIncludeAudioData);
                this.#context.context.history = this.#history;
                if (isNew) {
                    const addedItem = this.#history.find((item) => item.itemId === event.itemId);
                    if (addedItem) {
                        this.emit('history_added', addedItem);
                    }
                }
                this.emit('history_updated', this.#history);
            }
            catch (err) {
                this.emit('error', {
                    type: 'error',
                    error: err,
                });
            }
        });
        this.#transport.on('item_deleted', (event) => {
            try {
                this.#history = this.#history.filter((item) => item.itemId !== event.itemId);
                this.#context.context.history = this.#history;
                this.emit('history_updated', this.#history);
            }
            catch (err) {
                this.emit('error', {
                    type: 'error',
                    error: err,
                });
            }
        });
        this.#transport.on('function_call', async (event) => {
            try {
                await this.#handleFunctionCall(event);
            }
            catch (error) {
                logger.error('Error handling function call', error);
                this.emit('error', {
                    type: 'error',
                    error,
                });
            }
        });
        this.#transport.on('usage_update', (usage) => {
            this.#context.usage.add(usage);
        });
        this.#transport.on('audio_interrupted', () => {
            if (this.#audioStarted) {
                this.#audioStarted = false;
            }
            this.emit('audio_interrupted', this.#context, this.#currentAgent);
        });
        this.#transport.on('error', (error) => {
            this.emit('error', error);
        });
        this.#transport.on('mcp_tool_call_completed', (toolCall) => {
            this.emit('mcp_tool_call_completed', this.#context, this.#currentAgent, toolCall);
            if (this.#automaticallyTriggerResponseForMcpToolCalls) {
                this.#transport.sendEvent({
                    type: 'response.create',
                });
            }
        });
        this.#transport.on('mcp_approval_request', (approvalRequest) => {
            this.emit('tool_approval_requested', this.#context, this.#currentAgent, {
                type: 'mcp_approval_request',
                approvalItem: realtimeApprovalItemToApprovalItem(this.#currentAgent, approvalRequest),
            });
        });
    }
    /**
     * Recomputes the currently available MCP tools based on the current agent's active
     * MCP server configurations and the cached per-server tool listings. Emits
     * `mcp_tools_changed` if the set changed.
     */
    #updateAvailableMcpTools() {
        // Collect active MCP server labels and optional allowed filters from the current agent.
        // #currentTools is undefined when the agent defines neither tools nor handoffs;
        // fall back to an empty list so the iteration below cannot throw.
        const activeMcpConfigs = this.#currentTools?.filter((t) => t.type === 'mcp') ?? [];
        // Supports both `allowed_tools: [...]` and `allowed_tools: { tool_names: [...] }`.
        const allowedFromConfig = (cfg) => {
            const allowed = cfg.allowed_tools;
            if (!allowed)
                return undefined;
            if (Array.isArray(allowed))
                return allowed;
            if (allowed && Array.isArray(allowed.tool_names))
                return allowed.tool_names;
            return undefined;
        };
        const dedupByName = new Map();
        for (const cfg of activeMcpConfigs) {
            const tools = this.#allMcpToolsByServer.get(cfg.server_label) ?? [];
            const allowed = allowedFromConfig(cfg);
            for (const tool of tools) {
                if (allowed && !allowed.includes(tool.name))
                    continue;
                if (!dedupByName.has(tool.name)) {
                    dedupByName.set(tool.name, tool);
                }
            }
        }
        const next = Array.from(dedupByName.values());
        const prev = this.#availableMcpTools;
        // Compare by sorted tool names so ordering differences don't count as a change.
        const changed = prev.length !== next.length ||
            JSON.stringify(prev.map((t) => t.name).sort()) !==
                JSON.stringify(next.map((t) => t.name).sort());
        if (changed) {
            this.#availableMcpTools = next;
            this.emit('mcp_tools_changed', this.#availableMcpTools);
        }
    }
    /**
     * Connect to the session. This will establish the connection to the underlying transport layer
     * and start the session.
     *
     * After connecting, the session will also emit a `history_updated` event with an empty history.
     *
     * @param options - The options for the connection.
     */
    async connect(options) {
        // makes sure the current agent is correctly set and loads the tools
        await this.#setCurrentAgent(this.initialAgent);
        this.#setEventListeners();
        await this.#transport.connect({
            apiKey: options.apiKey ?? this.options.apiKey,
            model: this.options.model,
            url: options.url,
            initialSessionConfig: await this.#getSessionConfig(this.options.config),
        });
        // Ensure the cached lastSessionConfig includes everything passed as the initial session config
        // (the call above already set it via #getSessionConfig but in case additional overrides were
        // passed directly here in the future we could merge them). For now it's a no-op.
        this.#history = [];
        this.emit('history_updated', this.#history);
    }
    /**
     * Update the history of the session.
     * @param newHistory - The new history to set, or a function mapping the
     * current history to the new one.
     */
    updateHistory(newHistory) {
        let updatedHistory;
        if (typeof newHistory === 'function') {
            updatedHistory = newHistory(this.#history);
        }
        else {
            updatedHistory = newHistory;
        }
        this.#transport.resetHistory(this.#history, updatedHistory);
    }
    /**
     * Send a message to the session.
     * @param message - The message to send.
     * @param otherEventData - Additional event data to send.
     */
    sendMessage(message, otherEventData = {}) {
        this.#transport.sendMessage(message, otherEventData);
    }
    /**
     * Add image to the session
     * @param image - The image to add.
     * @param options.triggerResponse - Whether the model should respond to the image (default true).
     */
    addImage(image, { triggerResponse = true } = {}) {
        this.#transport.addImage(image, { triggerResponse });
    }
    /**
     * Mute the session.
     * @param muted - Whether to mute the session.
     */
    mute(muted) {
        this.#transport.mute(muted);
    }
    /**
     * Disconnect from the session.
     */
    close() {
        // Clear per-response guardrail state so a future session starts fresh.
        this.#interruptedByGuardrail = {};
        this.#transport.close();
    }
    /**
     * Send audio to the session.
     * @param audio - The audio to send.
     * @param options - Additional options.
     * @param options.commit - Whether to finish the turn with this audio.
     */
    sendAudio(audio, options = {}) {
        this.#transport.sendAudio(audio, options);
    }
    /**
     * Interrupt the session artificially for example if you want to build a "stop talking"
     * button.
     */
    interrupt() {
        this.#transport.interrupt();
    }
    /**
     * Approve a tool call. This will also trigger the tool call to the agent.
     * @param approvalItem - The approval item to approve.
     * @param options - Additional options.
     * @param options.alwaysApprove - Whether to always approve the tool call.
     * @throws ModelBehaviorError if the approved tool is not known to the current agent.
     */
    async approve(approvalItem, options = { alwaysApprove: false }) {
        this.#context.approveTool(approvalItem, options);
        const tool = this.#currentAgent.tools.find((tool) => tool.name === approvalItem.rawItem.name);
        if (tool &&
            tool.type === 'function' &&
            approvalItem.rawItem.type === 'function_call') {
            // Re-run the function tool call; the approval recorded above lets it proceed.
            await this.#handleFunctionToolCall(approvalItem.rawItem, tool);
        }
        else if (approvalItem.rawItem.type === 'hosted_tool_call') {
            if (options.alwaysApprove) {
                logger.warn('Always approving MCP tools is not supported. Use the allowed tools configuration instead.');
            }
            const mcpApprovalRequest = approvalItemToRealtimeApprovalItem(approvalItem);
            this.#transport.sendMcpResponse(mcpApprovalRequest, true);
        }
        else {
            throw new ModelBehaviorError(`Tool ${approvalItem.rawItem.name} not found`);
        }
    }
    /**
     * Reject a tool call. This will also trigger the tool call to the agent.
     * @param approvalItem - The approval item to reject.
     * @param options - Additional options.
     * @param options.alwaysReject - Whether to always reject the tool call.
     * @throws ModelBehaviorError if the rejected tool is not known to the current agent.
     */
    async reject(approvalItem, options = { alwaysReject: false }) {
        this.#context.rejectTool(approvalItem, options);
        // we still need to simulate a tool call to the agent to let the agent know
        const tool = this.#currentAgent.tools.find((tool) => tool.name === approvalItem.rawItem.name);
        if (tool &&
            tool.type === 'function' &&
            approvalItem.rawItem.type === 'function_call') {
            await this.#handleFunctionToolCall(approvalItem.rawItem, tool);
        }
        else if (approvalItem.rawItem.type === 'hosted_tool_call') {
            if (options.alwaysReject) {
                logger.warn('Always rejecting MCP tools is not supported. Use the allowed tools configuration instead.');
            }
            const mcpApprovalRequest = approvalItemToRealtimeApprovalItem(approvalItem);
            this.#transport.sendMcpResponse(mcpApprovalRequest, false);
        }
        else {
            throw new ModelBehaviorError(`Tool ${approvalItem.rawItem.name} not found`);
        }
    }
}
//# sourceMappingURL=realtimeSession.mjs.map