@openai/agents-realtime

The OpenAI Agents SDK is a lightweight yet powerful framework for building multi-agent workflows. This package contains the logic for building realtime voice agents on the server or in the browser.
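Below is the compiled source of openaiRealtimeBase.js from this package (per its sourceMappingURL). For orientation, the typical high-level entry point looks roughly like the following minimal sketch, based on the package's documented quickstart; the agent name, instructions, and API key are illustrative placeholders:

import { RealtimeAgent, RealtimeSession } from '@openai/agents-realtime';

// A voice agent; name and instructions are placeholders.
const agent = new RealtimeAgent({
  name: 'Assistant',
  instructions: 'You are a friendly assistant.',
});

// RealtimeSession wires the agent to a transport layer built on the
// OpenAIRealtimeBase class defined in the file below.
const session = new RealtimeSession(agent);
await session.connect({ apiKey: '<client-api-key>' });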

import { RuntimeEventEmitter, Usage } from '@openai/agents-core';
import { realtimeMessageItemSchema, realtimeToolCallItem } from './items.js';
import logger from './logger.js';
import {
  parseRealtimeEvent,
  responseDoneEventSchema,
} from './openaiRealtimeEvents.js';
import { arrayBufferToBase64, diffRealtimeHistory } from './utils.js';
import { EventEmitterDelegate } from '@openai/agents-core/utils';

/**
 * The default model that is used during the connection if no model is provided.
 */
export const DEFAULT_OPENAI_REALTIME_MODEL = 'gpt-4o-realtime-preview';

/**
 * The default session config that gets sent over during session connection unless overridden
 * by the user.
 */
export const DEFAULT_OPENAI_REALTIME_SESSION_CONFIG = {
  voice: 'ash',
  modalities: ['text', 'audio'],
  inputAudioFormat: 'pcm16',
  outputAudioFormat: 'pcm16',
  inputAudioTranscription: {
    model: 'gpt-4o-mini-transcribe',
  },
  turnDetection: {
    type: 'semantic_vad',
  },
  inputAudioNoiseReduction: null,
  speed: 1,
};

export class OpenAIRealtimeBase extends EventEmitterDelegate {
  #model;
  #apiKey;
  #tracingConfig = null;
  #rawSessionConfig = null;
  eventEmitter = new RuntimeEventEmitter();

  constructor(options = {}) {
    super();
    this.#model = options.model ?? DEFAULT_OPENAI_REALTIME_MODEL;
    this.#apiKey = options.apiKey;
  }

  /**
   * The current model that is being used by the transport layer.
   */
  get currentModel() {
    return this.#model;
  }

  /**
   * The current model that is being used by the transport layer.
   * **Note**: The model cannot be changed mid-conversation.
   */
  set currentModel(model) {
    this.#model = model;
  }

  get _rawSessionConfig() {
    return this.#rawSessionConfig ?? null;
  }

  async _getApiKey(options) {
    const apiKey = options.apiKey ?? this.#apiKey;
    if (typeof apiKey === 'function') {
      return await apiKey();
    }
    return apiKey;
  }

  _onMessage(event) {
    const { data: parsed, isGeneric } = parseRealtimeEvent(event);
    if (parsed === null) {
      return;
    }
    this.emit('*', parsed);
    if (isGeneric) {
      return;
    }
    if (parsed.type === 'error') {
      this.emit('error', { type: 'error', error: parsed });
    } else {
      this.emit(parsed.type, parsed);
    }
    if (parsed.type === 'response.created') {
      this.emit('turn_started', {
        type: 'response_started',
        providerData: {
          ...parsed,
        },
      });
      return;
    }
    if (parsed.type === 'session.updated') {
      this.#rawSessionConfig = parsed.session;
    }
    if (parsed.type === 'response.done') {
      const response = responseDoneEventSchema.safeParse(parsed);
      if (!response.success) {
        logger.error('Error parsing response done event', response.error);
        return;
      }
      const inputTokens = response.data.response.usage?.input_tokens ?? 0;
      const outputTokens = response.data.response.usage?.output_tokens ?? 0;
      const totalTokens = inputTokens + outputTokens;
      const usage = new Usage({
        inputTokens,
        inputTokensDetails:
          response.data.response.usage?.input_tokens_details ?? {},
        outputTokens,
        outputTokensDetails:
          response.data.response.usage?.output_tokens_details ?? {},
        totalTokens,
      });
      this.emit('usage_update', usage);
      this.emit('turn_done', {
        type: 'response_done',
        response: {
          id: response.data.response.id ?? '',
          output: response.data.response.output ?? [],
          usage: {
            inputTokens,
            inputTokensDetails:
              response.data.response.usage?.input_tokens_details ?? {},
            outputTokens,
            outputTokensDetails:
              response.data.response.usage?.output_tokens_details ?? {},
            totalTokens,
          },
        },
      });
      return;
    }
    if (parsed.type === 'response.audio.done') {
      this.emit('audio_done');
      return;
    }
    if (parsed.type === 'conversation.item.deleted') {
      this.emit('item_deleted', {
        itemId: parsed.item_id,
      });
      return;
    }
    if (
      parsed.type === 'conversation.item.input_audio_transcription.completed' ||
      parsed.type === 'conversation.item.truncated'
    ) {
      // right now rather than keeping track of partials and rebuilding the item we
      // will retrieve it instead which triggers the `conversation.item.retrieved` event below
      this.sendEvent({
        type: 'conversation.item.retrieve',
        item_id: parsed.item_id,
      });
      return;
    }
    if (
      parsed.type === 'conversation.item.input_audio_transcription.delta' ||
      parsed.type === 'response.text.delta' ||
      parsed.type === 'response.audio_transcript.delta' ||
      parsed.type === 'response.function_call_arguments.delta'
    ) {
      if (parsed.type === 'response.audio_transcript.delta') {
        this.emit('audio_transcript_delta', {
          type: 'transcript_delta',
          delta: parsed.delta,
          itemId: parsed.item_id,
          responseId: parsed.response_id,
        });
      }
      // no support for partial transcripts yet.
      return;
    }
    if (
      parsed.type === 'conversation.item.created' ||
      parsed.type === 'conversation.item.retrieved'
    ) {
      if (parsed.item.type === 'message') {
        const previousItemId =
          parsed.type === 'conversation.item.created'
            ? parsed.previous_item_id
            : null;
        const item = realtimeMessageItemSchema.parse({
          itemId: parsed.item.id,
          previousItemId,
          type: parsed.item.type,
          role: parsed.item.role,
          content: parsed.item.content,
          status: parsed.item.status,
        });
        this.emit('item_update', item);
        return;
      }
    }
    if (
      parsed.type === 'response.output_item.done' ||
      parsed.type === 'response.output_item.added'
    ) {
      const item = parsed.item;
      if (item.type === 'function_call' && item.status === 'completed') {
        const toolCall = realtimeToolCallItem.parse({
          itemId: item.id,
          type: item.type,
          // we set it to in_progress for the UI as it will only be completed with the output
          status: 'in_progress',
          arguments: item.arguments,
          name: item.name,
          output: null,
        });
        this.emit('item_update', toolCall);
        this.emit('function_call', {
          id: item.id,
          type: 'function_call',
          callId: item.call_id ?? '',
          arguments: item.arguments ?? '',
          name: item.name ?? '',
        });
        return;
      }
      if (item.type === 'message') {
        const realtimeItem = realtimeMessageItemSchema.parse({
          itemId: parsed.item.id,
          type: parsed.item.type,
          role: parsed.item.role,
          content: parsed.item.content,
          status: 'in_progress',
        });
        this.emit('item_update', realtimeItem);
        return;
      }
    }
  }

  _onError(error) {
    this.emit('error', {
      type: 'error',
      error,
    });
  }

  _onOpen() {
    this.emit('connected');
  }

  _onClose() {
    this.emit('disconnected');
  }

  /**
   * Send a message to the Realtime API. This will create a new item in the conversation and
   * trigger a response.
   *
   * @param message - The message to send.
   * @param otherEventData - Additional event data to send.
   */
  sendMessage(message, otherEventData) {
    this.sendEvent({
      type: 'conversation.item.create',
      item:
        typeof message === 'string'
          ? {
              type: 'message',
              role: 'user',
              content: [
                {
                  type: 'input_text',
                  text: message,
                },
              ],
            }
          : message,
      ...otherEventData,
    });
    this.sendEvent({
      type: 'response.create',
    });
  }

  _getMergedSessionConfig(config) {
    const sessionData = {
      instructions: config.instructions,
      model:
        config.model ??
        this.#model ??
        DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.model,
      voice: config.voice ?? DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.voice,
      speed: config.speed ?? DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.speed,
      modalities:
        config.modalities ?? DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.modalities,
      input_audio_format:
        config.inputAudioFormat ??
        DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.inputAudioFormat,
      output_audio_format:
        config.outputAudioFormat ??
        DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.outputAudioFormat,
      input_audio_transcription:
        config.inputAudioTranscription ??
        DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.inputAudioTranscription,
      input_audio_noise_reduction:
        config.inputAudioNoiseReduction ??
        DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.inputAudioNoiseReduction,
      turn_detection:
        OpenAIRealtimeBase.buildTurnDetectionConfig(config.turnDetection) ??
        DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.turnDetection,
      tool_choice:
        config.toolChoice ?? DEFAULT_OPENAI_REALTIME_SESSION_CONFIG.toolChoice,
      tools: config.tools?.map((tool) => ({
        ...tool,
        strict: undefined,
      })),
      // We don't set tracing here to make sure that we don't try to override it on every
      // session.update as it might lead to errors
      ...(config.providerData ?? {}),
    };
    return sessionData;
  }

  static buildTurnDetectionConfig(c) {
    if (typeof c === 'undefined') {
      return undefined;
    }
    const {
      type,
      createResponse,
      create_response,
      eagerness,
      interruptResponse,
      interrupt_response,
      prefixPaddingMs,
      prefix_padding_ms,
      silenceDurationMs,
      silence_duration_ms,
      threshold,
      ...rest
    } = c;
    const config = {
      type,
      create_response: createResponse ? createResponse : create_response,
      eagerness,
      interrupt_response: interruptResponse
        ? interruptResponse
        : interrupt_response,
      prefix_padding_ms: prefixPaddingMs ? prefixPaddingMs : prefix_padding_ms,
      silence_duration_ms: silenceDurationMs
        ? silenceDurationMs
        : silence_duration_ms,
      threshold,
      ...rest,
    };
    // Remove undefined values from the config
    Object.keys(config).forEach((key) => {
      if (config[key] === undefined) delete config[key];
    });
    return Object.keys(config).length > 0 ? config : undefined;
  }

  /**
   * Sets the internal tracing config. This is used to track the tracing config that has been set
   * during the session.create event.
   */
  set _tracingConfig(tracingConfig) {
    this.#tracingConfig = tracingConfig;
  }

  /**
   * Sets the tracing config for the session. This will send the tracing config to the Realtime API.
   *
   * @param tracingConfig - The tracing config to set. We don't support 'auto' here as the SDK will always configure a Workflow Name unless one already exists
   */
  _updateTracingConfig(tracingConfig) {
    if (typeof this.#tracingConfig === 'undefined') {
      // treating it as default value
      this.#tracingConfig = null;
    }
    if (tracingConfig === 'auto') {
      // turn on tracing in auto mode
      this.sendEvent({
        type: 'session.update',
        session: {
          tracing: 'auto',
        },
      });
      return;
    }
    if (
      typeof this.#tracingConfig !== 'string' &&
      typeof tracingConfig !== 'string'
    ) {
      // tracing is already set, we can't change it
      logger.warn(
        'Tracing config is already set, skipping setting it again. This likely happens when you already set a tracing config on session creation.',
      );
      return;
    }
    if (tracingConfig === null) {
      logger.debug(
        'Disabling tracing for this session. It cannot be turned on for this session from this point on.',
      );
      this.sendEvent({
        type: 'session.update',
        session: {
          tracing: null,
        },
      });
      return;
    }
    if (
      this.#tracingConfig === null ||
      typeof this.#tracingConfig === 'string'
    ) {
      // tracing is currently not set so we can set it to the new value
      this.sendEvent({
        type: 'session.update',
        session: {
          tracing: tracingConfig,
        },
      });
      return;
    }
    if (
      tracingConfig?.group_id !== this.#tracingConfig?.group_id ||
      tracingConfig?.metadata !== this.#tracingConfig?.metadata ||
      tracingConfig?.workflow_name !== this.#tracingConfig?.workflow_name
    ) {
      logger.warn(
        'Mismatch in tracing config. Ignoring the new tracing config. This likely happens when you already set a tracing config on session creation. Current tracing config: %s, new tracing config: %s',
        JSON.stringify(this.#tracingConfig),
        JSON.stringify(tracingConfig),
      );
      return;
    }
    this.sendEvent({
      type: 'session.update',
      session: {
        tracing: tracingConfig,
      },
    });
  }

  /**
   * Updates the session config. This will merge it with the current session config with the default
   * values and send it to the Realtime API.
   *
   * @param config - The session config to update.
   */
  updateSessionConfig(config) {
    const sessionData = this._getMergedSessionConfig(config);
    this.sendEvent({
      type: 'session.update',
      session: sessionData,
    });
  }

  /**
   * Send the output of a function call to the Realtime API.
   *
   * @param toolCall - The tool call to send the output for.
   * @param output - The output of the function call.
   * @param startResponse - Whether to start a new response after sending the output.
   */
  sendFunctionCallOutput(toolCall, output, startResponse = true) {
    this.sendEvent({
      type: 'conversation.item.create',
      item: {
        type: 'function_call_output',
        output,
        call_id: toolCall.callId,
      },
    });
    try {
      const item = realtimeToolCallItem.parse({
        itemId: toolCall.id,
        previousItemId: toolCall.previousItemId,
        type: 'function_call',
        status: 'completed',
        arguments: toolCall.arguments,
        name: toolCall.name,
        output,
      });
      this.emit('item_update', item);
    } catch (error) {
      logger.error('Error parsing tool call item', error, toolCall);
    }
    if (startResponse) {
      this.sendEvent({
        type: 'response.create',
      });
    }
  }

  /**
   * Send an audio buffer to the Realtime API. If `{ commit: true }` is passed, the audio buffer
   * will be committed and the model will start processing it. This is necessary if you have
   * disabled turn detection / voice activity detection (VAD).
   *
   * @param audio - The audio buffer to send.
   * @param options - The options for the audio buffer.
   */
  sendAudio(audio, { commit = false } = {}) {
    this.sendEvent({
      type: 'input_audio_buffer.append',
      audio: arrayBufferToBase64(audio),
    });
    if (commit) {
      this.sendEvent({
        type: 'input_audio_buffer.commit',
      });
    }
  }

  /**
   * Reset the history of the conversation. This will create a diff between the old and new history
   * and send the necessary events to the Realtime API to update the history.
   *
   * @param oldHistory - The old history of the conversation.
   * @param newHistory - The new history of the conversation.
   */
  resetHistory(oldHistory, newHistory) {
    const { removals, additions, updates } = diffRealtimeHistory(
      oldHistory,
      newHistory,
    );
    const removalIds = new Set(removals.map((item) => item.itemId));
    // we don't have an update event for items so we will remove and re-add what's there
    for (const update of updates) {
      removalIds.add(update.itemId);
    }
    if (removalIds.size > 0) {
      for (const itemId of removalIds) {
        this.sendEvent({
          type: 'conversation.item.delete',
          item_id: itemId,
        });
      }
    }
    const additionsAndUpdates = [...additions, ...updates];
    for (const addition of additionsAndUpdates) {
      if (addition.type === 'message') {
        const itemEntry = {
          type: 'message',
          role: addition.role,
          content: addition.content,
          id: addition.itemId,
        };
        if (addition.role !== 'system' && addition.status) {
          itemEntry.status = addition.status;
        }
        this.sendEvent({
          type: 'conversation.item.create',
          item: itemEntry,
        });
      } else if (addition.type === 'function_call') {
        logger.warn(
          'Function calls cannot be manually added or updated at the moment. Ignoring.',
        );
      }
    }
  }
}
//# sourceMappingURL=openaiRealtimeBase.js.map
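
For reference, a sketch of driving a concrete transport directly — here the package's OpenAIRealtimeWebSocket, which extends OpenAIRealtimeBase — and subscribing to the events the base class emits above. The handler wiring and the canned tool result are illustrative only:

import { OpenAIRealtimeWebSocket } from '@openai/agents-realtime';

const transport = new OpenAIRealtimeWebSocket();

// Event names come straight from the emit() calls in OpenAIRealtimeBase.
transport.on('connected', () => console.log('session open'));
transport.on('audio_transcript_delta', (event) => process.stdout.write(event.delta));
transport.on('usage_update', (usage) => console.log('total tokens:', usage.totalTokens));
transport.on('function_call', (call) => {
  // Answer a tool call; the trailing `true` triggers a follow-up response.create.
  transport.sendFunctionCallOutput(call, JSON.stringify({ ok: true }), true);
});

await transport.connect({ apiKey: process.env.OPENAI_API_KEY! });
// conversation.item.create followed by response.create, per sendMessage above.
transport.sendMessage('What can you do?');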
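
updateSessionConfig and sendAudio pair up when you manage turns yourself. A sketch under the same assumptions; the audio chunk is a placeholder for PCM16 data from your own capture pipeline:

// camelCase keys are translated to the wire format by _getMergedSessionConfig and
// buildTurnDetectionConfig (e.g. silenceDurationMs -> silence_duration_ms).
transport.updateSessionConfig({
  voice: 'ash',
  speed: 1.1,
  turnDetection: { type: 'server_vad', silenceDurationMs: 500, createResponse: true },
});

// To run without voice activity detection, turn_detection can be nulled via
// providerData, which is spread last in _getMergedSessionConfig and therefore wins:
transport.updateSessionConfig({ providerData: { turn_detection: null } });

// With VAD off, the input buffer must be committed manually, per the sendAudio JSDoc.
declare const micChunk: ArrayBuffer; // placeholder: PCM16 audio you captured
transport.sendAudio(micChunk); // input_audio_buffer.append
transport.sendAudio(micChunk, { commit: true }); // append + input_audio_buffer.commit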