@openai/agents-realtime
The OpenAI Agents SDK is a lightweight yet powerful framework for building multi-agent workflows. This package contains the logic for building realtime voice agents on the server or in the browser.
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAIRealtimeWebSocket = void 0;
const _shims_1 = require("@openai/agents-realtime/_shims");
const openaiRealtimeBase_1 = require("./openaiRealtimeBase.js");
const utils_1 = require("./utils.js");
const agents_core_1 = require("@openai/agents-core");
const openaiRealtimeEvents_1 = require("./openaiRealtimeEvents.js");
/**
* Transport layer that handles the connection between the client and OpenAI's Realtime API
* via WebSockets. While this transport layer is designed to be used within a RealtimeSession, it
* can also be used standalone if you want a direct connection to the Realtime API.
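*
* @example
* // A minimal standalone sketch (not wrapped in a RealtimeSession). The model name and the
* // ephemeral client key ('ek_...') are illustrative placeholders, and `playAudio` is a
* // hypothetical client-side helper.
* const transport = new OpenAIRealtimeWebSocket();
* await transport.connect({ apiKey: 'ek_...', model: 'gpt-realtime' });
* transport.on('audio', (event) => playAudio(event.data));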
*/
class OpenAIRealtimeWebSocket extends openaiRealtimeBase_1.OpenAIRealtimeBase {
#apiKey;
#url;
#state = {
status: 'disconnected',
websocket: undefined,
};
#useInsecureApiKey;
#currentItemId;
#currentAudioContentIndex;
/**
* Timestamp maintained by the transport layer, used to calculate how much time has elapsed
* since the response started so that interruptions are truncated at the right point.
*
* Mostly internal, but it might be used by extended transport layers for their own interruption
* calculation.
*/
_firstAudioTimestamp;
_audioLengthMs = 0;
#ongoingResponse = false;
#createWebSocket;
#skipOpenEventListeners;
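/**
* Creates a new WebSocket transport layer.
*
* @example
* // A sketch of passing a custom WebSocket factory (for example to add proxy or header logic).
* // The subprotocol list mirrors the default browser behavior in simplified form and is
* // illustrative only.
* const transport = new OpenAIRealtimeWebSocket({
*   createWebSocket: async ({ url, apiKey }) =>
*     new WebSocket(url, ['realtime', 'openai-insecure-api-key.' + apiKey]),
* });
*/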
constructor(options = {}) {
super(options);
this.#url = options.url;
this.#useInsecureApiKey = options.useInsecureApiKey ?? false;
this.#createWebSocket = options.createWebSocket;
this.#skipOpenEventListeners = options.skipOpenEventListeners ?? false;
}
getCommonRequestHeaders() {
return utils_1.HEADERS;
}
/**
* The current status of the WebSocket connection.
*/
get status() {
return this.#state.status;
}
/**
* The current connection state of the WebSocket connection.
*/
get connectionState() {
return this.#state;
}
/**
* Always returns `null` as the WebSocket transport layer does not handle muting. Instead,
* muting should be handled by the client by not calling the `sendAudio` method.
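*
* @example
* // A sketch of client-side muting under this convention; `isMuted` and `chunk` are
* // hypothetical client-side values.
* if (!isMuted) {
*   transport.sendAudio(chunk);
* }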
*/
get muted() {
return null;
}
/**
* The current item ID of the ongoing response.
*/
get currentItemId() {
return this.#currentItemId;
}
/**
* Triggers the `audio` event that a client can listen to in order to receive the audio buffer.
* Protected so that an extended transport layer can override it and suppress the event when it
* handles audio internally.
*
* @param audioEvent - The audio event to emit.
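*
* @example
* // A sketch of a subclass that handles audio itself and suppresses the `audio` event;
* // `forwardToPhoneCall` is a hypothetical helper.
* class MyTransport extends OpenAIRealtimeWebSocket {
*   _onAudio(audioEvent) {
*     forwardToPhoneCall(audioEvent.data);
*     // intentionally not calling super._onAudio(audioEvent)
*   }
* }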
*/
_onAudio(audioEvent) {
this.emit('audio', audioEvent);
}
async #setupWebSocket(resolve, reject, sessionConfig) {
if (this.#state.websocket) {
resolve();
return;
}
if (!this.#apiKey) {
throw new agents_core_1.UserError('API key is not set. Please call `connect()` with an API key first.');
}
if ((0, _shims_1.isBrowserEnvironment)() &&
!this.#apiKey.startsWith('ek_') &&
!this.#useInsecureApiKey) {
throw new agents_core_1.UserError('Using the WebSocket connection in a browser environment requires an ephemeral client key. If you have to use a regular API key, set the `useInsecureApiKey` option to true.');
}
let ws = null;
if (this.#createWebSocket) {
ws = await this.#createWebSocket({
url: this.#url,
apiKey: this.#apiKey,
});
}
else {
// Browsers and workerd use the WebSocket `protocols` argument; Node.js uses the `headers` argument
const websocketArguments = _shims_1.useWebSocketProtocols
? [
'realtime',
// Auth
'openai-insecure-api-key.' + this.#apiKey,
// Version header
utils_1.WEBSOCKET_META,
]
: {
headers: {
Authorization: `Bearer ${this.#apiKey}`,
...this.getCommonRequestHeaders(),
},
};
ws = new _shims_1.WebSocket(this.#url, websocketArguments);
}
this.#state = {
status: 'connecting',
websocket: ws,
};
this.emit('connection_change', this.#state.status);
const onSocketOpenReady = () => {
this.#state = {
status: 'connected',
websocket: ws,
};
this.emit('connection_change', this.#state.status);
this._onOpen();
resolve();
};
if (this.#skipOpenEventListeners === true) {
onSocketOpenReady();
}
else {
ws.addEventListener('open', onSocketOpenReady);
}
ws.addEventListener('error', (error) => {
this._onError(error);
this.#state = {
status: 'disconnected',
websocket: undefined,
};
this.emit('connection_change', this.#state.status);
reject(error);
});
ws.addEventListener('message', (message) => {
this._onMessage(message);
const { data: parsed, isGeneric } = (0, openaiRealtimeEvents_1.parseRealtimeEvent)(message);
if (!parsed || isGeneric) {
return;
}
if (parsed.type === 'response.output_audio.delta') {
this.#currentAudioContentIndex = parsed.content_index;
this.#currentItemId = parsed.item_id;
if (this._firstAudioTimestamp === undefined) {
// Record the timestamp of the first audio chunk of this response; it is used to
// calculate the elapsed time when handling an interruption.
this._firstAudioTimestamp = Date.now();
this._audioLengthMs = 0;
}
const buff = (0, utils_1.base64ToArrayBuffer)(parsed.delta);
// calculate the audio length in milliseconds
// GA format: session.audio.output.format supports structured { type: "audio/pcm", rate } or "audio/pcmu" etc.
const fmt = this._rawSessionConfig?.audio?.output?.format;
if (fmt && typeof fmt === 'object') {
// Structured format
const t = fmt.type;
if (t === 'audio/pcmu' || t === 'audio/pcma') {
// 8kHz, 1 byte per sample
this._audioLengthMs += buff.byteLength / 8;
}
else if (t === 'audio/pcm') {
const rate = fmt.rate ?? 24000;
// bytes -> samples (2 bytes per sample) -> ms
this._audioLengthMs += (buff.byteLength / 2 / rate) * 1000;
}
else {
// Fallback assumption similar to legacy
this._audioLengthMs += buff.byteLength / 24 / 2;
}
}
else if (typeof fmt === 'string') {
if (fmt.startsWith('g711_')) {
this._audioLengthMs += buff.byteLength / 8;
}
else {
// Assume 24kHz PCM16
this._audioLengthMs += buff.byteLength / 24 / 2;
}
}
else {
// Default to 24kHz PCM16 behavior if unspecified
this._audioLengthMs += buff.byteLength / 24 / 2;
}
const audioEvent = {
type: 'audio',
data: buff,
responseId: parsed.response_id,
};
this._onAudio(audioEvent);
}
else if (parsed.type === 'input_audio_buffer.speech_started') {
const automaticResponseCancellationEnabled = this._rawSessionConfig?.audio?.input?.turn_detection
?.interrupt_response ?? false;
this.interrupt(!automaticResponseCancellationEnabled);
}
else if (parsed.type === 'response.created') {
this.#ongoingResponse = true;
}
else if (parsed.type === 'response.done') {
this.#ongoingResponse = false;
}
else if (parsed.type === 'session.created') {
this._tracingConfig = parsed.session.tracing;
// Try to turn on tracing now that the session has been created
this._updateTracingConfig(sessionConfig.tracing ?? 'auto');
}
});
ws.addEventListener('close', () => {
this.#state = {
status: 'disconnected',
websocket: undefined,
};
this.emit('connection_change', this.#state.status);
this._onClose();
});
}
async connect(options) {
const model = options.model ?? this.currentModel;
this.currentModel = model;
this.#apiKey = await this._getApiKey(options);
const url = options.url ??
this.#url ??
`wss://api.openai.com/v1/realtime?model=${this.currentModel}`;
this.#url = url;
const sessionConfig = {
...(options.initialSessionConfig || {}),
model: this.currentModel,
};
await new Promise((resolve, reject) => {
this.#setupWebSocket(resolve, reject, sessionConfig).catch(reject);
});
await this.updateSessionConfig(sessionConfig);
}
/**
* Send an event to the Realtime API. This will stringify the event and send it directly to the
* API. This can be used if you want to take control over the connection and send events manually.
*
* @param event - The event to send.
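*
* @example
* // A sketch of sending a raw Realtime API event manually; `response.create` is a standard
* // event type, but consult the Realtime API reference for the full event schema.
* transport.sendEvent({ type: 'response.create' });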
*/
sendEvent(event) {
if (!this.#state.websocket) {
throw new Error('WebSocket is not connected. Make sure you call `connect()` before sending events.');
}
this.#state.websocket.send(JSON.stringify(event));
}
/**
* Close the WebSocket connection.
*
* This will also reset any internal connection tracking used for interruption handling.
*/
close() {
this.#state.websocket?.close();
this.#currentItemId = undefined;
this._firstAudioTimestamp = undefined;
this._audioLengthMs = 0;
this.#currentAudioContentIndex = undefined;
}
/**
* Will throw an error as the WebSocket transport layer does not support muting.
*/
mute(_muted) {
throw new Error('Mute is not supported for the WebSocket transport. You have to mute the audio input yourself.');
}
/**
* Send an audio buffer to the Realtime API. Use this to stream the client's audio input to the
* model so it can respond.
*
* @param audio - The audio buffer to send.
* @param options - The options for the audio buffer.
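*
* @example
* // A sketch of streaming microphone audio; `pcmChunk` is a hypothetical ArrayBuffer of audio
* // encoded in the session's configured input format.
* transport.sendAudio(pcmChunk);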
*/
sendAudio(audio, options = {}) {
if (this.#state.status === 'connected') {
super.sendAudio(audio, options);
}
}
/**
* Send a cancel response event to the Realtime API. This is used to cancel an ongoing
* response that the model is currently generating.
*/
_cancelResponse() {
// cancel the ongoing response
if (this.#ongoingResponse) {
this.sendEvent({
type: 'response.cancel',
});
this.#ongoingResponse = false;
}
}
/**
* Do NOT call this method directly. Call `interrupt()` instead for proper interruption handling.
*
* This method sends the right events to the API to inform the model that the user has
* interrupted the response. It might be overridden or extended by a custom transport layer. See
* the `TwilioRealtimeTransportLayer` for an example.
*
* @param elapsedTime - The elapsed time since the response started.
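* @param cancelOngoingResponse - Whether to also cancel the ongoing response via `response.cancel`.
*
* @example
* // A sketch of a subclass that clears a downstream playback buffer before truncating;
* // `clearPlaybackBuffer` is a hypothetical helper.
* class MyTransport extends OpenAIRealtimeWebSocket {
*   _interrupt(elapsedTime, cancelOngoingResponse = true) {
*     clearPlaybackBuffer();
*     super._interrupt(elapsedTime, cancelOngoingResponse);
*   }
* }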
*/
_interrupt(elapsedTime, cancelOngoingResponse = true) {
if (elapsedTime < 0) {
return;
}
// immediately emit this event so the client can stop playing audio
if (cancelOngoingResponse) {
this._cancelResponse();
}
const length = this._audioLengthMs ?? Number.POSITIVE_INFINITY;
// audio_end_ms must be an integer
const audio_end_ms = Math.max(0, Math.floor(Math.min(elapsedTime, length)));
this.emit('audio_interrupted');
this.sendEvent({
type: 'conversation.item.truncate',
item_id: this.#currentItemId,
content_index: this.#currentAudioContentIndex,
audio_end_ms,
});
}
/**
* Interrupt the ongoing response. This method is triggered automatically by the client when
* voice activity detection (VAD) is enabled (the default) as well as when an output guardrail is
* triggered.
*
* You can also call this method directly if you want to interrupt the conversation, for example
* based on an event in the client.
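*
* @example
* // A sketch of interrupting based on a client-side event; `stopButton` is a hypothetical
* // DOM element.
* stopButton.addEventListener('click', () => transport.interrupt());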
*/
interrupt(cancelOngoingResponse = true) {
if (!this.#currentItemId || typeof this._firstAudioTimestamp !== 'number') {
return;
}
const elapsedTime = Date.now() - this._firstAudioTimestamp;
if (elapsedTime >= 0) {
this._interrupt(elapsedTime, cancelOngoingResponse);
}
this.#currentItemId = undefined;
this._firstAudioTimestamp = undefined;
this._audioLengthMs = 0;
this.#currentAudioContentIndex = undefined;
}
}
exports.OpenAIRealtimeWebSocket = OpenAIRealtimeWebSocket;
//# sourceMappingURL=openaiRealtimeWebsocket.js.map