// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
import { APIResource } from '../../../resource';
import * as RealtimeAPI from './realtime';
import * as Shared from '../../shared';
import * as SessionsAPI from './sessions';
import {
Session as SessionsAPISession,
SessionCreateParams,
SessionCreateResponse,
Sessions,
} from './sessions';
import * as TranscriptionSessionsAPI from './transcription-sessions';
import {
TranscriptionSession,
TranscriptionSessionCreateParams,
TranscriptionSessions,
} from './transcription-sessions';
export class Realtime extends APIResource {
sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client);
transcriptionSessions: TranscriptionSessionsAPI.TranscriptionSessions =
new TranscriptionSessionsAPI.TranscriptionSessions(this._client);
}
/**
* Returned when a conversation is created. Emitted right after session creation.
*/
export interface ConversationCreatedEvent {
/**
* The conversation resource.
*/
conversation: ConversationCreatedEvent.Conversation;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The event type, must be `conversation.created`.
*/
type: 'conversation.created';
}
export namespace ConversationCreatedEvent {
/**
* The conversation resource.
*/
export interface Conversation {
/**
* The unique ID of the conversation.
*/
id?: string;
/**
* The object type, must be `realtime.conversation`.
*/
object?: 'realtime.conversation';
}
}
/**
* The item to add to the conversation.
*/
export interface ConversationItem {
/**
* The unique ID of the item. This can be generated by the client to help manage
* server-side context, but is not required because the server will generate one
* if not provided.
*/
id?: string;
/**
* The arguments of the function call (for `function_call` items).
*/
arguments?: string;
/**
* The ID of the function call (for `function_call` and `function_call_output`
* items). If passed on a `function_call_output` item, the server will check that a
* `function_call` item with the same ID exists in the conversation history.
*/
call_id?: string;
/**
* The content of the message, applicable for `message` items.
*
* - Message items of role `system` support only `input_text` content
* - Message items of role `user` support `input_text` and `input_audio` content
* - Message items of role `assistant` support `text` content.
*/
content?: Array<ConversationItemContent>;
/**
* The name of the function being called (for `function_call` items).
*/
name?: string;
/**
* Identifier for the API object being returned - always `realtime.item`.
*/
object?: 'realtime.item';
/**
* The output of the function call (for `function_call_output` items).
*/
output?: string;
/**
* The role of the message sender (`user`, `assistant`, `system`), only applicable
* for `message` items.
*/
role?: 'user' | 'assistant' | 'system';
/**
* The status of the item (`completed`, `incomplete`). These have no effect on the
* conversation, but are accepted for consistency with the
* `conversation.item.created` event.
*/
status?: 'completed' | 'incomplete';
/**
* The type of the item (`message`, `function_call`, `function_call_output`).
*/
type?: 'message' | 'function_call' | 'function_call_output';
}
export interface ConversationItemContent {
/**
* ID of a previous conversation item to reference (for `item_reference` content
* types in `response.create` events). These can reference both client and server
* created items.
*/
id?: string;
/**
* Base64-encoded audio bytes, used for `input_audio` content type.
*/
audio?: string;
/**
* The text content, used for `input_text` and `text` content types.
*/
text?: string;
/**
* The transcript of the audio, used for `input_audio` content type.
*/
transcript?: string;
/**
* The content type (`input_text`, `input_audio`, `item_reference`, `text`).
*/
type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
}
/**
* Add a new Item to the Conversation's context, including messages, function
* calls, and function call responses. This event can be used both to populate a
* "history" of the conversation and to add new items mid-stream, but has the
* current limitation that it cannot populate assistant audio messages.
*
* If successful, the server will respond with a `conversation.item.created` event,
* otherwise an `error` event will be sent.
*/
export interface ConversationItemCreateEvent {
/**
* The item to add to the conversation.
*/
item: ConversationItem;
/**
* The event type, must be `conversation.item.create`.
*/
type: 'conversation.item.create';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
/**
* The ID of the preceding item after which the new item will be inserted. If not
* set, the new item will be appended to the end of the conversation. If set to
* `root`, the new item will be added to the beginning of the conversation. If set
* to an existing ID, it allows an item to be inserted mid-conversation. If the ID
* cannot be found, an error will be returned and the item will not be added.
*/
previous_item_id?: string;
}
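// Illustrative only (not part of the generated spec): a minimal
// `conversation.item.create` client event that appends a user text message to
// the conversation. The event_id and message text are hypothetical values.
const exampleItemCreate: ConversationItemCreateEvent = {
  type: 'conversation.item.create',
  event_id: 'evt_client_0001',
  item: {
    type: 'message',
    role: 'user',
    content: [{ type: 'input_text', text: 'What is the weather like today?' }],
  },
};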
/**
* Returned when a conversation item is created. There are several scenarios that
* produce this event:
*
* - The server is generating a Response, which if successful will produce either
* one or two Items, which will be of type `message` (role `assistant`) or type
* `function_call`.
* - The input audio buffer has been committed, either by the client or the server
* (in `server_vad` mode). The server will take the content of the input audio
* buffer and add it to a new user message Item.
* - The client has sent a `conversation.item.create` event to add a new Item to
* the Conversation.
*/
export interface ConversationItemCreatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The item to add to the conversation.
*/
item: ConversationItem;
/**
* The ID of the preceding item in the Conversation context, allows the client to
* understand the order of the conversation.
*/
previous_item_id: string;
/**
* The event type, must be `conversation.item.created`.
*/
type: 'conversation.item.created';
}
/**
* Send this event when you want to remove any item from the conversation history.
* The server will respond with a `conversation.item.deleted` event, unless the
* item does not exist in the conversation history, in which case the server will
* respond with an error.
*/
export interface ConversationItemDeleteEvent {
/**
* The ID of the item to delete.
*/
item_id: string;
/**
* The event type, must be `conversation.item.delete`.
*/
type: 'conversation.item.delete';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
/**
* Returned when an item in the conversation is deleted by the client with a
* `conversation.item.delete` event. This event is used to synchronize the server's
* understanding of the conversation history with the client's view.
*/
export interface ConversationItemDeletedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item that was deleted.
*/
item_id: string;
/**
* The event type, must be `conversation.item.deleted`.
*/
type: 'conversation.item.deleted';
}
/**
* This event is the output of audio transcription for user audio written to the
* user audio buffer. Transcription begins when the input audio buffer is committed
* by the client or server (in `server_vad` mode). Transcription runs
* asynchronously with Response creation, so this event may come before or after
* the Response events.
*
* Realtime API models accept audio natively, and thus input transcription is a
* separate process run on a separate ASR (Automatic Speech Recognition) model,
* currently always `whisper-1`. Thus the transcript may diverge somewhat from the
* model's interpretation, and should be treated as a rough guide.
*/
export interface ConversationItemInputAudioTranscriptionCompletedEvent {
/**
* The index of the content part containing the audio.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item containing the audio.
*/
item_id: string;
/**
* The transcribed text.
*/
transcript: string;
/**
* The event type, must be `conversation.item.input_audio_transcription.completed`.
*/
type: 'conversation.item.input_audio_transcription.completed';
/**
* The log probabilities of the transcription.
*/
logprobs?: Array<ConversationItemInputAudioTranscriptionCompletedEvent.Logprob> | null;
}
export namespace ConversationItemInputAudioTranscriptionCompletedEvent {
/**
* A log probability object.
*/
export interface Logprob {
/**
* The token that was used to generate the log probability.
*/
token: string;
/**
* The bytes that were used to generate the log probability.
*/
bytes: Array<number>;
/**
* The log probability of the token.
*/
logprob: number;
}
}
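// Illustrative only: a small helper that extracts the final user transcript from a
// `conversation.item.input_audio_transcription.completed` server event. The logging
// side effect is an application-level assumption, not something the spec requires.
function logUserTranscript(event: ConversationItemInputAudioTranscriptionCompletedEvent): void {
  console.log(`item ${event.item_id} transcript: ${event.transcript}`);
}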
/**
* Returned when the text value of an input audio transcription content part is
* updated.
*/
export interface ConversationItemInputAudioTranscriptionDeltaEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The event type, must be `conversation.item.input_audio_transcription.delta`.
*/
type: 'conversation.item.input_audio_transcription.delta';
/**
* The index of the content part in the item's content array.
*/
content_index?: number;
/**
* The text delta.
*/
delta?: string;
/**
* The log probabilities of the transcription.
*/
logprobs?: Array<ConversationItemInputAudioTranscriptionDeltaEvent.Logprob> | null;
}
export namespace ConversationItemInputAudioTranscriptionDeltaEvent {
/**
* A log probability object.
*/
export interface Logprob {
/**
* The token that was used to generate the log probability.
*/
token: string;
/**
* The bytes that were used to generate the log probability.
*/
bytes: Array<number>;
/**
* The log probability of the token.
*/
logprob: number;
}
}
/**
* Returned when input audio transcription is configured, and a transcription
* request for a user message failed. These events are separate from other `error`
* events so that the client can identify the related Item.
*/
export interface ConversationItemInputAudioTranscriptionFailedEvent {
/**
* The index of the content part containing the audio.
*/
content_index: number;
/**
* Details of the transcription error.
*/
error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item.
*/
item_id: string;
/**
* The event type, must be `conversation.item.input_audio_transcription.failed`.
*/
type: 'conversation.item.input_audio_transcription.failed';
}
export namespace ConversationItemInputAudioTranscriptionFailedEvent {
/**
* Details of the transcription error.
*/
export interface Error {
/**
* Error code, if any.
*/
code?: string;
/**
* A human-readable error message.
*/
message?: string;
/**
* Parameter related to the error, if any.
*/
param?: string;
/**
* The type of error.
*/
type?: string;
}
}
/**
* Send this event when you want to retrieve the server's representation of a
* specific item in the conversation history. This is useful, for example, to
* inspect user audio after noise cancellation and VAD. The server will respond
* with a `conversation.item.retrieved` event, unless the item does not exist in
* the conversation history, in which case the server will respond with an error.
*/
export interface ConversationItemRetrieveEvent {
/**
* The ID of the item to retrieve.
*/
item_id: string;
/**
* The event type, must be `conversation.item.retrieve`.
*/
type: 'conversation.item.retrieve';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
/**
* Send this event to truncate a previous assistant message’s audio. The server
* will produce audio faster than realtime, so this event is useful when the user
* interrupts to truncate audio that has already been sent to the client but not
* yet played. This will synchronize the server's understanding of the audio with
* the client's playback.
*
* Truncating audio will delete the server-side text transcript to ensure there is
* no text in the context that hasn't been heard by the user.
*
* If successful, the server will respond with a `conversation.item.truncated`
* event.
*/
export interface ConversationItemTruncateEvent {
/**
* Inclusive duration up to which audio is truncated, in milliseconds. If the
* audio_end_ms is greater than the actual audio duration, the server will respond
* with an error.
*/
audio_end_ms: number;
/**
* The index of the content part to truncate. Set this to 0.
*/
content_index: number;
/**
* The ID of the assistant message item to truncate. Only assistant message items
* can be truncated.
*/
item_id: string;
/**
* The event type, must be `conversation.item.truncate`.
*/
type: 'conversation.item.truncate';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
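// Illustrative only: the truncate event a client might send when the user interrupts
// playback. The item ID and elapsed milliseconds are hypothetical; a real client
// would compute audio_end_ms from its own playback position.
const exampleTruncate: ConversationItemTruncateEvent = {
  type: 'conversation.item.truncate',
  item_id: 'item_assistant_0001',
  content_index: 0,
  audio_end_ms: 1500,
};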
/**
* Returned when an earlier assistant audio message item is truncated by the client
* with a `conversation.item.truncate` event. This event is used to synchronize the
* server's understanding of the audio with the client's playback.
*
* This action will truncate the audio and remove the server-side text transcript
* to ensure there is no text in the context that hasn't been heard by the user.
*/
export interface ConversationItemTruncatedEvent {
/**
* The duration up to which the audio was truncated, in milliseconds.
*/
audio_end_ms: number;
/**
* The index of the content part that was truncated.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the assistant message item that was truncated.
*/
item_id: string;
/**
* The event type, must be `conversation.item.truncated`.
*/
type: 'conversation.item.truncated';
}
/**
* The item to add to the conversation.
*/
export interface ConversationItemWithReference {
/**
* For an item of type (`message` | `function_call` | `function_call_output`) this
* field allows the client to assign the unique ID of the item. It is not required
* because the server will generate one if not provided.
*
* For an item of type `item_reference`, this field is required and is a reference
* to any item that has previously existed in the conversation.
*/
id?: string;
/**
* The arguments of the function call (for `function_call` items).
*/
arguments?: string;
/**
* The ID of the function call (for `function_call` and `function_call_output`
* items). If passed on a `function_call_output` item, the server will check that a
* `function_call` item with the same ID exists in the conversation history.
*/
call_id?: string;
/**
* The content of the message, applicable for `message` items.
*
* - Message items of role `system` support only `input_text` content
* - Message items of role `user` support `input_text` and `input_audio` content
* - Message items of role `assistant` support `text` content.
*/
content?: Array<ConversationItemContent>;
/**
* The name of the function being called (for `function_call` items).
*/
name?: string;
/**
* Identifier for the API object being returned - always `realtime.item`.
*/
object?: 'realtime.item';
/**
* The output of the function call (for `function_call_output` items).
*/
output?: string;
/**
* The role of the message sender (`user`, `assistant`, `system`), only applicable
* for `message` items.
*/
role?: 'user' | 'assistant' | 'system';
/**
* The status of the item (`completed`, `incomplete`). These have no effect on the
* conversation, but are accepted for consistency with the
* `conversation.item.created` event.
*/
status?: 'completed' | 'incomplete';
/**
* The type of the item (`message`, `function_call`, `function_call_output`,
* `item_reference`).
*/
type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference';
}
/**
* Returned when an error occurs, which could be a client problem or a server
* problem. Most errors are recoverable and the session will stay open, so we
* recommend that implementers monitor and log error messages by default.
*/
export interface ErrorEvent {
/**
* Details of the error.
*/
error: ErrorEvent.Error;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The event type, must be `error`.
*/
type: 'error';
}
export namespace ErrorEvent {
/**
* Details of the error.
*/
export interface Error {
/**
* A human-readable error message.
*/
message: string;
/**
* The type of error (e.g., "invalid_request_error", "server_error").
*/
type: string;
/**
* Error code, if any.
*/
code?: string | null;
/**
* The event_id of the client event that caused the error, if applicable.
*/
event_id?: string | null;
/**
* Parameter related to the error, if any.
*/
param?: string | null;
}
}
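// Illustrative only: formatting an `error` server event for logging, correlating it
// back to the client event that caused it when `error.event_id` is populated.
function describeError(event: ErrorEvent): string {
  const { type, message, code, event_id } = event.error;
  const cause = event_id ? ` (caused by client event ${event_id})` : '';
  return `${type}${code ? ` [${code}]` : ''}: ${message}${cause}`;
}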
/**
* Send this event to append audio bytes to the input audio buffer. The audio
* buffer is temporary storage you can write to and later commit. In Server VAD
* mode, the audio buffer is used to detect speech and the server will decide when
* to commit. When Server VAD is disabled, you must commit the audio buffer
* manually.
*
* The client may choose how much audio to place in each event, up to a maximum of
* 15 MiB; for example, streaming smaller chunks from the client may allow the VAD
* to be more responsive. Unlike most other client events, the server will not send
* a confirmation response to this event.
*/
export interface InputAudioBufferAppendEvent {
/**
* Base64-encoded audio bytes. This must be in the format specified by the
* `input_audio_format` field in the session configuration.
*/
audio: string;
/**
* The event type, must be `input_audio_buffer.append`.
*/
type: 'input_audio_buffer.append';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
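// A minimal sketch of building an `input_audio_buffer.append` event from a raw PCM16
// chunk in Node.js. Using Buffer for base64 encoding is an assumption about the
// runtime; the audio must already match the session's `input_audio_format`.
function makeAppendEvent(pcm16Chunk: Uint8Array): InputAudioBufferAppendEvent {
  return {
    type: 'input_audio_buffer.append',
    audio: Buffer.from(pcm16Chunk).toString('base64'),
  };
}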
/**
* Send this event to clear the audio bytes in the buffer. The server will respond
* with an `input_audio_buffer.cleared` event.
*/
export interface InputAudioBufferClearEvent {
/**
* The event type, must be `input_audio_buffer.clear`.
*/
type: 'input_audio_buffer.clear';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
/**
* Returned when the input audio buffer is cleared by the client with an
* `input_audio_buffer.clear` event.
*/
export interface InputAudioBufferClearedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The event type, must be `input_audio_buffer.cleared`.
*/
type: 'input_audio_buffer.cleared';
}
/**
* Send this event to commit the user input audio buffer, which will create a new
* user message item in the conversation. This event will produce an error if the
* input audio buffer is empty. When in Server VAD mode, the client does not need
* to send this event, the server will commit the audio buffer automatically.
*
* Committing the input audio buffer will trigger input audio transcription (if
* enabled in session configuration), but it will not create a response from the
* model. The server will respond with an `input_audio_buffer.committed` event.
*/
export interface InputAudioBufferCommitEvent {
/**
* The event type, must be `input_audio_buffer.commit`.
*/
type: 'input_audio_buffer.commit';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
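// Illustrative only: when server VAD is disabled, the client commits the buffer
// itself and may then request a response. `send` stands in for whatever transport
// (WebSocket or WebRTC data channel) the application uses; it is not provided here.
function commitAndRequestResponse(send: (event: RealtimeClientEvent) => void): void {
  send({ type: 'input_audio_buffer.commit' });
  send({ type: 'response.create' });
}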
/**
* Returned when an input audio buffer is committed, either by the client or
* automatically in server VAD mode. The `item_id` property is the ID of the user
* message item that will be created; thus, a `conversation.item.created` event will
* also be sent to the client.
*/
export interface InputAudioBufferCommittedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item that will be created.
*/
item_id: string;
/**
* The ID of the preceding item after which the new item will be inserted.
*/
previous_item_id: string;
/**
* The event type, must be `input_audio_buffer.committed`.
*/
type: 'input_audio_buffer.committed';
}
/**
* Sent by the server when in `server_vad` mode to indicate that speech has been
* detected in the audio buffer. This can happen any time audio is added to the
* buffer (unless speech is already detected). The client may want to use this
* event to interrupt audio playback or provide visual feedback to the user.
*
* The client should expect to receive an `input_audio_buffer.speech_stopped` event
* when speech stops. The `item_id` property is the ID of the user message item
* that will be created when speech stops and will also be included in the
* `input_audio_buffer.speech_stopped` event (unless the client manually commits
* the audio buffer during VAD activation).
*/
export interface InputAudioBufferSpeechStartedEvent {
/**
* Milliseconds from the start of all audio written to the buffer during the
* session when speech was first detected. This will correspond to the beginning of
* audio sent to the model, and thus includes the `prefix_padding_ms` configured in
* the Session.
*/
audio_start_ms: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item that will be created when speech stops.
*/
item_id: string;
/**
* The event type, must be `input_audio_buffer.speech_started`.
*/
type: 'input_audio_buffer.speech_started';
}
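// Illustrative only: in `server_vad` mode a client might stop local playback and
// cancel the in-progress response as soon as speech is detected. `send` and
// `stopLocalPlayback` are hypothetical application callbacks.
function onSpeechStarted(
  _event: InputAudioBufferSpeechStartedEvent,
  send: (event: RealtimeClientEvent) => void,
  stopLocalPlayback: () => void,
): void {
  stopLocalPlayback();
  send({ type: 'response.cancel' });
}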
/**
* Returned in `server_vad` mode when the server detects the end of speech in the
* audio buffer. The server will also send a `conversation.item.created` event
* with the user message item that is created from the audio buffer.
*/
export interface InputAudioBufferSpeechStoppedEvent {
/**
* Milliseconds since the session started when speech stopped. This will correspond
* to the end of audio sent to the model, and thus includes the
* `min_silence_duration_ms` configured in the Session.
*/
audio_end_ms: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item that will be created.
*/
item_id: string;
/**
* The event type, must be `input_audio_buffer.speech_stopped`.
*/
type: 'input_audio_buffer.speech_stopped';
}
/**
* Emitted at the beginning of a Response to indicate the updated rate limits. When
* a Response is created, some tokens will be "reserved" for the output tokens; the
* rate limits shown here reflect that reservation, which is then adjusted
* accordingly once the Response is completed.
*/
export interface RateLimitsUpdatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* List of rate limit information.
*/
rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;
/**
* The event type, must be `rate_limits.updated`.
*/
type: 'rate_limits.updated';
}
export namespace RateLimitsUpdatedEvent {
export interface RateLimit {
/**
* The maximum allowed value for the rate limit.
*/
limit?: number;
/**
* The name of the rate limit (`requests`, `tokens`).
*/
name?: 'requests' | 'tokens';
/**
* The remaining value before the limit is reached.
*/
remaining?: number;
/**
* Seconds until the rate limit resets.
*/
reset_seconds?: number;
}
}
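// Illustrative only: checking whether the token budget is nearly exhausted after a
// `rate_limits.updated` event. The threshold of 500 tokens is an arbitrary example.
function isTokenBudgetLow(event: RateLimitsUpdatedEvent, threshold = 500): boolean {
  const tokens = event.rate_limits.find((limit) => limit.name === 'tokens');
  return tokens?.remaining !== undefined && tokens.remaining < threshold;
}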
/**
* A realtime client event.
*/
export type RealtimeClientEvent =
| ConversationItemCreateEvent
| ConversationItemDeleteEvent
| ConversationItemRetrieveEvent
| ConversationItemTruncateEvent
| InputAudioBufferAppendEvent
| InputAudioBufferClearEvent
| RealtimeClientEvent.OutputAudioBufferClear
| InputAudioBufferCommitEvent
| ResponseCancelEvent
| ResponseCreateEvent
| SessionUpdateEvent
| TranscriptionSessionUpdate;
export namespace RealtimeClientEvent {
/**
* **WebRTC Only:** Emit to cut off the current audio response. This will trigger
* the server to stop generating audio and emit a `output_audio_buffer.cleared`
* event. This event should be preceded by a `response.cancel` client event to stop
* the generation of the current response.
* [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
*/
export interface OutputAudioBufferClear {
/**
* The event type, must be `output_audio_buffer.clear`.
*/
type: 'output_audio_buffer.clear';
/**
* The unique ID of the client event used for error handling.
*/
event_id?: string;
}
}
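// A hedged sketch: serializing any client event for transport. The socket is assumed
// to already be connected to a Realtime API endpoint; this module does not manage
// connections itself.
function sendClientEvent(socket: { send(data: string): void }, event: RealtimeClientEvent): void {
  socket.send(JSON.stringify(event));
}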
/**
* The response resource.
*/
export interface RealtimeResponse {
/**
* The unique ID of the response.
*/
id?: string;
/**
* Which conversation the response is added to, determined by the `conversation`
* field in the `response.create` event. If `auto`, the response will be added to
* the default conversation and the value of `conversation_id` will be an id like
* `conv_1234`. If `none`, the response will not be added to any conversation and
* the value of `conversation_id` will be `null`. If responses are being triggered
* by server VAD, the response will be added to the default conversation, thus the
* `conversation_id` will be an id like `conv_1234`.
*/
conversation_id?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls, that was used in this response.
*/
max_output_tokens?: number | 'inf';
/**
* Set of 16 key-value pairs that can be attached to an object. This can be useful
* for storing additional information about the object in a structured format, and
* querying for objects via API or the dashboard.
*
* Keys are strings with a maximum length of 64 characters. Values are strings with
* a maximum length of 512 characters.
*/
metadata?: Shared.Metadata | null;
/**
* The set of modalities the model used to respond. If there are multiple
* modalities, the model will pick one; for example, if `modalities` is
* `["text", "audio"]`, the model could be responding in either text or audio.
*/
modalities?: Array<'text' | 'audio'>;
/**
* The object type, must be `realtime.response`.
*/
object?: 'realtime.response';
/**
* The list of output items generated by the response.
*/
output?: Array<ConversationItem>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* The final status of the response (`completed`, `cancelled`, `failed`, or
* `incomplete`).
*/
status?: 'completed' | 'cancelled' | 'failed' | 'incomplete';
/**
* Additional details about the status.
*/
status_details?: RealtimeResponseStatus;
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* Usage statistics for the Response; this will correspond to billing. A Realtime
* API session will maintain a conversation context and append new Items to the
* Conversation, thus output from previous turns (text and audio tokens) will
* become the input for later turns.
*/
usage?: RealtimeResponseUsage;
/**
* The voice the model used to respond. Current voice options are `alloy`, `ash`,
* `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and
* `verse`.
*/
voice?:
| (string & {})
| 'alloy'
| 'ash'
| 'ballad'
| 'coral'
| 'echo'
| 'fable'
| 'onyx'
| 'nova'
| 'sage'
| 'shimmer'
| 'verse';
}
/**
* Additional details about the status.
*/
export interface RealtimeResponseStatus {
/**
* A description of the error that caused the response to fail, populated when the
* `status` is `failed`.
*/
error?: RealtimeResponseStatus.Error;
/**
* The reason the Response did not complete. For a `cancelled` Response, one of
* `turn_detected` (the server VAD detected a new start of speech) or
* `client_cancelled` (the client sent a cancel event). For an `incomplete`
* Response, one of `max_output_tokens` or `content_filter` (the server-side safety
* filter activated and cut off the response).
*/
reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';
/**
* The type of error that caused the response to fail, corresponding with the
* `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
*/
type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';
}
export namespace RealtimeResponseStatus {
/**
* A description of the error that caused the response to fail, populated when the
* `status` is `failed`.
*/
export interface Error {
/**
* Error code, if any.
*/
code?: string;
/**
* The type of error.
*/
type?: string;
}
}
/**
* Usage statistics for the Response; this will correspond to billing. A Realtime
* API session will maintain a conversation context and append new Items to the
* Conversation, thus output from previous turns (text and audio tokens) will
* become the input for later turns.
*/
export interface RealtimeResponseUsage {
/**
* Details about the input tokens used in the Response.
*/
input_token_details?: RealtimeResponseUsage.InputTokenDetails;
/**
* The number of input tokens used in the Response, including text and audio
* tokens.
*/
input_tokens?: number;
/**
* Details about the output tokens used in the Response.
*/
output_token_details?: RealtimeResponseUsage.OutputTokenDetails;
/**
* The number of output tokens sent in the Response, including text and audio
* tokens.
*/
output_tokens?: number;
/**
* The total number of tokens in the Response including input and output text and
* audio tokens.
*/
total_tokens?: number;
}
export namespace RealtimeResponseUsage {
/**
* Details about the input tokens used in the Response.
*/
export interface InputTokenDetails {
/**
* The number of audio tokens used in the Response.
*/
audio_tokens?: number;
/**
* The number of cached tokens used in the Response.
*/
cached_tokens?: number;
/**
* The number of text tokens used in the Response.
*/
text_tokens?: number;
}
/**
* Details about the output tokens used in the Response.
*/
export interface OutputTokenDetails {
/**
* The number of audio tokens used in the Response.
*/
audio_tokens?: number;
/**
* The number of text tokens used in the Response.
*/
text_tokens?: number;
}
}
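// Illustrative only: summarizing token usage from a completed Response for logging or
// billing reconciliation. All usage fields are optional, so missing values fall back
// to zero here.
function summarizeUsage(usage: RealtimeResponseUsage): string {
  const input = usage.input_tokens ?? 0;
  const output = usage.output_tokens ?? 0;
  return `input=${input} output=${output} total=${usage.total_tokens ?? input + output}`;
}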
/**
* A realtime server event.
*/
export type RealtimeServerEvent =
| ConversationCreatedEvent
| ConversationItemCreatedEvent
| ConversationItemDeletedEvent
| ConversationItemInputAudioTranscriptionCompletedEvent
| ConversationItemInputAudioTranscriptionDeltaEvent
| ConversationItemInputAudioTranscriptionFailedEvent
| RealtimeServerEvent.ConversationItemRetrieved
| ConversationItemTruncatedEvent
| ErrorEvent
| InputAudioBufferClearedEvent
| InputAudioBufferCommittedEvent
| InputAudioBufferSpeechStartedEvent
| InputAudioBufferSpeechStoppedEvent
| RateLimitsUpdatedEvent
| ResponseAudioDeltaEvent
| ResponseAudioDoneEvent
| ResponseAudioTranscriptDeltaEvent
| ResponseAudioTranscriptDoneEvent
| ResponseContentPartAddedEvent
| ResponseContentPartDoneEvent
| ResponseCreatedEvent
| ResponseDoneEvent
| ResponseFunctionCallArgumentsDeltaEvent
| ResponseFunctionCallArgumentsDoneEvent
| ResponseOutputItemAddedEvent
| ResponseOutputItemDoneEvent
| ResponseTextDeltaEvent
| ResponseTextDoneEvent
| SessionCreatedEvent
| SessionUpdatedEvent
| TranscriptionSessionUpdatedEvent
| RealtimeServerEvent.OutputAudioBufferStarted
| RealtimeServerEvent.OutputAudioBufferStopped
| RealtimeServerEvent.OutputAudioBufferCleared;
export namespace RealtimeServerEvent {
/**
* Returned when a conversation item is retrieved with
* `conversation.item.retrieve`.
*/
export interface ConversationItemRetrieved {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The item to add to the conversation.
*/
item: RealtimeAPI.ConversationItem;
/**
* The event type, must be `conversation.item.retrieved`.
*/
type: 'conversation.item.retrieved';
}
/**
* **WebRTC Only:** Emitted when the server begins streaming audio to the client.
* This event is emitted after an audio content part has been added
* (`response.content_part.added`) to the response.
* [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
*/
export interface OutputAudioBufferStarted {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The unique ID of the response that produced the audio.
*/
response_id: string;
/**
* The event type, must be `output_audio_buffer.started`.
*/
type: 'output_audio_buffer.started';
}
/**
* **WebRTC Only:** Emitted when the output audio buffer has been completely
* drained on the server, and no more audio is forthcoming. This event is emitted
* after the full response data has been sent to the client (`response.done`).
* [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
*/
export interface OutputAudioBufferStopped {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The unique ID of the response that produced the audio.
*/
response_id: string;
/**
* The event type, must be `output_audio_buffer.stopped`.
*/
type: 'output_audio_buffer.stopped';
}
/**
* **WebRTC Only:** Emitted when the output audio buffer is cleared. This happens
* either in VAD mode when the user has interrupted
* (`input_audio_buffer.speech_started`), or when the client has emitted the
* `output_audio_buffer.clear` event to manually cut off the current audio
* response.
* [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
*/
export interface OutputAudioBufferCleared {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The unique ID of the response that produced the audio.
*/
response_id: string;
/**
* The event type, must be `output_audio_buffer.cleared`.
*/
type: 'output_audio_buffer.cleared';
}
}
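// A minimal sketch of routing server events by their `type` discriminant. Only a few
// cases are shown; a real handler would cover more members of the union.
function handleServerEvent(event: RealtimeServerEvent): void {
  switch (event.type) {
    case 'response.audio.delta':
      // `event` is narrowed to ResponseAudioDeltaEvent here; queue audio for playback.
      break;
    case 'error':
      console.error(event.error.message);
      break;
    default:
      break;
  }
}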
/**
* Returned when the model-generated audio is updated.
*/
export interface ResponseAudioDeltaEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* Base64-encoded audio data delta.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.audio.delta`.
*/
type: 'response.audio.delta';
}
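// Illustrative only: decoding a base64 `response.audio.delta` payload back into
// 16-bit PCM samples in Node.js. Assumes the session's `output_audio_format` is
// `pcm16` and that Buffer is available in the runtime.
function decodeAudioDelta(event: ResponseAudioDeltaEvent): Int16Array {
  const bytes = Buffer.from(event.delta, 'base64');
  return new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2);
}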
/**
* Returned when the model-generated audio is done. Also emitted when a Response is
* interrupted, incomplete, or cancelled.
*/
export interface ResponseAudioDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.audio.done`.
*/
type: 'response.audio.done';
}
/**
* Returned when the model-generated transcription of audio output is updated.
*/
export interface ResponseAudioTranscriptDeltaEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The transcript delta.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.audio_transcript.delta`.
*/
type: 'response.audio_transcript.delta';
}
/**
* Returned when the model-generated transcription of audio output is done
* streaming. Also emitted when a Response is interrupted, incomplete, or
* cancelled.
*/
export interface ResponseAudioTranscriptDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The final transcript of the audio.
*/
transcript: string;
/**
* The event type, must be `response.audio_transcript.done`.
*/
type: 'response.audio_transcript.done';
}
/**
* Send this event to cancel an in-progress response. The server will respond with
* a `response.cancelled` event or an error if there is no response to cancel.
*/
export interface ResponseCancelEvent {
/**
* The event type, must be `response.cancel`.
*/
type: 'response.cancel';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
/**
* A specific response ID to cancel - if not provided, will cancel an in-progress
* response in the default conversation.
*/
response_id?: string;
}
/**
* Returned when a new content part is added to an assistant message item during
* response generation.
*/
export interface ResponseContentPartAddedEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item to which the content part was added.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The content part that was added.
*/
part: ResponseContentPartAddedEvent.Part;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.content_part.added`.
*/
type: 'response.content_part.added';
}
export namespace ResponseContentPartAddedEvent {
/**
* The content part that was added.
*/
export interface Part {
/**
* Base64-encoded audio data (if type is "audio").
*/
audio?: string;
/**
* The text content (if type is "text").
*/
text?: string;
/**
* The transcript of the audio (if type is "audio").
*/
transcript?: string;
/**
* The content type ("text", "audio").
*/
type?: 'text' | 'audio';
}
}
/**
* Returned when a content part is done streaming in an assistant message item.
* Also emitted when a Response is interrupted, incomplete, or cancelled.
*/
export interface ResponseContentPartDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The content part that is done.
*/
part: ResponseContentPartDoneEvent.Part;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.content_part.done`.
*/
type: 'response.content_part.done';
}
export namespace ResponseContentPartDoneEvent {
/**
* The content part that is done.
*/
export interface Part {
/**
* Base64-encoded audio data (if type is "audio").
*/
audio?: string;
/**
* The text content (if type is "text").
*/
text?: string;
/**
* The transcript of the audio (if type is "audio").
*/
transcript?: string;
/**
* The content type ("text", "audio").
*/
type?: 'text' | 'audio';
}
}
/**
* This event instructs the server to create a Response, which means triggering
* model inference. When in Server VAD mode, the server will create Responses
* automatically.
*
* A Response will include at least one Item, and may have two, in which case the
* second will be a function call. These Items will be appended to the conversation
* history.
*
* The server will respond with a `response.created` event, events for Items and
* content created, and finally a `response.done` event to indicate the Response is
* complete.
*
* The `response.create` event includes inference configuration like
* `instructions`, and `temperature`. These fields will override the Session's
* configuration for this Response only.
*/
export interface ResponseCreateEvent {
/**
* The event type, must be `response.create`.
*/
type: 'response.create';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
/**
* Create a new Realtime response with these parameters
*/
response?: ResponseCreateEvent.Response;
}
export namespace ResponseCreateEvent {
/**
* Create a new Realtime response with these parameters
*/
export interface Response {
/**
* Controls which conversation the response is added to. Currently supports `auto`
* and `none`, with `auto` as the default value. The `auto` value means that the
* contents of the response will be added to the default conversation. Set this to
* `none` to create an out-of-band response which will not add items to the default
* conversation.
*/
conversation?: (string & {}) | 'auto' | 'none';
/**
* Input items to include in the prompt for the model. Using this field creates a
* new context for this Response instead of using the default conversation. An
* empty array `[]` will clear the context for this Response. Note that this can
* include references to items from the default conversation.
*/
input?: Array<RealtimeAPI.ConversationItemWithReference>;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* Set of 16 key-value pairs that can be attached to an object. This can be useful
* for storing additional information about the object in a structured format, and
* querying for objects via API or the dashboard.
*
* Keys are strings with a maximum length of 64 characters. Values are strings with
* a maximum length of 512 characters.
*/
metadata?: Shared.Metadata | null;
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function, like `{"type": "function", "function": {"name": "my_function"}}`.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<Response.Tool>;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
* `shimmer`, and `verse`.
*/
voice?:
| (string & {})
| 'alloy'
| 'ash'
| 'ballad'
| 'coral'
| 'echo'
| 'fable'
| 'onyx'
| 'nova'
| 'sage'
| 'shimmer'
| 'verse';
}
export namespace Response {
export interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
}
}
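// A hedged sketch of a `response.create` event requesting an out-of-band, text-only
// response with a single function tool. The tool name, schema, and instructions are
// hypothetical examples, not values defined by the spec.
const exampleResponseCreate: ResponseCreateEvent = {
  type: 'response.create',
  response: {
    conversation: 'none',
    modalities: ['text'],
    instructions: 'Summarize the conversation so far in one sentence.',
    tools: [
      {
        type: 'function',
        name: 'get_weather',
        description: 'Look up the current weather for a city.',
        parameters: { type: 'object', properties: { city: { type: 'string' } } },
      },
    ],
  },
};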
/**
* Returned when a new Response is created. The first event of response creation,
* where the response is in an initial state of `in_progress`.
*/
export interface ResponseCreatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The response resource.
*/
response: RealtimeResponse;
/**
* The event type, must be `response.created`.
*/
type: 'response.created';
}
/**
* Returned when a Response is done streaming. Always emitted, no matter the final
* state. The Response object included in the `response.done` event will include
* all output Items in the Response but will omit the raw audio data.
*/
export interface ResponseDoneEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The response resource.
*/
response: RealtimeResponse;
/**
* The event type, must be `response.done`.
*/
type: 'response.done';
}
/**
* Returned when the model-generated function call arguments are updated.
*/
export interface ResponseFunctionCallArgumentsDeltaEvent {
/**
* The ID of the function call.
*/
call_id: string;
/**
* The arguments delta as a JSON string.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the function call item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.function_call_arguments.delta`.
*/
type: 'response.function_call_arguments.delta';
}
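// Illustrative only: accumulating streamed function-call argument deltas by call_id
// until the corresponding `response.function_call_arguments.done` event arrives.
const pendingArguments = new Map<string, string>();
function onArgumentsDelta(event: ResponseFunctionCallArgumentsDeltaEvent): void {
  const soFar = pendingArguments.get(event.call_id) ?? '';
  pendingArguments.set(event.call_id, soFar + event.delta);
}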
/**
* Returned when the model-generated function call arguments are done streaming.
* Also emitted when a Response is interrupted, incomplete, or cancelled.
*/
export interface ResponseFunctionCallArgumentsDoneEvent {
/**
* The final arguments as a JSON string.
*/
arguments: string;
/**
* The ID of the function call.
*/
call_id: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the function call item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.function_call_arguments.done`.
*/
type: 'response.function_call_