openai
Version:
The official TypeScript library for the OpenAI API
1,870 lines (1,612 loc) • 132 kB
text/typescript
// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
import { APIResource } from '../../core/resource';
import * as RealtimeAPI from './realtime';
import * as Shared from '../shared';
import * as ClientSecretsAPI from './client-secrets';
import {
ClientSecretCreateParams,
ClientSecretCreateResponse,
ClientSecrets,
RealtimeSessionClientSecret,
RealtimeSessionCreateResponse,
RealtimeTranscriptionSessionCreateResponse,
RealtimeTranscriptionSessionTurnDetection,
} from './client-secrets';
import * as ResponsesAPI from '../responses/responses';
export class Realtime extends APIResource {
  /**
   * Sub-resource exposing the Realtime client-secrets endpoints (see
   * `./client-secrets`).
   */
  clientSecrets: ClientSecretsAPI.ClientSecrets = new ClientSecretsAPI.ClientSecrets(this._client);
}
/**
 * Configuration for transcription of input audio.
 */
export interface AudioTranscription {
  /**
   * The language of the input audio. Supplying the input language in
   * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
   * format will improve accuracy and latency.
   */
  language?: string;
  /**
   * The model to use for transcription. Current options are `whisper-1`,
   * `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
   */
  model?: 'whisper-1' | 'gpt-4o-transcribe-latest' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe';
  /**
   * An optional text to guide the model's style or continue a previous audio
   * segment. For `whisper-1`, the
   * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
   * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
   * "expect words related to technology".
   */
  prompt?: string;
}
/**
 * Returned when a conversation is created. Emitted right after session creation.
 */
export interface ConversationCreatedEvent {
  /**
   * The conversation resource (its `id` and `object` type).
   */
  conversation: ConversationCreatedEvent.Conversation;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The event type, must be `conversation.created`.
   */
  type: 'conversation.created';
}
export namespace ConversationCreatedEvent {
  /**
   * The conversation resource.
   */
  export interface Conversation {
    /**
     * The unique ID of the conversation.
     */
    id?: string;
    /**
     * The object type, must be `realtime.conversation`.
     */
    object?: 'realtime.conversation';
  }
}
/**
 * A single item within a Realtime conversation: a system/user/assistant message,
 * a function call or its output, or one of the MCP item variants.
 */
export type ConversationItem =
  | RealtimeConversationItemSystemMessage
  | RealtimeConversationItemUserMessage
  | RealtimeConversationItemAssistantMessage
  | RealtimeConversationItemFunctionCall
  | RealtimeConversationItemFunctionCallOutput
  | RealtimeMcpApprovalResponse
  | RealtimeMcpListTools
  | RealtimeMcpToolCall
  | RealtimeMcpApprovalRequest;
/**
 * Sent by the server when an Item is added to the default Conversation. This can
 * happen in several cases:
 *
 * - When the client sends a `conversation.item.create` event.
 * - When the input audio buffer is committed. In this case the item will be a user
 *   message containing the audio from the buffer.
 * - When the model is generating a Response. In this case the
 *   `conversation.item.added` event will be sent when the model starts generating
 *   a specific Item, and thus it will not yet have any content (and `status` will
 *   be `in_progress`).
 *
 * Except when the model is still generating a Response, the event will include the
 * full content of the Item — minus audio data, which can be retrieved separately
 * with a `conversation.item.retrieve` event if necessary.
 */
export interface ConversationItemAdded {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * A single item within a Realtime conversation.
   */
  item: ConversationItem;
  /**
   * The event type, must be `conversation.item.added`.
   */
  type: 'conversation.item.added';
  /**
   * The ID of the item that precedes this one, if any. This is used to maintain
   * ordering when items are inserted.
   */
  previous_item_id?: string | null;
}
/**
 * Add a new Item to the Conversation's context, including messages, function
 * calls, and function call responses. This event can be used both to populate a
 * "history" of the conversation and to add new items mid-stream, but has the
 * current limitation that it cannot populate assistant audio messages.
 *
 * If successful, the server will respond with a `conversation.item.created` event,
 * otherwise an `error` event will be sent.
 */
export interface ConversationItemCreateEvent {
  /**
   * The item to add to the conversation: a message, function call, function call
   * output, or MCP item.
   */
  item: ConversationItem;
  /**
   * The event type, must be `conversation.item.create`.
   */
  type: 'conversation.item.create';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
  /**
   * The ID of the preceding item after which the new item will be inserted. If not
   * set, the new item will be appended to the end of the conversation. If set to
   * `root`, the new item will be added to the beginning of the conversation. If set
   * to an existing ID, it allows an item to be inserted mid-conversation. If the ID
   * cannot be found, an error will be returned and the item will not be added.
   */
  previous_item_id?: string;
}
/**
 * Returned when a conversation item is created. There are several scenarios that
 * produce this event:
 *
 * - The server is generating a Response, which if successful will produce either
 *   one or two Items, which will be of type `message` (role `assistant`) or type
 *   `function_call`.
 * - The input audio buffer has been committed, either by the client or the server
 *   (in `server_vad` mode). The server will take the content of the input audio
 *   buffer and add it to a new user message Item.
 * - The client has sent a `conversation.item.create` event to add a new Item to
 *   the Conversation.
 *
 * See also `conversation.item.added` and `conversation.item.done`, which mark the
 * start and completion of an item's content.
 */
export interface ConversationItemCreatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * A single item within a Realtime conversation.
   */
  item: ConversationItem;
  /**
   * The event type, must be `conversation.item.created`.
   */
  type: 'conversation.item.created';
  /**
   * The ID of the preceding item in the Conversation context, allows the client to
   * understand the order of the conversation. Can be `null` if the item has no
   * predecessor.
   */
  previous_item_id?: string | null;
}
/**
 * Send this event when you want to remove any item from the conversation history.
 * The server will respond with a `conversation.item.deleted` event, unless the
 * item does not exist in the conversation history, in which case the server will
 * respond with an error.
 */
export interface ConversationItemDeleteEvent {
  /**
   * The ID of the item to delete.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.delete`.
   */
  type: 'conversation.item.delete';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Returned when an item in the conversation is deleted by the client with a
 * `conversation.item.delete` event. This event is used to synchronize the server's
 * understanding of the conversation history with the client's view.
 */
export interface ConversationItemDeletedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item that was deleted.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.deleted`.
   */
  type: 'conversation.item.deleted';
}
/**
 * Returned when a conversation item is finalized. Counterpart of
 * `conversation.item.added`, which marks the start of the item.
 *
 * The event will include the full content of the Item except for audio data, which
 * can be retrieved separately with a `conversation.item.retrieve` event if needed.
 */
export interface ConversationItemDone {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * A single item within a Realtime conversation.
   */
  item: ConversationItem;
  /**
   * The event type, must be `conversation.item.done`.
   */
  type: 'conversation.item.done';
  /**
   * The ID of the item that precedes this one, if any. This is used to maintain
   * ordering when items are inserted.
   */
  previous_item_id?: string | null;
}
/**
 * This event is the output of audio transcription for user audio written to the
 * user audio buffer. Transcription begins when the input audio buffer is committed
 * by the client or server (when VAD is enabled). Transcription runs asynchronously
 * with Response creation, so this event may come before or after the Response
 * events.
 *
 * Realtime API models accept audio natively, and thus input transcription is a
 * separate process run on a separate ASR (Automatic Speech Recognition) model. The
 * transcript may diverge somewhat from the model's interpretation, and should be
 * treated as a rough guide.
 */
export interface ConversationItemInputAudioTranscriptionCompletedEvent {
  /**
   * The index of the content part containing the audio.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item containing the audio that is being transcribed.
   */
  item_id: string;
  /**
   * The transcribed text.
   */
  transcript: string;
  /**
   * The event type, must be `conversation.item.input_audio_transcription.completed`.
   */
  type: 'conversation.item.input_audio_transcription.completed';
  /**
   * Usage statistics for the transcription, this is billed according to the ASR
   * model's pricing rather than the realtime model's pricing. Either token-based
   * (`type: "tokens"`) or duration-based (`type: "duration"`), depending on how
   * the ASR model is billed.
   */
  usage:
    | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageTokens
    | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageDuration;
  /**
   * The log probabilities of the transcription.
   */
  logprobs?: Array<LogProbProperties> | null;
}
export namespace ConversationItemInputAudioTranscriptionCompletedEvent {
  /**
   * Usage statistics for models billed by token usage.
   */
  export interface TranscriptTextUsageTokens {
    /**
     * Number of input tokens billed for this request.
     */
    input_tokens: number;
    /**
     * Number of output tokens generated.
     */
    output_tokens: number;
    /**
     * Total number of tokens used (input + output).
     */
    total_tokens: number;
    /**
     * The type of the usage object. Always `tokens` for this variant.
     */
    type: 'tokens';
    /**
     * Details about the input tokens billed for this request.
     */
    input_token_details?: TranscriptTextUsageTokens.InputTokenDetails;
  }
  export namespace TranscriptTextUsageTokens {
    /**
     * Details about the input tokens billed for this request.
     */
    export interface InputTokenDetails {
      /**
       * Number of audio tokens billed for this request.
       */
      audio_tokens?: number;
      /**
       * Number of text tokens billed for this request.
       */
      text_tokens?: number;
    }
  }
  /**
   * Usage statistics for models billed by audio input duration.
   */
  export interface TranscriptTextUsageDuration {
    /**
     * Duration of the input audio in seconds.
     */
    seconds: number;
    /**
     * The type of the usage object. Always `duration` for this variant.
     */
    type: 'duration';
  }
}
/**
 * Returned when the text value of an input audio transcription content part is
 * updated with incremental transcription results.
 */
export interface ConversationItemInputAudioTranscriptionDeltaEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item containing the audio that is being transcribed.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.input_audio_transcription.delta`.
   */
  type: 'conversation.item.input_audio_transcription.delta';
  /**
   * The index of the content part in the item's content array.
   */
  content_index?: number;
  /**
   * The text delta.
   */
  delta?: string;
  /**
   * The log probabilities of the transcription. These can be enabled by
   * configuring the session with
   * `"include": ["item.input_audio_transcription.logprobs"]`. Each entry in the
   * array corresponds to a log probability of which token would be selected for
   * this chunk of transcription. This can help to identify if it was possible
   * there were multiple valid options for a given chunk of transcription.
   */
  logprobs?: Array<LogProbProperties> | null;
}
/**
 * Returned when input audio transcription is configured, and a transcription
 * request for a user message failed. These events are separate from other `error`
 * events so that the client can identify the related Item via `item_id`.
 */
export interface ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * The index of the content part containing the audio.
   */
  content_index: number;
  /**
   * Details of the transcription error.
   */
  error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.input_audio_transcription.failed`.
   */
  type: 'conversation.item.input_audio_transcription.failed';
}
export namespace ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * Details of the transcription error.
   */
  export interface Error {
    /**
     * Error code, if any.
     */
    code?: string;
    /**
     * A human-readable error message.
     */
    message?: string;
    /**
     * Parameter related to the error, if any.
     */
    param?: string;
    /**
     * The type of error.
     */
    type?: string;
  }
}
/**
 * Returned when an input audio transcription segment is identified for an item.
 * A segment is a speaker-labeled span of the transcript, bounded by `start` and
 * `end` times in seconds.
 */
export interface ConversationItemInputAudioTranscriptionSegment {
  /**
   * The segment identifier.
   */
  id: string;
  /**
   * The index of the input audio content part within the item.
   */
  content_index: number;
  /**
   * End time of the segment in seconds.
   */
  end: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item containing the input audio content.
   */
  item_id: string;
  /**
   * The detected speaker label for this segment.
   */
  speaker: string;
  /**
   * Start time of the segment in seconds.
   */
  start: number;
  /**
   * The text for this segment.
   */
  text: string;
  /**
   * The event type, must be `conversation.item.input_audio_transcription.segment`.
   */
  type: 'conversation.item.input_audio_transcription.segment';
}
/**
 * Send this event when you want to retrieve the server's representation of a
 * specific item in the conversation history. This is useful, for example, to
 * inspect user audio after noise cancellation and VAD.
 *
 * The server will respond with a `conversation.item.retrieved` event, unless the
 * item does not exist in the conversation history, in which case the server will
 * respond with an error.
 */
export interface ConversationItemRetrieveEvent {
  /**
   * The ID of the item to retrieve.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.retrieve`.
   */
  type: 'conversation.item.retrieve';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Send this event to truncate a previous assistant message's audio. The server
 * will produce audio faster than realtime, so this event is useful when the user
 * interrupts to truncate audio that has already been sent to the client but not
 * yet played. This will synchronize the server's understanding of the audio with
 * the client's playback.
 *
 * Truncating audio will delete the server-side text transcript to ensure there is
 * no text in the context that hasn't been heard by the user.
 *
 * If successful, the server will respond with a `conversation.item.truncated`
 * event.
 */
export interface ConversationItemTruncateEvent {
  /**
   * Inclusive duration up to which audio is truncated, in milliseconds. If the
   * audio_end_ms is greater than the actual audio duration, the server will respond
   * with an error.
   */
  audio_end_ms: number;
  /**
   * The index of the content part to truncate. Set this to `0`.
   */
  content_index: number;
  /**
   * The ID of the assistant message item to truncate. Only assistant message items
   * can be truncated.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.truncate`.
   */
  type: 'conversation.item.truncate';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Returned when an earlier assistant audio message item is truncated by the client
 * with a `conversation.item.truncate` event. This event is used to synchronize the
 * server's understanding of the audio with the client's playback.
 *
 * This action will truncate the audio and remove the server-side text transcript
 * to ensure there is no text in the context that hasn't been heard by the user.
 */
export interface ConversationItemTruncatedEvent {
  /**
   * The duration up to which the audio was truncated, in milliseconds.
   */
  audio_end_ms: number;
  /**
   * The index of the content part that was truncated.
   */
  content_index: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the assistant message item that was truncated.
   */
  item_id: string;
  /**
   * The event type, must be `conversation.item.truncated`.
   */
  type: 'conversation.item.truncated';
}
/**
 * The item to add to the conversation.
 */
export interface ConversationItemWithReference {
  /**
   * For an item of type (`message` | `function_call` | `function_call_output`) this
   * field allows the client to assign the unique ID of the item. It is not required
   * because the server will generate one if not provided.
   *
   * For an item of type `item_reference`, this field is required and is a reference
   * to any item that has previously existed in the conversation.
   */
  id?: string;
  /**
   * The arguments of the function call (for `function_call` items).
   */
  arguments?: string;
  /**
   * The ID of the function call (for `function_call` and `function_call_output`
   * items). If passed on a `function_call_output` item, the server will check that a
   * `function_call` item with the same ID exists in the conversation history.
   */
  call_id?: string;
  /**
   * The content of the message, applicable for `message` items.
   *
   * - Message items of role `system` support only `input_text` content
   * - Message items of role `user` support `input_text` and `input_audio` content
   * - Message items of role `assistant` support `text` content.
   */
  content?: Array<ConversationItemWithReference.Content>;
  /**
   * The name of the function being called (for `function_call` items).
   */
  name?: string;
  /**
   * Identifier for the API object being returned - always `realtime.item`.
   */
  object?: 'realtime.item';
  /**
   * The output of the function call (for `function_call_output` items).
   */
  output?: string;
  /**
   * The role of the message sender (`user`, `assistant`, `system`), only applicable
   * for `message` items.
   */
  role?: 'user' | 'assistant' | 'system';
  /**
   * The status of the item (`completed`, `incomplete`, `in_progress`). These have no
   * effect on the conversation, but are accepted for consistency with the
   * `conversation.item.created` event.
   */
  status?: 'completed' | 'incomplete' | 'in_progress';
  /**
   * The type of the item (`message`, `function_call`, `function_call_output`,
   * `item_reference`).
   */
  type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference';
}
export namespace ConversationItemWithReference {
  /**
   * A single content part of a `message` item, or a reference to a previous item.
   */
  export interface Content {
    /**
     * ID of a previous conversation item to reference (for `item_reference` content
     * types in `response.create` events). These can reference both client and server
     * created items.
     */
    id?: string;
    /**
     * Base64-encoded audio bytes, used for `input_audio` content type.
     */
    audio?: string;
    /**
     * The text content, used for `input_text` and `text` content types.
     */
    text?: string;
    /**
     * The transcript of the audio, used for `input_audio` content type.
     */
    transcript?: string;
    /**
     * The content type (`input_text`, `input_audio`, `item_reference`, `text`).
     */
    type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
  }
}
/**
 * Send this event to append audio bytes to the input audio buffer. The audio
 * buffer is temporary storage you can write to and later commit. A "commit" will
 * create a new user message item in the conversation history from the buffer
 * content and clear the buffer. Input audio transcription (if enabled) will be
 * generated when the buffer is committed.
 *
 * If VAD is enabled the audio buffer is used to detect speech and the server will
 * decide when to commit. When Server VAD is disabled, you must commit the audio
 * buffer manually. Input audio noise reduction operates on writes to the audio
 * buffer.
 *
 * The client may choose how much audio to place in each event up to a maximum of
 * 15 MiB, for example streaming smaller chunks from the client may allow the VAD
 * to be more responsive. Unlike most other client events, the server will not send
 * a confirmation response to this event.
 */
export interface InputAudioBufferAppendEvent {
  /**
   * Base64-encoded audio bytes. This must be in the format specified by the
   * `input_audio_format` field in the session configuration.
   */
  audio: string;
  /**
   * The event type, must be `input_audio_buffer.append`.
   */
  type: 'input_audio_buffer.append';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Send this event to clear the audio bytes in the buffer. The server will respond
 * with an `input_audio_buffer.cleared` event.
 */
export interface InputAudioBufferClearEvent {
  /**
   * The event type, must be `input_audio_buffer.clear`.
   */
  type: 'input_audio_buffer.clear';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Returned when the input audio buffer is cleared by the client with an
 * `input_audio_buffer.clear` event.
 */
export interface InputAudioBufferClearedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The event type, must be `input_audio_buffer.cleared`.
   */
  type: 'input_audio_buffer.cleared';
}
/**
 * Send this event to commit the user input audio buffer, which will create a new
 * user message item in the conversation. This event will produce an error if the
 * input audio buffer is empty. When in Server VAD mode, the client does not need
 * to send this event, the server will commit the audio buffer automatically.
 *
 * Committing the input audio buffer will trigger input audio transcription (if
 * enabled in session configuration), but it will not create a response from the
 * model. The server will respond with an `input_audio_buffer.committed` event.
 */
export interface InputAudioBufferCommitEvent {
  /**
   * The event type, must be `input_audio_buffer.commit`.
   */
  type: 'input_audio_buffer.commit';
  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}
/**
 * Returned when an input audio buffer is committed, either by the client or
 * automatically in server VAD mode. The `item_id` property is the ID of the user
 * message item that will be created, thus a `conversation.item.created` event will
 * also be sent to the client.
 */
export interface InputAudioBufferCommittedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item that will be created.
   */
  item_id: string;
  /**
   * The event type, must be `input_audio_buffer.committed`.
   */
  type: 'input_audio_buffer.committed';
  /**
   * The ID of the preceding item after which the new item will be inserted. Can be
   * `null` if the item has no predecessor.
   */
  previous_item_id?: string | null;
}
/**
 * Sent by the server when in `server_vad` mode to indicate that speech has been
 * detected in the audio buffer. This can happen any time audio is added to the
 * buffer (unless speech is already detected). The client may want to use this
 * event to interrupt audio playback or provide visual feedback to the user.
 *
 * The client should expect to receive an `input_audio_buffer.speech_stopped` event
 * when speech stops. The `item_id` property is the ID of the user message item
 * that will be created when speech stops and will also be included in the
 * `input_audio_buffer.speech_stopped` event (unless the client manually commits
 * the audio buffer during VAD activation).
 */
export interface InputAudioBufferSpeechStartedEvent {
  /**
   * Milliseconds from the start of all audio written to the buffer during the
   * session when speech was first detected. This will correspond to the beginning of
   * audio sent to the model, and thus includes the `prefix_padding_ms` configured in
   * the Session.
   */
  audio_start_ms: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item that will be created when speech stops.
   */
  item_id: string;
  /**
   * The event type, must be `input_audio_buffer.speech_started`.
   */
  type: 'input_audio_buffer.speech_started';
}
/**
 * Returned in `server_vad` mode when the server detects the end of speech in the
 * audio buffer. The server will also send a `conversation.item.created` event
 * with the user message item that is created from the audio buffer.
 */
export interface InputAudioBufferSpeechStoppedEvent {
  /**
   * Milliseconds since the session started when speech stopped. This will correspond
   * to the end of audio sent to the model, and thus includes the
   * `min_silence_duration_ms` configured in the Session.
   */
  audio_end_ms: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the user message item that will be created.
   */
  item_id: string;
  /**
   * The event type, must be `input_audio_buffer.speech_stopped`.
   */
  type: 'input_audio_buffer.speech_stopped';
}
/**
 * Returned when the server VAD timeout is triggered for the input audio buffer.
 * The event carries the detected speech span within the buffered audio
 * (`audio_start_ms`–`audio_end_ms`).
 */
export interface InputAudioBufferTimeoutTriggered {
  /**
   * Millisecond offset where speech ended within the buffered audio.
   */
  audio_end_ms: number;
  /**
   * Millisecond offset where speech started within the buffered audio.
   */
  audio_start_ms: number;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the item associated with this segment.
   */
  item_id: string;
  /**
   * The event type, must be `input_audio_buffer.timeout_triggered`.
   */
  type: 'input_audio_buffer.timeout_triggered';
}
/**
 * A log probability object, as returned in transcription `logprobs` arrays.
 */
export interface LogProbProperties {
  /**
   * The token that was used to generate the log probability.
   */
  token: string;
  /**
   * The bytes that were used to generate the log probability.
   */
  bytes: Array<number>;
  /**
   * The log probability of the token.
   */
  logprob: number;
}
/**
 * Returned when listing MCP tools has completed for an item. Terminal counterpart
 * of `mcp_list_tools.in_progress` (the failure case is `mcp_list_tools.failed`).
 */
export interface McpListToolsCompleted {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the MCP list tools item.
   */
  item_id: string;
  /**
   * The event type, must be `mcp_list_tools.completed`.
   */
  type: 'mcp_list_tools.completed';
}
/**
 * Returned when listing MCP tools has failed for an item. Terminal counterpart of
 * `mcp_list_tools.in_progress` (the success case is `mcp_list_tools.completed`).
 */
export interface McpListToolsFailed {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the MCP list tools item.
   */
  item_id: string;
  /**
   * The event type, must be `mcp_list_tools.failed`.
   */
  type: 'mcp_list_tools.failed';
}
/**
 * Returned when listing MCP tools is in progress for an item. Followed by either
 * `mcp_list_tools.completed` or `mcp_list_tools.failed`.
 */
export interface McpListToolsInProgress {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The ID of the MCP list tools item.
   */
  item_id: string;
  /**
   * The event type, must be `mcp_list_tools.in_progress`.
   */
  type: 'mcp_list_tools.in_progress';
}
/**
 * Type of noise reduction. `near_field` is for close-talking microphones such as
 * headphones; `far_field` is for far-field microphones such as laptop or
 * conference room microphones.
 */
export type NoiseReductionType = 'near_field' | 'far_field';
/**
 * **WebRTC Only:** Emit to cut off the current audio response. This will trigger
 * the server to stop generating audio and emit a `output_audio_buffer.cleared`
 * event. This event should be preceded by a `response.cancel` client event to stop
 * the generation of the current response.
 * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc).
 */
export interface OutputAudioBufferClearEvent {
  /**
   * The event type, must be `output_audio_buffer.clear`.
   */
  type: 'output_audio_buffer.clear';
  /**
   * The unique ID of the client event used for error handling.
   */
  event_id?: string;
}
/**
 * Emitted at the beginning of a Response to indicate the updated rate limits. When
 * a Response is created some tokens will be "reserved" for the output tokens, the
 * rate limits shown here reflect that reservation, which is then adjusted
 * accordingly once the Response is completed.
 */
export interface RateLimitsUpdatedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * List of rate limit information.
   */
  rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;
  /**
   * The event type, must be `rate_limits.updated`.
   */
  type: 'rate_limits.updated';
}
export namespace RateLimitsUpdatedEvent {
  /**
   * Information about a single rate limit (`requests` or `tokens`).
   */
  export interface RateLimit {
    /**
     * The maximum allowed value for the rate limit.
     */
    limit?: number;
    /**
     * The name of the rate limit (`requests`, `tokens`).
     */
    name?: 'requests' | 'tokens';
    /**
     * The remaining value before the limit is reached.
     */
    remaining?: number;
    /**
     * Seconds until the rate limit resets.
     */
    reset_seconds?: number;
  }
}
/**
 * Configuration for input and output audio.
 */
export interface RealtimeAudioConfig {
  /**
   * Configuration for input audio (format, noise reduction, transcription, and
   * turn detection).
   */
  input?: RealtimeAudioConfigInput;
  /**
   * Configuration for output audio (format, speed, and voice).
   */
  output?: RealtimeAudioConfigOutput;
}
/**
 * Configuration for input audio.
 */
export interface RealtimeAudioConfigInput {
  /**
   * The format of the input audio.
   */
  format?: RealtimeAudioFormats;
  /**
   * Configuration for input audio noise reduction. This can be set to `null` to turn
   * off. Noise reduction filters audio added to the input audio buffer before it is
   * sent to VAD and the model. Filtering the audio can improve VAD and turn
   * detection accuracy (reducing false positives) and model performance by improving
   * perception of the input audio.
   */
  noise_reduction?: RealtimeAudioConfigInput.NoiseReduction;
  /**
   * Configuration for input audio transcription, defaults to off and can be set to
   * `null` to turn off once on. Input audio transcription is not native to the
   * model, since the model consumes audio directly. Transcription runs
   * asynchronously through
   * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
   * and should be treated as guidance of input audio content rather than precisely
   * what the model heard. The client can optionally set the language and prompt for
   * transcription, these offer additional guidance to the transcription service.
   */
  transcription?: AudioTranscription;
  /**
   * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
   * set to `null` to turn off, in which case the client must manually trigger model
   * response. Server VAD means that the model will detect the start and end of
   * speech based on audio volume and respond at the end of user speech. Semantic VAD
   * is more advanced and uses a turn detection model (in conjunction with VAD) to
   * semantically estimate whether the user has finished speaking, then dynamically
   * sets a timeout based on this probability. For example, if user audio trails off
   * with "uhhm", the model will score a low probability of turn end and wait longer
   * for the user to continue speaking. This can be useful for more natural
   * conversations, but may have a higher latency.
   */
  turn_detection?: RealtimeAudioInputTurnDetection;
}
export namespace RealtimeAudioConfigInput {
  /**
   * Configuration for input audio noise reduction. This can be set to `null` to turn
   * off. Noise reduction filters audio added to the input audio buffer before it is
   * sent to VAD and the model. Filtering the audio can improve VAD and turn
   * detection accuracy (reducing false positives) and model performance by improving
   * perception of the input audio.
   */
  export interface NoiseReduction {
    /**
     * Type of noise reduction. `near_field` is for close-talking microphones such as
     * headphones, `far_field` is for far-field microphones such as laptop or
     * conference room microphones.
     */
    type?: RealtimeAPI.NoiseReductionType;
  }
}
export interface RealtimeAudioConfigOutput {
  /**
   * The format of the output audio.
   */
  format?: RealtimeAudioFormats;

  /**
   * Playback speed of the model's spoken response, as a multiple of the original
   * speed. Defaults to 1.0; the valid range is 0.25 (slowest) through 1.5
   * (fastest). This may only be changed between model turns, never while a
   * response is in progress.
   *
   * Note that this is a post-processing adjustment applied after the audio is
   * generated — prompting the model to speak faster or slower is also possible.
   */
  speed?: number;

  /**
   * The voice the model responds with. Once the model has answered with audio at
   * least once, the voice is locked for the remainder of the session. Current
   * voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
   * `shimmer`, `verse`, `marin`, and `cedar`; `marin` and `cedar` are
   * recommended for best quality.
   */
  voice?:
    | 'alloy'
    | 'ash'
    | 'ballad'
    | 'coral'
    | 'echo'
    | 'sage'
    | 'shimmer'
    | 'verse'
    | 'marin'
    | 'cedar'
    | (string & {});
}
/**
 * The format of the audio: either the PCM format (24kHz sample rate only), the
 * G.711 μ-law format (`audio/pcmu`), or the G.711 A-law format (`audio/pcma`).
 */
export type RealtimeAudioFormats =
  | RealtimeAudioFormats.AudioPCM
  | RealtimeAudioFormats.AudioPCMU
  | RealtimeAudioFormats.AudioPCMA;
export namespace RealtimeAudioFormats {
  /**
   * The PCM audio format. Only a 24kHz sample rate is supported.
   */
  export interface AudioPCM {
    /**
     * The sample rate of the audio. Always `24000`.
     */
    rate?: 24000;
    /**
     * The audio format. Always `audio/pcm`.
     */
    type?: 'audio/pcm';
  }
  /**
   * The G.711 μ-law format.
   */
  export interface AudioPCMU {
    /**
     * The audio format. Always `audio/pcmu`.
     */
    type?: 'audio/pcmu';
  }
  /**
   * The G.711 A-law format.
   */
  export interface AudioPCMA {
    /**
     * The audio format. Always `audio/pcma`.
     */
    type?: 'audio/pcma';
  }
}
/**
 * Configuration for turn detection — either Server VAD or Semantic VAD. Set to
 * `null` to disable turn detection, in which case the client must trigger model
 * responses manually.
 *
 * Server VAD detects the start and end of speech from audio volume and responds
 * at the end of user speech. Semantic VAD is more advanced: it runs a
 * turn-detection model (in conjunction with VAD) to semantically estimate
 * whether the user has finished speaking, then dynamically sets a timeout from
 * that probability. For example, if the user's audio trails off with "uhhm",
 * the model scores a low probability of turn end and waits longer. This yields
 * more natural conversations, but may add latency.
 */
export interface RealtimeAudioInputTurnDetection {
  /**
   * Type of turn detection.
   */
  type?: 'server_vad' | 'semantic_vad';

  /**
   * Whether a response is generated automatically when a VAD stop event occurs.
   */
  create_response?: boolean;

  /**
   * Whether a VAD start event automatically interrupts any ongoing response
   * with output to the default conversation (i.e. `conversation` of `auto`).
   */
  interrupt_response?: boolean;

  /**
   * Optional idle timeout: when no additional audio arrives within this window,
   * turn detection auto-times-out and a `timeout_triggered` event is emitted.
   */
  idle_timeout_ms?: number | null;

  /**
   * Used only for `semantic_vad` mode. How eagerly the model responds: `low`
   * waits longer for the user to continue speaking, `high` responds more
   * quickly, and `auto` (the default) is equivalent to `medium`. Max timeouts
   * are 8s, 4s, and 2s for `low`, `medium`, and `high` respectively.
   */
  eagerness?: 'low' | 'medium' | 'high' | 'auto';

  /**
   * Used only for `server_vad` mode. Milliseconds of audio included before the
   * VAD detected speech. Defaults to 300ms.
   */
  prefix_padding_ms?: number;

  /**
   * Used only for `server_vad` mode. Milliseconds of silence that mark the end
   * of speech. Defaults to 500ms; shorter values make the model respond more
   * quickly but risk jumping in on short pauses from the user.
   */
  silence_duration_ms?: number;

  /**
   * Used only for `server_vad` mode. VAD activation threshold in 0.0–1.0,
   * defaulting to 0.5. A higher threshold requires louder audio to activate the
   * model and may perform better in noisy environments.
   */
  threshold?: number;
}
/**
 * A realtime client event: the union of every event a client may send to the
 * server over a Realtime connection — conversation item management, input and
 * output audio buffer control, response control, and session updates.
 */
export type RealtimeClientEvent =
  | ConversationItemCreateEvent
  | ConversationItemDeleteEvent
  | ConversationItemRetrieveEvent
  | ConversationItemTruncateEvent
  | InputAudioBufferAppendEvent
  | InputAudioBufferClearEvent
  | OutputAudioBufferClearEvent
  | InputAudioBufferCommitEvent
  | ResponseCancelEvent
  | ResponseCreateEvent
  | SessionUpdateEvent;
/**
 * An assistant message item in a Realtime conversation.
 */
export interface RealtimeConversationItemAssistantMessage {
  /**
   * The type of the item. Always `message`.
   */
  type: 'message';

  /**
   * The role of the message sender. Always `assistant`.
   */
  role: 'assistant';

  /**
   * The content of the message.
   */
  content: Array<RealtimeConversationItemAssistantMessage.Content>;

  /**
   * The unique ID of the item; supplied by the client or generated by the
   * server.
   */
  id?: string;

  /**
   * Identifier for the API object being returned — always `realtime.item`.
   * Optional when creating a new item.
   */
  object?: 'realtime.item';

  /**
   * The status of the item. Has no effect on the conversation.
   */
  status?: 'completed' | 'incomplete' | 'in_progress';
}
export namespace RealtimeConversationItemAssistantMessage {
  /**
   * A single content part of an assistant message.
   */
  export interface Content {
    /**
     * The content type — `output_text` or `output_audio`, matching the
     * session's `output_modalities` configuration.
     */
    type?: 'output_text' | 'output_audio';

    /**
     * The text content.
     */
    text?: string;

    /**
     * Base64-encoded audio bytes, parsed using the format from the session's
     * output audio type configuration (PCM 16-bit 24kHz mono when
     * unspecified).
     */
    audio?: string;

    /**
     * The transcript of the audio content; always present when the output type
     * is `audio`.
     */
    transcript?: string;
  }
}
/**
 * A function call item in a Realtime conversation.
 */
export interface RealtimeConversationItemFunctionCall {
  /**
   * The type of the item. Always `function_call`.
   */
  type: 'function_call';

  /**
   * The name of the function being called.
   */
  name: string;

  /**
   * A JSON-encoded string of the arguments passed to the function, for example
   * `{"arg1": "value1", "arg2": 42}`.
   */
  arguments: string;

  /**
   * The ID of the function call.
   */
  call_id?: string;

  /**
   * The unique ID of the item; supplied by the client or generated by the
   * server.
   */
  id?: string;

  /**
   * Identifier for the API object being returned — always `realtime.item`.
   * Optional when creating a new item.
   */
  object?: 'realtime.item';

  /**
   * The status of the item. Has no effect on the conversation.
   */
  status?: 'completed' | 'incomplete' | 'in_progress';
}
/**
 * A function call output item in a Realtime conversation.
 */
export interface RealtimeConversationItemFunctionCallOutput {
  /**
   * The type of the item. Always `function_call_output`.
   */
  type: 'function_call_output';

  /**
   * The ID of the function call this output is for.
   */
  call_id: string;

  /**
   * The output of the function call: free text that may carry any information
   * or simply be empty.
   */
  output: string;

  /**
   * The unique ID of the item; supplied by the client or generated by the
   * server.
   */
  id?: string;

  /**
   * Identifier for the API object being returned — always `realtime.item`.
   * Optional when creating a new item.
   */
  object?: 'realtime.item';

  /**
   * The status of the item. Has no effect on the conversation.
   */
  status?: 'completed' | 'incomplete' | 'in_progress';
}
/**
 * A system message in a Realtime conversation, used to provide additional
 * context or instructions to the model. It is similar to — but distinct from —
 * the instruction prompt given at the start of a conversation, because system
 * messages can be added at any point. Use instructions for major changes to the
 * conversation's behavior and system messages for smaller updates (e.g. "the
 * user is now asking about a different topic").
 */
export interface RealtimeConversationItemSystemMessage {
  /**
   * The type of the item. Always `message`.
   */
  type: 'message';

  /**
   * The role of the message sender. Always `system`.
   */
  role: 'system';

  /**
   * The content of the message.
   */
  content: Array<RealtimeConversationItemSystemMessage.Content>;

  /**
   * The unique ID of the item; supplied by the client or generated by the
   * server.
   */
  id?: string;

  /**
   * Identifier for the API object being returned — always `realtime.item`.
   * Optional when creating a new item.
   */
  object?: 'realtime.item';

  /**
   * The status of the item. Has no effect on the conversation.
   */
  status?: 'completed' | 'incomplete' | 'in_progress';
}
export namespace RealtimeConversationItemSystemMessage {
  /**
   * A single content part of a system message.
   */
  export interface Content {
    /**
     * The content type. Always `input_text` for system messages.
     */
    type?: 'input_text';

    /**
     * The text content.
     */
    text?: string;
  }
}
/**
 * A user message item in a Realtime conversation.
 */
export interface RealtimeConversationItemUserMessage {
  /**
   * The type of the item. Always `message`.
   */
  type: 'message';

  /**
   * The role of the message sender. Always `user`.
   */
  role: 'user';

  /**
   * The content of the message.
   */
  content: Array<RealtimeConversationItemUserMessage.Content>;

  /**
   * The unique ID of the item; supplied by the client or generated by the
   * server.
   */
  id?: string;

  /**
   * Identifier for the API object being returned — always `realtime.item`.
   * Optional when creating a new item.
   */
  object?: 'realtime.item';

  /**
   * The status of the item. Has no effect on the conversation.
   */
  status?: 'completed' | 'incomplete' | 'in_progress';
}
export namespace RealtimeConversationItemUserMessage {
  /**
   * A single content part of a user message: text, audio, or image, selected
   * by `type`.
   */
  export interface Content {
    /**
     * The content type (`input_text`, `input_audio`, or `input_image`).
     */
    type?: 'input_text' | 'input_audio' | 'input_image';

    /**
     * The text content (for `input_text`).
     */
    text?: string;

    /**
     * Base64-encoded audio bytes (for `input_audio`), parsed using the format
     * from the session's input audio type configuration (PCM 16-bit 24kHz mono
     * when unspecified).
     */
    audio?: string;

    /**
     * Transcript of the audio (for `input_audio`). Not sent to the model;
     * attached to the message item for reference only.
     */
    transcript?: string;

    /**
     * Base64-encoded image bytes (for `input_image`) as a data URI, e.g.
     * `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. PNG and JPEG are
     * the supported formats.
     */
    image_url?: string;

    /**
     * The detail level of the image (for `input_image`); `auto` defaults to
     * `high`.
     */
    detail?: 'auto' | 'low' | 'high';
  }
}
/**
 * Details of the error.
 */
export interface RealtimeError {
  /**
   * The type of error (e.g., "invalid_request_error", "server_error").
   */
  type: string;

  /**
   * A human-readable error message.
   */
  message: string;

  /**
   * Error code, if any.
   */
  code?: string | null;

  /**
   * Parameter related to the error, if any.
   */
  param?: string | null;

  /**
   * The event_id of the client event that caused the error, if applicable.
   */
  event_id?: string | null;
}
/**
 * Returned when an error occurs, which could be a client problem or a server
 * problem. Most errors are recoverable and the session will stay open; we
 * recommend that implementers monitor and log error messages by default.
 */
export interface RealtimeErrorEvent {
  /**
   * Details of the error.
   */
  error: RealtimeError;
  /**
   * The unique ID of the server event.
   */
  event_id: string;
  /**
   * The event type, must be `error`.
   */
  type: 'error';
}
/**
 * A function tool the model may call during a Realtime session.
 */
export interface RealtimeFunctionTool {
  /**
   * The type of the tool, i.e. `function`.
   */
  type?: 'function';

  /**
   * The name of the function.
   */
  name?: string;

  /**
   * The description of the function: guidance on when and how to call it, and
   * on what (if anything) to tell the user when calling.
   */
  description?: string;

  /**
   * Parameters of the function in JSON Schema.
   */
  parameters?: unknown;
}
/**
 * A Realtime item requesting human approval of a tool invocation.
 */
export interface RealtimeMcpApprovalRequest {
  /**
   * The unique ID of the approval request.
   */
  id: string;

  /**
   * The type of the item. Always `mcp_approval_request`.
   */
  type: 'mcp_approval_request';

  /**
   * The label of the MCP server making the request.
   */
  server_label: string;

  /**
   * The name of the tool to run.
   */
  name: string;

  /**
   * A JSON string of arguments for the tool.
   */
  arguments: string;
}
/**
 * A Realtime item responding to an MCP approval request.
 */
export interface RealtimeMcpApprovalResponse {
  /**
   * The unique ID of the approval response.
   */
  id: string;

  /**
   * The type of the item. Always `mcp_approval_response`.
   */
  type: 'mcp_approval_response';

  /**
   * The ID of the approval request being answered.
   */
  approval_request_id: string;

  /**
   * Whether the request was approved.
   */
  approve: boolean;

  /**
   * Optional reason for the decision.
   */
  reason?: string | null;
}
/**
 * A Realtime item listing tools available on an MCP server.
 */
export interface RealtimeMcpListTools {
  /**
   * The type of the item. Always `mcp_list_tools`.
   */
  type: 'mcp_list_tools';

  /**
   * The label of the MCP server.
   */
  server_label: string;

  /**
   * The tools available on the server.
   */
  tools: Array<RealtimeMcpListTools.Tool>;

  /**
   * The unique ID of the list.
   */
  id?: string;
}
export namespace RealtimeMcpListTools {
  /**
   * A tool available on an MCP server.
   */
  export interface Tool {
    /**
     * The name of the tool.
     */
    name: string;

    /**
     * The JSON schema describing the tool's input.
     */
    input_schema: unknown;

    /**
     * The description of the tool.
     */
    description?: string | null;

    /**
     * Additional annotations about the tool.
     */
    annotations?: unknown | null;
  }
}
/**
 * A protocol-level error from an MCP tool call; appears in the `error` field of
 * `RealtimeMcpToolCall`.
 */
export interface RealtimeMcpProtocolError {
  /**
   * Numeric error code.
   */
  code: number;
  /**
   * A human-readable error message.
   */
  message: string;
  /**
   * Discriminator for the error union. Always `protocol_error`.
   */
  type: 'protocol_error';
}
/**
 * A Realtime item representing an invocation of a tool on an MCP server.
 */
export interface RealtimeMcpToolCall {
  /**
   * The unique ID of the tool call.
   */
  id: string;
  /**
   * A JSON string of the arguments passed to the tool.
   */
  arguments: string;
  /**
   * The name of the tool that was run.
   */
  name: string;
  /**
   * The label of the MCP server running the tool.
   */
  server_label: string;
  /**
   * The type of the item. Always `mcp_tool_call`.
   */
  type: 'mcp_tool_call';
  /**
   * The ID of an associated approval request, if any.
   */
  approval_request_id?: string | null;
  /**
   * The error from the tool call, if any: a protocol error, a tool execution
   * error, or an HTTP error.
   */
  error?: RealtimeMcpProtocolError | RealtimeMcpToolExecutionError | RealtimeMcphttpError | null;
  /**
   * The output from the tool call.
   */
  output?: string | null;
}