UNPKG

openai

Version:

The official TypeScript library for the OpenAI API

1,741 lines (1,519 loc) 145 kB
// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. import { APIResource } from '../../core/resource'; import * as RealtimeAPI from './realtime'; import * as Shared from '../shared'; import * as CallsAPI from './calls'; import { CallAcceptParams, CallReferParams, CallRejectParams, Calls } from './calls'; import * as ClientSecretsAPI from './client-secrets'; import { ClientSecretCreateParams, ClientSecretCreateResponse, ClientSecrets, RealtimeSessionClientSecret, RealtimeSessionCreateResponse, RealtimeTranscriptionSessionCreateResponse, RealtimeTranscriptionSessionTurnDetection, } from './client-secrets'; import * as ResponsesAPI from '../responses/responses'; export class Realtime extends APIResource { clientSecrets: ClientSecretsAPI.ClientSecrets = new ClientSecretsAPI.ClientSecrets(this._client); calls: CallsAPI.Calls = new CallsAPI.Calls(this._client); } export interface AudioTranscription { /** * The language of the input audio. Supplying the input language in * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) * format will improve accuracy and latency. */ language?: string; /** * The model to use for transcription. Current options are `whisper-1`, * `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, * `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. Use * `gpt-4o-transcribe-diarize` when you need diarization with speaker labels. */ model?: | (string & {}) | 'whisper-1' | 'gpt-4o-mini-transcribe' | 'gpt-4o-mini-transcribe-2025-12-15' | 'gpt-4o-transcribe' | 'gpt-4o-transcribe-diarize'; /** * An optional text to guide the model's style or continue a previous audio * segment. For `whisper-1`, the * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). * For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the * prompt is a free text string, for example "expect words related to technology". 
*/ prompt?: string; } /** * Returned when a conversation is created. Emitted right after session creation. */ export interface ConversationCreatedEvent { /** * The conversation resource. */ conversation: ConversationCreatedEvent.Conversation; /** * The unique ID of the server event. */ event_id: string; /** * The event type, must be `conversation.created`. */ type: 'conversation.created'; } export namespace ConversationCreatedEvent { /** * The conversation resource. */ export interface Conversation { /** * The unique ID of the conversation. */ id?: string; /** * The object type, must be `realtime.conversation`. */ object?: 'realtime.conversation'; } } /** * A single item within a Realtime conversation. */ export type ConversationItem = | RealtimeConversationItemSystemMessage | RealtimeConversationItemUserMessage | RealtimeConversationItemAssistantMessage | RealtimeConversationItemFunctionCall | RealtimeConversationItemFunctionCallOutput | RealtimeMcpApprovalResponse | RealtimeMcpListTools | RealtimeMcpToolCall | RealtimeMcpApprovalRequest; /** * Sent by the server when an Item is added to the default Conversation. This can * happen in several cases: * * - When the client sends a `conversation.item.create` event. * - When the input audio buffer is committed. In this case the item will be a user * message containing the audio from the buffer. * - When the model is generating a Response. In this case the * `conversation.item.added` event will be sent when the model starts generating * a specific Item, and thus it will not yet have any content (and `status` will * be `in_progress`). * * The event will include the full content of the Item (except when model is * generating a Response) except for audio data, which can be retrieved separately * with a `conversation.item.retrieve` event if necessary. */ export interface ConversationItemAdded { /** * The unique ID of the server event. */ event_id: string; /** * A single item within a Realtime conversation. 
*/ item: ConversationItem; /** * The event type, must be `conversation.item.added`. */ type: 'conversation.item.added'; /** * The ID of the item that precedes this one, if any. This is used to maintain * ordering when items are inserted. */ previous_item_id?: string | null; } /** * Add a new Item to the Conversation's context, including messages, function * calls, and function call responses. This event can be used both to populate a * "history" of the conversation and to add new items mid-stream, but has the * current limitation that it cannot populate assistant audio messages. * * If successful, the server will respond with a `conversation.item.created` event, * otherwise an `error` event will be sent. */ export interface ConversationItemCreateEvent { /** * A single item within a Realtime conversation. */ item: ConversationItem; /** * The event type, must be `conversation.item.create`. */ type: 'conversation.item.create'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; /** * The ID of the preceding item after which the new item will be inserted. If not * set, the new item will be appended to the end of the conversation. * * If set to `root`, the new item will be added to the beginning of the * conversation. * * If set to an existing ID, it allows an item to be inserted mid-conversation. If * the ID cannot be found, an error will be returned and the item will not be * added. */ previous_item_id?: string; } /** * Returned when a conversation item is created. There are several scenarios that * produce this event: * * - The server is generating a Response, which if successful will produce either * one or two Items, which will be of type `message` (role `assistant`) or type * `function_call`. * - The input audio buffer has been committed, either by the client or the server * (in `server_vad` mode). The server will take the content of the input audio * buffer and add it to a new user message Item. 
* - The client has sent a `conversation.item.create` event to add a new Item to * the Conversation. */ export interface ConversationItemCreatedEvent { /** * The unique ID of the server event. */ event_id: string; /** * A single item within a Realtime conversation. */ item: ConversationItem; /** * The event type, must be `conversation.item.created`. */ type: 'conversation.item.created'; /** * The ID of the preceding item in the Conversation context, allows the client to * understand the order of the conversation. Can be `null` if the item has no * predecessor. */ previous_item_id?: string | null; } /** * Send this event when you want to remove any item from the conversation history. * The server will respond with a `conversation.item.deleted` event, unless the * item does not exist in the conversation history, in which case the server will * respond with an error. */ export interface ConversationItemDeleteEvent { /** * The ID of the item to delete. */ item_id: string; /** * The event type, must be `conversation.item.delete`. */ type: 'conversation.item.delete'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when an item in the conversation is deleted by the client with a * `conversation.item.delete` event. This event is used to synchronize the server's * understanding of the conversation history with the client's view. */ export interface ConversationItemDeletedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item that was deleted. */ item_id: string; /** * The event type, must be `conversation.item.deleted`. */ type: 'conversation.item.deleted'; } /** * Returned when a conversation item is finalized. * * The event will include the full content of the Item except for audio data, which * can be retrieved separately with a `conversation.item.retrieve` event if needed. */ export interface ConversationItemDone { /** * The unique ID of the server event. 
*/ event_id: string; /** * A single item within a Realtime conversation. */ item: ConversationItem; /** * The event type, must be `conversation.item.done`. */ type: 'conversation.item.done'; /** * The ID of the item that precedes this one, if any. This is used to maintain * ordering when items are inserted. */ previous_item_id?: string | null; } /** * This event is the output of audio transcription for user audio written to the * user audio buffer. Transcription begins when the input audio buffer is committed * by the client or server (when VAD is enabled). Transcription runs asynchronously * with Response creation, so this event may come before or after the Response * events. * * Realtime API models accept audio natively, and thus input transcription is a * separate process run on a separate ASR (Automatic Speech Recognition) model. The * transcript may diverge somewhat from the model's interpretation, and should be * treated as a rough guide. */ export interface ConversationItemInputAudioTranscriptionCompletedEvent { /** * The index of the content part containing the audio. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item containing the audio that is being transcribed. */ item_id: string; /** * The transcribed text. */ transcript: string; /** * The event type, must be `conversation.item.input_audio_transcription.completed`. */ type: 'conversation.item.input_audio_transcription.completed'; /** * Usage statistics for the transcription, this is billed according to the ASR * model's pricing rather than the realtime model's pricing. */ usage: | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageTokens | ConversationItemInputAudioTranscriptionCompletedEvent.TranscriptTextUsageDuration; /** * The log probabilities of the transcription. 
*/ logprobs?: Array<LogProbProperties> | null; } export namespace ConversationItemInputAudioTranscriptionCompletedEvent { /** * Usage statistics for models billed by token usage. */ export interface TranscriptTextUsageTokens { /** * Number of input tokens billed for this request. */ input_tokens: number; /** * Number of output tokens generated. */ output_tokens: number; /** * Total number of tokens used (input + output). */ total_tokens: number; /** * The type of the usage object. Always `tokens` for this variant. */ type: 'tokens'; /** * Details about the input tokens billed for this request. */ input_token_details?: TranscriptTextUsageTokens.InputTokenDetails; } export namespace TranscriptTextUsageTokens { /** * Details about the input tokens billed for this request. */ export interface InputTokenDetails { /** * Number of audio tokens billed for this request. */ audio_tokens?: number; /** * Number of text tokens billed for this request. */ text_tokens?: number; } } /** * Usage statistics for models billed by audio input duration. */ export interface TranscriptTextUsageDuration { /** * Duration of the input audio in seconds. */ seconds: number; /** * The type of the usage object. Always `duration` for this variant. */ type: 'duration'; } } /** * Returned when the text value of an input audio transcription content part is * updated with incremental transcription results. */ export interface ConversationItemInputAudioTranscriptionDeltaEvent { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item containing the audio that is being transcribed. */ item_id: string; /** * The event type, must be `conversation.item.input_audio_transcription.delta`. */ type: 'conversation.item.input_audio_transcription.delta'; /** * The index of the content part in the item's content array. */ content_index?: number; /** * The text delta. */ delta?: string; /** * The log probabilities of the transcription. 
These can be enabled by
 * configuring the session with
 * `"include": ["item.input_audio_transcription.logprobs"]`. Each entry in the
 * array corresponds to a log probability of which token would be selected for this
 * chunk of transcription. This can help to identify if it was possible there were
 * multiple valid options for a given chunk of transcription.
 */
logprobs?: Array<LogProbProperties> | null;
}

/**
 * Returned when input audio transcription is configured, and a transcription
 * request for a user message failed. These events are separate from other `error`
 * events so that the client can identify the related Item.
 */
export interface ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * The index of the content part containing the audio.
   */
  content_index: number;

  /**
   * Details of the transcription error.
   */
  error: ConversationItemInputAudioTranscriptionFailedEvent.Error;

  /**
   * The unique ID of the server event.
   */
  event_id: string;

  /**
   * The ID of the user message item.
   */
  item_id: string;

  /**
   * The event type, must be `conversation.item.input_audio_transcription.failed`.
   */
  type: 'conversation.item.input_audio_transcription.failed';
}

export namespace ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * Details of the transcription error.
   */
  export interface Error {
    /**
     * Error code, if any.
     */
    code?: string;

    /**
     * A human-readable error message.
     */
    message?: string;

    /**
     * Parameter related to the error, if any.
     */
    param?: string;

    /**
     * The type of error.
     */
    type?: string;
  }
}

/**
 * Returned when an input audio transcription segment is identified for an item.
 */
export interface ConversationItemInputAudioTranscriptionSegment {
  /**
   * The segment identifier.
   */
  id: string;

  /**
   * The index of the input audio content part within the item.
   */
  content_index: number;

  /**
   * End time of the segment in seconds.
   */
  end: number;

  /**
   * The unique ID of the server event.
   */
  event_id: string;

  /**
   * The ID of the item containing the input audio content.
*/ item_id: string; /** * The detected speaker label for this segment. */ speaker: string; /** * Start time of the segment in seconds. */ start: number; /** * The text for this segment. */ text: string; /** * The event type, must be `conversation.item.input_audio_transcription.segment`. */ type: 'conversation.item.input_audio_transcription.segment'; } /** * Send this event when you want to retrieve the server's representation of a * specific item in the conversation history. This is useful, for example, to * inspect user audio after noise cancellation and VAD. The server will respond * with a `conversation.item.retrieved` event, unless the item does not exist in * the conversation history, in which case the server will respond with an error. */ export interface ConversationItemRetrieveEvent { /** * The ID of the item to retrieve. */ item_id: string; /** * The event type, must be `conversation.item.retrieve`. */ type: 'conversation.item.retrieve'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Send this event to truncate a previous assistant message’s audio. The server * will produce audio faster than realtime, so this event is useful when the user * interrupts to truncate audio that has already been sent to the client but not * yet played. This will synchronize the server's understanding of the audio with * the client's playback. * * Truncating audio will delete the server-side text transcript to ensure there is * not text in the context that hasn't been heard by the user. * * If successful, the server will respond with a `conversation.item.truncated` * event. */ export interface ConversationItemTruncateEvent { /** * Inclusive duration up to which audio is truncated, in milliseconds. If the * audio_end_ms is greater than the actual audio duration, the server will respond * with an error. */ audio_end_ms: number; /** * The index of the content part to truncate. Set this to `0`. 
*/ content_index: number; /** * The ID of the assistant message item to truncate. Only assistant message items * can be truncated. */ item_id: string; /** * The event type, must be `conversation.item.truncate`. */ type: 'conversation.item.truncate'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when an earlier assistant audio message item is truncated by the client * with a `conversation.item.truncate` event. This event is used to synchronize the * server's understanding of the audio with the client's playback. * * This action will truncate the audio and remove the server-side text transcript * to ensure there is no text in the context that hasn't been heard by the user. */ export interface ConversationItemTruncatedEvent { /** * The duration up to which the audio was truncated, in milliseconds. */ audio_end_ms: number; /** * The index of the content part that was truncated. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the assistant message item that was truncated. */ item_id: string; /** * The event type, must be `conversation.item.truncated`. */ type: 'conversation.item.truncated'; } /** * The item to add to the conversation. */ export interface ConversationItemWithReference { /** * For an item of type (`message` | `function_call` | `function_call_output`) this * field allows the client to assign the unique ID of the item. It is not required * because the server will generate one if not provided. * * For an item of type `item_reference`, this field is required and is a reference * to any item that has previously existed in the conversation. */ id?: string; /** * The arguments of the function call (for `function_call` items). */ arguments?: string; /** * The ID of the function call (for `function_call` and `function_call_output` * items). 
If passed on a `function_call_output` item, the server will check that a * `function_call` item with the same ID exists in the conversation history. */ call_id?: string; /** * The content of the message, applicable for `message` items. * * - Message items of role `system` support only `input_text` content * - Message items of role `user` support `input_text` and `input_audio` content * - Message items of role `assistant` support `text` content. */ content?: Array<ConversationItemWithReference.Content>; /** * The name of the function being called (for `function_call` items). */ name?: string; /** * Identifier for the API object being returned - always `realtime.item`. */ object?: 'realtime.item'; /** * The output of the function call (for `function_call_output` items). */ output?: string; /** * The role of the message sender (`user`, `assistant`, `system`), only applicable * for `message` items. */ role?: 'user' | 'assistant' | 'system'; /** * The status of the item (`completed`, `incomplete`, `in_progress`). These have no * effect on the conversation, but are accepted for consistency with the * `conversation.item.created` event. */ status?: 'completed' | 'incomplete' | 'in_progress'; /** * The type of the item (`message`, `function_call`, `function_call_output`, * `item_reference`). */ type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference'; } export namespace ConversationItemWithReference { export interface Content { /** * ID of a previous conversation item to reference (for `item_reference` content * types in `response.create` events). These can reference both client and server * created items. */ id?: string; /** * Base64-encoded audio bytes, used for `input_audio` content type. */ audio?: string; /** * The text content, used for `input_text` and `text` content types. */ text?: string; /** * The transcript of the audio, used for `input_audio` content type. 
*/ transcript?: string; /** * The content type (`input_text`, `input_audio`, `item_reference`, `text`). */ type?: 'input_text' | 'input_audio' | 'item_reference' | 'text'; } } /** * Send this event to append audio bytes to the input audio buffer. The audio * buffer is temporary storage you can write to and later commit. A "commit" will * create a new user message item in the conversation history from the buffer * content and clear the buffer. Input audio transcription (if enabled) will be * generated when the buffer is committed. * * If VAD is enabled the audio buffer is used to detect speech and the server will * decide when to commit. When Server VAD is disabled, you must commit the audio * buffer manually. Input audio noise reduction operates on writes to the audio * buffer. * * The client may choose how much audio to place in each event up to a maximum of * 15 MiB, for example streaming smaller chunks from the client may allow the VAD * to be more responsive. Unlike most other client events, the server will not send * a confirmation response to this event. */ export interface InputAudioBufferAppendEvent { /** * Base64-encoded audio bytes. This must be in the format specified by the * `input_audio_format` field in the session configuration. */ audio: string; /** * The event type, must be `input_audio_buffer.append`. */ type: 'input_audio_buffer.append'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Send this event to clear the audio bytes in the buffer. The server will respond * with an `input_audio_buffer.cleared` event. */ export interface InputAudioBufferClearEvent { /** * The event type, must be `input_audio_buffer.clear`. */ type: 'input_audio_buffer.clear'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when the input audio buffer is cleared by the client with a * `input_audio_buffer.clear` event. 
*/
export interface InputAudioBufferClearedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;

  /**
   * The event type, must be `input_audio_buffer.cleared`.
   */
  type: 'input_audio_buffer.cleared';
}

/**
 * Send this event to commit the user input audio buffer, which will create a new
 * user message item in the conversation. This event will produce an error if the
 * input audio buffer is empty. When in Server VAD mode, the client does not need
 * to send this event, the server will commit the audio buffer automatically.
 *
 * Committing the input audio buffer will trigger input audio transcription (if
 * enabled in session configuration), but it will not create a response from the
 * model. The server will respond with an `input_audio_buffer.committed` event.
 */
export interface InputAudioBufferCommitEvent {
  /**
   * The event type, must be `input_audio_buffer.commit`.
   */
  type: 'input_audio_buffer.commit';

  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;
}

/**
 * Returned when an input audio buffer is committed, either by the client or
 * automatically in server VAD mode. The `item_id` property is the ID of the user
 * message item that will be created, thus a `conversation.item.created` event will
 * also be sent to the client.
 */
export interface InputAudioBufferCommittedEvent {
  /**
   * The unique ID of the server event.
   */
  event_id: string;

  /**
   * The ID of the user message item that will be created.
   */
  item_id: string;

  /**
   * The event type, must be `input_audio_buffer.committed`.
   */
  type: 'input_audio_buffer.committed';

  /**
   * The ID of the preceding item after which the new item will be inserted. Can be
   * `null` if the item has no predecessor.
   */
  previous_item_id?: string | null;
}

/**
 * **SIP Only:** Returned when a DTMF event is received. A DTMF event is a message
 * that represents a telephone keypad press (0–9, \*, #, A–D). The `event` property
 * is the keypad that the user pressed.
The `received_at` is the UTC Unix Timestamp * that the server received the event. */ export interface InputAudioBufferDtmfEventReceivedEvent { /** * The telephone keypad that was pressed by the user. */ event: string; /** * UTC Unix Timestamp when DTMF Event was received by server. */ received_at: number; /** * The event type, must be `input_audio_buffer.dtmf_event_received`. */ type: 'input_audio_buffer.dtmf_event_received'; } /** * Sent by the server when in `server_vad` mode to indicate that speech has been * detected in the audio buffer. This can happen any time audio is added to the * buffer (unless speech is already detected). The client may want to use this * event to interrupt audio playback or provide visual feedback to the user. * * The client should expect to receive a `input_audio_buffer.speech_stopped` event * when speech stops. The `item_id` property is the ID of the user message item * that will be created when speech stops and will also be included in the * `input_audio_buffer.speech_stopped` event (unless the client manually commits * the audio buffer during VAD activation). */ export interface InputAudioBufferSpeechStartedEvent { /** * Milliseconds from the start of all audio written to the buffer during the * session when speech was first detected. This will correspond to the beginning of * audio sent to the model, and thus includes the `prefix_padding_ms` configured in * the Session. */ audio_start_ms: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item that will be created when speech stops. */ item_id: string; /** * The event type, must be `input_audio_buffer.speech_started`. */ type: 'input_audio_buffer.speech_started'; } /** * Returned in `server_vad` mode when the server detects the end of speech in the * audio buffer. The server will also send an `conversation.item.created` event * with the user message item that is created from the audio buffer. 
*/ export interface InputAudioBufferSpeechStoppedEvent { /** * Milliseconds since the session started when speech stopped. This will correspond * to the end of audio sent to the model, and thus includes the * `min_silence_duration_ms` configured in the Session. */ audio_end_ms: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item that will be created. */ item_id: string; /** * The event type, must be `input_audio_buffer.speech_stopped`. */ type: 'input_audio_buffer.speech_stopped'; } /** * Returned when the Server VAD timeout is triggered for the input audio buffer. * This is configured with `idle_timeout_ms` in the `turn_detection` settings of * the session, and it indicates that there hasn't been any speech detected for the * configured duration. * * The `audio_start_ms` and `audio_end_ms` fields indicate the segment of audio * after the last model response up to the triggering time, as an offset from the * beginning of audio written to the input audio buffer. This means it demarcates * the segment of audio that was silent and the difference between the start and * end values will roughly match the configured timeout. * * The empty audio will be committed to the conversation as an `input_audio` item * (there will be a `input_audio_buffer.committed` event) and a model response will * be generated. There may be speech that didn't trigger VAD but is still detected * by the model, so the model may respond with something relevant to the * conversation or a prompt to continue speaking. */ export interface InputAudioBufferTimeoutTriggered { /** * Millisecond offset of audio written to the input audio buffer at the time the * timeout was triggered. */ audio_end_ms: number; /** * Millisecond offset of audio written to the input audio buffer that was after the * playback time of the last model response. */ audio_start_ms: number; /** * The unique ID of the server event. 
*/ event_id: string; /** * The ID of the item associated with this segment. */ item_id: string; /** * The event type, must be `input_audio_buffer.timeout_triggered`. */ type: 'input_audio_buffer.timeout_triggered'; } /** * A log probability object. */ export interface LogProbProperties { /** * The token that was used to generate the log probability. */ token: string; /** * The bytes that were used to generate the log probability. */ bytes: Array<number>; /** * The log probability of the token. */ logprob: number; } /** * Returned when listing MCP tools has completed for an item. */ export interface McpListToolsCompleted { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the MCP list tools item. */ item_id: string; /** * The event type, must be `mcp_list_tools.completed`. */ type: 'mcp_list_tools.completed'; } /** * Returned when listing MCP tools has failed for an item. */ export interface McpListToolsFailed { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the MCP list tools item. */ item_id: string; /** * The event type, must be `mcp_list_tools.failed`. */ type: 'mcp_list_tools.failed'; } /** * Returned when listing MCP tools is in progress for an item. */ export interface McpListToolsInProgress { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the MCP list tools item. */ item_id: string; /** * The event type, must be `mcp_list_tools.in_progress`. */ type: 'mcp_list_tools.in_progress'; } /** * Type of noise reduction. `near_field` is for close-talking microphones such as * headphones, `far_field` is for far-field microphones such as laptop or * conference room microphones. */ export type NoiseReductionType = 'near_field' | 'far_field'; /** * **WebRTC/SIP Only:** Emit to cut off the current audio response. This will * trigger the server to stop generating audio and emit a * `output_audio_buffer.cleared` event. 
This event should be preceded by a * `response.cancel` client event to stop the generation of the current response. * [Learn more](https://platform.openai.com/docs/guides/realtime-conversations#client-and-server-events-for-audio-in-webrtc). */ export interface OutputAudioBufferClearEvent { /** * The event type, must be `output_audio_buffer.clear`. */ type: 'output_audio_buffer.clear'; /** * The unique ID of the client event used for error handling. */ event_id?: string; } /** * Emitted at the beginning of a Response to indicate the updated rate limits. When * a Response is created some tokens will be "reserved" for the output tokens, the * rate limits shown here reflect that reservation, which is then adjusted * accordingly once the Response is completed. */ export interface RateLimitsUpdatedEvent { /** * The unique ID of the server event. */ event_id: string; /** * List of rate limit information. */ rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>; /** * The event type, must be `rate_limits.updated`. */ type: 'rate_limits.updated'; } export namespace RateLimitsUpdatedEvent { export interface RateLimit { /** * The maximum allowed value for the rate limit. */ limit?: number; /** * The name of the rate limit (`requests`, `tokens`). */ name?: 'requests' | 'tokens'; /** * The remaining value before the limit is reached. */ remaining?: number; /** * Seconds until the rate limit resets. */ reset_seconds?: number; } } /** * Configuration for input and output audio. */ export interface RealtimeAudioConfig { input?: RealtimeAudioConfigInput; output?: RealtimeAudioConfigOutput; } export interface RealtimeAudioConfigInput { /** * The format of the input audio. */ format?: RealtimeAudioFormats; /** * Configuration for input audio noise reduction. This can be set to `null` to turn * off. Noise reduction filters audio added to the input audio buffer before it is * sent to VAD and the model. 
Filtering the audio can improve VAD and turn
 * detection accuracy (reducing false positives) and model performance by improving
 * perception of the input audio.
 */
noise_reduction?: RealtimeAudioConfigInput.NoiseReduction;

/**
 * Configuration for input audio transcription, defaults to off and can be set to
 * `null` to turn off once on. Input audio transcription is not native to the
 * model, since the model consumes audio directly. Transcription runs
 * asynchronously through
 * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
 * and should be treated as guidance of input audio content rather than precisely
 * what the model heard. The client can optionally set the language and prompt for
 * transcription, these offer additional guidance to the transcription service.
 */
transcription?: AudioTranscription;

/**
 * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
 * set to `null` to turn off, in which case the client must manually trigger model
 * response.
 *
 * Server VAD means that the model will detect the start and end of speech based on
 * audio volume and respond at the end of user speech.
 *
 * Semantic VAD is more advanced and uses a turn detection model (in conjunction
 * with VAD) to semantically estimate whether the user has finished speaking, then
 * dynamically sets a timeout based on this probability. For example, if user audio
 * trails off with "uhhm", the model will score a low probability of turn end and
 * wait longer for the user to continue speaking. This can be useful for more
 * natural conversations, but may have a higher latency.
 */
turn_detection?: RealtimeAudioInputTurnDetection | null;
}

export namespace RealtimeAudioConfigInput {
  /**
   * Configuration for input audio noise reduction. This can be set to `null` to turn
   * off. Noise reduction filters audio added to the input audio buffer before it is
   * sent to VAD and the model.
Filtering the audio can improve VAD and turn * detection accuracy (reducing false positives) and model performance by improving * perception of the input audio. */ export interface NoiseReduction { /** * Type of noise reduction. `near_field` is for close-talking microphones such as * headphones, `far_field` is for far-field microphones such as laptop or * conference room microphones. */ type?: RealtimeAPI.NoiseReductionType; } } export interface RealtimeAudioConfigOutput { /** * The format of the output audio. */ format?: RealtimeAudioFormats; /** * The speed of the model's spoken response as a multiple of the original speed. * 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. * This value can only be changed in between model turns, not while a response is * in progress. * * This parameter is a post-processing adjustment to the audio after it is * generated, it's also possible to prompt the model to speak faster or slower. */ speed?: number; /** * The voice the model uses to respond. Supported built-in voices are `alloy`, * `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and * `cedar`. You may also provide a custom voice object with an `id`, for example * `{ "id": "voice_1234" }`. Voice cannot be changed during the session once the * model has responded with audio at least once. We recommend `marin` and `cedar` * for best quality. */ voice?: | string | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse' | 'marin' | 'cedar' | RealtimeAudioConfigOutput.ID; } export namespace RealtimeAudioConfigOutput { /** * Custom voice reference. */ export interface ID { /** * The custom voice ID, e.g. `voice_1234`. */ id: string; } } /** * The PCM audio format. Only a 24kHz sample rate is supported. */ export type RealtimeAudioFormats = | RealtimeAudioFormats.AudioPCM | RealtimeAudioFormats.AudioPCMU | RealtimeAudioFormats.AudioPCMA; export namespace RealtimeAudioFormats { /** * The PCM audio format. 
Only a 24kHz sample rate is supported. */ export interface AudioPCM { /** * The sample rate of the audio. Always `24000`. */ rate?: 24000; /** * The audio format. Always `audio/pcm`. */ type?: 'audio/pcm'; } /** * The G.711 μ-law format. */ export interface AudioPCMU { /** * The audio format. Always `audio/pcmu`. */ type?: 'audio/pcmu'; } /** * The G.711 A-law format. */ export interface AudioPCMA { /** * The audio format. Always `audio/pcma`. */ type?: 'audio/pcma'; } } /** * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be * set to `null` to turn off, in which case the client must manually trigger model * response. * * Server VAD means that the model will detect the start and end of speech based on * audio volume and respond at the end of user speech. * * Semantic VAD is more advanced and uses a turn detection model (in conjunction * with VAD) to semantically estimate whether the user has finished speaking, then * dynamically sets a timeout based on this probability. For example, if user audio * trails off with "uhhm", the model will score a low probability of turn end and * wait longer for the user to continue speaking. This can be useful for more * natural conversations, but may have a higher latency. */ export type RealtimeAudioInputTurnDetection = | RealtimeAudioInputTurnDetection.ServerVad | RealtimeAudioInputTurnDetection.SemanticVad; export namespace RealtimeAudioInputTurnDetection { /** * Server-side voice activity detection (VAD) which flips on when user speech is * detected and off after a period of silence. */ export interface ServerVad { /** * Type of turn detection, `server_vad` to turn on simple Server VAD. */ type: 'server_vad'; /** * Whether or not to automatically generate a response when a VAD stop event * occurs. If `interrupt_response` is set to `false` this may fail to create a * response if the model is already responding. 
* * If both `create_response` and `interrupt_response` are set to `false`, the model * will never respond automatically but VAD events will still be emitted. */ create_response?: boolean; /** * Optional timeout after which a model response will be triggered automatically. * This is useful for situations in which a long pause from the user is unexpected, * such as a phone call. The model will effectively prompt the user to continue the * conversation based on the current context. * * The timeout value will be applied after the last model response's audio has * finished playing, i.e. it's set to the `response.done` time plus audio playback * duration. * * An `input_audio_buffer.timeout_triggered` event (plus events associated with the * Response) will be emitted when the timeout is reached. Idle timeout is currently * only supported for `server_vad` mode. */ idle_timeout_ms?: number | null; /** * Whether or not to automatically interrupt (cancel) any ongoing response with * output to the default conversation (i.e. `conversation` of `auto`) when a VAD * start event occurs. If `true` then the response will be cancelled, otherwise it * will continue until complete. * * If both `create_response` and `interrupt_response` are set to `false`, the model * will never respond automatically but VAD events will still be emitted. */ interrupt_response?: boolean; /** * Used only for `server_vad` mode. Amount of audio to include before the VAD * detected speech (in milliseconds). Defaults to 300ms. */ prefix_padding_ms?: number; /** * Used only for `server_vad` mode. Duration of silence to detect speech stop (in * milliseconds). Defaults to 500ms. With shorter values the model will respond * more quickly, but may jump in on short pauses from the user. */ silence_duration_ms?: number; /** * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this * defaults to 0.5. 
A higher threshold will require louder audio to activate the * model, and thus might perform better in noisy environments. */ threshold?: number; } /** * Server-side semantic turn detection which uses a model to determine when the * user has finished speaking. */ export interface SemanticVad { /** * Type of turn detection, `semantic_vad` to turn on Semantic VAD. */ type: 'semantic_vad'; /** * Whether or not to automatically generate a response when a VAD stop event * occurs. */ create_response?: boolean; /** * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` * will wait longer for the user to continue speaking, `high` will respond more * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, * and `high` have max timeouts of 8s, 4s, and 2s respectively. */ eagerness?: 'low' | 'medium' | 'high' | 'auto'; /** * Whether or not to automatically interrupt any ongoing response with output to * the default conversation (i.e. `conversation` of `auto`) when a VAD start event * occurs. */ interrupt_response?: boolean; } } /** * A realtime client event. */ export type RealtimeClientEvent = | ConversationItemCreateEvent | ConversationItemDeleteEvent | ConversationItemRetrieveEvent | ConversationItemTruncateEvent | InputAudioBufferAppendEvent | InputAudioBufferClearEvent | OutputAudioBufferClearEvent | InputAudioBufferCommitEvent | ResponseCancelEvent | ResponseCreateEvent | SessionUpdateEvent; /** * An assistant message item in a Realtime conversation. */ export interface RealtimeConversationItemAssistantMessage { /** * The content of the message. */ content: Array<RealtimeConversationItemAssistantMessage.Content>; /** * The role of the message sender. Always `assistant`. */ role: 'assistant'; /** * The type of the item. Always `message`. */ type: 'message'; /** * The unique ID of the item. This may be provided by the client or generated by * the server. 
*/ id?: string; /** * Identifier for the API object being returned - always `realtime.item`. Optional * when creating a new item. */ object?: 'realtime.item'; /** * The status of the item. Has no effect on the conversation. */ status?: 'completed' | 'incomplete' | 'in_progress'; } export namespace RealtimeConversationItemAssistantMessage { export interface Content { /** * Base64-encoded audio bytes, these will be parsed as the format specified in the * session output audio type configuration. This defaults to PCM 16-bit 24kHz mono * if not specified. */ audio?: string; /** * The text content. */ text?: string; /** * The transcript of the audio content, this will always be present if the output * type is `audio`. */ transcript?: string; /** * The content type, `output_text` or `output_audio` depending on the session * `output_modalities` configuration. */ type?: 'output_text' | 'output_audio'; } } /** * A function call item in a Realtime conversation. */ export interface RealtimeConversationItemFunctionCall { /** * The arguments of the function call. This is a JSON-encoded string representing * the arguments passed to the function, for example * `{"arg1": "value1", "arg2": 42}`. */ arguments: string; /** * The name of the function being called. */ name: string; /** * The type of the item. Always `function_call`. */ type: 'function_call'; /** * The unique ID of the item. This may be provided by the client or generated by * the server. */ id?: string; /** * The ID of the function call. */ call_id?: string; /** * Identifier for the API object being returned - always `realtime.item`. Optional * when creating a new item. */ object?: 'realtime.item'; /** * The status of the item. Has no effect on the conversation. */ status?: 'completed' | 'incomplete' | 'in_progress'; } /** * A function call output item in a Realtime conversation. */ export interface RealtimeConversationItemFunctionCallOutput { /** * The ID of the function call this output is for. 
*/ call_id: string; /** * The output of the function call, this is free text and can contain any * information or simply be empty. */ output: string; /** * The type of the item. Always `function_call_output`. */ type: 'function_call_output'; /** * The unique ID of the item. This may be provided by the client or generated by * the server. */ id?: string; /** * Identifier for the API object being returned - always `realtime.item`. Optional * when creating a new item. */ object?: 'realtime.item'; /** * The status of the item. Has no effect on the conversation. */ status?: 'completed' | 'incomplete' | 'in_progress'; } /** * A system message in a Realtime conversation can be used to provide additional * context or instructions to the model. This is similar but distinct from the * instruction prompt provided at the start of a conversation, as system messages * can be added at any point in the conversation. For major changes to the * conversation's behavior, use instructions, but for smaller updates (e.g. "the * user is now asking about a different topic"), use system messages. */ export interface RealtimeConversationItemSystemMessage { /** * The content of the message. */ content: Array<RealtimeConversationItemSystemMessage.Content>; /** * The role of the message sender. Always `system`. */ role: 'system'; /** * The type of the item. Always `message`. */ type: 'message'; /** * The unique ID of the item. This may be provided by the client or generated by * the server. */ id?: string; /** * Identifier for the API object being returned - always `realtime.item`. Optional * when creating a new item. */ object?: 'realtime.item'; /** * The status of the item. Has no effect on the conversation. */ status?: 'completed' | 'incomplete' | 'in_progress'; } export namespace RealtimeConversationItemSystemMessage { export interface Content { /** * The text content. */ text?: string; /** * The content type. Always `input_text` for system messages. 
*/ type?: 'input_text'; } } /** * A user message item in a Realtime conversation. */ export interface RealtimeConversationItemUserMessage { /** * The content of the message. */ content: Array<RealtimeConversationItemUserMessage.Content>; /** * The role of the message sender. Always `user`. */ role: 'user'; /** * The type of the item. Always `message`. */ type: 'message'; /** * The unique ID of the item. This may be provided by the client or generated by * the server. */ id?: string; /** * Identifier for the API object being returned - always `realtime.item`. Optional * when creating a new item. */ object?: 'realtime.item'; /** * The status of the item. Has no effect on the conversation. */ status?: 'completed' | 'incomplete' | 'in_progress'; } export namespace RealtimeConversationItemUserMessage { export interface Content { /** * Base64-encoded audio bytes (for `input_audio`), these will be parsed as the * format specified in the session input audio type configuration. This defaults to * PCM 16-bit 24kHz mono if not specified. */ audio?: string; /** * The detail level of the image (for `input_image`). `auto` will default to * `high`. */ detail?: 'auto' | 'low' | 'high'; /** * Base64-encoded image bytes (for `input_image`) as a data URI. For example * `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...`. Supported formats are PNG * and JPEG. */ image_url?: string; /*