openai (UNPKG)

The official TypeScript library for the OpenAI API

// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

import { APIResource } from '../../../resource';
import * as RealtimeAPI from './realtime';
import * as Shared from '../../shared';
import * as SessionsAPI from './sessions';
import {
  Session as SessionsAPISession,
  SessionCreateParams,
  SessionCreateResponse,
  Sessions,
} from './sessions';
import * as TranscriptionSessionsAPI from './transcription-sessions';
import {
  TranscriptionSession,
  TranscriptionSessionCreateParams,
  TranscriptionSessions,
} from './transcription-sessions';

export class Realtime extends APIResource {
  sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client);

  transcriptionSessions: TranscriptionSessionsAPI.TranscriptionSessions =
    new TranscriptionSessionsAPI.TranscriptionSessions(this._client);
}

/**
 * Returned when a conversation is created. Emitted right after session creation.
 */
export interface ConversationCreatedEvent {
  /** The conversation resource. */
  conversation: ConversationCreatedEvent.Conversation;

  /** The unique ID of the server event. */
  event_id: string;

  /** The event type, must be `conversation.created`. */
  type: 'conversation.created';
}

export namespace ConversationCreatedEvent {
  /** The conversation resource. */
  export interface Conversation {
    /** The unique ID of the conversation. */
    id?: string;

    /** The object type, must be `realtime.conversation`. */
    object?: 'realtime.conversation';
  }
}
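// --- Illustrative usage sketch (not part of the generated file) ---
// The `Realtime` class above only wires up the `sessions` and
// `transcriptionSessions` sub-resources; in the published SDK it is reachable
// as `client.beta.realtime` (an assumption based on this file's directory
// layout). A hedged sketch of minting a session, with the client passed in;
// `sessions.create()` and its parameters live in sessions.d.ts.
declare const exampleClient: { beta: { realtime: Realtime } };

async function exampleCreateSession(): Promise<SessionsAPI.SessionCreateResponse> {
  // The `model` value shown here is illustrative, not prescribed by this file.
  return exampleClient.beta.realtime.sessions.create({
    model: 'gpt-4o-realtime-preview',
  } as SessionsAPI.SessionCreateParams);
}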
/**
 * The item to add to the conversation.
 */
export interface ConversationItem {
  /**
   * The unique ID of the item. This can be generated by the client to help
   * manage server-side context, but is not required; the server will generate
   * one if not provided.
   */
  id?: string;

  /** The arguments of the function call (for `function_call` items). */
  arguments?: string;

  /**
   * The ID of the function call (for `function_call` and `function_call_output`
   * items). If passed on a `function_call_output` item, the server will check
   * that a `function_call` item with the same ID exists in the conversation
   * history.
   */
  call_id?: string;

  /**
   * The content of the message, applicable for `message` items.
   *
   * - Message items of role `system` support only `input_text` content
   * - Message items of role `user` support `input_text` and `input_audio` content
   * - Message items of role `assistant` support `text` content.
   */
  content?: Array<ConversationItemContent>;

  /** The name of the function being called (for `function_call` items). */
  name?: string;

  /** Identifier for the API object being returned - always `realtime.item`. */
  object?: 'realtime.item';

  /** The output of the function call (for `function_call_output` items). */
  output?: string;

  /**
   * The role of the message sender (`user`, `assistant`, `system`), only
   * applicable for `message` items.
   */
  role?: 'user' | 'assistant' | 'system';

  /**
   * The status of the item (`completed`, `incomplete`). These have no effect on
   * the conversation, but are accepted for consistency with the
   * `conversation.item.created` event.
   */
  status?: 'completed' | 'incomplete';

  /** The type of the item (`message`, `function_call`, `function_call_output`). */
  type?: 'message' | 'function_call' | 'function_call_output';
}

export interface ConversationItemContent {
  /**
   * ID of a previous conversation item to reference (for `item_reference`
   * content types in `response.create` events). These can reference both client
   * and server created items.
   */
  id?: string;

  /** Base64-encoded audio bytes, used for the `input_audio` content type. */
  audio?: string;

  /** The text content, used for `input_text` and `text` content types. */
  text?: string;

  /** The transcript of the audio, used for the `input_audio` content type. */
  transcript?: string;

  /** The content type (`input_text`, `input_audio`, `item_reference`, `text`). */
  type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
}

/**
 * Add a new Item to the Conversation's context, including messages, function
 * calls, and function call responses. This event can be used both to populate a
 * "history" of the conversation and to add new items mid-stream, but has the
 * current limitation that it cannot populate assistant audio messages.
 *
 * If successful, the server will respond with a `conversation.item.created`
 * event; otherwise an `error` event will be sent.
 */
export interface ConversationItemCreateEvent {
  /** The item to add to the conversation. */
  item: ConversationItem;

  /** The event type, must be `conversation.item.create`. */
  type: 'conversation.item.create';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;

  /**
   * The ID of the preceding item after which the new item will be inserted. If
   * not set, the new item will be appended to the end of the conversation. If
   * set to `root`, the new item will be added to the beginning of the
   * conversation. If set to an existing ID, it allows an item to be inserted
   * mid-conversation. If the ID cannot be found, an error will be returned and
   * the item will not be added.
   */
  previous_item_id?: string;
}
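// --- Illustrative sketch (not part of the generated file) ---
// Populating conversation history with a typed `conversation.item.create`
// event over a raw WebSocket. `ws` is assumed to be an open WebSocket
// connected to the Realtime API; the function name is illustrative.
function exampleSendUserText(ws: WebSocket, text: string): void {
  const event: ConversationItemCreateEvent = {
    type: 'conversation.item.create',
    item: {
      type: 'message',
      role: 'user',
      content: [{ type: 'input_text', text }],
    },
  };
  ws.send(JSON.stringify(event));
}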
/**
 * Returned when a conversation item is created. There are several scenarios
 * that produce this event:
 *
 * - The server is generating a Response, which if successful will produce
 *   either one or two Items, which will be of type `message` (role `assistant`)
 *   or type `function_call`.
 * - The input audio buffer has been committed, either by the client or the
 *   server (in `server_vad` mode). The server will take the content of the
 *   input audio buffer and add it to a new user message Item.
 * - The client has sent a `conversation.item.create` event to add a new Item to
 *   the Conversation.
 */
export interface ConversationItemCreatedEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The item to add to the conversation. */
  item: ConversationItem;

  /**
   * The ID of the preceding item in the Conversation context; this allows the
   * client to understand the order of the conversation.
   */
  previous_item_id: string;

  /** The event type, must be `conversation.item.created`. */
  type: 'conversation.item.created';
}

/**
 * Send this event when you want to remove any item from the conversation
 * history. The server will respond with a `conversation.item.deleted` event,
 * unless the item does not exist in the conversation history, in which case the
 * server will respond with an error.
 */
export interface ConversationItemDeleteEvent {
  /** The ID of the item to delete. */
  item_id: string;

  /** The event type, must be `conversation.item.delete`. */
  type: 'conversation.item.delete';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;
}

/**
 * Returned when an item in the conversation is deleted by the client with a
 * `conversation.item.delete` event. This event is used to synchronize the
 * server's understanding of the conversation history with the client's view.
 */
export interface ConversationItemDeletedEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item that was deleted. */
  item_id: string;

  /** The event type, must be `conversation.item.deleted`. */
  type: 'conversation.item.deleted';
}

/**
 * This event is the output of audio transcription for user audio written to the
 * user audio buffer. Transcription begins when the input audio buffer is
 * committed by the client or server (in `server_vad` mode). Transcription runs
 * asynchronously with Response creation, so this event may come before or after
 * the Response events.
 *
 * Realtime API models accept audio natively, and thus input transcription is a
 * separate process run on a separate ASR (Automatic Speech Recognition) model,
 * currently always `whisper-1`. Thus the transcript may diverge somewhat from
 * the model's interpretation, and should be treated as a rough guide.
 */
export interface ConversationItemInputAudioTranscriptionCompletedEvent {
  /** The index of the content part containing the audio. */
  content_index: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the user message item containing the audio. */
  item_id: string;

  /** The transcribed text. */
  transcript: string;

  /** The event type, must be `conversation.item.input_audio_transcription.completed`. */
  type: 'conversation.item.input_audio_transcription.completed';

  /** The log probabilities of the transcription. */
  logprobs?: Array<ConversationItemInputAudioTranscriptionCompletedEvent.Logprob> | null;
}

export namespace ConversationItemInputAudioTranscriptionCompletedEvent {
  /** A log probability object. */
  export interface Logprob {
    /** The token that was used to generate the log probability. */
    token: string;

    /** The bytes that were used to generate the log probability. */
    bytes: Array<number>;

    /** The log probability of the token. */
    logprob: number;
  }
}

/**
 * Returned when the text value of an input audio transcription content part is
 * updated.
 */
export interface ConversationItemInputAudioTranscriptionDeltaEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item. */
  item_id: string;

  /** The event type, must be `conversation.item.input_audio_transcription.delta`. */
  type: 'conversation.item.input_audio_transcription.delta';

  /** The index of the content part in the item's content array. */
  content_index?: number;

  /** The text delta. */
  delta?: string;

  /** The log probabilities of the transcription. */
  logprobs?: Array<ConversationItemInputAudioTranscriptionDeltaEvent.Logprob> | null;
}

export namespace ConversationItemInputAudioTranscriptionDeltaEvent {
  /** A log probability object. */
  export interface Logprob {
    /** The token that was used to generate the log probability. */
    token: string;

    /** The bytes that were used to generate the log probability. */
    bytes: Array<number>;

    /** The log probability of the token. */
    logprob: number;
  }
}
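// --- Illustrative sketch (not part of the generated file) ---
// Assembling a live user transcript from the two transcription events above:
// deltas are accumulated per item until the `completed` event replaces them
// with the final text. The Map and function names are illustrative.
const examplePartialTranscripts = new Map<string, string>();

function exampleOnTranscriptionEvent(
  event:
    | ConversationItemInputAudioTranscriptionDeltaEvent
    | ConversationItemInputAudioTranscriptionCompletedEvent,
): void {
  if (event.type === 'conversation.item.input_audio_transcription.delta') {
    const prev = examplePartialTranscripts.get(event.item_id) ?? '';
    examplePartialTranscripts.set(event.item_id, prev + (event.delta ?? ''));
  } else {
    examplePartialTranscripts.set(event.item_id, event.transcript);
  }
}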
/**
 * Returned when input audio transcription is configured, and a transcription
 * request for a user message failed. These events are separate from other
 * `error` events so that the client can identify the related Item.
 */
export interface ConversationItemInputAudioTranscriptionFailedEvent {
  /** The index of the content part containing the audio. */
  content_index: number;

  /** Details of the transcription error. */
  error: ConversationItemInputAudioTranscriptionFailedEvent.Error;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the user message item. */
  item_id: string;

  /** The event type, must be `conversation.item.input_audio_transcription.failed`. */
  type: 'conversation.item.input_audio_transcription.failed';
}

export namespace ConversationItemInputAudioTranscriptionFailedEvent {
  /** Details of the transcription error. */
  export interface Error {
    /** Error code, if any. */
    code?: string;

    /** A human-readable error message. */
    message?: string;

    /** Parameter related to the error, if any. */
    param?: string;

    /** The type of error. */
    type?: string;
  }
}

/**
 * Send this event when you want to retrieve the server's representation of a
 * specific item in the conversation history. This is useful, for example, to
 * inspect user audio after noise cancellation and VAD. The server will respond
 * with a `conversation.item.retrieved` event, unless the item does not exist in
 * the conversation history, in which case the server will respond with an
 * error.
 */
export interface ConversationItemRetrieveEvent {
  /** The ID of the item to retrieve. */
  item_id: string;

  /** The event type, must be `conversation.item.retrieve`. */
  type: 'conversation.item.retrieve';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;
}

/**
 * Send this event to truncate a previous assistant message's audio. The server
 * will produce audio faster than realtime, so this event is useful when the
 * user interrupts to truncate audio that has already been sent to the client
 * but not yet played. This will synchronize the server's understanding of the
 * audio with the client's playback.
 *
 * Truncating audio will delete the server-side text transcript to ensure there
 * is no text in the context that hasn't been heard by the user.
 *
 * If successful, the server will respond with a `conversation.item.truncated`
 * event.
 */
export interface ConversationItemTruncateEvent {
  /**
   * Inclusive duration up to which audio is truncated, in milliseconds. If the
   * audio_end_ms is greater than the actual audio duration, the server will
   * respond with an error.
   */
  audio_end_ms: number;

  /** The index of the content part to truncate. Set this to 0. */
  content_index: number;

  /**
   * The ID of the assistant message item to truncate. Only assistant message
   * items can be truncated.
   */
  item_id: string;

  /** The event type, must be `conversation.item.truncate`. */
  type: 'conversation.item.truncate';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;
}

/**
 * Returned when an earlier assistant audio message item is truncated by the
 * client with a `conversation.item.truncate` event. This event is used to
 * synchronize the server's understanding of the audio with the client's
 * playback.
 *
 * This action will truncate the audio and remove the server-side text
 * transcript to ensure there is no text in the context that hasn't been heard
 * by the user.
 */
export interface ConversationItemTruncatedEvent {
  /** The duration up to which the audio was truncated, in milliseconds. */
  audio_end_ms: number;

  /** The index of the content part that was truncated. */
  content_index: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the assistant message item that was truncated. */
  item_id: string;

  /** The event type, must be `conversation.item.truncated`. */
  type: 'conversation.item.truncated';
}
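// --- Illustrative sketch (not part of the generated file) ---
// Handling a user interruption (barge-in): truncate the assistant audio at the
// point the listener actually heard. `playedMs` must be tracked by the
// client's audio player; the names here are illustrative.
function exampleTruncatePlayback(ws: WebSocket, assistantItemId: string, playedMs: number): void {
  const event: ConversationItemTruncateEvent = {
    type: 'conversation.item.truncate',
    item_id: assistantItemId,
    content_index: 0, // per the docs above, always 0
    audio_end_ms: Math.floor(playedMs),
  };
  ws.send(JSON.stringify(event));
}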
/**
 * The item to add to the conversation.
 */
export interface ConversationItemWithReference {
  /**
   * For an item of type (`message` | `function_call` | `function_call_output`)
   * this field allows the client to assign the unique ID of the item. It is not
   * required because the server will generate one if not provided.
   *
   * For an item of type `item_reference`, this field is required and is a
   * reference to any item that has previously existed in the conversation.
   */
  id?: string;

  /** The arguments of the function call (for `function_call` items). */
  arguments?: string;

  /**
   * The ID of the function call (for `function_call` and `function_call_output`
   * items). If passed on a `function_call_output` item, the server will check
   * that a `function_call` item with the same ID exists in the conversation
   * history.
   */
  call_id?: string;

  /**
   * The content of the message, applicable for `message` items.
   *
   * - Message items of role `system` support only `input_text` content
   * - Message items of role `user` support `input_text` and `input_audio` content
   * - Message items of role `assistant` support `text` content.
   */
  content?: Array<ConversationItemContent>;

  /** The name of the function being called (for `function_call` items). */
  name?: string;

  /** Identifier for the API object being returned - always `realtime.item`. */
  object?: 'realtime.item';

  /** The output of the function call (for `function_call_output` items). */
  output?: string;

  /**
   * The role of the message sender (`user`, `assistant`, `system`), only
   * applicable for `message` items.
   */
  role?: 'user' | 'assistant' | 'system';

  /**
   * The status of the item (`completed`, `incomplete`). These have no effect on
   * the conversation, but are accepted for consistency with the
   * `conversation.item.created` event.
   */
  status?: 'completed' | 'incomplete';

  /**
   * The type of the item (`message`, `function_call`, `function_call_output`,
   * `item_reference`).
   */
  type?: 'message' | 'function_call' | 'function_call_output' | 'item_reference';
}

/**
 * Returned when an error occurs, which could be a client problem or a server
 * problem. Most errors are recoverable and the session will stay open; we
 * recommend that implementers monitor and log error messages by default.
 */
export interface ErrorEvent {
  /** Details of the error. */
  error: ErrorEvent.Error;

  /** The unique ID of the server event. */
  event_id: string;

  /** The event type, must be `error`. */
  type: 'error';
}

export namespace ErrorEvent {
  /** Details of the error. */
  export interface Error {
    /** A human-readable error message. */
    message: string;

    /** The type of error (e.g., "invalid_request_error", "server_error"). */
    type: string;

    /** Error code, if any. */
    code?: string | null;

    /** The event_id of the client event that caused the error, if applicable. */
    event_id?: string | null;

    /** Parameter related to the error, if any. */
    param?: string | null;
  }
}

/**
 * Send this event to append audio bytes to the input audio buffer. The audio
 * buffer is temporary storage you can write to and later commit. In Server VAD
 * mode, the audio buffer is used to detect speech and the server will decide
 * when to commit. When Server VAD is disabled, you must commit the audio buffer
 * manually.
 *
 * The client may choose how much audio to place in each event up to a maximum
 * of 15 MiB; for example, streaming smaller chunks from the client may allow
 * the VAD to be more responsive. Unlike most other client events, the server
 * will not send a confirmation response to this event.
 */
export interface InputAudioBufferAppendEvent {
  /**
   * Base64-encoded audio bytes. This must be in the format specified by the
   * `input_audio_format` field in the session configuration.
   */
  audio: string;

  /** The event type, must be `input_audio_buffer.append`. */
  type: 'input_audio_buffer.append';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;
}
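// --- Illustrative sketch (not part of the generated file) ---
// Streaming microphone audio into the input buffer. Each chunk of raw PCM16
// audio is base64-encoded before being sent; `pcm16Chunk` is assumed to
// already match the session's `input_audio_format`. Node's `Buffer` is used
// for the base64 step.
function exampleAppendAudio(ws: WebSocket, pcm16Chunk: Uint8Array): void {
  const event: InputAudioBufferAppendEvent = {
    type: 'input_audio_buffer.append',
    audio: Buffer.from(pcm16Chunk).toString('base64'),
  };
  ws.send(JSON.stringify(event));
}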
/**
 * Send this event to clear the audio bytes in the buffer. The server will
 * respond with an `input_audio_buffer.cleared` event.
 */
export interface InputAudioBufferClearEvent {
  /** The event type, must be `input_audio_buffer.clear`. */
  type: 'input_audio_buffer.clear';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;
}

/**
 * Returned when the input audio buffer is cleared by the client with an
 * `input_audio_buffer.clear` event.
 */
export interface InputAudioBufferClearedEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The event type, must be `input_audio_buffer.cleared`. */
  type: 'input_audio_buffer.cleared';
}

/**
 * Send this event to commit the user input audio buffer, which will create a
 * new user message item in the conversation. This event will produce an error
 * if the input audio buffer is empty. When in Server VAD mode, the client does
 * not need to send this event, the server will commit the audio buffer
 * automatically.
 *
 * Committing the input audio buffer will trigger input audio transcription (if
 * enabled in session configuration), but it will not create a response from the
 * model. The server will respond with an `input_audio_buffer.committed` event.
 */
export interface InputAudioBufferCommitEvent {
  /** The event type, must be `input_audio_buffer.commit`. */
  type: 'input_audio_buffer.commit';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;
}

/**
 * Returned when an input audio buffer is committed, either by the client or
 * automatically in server VAD mode. The `item_id` property is the ID of the
 * user message item that will be created, thus a `conversation.item.created`
 * event will also be sent to the client.
 */
export interface InputAudioBufferCommittedEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the user message item that will be created. */
  item_id: string;

  /** The ID of the preceding item after which the new item will be inserted. */
  previous_item_id: string;

  /** The event type, must be `input_audio_buffer.committed`. */
  type: 'input_audio_buffer.committed';
}

/**
 * Sent by the server when in `server_vad` mode to indicate that speech has been
 * detected in the audio buffer. This can happen any time audio is added to the
 * buffer (unless speech is already detected). The client may want to use this
 * event to interrupt audio playback or provide visual feedback to the user.
 *
 * The client should expect to receive an `input_audio_buffer.speech_stopped`
 * event when speech stops. The `item_id` property is the ID of the user message
 * item that will be created when speech stops and will also be included in the
 * `input_audio_buffer.speech_stopped` event (unless the client manually commits
 * the audio buffer during VAD activation).
 */
export interface InputAudioBufferSpeechStartedEvent {
  /**
   * Milliseconds from the start of all audio written to the buffer during the
   * session when speech was first detected. This will correspond to the
   * beginning of audio sent to the model, and thus includes the
   * `prefix_padding_ms` configured in the Session.
   */
  audio_start_ms: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the user message item that will be created when speech stops. */
  item_id: string;

  /** The event type, must be `input_audio_buffer.speech_started`. */
  type: 'input_audio_buffer.speech_started';
}
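// --- Illustrative sketch (not part of the generated file) ---
// With Server VAD disabled, the client drives turn-taking itself: commit the
// buffered audio, then ask for a response. `ResponseCreateEvent` is declared
// later in this file; the function name is illustrative.
function exampleEndOfTurn(ws: WebSocket): void {
  const commit: InputAudioBufferCommitEvent = { type: 'input_audio_buffer.commit' };
  const respond: ResponseCreateEvent = { type: 'response.create' };
  ws.send(JSON.stringify(commit));
  ws.send(JSON.stringify(respond));
}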
/**
 * Returned in `server_vad` mode when the server detects the end of speech in
 * the audio buffer. The server will also send a `conversation.item.created`
 * event with the user message item that is created from the audio buffer.
 */
export interface InputAudioBufferSpeechStoppedEvent {
  /**
   * Milliseconds since the session started when speech stopped. This will
   * correspond to the end of audio sent to the model, and thus includes the
   * `min_silence_duration_ms` configured in the Session.
   */
  audio_end_ms: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the user message item that will be created. */
  item_id: string;

  /** The event type, must be `input_audio_buffer.speech_stopped`. */
  type: 'input_audio_buffer.speech_stopped';
}

/**
 * Emitted at the beginning of a Response to indicate the updated rate limits.
 * When a Response is created some tokens will be "reserved" for the output
 * tokens; the rate limits shown here reflect that reservation, which is then
 * adjusted accordingly once the Response is completed.
 */
export interface RateLimitsUpdatedEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** List of rate limit information. */
  rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;

  /** The event type, must be `rate_limits.updated`. */
  type: 'rate_limits.updated';
}

export namespace RateLimitsUpdatedEvent {
  export interface RateLimit {
    /** The maximum allowed value for the rate limit. */
    limit?: number;

    /** The name of the rate limit (`requests`, `tokens`). */
    name?: 'requests' | 'tokens';

    /** The remaining value before the limit is reached. */
    remaining?: number;

    /** Seconds until the rate limit resets. */
    reset_seconds?: number;
  }
}

/**
 * A realtime client event.
 */
export type RealtimeClientEvent =
  | ConversationItemCreateEvent
  | ConversationItemDeleteEvent
  | ConversationItemRetrieveEvent
  | ConversationItemTruncateEvent
  | InputAudioBufferAppendEvent
  | InputAudioBufferClearEvent
  | RealtimeClientEvent.OutputAudioBufferClear
  | InputAudioBufferCommitEvent
  | ResponseCancelEvent
  | ResponseCreateEvent
  | SessionUpdateEvent
  | TranscriptionSessionUpdate;

export namespace RealtimeClientEvent {
  /**
   * **WebRTC Only:** Emit to cut off the current audio response. This will
   * trigger the server to stop generating audio and emit an
   * `output_audio_buffer.cleared` event. This event should be preceded by a
   * `response.cancel` client event to stop the generation of the current
   * response.
   * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
   */
  export interface OutputAudioBufferClear {
    /** The event type, must be `output_audio_buffer.clear`. */
    type: 'output_audio_buffer.clear';

    /** The unique ID of the client event used for error handling. */
    event_id?: string;
  }
}
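// --- Illustrative sketch (not part of the generated file) ---
// A single typed send helper keeps every outbound message checked against the
// `RealtimeClientEvent` union, so a typo in an event name fails at compile
// time rather than as a server `error` event.
function exampleSendClientEvent(ws: WebSocket, event: RealtimeClientEvent): void {
  ws.send(JSON.stringify(event));
}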
/**
 * The response resource.
 */
export interface RealtimeResponse {
  /** The unique ID of the response. */
  id?: string;

  /**
   * Which conversation the response is added to, determined by the
   * `conversation` field in the `response.create` event. If `auto`, the
   * response will be added to the default conversation and the value of
   * `conversation_id` will be an id like `conv_1234`. If `none`, the response
   * will not be added to any conversation and the value of `conversation_id`
   * will be `null`. If responses are being triggered by server VAD, the
   * response will be added to the default conversation, thus the
   * `conversation_id` will be an id like `conv_1234`.
   */
  conversation_id?: string;

  /**
   * Maximum number of output tokens for a single assistant response, inclusive
   * of tool calls, that was used in this response.
   */
  max_output_tokens?: number | 'inf';

  /**
   * Set of 16 key-value pairs that can be attached to an object. This can be
   * useful for storing additional information about the object in a structured
   * format, and querying for objects via API or the dashboard.
   *
   * Keys are strings with a maximum length of 64 characters. Values are strings
   * with a maximum length of 512 characters.
   */
  metadata?: Shared.Metadata | null;

  /**
   * The set of modalities the model used to respond. If there are multiple
   * modalities, the model will pick one; for example, if `modalities` is
   * `["text", "audio"]`, the model could be responding in either text or audio.
   */
  modalities?: Array<'text' | 'audio'>;

  /** The object type, must be `realtime.response`. */
  object?: 'realtime.response';

  /** The list of output items generated by the response. */
  output?: Array<ConversationItem>;

  /** The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. */
  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';

  /**
   * The final status of the response (`completed`, `cancelled`, `failed`, or
   * `incomplete`).
   */
  status?: 'completed' | 'cancelled' | 'failed' | 'incomplete';

  /** Additional details about the status. */
  status_details?: RealtimeResponseStatus;

  /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */
  temperature?: number;

  /**
   * Usage statistics for the Response; this will correspond to billing. A
   * Realtime API session will maintain a conversation context and append new
   * Items to the Conversation, thus output from previous turns (text and audio
   * tokens) will become the input for later turns.
   */
  usage?: RealtimeResponseUsage;

  /**
   * The voice the model used to respond. Current voice options are `alloy`,
   * `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
   * `shimmer`, and `verse`.
   */
  voice?:
    | (string & {})
    | 'alloy'
    | 'ash'
    | 'ballad'
    | 'coral'
    | 'echo'
    | 'fable'
    | 'onyx'
    | 'nova'
    | 'sage'
    | 'shimmer'
    | 'verse';
}

/**
 * Additional details about the status.
 */
export interface RealtimeResponseStatus {
  /**
   * A description of the error that caused the response to fail, populated when
   * the `status` is `failed`.
   */
  error?: RealtimeResponseStatus.Error;

  /**
   * The reason the Response did not complete. For a `cancelled` Response, one
   * of `turn_detected` (the server VAD detected a new start of speech) or
   * `client_cancelled` (the client sent a cancel event). For an `incomplete`
   * Response, one of `max_output_tokens` or `content_filter` (the server-side
   * safety filter activated and cut off the response).
   */
  reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';

  /**
   * The type of error that caused the response to fail, corresponding with the
   * `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
   */
  type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';
}

export namespace RealtimeResponseStatus {
  /**
   * A description of the error that caused the response to fail, populated when
   * the `status` is `failed`.
   */
  export interface Error {
    /** Error code, if any. */
    code?: string;

    /** The type of error. */
    type?: string;
  }
}
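// --- Illustrative sketch (not part of the generated file) ---
// Distinguishing why a response ended, using the status details above.
function exampleDescribeStatus(response: RealtimeResponse): string {
  if (response.status === 'cancelled' && response.status_details?.reason === 'turn_detected') {
    return 'interrupted by the user (server VAD)';
  }
  return response.status ?? 'unknown';
}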
/**
 * Usage statistics for the Response; this will correspond to billing. A
 * Realtime API session will maintain a conversation context and append new
 * Items to the Conversation, thus output from previous turns (text and audio
 * tokens) will become the input for later turns.
 */
export interface RealtimeResponseUsage {
  /** Details about the input tokens used in the Response. */
  input_token_details?: RealtimeResponseUsage.InputTokenDetails;

  /**
   * The number of input tokens used in the Response, including text and audio
   * tokens.
   */
  input_tokens?: number;

  /** Details about the output tokens used in the Response. */
  output_token_details?: RealtimeResponseUsage.OutputTokenDetails;

  /**
   * The number of output tokens sent in the Response, including text and audio
   * tokens.
   */
  output_tokens?: number;

  /**
   * The total number of tokens in the Response including input and output text
   * and audio tokens.
   */
  total_tokens?: number;
}

export namespace RealtimeResponseUsage {
  /** Details about the input tokens used in the Response. */
  export interface InputTokenDetails {
    /** The number of audio tokens used in the Response. */
    audio_tokens?: number;

    /** The number of cached tokens used in the Response. */
    cached_tokens?: number;

    /** The number of text tokens used in the Response. */
    text_tokens?: number;
  }

  /** Details about the output tokens used in the Response. */
  export interface OutputTokenDetails {
    /** The number of audio tokens used in the Response. */
    audio_tokens?: number;

    /** The number of text tokens used in the Response. */
    text_tokens?: number;
  }
}

/**
 * A realtime server event.
 */
export type RealtimeServerEvent =
  | ConversationCreatedEvent
  | ConversationItemCreatedEvent
  | ConversationItemDeletedEvent
  | ConversationItemInputAudioTranscriptionCompletedEvent
  | ConversationItemInputAudioTranscriptionDeltaEvent
  | ConversationItemInputAudioTranscriptionFailedEvent
  | RealtimeServerEvent.ConversationItemRetrieved
  | ConversationItemTruncatedEvent
  | ErrorEvent
  | InputAudioBufferClearedEvent
  | InputAudioBufferCommittedEvent
  | InputAudioBufferSpeechStartedEvent
  | InputAudioBufferSpeechStoppedEvent
  | RateLimitsUpdatedEvent
  | ResponseAudioDeltaEvent
  | ResponseAudioDoneEvent
  | ResponseAudioTranscriptDeltaEvent
  | ResponseAudioTranscriptDoneEvent
  | ResponseContentPartAddedEvent
  | ResponseContentPartDoneEvent
  | ResponseCreatedEvent
  | ResponseDoneEvent
  | ResponseFunctionCallArgumentsDeltaEvent
  | ResponseFunctionCallArgumentsDoneEvent
  | ResponseOutputItemAddedEvent
  | ResponseOutputItemDoneEvent
  | ResponseTextDeltaEvent
  | ResponseTextDoneEvent
  | SessionCreatedEvent
  | SessionUpdatedEvent
  | TranscriptionSessionUpdatedEvent
  | RealtimeServerEvent.OutputAudioBufferStarted
  | RealtimeServerEvent.OutputAudioBufferStopped
  | RealtimeServerEvent.OutputAudioBufferCleared;
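// --- Illustrative sketch (not part of the generated file) ---
// Dispatching on the discriminated union above: the `type` field narrows each
// case, so accessing event-specific fields is type-safe. Only a few cases are
// shown; a real handler would cover more.
function exampleHandleServerEvent(raw: string): void {
  const event = JSON.parse(raw) as RealtimeServerEvent;
  switch (event.type) {
    case 'error':
      console.error('Realtime error:', event.error.message);
      break;
    case 'input_audio_buffer.speech_started':
      console.log('user started speaking at', event.audio_start_ms, 'ms');
      break;
    case 'response.done':
      console.log('response finished with status', event.response.status);
      break;
    default:
      break;
  }
}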
export namespace RealtimeServerEvent {
  /**
   * Returned when a conversation item is retrieved with
   * `conversation.item.retrieve`.
   */
  export interface ConversationItemRetrieved {
    /** The unique ID of the server event. */
    event_id: string;

    /** The item to add to the conversation. */
    item: RealtimeAPI.ConversationItem;

    /** The event type, must be `conversation.item.retrieved`. */
    type: 'conversation.item.retrieved';
  }

  /**
   * **WebRTC Only:** Emitted when the server begins streaming audio to the
   * client. This event is emitted after an audio content part has been added
   * (`response.content_part.added`) to the response.
   * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
   */
  export interface OutputAudioBufferStarted {
    /** The unique ID of the server event. */
    event_id: string;

    /** The unique ID of the response that produced the audio. */
    response_id: string;

    /** The event type, must be `output_audio_buffer.started`. */
    type: 'output_audio_buffer.started';
  }

  /**
   * **WebRTC Only:** Emitted when the output audio buffer has been completely
   * drained on the server, and no more audio is forthcoming. This event is
   * emitted after the full response data has been sent to the client
   * (`response.done`).
   * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
   */
  export interface OutputAudioBufferStopped {
    /** The unique ID of the server event. */
    event_id: string;

    /** The unique ID of the response that produced the audio. */
    response_id: string;

    /** The event type, must be `output_audio_buffer.stopped`. */
    type: 'output_audio_buffer.stopped';
  }

  /**
   * **WebRTC Only:** Emitted when the output audio buffer is cleared. This
   * happens either in VAD mode when the user has interrupted
   * (`input_audio_buffer.speech_started`), or when the client has emitted the
   * `output_audio_buffer.clear` event to manually cut off the current audio
   * response.
   * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
   */
  export interface OutputAudioBufferCleared {
    /** The unique ID of the server event. */
    event_id: string;

    /** The unique ID of the response that produced the audio. */
    response_id: string;

    /** The event type, must be `output_audio_buffer.cleared`. */
    type: 'output_audio_buffer.cleared';
  }
}

/**
 * Returned when the model-generated audio is updated.
 */
export interface ResponseAudioDeltaEvent {
  /** The index of the content part in the item's content array. */
  content_index: number;

  /** Base64-encoded audio data delta. */
  delta: string;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.audio.delta`. */
  type: 'response.audio.delta';
}

/**
 * Returned when the model-generated audio is done. Also emitted when a Response
 * is interrupted, incomplete, or cancelled.
 */
export interface ResponseAudioDoneEvent {
  /** The index of the content part in the item's content array. */
  content_index: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.audio.done`. */
  type: 'response.audio.done';
}
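// --- Illustrative sketch (not part of the generated file) ---
// Decoding streamed audio deltas for playback. Each delta is an independent
// base64 chunk; decoded bytes are pushed onto a playback queue (a plain array
// here; a real client would feed an audio device). Node's `Buffer` is assumed.
const examplePlaybackQueue: Uint8Array[] = [];

function exampleOnAudioDelta(event: ResponseAudioDeltaEvent): void {
  examplePlaybackQueue.push(new Uint8Array(Buffer.from(event.delta, 'base64')));
}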
/**
 * Returned when the model-generated transcription of audio output is updated.
 */
export interface ResponseAudioTranscriptDeltaEvent {
  /** The index of the content part in the item's content array. */
  content_index: number;

  /** The transcript delta. */
  delta: string;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.audio_transcript.delta`. */
  type: 'response.audio_transcript.delta';
}

/**
 * Returned when the model-generated transcription of audio output is done
 * streaming. Also emitted when a Response is interrupted, incomplete, or
 * cancelled.
 */
export interface ResponseAudioTranscriptDoneEvent {
  /** The index of the content part in the item's content array. */
  content_index: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The ID of the response. */
  response_id: string;

  /** The final transcript of the audio. */
  transcript: string;

  /** The event type, must be `response.audio_transcript.done`. */
  type: 'response.audio_transcript.done';
}

/**
 * Send this event to cancel an in-progress response. The server will respond
 * with a `response.cancelled` event or an error if there is no response to
 * cancel.
 */
export interface ResponseCancelEvent {
  /** The event type, must be `response.cancel`. */
  type: 'response.cancel';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;

  /**
   * A specific response ID to cancel - if not provided, will cancel an
   * in-progress response in the default conversation.
   */
  response_id?: string;
}

/**
 * Returned when a new content part is added to an assistant message item during
 * response generation.
 */
export interface ResponseContentPartAddedEvent {
  /** The index of the content part in the item's content array. */
  content_index: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item to which the content part was added. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The content part that was added. */
  part: ResponseContentPartAddedEvent.Part;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.content_part.added`. */
  type: 'response.content_part.added';
}

export namespace ResponseContentPartAddedEvent {
  /** The content part that was added. */
  export interface Part {
    /** Base64-encoded audio data (if type is "audio"). */
    audio?: string;

    /** The text content (if type is "text"). */
    text?: string;

    /** The transcript of the audio (if type is "audio"). */
    transcript?: string;

    /** The content type ("text", "audio"). */
    type?: 'text' | 'audio';
  }
}

/**
 * Returned when a content part is done streaming in an assistant message item.
 * Also emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseContentPartDoneEvent {
  /** The index of the content part in the item's content array. */
  content_index: number;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The content part that is done. */
  part: ResponseContentPartDoneEvent.Part;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.content_part.done`. */
  type: 'response.content_part.done';
}

export namespace ResponseContentPartDoneEvent {
  /** The content part that is done. */
  export interface Part {
    /** Base64-encoded audio data (if type is "audio"). */
    audio?: string;

    /** The text content (if type is "text"). */
    text?: string;

    /** The transcript of the audio (if type is "audio"). */
    transcript?: string;

    /** The content type ("text", "audio"). */
    type?: 'text' | 'audio';
  }
}
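// --- Illustrative sketch (not part of the generated file) ---
// Cancelling the in-progress response, e.g. as the first step of handling a
// user interruption (followed by `conversation.item.truncate`, shown earlier).
function exampleCancelResponse(ws: WebSocket, responseId?: string): void {
  const event: ResponseCancelEvent = { type: 'response.cancel', response_id: responseId };
  ws.send(JSON.stringify(event));
}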
/**
 * This event instructs the server to create a Response, which means triggering
 * model inference. When in Server VAD mode, the server will create Responses
 * automatically.
 *
 * A Response will include at least one Item, and may have two, in which case
 * the second will be a function call. These Items will be appended to the
 * conversation history.
 *
 * The server will respond with a `response.created` event, events for Items and
 * content created, and finally a `response.done` event to indicate the Response
 * is complete.
 *
 * The `response.create` event includes inference configuration like
 * `instructions` and `temperature`. These fields will override the Session's
 * configuration for this Response only.
 */
export interface ResponseCreateEvent {
  /** The event type, must be `response.create`. */
  type: 'response.create';

  /** Optional client-generated ID used to identify this event. */
  event_id?: string;

  /** Create a new Realtime response with these parameters. */
  response?: ResponseCreateEvent.Response;
}

export namespace ResponseCreateEvent {
  /** Create a new Realtime response with these parameters. */
  export interface Response {
    /**
     * Controls which conversation the response is added to. Currently supports
     * `auto` and `none`, with `auto` as the default value. The `auto` value
     * means that the contents of the response will be added to the default
     * conversation. Set this to `none` to create an out-of-band response which
     * will not add items to the default conversation.
     */
    conversation?: (string & {}) | 'auto' | 'none';

    /**
     * Input items to include in the prompt for the model. Using this field
     * creates a new context for this Response instead of using the default
     * conversation. An empty array `[]` will clear the context for this
     * Response. Note that this can include references to items from the default
     * conversation.
     */
    input?: Array<RealtimeAPI.ConversationItemWithReference>;

    /**
     * The default system instructions (i.e. system message) prepended to model
     * calls. This field allows the client to guide the model on desired
     * responses. The model can be instructed on response content and format
     * (e.g. "be extremely succinct", "act friendly", "here are examples of good
     * responses") and on audio behavior (e.g. "talk quickly", "inject emotion
     * into your voice", "laugh frequently"). The instructions are not
     * guaranteed to be followed by the model, but they provide guidance to the
     * model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this
     * field is not set and are visible in the `session.created` event at the
     * start of the session.
     */
    instructions?: string;

    /**
     * Maximum number of output tokens for a single assistant response,
     * inclusive of tool calls. Provide an integer between 1 and 4096 to limit
     * output tokens, or `inf` for the maximum available tokens for a given
     * model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';

    /**
     * Set of 16 key-value pairs that can be attached to an object. This can be
     * useful for storing additional information about the object in a
     * structured format, and querying for objects via API or the dashboard.
     *
     * Keys are strings with a maximum length of 64 characters. Values are
     * strings with a maximum length of 512 characters.
     */
    metadata?: Shared.Metadata | null;

    /**
     * The set of modalities the model can respond with. To disable audio, set
     * this to ["text"].
     */
    modalities?: Array<'text' | 'audio'>;

    /** The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';

    /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */
    temperature?: number;

    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or
     * specify a function, like
     * `{"type": "function", "function": {"name": "my_function"}}`.
     */
    tool_choice?: string;

    /** Tools (functions) available to the model. */
    tools?: Array<Response.Tool>;

    /**
     * The voice the model uses to respond. Voice cannot be changed during the
     * session once the model has responded with audio at least once. Current
     * voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`,
     * `onyx`, `nova`, `sage`, `shimmer`, and `verse`.
     */
    voice?:
      | (string & {})
      | 'alloy'
      | 'ash'
      | 'ballad'
      | 'coral'
      | 'echo'
      | 'fable'
      | 'onyx'
      | 'nova'
      | 'sage'
      | 'shimmer'
      | 'verse';
  }

  export namespace Response {
    export interface Tool {
      /**
       * The description of the function, including guidance on when and how to
       * call it, and guidance about what to tell the user when calling (if
       * anything).
       */
      description?: string;

      /** The name of the function. */
      name?: string;

      /** Parameters of the function in JSON Schema. */
      parameters?: unknown;

      /** The type of the tool, i.e. `function`. */
      type?: 'function';
    }
  }
}
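// --- Illustrative sketch (not part of the generated file) ---
// An out-of-band, text-only `response.create`: `conversation: 'none'` keeps
// the result out of the default conversation, and the per-response
// `instructions` override the session defaults. The tool definition and
// metadata values are hypothetical examples.
const exampleOutOfBandResponse: ResponseCreateEvent = {
  type: 'response.create',
  response: {
    conversation: 'none',
    modalities: ['text'],
    instructions: 'Summarize the conversation so far in one sentence.',
    metadata: { purpose: 'summary' },
    tools: [
      {
        type: 'function',
        name: 'get_weather', // hypothetical function
        description: 'Look up the current weather for a city.',
        parameters: { type: 'object', properties: { city: { type: 'string' } } },
      },
    ],
  },
};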
/**
 * Returned when a new Response is created. The first event of response
 * creation, where the response is in an initial state of `in_progress`.
 */
export interface ResponseCreatedEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The response resource. */
  response: RealtimeResponse;

  /** The event type, must be `response.created`. */
  type: 'response.created';
}

/**
 * Returned when a Response is done streaming. Always emitted, no matter the
 * final state. The Response object included in the `response.done` event will
 * include all output Items in the Response but will omit the raw audio data.
 */
export interface ResponseDoneEvent {
  /** The unique ID of the server event. */
  event_id: string;

  /** The response resource. */
  response: RealtimeResponse;

  /** The event type, must be `response.done`. */
  type: 'response.done';
}

/**
 * Returned when the model-generated function call arguments are updated.
 */
export interface ResponseFunctionCallArgumentsDeltaEvent {
  /** The ID of the function call. */
  call_id: string;

  /** The arguments delta as a JSON string. */
  delta: string;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the function call item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.function_call_arguments.delta`. */
  type: 'response.function_call_arguments.delta';
}

/**
 * Returned when the model-generated function call arguments are done streaming.
 * Also emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseFunctionCallArgumentsDoneEvent {
  /** The final arguments as a JSON string. */
  arguments: string;

  /** The ID of the function call. */
  call_id: string;

  /** The unique ID of the server event. */
  event_id: string;

  /** The ID of the function call item. */
  item_id: string;

  /** The index of the output item in the response. */
  output_index: number;

  /** The ID of the response. */
  response_id: string;

  /** The event type, must be `response.function_call_arguments.done`. */
  type: 'response.function_call_arguments.done';
}
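// --- Illustrative sketch (not part of the generated file) ---
// Completing a function-call round trip: when arguments are done streaming,
// run the function, send its result back as a `function_call_output` item, and
// request a follow-up response. `runTool` is a hypothetical local dispatcher.
async function exampleOnFunctionCallDone(
  ws: WebSocket,
  event: ResponseFunctionCallArgumentsDoneEvent,
  runTool: (args: unknown) => Promise<unknown>,
): Promise<void> {
  const result = await runTool(JSON.parse(event.arguments));
  const output: ConversationItemCreateEvent = {
    type: 'conversation.item.create',
    item: {
      type: 'function_call_output',
      call_id: event.call_id, // must match the function_call item
      output: JSON.stringify(result),
    },
  };
  const followUp: ResponseCreateEvent = { type: 'response.create' };
  ws.send(JSON.stringify(output));
  ws.send(JSON.stringify(followUp));
}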