openai
The official TypeScript library for the OpenAI API
1,535 lines • 51.8 kB
TypeScript
import { APIResource } from "../../../resource.js";
import * as RealtimeAPI from "./realtime.js";
import * as SessionsAPI from "./sessions.js";
import { Session as SessionsAPISession, SessionCreateParams, SessionCreateResponse, Sessions } from "./sessions.js";
export declare class Realtime extends APIResource {
sessions: SessionsAPI.Sessions;
}
/**
* Returned when a conversation is created. Emitted right after session creation.
*/
export interface ConversationCreatedEvent {
/**
* The conversation resource.
*/
conversation: ConversationCreatedEvent.Conversation;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The event type, must be `conversation.created`.
*/
type: 'conversation.created';
}
export declare namespace ConversationCreatedEvent {
/**
* The conversation resource.
*/
interface Conversation {
/**
* The unique ID of the conversation.
*/
id?: string;
/**
* The object type, must be `realtime.conversation`.
*/
object?: 'realtime.conversation';
}
}
/**
* The item to add to the conversation.
*/
export interface ConversationItem {
/**
* The unique ID of the item. This can be generated by the client to help manage
* server-side context, but is not required because the server will generate one if
* not provided.
*/
id?: string;
/**
* The arguments of the function call (for `function_call` items).
*/
arguments?: string;
/**
* The ID of the function call (for `function_call` and `function_call_output`
* items). If passed on a `function_call_output` item, the server will check that a
* `function_call` item with the same ID exists in the conversation history.
*/
call_id?: string;
/**
* The content of the message, applicable for `message` items.
*
* - Message items of role `system` support only `input_text` content
* - Message items of role `user` support `input_text` and `input_audio` content
* - Message items of role `assistant` support `text` content.
*/
content?: Array<ConversationItemContent>;
/**
* The name of the function being called (for `function_call` items).
*/
name?: string;
/**
* Identifier for the API object being returned - always `realtime.item`.
*/
object?: 'realtime.item';
/**
* The output of the function call (for `function_call_output` items).
*/
output?: string;
/**
* The role of the message sender (`user`, `assistant`, `system`), only applicable
* for `message` items.
*/
role?: 'user' | 'assistant' | 'system';
/**
* The status of the item (`completed`, `incomplete`). These have no effect on the
* conversation, but are accepted for consistency with the
* `conversation.item.created` event.
*/
status?: 'completed' | 'incomplete';
/**
* The type of the item (`message`, `function_call`, `function_call_output`).
*/
type?: 'message' | 'function_call' | 'function_call_output';
}
export interface ConversationItemContent {
/**
* ID of a previous conversation item to reference (for `item_reference` content
* types in `response.create` events). These can reference both client and server
* created items.
*/
id?: string;
/**
* Base64-encoded audio bytes, used for `input_audio` content type.
*/
audio?: string;
/**
* The text content, used for `input_text` and `text` content types.
*/
text?: string;
/**
* The transcript of the audio, used for `input_audio` content type.
*/
transcript?: string;
/**
* The content type (`input_text`, `input_audio`, `item_reference`, `text`).
*/
type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
}
/**
* Add a new Item to the Conversation's context, including messages, function
* calls, and function call responses. This event can be used both to populate a
* "history" of the conversation and to add new items mid-stream, but has the
* current limitation that it cannot populate assistant audio messages.
*
* If successful, the server will respond with a `conversation.item.created` event,
* otherwise an `error` event will be sent.
*/
export interface ConversationItemCreateEvent {
/**
* The item to add to the conversation.
*/
item: ConversationItem;
/**
* The event type, must be `conversation.item.create`.
*/
type: 'conversation.item.create';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
/**
* The ID of the preceding item after which the new item will be inserted. If not
* set, the new item will be appended to the end of the conversation. If set, it
* allows an item to be inserted mid-conversation. If the ID cannot be found, an
* error will be returned and the item will not be added.
*/
previous_item_id?: string;
}
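// Illustrative usage sketch (not part of the original declarations): populating the
// conversation history with a user message via `conversation.item.create`. Assumes a
// connected WebSocket to the Realtime API; events are sent as JSON text frames.
function addUserMessage(ws: WebSocket, text: string): void {
  const event: ConversationItemCreateEvent = {
    type: 'conversation.item.create',
    item: {
      type: 'message',
      role: 'user',
      content: [{ type: 'input_text', text }],
    },
  };
  ws.send(JSON.stringify(event));
}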
/**
* Returned when a conversation item is created. There are several scenarios that
* produce this event:
*
* - The server is generating a Response, which if successful will produce either
* one or two Items, which will be of type `message` (role `assistant`) or type
* `function_call`.
* - The input audio buffer has been committed, either by the client or the server
* (in `server_vad` mode). The server will take the content of the input audio
* buffer and add it to a new user message Item.
* - The client has sent a `conversation.item.create` event to add a new Item to
* the Conversation.
*/
export interface ConversationItemCreatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The item to add to the conversation.
*/
item: ConversationItem;
/**
* The ID of the preceding item in the Conversation context, allows the client to
* understand the order of the conversation.
*/
previous_item_id: string;
/**
* The event type, must be `conversation.item.created`.
*/
type: 'conversation.item.created';
}
/**
* Send this event when you want to remove any item from the conversation history.
* The server will respond with a `conversation.item.deleted` event, unless the
* item does not exist in the conversation history, in which case the server will
* respond with an error.
*/
export interface ConversationItemDeleteEvent {
/**
* The ID of the item to delete.
*/
item_id: string;
/**
* The event type, must be `conversation.item.delete`.
*/
type: 'conversation.item.delete';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
/**
* Returned when an item in the conversation is deleted by the client with a
* `conversation.item.delete` event. This event is used to synchronize the server's
* understanding of the conversation history with the client's view.
*/
export interface ConversationItemDeletedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item that was deleted.
*/
item_id: string;
/**
* The event type, must be `conversation.item.deleted`.
*/
type: 'conversation.item.deleted';
}
/**
* This event is the output of audio transcription for user audio written to the
* user audio buffer. Transcription begins when the input audio buffer is committed
* by the client or server (in `server_vad` mode). Transcription runs
* asynchronously with Response creation, so this event may come before or after
* the Response events.
*
* Realtime API models accept audio natively, and thus input transcription is a
* separate process run on a separate ASR (Automatic Speech Recognition) model,
* currently always `whisper-1`. Thus the transcript may diverge somewhat from the
* model's interpretation, and should be treated as a rough guide.
*/
export interface ConversationItemInputAudioTranscriptionCompletedEvent {
/**
* The index of the content part containing the audio.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item containing the audio.
*/
item_id: string;
/**
* The transcribed text.
*/
transcript: string;
/**
* The event type, must be `conversation.item.input_audio_transcription.completed`.
*/
type: 'conversation.item.input_audio_transcription.completed';
}
/**
* Returned when input audio transcription is configured, and a transcription
* request for a user message failed. These events are separate from other `error`
* events so that the client can identify the related Item.
*/
export interface ConversationItemInputAudioTranscriptionFailedEvent {
/**
* The index of the content part containing the audio.
*/
content_index: number;
/**
* Details of the transcription error.
*/
error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item.
*/
item_id: string;
/**
* The event type, must be `conversation.item.input_audio_transcription.failed`.
*/
type: 'conversation.item.input_audio_transcription.failed';
}
export declare namespace ConversationItemInputAudioTranscriptionFailedEvent {
/**
* Details of the transcription error.
*/
interface Error {
/**
* Error code, if any.
*/
code?: string;
/**
* A human-readable error message.
*/
message?: string;
/**
* Parameter related to the error, if any.
*/
param?: string;
/**
* The type of error.
*/
type?: string;
}
}
/**
* Send this event to truncate a previous assistant message’s audio. The server
* will produce audio faster than realtime, so this event is useful when the user
* interrupts to truncate audio that has already been sent to the client but not
* yet played. This will synchronize the server's understanding of the audio with
* the client's playback.
*
* Truncating audio will delete the server-side text transcript to ensure there is
* no text in the context that hasn't been heard by the user.
*
* If successful, the server will respond with a `conversation.item.truncated`
* event.
*/
export interface ConversationItemTruncateEvent {
/**
* Inclusive duration up to which audio is truncated, in milliseconds. If the
* audio_end_ms is greater than the actual audio duration, the server will respond
* with an error.
*/
audio_end_ms: number;
/**
* The index of the content part to truncate. Set this to 0.
*/
content_index: number;
/**
* The ID of the assistant message item to truncate. Only assistant message items
* can be truncated.
*/
item_id: string;
/**
* The event type, must be `conversation.item.truncate`.
*/
type: 'conversation.item.truncate';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
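// Illustrative usage sketch: truncating an assistant audio message after a user
// interruption. `itemId` is the assistant message item and `playedMs` is how much audio
// the client has actually played back, both assumed to be tracked by the application.
function truncatePlayback(ws: WebSocket, itemId: string, playedMs: number): void {
  const event: ConversationItemTruncateEvent = {
    type: 'conversation.item.truncate',
    item_id: itemId,
    content_index: 0,
    audio_end_ms: playedMs,
  };
  ws.send(JSON.stringify(event));
}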
/**
* Returned when an earlier assistant audio message item is truncated by the client
* with a `conversation.item.truncate` event. This event is used to synchronize the
* server's understanding of the audio with the client's playback.
*
* This action will truncate the audio and remove the server-side text transcript
* to ensure there is no text in the context that hasn't been heard by the user.
*/
export interface ConversationItemTruncatedEvent {
/**
* The duration up to which the audio was truncated, in milliseconds.
*/
audio_end_ms: number;
/**
* The index of the content part that was truncated.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the assistant message item that was truncated.
*/
item_id: string;
/**
* The event type, must be `conversation.item.truncated`.
*/
type: 'conversation.item.truncated';
}
/**
* Returned when an error occurs, which could be a client problem or a server
* problem. Most errors are recoverable and the session will stay open; we
* recommend that implementers monitor and log error messages by default.
*/
export interface ErrorEvent {
/**
* Details of the error.
*/
error: ErrorEvent.Error;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The event type, must be `error`.
*/
type: 'error';
}
export declare namespace ErrorEvent {
/**
* Details of the error.
*/
interface Error {
/**
* A human-readable error message.
*/
message: string;
/**
* The type of error (e.g., "invalid_request_error", "server_error").
*/
type: string;
/**
* Error code, if any.
*/
code?: string | null;
/**
* The event_id of the client event that caused the error, if applicable.
*/
event_id?: string | null;
/**
* Parameter related to the error, if any.
*/
param?: string | null;
}
}
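// Illustrative usage sketch: logging `error` events by default, as recommended above.
// Assumes `event` has already been parsed from an incoming server message.
function logIfError(event: RealtimeServerEvent): void {
  if (event.type === 'error') {
    const { code, message, param } = event.error;
    console.error(`Realtime error${code ? ` [${code}]` : ''}: ${message}`, param ?? '');
  }
}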
/**
* Send this event to append audio bytes to the input audio buffer. The audio
* buffer is temporary storage you can write to and later commit. In Server VAD
* mode, the audio buffer is used to detect speech and the server will decide when
* to commit. When Server VAD is disabled, you must commit the audio buffer
* manually.
*
* The client may choose how much audio to place in each event, up to a maximum of
* 15 MiB; for example, streaming smaller chunks from the client may allow the VAD
* to be more responsive. Unlike most other client events, the server will not send
* a confirmation response to this event.
*/
export interface InputAudioBufferAppendEvent {
/**
* Base64-encoded audio bytes. This must be in the format specified by the
* `input_audio_format` field in the session configuration.
*/
audio: string;
/**
* The event type, must be `input_audio_buffer.append`.
*/
type: 'input_audio_buffer.append';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
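// Illustrative usage sketch (Node.js): appending a chunk of raw audio to the input
// audio buffer. `chunk` is assumed to already be in the session's `input_audio_format`.
function appendAudio(ws: WebSocket, chunk: Uint8Array): void {
  const event: InputAudioBufferAppendEvent = {
    type: 'input_audio_buffer.append',
    audio: Buffer.from(chunk).toString('base64'),
  };
  ws.send(JSON.stringify(event));
}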
/**
* Send this event to clear the audio bytes in the buffer. The server will respond
* with an `input_audio_buffer.cleared` event.
*/
export interface InputAudioBufferClearEvent {
/**
* The event type, must be `input_audio_buffer.clear`.
*/
type: 'input_audio_buffer.clear';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
/**
* Returned when the input audio buffer is cleared by the client with a
* `input_audio_buffer.clear` event.
*/
export interface InputAudioBufferClearedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The event type, must be `input_audio_buffer.cleared`.
*/
type: 'input_audio_buffer.cleared';
}
/**
* Send this event to commit the user input audio buffer, which will create a new
* user message item in the conversation. This event will produce an error if the
* input audio buffer is empty. When in Server VAD mode, the client does not need
* to send this event; the server will commit the audio buffer automatically.
*
* Committing the input audio buffer will trigger input audio transcription (if
* enabled in session configuration), but it will not create a response from the
* model. The server will respond with an `input_audio_buffer.committed` event.
*/
export interface InputAudioBufferCommitEvent {
/**
* The event type, must be `input_audio_buffer.commit`.
*/
type: 'input_audio_buffer.commit';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
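// Illustrative usage sketch: ending a turn manually when Server VAD is disabled.
// Commit the buffered audio (creating the user message item), then request a response,
// since committing alone does not trigger model inference.
function endTurn(ws: WebSocket): void {
  const commit: InputAudioBufferCommitEvent = { type: 'input_audio_buffer.commit' };
  const respond: ResponseCreateEvent = { type: 'response.create' };
  ws.send(JSON.stringify(commit));
  ws.send(JSON.stringify(respond));
}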
/**
* Returned when an input audio buffer is committed, either by the client or
* automatically in server VAD mode. The `item_id` property is the ID of the user
* message item that will be created; thus a `conversation.item.created` event will
* also be sent to the client.
*/
export interface InputAudioBufferCommittedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item that will be created.
*/
item_id: string;
/**
* The ID of the preceding item after which the new item will be inserted.
*/
previous_item_id: string;
/**
* The event type, must be `input_audio_buffer.committed`.
*/
type: 'input_audio_buffer.committed';
}
/**
* Sent by the server when in `server_vad` mode to indicate that speech has been
* detected in the audio buffer. This can happen any time audio is added to the
* buffer (unless speech is already detected). The client may want to use this
* event to interrupt audio playback or provide visual feedback to the user.
*
* The client should expect to receive a `input_audio_buffer.speech_stopped` event
* when speech stops. The `item_id` property is the ID of the user message item
* that will be created when speech stops and will also be included in the
* `input_audio_buffer.speech_stopped` event (unless the client manually commits
* the audio buffer during VAD activation).
*/
export interface InputAudioBufferSpeechStartedEvent {
/**
* Milliseconds from the start of all audio written to the buffer during the
* session when speech was first detected. This will correspond to the beginning of
* audio sent to the model, and thus includes the `prefix_padding_ms` configured in
* the Session.
*/
audio_start_ms: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item that will be created when speech stops.
*/
item_id: string;
/**
* The event type, must be `input_audio_buffer.speech_started`.
*/
type: 'input_audio_buffer.speech_started';
}
/**
* Returned in `server_vad` mode when the server detects the end of speech in the
* audio buffer. The server will also send a `conversation.item.created` event
* with the user message item that is created from the audio buffer.
*/
export interface InputAudioBufferSpeechStoppedEvent {
/**
* Milliseconds since the session started when speech stopped. This will correspond
* to the end of audio sent to the model, and thus includes the
* `min_silence_duration_ms` configured in the Session.
*/
audio_end_ms: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the user message item that will be created.
*/
item_id: string;
/**
* The event type, must be `input_audio_buffer.speech_stopped`.
*/
type: 'input_audio_buffer.speech_stopped';
}
/**
* Emitted at the beginning of a Response to indicate the updated rate limits. When
* a Response is created, some tokens will be "reserved" for the output tokens; the
* rate limits shown here reflect that reservation, which is then adjusted
* accordingly once the Response is completed.
*/
export interface RateLimitsUpdatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* List of rate limit information.
*/
rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;
/**
* The event type, must be `rate_limits.updated`.
*/
type: 'rate_limits.updated';
}
export declare namespace RateLimitsUpdatedEvent {
interface RateLimit {
/**
* The maximum allowed value for the rate limit.
*/
limit?: number;
/**
* The name of the rate limit (`requests`, `tokens`).
*/
name?: 'requests' | 'tokens';
/**
* The remaining value before the limit is reached.
*/
remaining?: number;
/**
* Seconds until the rate limit resets.
*/
reset_seconds?: number;
}
}
/**
* All events that the client can send to the Realtime API
*/
export type RealtimeClientEvent = SessionUpdateEvent | InputAudioBufferAppendEvent | InputAudioBufferCommitEvent | InputAudioBufferClearEvent | ConversationItemCreateEvent | ConversationItemTruncateEvent | ConversationItemDeleteEvent | ResponseCreateEvent | ResponseCancelEvent;
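// Illustrative usage sketch: a small typed helper that serializes any client event from
// the union above before sending it over the Realtime API connection.
function sendClientEvent(ws: WebSocket, event: RealtimeClientEvent): void {
  ws.send(JSON.stringify(event));
}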
/**
* The response resource.
*/
export interface RealtimeResponse {
/**
* The unique ID of the response.
*/
id?: string;
/**
* Developer-provided string key-value pairs associated with this response.
*/
metadata?: unknown | null;
/**
* The object type, must be `realtime.response`.
*/
object?: 'realtime.response';
/**
* The list of output items generated by the response.
*/
output?: Array<ConversationItem>;
/**
* The final status of the response (`completed`, `cancelled`, `failed`, or
* `incomplete`).
*/
status?: 'completed' | 'cancelled' | 'failed' | 'incomplete';
/**
* Additional details about the status.
*/
status_details?: RealtimeResponseStatus;
/**
* Usage statistics for the Response; this will correspond to billing. A Realtime
* API session will maintain a conversation context and append new Items to the
* Conversation, thus output from previous turns (text and audio tokens) will
* become the input for later turns.
*/
usage?: RealtimeResponseUsage;
}
/**
* Additional details about the status.
*/
export interface RealtimeResponseStatus {
/**
* A description of the error that caused the response to fail, populated when the
* `status` is `failed`.
*/
error?: RealtimeResponseStatus.Error;
/**
* The reason the Response did not complete. For a `cancelled` Response, one of
* `turn_detected` (the server VAD detected a new start of speech) or
* `client_cancelled` (the client sent a cancel event). For an `incomplete`
* Response, one of `max_output_tokens` or `content_filter` (the server-side safety
* filter activated and cut off the response).
*/
reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';
/**
* The type of error that caused the response to fail, corresponding with the
* `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
*/
type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';
}
export declare namespace RealtimeResponseStatus {
/**
* A description of the error that caused the response to fail, populated when the
* `status` is `failed`.
*/
interface Error {
/**
* Error code, if any.
*/
code?: string;
/**
* The type of error.
*/
type?: string;
}
}
/**
* Usage statistics for the Response; this will correspond to billing. A Realtime
* API session will maintain a conversation context and append new Items to the
* Conversation, thus output from previous turns (text and audio tokens) will
* become the input for later turns.
*/
export interface RealtimeResponseUsage {
/**
* Details about the input tokens used in the Response.
*/
input_token_details?: RealtimeResponseUsage.InputTokenDetails;
/**
* The number of input tokens used in the Response, including text and audio
* tokens.
*/
input_tokens?: number;
/**
* Details about the output tokens used in the Response.
*/
output_token_details?: RealtimeResponseUsage.OutputTokenDetails;
/**
* The number of output tokens sent in the Response, including text and audio
* tokens.
*/
output_tokens?: number;
/**
* The total number of tokens in the Response including input and output text and
* audio tokens.
*/
total_tokens?: number;
}
export declare namespace RealtimeResponseUsage {
/**
* Details about the input tokens used in the Response.
*/
interface InputTokenDetails {
/**
* The number of audio tokens used in the Response.
*/
audio_tokens?: number;
/**
* The number of cached tokens used in the Response.
*/
cached_tokens?: number;
/**
* The number of text tokens used in the Response.
*/
text_tokens?: number;
}
/**
* Details about the output tokens used in the Response.
*/
interface OutputTokenDetails {
/**
* The number of audio tokens used in the Response.
*/
audio_tokens?: number;
/**
* The number of text tokens used in the Response.
*/
text_tokens?: number;
}
}
/**
* All events that the Realtime API can send back
*/
export type RealtimeServerEvent = ErrorEvent | SessionCreatedEvent | SessionUpdatedEvent | ConversationCreatedEvent | InputAudioBufferCommittedEvent | InputAudioBufferClearedEvent | InputAudioBufferSpeechStartedEvent | InputAudioBufferSpeechStoppedEvent | ConversationItemCreatedEvent | ConversationItemInputAudioTranscriptionCompletedEvent | ConversationItemInputAudioTranscriptionFailedEvent | ConversationItemTruncatedEvent | ConversationItemDeletedEvent | ResponseCreatedEvent | ResponseDoneEvent | ResponseOutputItemAddedEvent | ResponseOutputItemDoneEvent | ResponseContentPartAddedEvent | ResponseContentPartDoneEvent | ResponseTextDeltaEvent | ResponseTextDoneEvent | ResponseAudioTranscriptDeltaEvent | ResponseAudioTranscriptDoneEvent | ResponseAudioDeltaEvent | ResponseAudioDoneEvent | ResponseFunctionCallArgumentsDeltaEvent | ResponseFunctionCallArgumentsDoneEvent | RateLimitsUpdatedEvent;
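// Illustrative usage sketch: dispatching on the `type` discriminant of incoming server
// events. `raw` is the text payload of a WebSocket message; `playAudioChunk` and
// `appendTranscript` are assumed application-level callbacks.
function handleServerEvent(
  raw: string,
  playAudioChunk: (pcm: Uint8Array) => void,
  appendTranscript: (text: string) => void,
): void {
  const event = JSON.parse(raw) as RealtimeServerEvent;
  switch (event.type) {
    case 'response.audio.delta':
      // Narrowed to ResponseAudioDeltaEvent: decode the base64 delta and queue it for playback.
      playAudioChunk(Uint8Array.from(atob(event.delta), (c) => c.charCodeAt(0)));
      break;
    case 'response.audio_transcript.delta':
      appendTranscript(event.delta);
      break;
    case 'response.done':
      console.log('Response finished with status:', event.response.status);
      break;
    case 'error':
      console.error('Realtime error:', event.error.message);
      break;
  }
}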
/**
* Returned when the model-generated audio is updated.
*/
export interface ResponseAudioDeltaEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* Base64-encoded audio data delta.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.audio.delta`.
*/
type: 'response.audio.delta';
}
/**
* Returned when the model-generated audio is done. Also emitted when a Response is
* interrupted, incomplete, or cancelled.
*/
export interface ResponseAudioDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.audio.done`.
*/
type: 'response.audio.done';
}
/**
* Returned when the model-generated transcription of audio output is updated.
*/
export interface ResponseAudioTranscriptDeltaEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The transcript delta.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.audio_transcript.delta`.
*/
type: 'response.audio_transcript.delta';
}
/**
* Returned when the model-generated transcription of audio output is done
* streaming. Also emitted when a Response is interrupted, incomplete, or
* cancelled.
*/
export interface ResponseAudioTranscriptDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The final transcript of the audio.
*/
transcript: string;
/**
* The event type, must be `response.audio_transcript.done`.
*/
type: 'response.audio_transcript.done';
}
/**
* Send this event to cancel an in-progress response. The server will respond with
* a `response.cancelled` event or an error if there is no response to cancel.
*/
export interface ResponseCancelEvent {
/**
* The event type, must be `response.cancel`.
*/
type: 'response.cancel';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
/**
* A specific response ID to cancel; if not provided, the server will cancel an in-progress
* response in the default conversation.
*/
response_id?: string;
}
/**
* Returned when a new content part is added to an assistant message item during
* response generation.
*/
export interface ResponseContentPartAddedEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item to which the content part was added.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The content part that was added.
*/
part: ResponseContentPartAddedEvent.Part;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.content_part.added`.
*/
type: 'response.content_part.added';
}
export declare namespace ResponseContentPartAddedEvent {
/**
* The content part that was added.
*/
interface Part {
/**
* Base64-encoded audio data (if type is "audio").
*/
audio?: string;
/**
* The text content (if type is "text").
*/
text?: string;
/**
* The transcript of the audio (if type is "audio").
*/
transcript?: string;
/**
* The content type ("text", "audio").
*/
type?: 'text' | 'audio';
}
}
/**
* Returned when a content part is done streaming in an assistant message item.
* Also emitted when a Response is interrupted, incomplete, or cancelled.
*/
export interface ResponseContentPartDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The content part that is done.
*/
part: ResponseContentPartDoneEvent.Part;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.content_part.done`.
*/
type: 'response.content_part.done';
}
export declare namespace ResponseContentPartDoneEvent {
/**
* The content part that is done.
*/
interface Part {
/**
* Base64-encoded audio data (if type is "audio").
*/
audio?: string;
/**
* The text content (if type is "text").
*/
text?: string;
/**
* The transcript of the audio (if type is "audio").
*/
transcript?: string;
/**
* The content type ("text", "audio").
*/
type?: 'text' | 'audio';
}
}
/**
* This event instructs the server to create a Response, which means triggering
* model inference. When in Server VAD mode, the server will create Responses
* automatically.
*
* A Response will include at least one Item, and may have two, in which case the
* second will be a function call. These Items will be appended to the conversation
* history.
*
* The server will respond with a `response.created` event, events for Items and
* content created, and finally a `response.done` event to indicate the Response is
* complete.
*
* The `response.create` event includes inference configuration like
* `instructions`, and `temperature`. These fields will override the Session's
* configuration for this Response only.
*/
export interface ResponseCreateEvent {
/**
* The event type, must be `response.create`.
*/
type: 'response.create';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
/**
* Create a new Realtime response with these parameters
*/
response?: ResponseCreateEvent.Response;
}
export declare namespace ResponseCreateEvent {
/**
* Create a new Realtime response with these parameters
*/
interface Response {
/**
* Controls which conversation the response is added to. Currently supports `auto`
* and `none`, with `auto` as the default value. The `auto` value means that the
* contents of the response will be added to the default conversation. Set this to
* `none` to create an out-of-band response which will not add items to the default
* conversation.
*/
conversation?: (string & {}) | 'auto' | 'none';
/**
* Input items to include in the prompt for the model. Creates a new context for
* this response, without including the default conversation. Can include
* references to items from the default conversation.
*/
input?: Array<RealtimeAPI.ConversationItem>;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* Set of 16 key-value pairs that can be attached to an object. This can be useful
* for storing additional information about the object in a structured format. Keys
* can be a maximum of 64 characters long and values can be a maximum of 512
* characters long.
*/
metadata?: unknown | null;
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function, like `{"type": "function", "function": {"name": "my_function"}}`.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<Response.Tool>;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
namespace Response {
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
}
}
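// Illustrative usage sketch: requesting a text-only response that overrides the
// session's instructions and temperature for this Response only.
function requestTextResponse(ws: WebSocket): void {
  const event: ResponseCreateEvent = {
    type: 'response.create',
    response: {
      modalities: ['text'],
      instructions: 'Answer in one short sentence.',
      temperature: 0.7,
      max_response_output_tokens: 256,
    },
  };
  ws.send(JSON.stringify(event));
}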
/**
* Returned when a new Response is created. The first event of response creation,
* where the response is in an initial state of `in_progress`.
*/
export interface ResponseCreatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The response resource.
*/
response: RealtimeResponse;
/**
* The event type, must be `response.created`.
*/
type: 'response.created';
}
/**
* Returned when a Response is done streaming. Always emitted, no matter the final
* state. The Response object included in the `response.done` event will include
* all output Items in the Response but will omit the raw audio data.
*/
export interface ResponseDoneEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The response resource.
*/
response: RealtimeResponse;
/**
* The event type, must be `response.done`.
*/
type: 'response.done';
}
/**
* Returned when the model-generated function call arguments are updated.
*/
export interface ResponseFunctionCallArgumentsDeltaEvent {
/**
* The ID of the function call.
*/
call_id: string;
/**
* The arguments delta as a JSON string.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the function call item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.function_call_arguments.delta`.
*/
type: 'response.function_call_arguments.delta';
}
/**
* Returned when the model-generated function call arguments are done streaming.
* Also emitted when a Response is interrupted, incomplete, or cancelled.
*/
export interface ResponseFunctionCallArgumentsDoneEvent {
/**
* The final arguments as a JSON string.
*/
arguments: string;
/**
* The ID of the function call.
*/
call_id: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the function call item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.function_call_arguments.done`.
*/
type: 'response.function_call_arguments.done';
}
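// Illustrative usage sketch: once the function call arguments are done streaming, run
// the tool locally and return its result as a `function_call_output` item, then ask the
// model to continue. `runTool` is an assumed application-level function.
async function onFunctionCallDone(
  ws: WebSocket,
  event: ResponseFunctionCallArgumentsDoneEvent,
  runTool: (args: unknown) => Promise<unknown>,
): Promise<void> {
  const result = await runTool(JSON.parse(event.arguments));
  const output: ConversationItemCreateEvent = {
    type: 'conversation.item.create',
    item: {
      type: 'function_call_output',
      call_id: event.call_id,
      output: JSON.stringify(result),
    },
  };
  const respond: ResponseCreateEvent = { type: 'response.create' };
  ws.send(JSON.stringify(output));
  ws.send(JSON.stringify(respond));
}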
/**
* Returned when a new Item is created during Response generation.
*/
export interface ResponseOutputItemAddedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The item to add to the conversation.
*/
item: ConversationItem;
/**
* The index of the output item in the Response.
*/
output_index: number;
/**
* The ID of the Response to which the item belongs.
*/
response_id: string;
/**
* The event type, must be `response.output_item.added`.
*/
type: 'response.output_item.added';
}
/**
* Returned when an Item is done streaming. Also emitted when a Response is
* interrupted, incomplete, or cancelled.
*/
export interface ResponseOutputItemDoneEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The item to add to the conversation.
*/
item: ConversationItem;
/**
* The index of the output item in the Response.
*/
output_index: number;
/**
* The ID of the Response to which the item belongs.
*/
response_id: string;
/**
* The event type, must be `response.output_item.done`.
*/
type: 'response.output_item.done';
}
/**
* Returned when the text value of a "text" content part is updated.
*/
export interface ResponseTextDeltaEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The text delta.
*/
delta: string;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The event type, must be `response.text.delta`.
*/
type: 'response.text.delta';
}
/**
* Returned when the text value of a "text" content part is done streaming. Also
* emitted when a Response is interrupted, incomplete, or cancelled.
*/
export interface ResponseTextDoneEvent {
/**
* The index of the content part in the item's content array.
*/
content_index: number;
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* The ID of the item.
*/
item_id: string;
/**
* The index of the output item in the response.
*/
output_index: number;
/**
* The ID of the response.
*/
response_id: string;
/**
* The final text content.
*/
text: string;
/**
* The event type, must be `response.text.done`.
*/
type: 'response.text.done';
}
/**
* Returned when a Session is created. Emitted automatically when a new connection
* is established as the first server event. This event will contain the default
* Session configuration.
*/
export interface SessionCreatedEvent {
/**
* The unique ID of the server event.
*/
event_id: string;
/**
* Realtime session object configuration.
*/
session: SessionsAPI.Session;
/**
* The event type, must be `session.created`.
*/
type: 'session.created';
}
/**
* Send this event to update the session’s default configuration. The client may
* send this event at any time to update the session configuration, and any field
* may be updated at any time, except for "voice". The server will respond with a
* `session.updated` event that shows the full effective configuration. Only fields
* that are present are updated; thus the correct way to clear a field like
* "instructions" is to pass an empty string.
*/
export interface SessionUpdateEvent {
/**
* Realtime session object configuration.
*/
session: SessionUpdateEvent.Session;
/**
* The event type, must be `session.update`.
*/
type: 'session.update';
/**
* Optional client-generated ID used to identify this event.
*/
event_id?: string;
}
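// Illustrative usage sketch: updating the session mid-stream to set instructions and
// enable Whisper input transcription. Only the fields present in `session` are updated.
function configureSession(ws: WebSocket): void {
  const event: SessionUpdateEvent = {
    type: 'session.update',
    session: {
      model: 'gpt-4o-realtime-preview',
      instructions: 'You are a concise, friendly voice assistant.',
      input_audio_transcription: { model: 'whisper-1' },
    },
  };
  ws.send(JSON.stringify(event));
}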
export declare namespace SessionUpdateEvent {
/**
* Realtime session object configuration.
*/
interface Session {
/**
* The Realtime model used for this session.
*/
model: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: Session.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<Session.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: Session.TurnDetection;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
namespace Session {
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription, `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
interface TurnDetection {
/**
* Whet