openai

The official TypeScript library for the OpenAI API
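The declarations that follow define the client and server event types for the Realtime (beta) API. As a rough orientation, here is a minimal, illustrative sketch of how these types might be used from client code over a raw WebSocket. It is not part of this file; the import path, the `ws` dependency, the endpoint URL and headers, and the `send` helper are assumptions rather than anything this file defines (the model name is taken from the `SessionUpdateEvent` model union declared below).

import WebSocket from 'ws';
import type {
  RealtimeClientEvent,
  RealtimeServerEvent,
} from 'openai/resources/beta/realtime/realtime';

// Endpoint, model, and headers are assumptions based on the public Realtime beta docs.
const ws = new WebSocket(
  'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17',
  {
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      'OpenAI-Beta': 'realtime=v1',
    },
  },
);

// Hypothetical helper that keeps outgoing messages typed as RealtimeClientEvent.
function send(event: RealtimeClientEvent): void {
  ws.send(JSON.stringify(event));
}

ws.on('open', () => {
  // Add a user message to the conversation, then ask the model for a text response.
  send({
    type: 'conversation.item.create',
    item: {
      type: 'message',
      role: 'user',
      content: [{ type: 'input_text', text: 'Say hello.' }],
    },
  });
  send({ type: 'response.create', response: { modalities: ['text'] } });
});

ws.on('message', (data) => {
  const event = JSON.parse(data.toString()) as RealtimeServerEvent;
  // The `type` field discriminates the server event union declared below.
  if (event.type === 'response.text.delta') {
    process.stdout.write(event.delta);
  } else if (event.type === 'response.done') {
    ws.close();
  }
});

The type declarations themselves follow.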
import { APIResource } from "../../../resource.js"; import * as RealtimeAPI from "./realtime.js"; import * as SessionsAPI from "./sessions.js"; import { Session as SessionsAPISession, SessionCreateParams, SessionCreateResponse, Sessions } from "./sessions.js"; export declare class Realtime extends APIResource { sessions: SessionsAPI.Sessions; } /** * Returned when a conversation is created. Emitted right after session creation. */ export interface ConversationCreatedEvent { /** * The conversation resource. */ conversation: ConversationCreatedEvent.Conversation; /** * The unique ID of the server event. */ event_id: string; /** * The event type, must be `conversation.created`. */ type: 'conversation.created'; } export declare namespace ConversationCreatedEvent { /** * The conversation resource. */ interface Conversation { /** * The unique ID of the conversation. */ id?: string; /** * The object type, must be `realtime.conversation`. */ object?: 'realtime.conversation'; } } /** * The item to add to the conversation. */ export interface ConversationItem { /** * The unique ID of the item, this can be generated by the client to help manage * server-side context, but is not required because the server will generate one if * not provided. */ id?: string; /** * The arguments of the function call (for `function_call` items). */ arguments?: string; /** * The ID of the function call (for `function_call` and `function_call_output` * items). If passed on a `function_call_output` item, the server will check that a * `function_call` item with the same ID exists in the conversation history. */ call_id?: string; /** * The content of the message, applicable for `message` items. * * - Message items of role `system` support only `input_text` content * - Message items of role `user` support `input_text` and `input_audio` content * - Message items of role `assistant` support `text` content. */ content?: Array<ConversationItemContent>; /** * The name of the function being called (for `function_call` items). */ name?: string; /** * Identifier for the API object being returned - always `realtime.item`. */ object?: 'realtime.item'; /** * The output of the function call (for `function_call_output` items). */ output?: string; /** * The role of the message sender (`user`, `assistant`, `system`), only applicable * for `message` items. */ role?: 'user' | 'assistant' | 'system'; /** * The status of the item (`completed`, `incomplete`). These have no effect on the * conversation, but are accepted for consistency with the * `conversation.item.created` event. */ status?: 'completed' | 'incomplete'; /** * The type of the item (`message`, `function_call`, `function_call_output`). */ type?: 'message' | 'function_call' | 'function_call_output'; } export interface ConversationItemContent { /** * ID of a previous conversation item to reference (for `item_reference` content * types in `response.create` events). These can reference both client and server * created items. */ id?: string; /** * Base64-encoded audio bytes, used for `input_audio` content type. */ audio?: string; /** * The text content, used for `input_text` and `text` content types. */ text?: string; /** * The transcript of the audio, used for `input_audio` content type. */ transcript?: string; /** * The content type (`input_text`, `input_audio`, `item_reference`, `text`). */ type?: 'input_text' | 'input_audio' | 'item_reference' | 'text'; } /** * Add a new Item to the Conversation's context, including messages, function * calls, and function call responses. 
This event can be used both to populate a * "history" of the conversation and to add new items mid-stream, but has the * current limitation that it cannot populate assistant audio messages. * * If successful, the server will respond with a `conversation.item.created` event, * otherwise an `error` event will be sent. */ export interface ConversationItemCreateEvent { /** * The item to add to the conversation. */ item: ConversationItem; /** * The event type, must be `conversation.item.create`. */ type: 'conversation.item.create'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; /** * The ID of the preceding item after which the new item will be inserted. If not * set, the new item will be appended to the end of the conversation. If set, it * allows an item to be inserted mid-conversation. If the ID cannot be found, an * error will be returned and the item will not be added. */ previous_item_id?: string; } /** * Returned when a conversation item is created. There are several scenarios that * produce this event: * * - The server is generating a Response, which if successful will produce either * one or two Items, which will be of type `message` (role `assistant`) or type * `function_call`. * - The input audio buffer has been committed, either by the client or the server * (in `server_vad` mode). The server will take the content of the input audio * buffer and add it to a new user message Item. * - The client has sent a `conversation.item.create` event to add a new Item to * the Conversation. */ export interface ConversationItemCreatedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The item to add to the conversation. */ item: ConversationItem; /** * The ID of the preceding item in the Conversation context, allows the client to * understand the order of the conversation. */ previous_item_id: string; /** * The event type, must be `conversation.item.created`. */ type: 'conversation.item.created'; } /** * Send this event when you want to remove any item from the conversation history. * The server will respond with a `conversation.item.deleted` event, unless the * item does not exist in the conversation history, in which case the server will * respond with an error. */ export interface ConversationItemDeleteEvent { /** * The ID of the item to delete. */ item_id: string; /** * The event type, must be `conversation.item.delete`. */ type: 'conversation.item.delete'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when an item in the conversation is deleted by the client with a * `conversation.item.delete` event. This event is used to synchronize the server's * understanding of the conversation history with the client's view. */ export interface ConversationItemDeletedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item that was deleted. */ item_id: string; /** * The event type, must be `conversation.item.deleted`. */ type: 'conversation.item.deleted'; } /** * This event is the output of audio transcription for user audio written to the * user audio buffer. Transcription begins when the input audio buffer is committed * by the client or server (in `server_vad` mode). Transcription runs * asynchronously with Response creation, so this event may come before or after * the Response events. 
* * Realtime API models accept audio natively, and thus input transcription is a * separate process run on a separate ASR (Automatic Speech Recognition) model, * currently always `whisper-1`. Thus the transcript may diverge somewhat from the * model's interpretation, and should be treated as a rough guide. */ export interface ConversationItemInputAudioTranscriptionCompletedEvent { /** * The index of the content part containing the audio. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item containing the audio. */ item_id: string; /** * The transcribed text. */ transcript: string; /** * The event type, must be `conversation.item.input_audio_transcription.completed`. */ type: 'conversation.item.input_audio_transcription.completed'; } /** * Returned when input audio transcription is configured, and a transcription * request for a user message failed. These events are separate from other `error` * events so that the client can identify the related Item. */ export interface ConversationItemInputAudioTranscriptionFailedEvent { /** * The index of the content part containing the audio. */ content_index: number; /** * Details of the transcription error. */ error: ConversationItemInputAudioTranscriptionFailedEvent.Error; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item. */ item_id: string; /** * The event type, must be `conversation.item.input_audio_transcription.failed`. */ type: 'conversation.item.input_audio_transcription.failed'; } export declare namespace ConversationItemInputAudioTranscriptionFailedEvent { /** * Details of the transcription error. */ interface Error { /** * Error code, if any. */ code?: string; /** * A human-readable error message. */ message?: string; /** * Parameter related to the error, if any. */ param?: string; /** * The type of error. */ type?: string; } } /** * Send this event to truncate a previous assistant message’s audio. The server * will produce audio faster than realtime, so this event is useful when the user * interrupts to truncate audio that has already been sent to the client but not * yet played. This will synchronize the server's understanding of the audio with * the client's playback. * * Truncating audio will delete the server-side text transcript to ensure there is * not text in the context that hasn't been heard by the user. * * If successful, the server will respond with a `conversation.item.truncated` * event. */ export interface ConversationItemTruncateEvent { /** * Inclusive duration up to which audio is truncated, in milliseconds. If the * audio_end_ms is greater than the actual audio duration, the server will respond * with an error. */ audio_end_ms: number; /** * The index of the content part to truncate. Set this to 0. */ content_index: number; /** * The ID of the assistant message item to truncate. Only assistant message items * can be truncated. */ item_id: string; /** * The event type, must be `conversation.item.truncate`. */ type: 'conversation.item.truncate'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when an earlier assistant audio message item is truncated by the client * with a `conversation.item.truncate` event. This event is used to synchronize the * server's understanding of the audio with the client's playback. 
* * This action will truncate the audio and remove the server-side text transcript * to ensure there is no text in the context that hasn't been heard by the user. */ export interface ConversationItemTruncatedEvent { /** * The duration up to which the audio was truncated, in milliseconds. */ audio_end_ms: number; /** * The index of the content part that was truncated. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the assistant message item that was truncated. */ item_id: string; /** * The event type, must be `conversation.item.truncated`. */ type: 'conversation.item.truncated'; } /** * Returned when an error occurs, which could be a client problem or a server * problem. Most errors are recoverable and the session will stay open; we * recommend that implementors monitor and log error messages by default. */ export interface ErrorEvent { /** * Details of the error. */ error: ErrorEvent.Error; /** * The unique ID of the server event. */ event_id: string; /** * The event type, must be `error`. */ type: 'error'; } export declare namespace ErrorEvent { /** * Details of the error. */ interface Error { /** * A human-readable error message. */ message: string; /** * The type of error (e.g., "invalid_request_error", "server_error"). */ type: string; /** * Error code, if any. */ code?: string | null; /** * The event_id of the client event that caused the error, if applicable. */ event_id?: string | null; /** * Parameter related to the error, if any. */ param?: string | null; } } /** * Send this event to append audio bytes to the input audio buffer. The audio * buffer is temporary storage you can write to and later commit. In Server VAD * mode, the audio buffer is used to detect speech and the server will decide when * to commit. When Server VAD is disabled, you must commit the audio buffer * manually. * * The client may choose how much audio to place in each event up to a maximum of * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD * to be more responsive. Unlike most other client events, the server will not send * a confirmation response to this event. */ export interface InputAudioBufferAppendEvent { /** * Base64-encoded audio bytes. This must be in the format specified by the * `input_audio_format` field in the session configuration. */ audio: string; /** * The event type, must be `input_audio_buffer.append`. */ type: 'input_audio_buffer.append'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Send this event to clear the audio bytes in the buffer. The server will respond * with an `input_audio_buffer.cleared` event. */ export interface InputAudioBufferClearEvent { /** * The event type, must be `input_audio_buffer.clear`. */ type: 'input_audio_buffer.clear'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when the input audio buffer is cleared by the client with an * `input_audio_buffer.clear` event. */ export interface InputAudioBufferClearedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The event type, must be `input_audio_buffer.cleared`. */ type: 'input_audio_buffer.cleared'; } /** * Send this event to commit the user input audio buffer, which will create a new * user message item in the conversation. This event will produce an error if the * input audio buffer is empty.
When in Server VAD mode, the client does not need * to send this event, the server will commit the audio buffer automatically. * * Committing the input audio buffer will trigger input audio transcription (if * enabled in session configuration), but it will not create a response from the * model. The server will respond with an `input_audio_buffer.committed` event. */ export interface InputAudioBufferCommitEvent { /** * The event type, must be `input_audio_buffer.commit`. */ type: 'input_audio_buffer.commit'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } /** * Returned when an input audio buffer is committed, either by the client or * automatically in server VAD mode. The `item_id` property is the ID of the user * message item that will be created, thus a `conversation.item.created` event will * also be sent to the client. */ export interface InputAudioBufferCommittedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item that will be created. */ item_id: string; /** * The ID of the preceding item after which the new item will be inserted. */ previous_item_id: string; /** * The event type, must be `input_audio_buffer.committed`. */ type: 'input_audio_buffer.committed'; } /** * Sent by the server when in `server_vad` mode to indicate that speech has been * detected in the audio buffer. This can happen any time audio is added to the * buffer (unless speech is already detected). The client may want to use this * event to interrupt audio playback or provide visual feedback to the user. * * The client should expect to receive a `input_audio_buffer.speech_stopped` event * when speech stops. The `item_id` property is the ID of the user message item * that will be created when speech stops and will also be included in the * `input_audio_buffer.speech_stopped` event (unless the client manually commits * the audio buffer during VAD activation). */ export interface InputAudioBufferSpeechStartedEvent { /** * Milliseconds from the start of all audio written to the buffer during the * session when speech was first detected. This will correspond to the beginning of * audio sent to the model, and thus includes the `prefix_padding_ms` configured in * the Session. */ audio_start_ms: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item that will be created when speech stops. */ item_id: string; /** * The event type, must be `input_audio_buffer.speech_started`. */ type: 'input_audio_buffer.speech_started'; } /** * Returned in `server_vad` mode when the server detects the end of speech in the * audio buffer. The server will also send an `conversation.item.created` event * with the user message item that is created from the audio buffer. */ export interface InputAudioBufferSpeechStoppedEvent { /** * Milliseconds since the session started when speech stopped. This will correspond * to the end of audio sent to the model, and thus includes the * `min_silence_duration_ms` configured in the Session. */ audio_end_ms: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the user message item that will be created. */ item_id: string; /** * The event type, must be `input_audio_buffer.speech_stopped`. */ type: 'input_audio_buffer.speech_stopped'; } /** * Emitted at the beginning of a Response to indicate the updated rate limits. 
When * a Response is created some tokens will be "reserved" for the output tokens, the * rate limits shown here reflect that reservation, which is then adjusted * accordingly once the Response is completed. */ export interface RateLimitsUpdatedEvent { /** * The unique ID of the server event. */ event_id: string; /** * List of rate limit information. */ rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>; /** * The event type, must be `rate_limits.updated`. */ type: 'rate_limits.updated'; } export declare namespace RateLimitsUpdatedEvent { interface RateLimit { /** * The maximum allowed value for the rate limit. */ limit?: number; /** * The name of the rate limit (`requests`, `tokens`). */ name?: 'requests' | 'tokens'; /** * The remaining value before the limit is reached. */ remaining?: number; /** * Seconds until the rate limit resets. */ reset_seconds?: number; } } /** * All events that the client can send to the Realtime API */ export type RealtimeClientEvent = SessionUpdateEvent | InputAudioBufferAppendEvent | InputAudioBufferCommitEvent | InputAudioBufferClearEvent | ConversationItemCreateEvent | ConversationItemTruncateEvent | ConversationItemDeleteEvent | ResponseCreateEvent | ResponseCancelEvent; /** * The response resource. */ export interface RealtimeResponse { /** * The unique ID of the response. */ id?: string; /** * Developer-provided string key-value pairs associated with this response. */ metadata?: unknown | null; /** * The object type, must be `realtime.response`. */ object?: 'realtime.response'; /** * The list of output items generated by the response. */ output?: Array<ConversationItem>; /** * The final status of the response (`completed`, `cancelled`, `failed`, or * `incomplete`). */ status?: 'completed' | 'cancelled' | 'failed' | 'incomplete'; /** * Additional details about the status. */ status_details?: RealtimeResponseStatus; /** * Usage statistics for the Response, this will correspond to billing. A Realtime * API session will maintain a conversation context and append new Items to the * Conversation, thus output from previous turns (text and audio tokens) will * become the input for later turns. */ usage?: RealtimeResponseUsage; } /** * Additional details about the status. */ export interface RealtimeResponseStatus { /** * A description of the error that caused the response to fail, populated when the * `status` is `failed`. */ error?: RealtimeResponseStatus.Error; /** * The reason the Response did not complete. For a `cancelled` Response, one of * `turn_detected` (the server VAD detected a new start of speech) or * `client_cancelled` (the client sent a cancel event). For an `incomplete` * Response, one of `max_output_tokens` or `content_filter` (the server-side safety * filter activated and cut off the response). */ reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter'; /** * The type of error that caused the response to fail, corresponding with the * `status` field (`completed`, `cancelled`, `incomplete`, `failed`). */ type?: 'completed' | 'cancelled' | 'incomplete' | 'failed'; } export declare namespace RealtimeResponseStatus { /** * A description of the error that caused the response to fail, populated when the * `status` is `failed`. */ interface Error { /** * Error code, if any. */ code?: string; /** * The type of error. */ type?: string; } } /** * Usage statistics for the Response, this will correspond to billing. 
A Realtime * API session will maintain a conversation context and append new Items to the * Conversation, thus output from previous turns (text and audio tokens) will * become the input for later turns. */ export interface RealtimeResponseUsage { /** * Details about the input tokens used in the Response. */ input_token_details?: RealtimeResponseUsage.InputTokenDetails; /** * The number of input tokens used in the Response, including text and audio * tokens. */ input_tokens?: number; /** * Details about the output tokens used in the Response. */ output_token_details?: RealtimeResponseUsage.OutputTokenDetails; /** * The number of output tokens sent in the Response, including text and audio * tokens. */ output_tokens?: number; /** * The total number of tokens in the Response including input and output text and * audio tokens. */ total_tokens?: number; } export declare namespace RealtimeResponseUsage { /** * Details about the input tokens used in the Response. */ interface InputTokenDetails { /** * The number of audio tokens used in the Response. */ audio_tokens?: number; /** * The number of cached tokens used in the Response. */ cached_tokens?: number; /** * The number of text tokens used in the Response. */ text_tokens?: number; } /** * Details about the output tokens used in the Response. */ interface OutputTokenDetails { /** * The number of audio tokens used in the Response. */ audio_tokens?: number; /** * The number of text tokens used in the Response. */ text_tokens?: number; } } /** * All events that the Realtime API can send back */ export type RealtimeServerEvent = ErrorEvent | SessionCreatedEvent | SessionUpdatedEvent | ConversationCreatedEvent | InputAudioBufferCommittedEvent | InputAudioBufferClearedEvent | InputAudioBufferSpeechStartedEvent | InputAudioBufferSpeechStoppedEvent | ConversationItemCreatedEvent | ConversationItemInputAudioTranscriptionCompletedEvent | ConversationItemInputAudioTranscriptionFailedEvent | ConversationItemTruncatedEvent | ConversationItemDeletedEvent | ResponseCreatedEvent | ResponseDoneEvent | ResponseOutputItemAddedEvent | ResponseOutputItemDoneEvent | ResponseContentPartAddedEvent | ResponseContentPartDoneEvent | ResponseTextDeltaEvent | ResponseTextDoneEvent | ResponseAudioTranscriptDeltaEvent | ResponseAudioTranscriptDoneEvent | ResponseAudioDeltaEvent | ResponseAudioDoneEvent | ResponseFunctionCallArgumentsDeltaEvent | ResponseFunctionCallArgumentsDoneEvent | RateLimitsUpdatedEvent; /** * Returned when the model-generated audio is updated. */ export interface ResponseAudioDeltaEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * Base64-encoded audio data delta. */ delta: string; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.audio.delta`. */ type: 'response.audio.delta'; } /** * Returned when the model-generated audio is done. Also emitted when a Response is * interrupted, incomplete, or cancelled. */ export interface ResponseAudioDoneEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. 
*/ response_id: string; /** * The event type, must be `response.audio.done`. */ type: 'response.audio.done'; } /** * Returned when the model-generated transcription of audio output is updated. */ export interface ResponseAudioTranscriptDeltaEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The transcript delta. */ delta: string; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.audio_transcript.delta`. */ type: 'response.audio_transcript.delta'; } /** * Returned when the model-generated transcription of audio output is done * streaming. Also emitted when a Response is interrupted, incomplete, or * cancelled. */ export interface ResponseAudioTranscriptDoneEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The final transcript of the audio. */ transcript: string; /** * The event type, must be `response.audio_transcript.done`. */ type: 'response.audio_transcript.done'; } /** * Send this event to cancel an in-progress response. The server will respond with * a `response.cancelled` event or an error if there is no response to cancel. */ export interface ResponseCancelEvent { /** * The event type, must be `response.cancel`. */ type: 'response.cancel'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; /** * A specific response ID to cancel - if not provided, will cancel an in-progress * response in the default conversation. */ response_id?: string; } /** * Returned when a new content part is added to an assistant message item during * response generation. */ export interface ResponseContentPartAddedEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item to which the content part was added. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The content part that was added. */ part: ResponseContentPartAddedEvent.Part; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.content_part.added`. */ type: 'response.content_part.added'; } export declare namespace ResponseContentPartAddedEvent { /** * The content part that was added. */ interface Part { /** * Base64-encoded audio data (if type is "audio"). */ audio?: string; /** * The text content (if type is "text"). */ text?: string; /** * The transcript of the audio (if type is "audio"). */ transcript?: string; /** * The content type ("text", "audio"). */ type?: 'text' | 'audio'; } } /** * Returned when a content part is done streaming in an assistant message item. * Also emitted when a Response is interrupted, incomplete, or cancelled. */ export interface ResponseContentPartDoneEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. 
*/ output_index: number; /** * The content part that is done. */ part: ResponseContentPartDoneEvent.Part; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.content_part.done`. */ type: 'response.content_part.done'; } export declare namespace ResponseContentPartDoneEvent { /** * The content part that is done. */ interface Part { /** * Base64-encoded audio data (if type is "audio"). */ audio?: string; /** * The text content (if type is "text"). */ text?: string; /** * The transcript of the audio (if type is "audio"). */ transcript?: string; /** * The content type ("text", "audio"). */ type?: 'text' | 'audio'; } } /** * This event instructs the server to create a Response, which means triggering * model inference. When in Server VAD mode, the server will create Responses * automatically. * * A Response will include at least one Item, and may have two, in which case the * second will be a function call. These Items will be appended to the conversation * history. * * The server will respond with a `response.created` event, events for Items and * content created, and finally a `response.done` event to indicate the Response is * complete. * * The `response.create` event includes inference configuration like * `instructions`, and `temperature`. These fields will override the Session's * configuration for this Response only. */ export interface ResponseCreateEvent { /** * The event type, must be `response.create`. */ type: 'response.create'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; /** * Create a new Realtime response with these parameters */ response?: ResponseCreateEvent.Response; } export declare namespace ResponseCreateEvent { /** * Create a new Realtime response with these parameters */ interface Response { /** * Controls which conversation the response is added to. Currently supports `auto` * and `none`, with `auto` as the default value. The `auto` value means that the * contents of the response will be added to the default conversation. Set this to * `none` to create an out-of-band response which will not add items to default * conversation. */ conversation?: (string & {}) | 'auto' | 'none'; /** * Input items to include in the prompt for the model. Creates a new context for * this response, without including the default conversation. Can include * references to items from the default conversation. */ input?: Array<RealtimeAPI.ConversationItem>; /** * The default system instructions (i.e. system message) prepended to model calls. * This field allows the client to guide the model on desired responses. The model * can be instructed on response content and format, (e.g. "be extremely succinct", * "act friendly", "here are examples of good responses") and on audio behavior * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The * instructions are not guaranteed to be followed by the model, but they provide * guidance to the model on the desired behavior. * * Note that the server sets default instructions which will be used if this field * is not set and are visible in the `session.created` event at the start of the * session. */ instructions?: string; /** * Maximum number of output tokens for a single assistant response, inclusive of * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or * `inf` for the maximum available tokens for a given model. Defaults to `inf`. 
*/ max_response_output_tokens?: number | 'inf'; /** * Set of 16 key-value pairs that can be attached to an object. This can be useful * for storing additional information about the object in a structured format. Keys * can be a maximum of 64 characters long and values can be a maximum of 512 * characters long. */ metadata?: unknown | null; /** * The set of modalities the model can respond with. To disable audio, set this to * ["text"]. */ modalities?: Array<'text' | 'audio'>; /** * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. */ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; /** * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ temperature?: number; /** * How the model chooses tools. Options are `auto`, `none`, `required`, or specify * a function, like `{"type": "function", "function": {"name": "my_function"}}`. */ tool_choice?: string; /** * Tools (functions) available to the model. */ tools?: Array<Response.Tool>; /** * The voice the model uses to respond. Voice cannot be changed during the session * once the model has responded with audio at least once. Current voice options are * `alloy`, `ash`, `ballad`, `coral`, `echo` `sage`, `shimmer` and `verse`. */ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; } namespace Response { interface Tool { /** * The description of the function, including guidance on when and how to call it, * and guidance about what to tell the user when calling (if anything). */ description?: string; /** * The name of the function. */ name?: string; /** * Parameters of the function in JSON Schema. */ parameters?: unknown; /** * The type of the tool, i.e. `function`. */ type?: 'function'; } } } /** * Returned when a new Response is created. The first event of response creation, * where the response is in an initial state of `in_progress`. */ export interface ResponseCreatedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The response resource. */ response: RealtimeResponse; /** * The event type, must be `response.created`. */ type: 'response.created'; } /** * Returned when a Response is done streaming. Always emitted, no matter the final * state. The Response object included in the `response.done` event will include * all output Items in the Response but will omit the raw audio data. */ export interface ResponseDoneEvent { /** * The unique ID of the server event. */ event_id: string; /** * The response resource. */ response: RealtimeResponse; /** * The event type, must be `response.done`. */ type: 'response.done'; } /** * Returned when the model-generated function call arguments are updated. */ export interface ResponseFunctionCallArgumentsDeltaEvent { /** * The ID of the function call. */ call_id: string; /** * The arguments delta as a JSON string. */ delta: string; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the function call item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.function_call_arguments.delta`. */ type: 'response.function_call_arguments.delta'; } /** * Returned when the model-generated function call arguments are done streaming. * Also emitted when a Response is interrupted, incomplete, or cancelled. */ export interface ResponseFunctionCallArgumentsDoneEvent { /** * The final arguments as a JSON string. 
*/ arguments: string; /** * The ID of the function call. */ call_id: string; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the function call item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.function_call_arguments.done`. */ type: 'response.function_call_arguments.done'; } /** * Returned when a new Item is created during Response generation. */ export interface ResponseOutputItemAddedEvent { /** * The unique ID of the server event. */ event_id: string; /** * The item to add to the conversation. */ item: ConversationItem; /** * The index of the output item in the Response. */ output_index: number; /** * The ID of the Response to which the item belongs. */ response_id: string; /** * The event type, must be `response.output_item.added`. */ type: 'response.output_item.added'; } /** * Returned when an Item is done streaming. Also emitted when a Response is * interrupted, incomplete, or cancelled. */ export interface ResponseOutputItemDoneEvent { /** * The unique ID of the server event. */ event_id: string; /** * The item to add to the conversation. */ item: ConversationItem; /** * The index of the output item in the Response. */ output_index: number; /** * The ID of the Response to which the item belongs. */ response_id: string; /** * The event type, must be `response.output_item.done`. */ type: 'response.output_item.done'; } /** * Returned when the text value of a "text" content part is updated. */ export interface ResponseTextDeltaEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The text delta. */ delta: string; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The event type, must be `response.text.delta`. */ type: 'response.text.delta'; } /** * Returned when the text value of a "text" content part is done streaming. Also * emitted when a Response is interrupted, incomplete, or cancelled. */ export interface ResponseTextDoneEvent { /** * The index of the content part in the item's content array. */ content_index: number; /** * The unique ID of the server event. */ event_id: string; /** * The ID of the item. */ item_id: string; /** * The index of the output item in the response. */ output_index: number; /** * The ID of the response. */ response_id: string; /** * The final text content. */ text: string; /** * The event type, must be `response.text.done`. */ type: 'response.text.done'; } /** * Returned when a Session is created. Emitted automatically when a new connection * is established as the first server event. This event will contain the default * Session configuration. */ export interface SessionCreatedEvent { /** * The unique ID of the server event. */ event_id: string; /** * Realtime session object configuration. */ session: SessionsAPI.Session; /** * The event type, must be `session.created`. */ type: 'session.created'; } /** * Send this event to update the session’s default configuration. The client may * send this event at any time to update the session configuration, and any field * may be updated at any time, except for "voice". The server will respond with a * `session.updated` event that shows the full effective configuration. 
Only fields * that are present are updated, thus the correct way to clear a field like * "instructions" is to pass an empty string. */ export interface SessionUpdateEvent { /** * Realtime session object configuration. */ session: SessionUpdateEvent.Session; /** * The event type, must be `session.update`. */ type: 'session.update'; /** * Optional client-generated ID used to identify this event. */ event_id?: string; } export declare namespace SessionUpdateEvent { /** * Realtime session object configuration. */ interface Session { /** * The Realtime model used for this session. */ model: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17'; /** * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. */ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; /** * Configuration for input audio transcription, defaults to off and can be set to * `null` to turn off once on. Input audio transcription is not native to the * model, since the model consumes audio directly. Transcription runs * asynchronously through Whisper and should be treated as rough guidance rather * than the representation understood by the model. */ input_audio_transcription?: Session.InputAudioTranscription; /** * The default system instructions (i.e. system message) prepended to model calls. * This field allows the client to guide the model on desired responses. The model * can be instructed on response content and format, (e.g. "be extremely succinct", * "act friendly", "here are examples of good responses") and on audio behavior * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The * instructions are not guaranteed to be followed by the model, but they provide * guidance to the model on the desired behavior. * * Note that the server sets default instructions which will be used if this field * is not set and are visible in the `session.created` event at the start of the * session. */ instructions?: string; /** * Maximum number of output tokens for a single assistant response, inclusive of * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or * `inf` for the maximum available tokens for a given model. Defaults to `inf`. */ max_response_output_tokens?: number | 'inf'; /** * The set of modalities the model can respond with. To disable audio, set this to * ["text"]. */ modalities?: Array<'text' | 'audio'>; /** * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. */ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; /** * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ temperature?: number; /** * How the model chooses tools. Options are `auto`, `none`, `required`, or specify * a function. */ tool_choice?: string; /** * Tools (functions) available to the model. */ tools?: Array<Session.Tool>; /** * Configuration for turn detection. Can be set to `null` to turn off. Server VAD * means that the model will detect the start and end of speech based on audio * volume and respond at the end of user speech. */ turn_detection?: Session.TurnDetection; /** * The voice the model uses to respond. Voice cannot be changed during the session * once the model has responded with audio at least once. Current voice options are * `alloy`, `ash`, `ballad`, `coral`, `echo` `sage`, `shimmer` and `verse`. 
*/ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; } namespace Session { /** * Configuration for input audio transcription. Defaults to off; once enabled, it can be set to * `null` to turn it off again. Input audio transcription is not native to the * model, since the model consumes audio directly. Transcription runs * asynchronously through Whisper and should be treated as rough guidance rather * than the representation understood by the model. */ interface InputAudioTranscription { /** * The model to use for transcription; `whisper-1` is the only currently supported * model. */ model?: string; } interface Tool { /** * The description of the function, including guidance on when and how to call it, * and guidance about what to tell the user when calling (if anything). */ description?: string; /** * The name of the function. */ name?: string; /** * Parameters of the function in JSON Schema. */ parameters?: unknown; /** * The type of the tool, i.e. `function`. */ type?: 'function'; } /** * Configuration for turn detection. Can be set to `null` to turn off. Server VAD * means that the model will detect the start and end of speech based on audio * volume and respond at the end of user speech. */ interface TurnDetection { /** * Whet