openai
The official TypeScript library for the OpenAI API
import { APIResource } from "../../../resource.js";
import * as Core from "../../../core.js";
export declare class Sessions extends APIResource {
/**
* Create an ephemeral API token for use in client-side applications with the
* Realtime API. Can be configured with the same session parameters as the
* `session.update` client event.
*
* It responds with a session object, plus a `client_secret` key which contains a
* usable ephemeral API token that can be used to authenticate browser clients for
* the Realtime API.
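*
* @example
* A minimal usage sketch; it assumes the resource is exposed as
* `client.beta.realtime.sessions` on the generated OpenAI client and that
* `OPENAI_API_KEY` is set in the environment.
* ```ts
* import OpenAI from 'openai';
*
* const client = new OpenAI();
* const session = await client.beta.realtime.sessions.create({
*   model: 'gpt-4o-realtime-preview-2024-12-17',
*   modalities: ['text', 'audio'],
*   voice: 'verse',
* });
* console.log(session.client_secret?.value);
* ```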
*/
create(body: SessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<SessionCreateResponse>;
}
/**
* Realtime session object configuration.
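*
* @example
* An illustrative `Session`-shaped configuration, such as might be sent with a
* `session.update` client event. The values are assumptions chosen from the
* documented options and defaults below, not required settings.
* ```ts
* const sessionConfig: Session = {
*   modalities: ['text', 'audio'],
*   voice: 'alloy',
*   input_audio_format: 'pcm16',
*   output_audio_format: 'pcm16',
*   turn_detection: {
*     type: 'server_vad',
*     threshold: 0.5,
*     prefix_padding_ms: 300,
*     silence_duration_ms: 500,
*   },
*   temperature: 0.8,
*   max_response_output_tokens: 'inf',
* };
* ```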
*/
export interface Session {
/**
* Unique identifier for the session object.
*/
id?: string;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: Session.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The Realtime model used for this session.
*/
model?: (string & {}) | 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
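*
* @example
* A sketch of a single function tool entry. The function name, description, and
* JSON Schema are illustrative, not part of the API.
* ```ts
* const tools: Array<Session.Tool> = [
*   {
*     type: 'function',
*     name: 'get_weather',
*     description: 'Look up the current weather for a city. Tell the user you are checking.',
*     parameters: {
*       type: 'object',
*       properties: { city: { type: 'string' } },
*       required: ['city'],
*     },
*   },
* ];
* ```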
*/
tools?: Array<Session.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: Session.TurnDetection | null;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
export declare namespace Session {
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription; `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
interface TurnDetection {
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: 'server_vad';
}
}
/**
* A new Realtime session configuration, with an ephemeral key. Default TTL for
* keys is one minute.
*/
export interface SessionCreateResponse {
/**
* Ephemeral key returned by the API.
*/
client_secret?: SessionCreateResponse.ClientSecret;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: string;
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: string;
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<SessionCreateResponse.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: SessionCreateResponse.TurnDetection;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
export declare namespace SessionCreateResponse {
/**
* Ephemeral key returned by the API.
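*
* @example
* A sketch of an expiry check before handing the key to a browser client,
* assuming `expires_at` is a Unix timestamp in seconds.
* ```ts
* function isExpired(secret: SessionCreateResponse.ClientSecret): boolean {
*   return secret.expires_at !== undefined && secret.expires_at * 1000 <= Date.now();
* }
* ```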
*/
interface ClientSecret {
/**
* Timestamp for when the token expires. Currently, all tokens expire after one
* minute.
*/
expires_at?: number;
/**
* Ephemeral key usable in client environments to authenticate connections to the
* Realtime API. Use this in client-side environments rather than a standard API
* token, which should only be used server-side.
*/
value?: string;
}
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription; `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
interface TurnDetection {
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: string;
}
}
export interface SessionCreateParams {
/**
* The Realtime model used for this session.
*/
model: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<SessionCreateParams.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: SessionCreateParams.TurnDetection;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
export declare namespace SessionCreateParams {
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription; `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
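*
* @example
* An illustrative server-VAD configuration that suppresses automatic responses.
* The numeric values are assumptions matching the documented defaults.
* ```ts
* const turnDetection: SessionCreateParams.TurnDetection = {
*   type: 'server_vad',
*   create_response: false,
*   threshold: 0.5,
*   prefix_padding_ms: 300,
*   silence_duration_ms: 500,
* };
* ```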
*/
interface TurnDetection {
/**
* Whether or not to automatically generate a response when VAD is enabled. `true`
* by default.
*/
create_response?: boolean;
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: string;
}
}
export declare namespace Sessions {
export { type Session as Session, type SessionCreateResponse as SessionCreateResponse, type SessionCreateParams as SessionCreateParams, };
}
//# sourceMappingURL=sessions.d.ts.map