openai
The official TypeScript library for the OpenAI API
import { APIResource } from "../../../resource.js";
import * as Core from "../../../core.js";
export declare class Sessions extends APIResource {
/**
* Create an ephemeral API token for use in client-side applications with the
* Realtime API. Can be configured with the same session parameters as the
* `session.update` client event.
*
* It responds with a session object, plus a `client_secret` key which contains a
* usable ephemeral API token that can be used to authenticate browser clients for
* the Realtime API.
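*
* @example
* A minimal usage sketch; it assumes the resource is exposed as
* `client.beta.realtime.sessions` on the generated OpenAI client and that
* `OPENAI_API_KEY` is set in the environment.
* ```ts
* import OpenAI from 'openai';
*
* const client = new OpenAI();
* const session = await client.beta.realtime.sessions.create({
*   model: 'gpt-4o-realtime-preview-2024-12-17',
*   modalities: ['text', 'audio'],
*   voice: 'verse',
* });
* console.log(session.client_secret?.value);
* ```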
*/
create(body: SessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<SessionCreateResponse>;
}
/**
* Realtime session object configuration.
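*
* @example
* An illustrative `Session`-shaped configuration, such as might be sent with a
* `session.update` client event. The values are assumptions chosen from the
* documented options and defaults below, not required settings.
* ```ts
* const sessionConfig: Session = {
*   modalities: ['text', 'audio'],
*   voice: 'alloy',
*   input_audio_format: 'pcm16',
*   output_audio_format: 'pcm16',
*   turn_detection: {
*     type: 'server_vad',
*     threshold: 0.5,
*     prefix_padding_ms: 300,
*     silence_duration_ms: 500,
*   },
*   temperature: 0.8,
*   max_response_output_tokens: 'inf',
* };
* ```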
*/
export interface Session {
/**
* Unique identifier for the session object.
*/
id?: string;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: Session.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The Realtime model used for this session.
*/
model?: (string & {}) | 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
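*
* @example
* A sketch of a single function tool entry. The function name, description, and
* JSON Schema are illustrative, not part of the API.
* ```ts
* const tools: Array<Session.Tool> = [
*   {
*     type: 'function',
*     name: 'get_weather',
*     description: 'Look up the current weather for a city. Tell the user you are checking.',
*     parameters: {
*       type: 'object',
*       properties: { city: { type: 'string' } },
*       required: ['city'],
*     },
*   },
* ];
* ```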
*/
tools?: Array<Session.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: Session.TurnDetection | null;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
export declare namespace Session {
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription; `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
interface TurnDetection {
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: 'server_vad';
}
}
/**
* A new Realtime session configuration, with an ephemeral key. Default TTL for
* keys is one minute.
*/
export interface SessionCreateResponse {
/**
* Ephemeral key returned by the API.
*/
client_secret?: SessionCreateResponse.ClientSecret;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: string;
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: string;
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<SessionCreateResponse.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: SessionCreateResponse.TurnDetection;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
export declare namespace SessionCreateResponse {
/**
* Ephemeral key returned by the API.
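*
* @example
* A sketch of an expiry check before handing the key to a browser client,
* assuming `expires_at` is a Unix timestamp in seconds.
* ```ts
* function isExpired(secret: SessionCreateResponse.ClientSecret): boolean {
*   return secret.expires_at !== undefined && secret.expires_at * 1000 <= Date.now();
* }
* ```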
*/
interface ClientSecret {
/**
* Timestamp for when the token expires. Currently, all tokens expire after one
* minute.
*/
expires_at?: number;
/**
* Ephemeral key usable in client environments to authenticate connections to the
* Realtime API. Use this in client-side environments rather than a standard API
* token, which should only be used server-side.
*/
value?: string;
}
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription; `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
interface TurnDetection {
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: string;
}
}
export interface SessionCreateParams {
/**
* The Realtime model used for this session.
*/
model: 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-10-01' | 'gpt-4o-realtime-preview-2024-12-17' | 'gpt-4o-mini-realtime-preview' | 'gpt-4o-mini-realtime-preview-2024-12-17';
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
/**
* The default system instructions (i.e. system message) prepended to model calls.
* This field allows the client to guide the model on desired responses. The model
* can be instructed on response content and format (e.g. "be extremely succinct",
* "act friendly", "here are examples of good responses") and on audio behavior
* (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
* instructions are not guaranteed to be followed by the model, but they provide
* guidance to the model on the desired behavior.
*
* Note that the server sets default instructions which will be used if this field
* is not set and are visible in the `session.created` event at the start of the
* session.
*/
instructions?: string;
/**
* Maximum number of output tokens for a single assistant response, inclusive of
* tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
* `inf` for the maximum available tokens for a given model. Defaults to `inf`.
*/
max_response_output_tokens?: number | 'inf';
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
*/
temperature?: number;
/**
* How the model chooses tools. Options are `auto`, `none`, `required`, or specify
* a function.
*/
tool_choice?: string;
/**
* Tools (functions) available to the model.
*/
tools?: Array<SessionCreateParams.Tool>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: SessionCreateParams.TurnDetection;
/**
* The voice the model uses to respond. Voice cannot be changed during the session
* once the model has responded with audio at least once. Current voice options are
* `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
*/
voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
}
export declare namespace SessionCreateParams {
/**
* Configuration for input audio transcription. Defaults to off and can be set to
* `null` to turn it off once enabled. Input audio transcription is not native to the
* model, since the model consumes audio directly. Transcription runs
* asynchronously through Whisper and should be treated as rough guidance rather
* than the representation understood by the model.
*/
interface InputAudioTranscription {
/**
* The model to use for transcription; `whisper-1` is the only currently supported
* model.
*/
model?: string;
}
interface Tool {
/**
* The description of the function, including guidance on when and how to call it,
* and guidance about what to tell the user when calling (if anything).
*/
description?: string;
/**
* The name of the function.
*/
name?: string;
/**
* Parameters of the function in JSON Schema.
*/
parameters?: unknown;
/**
* The type of the tool, i.e. `function`.
*/
type?: 'function';
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
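*
* @example
* An illustrative server-VAD configuration that suppresses automatic responses.
* The numeric values are assumptions matching the documented defaults.
* ```ts
* const turnDetection: SessionCreateParams.TurnDetection = {
*   type: 'server_vad',
*   create_response: false,
*   threshold: 0.5,
*   prefix_padding_ms: 300,
*   silence_duration_ms: 500,
* };
* ```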
*/
interface TurnDetection {
/**
* Whether or not to automatically generate a response when VAD is enabled. `true`
* by default.
*/
create_response?: boolean;
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: string;
}
}
export declare namespace Sessions {
export { type Session as Session, type SessionCreateResponse as SessionCreateResponse, type SessionCreateParams as SessionCreateParams, };
}
//# sourceMappingURL=sessions.d.ts.map