// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
import { APIResource } from '../../../resource';
import * as Core from '../../../core';
export class TranscriptionSessions extends APIResource {
/**
* Create an ephemeral API token for use in client-side applications with the
* Realtime API specifically for realtime transcriptions. Can be configured with
* the same session parameters as the `transcription_session.update` client event.
*
* It responds with a session object, plus a `client_secret` key which contains a
* usable ephemeral API token that can be used to authenticate browser clients for
* the Realtime API.
*/
create(
body: TranscriptionSessionCreateParams,
options?: Core.RequestOptions,
): Core.APIPromise<TranscriptionSession> {
return this._client.post('/realtime/transcription_sessions', {
body,
...options,
headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers },
});
}
}
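// Usage sketch (not part of the generated file): minting an ephemeral transcription
// token server-side and handing it to a browser client. This assumes the resource is
// exposed on the client as `client.beta.realtime.transcriptionSessions`; verify the
// accessor path against your installed SDK version.
//
//   import OpenAI from 'openai';
//
//   const client = new OpenAI(); // reads OPENAI_API_KEY from the environment
//   const session = await client.beta.realtime.transcriptionSessions.create({
//     input_audio_format: 'pcm16',
//     input_audio_transcription: { model: 'gpt-4o-mini-transcribe', language: 'en' },
//   });
//   // Send `session.client_secret.value` to the browser; it expires after ~1 minute.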
/**
* A new Realtime transcription session configuration.
*
* When a session is created on the server via REST API, the session object also
* contains an ephemeral key. Default TTL for keys is one minute. This property is
* not present when a session is updated via the WebSocket API.
*/
export interface TranscriptionSession {
/**
* Ephemeral key returned by the API. Only present when the session is created on
* the server via REST API.
*/
client_secret: TranscriptionSession.ClientSecret;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
*/
input_audio_format?: string;
/**
* Configuration of the transcription model.
*/
input_audio_transcription?: TranscriptionSession.InputAudioTranscription;
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
turn_detection?: TranscriptionSession.TurnDetection;
}
export namespace TranscriptionSession {
/**
* Ephemeral key returned by the API. Only present when the session is created on
* the server via REST API.
*/
export interface ClientSecret {
/**
* Timestamp for when the token expires. Currently, all tokens expire after one
* minute.
*/
expires_at: number;
/**
* Ephemeral key usable in client environments to authenticate connections to the
* Realtime API. Use this in client-side environments rather than a standard API
* token, which should only be used server-side.
*/
value: string;
}
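// Sketch (assumption): `expires_at` is a Unix timestamp in seconds, so a client can
// check remaining validity before opening a Realtime connection:
//
//   const secondsLeft = clientSecret.expires_at - Math.floor(Date.now() / 1000);
//   if (secondsLeft <= 0) {
//     // Token already expired; request a new transcription session server-side.
//   }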
/**
* Configuration of the transcription model.
*/
export interface InputAudioTranscription {
/**
* The language of the input audio. Supplying the input language in
* [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
* format will improve accuracy and latency.
*/
language?: string;
/**
* The model to use for transcription. Can be `gpt-4o-transcribe`,
* `gpt-4o-mini-transcribe`, or `whisper-1`.
*/
model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
/**
* Optional text to guide the model's style or continue a previous audio
* segment. The
* [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
* should match the audio language.
*/
prompt?: string;
}
/**
* Configuration for turn detection. Can be set to `null` to turn off. Server VAD
* means that the model will detect the start and end of speech based on audio
* volume and respond at the end of user speech.
*/
export interface TurnDetection {
/**
* Amount of audio to include before the VAD detected speech (in milliseconds).
* Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
* With shorter values the model will respond more quickly, but may jump in on
* short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
* threshold will require louder audio to activate the model, and thus might
* perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection; only `server_vad` is currently supported.
*/
type?: string;
}
}
export interface TranscriptionSessionCreateParams {
/**
* The set of items to include in the transcription. Currently available items are:
*
* - `item.input_audio_transcription.logprobs`
*/
include?: Array<string>;
/**
* The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
* `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
* (mono), and little-endian byte order.
*/
input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
/**
* Configuration for input audio noise reduction. This can be set to `null` to turn
* off. Noise reduction filters audio added to the input audio buffer before it is
* sent to VAD and the model. Filtering the audio can improve VAD and turn
* detection accuracy (reducing false positives) and model performance by improving
* perception of the input audio.
*/
input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
/**
* Configuration for input audio transcription. The client can optionally set the
* language and prompt for transcription; these offer additional guidance to the
* transcription service.
*/
input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
/**
* The set of modalities the model can respond with. To disable audio, set this to
* ["text"].
*/
modalities?: Array<'text' | 'audio'>;
/**
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
* set to `null` to turn off, in which case the client must manually trigger model
* response. Server VAD means that the model will detect the start and end of
* speech based on audio volume and respond at the end of user speech. Semantic VAD
* is more advanced and uses a turn detection model (in conjunction with VAD) to
* semantically estimate whether the user has finished speaking, then dynamically
* sets a timeout based on this probability. For example, if user audio trails off
* with "uhhm", the model will score a low probability of turn end and wait longer
* for the user to continue speaking. This can be useful for more natural
* conversations, but may have a higher latency.
*/
turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
}
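// Example request body (sketch): a transcription session using semantic VAD and
// near-field noise reduction. Values shown are illustrative, not defaults.
//
//   const params: TranscriptionSessionCreateParams = {
//     include: ['item.input_audio_transcription.logprobs'],
//     input_audio_format: 'pcm16',
//     input_audio_noise_reduction: { type: 'near_field' },
//     input_audio_transcription: {
//       model: 'gpt-4o-transcribe',
//       language: 'en',
//       prompt: 'expect words related to technology',
//     },
//     turn_detection: { type: 'semantic_vad', eagerness: 'auto' },
//   };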
export namespace TranscriptionSessionCreateParams {
/**
* Configuration for input audio noise reduction. This can be set to `null` to turn
* off. Noise reduction filters audio added to the input audio buffer before it is
* sent to VAD and the model. Filtering the audio can improve VAD and turn
* detection accuracy (reducing false positives) and model performance by improving
* perception of the input audio.
*/
export interface InputAudioNoiseReduction {
/**
* Type of noise reduction. `near_field` is for close-talking microphones such as
* headphones, `far_field` is for far-field microphones such as laptop or
* conference room microphones.
*/
type?: 'near_field' | 'far_field';
}
/**
* Configuration for input audio transcription. The client can optionally set the
* language and prompt for transcription; these offer additional guidance to the
* transcription service.
*/
export interface InputAudioTranscription {
/**
* The language of the input audio. Supplying the input language in
* [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
* format will improve accuracy and latency.
*/
language?: string;
/**
* The model to use for transcription, current options are `gpt-4o-transcribe`,
* `gpt-4o-mini-transcribe`, and `whisper-1`.
*/
model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
/**
* Optional text to guide the model's style or continue a previous audio
* segment. For `whisper-1`, the
* [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
* For `gpt-4o-transcribe` models, the prompt is a free text string, for example
* "expect words related to technology".
*/
prompt?: string;
}
/**
* Configuration for turn detection, either Server VAD or Semantic VAD. This can be
* set to `null` to turn off, in which case the client must manually trigger model
* response. Server VAD means that the model will detect the start and end of
* speech based on audio volume and respond at the end of user speech. Semantic VAD
* is more advanced and uses a turn detection model (in conjunction with VAD) to
* semantically estimate whether the user has finished speaking, then dynamically
* sets a timeout based on this probability. For example, if user audio trails off
* with "uhhm", the model will score a low probability of turn end and wait longer
* for the user to continue speaking. This can be useful for more natural
* conversations, but may have a higher latency.
*/
export interface TurnDetection {
/**
* Whether or not to automatically generate a response when a VAD stop event
* occurs. Not available for transcription sessions.
*/
create_response?: boolean;
/**
* Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
* will wait longer for the user to continue speaking, `high` will respond more
* quickly. `auto` is the default and is equivalent to `medium`.
*/
eagerness?: 'low' | 'medium' | 'high' | 'auto';
/**
* Whether or not to automatically interrupt any ongoing response with output to
* the default conversation (i.e. `conversation` set to `auto`) when a VAD start event
* occurs. Not available for transcription sessions.
*/
interrupt_response?: boolean;
/**
* Used only for `server_vad` mode. Amount of audio to include before the VAD
* detected speech (in milliseconds). Defaults to 300ms.
*/
prefix_padding_ms?: number;
/**
* Used only for `server_vad` mode. Duration of silence to detect speech stop (in
* milliseconds). Defaults to 500ms. With shorter values the model will respond
* more quickly, but may jump in on short pauses from the user.
*/
silence_duration_ms?: number;
/**
* Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0);
* defaults to 0.5. A higher threshold will require louder audio to activate the
* model, and thus might perform better in noisy environments.
*/
threshold?: number;
/**
* Type of turn detection.
*/
type?: 'server_vad' | 'semantic_vad';
}
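// Example (sketch): a server VAD configuration tuned for a noisy environment, using
// a higher threshold and a longer silence window. Numbers are illustrative only.
//
//   const turnDetection: TranscriptionSessionCreateParams.TurnDetection = {
//     type: 'server_vad',
//     threshold: 0.7,
//     prefix_padding_ms: 300,
//     silence_duration_ms: 800,
//   };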
}
export declare namespace TranscriptionSessions {
export {
type TranscriptionSession as TranscriptionSession,
type TranscriptionSessionCreateParams as TranscriptionSessionCreateParams,
};
}