@ai-sdk/elevenlabs
Version:
The **[ElevenLabs provider](https://ai-sdk.dev/providers/ai-sdk-providers/elevenlabs)** for the [AI SDK](https://ai-sdk.dev/docs) contains language model support for the ElevenLabs chat and completion APIs and embedding model support for the ElevenLabs embeddings API.
101 lines (94 loc) • 3.12 kB
text/typescript
/**
 * Segmentation settings accepted by every entry of `additional_formats`.
 */
type TranscriptSegmentationOptions = {
  max_segment_chars?: number;
  max_segment_duration_s?: number;
  segment_on_silence_longer_than_s?: number;
};

/**
 * Speaker / timestamp toggles accepted by every export format
 * except 'segmented_json'.
 */
type TranscriptAnnotationOptions = {
  include_speakers?: boolean;
  include_timestamps?: boolean;
};

/**
 * Line-length limit accepted only by the 'srt' and 'txt' export formats.
 */
type TranscriptLineLengthOptions = {
  max_characters_per_line?: number;
};

/**
 * One requested export of the transcript, discriminated by `format`.
 */
type TranscriptExportFormat =
  | ({ format: 'docx' } & TranscriptAnnotationOptions &
      TranscriptSegmentationOptions)
  | ({ format: 'html' } & TranscriptAnnotationOptions &
      TranscriptSegmentationOptions)
  | ({ format: 'pdf' } & TranscriptAnnotationOptions &
      TranscriptSegmentationOptions)
  | ({ format: 'segmented_json' } & TranscriptSegmentationOptions)
  | ({ format: 'srt' } & TranscriptAnnotationOptions &
      TranscriptLineLengthOptions &
      TranscriptSegmentationOptions)
  | ({ format: 'txt' } & TranscriptAnnotationOptions &
      TranscriptLineLengthOptions &
      TranscriptSegmentationOptions);

/**
 * Optional request parameters for the ElevenLabs transcription API.
 * Property names are snake_case; every property may be omitted.
 */
export type ElevenLabsTranscriptionAPITypes = {
  /**
   * ISO-639-1 or ISO-639-3 code of the language spoken in the audio file.
   * Supplying it can sometimes improve transcription performance.
   * Defaults to null, in which case the language is detected automatically.
   */
  language_code?: string;

  /**
   * Whether audio events such as (laughter) or (footsteps) are tagged
   * in the transcription.
   * @default true
   */
  tag_audio_events?: boolean;

  /**
   * Upper bound on the number of speakers talking in the uploaded file;
   * can help with predicting who speaks when.
   * At most 32 speakers can be predicted.
   * Defaults to null, in which case the maximum value supported by the
   * model is used.
   * @min 1
   * @max 32
   */
  num_speakers?: number;

  /**
   * Granularity of the timestamps in the transcription.
   * 'word' yields word-level timestamps; 'character' yields
   * character-level timestamps per word.
   * 'none' is also accepted (presumably omits timestamps; confirm
   * against the API reference).
   * @default 'word'
   */
  timestamps_granularity?: 'none' | 'word' | 'character';

  /**
   * Whether to annotate which speaker is currently talking in the
   * uploaded file.
   * @default false
   */
  diarize?: boolean;

  /**
   * Additional formats to export the transcript to.
   * Each entry is selected by its `format` value.
   */
  additional_formats?: TranscriptExportFormat[];

  /**
   * Format of the input audio.
   * With 'pcm_s16le_16' the audio must be 16-bit PCM at a 16kHz sample
   * rate, single channel (mono), little-endian byte order; latency will be
   * lower than when passing an encoded waveform.
   * @default 'other'
   */
  file_format?: 'pcm_s16le_16' | 'other';
};