@ai-sdk/elevenlabs
Version:
The **[ElevenLabs provider](https://ai-sdk.dev/providers/ai-sdk-providers/elevenlabs)** for the [AI SDK](https://ai-sdk.dev/docs) contains language model support for the ElevenLabs chat and completion APIs and embedding model support for the ElevenLabs embeddings API.
184 lines (165 loc) • 5.57 kB
text/typescript
import type { TranscriptionModelV3, SharedV3Warning } from '@ai-sdk/provider';
import {
combineHeaders,
convertBase64ToUint8Array,
createJsonResponseHandler,
mediaTypeToExtension,
parseProviderOptions,
postFormDataToApi,
} from '@ai-sdk/provider-utils';
import { z } from 'zod/v4';
import type { ElevenLabsConfig } from './elevenlabs-config';
import { elevenlabsFailedResponseHandler } from './elevenlabs-error';
import type { ElevenLabsTranscriptionModelId } from './elevenlabs-transcription-options';
import type { ElevenLabsTranscriptionAPITypes } from './elevenlabs-api-types';
// Schema for the provider options read from the `elevenlabs` key of
// `providerOptions` when building a transcription request.
// Every field accepts `null`/`undefined`; fields with `.default(...)` fall back
// to that value when omitted.
// API reference:
// https://elevenlabs.io/docs/api-reference/speech-to-text/convert
const elevenLabsTranscriptionModelOptionsSchema = z.object({
// Optional language code string; forwarded as the `language_code` form field.
languageCode: z.string().nullish(),
// Forwarded as `tag_audio_events`; defaults to `true` when omitted.
tagAudioEvents: z.boolean().nullish().default(true),
// Integer between 1 and 32 (inclusive); forwarded as `num_speakers`.
numSpeakers: z.number().int().min(1).max(32).nullish(),
// One of 'none' | 'word' | 'character'; forwarded as
// `timestamps_granularity`; defaults to 'word' when omitted.
timestampsGranularity: z
.enum(['none', 'word', 'character'])
.nullish()
.default('word'),
// Forwarded as the `diarize` form field; defaults to `false` when omitted.
diarize: z.boolean().nullish().default(false),
// One of 'pcm_s16le_16' | 'other'; forwarded as `file_format`; defaults to
// 'other' when omitted.
fileFormat: z.enum(['pcm_s16le_16', 'other']).nullish().default('other'),
});
/**
 * Type of the ElevenLabs transcription provider options, inferred from
 * `elevenLabsTranscriptionModelOptionsSchema`.
 */
export type ElevenLabsTranscriptionModelOptions = z.infer<
typeof elevenLabsTranscriptionModelOptionsSchema
>;
/**
 * Configuration consumed by `ElevenLabsTranscriptionModel`: everything in
 * `ElevenLabsConfig` plus optional internal hooks.
 */
interface ElevenLabsTranscriptionModelConfig extends ElevenLabsConfig {
// Optional internal overrides.
_internal?: {
// Clock override: when provided, `doGenerate` uses its return value as the
// response timestamp instead of `new Date()`.
currentDate?: () => Date;
};
}
/**
 * Transcription model that sends audio to the ElevenLabs
 * `/v1/speech-to-text` endpoint as multipart form data and maps the JSON
 * response onto the `TranscriptionModelV3` result shape.
 */
export class ElevenLabsTranscriptionModel implements TranscriptionModelV3 {
  readonly specificationVersion = 'v3';

  /** Provider name, taken from the model configuration. */
  get provider(): string {
    return this.config.provider;
  }

  /**
   * @param modelId ElevenLabs transcription model identifier; sent as the
   *   `model_id` form field and echoed in the response metadata.
   * @param config Provider configuration (URL builder, headers, fetch, and
   *   optional internal hooks).
   */
  constructor(
    readonly modelId: ElevenLabsTranscriptionModelId,
    private readonly config: ElevenLabsTranscriptionModelConfig,
  ) {}

  /**
   * Builds the multipart form body for a transcription request.
   *
   * @returns The populated `FormData` and the list of warnings (currently
   *   always empty).
   */
  private async getArgs({
    audio,
    mediaType,
    providerOptions,
  }: Parameters<TranscriptionModelV3['doGenerate']>[0]) {
    const warnings: SharedV3Warning[] = [];

    // Validate the `elevenlabs` entry of the provider options (if any).
    const elevenlabsOptions = await parseProviderOptions({
      provider: 'elevenlabs',
      providerOptions,
      schema: elevenLabsTranscriptionModelOptionsSchema,
    });

    // Audio arrives either as raw bytes or as a base64 string.
    const formData = new FormData();
    const blob =
      audio instanceof Uint8Array
        ? new Blob([audio])
        : new Blob([convertBase64ToUint8Array(audio)]);

    formData.append('model_id', this.modelId);

    // The filename passed to `append` carries an extension derived from the
    // media type.
    const fileExtension = mediaTypeToExtension(mediaType);
    formData.append(
      'file',
      new File([blob], 'audio', { type: mediaType }),
      `audio.${fileExtension}`,
    );

    // Diarization is requested unless a provider option overrides it below.
    formData.append('diarize', 'true');

    if (elevenlabsOptions) {
      // Use `set`, not `append`: `append` would add a second `diarize` field
      // next to the default above and the request would carry two conflicting
      // values.
      if (typeof elevenlabsOptions.diarize === 'boolean') {
        formData.set('diarize', String(elevenlabsOptions.diarize));
      }

      // Map camelCase provider options to the snake_case API field names;
      // `null` is normalized to `undefined` so it is skipped below.
      const transcriptionModelOptions: ElevenLabsTranscriptionAPITypes = {
        language_code: elevenlabsOptions.languageCode ?? undefined,
        tag_audio_events: elevenlabsOptions.tagAudioEvents ?? undefined,
        num_speakers: elevenlabsOptions.numSpeakers ?? undefined,
        timestamps_granularity:
          elevenlabsOptions.timestampsGranularity ?? undefined,
        file_format: elevenlabsOptions.fileFormat ?? undefined,
      };

      for (const [key, value] of Object.entries(transcriptionModelOptions)) {
        if (value !== undefined) {
          formData.append(key, String(value));
        }
      }
    }

    return {
      formData,
      warnings,
    };
  }

  /**
   * Sends the audio to ElevenLabs and returns the transcription.
   *
   * Each entry of the response's `words` array becomes one segment; missing
   * start/end times are reported as `0`. The duration is the `end` time of
   * the last entry, or `undefined` when that is unavailable.
   */
  async doGenerate(
    options: Parameters<TranscriptionModelV3['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
    // Captured before the request; `_internal.currentDate` can override it.
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { formData, warnings } = await this.getArgs(options);

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse,
    } = await postFormDataToApi({
      url: this.config.url({
        path: '/v1/speech-to-text',
        modelId: this.modelId,
      }),
      headers: combineHeaders(this.config.headers(), options.headers),
      formData,
      failedResponseHandler: elevenlabsFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        elevenlabsTranscriptionResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    return {
      text: response.text,
      segments:
        response.words?.map(word => ({
          text: word.text,
          startSecond: word.start ?? 0,
          endSecond: word.end ?? 0,
        })) ?? [],
      language: response.language_code,
      durationInSeconds: response.words?.at(-1)?.end ?? undefined,
      warnings,
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
    };
  }
}
// Per-character entry nested inside a word entry of the response.
const elevenlabsTranscriptionCharacterSchema = z.object({
  text: z.string(),
  start: z.number().nullish(),
  end: z.number().nullish(),
});

// One entry of the response's `words` array; `type` distinguishes words,
// spacing, and audio events.
const elevenlabsTranscriptionWordSchema = z.object({
  text: z.string(),
  type: z.enum(['word', 'spacing', 'audio_event']),
  start: z.number().nullish(),
  end: z.number().nullish(),
  speaker_id: z.string().nullish(),
  characters: z.array(elevenlabsTranscriptionCharacterSchema).nullish(),
});

// Shape of a successful speech-to-text JSON response as validated by
// `doGenerate`.
const elevenlabsTranscriptionResponseSchema = z.object({
  language_code: z.string(),
  language_probability: z.number(),
  text: z.string(),
  words: z.array(elevenlabsTranscriptionWordSchema).nullish(),
});