UNPKG

@ai-sdk/elevenlabs

Version:

The **[ElevenLabs provider](https://ai-sdk.dev/providers/ai-sdk-providers/elevenlabs)** for the [AI SDK](https://ai-sdk.dev/docs) contains language model support for the ElevenLabs chat and completion APIs and embedding model support for the ElevenLabs embeddings API.

184 lines (165 loc) 5.57 kB
import type { TranscriptionModelV3, SharedV3Warning } from '@ai-sdk/provider';
import {
  combineHeaders,
  convertBase64ToUint8Array,
  createJsonResponseHandler,
  mediaTypeToExtension,
  parseProviderOptions,
  postFormDataToApi,
} from '@ai-sdk/provider-utils';
import { z } from 'zod/v4';
import type { ElevenLabsConfig } from './elevenlabs-config';
import { elevenlabsFailedResponseHandler } from './elevenlabs-error';
import type { ElevenLabsTranscriptionModelId } from './elevenlabs-transcription-options';
import type { ElevenLabsTranscriptionAPITypes } from './elevenlabs-api-types';

/**
 * Schema for the provider options accepted under the `elevenlabs` key.
 *
 * Each camelCase field is forwarded to the speech-to-text endpoint as the
 * snake_case form field of the same name (see `getArgs`). Defaults are applied
 * by zod only when an `elevenlabs` options object is actually supplied.
 *
 * API reference: https://elevenlabs.io/docs/api-reference/speech-to-text/convert
 */
const elevenLabsTranscriptionModelOptionsSchema = z.object({
  languageCode: z.string().nullish(),
  tagAudioEvents: z.boolean().nullish().default(true),
  numSpeakers: z.number().int().min(1).max(32).nullish(),
  timestampsGranularity: z
    .enum(['none', 'word', 'character'])
    .nullish()
    .default('word'),
  diarize: z.boolean().nullish().default(false),
  fileFormat: z.enum(['pcm_s16le_16', 'other']).nullish().default('other'),
});

/** Parsed shape of the `elevenlabs` provider options for transcription. */
export type ElevenLabsTranscriptionModelOptions = z.infer<
  typeof elevenLabsTranscriptionModelOptionsSchema
>;

/** Provider config plus an optional clock override used for response timestamps. */
interface ElevenLabsTranscriptionModelConfig extends ElevenLabsConfig {
  _internal?: {
    currentDate?: () => Date;
  };
}

/**
 * `TranscriptionModelV3` implementation that posts audio as multipart form
 * data to the `/v1/speech-to-text` path of the configured ElevenLabs endpoint.
 */
export class ElevenLabsTranscriptionModel implements TranscriptionModelV3 {
  readonly specificationVersion = 'v3';

  /** Provider identifier taken from the injected config. */
  get provider(): string {
    return this.config.provider;
  }

  constructor(
    readonly modelId: ElevenLabsTranscriptionModelId,
    private readonly config: ElevenLabsTranscriptionModelConfig,
  ) {}

  /**
   * Builds the multipart request body for a transcription call.
   *
   * @param audio - Audio payload, either raw bytes or a base64 string.
   * @param mediaType - Media type of the audio; used for the file part's type
   *   and to derive the uploaded file name's extension.
   * @param providerOptions - Call options; only the `elevenlabs` entry is read.
   * @returns The populated `FormData` and a (currently always empty) warnings list.
   */
  private async getArgs({
    audio,
    mediaType,
    providerOptions,
  }: Parameters<TranscriptionModelV3['doGenerate']>[0]) {
    const warnings: SharedV3Warning[] = [];

    // Parse provider options
    const elevenlabsOptions = await parseProviderOptions({
      provider: 'elevenlabs',
      providerOptions,
      schema: elevenLabsTranscriptionModelOptionsSchema,
    });

    // Create form data with base fields
    const formData = new FormData();
    const blob =
      audio instanceof Uint8Array
        ? new Blob([audio])
        : new Blob([convertBase64ToUint8Array(audio)]);

    formData.append('model_id', this.modelId);
    const fileExtension = mediaTypeToExtension(mediaType);
    formData.append(
      'file',
      new File([blob], 'audio', { type: mediaType }),
      `audio.${fileExtension}`,
    );

    // Diarization is requested unless provider options say otherwise below.
    formData.append('diarize', 'true');

    // Add provider-specific options
    if (elevenlabsOptions) {
      const transcriptionModelOptions: ElevenLabsTranscriptionAPITypes = {
        language_code: elevenlabsOptions.languageCode ?? undefined,
        tag_audio_events: elevenlabsOptions.tagAudioEvents ?? undefined,
        num_speakers: elevenlabsOptions.numSpeakers ?? undefined,
        timestamps_granularity:
          elevenlabsOptions.timestampsGranularity ?? undefined,
        file_format: elevenlabsOptions.fileFormat ?? undefined,
      };

      if (typeof elevenlabsOptions.diarize === 'boolean') {
        // `set` (not `append`) replaces the base value so the request carries
        // exactly one `diarize` field instead of two conflicting ones.
        formData.set('diarize', String(elevenlabsOptions.diarize));
      }

      // Forward only the options that resolved to a concrete value.
      for (const key in transcriptionModelOptions) {
        const value =
          transcriptionModelOptions[
            key as keyof ElevenLabsTranscriptionAPITypes
          ];
        if (value !== undefined) {
          formData.append(key, String(value));
        }
      }
    }

    return {
      formData,
      warnings,
    };
  }

  /**
   * Sends the audio for transcription and maps the response to the
   * `TranscriptionModelV3` result shape.
   *
   * Every entry of the response's `words` array becomes a segment (entries are
   * not filtered by their `type`); missing `start`/`end` values are reported
   * as `0`. `durationInSeconds` is the `end` of the last entry, or `undefined`
   * when that is unavailable.
   *
   * @param options - Call options: audio, media type, provider options,
   *   extra headers and an optional abort signal.
   * @returns Transcript text, segments, detected language code, duration,
   *   warnings and raw response metadata.
   */
  async doGenerate(
    options: Parameters<TranscriptionModelV3['doGenerate']>[0],
  ): Promise<Awaited<ReturnType<TranscriptionModelV3['doGenerate']>>> {
    // Captured before the request is sent; overridable through `_internal`.
    const currentDate = this.config._internal?.currentDate?.() ?? new Date();
    const { formData, warnings } = await this.getArgs(options);

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse,
    } = await postFormDataToApi({
      url: this.config.url({
        path: '/v1/speech-to-text',
        modelId: this.modelId,
      }),
      headers: combineHeaders(this.config.headers(), options.headers),
      formData,
      failedResponseHandler: elevenlabsFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler(
        elevenlabsTranscriptionResponseSchema,
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch,
    });

    return {
      text: response.text,
      segments:
        response.words?.map(word => ({
          text: word.text,
          startSecond: word.start ?? 0,
          endSecond: word.end ?? 0,
        })) ?? [],
      language: response.language_code,
      durationInSeconds: response.words?.at(-1)?.end ?? undefined,
      warnings,
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse,
      },
    };
  }
}

/** Schema used to validate the JSON body of a successful speech-to-text response. */
const elevenlabsTranscriptionResponseSchema = z.object({
  language_code: z.string(),
  language_probability: z.number(),
  text: z.string(),
  words: z
    .array(
      z.object({
        text: z.string(),
        type: z.enum(['word', 'spacing', 'audio_event']),
        start: z.number().nullish(),
        end: z.number().nullish(),
        speaker_id: z.string().nullish(),
        characters: z
          .array(
            z.object({
              text: z.string(),
              start: z.number().nullish(),
              end: z.number().nullish(),
            }),
          )
          .nullish(),
      }),
    )
    .nullish(),
});