UNPKG

@ai-sdk/elevenlabs

Version:

The **[ElevenLabs provider](https://ai-sdk.dev/providers/ai-sdk-providers/elevenlabs)** for the [AI SDK](https://ai-sdk.dev/docs) contains language model support for the ElevenLabs chat and completion APIs and embedding model support for the ElevenLabs em

259 lines (231 loc) 8.25 kB
import type { SpeechModelV3, SharedV3Warning } from '@ai-sdk/provider'; import { combineHeaders, createBinaryResponseHandler, parseProviderOptions, postJsonToApi, } from '@ai-sdk/provider-utils'; import { z } from 'zod/v4'; import type { ElevenLabsConfig } from './elevenlabs-config'; import { elevenlabsFailedResponseHandler } from './elevenlabs-error'; import type { ElevenLabsSpeechAPITypes } from './elevenlabs-speech-api-types'; import type { ElevenLabsSpeechModelId, ElevenLabsSpeechVoiceId, } from './elevenlabs-speech-options'; // Schema for camelCase input from users const elevenLabsSpeechModelOptionsSchema = z.object({ languageCode: z.string().optional(), voiceSettings: z .object({ stability: z.number().min(0).max(1).optional(), similarityBoost: z.number().min(0).max(1).optional(), style: z.number().min(0).max(1).optional(), useSpeakerBoost: z.boolean().optional(), }) .optional(), pronunciationDictionaryLocators: z .array( z.object({ pronunciationDictionaryId: z.string(), versionId: z.string().optional(), }), ) .max(3) .optional(), seed: z.number().min(0).max(4294967295).optional(), previousText: z.string().optional(), nextText: z.string().optional(), previousRequestIds: z.array(z.string()).max(3).optional(), nextRequestIds: z.array(z.string()).max(3).optional(), applyTextNormalization: z.enum(['auto', 'on', 'off']).optional(), applyLanguageTextNormalization: z.boolean().optional(), enableLogging: z.boolean().optional(), }); export type ElevenLabsSpeechModelOptions = z.infer< typeof elevenLabsSpeechModelOptionsSchema >; interface ElevenLabsSpeechModelConfig extends ElevenLabsConfig { _internal?: { currentDate?: () => Date; }; } export class ElevenLabsSpeechModel implements SpeechModelV3 { readonly specificationVersion = 'v3'; get provider(): string { return this.config.provider; } constructor( readonly modelId: ElevenLabsSpeechModelId, private readonly config: ElevenLabsSpeechModelConfig, ) {} private async getArgs({ text, voice = '21m00Tcm4TlvDq8ikWAM', outputFormat = 'mp3_44100_128', instructions, language, speed, providerOptions, }: Parameters<SpeechModelV3['doGenerate']>[0]) { const warnings: SharedV3Warning[] = []; // Parse provider options const elevenLabsOptions = await parseProviderOptions({ provider: 'elevenlabs', providerOptions, schema: elevenLabsSpeechModelOptionsSchema, }); // Create request body const requestBody: ElevenLabsSpeechAPITypes = { text, model_id: this.modelId, }; // Prepare query parameters const queryParams: Record<string, string> = {}; // Map outputFormat to ElevenLabs format (as query param) if (outputFormat) { const formatMap: Record<string, string> = { mp3: 'mp3_44100_128', mp3_32: 'mp3_44100_32', mp3_64: 'mp3_44100_64', mp3_96: 'mp3_44100_96', mp3_128: 'mp3_44100_128', mp3_192: 'mp3_44100_192', pcm: 'pcm_44100', pcm_16000: 'pcm_16000', pcm_22050: 'pcm_22050', pcm_24000: 'pcm_24000', pcm_44100: 'pcm_44100', ulaw: 'ulaw_8000', }; const mappedFormat = formatMap[outputFormat] || outputFormat; queryParams.output_format = mappedFormat; } // Add language code if provided if (language) { requestBody.language_code = language; } const voiceSettings: typeof requestBody.voice_settings = {}; if (speed != null) { voiceSettings.speed = speed; } // Add provider-specific options - map from camelCase to snake_case if (elevenLabsOptions) { if (elevenLabsOptions.voiceSettings) { // Map camelCase voice settings to snake_case for API if (elevenLabsOptions.voiceSettings.stability != null) { voiceSettings.stability = elevenLabsOptions.voiceSettings.stability; } if (elevenLabsOptions.voiceSettings.similarityBoost != null) { voiceSettings.similarity_boost = elevenLabsOptions.voiceSettings.similarityBoost; } if (elevenLabsOptions.voiceSettings.style != null) { voiceSettings.style = elevenLabsOptions.voiceSettings.style; } if (elevenLabsOptions.voiceSettings.useSpeakerBoost != null) { voiceSettings.use_speaker_boost = elevenLabsOptions.voiceSettings.useSpeakerBoost; } } // Add language code from provider options if not already set if (elevenLabsOptions.languageCode && !requestBody.language_code) { requestBody.language_code = elevenLabsOptions.languageCode; } // Map pronunciation dictionary locators if (elevenLabsOptions.pronunciationDictionaryLocators) { requestBody.pronunciation_dictionary_locators = elevenLabsOptions.pronunciationDictionaryLocators.map(locator => ({ pronunciation_dictionary_id: locator.pronunciationDictionaryId, ...(locator.versionId && { version_id: locator.versionId }), })); } if (elevenLabsOptions.seed != null) { requestBody.seed = elevenLabsOptions.seed; } if (elevenLabsOptions.previousText) { requestBody.previous_text = elevenLabsOptions.previousText; } if (elevenLabsOptions.nextText) { requestBody.next_text = elevenLabsOptions.nextText; } // Add previous and next request IDs if (elevenLabsOptions.previousRequestIds) { requestBody.previous_request_ids = elevenLabsOptions.previousRequestIds; } if (elevenLabsOptions.nextRequestIds) { requestBody.next_request_ids = elevenLabsOptions.nextRequestIds; } // Add text normalization options if (elevenLabsOptions.applyTextNormalization) { requestBody.apply_text_normalization = elevenLabsOptions.applyTextNormalization; } if (elevenLabsOptions.applyLanguageTextNormalization != null) { requestBody.apply_language_text_normalization = elevenLabsOptions.applyLanguageTextNormalization; } // enable_logging is a query parameter if (elevenLabsOptions.enableLogging != null) { queryParams.enable_logging = String(elevenLabsOptions.enableLogging); } } // Only add voice_settings if there are settings to add if (Object.keys(voiceSettings).length > 0) { requestBody.voice_settings = voiceSettings; } if (instructions) { warnings.push({ type: 'unsupported', feature: 'instructions', details: `ElevenLabs speech models do not support instructions. Instructions parameter was ignored.`, }); } return { requestBody, queryParams, warnings, voiceId: voice as ElevenLabsSpeechVoiceId, }; } async doGenerate( options: Parameters<SpeechModelV3['doGenerate']>[0], ): Promise<Awaited<ReturnType<SpeechModelV3['doGenerate']>>> { const currentDate = this.config._internal?.currentDate?.() ?? new Date(); const { requestBody, queryParams, warnings, voiceId } = await this.getArgs(options); const { value: audio, responseHeaders, rawValue: rawResponse, } = await postJsonToApi({ url: (() => { const baseUrl = this.config.url({ path: `/v1/text-to-speech/${voiceId}`, modelId: this.modelId, }); const queryString = new URLSearchParams(queryParams).toString(); return queryString ? `${baseUrl}?${queryString}` : baseUrl; })(), headers: combineHeaders(this.config.headers(), options.headers), body: requestBody, failedResponseHandler: elevenlabsFailedResponseHandler, successfulResponseHandler: createBinaryResponseHandler(), abortSignal: options.abortSignal, fetch: this.config.fetch, }); return { audio, warnings, request: { body: JSON.stringify(requestBody), }, response: { timestamp: currentDate, modelId: this.modelId, headers: responseHeaders, body: rawResponse, }, }; } }