UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

809 lines (808 loc) 32.5 kB
import { AbstractTTSClient } from "../core/abstract-tts.js";
import * as SpeechMarkdown from "../markdown/converter.js";
import { base64ToUint8Array } from "../utils/base64-utils.js";
import { getFetch } from "../utils/fetch-utils.js";
// Get the fetch implementation for the current environment
const fetch = getFetch();
/**
 * ElevenLabs TTS client.
 *
 * Wraps the ElevenLabs HTTP API (voices, models, text-to-speech,
 * text-to-speech with timestamps, streaming) behind the AbstractTTSClient
 * interface.
 */
export class ElevenLabsTTSClient extends AbstractTTSClient {
    /**
     * Create a new ElevenLabs TTS client.
     *
     * @param credentials ElevenLabs credentials. Recognised keys: `apiKey`,
     *   `modelId` / `model`, `outputFormat` / `output_format`, and
     *   `properties` / `propertiesJson` / `propertiesJSON` (object or JSON string).
     *   When `apiKey` is absent the `ELEVENLABS_API_KEY` environment variable is
     *   used if a `process` global exists.
     */
    constructor(credentials = {}) {
        super(credentials);
        /**
         * ElevenLabs API key
         */
        Object.defineProperty(this, "apiKey", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Base URL for ElevenLabs API
         */
        Object.defineProperty(this, "baseUrl", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "https://api.elevenlabs.io/v1"
        });
        /**
         * Default model to use for synthesis
         */
        Object.defineProperty(this, "modelId", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Default output format for requests
         */
        Object.defineProperty(this, "outputFormat", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "mp3_44100_128"
        });
        /**
         * Request-level overrides provided via credentials/properties
         */
        Object.defineProperty(this, "requestOverrides", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {}
        });
        this._models = [
            {
                id: "eleven_v3",
                features: [
                    "streaming",
                    "audio-tags",
                    "inline-voice-cloning",
                    "word-boundary-events",
                    "character-boundary-events",
                ],
            },
            {
                id: "eleven_turbo_v2_5",
                features: ["streaming", "word-boundary-events", "character-boundary-events"],
            },
            {
                id: "eleven_turbo_v2",
                features: ["streaming", "word-boundary-events", "character-boundary-events"],
            },
            {
                id: "eleven_monolingual_v1",
                features: ["streaming", "word-boundary-events", "character-boundary-events"],
            },
            {
                id: "eleven_multilingual_v1",
                features: ["streaming", "word-boundary-events", "character-boundary-events"],
            },
            {
                id: "eleven_multilingual_v2",
                features: ["streaming", "word-boundary-events", "character-boundary-events"],
            },
        ];
        // `process` does not exist in every runtime (e.g. browsers); referencing it
        // unguarded would throw a ReferenceError whenever no apiKey is supplied.
        const envApiKey = typeof process !== "undefined" && process.env
            ? process.env.ELEVENLABS_API_KEY
            : undefined;
        this.apiKey = credentials.apiKey || envApiKey || "";
        this.modelId = credentials.modelId || credentials.model || ElevenLabsTTSClient.DEFAULT_MODEL;
        if (typeof credentials.outputFormat === "string") {
            this.outputFormat = credentials.outputFormat;
        }
        this.applyCredentialProperties(credentials);
    }
    /**
     * Apply any configuration passed through credentials (including JSON strings).
     *
     * Collects `credentials.output_format` and the first non-nullish of
     * `properties` / `propertiesJson` / `propertiesJSON`, then forwards every
     * key/value pair to `setProperty`. A JSON string that fails to parse is
     * reported with `console.warn` and otherwise ignored.
     *
     * @param credentials Credentials object passed to the constructor
     */
    applyCredentialProperties(credentials) {
        const directProps = [];
        if (typeof credentials.output_format === "string") {
            directProps.push({ output_format: credentials.output_format });
        }
        const rawProps = credentials.properties ?? credentials.propertiesJson ?? credentials.propertiesJSON;
        if (rawProps) {
            if (typeof rawProps === "string") {
                try {
                    const parsed = JSON.parse(rawProps);
                    if (parsed && typeof parsed === "object") {
                        directProps.push(parsed);
                    }
                }
                catch (error) {
                    console.warn("Failed to parse ElevenLabs properties JSON:", error);
                }
            }
            else if (typeof rawProps === "object") {
                directProps.push(rawProps);
            }
        }
        for (const props of directProps) {
            for (const [key, value] of Object.entries(props)) {
                this.setProperty(key, value);
            }
        }
    }
    /**
     * Resolve the model ID for a request.
     *
     * Precedence (first truthy wins): `options.model`, `options.modelId`,
     * `options.requestOptions.model_id`, `extraOverrides.model_id`,
     * `extraOverrides.model`, then the client default.
     *
     * @param options Synthesis options (may be undefined)
     * @param extraOverrides Additional per-call overrides
     * @returns The model ID to send
     */
    resolveModelId(options, extraOverrides = {}) {
        return (options?.model ||
            options?.modelId ||
            options?.requestOptions?.model_id ||
            extraOverrides?.model_id ||
            extraOverrides?.model ||
            this.modelId);
    }
    /**
     * Resolve the output format for a request.
     *
     * Precedence (first truthy wins): `options.outputFormat`,
     * `options.requestOptions.output_format`, `extraOverrides.output_format`,
     * `requestOverrides.output_format`, then the client default.
     *
     * @param options Synthesis options (may be undefined)
     * @param extraOverrides Additional per-call overrides
     * @returns The output format string
     */
    resolveOutputFormat(options, extraOverrides = {}) {
        return (options?.outputFormat ||
            options?.requestOptions?.output_format ||
            extraOverrides?.output_format ||
            this.requestOverrides.output_format ||
            this.outputFormat);
    }
    /**
     * Merge default and override voice settings.
     *
     * Later sources win: built-in defaults, credential-level overrides,
     * `options.requestOptions.voice_settings`, `options.voiceSettings`, then
     * `extraOverrides.voice_settings`.
     *
     * @param options Synthesis options (may be undefined)
     * @param extraOverrides Additional per-call overrides
     * @returns A new voice settings object
     */
    resolveVoiceSettings(options, extraOverrides = {}) {
        const defaultVoiceSettings = {
            stability: 0.5,
            similarity_boost: 0.75,
            use_speaker_boost: true,
            style: 0,
            // Only a numeric rate property is used as the speed; anything else means 1.0
            speed: typeof this.properties.rate === "number" ? this.properties.rate : 1.0,
        };
        const overridesFromCredentials = this.requestOverrides.voice_settings &&
            typeof this.requestOverrides.voice_settings === "object"
            ? this.requestOverrides.voice_settings
            : {};
        const overridesFromOptions = options?.voiceSettings && typeof options.voiceSettings === "object"
            ? options.voiceSettings
            : {};
        const overridesFromRequestOptions = options?.requestOptions && typeof options.requestOptions.voice_settings === "object"
            ? options.requestOptions.voice_settings
            : {};
        const overridesFromExtra = extraOverrides && typeof extraOverrides.voice_settings === "object"
            ? extraOverrides.voice_settings
            : {};
        return {
            ...defaultVoiceSettings,
            ...overridesFromCredentials,
            ...overridesFromRequestOptions,
            ...overridesFromOptions,
            ...overridesFromExtra,
        };
    }
    /**
     * Remove voice_settings from an overrides object to avoid double-merging.
     *
     * @param overrides Any overrides value
     * @returns A shallow copy without `voice_settings`, or `{}` for non-objects
     */
    withoutVoiceSettings(overrides) {
        if (!overrides || typeof overrides !== "object")
            return {};
        const { voice_settings, ...rest } = overrides;
        return rest;
    }
    /**
     * Build a request payload honoring defaults and user overrides.
     *
     * NOTE(review): `output_format` is placed in the JSON body here; the
     * ElevenLabs API reference appears to describe it as a query parameter —
     * confirm that the body value is actually honoured.
     *
     * @param text Text to synthesize
     * @param options Synthesis options (may be undefined)
     * @param extraOverrides Additional per-call overrides (highest priority)
     * @returns The JSON-serialisable request body
     */
    buildRequestPayload(text, options, extraOverrides = {}) {
        const payload = {
            text,
            model_id: this.resolveModelId(options, extraOverrides),
            output_format: this.resolveOutputFormat(options, extraOverrides),
            voice_settings: this.resolveVoiceSettings(options, extraOverrides),
        };
        const merged = {
            ...payload,
            ...this.withoutVoiceSettings(this.requestOverrides),
            ...this.withoutVoiceSettings(options?.requestOptions),
            ...this.withoutVoiceSettings(extraOverrides),
        };
        // Ensure required fields are preserved: re-resolve them so that a generic
        // override spread above cannot displace the documented precedence.
        merged.text = text;
        merged.model_id = this.resolveModelId(options, merged);
        merged.output_format = this.resolveOutputFormat(options, merged);
        merged.voice_settings = this.resolveVoiceSettings(options, merged);
        if (options?.seed !== undefined)
            merged.seed = options.seed;
        if (options?.languageCode)
            merged.language_code = options.languageCode;
        if (options?.previousText)
            merged.previous_text = options.previousText;
        if (options?.nextText)
            merged.next_text = options.nextText;
        if (options?.applyTextNormalization)
            merged.apply_text_normalization = options.applyTextNormalization;
        return merged;
    }
    /**
     * Set default model ID. Falsy values are ignored.
     *
     * @param modelId Model ID to use by default
     */
    setModelId(modelId) {
        if (modelId) {
            this.modelId = modelId;
        }
    }
    /**
     * Get a property value.
     *
     * @param property Property name; model and output-format aliases are handled
     *   here, everything else is delegated to the base class
     * @returns The property value
     */
    getProperty(property) {
        switch (property) {
            case "model":
            case "model_id":
            case "modelId":
                return this.modelId;
            case "outputFormat":
            case "output_format":
                return this.resolveOutputFormat();
            default:
                return super.getProperty(property);
        }
    }
    /**
     * Set a property value.
     *
     * Unknown properties are forwarded to the base class and, unless they are
     * `rate` / `pitch` / `volume` or `undefined`, also recorded in
     * `requestOverrides` so they are merged into request payloads.
     *
     * @param property Property name
     * @param value Property value
     */
    setProperty(property, value) {
        switch (property) {
            case "model":
            case "model_id":
            case "modelId":
                // String(undefined) / String(null) would otherwise install the literal
                // model ids "undefined" / "null".
                if (value !== undefined && value !== null) {
                    this.setModelId(String(value));
                }
                break;
            case "outputFormat":
            case "output_format":
                if (typeof value === "string") {
                    this.outputFormat = value;
                }
                break;
            case "voice_settings":
                if (value && typeof value === "object") {
                    this.requestOverrides.voice_settings = value;
                }
                break;
            case "voice":
            case "voiceId":
                if (typeof value === "string") {
                    this.setVoice(value);
                }
                break;
            default:
                super.setProperty(property, value);
                if (!["rate", "pitch", "volume"].includes(property) && value !== undefined) {
                    this.requestOverrides[property] = value;
                }
                break;
        }
    }
    /**
     * Check if the credentials are valid
     * @returns Promise resolving to true if credentials are valid, false otherwise
     */
    async checkCredentials() {
        if (!this.apiKey) {
            console.error("ElevenLabs API key is required");
            return false;
        }
        try {
            // 1) Basic auth probe: list voices
            const voices = await this._getVoices();
            if (!voices || voices.length === 0)
                return false;
            // 2) Quota probe: attempt a tiny synthesis to detect quota/Unauthorized early
            const quotaOk = await this._quotaProbe();
            return quotaOk;
        }
        catch (error) {
            console.error("Error checking ElevenLabs credentials:", error);
            return false;
        }
    }
    /**
     * Perform a tiny synthesis to detect quota/Unauthorized issues up-front.
     *
     * @returns Promise resolving to true when the synthesis request succeeds;
     *   false on any non-OK response or thrown error
     */
    async _quotaProbe() {
        try {
            const voiceId = this.voiceId || "21m00Tcm4TlvDq8ikWAM"; // Rachel
            const payload = this.buildRequestPayload("hello", undefined, {
                output_format: "mp3_44100_64", // keep tiny
            });
            const requestOptions = {
                method: "POST",
                headers: {
                    "Content-Type": "application/json",
                    "xi-api-key": this.apiKey,
                },
                body: JSON.stringify(payload),
            };
            const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}`, requestOptions);
            if (!response.ok) {
                const errorText = await response.text();
                const lower = (errorText || "").toLowerCase();
                if (response.status === 401 ||
                    response.status === 402 ||
                    response.status === 429 ||
                    lower.includes("quota") ||
                    lower.includes("exceeded your current quota") ||
                    lower.includes("insufficient")) {
                    console.log("ElevenLabs: quota/authorization not sufficient for tests; skipping.");
                    return false;
                }
                // Other failures count as invalid
                console.error(`ElevenLabs quota probe failed: ${response.status} ${response.statusText} - ${errorText}`);
                return false;
            }
            // success
            return true;
        }
        catch (err) {
            console.error("ElevenLabs quota probe error:", err);
            return false;
        }
    }
    /**
     * Get the list of required credential types for this engine
     * @returns Array of required credential field names
     */
    getRequiredCredentials() {
        return ["apiKey"];
    }
    /**
     * Merge raw voices with resolved language data from the models endpoint.
     * Extracted as a separate method so tests can inject mock data directly.
     *
     * Non-array inputs are treated as empty so an unexpected API response shape
     * yields an empty/unenriched result instead of a TypeError.
     *
     * @param rawVoices Voice objects from the voices endpoint
     * @param models Model objects from the models endpoint
     * @returns Copies of the voices with an added `_resolvedLanguages` array
     */
    _getVoicesWithModels(rawVoices, models) {
        const voiceList = Array.isArray(rawVoices) ? rawVoices : [];
        const modelList = Array.isArray(models) ? models : [];
        // Build model_id → languages map (TTS-capable models only)
        const modelLanguageMap = new Map();
        for (const model of modelList) {
            if (model?.can_do_text_to_speech && Array.isArray(model.languages)) {
                modelLanguageMap.set(model.model_id, model.languages);
            }
        }
        return voiceList.map((voice) => {
            const modelIds = voice.high_quality_base_model_ids ?? [];
            // Deduplicate languages by language_id across all of the voice's models
            const seen = new Set();
            const resolvedLanguages = [];
            for (const modelId of modelIds) {
                for (const lang of modelLanguageMap.get(modelId) ?? []) {
                    if (!seen.has(lang.language_id)) {
                        seen.add(lang.language_id);
                        resolvedLanguages.push(lang);
                    }
                }
            }
            return { ...voice, _resolvedLanguages: resolvedLanguages };
        });
    }
    /**
     * Fetch the voices and models lists in parallel and combine them.
     *
     * @returns Promise resolving to the enriched raw voices; resolves to an empty
     *   array when the request fails (the error is logged, not rethrown)
     */
    async _getVoices() {
        try {
            const headers = { "xi-api-key": this.apiKey };
            const [voicesResp, modelsResp] = await Promise.all([
                fetch(`${this.baseUrl}/voices`, { method: "GET", headers }),
                fetch(`${this.baseUrl}/models`, { method: "GET", headers }),
            ]);
            if (!voicesResp.ok) {
                const errorText = await voicesResp.text();
                console.error(`ElevenLabs API error: ${voicesResp.status} ${voicesResp.statusText}\nResponse: ${errorText}`);
                throw new Error(`Failed to get voices: ${voicesResp.statusText}`);
            }
            const voiceData = await voicesResp.json();
            // A failed models request only loses language enrichment
            const modelData = modelsResp.ok ? await modelsResp.json() : [];
            return this._getVoicesWithModels(voiceData.voices, modelData);
        }
        catch (error) {
            console.error("Error getting ElevenLabs voices:", error);
            return [];
        }
    }
    /**
     * Prepare text for synthesis by stripping SSML tags.
     * ElevenLabs does not support SSML — use native [audio tags] for v3 expressiveness.
     *
     * @param text Input text (plain, SSML or Speech Markdown)
     * @param options Synthesis options; `useSpeechMarkdown` enables conversion
     * @returns Promise resolving to the text to send to the API
     */
    async prepareText(text, options) {
        let processedText = text;
        if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
            const ssml = await SpeechMarkdown.toSSML(processedText, "elevenlabs");
            processedText = ssml;
        }
        // If text is SSML, strip the tags as ElevenLabs doesn't support SSML
        if (this._isSSML(processedText)) {
            processedText = this._stripSSML(processedText);
        }
        // Process audio tags based on model
        processedText = this.processAudioTags(processedText, options);
        return processedText;
    }
    /**
     * Process audio tags ([laugh], [sigh], etc.) based on the model.
     * Models whose ID starts with MODEL_V3 keep the tags; for all other models
     * every `[...]` span is removed and whitespace is collapsed.
     *
     * @param text Text that may contain bracketed audio tags
     * @param options Synthesis options used to resolve the model
     * @returns The text, unchanged when no tags were removed
     */
    processAudioTags(text, options) {
        const modelId = this.resolveModelId(options);
        const isAudioTagModel = typeof modelId === "string" && modelId.startsWith(ElevenLabsTTSClient.MODEL_V3);
        if (isAudioTagModel || typeof text !== "string") {
            return text;
        }
        // AUDIO_TAG_REGEX is a shared /g regex, so `.test()` on it is stateful via
        // lastIndex. `.replace()` always scans from the start, and every match is
        // non-empty, so an unchanged string means "no tags present".
        const withoutTags = text.replace(ElevenLabsTTSClient.AUDIO_TAG_REGEX, "");
        if (withoutTags === text) {
            return text;
        }
        return withoutTags.replace(/\s+/g, " ").trim();
    }
    /**
     * Convert text to audio bytes
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to audio bytes
     * @throws Error (with a numeric `status` property) when the API responds with a non-OK status
     */
    async synthToBytes(text, options) {
        try {
            // Use voice from options or the default voice
            const voiceId = options?.voice || this.voiceId || "21m00Tcm4TlvDq8ikWAM"; // Default voice (Rachel)
            // Prepare text for synthesis (strip SSML tags)
            const preparedText = await this.prepareText(text, options);
            // Check if we need timing data for word boundaries
            const useTimestamps = options?.useTimestamps || options?.useWordBoundary;
            let audioData;
            if (useTimestamps) {
                // Use the with-timestamps endpoint for timing data
                const timestampResponse = await this.synthWithTimestamps(preparedText, voiceId, options);
                // Decode base64 audio data
                const audioBase64 = timestampResponse.audio_base64;
                audioData = base64ToUint8Array(audioBase64);
                // Convert character timing to word boundaries and store for events
                if (timestampResponse.alignment) {
                    const wordBoundaries = this.convertCharacterTimingToWordBoundaries(preparedText, timestampResponse.alignment);
                    // Boundary offsets/durations are in 1/10000-second ticks; store seconds
                    this.timings = wordBoundaries.map((wb) => [
                        wb.offset / 10000,
                        (wb.offset + wb.duration) / 10000,
                        wb.text,
                    ]);
                }
                else {
                    // Without alignment data the timings of a previous utterance would
                    // otherwise be reused; fall back to estimates like the plain path.
                    this._createEstimatedWordTimings(preparedText);
                }
            }
            else {
                // Use the regular endpoint (no timing data)
                const payload = this.buildRequestPayload(preparedText, options);
                const requestOptions = {
                    method: "POST",
                    headers: {
                        "Content-Type": "application/json",
                        "xi-api-key": this.apiKey,
                    },
                    body: JSON.stringify(payload),
                };
                const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}`, requestOptions);
                if (!response.ok) {
                    const errorText = await response.text();
                    console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
                    const err = new Error(`Failed to synthesize speech: ${response.status} ${response.statusText} - ${errorText}`);
                    err.status = response.status;
                    throw err;
                }
                const arrayBuffer = await response.arrayBuffer();
                audioData = new Uint8Array(arrayBuffer);
                // Create estimated word timings if no timing data available
                this._createEstimatedWordTimings(preparedText);
            }
            // Convert to WAV if requested (the conversion assumes the response is MP3)
            if (options?.format === "wav") {
                audioData = await this.convertMp3ToWav(audioData);
            }
            return audioData;
        }
        catch (error) {
            console.error("Error synthesizing speech:", error);
            throw error;
        }
    }
    /**
     * Synthesize text to a byte stream
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to an object containing the audio stream and word boundaries array
     * @throws Error (with a numeric `status` property) when the API responds with a non-OK status
     */
    async synthToBytestream(text, options) {
        try {
            // Use voice from options or the default voice
            const voiceId = options?.voice || this.voiceId || "21m00Tcm4TlvDq8ikWAM"; // Default voice (Rachel)
            // Prepare text for synthesis (strip SSML tags)
            const preparedText = await this.prepareText(text, options);
            // Check if we need timing data
            const useTimestamps = options?.useTimestamps || options?.useWordBoundary;
            let audioStream;
            let wordBoundaries = [];
            if (useTimestamps) {
                // The with-timestamps endpoint returns the whole audio at once, so the
                // "stream" is a single-chunk ReadableStream.
                const timestampResponse = await this.synthWithTimestamps(preparedText, voiceId, options);
                const audioBase64 = timestampResponse.audio_base64;
                const audioData = base64ToUint8Array(audioBase64);
                if (timestampResponse.alignment) {
                    wordBoundaries = this.convertCharacterTimingToWordBoundaries(preparedText, timestampResponse.alignment);
                }
                let finalData = audioData;
                if (options?.format === "wav") {
                    finalData = await this.convertMp3ToWav(audioData);
                }
                audioStream = new ReadableStream({
                    start(controller) {
                        controller.enqueue(finalData);
                        controller.close();
                    },
                });
            }
            else {
                const payload = this.buildRequestPayload(preparedText, options);
                const requestOptions = {
                    method: "POST",
                    headers: {
                        "Content-Type": "application/json",
                        "xi-api-key": this.apiKey,
                    },
                    body: JSON.stringify(payload),
                };
                const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}/stream`, requestOptions);
                if (!response.ok) {
                    const errorText = await response.text();
                    console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
                    const err = new Error(`Failed to synthesize speech stream: ${response.status} ${response.statusText} - ${errorText}`);
                    err.status = response.status;
                    throw err;
                }
                if (response.body) {
                    audioStream = response.body;
                }
                else {
                    // No streaming body available: buffer and wrap in a one-chunk stream
                    const arrayBuffer = await response.arrayBuffer();
                    audioStream = new ReadableStream({
                        start(controller) {
                            controller.enqueue(new Uint8Array(arrayBuffer));
                            controller.close();
                        },
                    });
                }
                if (options?.format === "wav") {
                    // Conversion needs the complete audio, so drain the stream first
                    const chunks = [];
                    const reader = audioStream.getReader();
                    while (true) {
                        const { done, value } = await reader.read();
                        if (done)
                            break;
                        chunks.push(value);
                    }
                    const totalLength = chunks.reduce((acc, c) => acc + c.length, 0);
                    const merged = new Uint8Array(totalLength);
                    let offset = 0;
                    for (const chunk of chunks) {
                        merged.set(chunk, offset);
                        offset += chunk.length;
                    }
                    const wavData = await this.convertMp3ToWav(merged);
                    audioStream = new ReadableStream({
                        start(controller) {
                            controller.enqueue(wavData);
                            controller.close();
                        },
                    });
                }
            }
            return { audioStream, wordBoundaries };
        }
        catch (error) {
            console.error("Error synthesizing speech stream:", error);
            throw error;
        }
    }
    /**
     * Call ElevenLabs API with timestamps endpoint
     * @param text Text to synthesize
     * @param voiceId Voice ID to use
     * @param options Synthesis options
     * @returns Promise resolving to timestamp response
     * @throws Error (with a numeric `status` property) when the API responds with a non-OK status
     */
    async synthWithTimestamps(text, voiceId, options) {
        const payload = this.buildRequestPayload(text, options);
        const requestOptions = {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "xi-api-key": this.apiKey,
            },
            body: JSON.stringify(payload),
        };
        const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}/with-timestamps`, requestOptions);
        if (!response.ok) {
            const errorText = await response.text();
            console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
            const err = new Error(`Failed to synthesize speech with timestamps: ${response.status} ${response.statusText} - ${errorText}`);
            err.status = response.status;
            throw err;
        }
        return (await response.json());
    }
    /**
     * Convert character-level timing data to word boundaries.
     *
     * Words are maximal runs of non-whitespace in `text`. A word is emitted only
     * when both its first and last character index fall inside the alignment
     * arrays; missing or non-array alignment data yields an empty result.
     *
     * @param text Original text
     * @param alignment Character alignment data from ElevenLabs
     *   (`character_start_times_seconds`, `character_end_times_seconds`)
     * @returns Array of `{ text, offset, duration }` where offset and duration are
     *   seconds multiplied by 10000 and rounded
     */
    convertCharacterTimingToWordBoundaries(text, alignment) {
        const startTimes = Array.isArray(alignment?.character_start_times_seconds)
            ? alignment.character_start_times_seconds
            : [];
        const endTimes = Array.isArray(alignment?.character_end_times_seconds)
            ? alignment.character_end_times_seconds
            : [];
        const wordBoundaries = [];
        for (const match of text.matchAll(/\S+/g)) {
            const word = match[0];
            const startCharIndex = match.index;
            const endCharIndex = startCharIndex + word.length - 1;
            // Make sure we have timing data for these character positions
            if (startCharIndex >= startTimes.length || endCharIndex >= endTimes.length) {
                continue;
            }
            const startTime = startTimes[startCharIndex];
            const endTime = endTimes[endCharIndex];
            wordBoundaries.push({
                text: word,
                offset: Math.round(startTime * 10000), // seconds → 1/10000-second ticks
                duration: Math.round((endTime - startTime) * 10000),
            });
        }
        return wordBoundaries;
    }
    /**
     * Start playback with word boundary callbacks.
     *
     * NOTE(review): the callback is registered on every call and never removed
     * here — confirm whether repeated calls should accumulate listeners.
     *
     * @param text Text to speak
     * @param callback Callback function for word boundaries
     * @param options Synthesis options
     */
    async startPlaybackWithCallbacks(text, callback, options) {
        // Register the callback
        this.on("boundary", callback);
        // Enable timestamps for better word boundary accuracy
        const enhancedOptions = {
            ...options,
            useTimestamps: true,
        };
        // Start playback
        await this.speakStreamed(text, enhancedOptions);
    }
    /**
     * Map ElevenLabs voice objects to unified format.
     *
     * Language codes come from `_resolvedLanguages` when present and non-empty;
     * otherwise a single English entry is used.
     *
     * @param rawVoices Array of ElevenLabs voice objects
     * @returns Promise resolving to an array of unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        // Map raw voices directly without language normalization for now
        return rawVoices.map((voice) => ({
            id: voice.voice_id,
            name: voice.name,
            gender: voice.labels?.gender === "female"
                ? "Female"
                : voice.labels?.gender === "male"
                    ? "Male"
                    : undefined,
            languageCodes: Array.isArray(voice._resolvedLanguages) && voice._resolvedLanguages.length > 0
                ? voice._resolvedLanguages.map((lang) => ({
                    bcp47: lang.language_id,
                    iso639_3: lang.language_id,
                    display: lang.name,
                }))
                : [{ bcp47: "en", iso639_3: "en", display: "English" }],
            provider: "elevenlabs",
        }));
    }
    /**
     * Get voice by ID
     * @param voiceId Voice ID
     * @returns Promise resolving to voice details, or null when the API returns 404
     * @throws Error for any other non-OK response or request failure
     */
    async getVoice(voiceId) {
        try {
            const response = await fetch(`${this.baseUrl}/voices/${voiceId}`, {
                method: "GET",
                headers: {
                    "xi-api-key": this.apiKey,
                },
            });
            if (!response.ok) {
                if (response.status === 404) {
                    return null;
                }
                const errorText = await response.text();
                console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
                throw new Error(`Failed to get voice: ${response.statusText}`);
            }
            const voice = await response.json();
            // Map to unified format. Unlike _mapVoicesToUnified this derives the
            // language from voice.labels and reports a missing gender as "Unknown".
            const unifiedVoice = {
                id: voice.voice_id,
                name: voice.name,
                gender: voice.labels?.gender === "female"
                    ? "Female"
                    : voice.labels?.gender === "male"
                        ? "Male"
                        : "Unknown",
                languageCodes: [
                    {
                        bcp47: voice.labels?.language || "en-US",
                        iso639_3: voice.labels?.language?.split("-")[0] || "eng",
                        display: voice.labels?.accent || "English",
                    },
                ],
                provider: "elevenlabs",
            };
            return unifiedVoice;
        }
        catch (error) {
            console.error("Error getting voice:", error);
            throw error;
        }
    }
    /**
     * Convert MP3 audio data to WAV format using the audio converter utility.
     *
     * Best-effort: when the converter cannot be loaded or fails, a warning is
     * logged and the original bytes are returned unchanged.
     *
     * @param mp3Data MP3 audio data from ElevenLabs
     * @returns WAV audio data, or the original data on failure
     */
    async convertMp3ToWav(mp3Data) {
        try {
            // Import the audio converter utility (Node-only) using a truly dynamic import.
            // NOTE(review): `new Function` is deliberate here and takes only a constant
            // specifier, but confirm how the relative path is resolved from inside it.
            const dyn = new Function("m", "return import(m)");
            const { convertAudioFormat } = await dyn("../utils/audio-converter");
            // Convert MP3 to WAV
            const result = await convertAudioFormat(mp3Data, "wav");
            return result.audioBytes;
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            console.warn("Failed to convert MP3 to WAV, returning original MP3 data:", errorMessage);
            // Fallback: return the original MP3 data
            // The playback system should handle MP3 files even when WAV was requested
            return mp3Data;
        }
    }
}
// Model ID prefix for models that accept bracketed audio tags natively
Object.defineProperty(ElevenLabsTTSClient, "MODEL_V3", {
    enumerable: true,
    configurable: true,
    writable: true,
    value: "eleven_v3"
});
// Model used when neither credentials nor options specify one
Object.defineProperty(ElevenLabsTTSClient, "DEFAULT_MODEL", {
    enumerable: true,
    configurable: true,
    writable: true,
    value: "eleven_multilingual_v2"
});
// Matches any non-empty [bracketed] span; global, so avoid stateful .test()/.exec()
Object.defineProperty(ElevenLabsTTSClient, "AUDIO_TAG_REGEX", {
    enumerable: true,
    configurable: true,
    writable: true,
    value: /\[[^\]]+\]/g
});