js-tts-wrapper
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ElevenLabsTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SpeechMarkdown = __importStar(require("../markdown/converter"));
const fetch_utils_1 = require("../utils/fetch-utils");
// Get the fetch implementation for the current environment
const fetch = (0, fetch_utils_1.getFetch)();
/**
* ElevenLabs TTS client
*/
class ElevenLabsTTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new ElevenLabs TTS client
* @param credentials ElevenLabs credentials
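* @example
* // Usage sketch: pass a key explicitly, or omit it to fall back to process.env.ELEVENLABS_API_KEY
* const tts = new ElevenLabsTTSClient({ apiKey: "your-elevenlabs-api-key" });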
*/
constructor(credentials = {}) {
super(credentials);
/**
* ElevenLabs API key
*/
Object.defineProperty(this, "apiKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
/**
* Base URL for ElevenLabs API
*/
Object.defineProperty(this, "baseUrl", {
enumerable: true,
configurable: true,
writable: true,
value: "https://api.elevenlabs.io/v1"
});
this.apiKey = credentials.apiKey || process.env.ELEVENLABS_API_KEY || "";
}
/**
* Check if the credentials are valid
* @returns Promise resolving to true if credentials are valid, false otherwise
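* @example
* // Sketch: gate work on a successful auth probe plus a tiny synthesis (quota) probe
* if (!(await tts.checkCredentials())) {
*   console.warn("ElevenLabs credentials invalid or quota exhausted; skipping");
* }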
*/
async checkCredentials() {
if (!this.apiKey) {
console.error("ElevenLabs API key is required");
return false;
}
try {
// 1) Basic auth probe: list voices
const voices = await this._getVoices();
if (!voices || voices.length === 0)
return false;
// 2) Quota probe: attempt a tiny synthesis to detect quota/Unauthorized early
const quotaOk = await this._quotaProbe();
return quotaOk;
}
catch (error) {
console.error("Error checking ElevenLabs credentials:", error);
return false;
}
}
/**
* Perform a tiny synthesis request to detect quota or authorization problems up front
* @returns Promise resolving to false if quota is exceeded or the API key is unauthorized for synthesis
*/
async _quotaProbe() {
try {
const voiceId = this.voiceId || "21m00Tcm4TlvDq8ikWAM"; // Rachel
const requestOptions = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": this.apiKey,
},
body: JSON.stringify({
text: "hello",
model_id: "eleven_monolingual_v1",
output_format: "mp3_44100_64", // keep tiny
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
use_speaker_boost: true,
style: 0,
speed: 1.0,
},
}),
};
const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}`, requestOptions);
if (!response.ok) {
const errorText = await response.text();
const lower = (errorText || "").toLowerCase();
if (response.status === 401 ||
response.status === 402 ||
response.status === 429 ||
lower.includes("quota") ||
lower.includes("exceeded your current quota") ||
lower.includes("insufficient")) {
console.log("ElevenLabs: quota/authorization not sufficient for tests; skipping.");
return false;
}
// Other failures count as invalid
console.error(`ElevenLabs quota probe failed: ${response.status} ${response.statusText} - ${errorText}`);
return false;
}
// success
return true;
}
catch (err) {
console.error("ElevenLabs quota probe error:", err);
return false;
}
}
/**
* Get the list of required credential types for this engine
* @returns Array of required credential field names
*/
getRequiredCredentials() {
return ['apiKey'];
}
/**
* Get available voices from the provider
* @returns Promise resolving to an array of voice objects
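* @example
* // Raw voice shape, inferred from how callers read it (illustrative, not the full API response):
* // [{ voice_id: "21m00Tcm4TlvDq8ikWAM", name: "Rachel", labels: { accent: "en-US", gender: "female" } }]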
*/
async _getVoices() {
try {
const response = await fetch(`${this.baseUrl}/voices`, {
method: "GET",
headers: {
"xi-api-key": this.apiKey,
},
});
if (!response.ok) {
const errorText = await response.text();
console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
throw new Error(`Failed to get voices: ${response.statusText}`);
}
const data = await response.json();
return data.voices;
}
catch (error) {
console.error("Error getting ElevenLabs voices:", error);
return [];
}
}
/**
* Prepare text for synthesis by stripping SSML tags
* @param text Text to prepare
* @param options Synthesis options
* @returns Prepared text
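* @example
* // Sketch: SSML input is flattened because ElevenLabs performs its own prosody analysis
* // await tts.prepareText('<speak>Hello <break time="500ms"/> world</speak>')
* // resolves to plain text such as "Hello world" (exact spacing depends on _stripSSML)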
*/
async prepareText(text, options) {
let processedText = text;
// Convert from Speech Markdown if requested
if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
// Convert to SSML first, then strip SSML tags
const ssml = await SpeechMarkdown.toSSML(processedText);
processedText = this._stripSSML(ssml);
}
// If text is SSML, strip the tags as ElevenLabs doesn't support SSML
// and has its own emotion analysis
if (this._isSSML(processedText)) {
processedText = this._stripSSML(processedText);
}
return processedText;
}
/**
* Convert text to audio bytes
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
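* @example
* // Sketch (Node): bytes are MP3 by default; requesting format "wav" converts via convertMp3ToWav
* const bytes = await tts.synthToBytes("Hello world", { voice: "21m00Tcm4TlvDq8ikWAM" });
* await require("node:fs/promises").writeFile("hello.mp3", bytes);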
*/
async synthToBytes(text, options) {
try {
// Use voice from options or the default voice
const voiceId = options?.voice || this.voiceId || "21m00Tcm4TlvDq8ikWAM"; // Default voice (Rachel)
// Prepare text for synthesis (strip SSML tags)
const preparedText = await this.prepareText(text, options);
// Check if we need timing data for word boundaries
const useTimestamps = options?.useTimestamps || options?.useWordBoundary;
let audioData;
if (useTimestamps) {
// Use the with-timestamps endpoint for timing data
const timestampResponse = await this.synthWithTimestamps(preparedText, voiceId);
// Decode base64 audio data
const audioBase64 = timestampResponse.audio_base64;
const audioBuffer = Buffer.from(audioBase64, 'base64');
audioData = new Uint8Array(audioBuffer);
// Convert character timing to word boundaries and store for events
if (timestampResponse.alignment) {
const wordBoundaries = this.convertCharacterTimingToWordBoundaries(preparedText, timestampResponse.alignment);
// Store timing data for word boundary events
this.timings = wordBoundaries.map(wb => [
wb.offset / 10000, // Convert from 1/10,000-second (100 µs) units back to seconds
(wb.offset + wb.duration) / 10000,
wb.text
]);
}
}
else {
// Use the regular endpoint (no timing data)
const requestOptions = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": this.apiKey,
},
body: JSON.stringify({
text: preparedText,
model_id: "eleven_monolingual_v1",
output_format: "mp3_44100_128",
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
use_speaker_boost: true,
style: 0,
speed: typeof this.properties.rate === "number" ? this.properties.rate : 1.0,
},
}),
};
const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}`, requestOptions);
if (!response.ok) {
const errorText = await response.text();
console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
const err = new Error(`Failed to synthesize speech: ${response.status} ${response.statusText} - ${errorText}`);
err.status = response.status;
throw err;
}
const arrayBuffer = await response.arrayBuffer();
audioData = new Uint8Array(arrayBuffer);
// Create estimated word timings if no timing data available
this._createEstimatedWordTimings(preparedText);
}
// Convert to WAV if requested (since we always get MP3 from ElevenLabs)
if (options?.format === "wav") {
audioData = await this.convertMp3ToWav(audioData);
}
return audioData;
}
catch (error) {
console.error("Error synthesizing speech:", error);
throw error;
}
}
/**
* Synthesize text to a byte stream
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and word boundaries array
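* @example
* // Sketch: the stream currently yields a single chunk containing the full audio
* const { audioStream, wordBoundaries } = await tts.synthToBytestream("Hi there", { useTimestamps: true });
* const { value: audio } = await audioStream.getReader().read(); // Uint8Array
* console.log(wordBoundaries); // [{ text, offset, duration }] in 1/10,000-second units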
*/
async synthToBytestream(text, options) {
try {
// Use voice from options or the default voice
const voiceId = options?.voice || this.voiceId || "21m00Tcm4TlvDq8ikWAM"; // Default voice (Rachel)
// Prepare text for synthesis (strip SSML tags)
const preparedText = await this.prepareText(text, options);
// Check if we need timing data
const useTimestamps = options?.useTimestamps || options?.useWordBoundary;
let audioData;
let wordBoundaries = [];
if (useTimestamps) {
// Use the with-timestamps endpoint for timing data
const timestampResponse = await this.synthWithTimestamps(preparedText, voiceId);
// Decode base64 audio data
const audioBase64 = timestampResponse.audio_base64;
const audioBuffer = Buffer.from(audioBase64, 'base64');
audioData = new Uint8Array(audioBuffer);
// Convert character timing to word boundaries
if (timestampResponse.alignment) {
wordBoundaries = this.convertCharacterTimingToWordBoundaries(preparedText, timestampResponse.alignment);
}
}
else {
// Use the regular streaming endpoint (no timing data)
const requestOptions = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": this.apiKey,
},
body: JSON.stringify({
text: preparedText,
model_id: "eleven_monolingual_v1",
output_format: "mp3_44100_128",
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
use_speaker_boost: true,
style: 0,
speed: typeof this.properties.rate === "number" ? this.properties.rate : 1.0,
},
}),
};
const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}/stream`, requestOptions);
if (!response.ok) {
const errorText = await response.text();
console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
const err = new Error(`Failed to synthesize speech stream: ${response.status} ${response.statusText} - ${errorText}`);
err.status = response.status;
throw err;
}
const responseArrayBuffer = await response.arrayBuffer();
audioData = new Uint8Array(responseArrayBuffer);
}
// Convert to WAV if requested (since we always get MP3 from ElevenLabs)
if (options?.format === "wav") {
audioData = await this.convertMp3ToWav(audioData);
}
// Create a ReadableStream from the Uint8Array
const readableStream = new ReadableStream({
start(controller) {
controller.enqueue(audioData);
controller.close();
},
});
return { audioStream: readableStream, wordBoundaries };
}
catch (error) {
console.error("Error synthesizing speech stream:", error);
throw error;
}
}
/**
* Call ElevenLabs API with timestamps endpoint
* @param text Text to synthesize
* @param voiceId Voice ID to use
* @returns Promise resolving to timestamp response
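* @example
* // Response fields consumed by callers (shape inferred from usage; illustrative):
* // { audio_base64: "...", alignment: { character_start_times_seconds: [...], character_end_times_seconds: [...] } }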
*/
async synthWithTimestamps(text, voiceId) {
const requestOptions = {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": this.apiKey,
},
body: JSON.stringify({
text: text,
model_id: "eleven_monolingual_v1",
output_format: "mp3_44100_128",
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
use_speaker_boost: true,
style: 0,
speed: typeof this.properties.rate === "number" ? this.properties.rate : 1.0,
},
}),
};
const response = await fetch(`${this.baseUrl}/text-to-speech/${voiceId}/with-timestamps`, requestOptions);
if (!response.ok) {
const errorText = await response.text();
console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
const err = new Error(`Failed to synthesize speech with timestamps: ${response.status} ${response.statusText} - ${errorText}`);
err.status = response.status;
throw err;
}
return await response.json();
}
/**
* Convert character-level timing data to word boundaries
* @param text Original text
* @param alignment Character alignment data from ElevenLabs
* @returns Array of word boundary objects
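* @example
* // Worked sketch: "hi there" (8 chars) with per-char starts [0, 0.1, ..., 0.7]s and ends shifted +0.1s
* // "hi"    -> { text: "hi",    offset: 0,    duration: 2000 }  // 0.0s to 0.2s
* // "there" -> { text: "there", offset: 3000, duration: 5000 }  // 0.3s to 0.8s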
*/
convertCharacterTimingToWordBoundaries(text, alignment) {
const wordBoundaries = [];
// Split text into words while preserving positions
const words = [];
const wordRegex = /\S+/g;
let match;
while ((match = wordRegex.exec(text)) !== null) {
words.push({
word: match[0],
startIndex: match.index,
endIndex: match.index + match[0].length - 1
});
}
// Convert each word to boundary data using character timing
for (const wordInfo of words) {
// Find the character timing for the start and end of this word
const startCharIndex = wordInfo.startIndex;
const endCharIndex = wordInfo.endIndex;
// Make sure we have timing data for these character positions
if (startCharIndex < alignment.character_start_times_seconds.length &&
endCharIndex < alignment.character_end_times_seconds.length) {
const startTime = alignment.character_start_times_seconds[startCharIndex];
const endTime = alignment.character_end_times_seconds[endCharIndex];
wordBoundaries.push({
text: wordInfo.word,
offset: Math.round(startTime * 10000), // Convert seconds to 1/10,000-second (100 µs) units
duration: Math.round((endTime - startTime) * 10000)
});
}
}
return wordBoundaries;
}
/**
* Start playback with word boundary callbacks
* @param text Text to speak
* @param callback Callback function for word boundaries
* @param options Synthesis options
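* @example
* // Sketch: the callback receives the base class's "boundary" event arguments
* // (assumed here to be word text plus start/end times; see AbstractTTSClient)
* await tts.startPlaybackWithCallbacks("Hello world", (word, start, end) => {
*   console.log(word, start, end);
* });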
*/
async startPlaybackWithCallbacks(text, callback, options) {
// Register the callback
this.on("boundary", callback);
// Enable timestamps for better word boundary accuracy
const enhancedOptions = {
...options,
useTimestamps: true
};
// Start playback
await this.speakStreamed(text, enhancedOptions);
}
/**
* Map ElevenLabs voice objects to unified format
* @param rawVoices Array of ElevenLabs voice objects
* @returns Promise resolving to an array of unified voice objects
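* @example
* // Illustrative mapping (note: labels.accent is used as-is for bcp47, so non-tag
* // accents pass through unchanged):
* // { voice_id: "x", name: "Rachel", labels: { accent: "en-US" } }
* //   => { id: "x", name: "Rachel", gender: undefined,
* //        languageCodes: [{ bcp47: "en-US", iso639_3: "en", display: "en-US" }],
* //        provider: "elevenlabs" }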
*/
async _mapVoicesToUnified(rawVoices) {
// Map raw voices directly without language normalization for now
return rawVoices.map((voice) => ({
id: voice.voice_id,
name: voice.name,
gender: undefined, // labels.gender is not mapped in list results; getVoice maps it for single lookups
languageCodes: [
{
bcp47: voice.labels?.accent || "en-US",
iso639_3: (voice.labels?.accent || "en-US").split("-")[0] || "eng",
display: voice.labels?.accent || "English",
},
],
provider: "elevenlabs",
}));
}
/**
* Get voice by ID
* @param voiceId Voice ID
* @returns Promise resolving to voice details
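* @example
* // Sketch: resolves to null on a 404, otherwise a unified voice object
* const voice = await tts.getVoice("21m00Tcm4TlvDq8ikWAM");
* console.log(voice?.gender); // "Female" | "Male" | "Unknown" (from labels.gender)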
*/
async getVoice(voiceId) {
try {
const response = await fetch(`${this.baseUrl}/voices/${voiceId}`, {
method: "GET",
headers: {
"xi-api-key": this.apiKey,
},
});
if (!response.ok) {
if (response.status === 404) {
return null;
}
const errorText = await response.text();
console.error(`ElevenLabs API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
throw new Error(`Failed to get voice: ${response.statusText}`);
}
const voice = await response.json();
// Map to unified format using the same logic as _mapVoicesToUnified
const unifiedVoice = {
id: voice.voice_id,
name: voice.name,
gender: voice.labels?.gender === "female"
? "Female"
: voice.labels?.gender === "male"
? "Male"
: "Unknown",
languageCodes: [
{
bcp47: voice.labels?.language || "en-US",
iso639_3: voice.labels?.language?.split("-")[0] || "eng",
display: voice.labels?.accent || "English",
},
],
provider: "elevenlabs",
};
return unifiedVoice;
}
catch (error) {
console.error("Error getting voice:", error);
throw error;
}
}
/**
* Convert MP3 audio data to WAV format using the audio converter utility
* @param mp3Data MP3 audio data from ElevenLabs
* @returns WAV audio data
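* @example
* // Sketch: best-effort conversion; on any failure the original MP3 bytes come back unchanged
* const wav = await tts.convertMp3ToWav(mp3Bytes); // mp3Bytes: Uint8Array of MP3 audio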
*/
async convertMp3ToWav(mp3Data) {
try {
// Import the audio converter utility (Node-only) using a truly dynamic import
const dyn = new Function('m', 'return import(m)');
const { convertAudioFormat } = await dyn("../utils/audio-converter");
// Convert MP3 to WAV
const result = await convertAudioFormat(mp3Data, "wav");
return result.audioBytes;
}
catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
console.warn("Failed to convert MP3 to WAV, returning original MP3 data:", errorMessage);
// Fallback: return the original MP3 data
// The playback system should handle MP3 files even when WAV was requested
return mp3Data;
}
}
}
exports.ElevenLabsTTSClient = ElevenLabsTTSClient;
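// Usage sketch (CommonJS; the import path is illustrative, adjust to your setup):
// const { ElevenLabsTTSClient } = require("js-tts-wrapper");
// const tts = new ElevenLabsTTSClient(); // reads ELEVENLABS_API_KEY from the environment
// tts.synthToBytes("Hello from ElevenLabs").then((mp3) => {
//   // mp3 is a Uint8Array of MP3 audio; pass { format: "wav" } to request WAV instead
// });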