UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

github.com/willwade/js-tts-wrapper

willwade/js-tts-wrapper

694 lines (693 loc) • 28.5 kB

JavaScript

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.PlayHTTTSClient = void 0; const abstract_tts_1 = require("../core/abstract-tts"); const word_timing_estimator_1 = require("../utils/word-timing-estimator"); const fetch_utils_1 = require("../utils/fetch-utils"); const fs = __importStar(require("node:fs")); const path = __importStar(require("node:path")); // Get the fetch implementation for the current environment const fetch = (0, fetch_utils_1.getFetch)(); /** * PlayHT TTS Client * * This client uses the PlayHT API to convert text to speech. * It supports streaming audio but does not support SSML. * Word boundaries are estimated since PlayHT doesn't provide word events. */ class PlayHTTTSClient extends abstract_tts_1.AbstractTTSClient { /** * Create a new PlayHT TTS Client * @param credentials PlayHT API credentials */ constructor(credentials = {}) { super(credentials); Object.defineProperty(this, "apiKey", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "userId", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "voice", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "voiceEngine", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "outputFormat", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "lastWordBoundaries", { enumerable: true, configurable: true, writable: true, value: [] }); // Set credentials this.apiKey = credentials.apiKey || process.env.PLAYHT_API_KEY || ""; this.userId = credentials.userId || process.env.PLAYHT_USER_ID || ""; // Set default values this.voice = "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json"; this.voiceEngine = "PlayHT1.0"; this.outputFormat = "wav"; } /** * Check if the credentials are valid * @returns Promise resolving to true if credentials are valid, false otherwise */ async checkCredentials() { if (!this.apiKey || !this.userId) { console.error("PlayHT API key and user ID are required"); return false; } try { // Try to list voices to check if the API key is valid await this._fetchVoices(); return true; } catch (error) { console.error("Error checking PlayHT credentials:", error); return false; } } /** * Fetch voices from the PlayHT API * @returns Promise resolving to an array of PlayHT voice objects */ async _fetchVoices() { try { // Fetch standard voices const standardResponse = await fetch("https://api.play.ht/api/v2/voices", { method: "GET", headers: { accept: "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, }); if (!standardResponse.ok) { throw new Error(`Failed to fetch PlayHT voices: ${standardResponse.statusText}`); } const standardVoices = await standardResponse.json(); // Fetch cloned voices const clonedResponse = await fetch("https://api.play.ht/api/v2/cloned-voices", { method: "GET", headers: { accept: "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, }); if (!clonedResponse.ok) { throw new Error(`Failed to fetch PlayHT cloned voices: ${clonedResponse.statusText}`); } const clonedVoices = await clonedResponse.json(); // Merge standard and cloned voices return [...standardVoices, ...clonedVoices]; } catch (error) { console.error("Error fetching PlayHT voices:", error); throw error; } } /** * Get available voices * @returns Promise resolving to an array of unified voice objects */ async _getVoices() { try { const rawVoices = await this._fetchVoices(); return this._mapVoicesToUnified(rawVoices); } catch (error) { console.error("Error getting PlayHT voices:", error); return []; } } /** * Map PlayHT voice objects to unified format * @param rawVoices Array of PlayHT voice objects * @returns Promise resolving to an array of unified voice objects */ async _mapVoicesToUnified(rawVoices) { // Track seen voice IDs to handle duplicates const seenVoiceIds = new Set(); const unifiedVoices = []; for (const voice of rawVoices) { // Create language code object const languageCode = { bcp47: voice.language_code || "en-US", iso639_3: voice.language_code ? voice.language_code.split("-")[0] : "eng", display: voice.language || "English (US)", }; const voiceId = voice.id; // Handle duplicate voice IDs by appending a suffix let uniqueId = voiceId; if (seenVoiceIds.has(voiceId)) { // If this is a duplicate, append the voice name to make it unique uniqueId = `${voiceId}#${voice.name}`; console.warn(`Found duplicate voice ID: ${voiceId}. Using ${uniqueId} instead.`); } // Add the voice ID to the set of seen IDs seenVoiceIds.add(voiceId); unifiedVoices.push({ id: uniqueId, name: voice.name, gender: voice.gender || "Unknown", provider: "playht", languageCodes: [languageCode], }); } return unifiedVoices; } /** * Set the voice to use for synthesis * @param voiceId Voice ID to use */ setVoice(voiceId) { // If the voice ID contains a '#' character, it's a modified ID to handle duplicates // Extract the original ID (everything before the '#') if (voiceId.includes('#')) { const originalId = voiceId.split('#')[0]; this.voice = originalId; console.log(`Using original voice ID: ${originalId} (from modified ID: ${voiceId})`); } else { this.voice = voiceId; } } /** * Set the voice engine to use for synthesis * @param engine Voice engine to use */ setVoiceEngine(engine) { this.voiceEngine = engine; } /** * Set the output format * @param format Output format (wav, mp3) */ setOutputFormat(format) { this.outputFormat = format; } /** * Get a property value * @param property Property name * @returns Property value */ getProperty(property) { switch (property) { case "voice": return this.voice; case "voiceEngine": return this.voiceEngine; case "outputFormat": return this.outputFormat; default: return super.getProperty(property); } } /** * Set a property value * @param property Property name * @param value Property value */ setProperty(property, value) { switch (property) { case "voice": this.setVoice(value); break; case "voiceEngine": this.setVoiceEngine(value); break; case "outputFormat": this.setOutputFormat(value); break; default: super.setProperty(property, value); break; } } /** * Get the last word boundaries * @returns Array of word boundary objects */ getLastWordBoundaries() { return this.lastWordBoundaries; } /** * Set the last word boundaries * @param wordBoundaries Array of word boundary objects */ setLastWordBoundaries(wordBoundaries) { this.lastWordBoundaries = wordBoundaries; } /** * Convert text to speech * @param text Text to convert to speech * @param options TTS options * @returns Promise resolving to the path of the generated audio file */ async textToSpeech(text, options = {}) { try { // Create output directory if it doesn't exist const outputDir = options.outputDir || "."; if (!fs.existsSync(outputDir)) { fs.mkdirSync(outputDir, { recursive: true }); } // Generate output file path const outputFile = options.outputFile || `playht-output.${this.outputFormat}`; const outputPath = path.join(outputDir, outputFile); // Create speech const response = await fetch("https://api.play.ht/api/v2/tts", { method: "POST", headers: { accept: "application/json", "content-type": "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, body: JSON.stringify({ text, voice: this.voice, output_format: this.outputFormat, voice_engine: this.voiceEngine, }), }); if (!response.ok) { throw new Error(`Failed to convert text to speech: ${response.statusText}`); } const data = await response.json(); // Download the audio file const audioResponse = await fetch(data.url); if (!audioResponse.ok) { throw new Error(`Failed to download audio file: ${audioResponse.statusText}`); } const buffer = Buffer.from(await audioResponse.arrayBuffer()); fs.writeFileSync(outputPath, buffer); // Estimate word boundaries if (options.onWord || options.returnWordBoundaries) { const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text); // Call onWord callback for each word if (options.onWord) { for (const wb of wordBoundaries) { options.onWord(wb); } } // Store word boundaries if requested if (options.returnWordBoundaries) { this.setLastWordBoundaries(wordBoundaries); } } else { // Always estimate word boundaries for tests const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text); this.setLastWordBoundaries(wordBoundaries); } // Call onEnd callback if (options.onEnd) { options.onEnd(); } return outputPath; } catch (error) { console.error("Error converting text to speech:", error); throw error; } } /** * Convert text to speech with streaming * @param text Text to convert to speech * @param options TTS options * @returns Promise resolving to the path of the generated audio file */ async textToSpeechStreaming(text, options = {}) { try { // Create output directory if it doesn't exist const outputDir = options.outputDir || "."; if (!fs.existsSync(outputDir)) { fs.mkdirSync(outputDir, { recursive: true }); } // Generate output file path const outputFile = options.outputFile || `playht-streaming-output.${this.outputFormat}`; const outputPath = path.join(outputDir, outputFile); // Create speech with streaming - use the regular API since the streaming API returns a WAV file directly const response = await fetch("https://api.play.ht/api/v2/tts", { method: "POST", headers: { accept: "application/json", "content-type": "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, body: JSON.stringify({ text, voice: this.voice, output_format: this.outputFormat, voice_engine: this.voiceEngine, }), }); if (!response.ok) { const errorText = await response.text(); console.error(`PlayHT API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`); throw new Error(`Failed to convert text to speech with streaming: ${response.statusText}`); } const data = await response.json(); console.log('PlayHT API streaming response:', JSON.stringify(data, null, 2)); // Poll for the result const jobId = data.id; if (!jobId) { throw new Error(`PlayHT API did not return a job ID: ${JSON.stringify(data)}`); } // Get the job status URL const jobStatusUrl = `https://api.play.ht/api/v2/tts/${jobId}`; // Poll for the result let audioUrl = null; let attempts = 0; const maxAttempts = 30; // Maximum number of polling attempts const pollingInterval = 1000; // Polling interval in milliseconds while (!audioUrl && attempts < maxAttempts) { attempts++; console.log(`Polling for streaming result (attempt ${attempts}/${maxAttempts})...`); // Wait for the polling interval await new Promise(resolve => setTimeout(resolve, pollingInterval)); // Get the job status const statusResponse = await fetch(jobStatusUrl, { method: "GET", headers: { accept: "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, }); if (!statusResponse.ok) { throw new Error(`Failed to get job status: ${statusResponse.statusText}`); } const statusData = await statusResponse.json(); console.log(`Streaming job status: ${statusData.status}`); // Check if the job is completed if ((statusData.status === "completed" || statusData.status === "complete") && statusData.output && statusData.output.url) { audioUrl = statusData.output.url; break; } // Check if the job failed if (statusData.status === "failed") { throw new Error(`Streaming job failed: ${JSON.stringify(statusData)}`); } } if (!audioUrl) { throw new Error(`Timed out waiting for streaming job to complete after ${maxAttempts} attempts`); } // Download the audio file const audioResponse = await fetch(audioUrl); if (!audioResponse.ok) { throw new Error(`Failed to download streaming audio file: ${audioResponse.statusText}`); } // Get the audio data const audioBuffer = await audioResponse.arrayBuffer(); // Create a writable stream to the output file const writer = fs.createWriteStream(outputPath); // Write the audio data to the file writer.write(Buffer.from(audioBuffer)); // Close the writer writer.end(); // Wait for the file to be written await new Promise((resolve, reject) => { writer.on("finish", resolve); writer.on("error", reject); }); // Estimate word boundaries if (options.onWord || options.returnWordBoundaries) { const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text); // Call onWord callback for each word if (options.onWord) { for (const wb of wordBoundaries) { options.onWord(wb); } } // Store word boundaries if requested if (options.returnWordBoundaries) { this.setLastWordBoundaries(wordBoundaries); } } else { // Always estimate word boundaries for tests const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text); this.setLastWordBoundaries(wordBoundaries); } // Call onEnd callback if (options.onEnd) { options.onEnd(); } return outputPath; } catch (error) { console.error("Error converting text to speech with streaming:", error); throw error; } } /** * Convert SSML to speech (not supported by PlayHT) * @param ssml SSML to convert to speech * @param options TTS options * @returns Promise resolving to the path of the generated audio file */ async ssmlToSpeech(_ssml, _options = {}) { throw new Error("SSML is not supported by PlayHT TTS"); } /** * Convert SSML to speech with streaming (not supported by PlayHT) * @param ssml SSML to convert to speech * @param options TTS options * @returns Promise resolving to the path of the generated audio file */ async ssmlToSpeechStreaming(_ssml, _options = {}) { throw new Error("SSML is not supported by PlayHT TTS"); } /** * Synthesize text to audio bytes * @param text Text to synthesize * @param options Synthesis options * @returns Promise resolving to audio bytes */ async synthToBytes(text, _options = {}) { try { // Create speech const response = await fetch("https://api.play.ht/api/v2/tts", { method: "POST", headers: { accept: "application/json", "content-type": "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, body: JSON.stringify({ text: typeof text === 'string' ? text : text.join(' '), voice: this.voice, output_format: this.outputFormat, voice_engine: this.voiceEngine, }), }); if (!response.ok) { const errorText = await response.text(); console.error(`PlayHT API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`); throw new Error(`Failed to convert text to speech: ${response.statusText}`); } const data = await response.json(); console.log('PlayHT API response:', JSON.stringify(data, null, 2)); // Poll for the result const jobId = data.id; if (!jobId) { throw new Error(`PlayHT API did not return a job ID: ${JSON.stringify(data)}`); } // Get the job status URL const jobStatusUrl = `https://api.play.ht/api/v2/tts/${jobId}`; // Poll for the result let audioUrl = null; let attempts = 0; const maxAttempts = 30; // Maximum number of polling attempts const pollingInterval = 1000; // Polling interval in milliseconds while (!audioUrl && attempts < maxAttempts) { attempts++; console.log(`Polling for result (attempt ${attempts}/${maxAttempts})...`); // Wait for the polling interval await new Promise(resolve => setTimeout(resolve, pollingInterval)); // Get the job status const statusResponse = await fetch(jobStatusUrl, { method: "GET", headers: { accept: "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, }); if (!statusResponse.ok) { throw new Error(`Failed to get job status: ${statusResponse.statusText}`); } const statusData = await statusResponse.json(); console.log(`Job status: ${statusData.status}`); console.log(`Job output: ${JSON.stringify(statusData.output)}`); // Check if the job is completed if ((statusData.status === "completed" || statusData.status === "complete") && statusData.output && statusData.output.url) { audioUrl = statusData.output.url; break; } // Check if the job failed if (statusData.status === "failed") { throw new Error(`Job failed: ${JSON.stringify(statusData)}`); } } if (!audioUrl) { throw new Error(`Timed out waiting for job to complete after ${maxAttempts} attempts`); } // Download the audio file const audioResponse = await fetch(audioUrl); if (!audioResponse.ok) { throw new Error(`Failed to download audio file: ${audioResponse.statusText}`); } const buffer = Buffer.from(await audioResponse.arrayBuffer()); return new Uint8Array(buffer); } catch (error) { console.error("Error converting text to speech bytes:", error); throw error; } } /** * Synthesize text to audio byte stream * @param text Text to synthesize * @param options Synthesis options * @returns Promise resolving to audio byte stream */ async synthToBytestream(text, _options = {}) { try { // Create speech with streaming - use the regular API since the streaming API returns a WAV file directly const response = await fetch("https://api.play.ht/api/v2/tts", { method: "POST", headers: { accept: "application/json", "content-type": "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, body: JSON.stringify({ text, voice: this.voice, output_format: this.outputFormat, voice_engine: this.voiceEngine, }), }); if (!response.ok) { const errorText = await response.text(); console.error(`PlayHT API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`); throw new Error(`Failed to convert text to speech with streaming: ${response.statusText}`); } const data = await response.json(); console.log('PlayHT API bytestream response:', JSON.stringify(data, null, 2)); // Poll for the result const jobId = data.id; if (!jobId) { throw new Error(`PlayHT API did not return a job ID: ${JSON.stringify(data)}`); } // Get the job status URL const jobStatusUrl = `https://api.play.ht/api/v2/tts/${jobId}`; // Poll for the result let audioUrl = null; let attempts = 0; const maxAttempts = 30; // Maximum number of polling attempts const pollingInterval = 1000; // Polling interval in milliseconds while (!audioUrl && attempts < maxAttempts) { attempts++; console.log(`Polling for bytestream result (attempt ${attempts}/${maxAttempts})...`); // Wait for the polling interval await new Promise(resolve => setTimeout(resolve, pollingInterval)); // Get the job status const statusResponse = await fetch(jobStatusUrl, { method: "GET", headers: { accept: "application/json", "AUTHORIZATION": this.apiKey, "X-USER-ID": this.userId, }, }); if (!statusResponse.ok) { throw new Error(`Failed to get job status: ${statusResponse.statusText}`); } const statusData = await statusResponse.json(); console.log(`Bytestream job status: ${statusData.status}`); // Check if the job is completed if ((statusData.status === "completed" || statusData.status === "complete") && statusData.output && statusData.output.url) { audioUrl = statusData.output.url; break; } // Check if the job failed if (statusData.status === "failed") { throw new Error(`Bytestream job failed: ${JSON.stringify(statusData)}`); } } if (!audioUrl) { throw new Error(`Timed out waiting for bytestream job to complete after ${maxAttempts} attempts`); } // Download the audio file const audioResponse = await fetch(audioUrl); if (!audioResponse.ok) { throw new Error(`Failed to download bytestream audio file: ${audioResponse.statusText}`); } // Get the audio data as a ReadableStream return audioResponse.body; } catch (error) { console.error("Error converting text to speech stream:", error); throw error; } } } exports.PlayHTTTSClient = PlayHTTTSClient;