js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
675 lines (674 loc) • 28.1 kB
JavaScript
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.PlayHTTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const word_timing_estimator_1 = require("../utils/word-timing-estimator");
const fetch_utils_1 = require("../utils/fetch-utils");
// Node-only imports moved inside Node-only code paths below for browser compatibility.
// Get the fetch implementation for the current environment
const fetch = (0, fetch_utils_1.getFetch)();
/**
* PlayHT TTS Client
*
* This client uses the PlayHT API to convert text to speech.
* It supports streaming audio but does not support SSML.
* Word boundaries are estimated since PlayHT doesn't provide word events.
*/
class PlayHTTTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new PlayHT TTS Client
* @param credentials PlayHT API credentials
*/
constructor(credentials = {}) {
super(credentials);
Object.defineProperty(this, "apiKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "userId", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "voice", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "voiceEngine", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "outputFormat", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "lastWordBoundaries", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
// Set credentials
this.apiKey = credentials.apiKey || process.env.PLAYHT_API_KEY || "";
this.userId = credentials.userId || process.env.PLAYHT_USER_ID || "";
// Set default values
this.voice = "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json";
this.voiceEngine = "PlayHT2.0"; // Use PlayHT2.0 for cloned voices
this.outputFormat = "mp3"; // Use MP3 as default for better compatibility
}
/**
* Check if the credentials are valid
* @returns Promise resolving to true if credentials are valid, false otherwise
*/
async checkCredentials() {
if (!this.apiKey || !this.userId) {
console.error("PlayHT API key and user ID are required");
return false;
}
try {
// Try to list voices to check if the API key is valid
await this._fetchVoices();
return true;
}
catch (error) {
console.error("Error checking PlayHT credentials:", error);
return false;
}
}
/**
* Fetch voices from the PlayHT API
* @returns Promise resolving to an array of PlayHT voice objects
*/
async _fetchVoices() {
try {
// Fetch standard voices
const standardResponse = await fetch("https://api.play.ht/api/v2/voices", {
method: "GET",
headers: {
accept: "application/json",
"AUTHORIZATION": this.apiKey,
"X-USER-ID": this.userId,
},
});
if (!standardResponse.ok) {
throw new Error(`Failed to fetch PlayHT voices: ${standardResponse.statusText}`);
}
const standardVoices = await standardResponse.json();
// Fetch cloned voices
const clonedResponse = await fetch("https://api.play.ht/api/v2/cloned-voices", {
method: "GET",
headers: {
accept: "application/json",
"AUTHORIZATION": this.apiKey,
"X-USER-ID": this.userId,
},
});
if (!clonedResponse.ok) {
throw new Error(`Failed to fetch PlayHT cloned voices: ${clonedResponse.statusText}`);
}
const clonedVoices = await clonedResponse.json();
// Merge standard and cloned voices
return [...standardVoices, ...clonedVoices];
}
catch (error) {
console.error("Error fetching PlayHT voices:", error);
throw error;
}
}
/**
* Get available voices
* @returns Promise resolving to an array of unified voice objects
*/
async _getVoices() {
try {
const rawVoices = await this._fetchVoices();
return this._mapVoicesToUnified(rawVoices);
}
catch (error) {
console.error("Error getting PlayHT voices:", error);
return [];
}
}
/**
* Map PlayHT voice objects to unified format
* @param rawVoices Array of PlayHT voice objects
* @returns Promise resolving to an array of unified voice objects
*/
async _mapVoicesToUnified(rawVoices) {
// Track seen voice IDs to handle duplicates
const seenVoiceIds = new Set();
const unifiedVoices = [];
for (const voice of rawVoices) {
// Create language code object
const languageCode = {
bcp47: voice.language_code || "en-US",
iso639_3: voice.language_code ? voice.language_code.split("-")[0] : "eng",
display: voice.language || "English (US)",
};
const voiceId = voice.id;
// Handle duplicate voice IDs by appending a suffix
let uniqueId = voiceId;
if (seenVoiceIds.has(voiceId)) {
// If this is a duplicate, append the voice name to make it unique
uniqueId = `${voiceId}#${voice.name}`;
console.warn(`Found duplicate voice ID: ${voiceId}. Using ${uniqueId} instead.`);
}
// Add the voice ID to the set of seen IDs
seenVoiceIds.add(voiceId);
unifiedVoices.push({
id: uniqueId,
name: voice.name,
gender: voice.gender || "Unknown",
provider: "playht",
languageCodes: [languageCode],
});
}
return unifiedVoices;
}
/**
* Set the voice to use for synthesis
* @param voiceId Voice ID to use
*/
setVoice(voiceId) {
// If the voice ID contains a '#' character, it's a modified ID to handle duplicates
// Extract the original ID (everything before the '#')
if (voiceId.includes('#')) {
const originalId = voiceId.split('#')[0];
this.voice = originalId;
console.log(`Using original voice ID: ${originalId} (from modified ID: ${voiceId})`);
}
else {
this.voice = voiceId;
}
// Auto-detect voice engine based on voice ID
this.autoDetectVoiceEngine(voiceId);
}
/**
* Auto-detect voice engine based on voice ID
* @param voiceId Voice ID to analyze
*/
autoDetectVoiceEngine(voiceId) {
// Extract the original voice ID if it has a '#' suffix
const originalVoiceId = voiceId.includes('#') ? voiceId.split('#')[0] : voiceId;
// Cloned voices (s3:// URLs) work better with PlayHT2.0
if (originalVoiceId.startsWith('s3://')) {
this.voiceEngine = "PlayHT2.0";
console.log(`Auto-detected cloned voice, using PlayHT2.0 engine`);
}
else {
// Standard voices work with PlayHT1.0
this.voiceEngine = "PlayHT1.0";
console.log(`Auto-detected standard voice, using PlayHT1.0 engine`);
}
}
/**
* Set the voice engine to use for synthesis
* @param engine Voice engine to use
*/
setVoiceEngine(engine) {
this.voiceEngine = engine;
}
/**
* Set the output format
* @param format Output format (wav, mp3)
*/
setOutputFormat(format) {
this.outputFormat = format;
}
/**
* Get a property value
* @param property Property name
* @returns Property value
*/
getProperty(property) {
switch (property) {
case "voice":
return this.voice;
case "voiceEngine":
return this.voiceEngine;
case "outputFormat":
return this.outputFormat;
default:
return super.getProperty(property);
}
}
/**
* Set a property value
* @param property Property name
* @param value Property value
*/
setProperty(property, value) {
switch (property) {
case "voice":
this.setVoice(value);
break;
case "voiceEngine":
this.setVoiceEngine(value);
break;
case "outputFormat":
this.setOutputFormat(value);
break;
default:
super.setProperty(property, value);
break;
}
}
/**
* Get the last word boundaries
* @returns Array of word boundary objects
*/
getLastWordBoundaries() {
return this.lastWordBoundaries;
}
/**
* Set the last word boundaries
* @param wordBoundaries Array of word boundary objects
*/
setLastWordBoundaries(wordBoundaries) {
this.lastWordBoundaries = wordBoundaries;
}
/**
* Convert text to speech
* @param text Text to convert to speech
* @param options TTS options
* @returns Promise resolving to the path of the generated audio file
*/
async textToSpeech(text, options = {}) {
try {
if (typeof window !== "undefined") {
throw new Error("File output is not supported in the browser. Use synthToBytes or synthToBytestream instead.");
}
const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
const path = await Promise.resolve().then(() => __importStar(require("node:path")));
// Create output directory if it doesn't exist
const outputDir = options.outputDir || ".";
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
// Generate output file path
const outputFile = options.outputFile || `playht-output.${this.outputFormat}`;
const outputPath = path.join(outputDir, outputFile);
// Create speech
const response = await fetch("https://api.play.ht/api/v2/tts", {
method: "POST",
headers: {
accept: "application/json",
"content-type": "application/json",
"AUTHORIZATION": this.apiKey,
"X-USER-ID": this.userId,
},
body: JSON.stringify({
text,
voice: this.voice,
output_format: this.outputFormat,
voice_engine: this.voiceEngine,
}),
});
if (!response.ok) {
throw new Error(`Failed to convert text to speech: ${response.statusText}`);
}
const data = await response.json();
// Download the audio file
const audioResponse = await fetch(data.url);
if (!audioResponse.ok) {
throw new Error(`Failed to download audio file: ${audioResponse.statusText}`);
}
const buffer = Buffer.from(await audioResponse.arrayBuffer());
fs.writeFileSync(outputPath, buffer);
// Estimate word boundaries
if (options.onWord || options.returnWordBoundaries) {
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
// Call onWord callback for each word
if (options.onWord) {
for (const wb of wordBoundaries) {
options.onWord(wb);
}
}
// Store word boundaries if requested
if (options.returnWordBoundaries) {
this.setLastWordBoundaries(wordBoundaries);
}
}
else {
// Always estimate word boundaries for tests
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
this.setLastWordBoundaries(wordBoundaries);
}
// Call onEnd callback
if (options.onEnd) {
options.onEnd();
}
return outputPath;
}
catch (error) {
console.error("Error converting text to speech:", error);
throw error;
}
}
/**
* Convert text to speech with streaming
* @param text Text to convert to speech
* @param options TTS options
* @returns Promise resolving to the path of the generated audio file
*/
async textToSpeechStreaming(text, options = {}) {
try {
if (typeof window !== "undefined") {
throw new Error("File output is not supported in the browser. Use synthToBytes or synthToBytestream instead.");
}
const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
const path = await Promise.resolve().then(() => __importStar(require("node:path")));
// Create output directory if it doesn't exist
const outputDir = options.outputDir || ".";
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
// Generate output file path
const outputFile = options.outputFile || `playht-streaming-output.${this.outputFormat}`;
const outputPath = path.join(outputDir, outputFile);
// Create speech with streaming - use the regular API since the streaming API returns a WAV file directly
const response = await fetch("https://api.play.ht/api/v2/tts", {
method: "POST",
headers: {
accept: "application/json",
"content-type": "application/json",
"AUTHORIZATION": this.apiKey,
"X-USER-ID": this.userId,
},
body: JSON.stringify({
text,
voice: this.voice,
output_format: this.outputFormat,
voice_engine: this.voiceEngine,
}),
});
if (!response.ok) {
const errorText = await response.text();
console.error(`PlayHT API error: ${response.status} ${response.statusText}\nResponse: ${errorText}`);
throw new Error(`Failed to convert text to speech with streaming: ${response.statusText}`);
}
const data = await response.json();
// Poll for the result
const jobId = data.id;
if (!jobId) {
throw new Error(`PlayHT API did not return a job ID: ${JSON.stringify(data)}`);
}
// Get the job status URL
const jobStatusUrl = `https://api.play.ht/api/v2/tts/${jobId}`;
// Poll for the result
let audioUrl = null;
let attempts = 0;
const maxAttempts = 30; // Maximum number of polling attempts
const pollingInterval = 1000; // Polling interval in milliseconds
while (!audioUrl && attempts < maxAttempts) {
attempts++;
console.log(`Polling for streaming result (attempt ${attempts}/${maxAttempts})...`);
// Wait for the polling interval
await new Promise(resolve => setTimeout(resolve, pollingInterval));
// Get the job status
const statusResponse = await fetch(jobStatusUrl, {
method: "GET",
headers: {
accept: "application/json",
"AUTHORIZATION": this.apiKey,
"X-USER-ID": this.userId,
},
});
if (!statusResponse.ok) {
throw new Error(`Failed to get job status: ${statusResponse.statusText}`);
}
const statusData = await statusResponse.json();
console.log(`Streaming job status: ${statusData.status}`);
// Check if the job is completed (using multiple possible status strings and URL paths)
const isSuccessStatus = statusData.status === "completed" || statusData.status === "complete" || statusData.status === "SUCCESS";
let potentialUrl = null;
if (statusData.output && statusData.output.url) {
potentialUrl = statusData.output.url;
}
else if (statusData.metadata && statusData.metadata.output && Array.isArray(statusData.metadata.output) && statusData.metadata.output.length > 0) {
potentialUrl = statusData.metadata.output[0];
}
if (isSuccessStatus && potentialUrl) {
audioUrl = potentialUrl;
console.log(`Streaming job finished successfully. Audio URL: ${audioUrl}`);
break;
}
// Check if the job failed
if (statusData.status === "failed") {
throw new Error(`Streaming job failed: ${JSON.stringify(statusData)}`);
}
}
if (!audioUrl) {
throw new Error(`Timed out waiting for streaming job to complete after ${maxAttempts} attempts`);
}
// Download the audio file
const audioResponse = await fetch(audioUrl);
if (!audioResponse.ok) {
throw new Error(`Failed to download streaming audio file: ${audioResponse.statusText}`);
}
const buffer = Buffer.from(await audioResponse.arrayBuffer());
fs.writeFileSync(outputPath, buffer);
// Estimate word boundaries
if (options.onWord || options.returnWordBoundaries) {
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
// Call onWord callback for each word
if (options.onWord) {
for (const wb of wordBoundaries) {
options.onWord(wb);
}
}
// Store word boundaries if requested
if (options.returnWordBoundaries) {
this.setLastWordBoundaries(wordBoundaries);
}
}
else {
// Always estimate word boundaries for tests
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
this.setLastWordBoundaries(wordBoundaries);
}
// Call onEnd callback
if (options.onEnd) {
options.onEnd();
}
return outputPath;
}
catch (error) {
console.error("Error converting text to speech with streaming:", error);
throw error;
}
}
/**
* Convert SSML to speech (not supported by PlayHT)
* @param ssml SSML to convert to speech
* @param options TTS options
* @returns Promise resolving to the path of the generated audio file
*/
async ssmlToSpeech(_ssml, _options = {}) {
throw new Error("SSML is not supported by PlayHT TTS");
}
/**
* Convert SSML to speech with streaming (not supported by PlayHT)
* @param ssml SSML to convert to speech
* @param options TTS options
* @returns Promise resolving to the path of the generated audio file
*/
async ssmlToSpeechStreaming(_ssml, _options = {}) {
throw new Error("SSML is not supported by PlayHT TTS");
}
/**
* Synthesize text to audio and save it to a file
* @param text Text or SSML to synthesize
* @param filename Filename to save as
* @param format Audio format (mp3 or wav)
* @param options Synthesis options
*/
async synthToFile(text, filename, format = "mp3", // Default to MP3 for PlayHT
options) {
// PlayHT works best with MP3, so we'll always use MP3 internally
// and warn if a different format is requested
if (format !== "mp3") {
console.warn(`PlayHT TTS works best with MP3 format. Converting ${format} request to MP3.`);
}
// Use MP3 as the native format
const audioBytes = await this.synthToBytes(text, { ...options, format: 'mp3' });
// Handle file saving (use requested filename but MP3 content)
if (typeof window !== "undefined") {
// Browser environment
const mimeType = "audio/mpeg";
const blob = new Blob([audioBytes], { type: mimeType });
const url = URL.createObjectURL(blob);
const a = document.createElement("a");
a.href = url;
// Use the requested filename as-is (even if it has .wav extension)
a.download = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
document.body.appendChild(a);
a.click();
requestAnimationFrame(() => {
if (document?.body?.contains(a)) {
document.body.removeChild(a);
}
URL.revokeObjectURL(url);
});
}
else {
// Node.js environment
const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
// Use the requested filename as-is (even if it has .wav extension)
const outputPath = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
fs.writeFileSync(outputPath, Buffer.from(audioBytes));
}
}
/**
* Synthesize text to audio bytes
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
*/
async synthToBytes(text, options) {
try {
console.debug('PlayHT synthToBytes: Calling synthToBytestream internally...');
// For PlayHT, we'll always use MP3 as the native format for better compatibility
const audioStream = await this.synthToBytestream(text, options);
if (!audioStream) {
throw new Error('synthToBytestream returned null, cannot generate Buffer.');
}
console.debug('PlayHT synthToBytes: Buffering stream...');
// Helper function to read the entire stream into a Buffer
const streamToBuffer = async (stream) => {
const reader = stream.getReader();
const chunks = [];
let totalLength = 0;
while (true) {
const { done, value } = await reader.read();
if (done) {
break;
}
chunks.push(value);
totalLength += value.length;
}
return Buffer.concat(chunks, totalLength);
};
const buffer = await streamToBuffer(audioStream.audioStream);
console.debug(`PlayHT synthToBytes: Buffering complete (${buffer.length} bytes).`);
return buffer;
}
catch (error) {
console.error("Error in PlayHT synthToBytes (using streaming internally):", error);
throw error; // Re-throw the error
}
}
/**
* Synthesize text to audio byte stream
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and an empty word boundaries array.
*/
async synthToBytestream(text, _options = {}) {
try {
// PlayHT works best with MP3 format, especially for cloned voices
// Use MP3 as the native format regardless of what's requested
const nativeFormat = 'mp3';
const acceptHeader = 'audio/mpeg';
const response = await fetch("https://api.play.ht/api/v2/tts/stream", {
method: "POST",
headers: {
'accept': acceptHeader,
'content-type': 'application/json',
'AUTHORIZATION': this.apiKey,
'X-USER-ID': this.userId,
},
body: JSON.stringify({
text: text, // Parameter is string, no need for conditional
voice: this.voice,
output_format: nativeFormat,
voice_engine: this.voiceEngine, // Ensure this is set appropriately
// Add other relevant options like speed, sample_rate if needed
}),
});
if (!response.ok) {
// Attempt to read error response body for more details
let errorBody = '';
try {
errorBody = await response.text();
}
catch (e) { /* Ignore error reading body */ }
console.error(`PlayHT Streaming API error: ${response.status} ${response.statusText}\nResponse Body: ${errorBody}`);
throw new Error(`Failed to stream text to speech: ${response.status} ${response.statusText}`);
}
// The response body is the audio stream
if (!response.body) {
throw new Error('PlayHT Streaming API did not return a response body stream.');
}
// Return the stream along with an empty word boundaries array
return { audioStream: response.body, wordBoundaries: [] };
}
catch (error) {
console.error("Error converting text to speech stream:", error);
throw error;
}
}
}
exports.PlayHTTTSClient = PlayHTTTSClient;
;