js-tts-wrapper

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services.
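What follows is the package's compiled CommonJS source for AbstractTTSClient, the abstract base class that every engine-specific client extends. For orientation, here is a minimal consumption sketch. It is hypothetical: "ExampleTTSClient" is a placeholder for whichever concrete engine client the package exports, and the voice "id" field is an assumption about the UnifiedVoice shape; getVoicesByLanguage, setVoice, connect, on, and speak are all defined on the base class in the source below.

// Hypothetical usage sketch -- "ExampleTTSClient" is a placeholder, not a real export.
const { ExampleTTSClient } = require("js-tts-wrapper");

async function main() {
  const tts = new ExampleTTSClient({ /* provider-specific credentials */ });

  // Filter voices by BCP-47 language tag, then pick one ("id" is an assumed field).
  const voices = await tts.getVoicesByLanguage("en-US");
  tts.setVoice(voices[0].id, "en-US");

  // Lifecycle and word-boundary events, as implemented by the base class.
  tts.connect("onStart", () => console.log("playback started"));
  tts.connect("onEnd", () => console.log("playback finished"));
  tts.on("boundary", (word, start, end) => console.log(`"${word}" at ${start.toFixed(2)}s`));

  await tts.speak("Hello from js-tts-wrapper!");
}

main().catch(console.error);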
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.AbstractTTSClient = void 0; const builder_1 = require("../ssml/builder"); const language_utils_1 = require("./language-utils"); const SSMLUtils = __importStar(require("./ssml-utils")); const environment_1 = require("../utils/environment"); const node_audio_1 = require("../utils/node-audio"); /** * Abstract base class for all TTS clients * This provides a unified interface for all TTS providers */ class AbstractTTSClient { /** * Creates a new TTS client * @param credentials Provider-specific credentials */ constructor(credentials) { Object.defineProperty(this, "credentials", { enumerable: true, configurable: true, writable: true, value: credentials }); /** * Currently selected voice ID */ Object.defineProperty(this, "voiceId", { enumerable: true, configurable: true, writable: true, value: null }); /** * Currently selected language */ Object.defineProperty(this, "lang", { enumerable: true, configurable: true, writable: true, value: "en-US" }); /** * Event callbacks */ Object.defineProperty(this, "callbacks", { enumerable: true, configurable: true, writable: true, value: {} }); /** * SSML builder instance */ Object.defineProperty(this, "ssml", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * Audio playback properties */ Object.defineProperty(this, "audio", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * TTS properties (rate, pitch, volume) */ Object.defineProperty(this, "properties", { enumerable: true, configurable: true, writable: true, value: { volume: 100, rate: "medium", pitch: "medium", } }); /** * Word timings for the current audio */ Object.defineProperty(this, "timings", { enumerable: true, configurable: true, writable: true, value: [] }); /** * Audio sample rate in Hz * This is used for playback and word timing estimation * Default is 24000 Hz, but engines can override this */ Object.defineProperty(this, "sampleRate", { enumerable: true, configurable: true, writable: true, value: 24000 }); this.ssml = new builder_1.SSMLBuilder(); this.audio = { isPlaying: false, isPaused: false, audioElement: null, position: 0, duration: 0, }; } /** * Get available voices from the provider with normalized language codes * @returns Promise resolving to an array of unified voice 
    async getVoices() {
        // Get raw voices from the engine-specific implementation
        const rawVoices = await this._getVoices();
        // Process and normalize the voices.
        // Each engine should implement _mapVoiceToUnified to convert its raw voice format
        // to a partially filled UnifiedVoice object.
        const voices = await this._mapVoicesToUnified(rawVoices);
        // Normalize language codes for all voices
        return voices.map((voice) => {
            // Normalize language codes for each language
            const normalizedLanguageCodes = voice.languageCodes.map((lang) => {
                const normalized = language_utils_1.LanguageNormalizer.normalize(lang.bcp47);
                return {
                    bcp47: normalized.bcp47,
                    iso639_3: normalized.iso639_3,
                    display: normalized.display,
                };
            });
            // Return the voice with normalized language codes
            return { ...voice, languageCodes: normalizedLanguageCodes };
        });
    }
    // --- Optional overrides ---
    /**
     * Map provider-specific voice objects to unified format
     * @param rawVoices Array of provider-specific voice objects
     * @returns Promise resolving to an array of partially unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        // Default implementation that assumes rawVoices are already in UnifiedVoice format.
        // Engine-specific implementations should override this method.
        return rawVoices;
    }
    /**
     * Speak text using the default audio output, or play audio from file/bytes/stream
     * @param input Text to speak, or audio input (filename, audioBytes, or audioStream)
     * @param options Synthesis options
     * @returns Promise resolving when audio playback starts
     */
    async speak(input, options) {
        // Trigger onStart callback
        this.emit("start");
        try {
            let audioBytes;
            let mimeType;
            // Handle different input types
            if (typeof input === "string") {
                // Traditional text input
                audioBytes = await this.synthToBytes(input, options);
                // Determine MIME type based on options or engine default
                mimeType = "audio/wav"; // default to WAV
                if (options?.format === "mp3") {
                    mimeType = "audio/mpeg";
                }
                else if (options?.format === "ogg") {
                    mimeType = "audio/ogg";
                }
            }
            else {
                // Audio input (file, bytes, or stream)
                const { processAudioInput } = await Promise.resolve().then(() => __importStar(require("../utils/audio-input")));
                const result = await processAudioInput(input);
                audioBytes = result.audioBytes;
                mimeType = result.mimeType;
            }
            // Check if we're in a browser environment
            if (environment_1.isBrowser) {
                // Create audio blob and URL with the correct MIME type
                const blob = new Blob([audioBytes], { type: mimeType });
                const url = URL.createObjectURL(blob);
                // Create and play audio element
                const audio = new Audio();
                // Set up event handlers before setting the source
                audio.oncanplay = async () => {
                    try {
                        this.audio.audioElement = audio;
                        this.audio.isPlaying = true;
                        this.audio.isPaused = false;
                        // Create estimated word timings if needed (only for text input)
                        if (typeof input === "string") {
                            this._createEstimatedWordTimings(input);
                        }
                        // Play the audio
                        await audio.play();
                    }
                    catch (playError) {
                        console.error("Error playing audio:", playError);
                        this.emit("end");
                    }
                };
                audio.onerror = (e) => {
                    console.error("Audio playback error:", e);
                    this.emit("end");
                    URL.revokeObjectURL(url);
                };
                audio.onended = () => {
                    this.emit("end");
                    this.audio.isPlaying = false;
                    URL.revokeObjectURL(url); // Clean up the URL
                };
                // Set the source after setting up event handlers
                audio.src = url;
            }
            else if (environment_1.isNode) {
                // In Node.js environment, try to use sound-play
                try {
                    // Check if Node.js audio playback is available
                    const audioAvailable = await (0, node_audio_1.isNodeAudioAvailable)();
                    if (audioAvailable) {
                        // Emit start event
                        this.emit("start");
                        // Play audio using our node-audio utility.
                        // Pass the engine name to handle Polly audio differently.
                        await (0, node_audio_1.playAudioInNode)(audioBytes, this.sampleRate, this.constructor.name.replace('TTSClient', '').toLowerCase());
                        // Emit end event
                        this.emit("end");
                    }
                    else {
                        console.log("Audio playback in Node.js requires the sound-play package.");
                        console.log("Install it with: npm install js-tts-wrapper[node-audio]");
                        console.log("Or use synthToFile() to save audio to a file and play it with an external player.");
                        this.emit("end");
                    }
                }
                catch (nodeAudioError) {
                    console.error("Error playing audio in Node.js:", nodeAudioError);
                    this.emit("end");
                }
            }
            else {
                // Unknown environment
                console.log("Audio playback is not supported in this environment.");
                console.log("Use synthToFile() to save audio to a file and play it with an external player.");
                this.emit("end");
            }
        }
        catch (error) {
            console.error("Error in speak method:", error);
            this.emit("end"); // Ensure end event is triggered even on error
            throw error;
        }
    }
    /**
     * Speak text using streaming synthesis, or play audio from file/bytes/stream
     * @param input Text to speak, or audio input (filename, audioBytes, or audioStream)
     * @param options Synthesis options
     * @returns Promise resolving when audio playback starts
     */
    async speakStreamed(input, options) {
        // Trigger onStart callback
        this.emit("start");
        try {
            let audioBytes;
            let mimeType;
            let wordBoundaries = [];
            let text = "";
            // Handle different input types
            if (typeof input === "string") {
                // Traditional text input - use streaming synthesis
                text = input;
                const streamResult = await this.synthToBytestream(text, options);
                // Get audio stream and word boundaries
                const audioStream = streamResult.audioStream;
                wordBoundaries = streamResult.wordBoundaries;
                const reader = audioStream.getReader();
                const chunks = [];
                // Read all chunks from the stream
                let result = await reader.read();
                while (!result.done) {
                    chunks.push(result.value);
                    result = await reader.read();
                }
                // Combine chunks into a single audio buffer
                const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0);
                audioBytes = new Uint8Array(totalLength);
                let offset = 0;
                for (const chunk of chunks) {
                    audioBytes.set(chunk, offset);
                    offset += chunk.length;
                }
                // Determine MIME type based on options or engine default
                mimeType = "audio/wav"; // default to WAV
                if (options?.format === "mp3") {
                    mimeType = "audio/mpeg";
                }
                else if (options?.format === "ogg") {
                    mimeType = "audio/ogg";
                }
            }
            else {
                // Audio input (file, bytes, or stream)
                const { processAudioInput } = await Promise.resolve().then(() => __importStar(require("../utils/audio-input")));
                const result = await processAudioInput(input);
                audioBytes = result.audioBytes;
                mimeType = result.mimeType;
                // For audio input, we don't have word boundaries or text.
                // We'll create estimated timings if needed.
                text = ""; // No text available for audio input
            }
            // Use actual word boundaries if available, otherwise create estimated ones
            if (wordBoundaries.length > 0) {
                // Convert the word boundaries to our internal format
                this.timings = wordBoundaries.map((wb) => [
                    wb.offset / 10000, // Convert from 100-nanosecond units to seconds
                    (wb.offset + wb.duration) / 10000,
                    wb.text,
                ]);
            }
            else if (text) {
                // Create estimated word timings only if we have text
                this._createEstimatedWordTimings(text);
            }
            else {
                // No text available (audio input), clear timings
                this.timings = [];
            }
            // Check if we're in a browser environment
            if (environment_1.isBrowser) {
                // Create audio blob and URL with the correct MIME type
                const blob = new Blob([audioBytes], { type: mimeType });
                const url = URL.createObjectURL(blob);
                // Create and play audio element
                const audio = new Audio();
                // Set up event handlers before setting the source
                audio.oncanplay = async () => {
                    try {
                        this.audio.audioElement = audio;
                        this.audio.isPlaying = true;
                        this.audio.isPaused = false;
                        // Play the audio
                        await audio.play();
                    }
                    catch (playError) {
                        console.error("Error playing audio:", playError);
                        this.emit("end");
                    }
                };
                audio.onerror = (e) => {
                    console.error("Audio playback error:", e);
                    this.emit("end");
                    URL.revokeObjectURL(url);
                };
                audio.onended = () => {
                    this.emit("end");
                    this.audio.isPlaying = false;
                    URL.revokeObjectURL(url);
                };
                // Set the source after setting up event handlers
                audio.src = url;
            }
            else if (environment_1.isNode) {
                // In Node.js environment, try to use sound-play
                try {
                    // Check if Node.js audio playback is available
                    const audioAvailable = await (0, node_audio_1.isNodeAudioAvailable)();
                    // Create estimated word timings if needed and we have text
                    if (text) {
                        this._createEstimatedWordTimings(text);
                    }
                    if (audioAvailable) {
                        // Schedule word boundary callbacks
                        this._scheduleWordBoundaryCallbacks();
                        // Play audio using our node-audio utility with the engine's sample rate.
                        // Pass the engine name to handle Polly audio differently.
                        await (0, node_audio_1.playAudioInNode)(audioBytes, this.sampleRate, this.constructor.name.replace('TTSClient', '').toLowerCase());
                        // Emit end event
                        this.emit("end");
                    }
                    else {
                        console.log("Audio playback in Node.js requires the sound-play package.");
                        console.log("Install it with: npm install js-tts-wrapper[node-audio]");
                        console.log("Or use synthToFile() to save audio to a file and play it with an external player.");
                        // Fire word boundary callbacks immediately
                        this._fireWordBoundaryCallbacks();
                        this.emit("end");
                    }
                }
                catch (nodeAudioError) {
                    console.error("Error playing audio in Node.js:", nodeAudioError);
                    this._fireWordBoundaryCallbacks();
                    this.emit("end");
                }
            }
            else {
                // Unknown environment
                console.log("Audio playback is not supported in this environment.");
                console.log("Use synthToFile() to save audio to a file and play it with an external player.");
                // Create estimated word timings if needed and we have text
                if (text) {
                    this._createEstimatedWordTimings(text);
                }
                // Fire word boundary callbacks immediately
                setTimeout(() => {
                    this._fireWordBoundaryCallbacks();
                    this.emit("end");
                }, 100);
            }
        }
        catch (error) {
            console.error("Error in streaming synthesis:", error);
            this.emit("end"); // Ensure end event is triggered even on error
            throw error;
        }
    }
    /**
     * Synthesize text to audio and save it to a file (browser download)
     * @param text Text or SSML to synthesize
     * @param filename Filename to save as
     * @param format Audio format (mp3 or wav)
     * @param options Synthesis options
     */
    async synthToFile(text, filename, format = "wav", options) {
        // Convert text to audio bytes with the specified format
        const audioBytes = await this.synthToBytes(text, { ...options, format });
        if (environment_1.isBrowser) {
            // Create blob with appropriate MIME type
            const mimeType = format === "mp3" ? "audio/mpeg" : "audio/wav";
            const blob = new Blob([audioBytes], { type: mimeType });
            // Create download link
            const url = URL.createObjectURL(blob);
            const a = document.createElement("a");
            a.href = url;
            a.download = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
            // Trigger download
            document.body.appendChild(a);
            a.click();
            // Clean up: use requestAnimationFrame for potentially smoother cleanup
            requestAnimationFrame(() => {
                if (document?.body?.contains(a)) {
                    document.body.removeChild(a);
                }
                URL.revokeObjectURL(url);
            });
        }
        else if (environment_1.isNode) {
            // In Node.js, use the file system
            const outputPath = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
            const fs = await Promise.resolve().then(() => __importStar(require('node:fs')));
            fs.writeFileSync(outputPath, Buffer.from(audioBytes));
        }
        else {
            console.warn("File saving not implemented for this environment.");
        }
    }
    /**
     * Set the voice to use for synthesis
     * @param voiceId Voice ID to use
     * @param lang Language code (optional)
     */
    setVoice(voiceId, lang) {
        this.voiceId = voiceId;
        if (lang) {
            this.lang = lang;
        }
    }
    // --- Playback control methods ---
    /**
     * Pause audio playback
     */
    pause() {
        if (environment_1.isBrowser) {
            // Browser environment - use HTML5 Audio element
            if (this.audio.audioElement && this.audio.isPlaying && !this.audio.isPaused) {
                this.audio.audioElement.pause();
                this.audio.isPaused = true;
            }
        }
        else if (environment_1.isNode) {
            // Node.js environment - use node-speaker
            try {
                // Import dynamically to avoid circular dependencies
                Promise.resolve().then(() => __importStar(require('./node-audio-control'))).then(nodeAudio => {
                    const paused = nodeAudio.pauseAudioPlayback();
                    if (paused) {
                        this.audio.isPaused = true;
                    }
                }).catch(error => {
                    console.error("Error importing node-audio-control:", error);
                });
            }
            catch (error) {
                console.error("Error pausing audio in Node.js:", error);
            }
        }
    }
    /**
     * Resume audio playback
     */
    resume() {
        if (environment_1.isBrowser) {
            // Browser environment - use HTML5 Audio element
            if (this.audio.audioElement && this.audio.isPlaying && this.audio.isPaused) {
                this.audio.audioElement.play();
                this.audio.isPaused = false;
            }
        }
        else if (environment_1.isNode) {
            // Node.js environment - use node-speaker
            try {
                // Import dynamically to avoid circular dependencies
                Promise.resolve().then(() => __importStar(require('./node-audio-control'))).then(nodeAudio => {
                    const resumed = nodeAudio.resumeAudioPlayback();
                    if (resumed) {
                        this.audio.isPaused = false;
                    }
                }).catch(error => {
                    console.error("Error importing node-audio-control:", error);
                });
            }
            catch (error) {
                console.error("Error resuming audio in Node.js:", error);
            }
        }
    }
    /**
     * Stop audio playback
     */
    stop() {
        if (environment_1.isBrowser) {
            // Browser environment - use HTML5 Audio element
            if (this.audio.audioElement) {
                this.audio.audioElement.pause();
                this.audio.audioElement.currentTime = 0;
                this.audio.isPlaying = false;
                this.audio.isPaused = false;
            }
        }
        else if (environment_1.isNode) {
            // Node.js environment - use node-speaker
            try {
                // Import dynamically to avoid circular dependencies
                Promise.resolve().then(() => __importStar(require('./node-audio-control'))).then(nodeAudio => {
                    const stopped = nodeAudio.stopAudioPlayback();
                    if (stopped) {
                        this.audio.isPlaying = false;
                        this.audio.isPaused = false;
                    }
                }).catch(error => {
                    console.error("Error importing node-audio-control:", error);
                });
            }
            catch (error) {
                console.error("Error stopping audio in Node.js:", error);
            }
        }
    }
    /**
     * Create estimated word timings for non-streaming engines
     * @param text Text to create timings for
     */
    _createEstimatedWordTimings(text) {
        // Extract plain text from SSML if needed
        const plainText = this._isSSML(text) ? this._stripSSML(text) : text;
        // Split into words
        const words = plainText.split(/\s+/).filter((word) => word.length > 0);
        if (!words.length)
            return;
        // Estimate duration (assuming average speaking rate)
        const estimatedDuration = words.length * 0.3; // ~300ms per word
        const wordDuration = estimatedDuration / words.length;
        // Create evenly-spaced word timings
        this.timings = [];
        for (let i = 0; i < words.length; i++) {
            const startTime = i * wordDuration;
            const endTime = (i + 1) * wordDuration;
            this.timings.push([startTime, endTime, words[i]]);
        }
    }
    /**
     * Fire word boundary callbacks based on timing data
     */
    _fireWordBoundaryCallbacks() {
        if (!this.timings.length)
            return;
        // Get all boundary callbacks
        const callbacks = this.callbacks["boundary"] || [];
        if (!callbacks.length)
            return;
        // Fire callbacks for each word
        for (const [start, end, word] of this.timings) {
            for (const callback of callbacks) {
                callback(word, start, end);
            }
        }
    }
    /**
     * Schedule word boundary callbacks based on timing information
     * This is used when we have audio playback but need to schedule callbacks
     */
    _scheduleWordBoundaryCallbacks() {
        if (!this.timings.length)
            return;
        // Get all boundary callbacks
        const callbacks = this.callbacks["boundary"] || [];
        if (!callbacks.length)
            return;
        // Schedule callbacks for each word
        for (const [start, end, word] of this.timings) {
            setTimeout(() => {
                for (const callback of callbacks) {
                    callback(word, start, end);
                }
            }, start * 1000);
        }
    }
    /**
     * Check if text is SSML
     * @param text Text to check
     * @returns True if text is SSML
     */
    _isSSML(text) {
        return SSMLUtils.isSSML(text);
    }
    /**
     * Strip SSML tags from text
     * @param ssml SSML text
     * @returns Plain text without SSML tags
     */
    _stripSSML(ssml) {
        return SSMLUtils.stripSSML(ssml);
    }
    // --- Event system ---
    /**
     * Register a callback for an event
     * @param event Event type
     * @param fn Callback function
     */
    on(event, fn) {
        this.callbacks[event] = this.callbacks[event] || [];
        this.callbacks[event].push(fn);
    }
    /**
     * Emit an event to all registered callbacks
     * @param event Event type
     * @param args Event arguments
     */
    emit(event, ...args) {
        for (const fn of this.callbacks[event] || []) {
            fn(...args);
        }
    }
    /**
     * Start playback with word boundary callbacks
     * @param text Text or SSML to speak
     * @param callback Callback function for word boundaries
     * @param options Synthesis options
     */
    async startPlaybackWithCallbacks(text, callback, options) {
        // Speak the text
        await this.speak(text, options);
        // Use the timings to schedule callbacks
        for (const [start, end, word] of this.timings) {
            setTimeout(() => {
                callback(word, start, end);
            }, start * 1000);
        }
    }
    /**
     * Connect a callback to an event
     * @param event Event name
     * @param callback Callback function
     */
    connect(event, callback) {
        if (event === "onStart") {
            this.on("start", callback);
        }
        else if (event === "onEnd") {
            this.on("end", callback);
        }
    }
    /**
     * Get the value of a property
     * @param propertyName Property name
     * @returns Property value
     */
    getProperty(propertyName) {
        return this.properties[propertyName];
    }
    /**
     * Set a property value
     * @param propertyName Property name
     * @param value Property value
     */
    setProperty(propertyName, value) {
        this.properties[propertyName] = value;
    }
    /**
     * Create a prosody tag with the current properties
     * @param text Text to wrap with prosody
     * @returns Text with prosody tag
     */
    constructProsodyTag(text) {
        const attrs = [];
        if (this.properties.rate) {
            attrs.push(`rate="${this.properties.rate}"`);
        }
        if (this.properties.pitch) {
            attrs.push(`pitch="${this.properties.pitch}"`);
        }
        if (this.properties.volume) {
            attrs.push(`volume="${this.properties.volume}%"`);
        }
        if (attrs.length === 0) {
            return text;
        }
        return `<prosody ${attrs.join(" ")}>${text}</prosody>`;
    }
    /**
     * Check if credentials are valid
     * @returns Promise resolving to true if credentials are valid
     */
    async checkCredentials() {
        try {
            const voices = await this._getVoices();
            return voices.length > 0;
        }
        catch (error) {
            console.error("Error checking credentials:", error);
            return false;
        }
    }
    /**
     * Check if credentials are valid with detailed response
     * @returns Promise resolving to an object with success flag and optional error message
     */
    async checkCredentialsDetailed() {
        try {
            const voices = await this._getVoices();
            return {
                success: voices.length > 0,
                voiceCount: voices.length,
            };
        }
        catch (error) {
            console.error("Error checking credentials:", error);
            return {
                success: false,
                error: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /**
     * Get available voices for a specific language
     * @param language Language code (BCP-47 format, e.g., 'en-US')
     * @returns Promise resolving to an array of available voices for the specified language
     */
    async getVoicesByLanguage(language) {
        // Normalize the input language code
        const normalizedLanguage = language_utils_1.LanguageNormalizer.normalize(language);
        // Get all voices
        const voices = await this.getVoices();
        // Filter voices by language
        return voices.filter((voice) => voice.languageCodes.some((lang) =>
            // Match by BCP-47 code
            lang.bcp47 === normalizedLanguage.bcp47 ||
            // Or by ISO 639-3 code
            lang.iso639_3 === normalizedLanguage.iso639_3));
    }
}
exports.AbstractTTSClient = AbstractTTSClient;
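To implement a new engine against this base class, a subclass must supply the hooks the base class calls: _getVoices() (used by getVoices(), checkCredentials(), and checkCredentialsDetailed()), synthToBytes() (used by speak() and synthToFile()), and synthToBytestream() (used by speakStreamed()). Below is a minimal sketch, not a real engine: it assumes AbstractTTSClient is reachable from the package root, and "FakeTTSClient" with its placeholder audio bytes is purely illustrative.

// Illustrative engine sketch -- "FakeTTSClient" and its audio payload are placeholders.
const { AbstractTTSClient } = require("js-tts-wrapper"); // assumed re-export path

class FakeTTSClient extends AbstractTTSClient {
  // Called by getVoices() and the credential checks; shape matches what
  // getVoices() expects after _mapVoicesToUnified (languageCodes with bcp47).
  async _getVoices() {
    return [{
      id: "fake-voice-1",
      name: "Fake Voice",
      languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }],
    }];
  }

  // Called by speak() and synthToFile(); must resolve to audio bytes.
  async synthToBytes(_text, _options) {
    return new Uint8Array(44); // placeholder bytes, not a real WAV payload
  }

  // Called by speakStreamed(); must resolve to a web ReadableStream plus word
  // boundaries. ReadableStream is global in browsers and Node.js 18+.
  async synthToBytestream(text, options) {
    const bytes = await this.synthToBytes(text, options);
    return {
      audioStream: new ReadableStream({
        start(controller) {
          controller.enqueue(bytes);
          controller.close();
        },
      }),
      wordBoundaries: [], // empty: the base class falls back to estimated timings
    };
  }
}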