UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

github.com/willwade/js-tts-wrapper

willwade/js-tts-wrapper

556 lines (555 loc) • 19.3 kB

JavaScript

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.AbstractTTSClient = void 0; const builder_1 = require("../ssml/builder"); const language_utils_1 = require("./language-utils"); const SSMLUtils = __importStar(require("./ssml-utils")); /** * Abstract base class for all TTS clients * This provides a unified interface for all TTS providers */ class AbstractTTSClient { /** * Creates a new TTS client * @param credentials Provider-specific credentials */ constructor(credentials) { Object.defineProperty(this, "credentials", { enumerable: true, configurable: true, writable: true, value: credentials }); /** * Currently selected voice ID */ Object.defineProperty(this, "voiceId", { enumerable: true, configurable: true, writable: true, value: null }); /** * Currently selected language */ Object.defineProperty(this, "lang", { enumerable: true, configurable: true, writable: true, value: "en-US" }); /** * Event callbacks */ Object.defineProperty(this, "callbacks", { enumerable: true, configurable: true, writable: true, value: {} }); /** * SSML builder instance */ Object.defineProperty(this, "ssml", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * Audio playback properties */ Object.defineProperty(this, "audio", { enumerable: true, configurable: true, writable: true, value: void 0 }); /** * TTS properties (rate, pitch, volume) */ Object.defineProperty(this, "properties", { enumerable: true, configurable: true, writable: true, value: { volume: 100, rate: "medium", pitch: "medium", } }); /** * Word timings for the current audio */ Object.defineProperty(this, "timings", { enumerable: true, configurable: true, writable: true, value: [] }); /** * Audio sample rate */ Object.defineProperty(this, "audioRate", { enumerable: true, configurable: true, writable: true, value: 24000 }); this.ssml = new builder_1.SSMLBuilder(); this.audio = { isPlaying: false, isPaused: false, audioElement: null, position: 0, duration: 0, }; } /** * Get available voices from the provider with normalized language codes * @returns Promise resolving to an array of unified voice objects */ async getVoices() { // Get raw voices from the engine-specific implementation const rawVoices = await this._getVoices(); // Process and normalize the voices // Each engine should implement _mapVoiceToUnified to convert its raw voice format // to a partially filled UnifiedVoice object const voices = await this._mapVoicesToUnified(rawVoices); // Normalize language codes for all voices return voices.map((voice) => { // Normalize language codes for each language const normalizedLanguageCodes = voice.languageCodes.map((lang) => { const normalized = language_utils_1.LanguageNormalizer.normalize(lang.bcp47); return { bcp47: normalized.bcp47, iso639_3: normalized.iso639_3, display: normalized.display, }; }); // Return the voice with normalized language codes return { ...voice, languageCodes: normalizedLanguageCodes, }; }); } // --- Optional overrides --- /** * Map provider-specific voice objects to unified format * @param rawVoices Array of provider-specific voice objects * @returns Promise resolving to an array of partially unified voice objects */ async _mapVoicesToUnified(rawVoices) { // Default implementation that assumes rawVoices are already in UnifiedVoice format // Engine-specific implementations should override this method return rawVoices; } /** * Speak text using the default audio output * @param text Text or SSML to speak * @param options Synthesis options * @returns Promise resolving when audio playback starts */ async speak(text, options) { // Trigger onStart callback this.emit("start"); // Convert text to audio bytes const audioBytes = await this.synthToBytes(text, options); // Check if we're in a browser environment let url = ""; if (typeof Blob !== "undefined" && typeof URL !== "undefined") { // Create audio blob and URL const blob = new Blob([audioBytes], { type: "audio/wav" }); // default to WAV url = URL.createObjectURL(blob); } // Check if we're in a browser environment if (typeof Audio !== "undefined") { // Create and play audio element const audio = new Audio(url); this.audio.audioElement = audio; this.audio.isPlaying = true; this.audio.isPaused = false; // Set up event handlers audio.onended = () => { this.emit("end"); this.audio.isPlaying = false; URL.revokeObjectURL(url); // Clean up the URL }; } else { // In Node.js environment, we can't play audio // Just emit the end event immediately this.emit("end"); } // Create estimated word timings if needed this._createEstimatedWordTimings(text); // Play the audio if in browser environment if (this.audio.audioElement) { await this.audio.audioElement.play(); } } /** * Speak text using streaming synthesis * @param text Text or SSML to speak * @param options Synthesis options * @returns Promise resolving when audio playback starts */ async speakStreamed(text, options) { // Trigger onStart callback this.emit("start"); try { // Get streaming audio data const streamResult = await this.synthToBytestream(text, options); // Handle both simple stream and stream with word boundaries let audioStream; let wordBoundaries = []; if ("audioStream" in streamResult) { // It's the enhanced version with word boundaries audioStream = streamResult.audioStream; wordBoundaries = streamResult.wordBoundaries; } else { // It's just a simple stream audioStream = streamResult; } const reader = audioStream.getReader(); const chunks = []; // Read all chunks from the stream let result = await reader.read(); while (!result.done) { chunks.push(result.value); result = await reader.read(); } // Combine chunks into a single audio buffer const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0); const audioBytes = new Uint8Array(totalLength); let offset = 0; for (const chunk of chunks) { audioBytes.set(chunk, offset); offset += chunk.length; } // Use actual word boundaries if available, otherwise create estimated ones if (wordBoundaries.length > 0) { // Convert the word boundaries to our internal format this.timings = wordBoundaries.map((wb) => [ wb.offset / 10000, // Convert from 100-nanosecond units to seconds (wb.offset + wb.duration) / 10000, wb.text, ]); } else { // Create estimated word timings this._createEstimatedWordTimings(text); } // Check if we're in a browser environment if (typeof Blob !== "undefined" && typeof URL !== "undefined" && typeof Audio !== "undefined") { // Create audio blob and URL const blob = new Blob([audioBytes], { type: "audio/wav" }); const url = URL.createObjectURL(blob); // Create and play audio element const audio = new Audio(url); this.audio.audioElement = audio; this.audio.isPlaying = true; this.audio.isPaused = false; // Set up event handlers audio.onended = () => { this.emit("end"); this.audio.isPlaying = false; URL.revokeObjectURL(url); }; // Play the audio await audio.play(); } else { // In Node.js environment, just emit events // Fire word boundary events immediately setTimeout(() => { this._fireWordBoundaryCallbacks(); this.emit("end"); }, 100); } } catch (error) { console.error("Error in streaming synthesis:", error); this.emit("end"); // Ensure end event is triggered even on error throw error; } } /** * Synthesize text to audio and save it to a file (browser download) * @param text Text or SSML to synthesize * @param filename Filename to save as * @param format Audio format (mp3 or wav) * @param options Synthesis options */ async synthToFile(text, filename, format = "wav", options) { // Convert text to audio bytes const audioBytes = await this.synthToBytes(text, options); // Create blob with appropriate MIME type const mimeType = format === "mp3" ? "audio/mpeg" : "audio/wav"; const blob = new Blob([audioBytes], { type: mimeType }); // Create download link const url = URL.createObjectURL(blob); const a = document.createElement("a"); a.href = url; a.download = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`; // Trigger download document.body.appendChild(a); a.click(); // Clean up setTimeout(() => { document.body.removeChild(a); URL.revokeObjectURL(url); }, 100); } /** * Set the voice to use for synthesis * @param voiceId Voice ID to use * @param lang Language code (optional) */ setVoice(voiceId, lang) { this.voiceId = voiceId; if (lang) { this.lang = lang; } } // --- Playback control methods --- /** * Pause audio playback */ pause() { if (this.audio.audioElement && this.audio.isPlaying && !this.audio.isPaused) { this.audio.audioElement.pause(); this.audio.isPaused = true; } } /** * Resume audio playback */ resume() { if (this.audio.audioElement && this.audio.isPlaying && this.audio.isPaused) { this.audio.audioElement.play(); this.audio.isPaused = false; } } /** * Stop audio playback */ stop() { if (this.audio.audioElement) { this.audio.audioElement.pause(); this.audio.audioElement.currentTime = 0; this.audio.isPlaying = false; this.audio.isPaused = false; } } /** * Create estimated word timings for non-streaming engines * @param text Text to create timings for */ _createEstimatedWordTimings(text) { // Extract plain text from SSML if needed const plainText = this._isSSML(text) ? this._stripSSML(text) : text; // Split into words const words = plainText.split(/\s+/).filter((word) => word.length > 0); if (!words.length) return; // Estimate duration (assuming average speaking rate) const estimatedDuration = words.length * 0.3; // ~300ms per word const wordDuration = estimatedDuration / words.length; // Create evenly-spaced word timings this.timings = []; for (let i = 0; i < words.length; i++) { const startTime = i * wordDuration; const endTime = (i + 1) * wordDuration; this.timings.push([startTime, endTime, words[i]]); } } /** * Fire word boundary callbacks based on timing data */ _fireWordBoundaryCallbacks() { if (!this.timings.length) return; // Get all boundary callbacks const callbacks = this.callbacks["boundary"] || []; if (!callbacks.length) return; // Fire callbacks for each word for (const [start, end, word] of this.timings) { for (const callback of callbacks) { callback(word, start, end); } } } /** * Check if text is SSML * @param text Text to check * @returns True if text is SSML */ _isSSML(text) { return SSMLUtils.isSSML(text); } /** * Strip SSML tags from text * @param ssml SSML text * @returns Plain text without SSML tags */ _stripSSML(ssml) { return SSMLUtils.stripSSML(ssml); } // --- Event system --- /** * Register a callback for an event * @param event Event type * @param fn Callback function */ on(event, fn) { this.callbacks[event] = this.callbacks[event] || []; this.callbacks[event].push(fn); } /** * Emit an event to all registered callbacks * @param event Event type * @param args Event arguments */ emit(event, ...args) { for (const fn of this.callbacks[event] || []) { fn(...args); } } /** * Start playback with word boundary callbacks * @param text Text or SSML to speak * @param callback Callback function for word boundaries * @param options Synthesis options */ async startPlaybackWithCallbacks(text, callback, options) { // Speak the text await this.speak(text, options); // Use the timings to schedule callbacks for (const [start, end, word] of this.timings) { setTimeout(() => { callback(word, start, end); }, start * 1000); } } /** * Connect a callback to an event * @param event Event name * @param callback Callback function */ connect(event, callback) { if (event === "onStart") { this.on("start", callback); } else if (event === "onEnd") { this.on("end", callback); } } /** * Get the value of a property * @param propertyName Property name * @returns Property value */ getProperty(propertyName) { return this.properties[propertyName]; } /** * Set a property value * @param propertyName Property name * @param value Property value */ setProperty(propertyName, value) { this.properties[propertyName] = value; } /** * Create a prosody tag with the current properties * @param text Text to wrap with prosody * @returns Text with prosody tag */ constructProsodyTag(text) { const attrs = []; if (this.properties.rate) { attrs.push(`rate="${this.properties.rate}"`); } if (this.properties.pitch) { attrs.push(`pitch="${this.properties.pitch}"`); } if (this.properties.volume) { attrs.push(`volume="${this.properties.volume}%"`); } if (attrs.length === 0) { return text; } return `<prosody ${attrs.join(" ")}>${text}</prosody>`; } /** * Check if credentials are valid * @returns Promise resolving to true if credentials are valid */ async checkCredentials() { try { const voices = await this._getVoices(); return voices.length > 0; } catch (error) { console.error("Error checking credentials:", error); return false; } } /** * Get available voices for a specific language * @param language Language code (BCP-47 format, e.g., 'en-US') * @returns Promise resolving to an array of available voices for the specified language */ async getVoicesByLanguage(language) { // Normalize the input language code const normalizedLanguage = language_utils_1.LanguageNormalizer.normalize(language); // Get all voices const voices = await this.getVoices(); // Filter voices by language return voices.filter((voice) => voice.languageCodes.some((lang) => // Match by BCP-47 code lang.bcp47 === normalizedLanguage.bcp47 || // Or by ISO 639-3 code lang.iso639_3 === normalizedLanguage.iso639_3)); } } exports.AbstractTTSClient = AbstractTTSClient;