UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

github.com/willwade/js-tts-wrapper

willwade/js-tts-wrapper

271 lines (270 loc) • 11.5 kB

JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.EspeakWasmTTSClient = exports.EspeakBrowserTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");

/**
 * Detect whether the code is running in a browser environment.
 * @returns {boolean} true when both the `window` and `document` globals are defined
 */
function isBrowser() {
    return typeof window !== "undefined" && typeof document !== "undefined";
}

/**
 * Dynamically import a module at runtime.
 *
 * NOTE(review): the import is issued through `new Function`, presumably so that
 * bundlers cannot statically resolve (and bundle) the specifier. This will not
 * work under a Content-Security-Policy that forbids eval-like constructs, and
 * must only ever be called with trusted, hard-coded specifiers.
 *
 * @param {string} specifier Module specifier to import
 * @returns {Promise<any>} Promise resolving to the module namespace object
 */
function runtimeImport(specifier) {
    return new Function("m", "return import(m)")(specifier);
}

// Removed meSpeak interface - no longer used

/**
 * eSpeak TTS client for browser environments using meSpeak.js
 * This provides eSpeak functionality in browsers and Node.js via WebAssembly
 * For Node.js-only environments with better performance, use EspeakNodeTTSClient instead.
 *
 * Outside a browser every synthesis / voice-listing call is delegated to the
 * Node eSpeak client, which is loaded lazily from "./espeak".
 */
class EspeakBrowserTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * @param credentials Credentials object forwarded to the base class (defaults to `{}`)
     */
    constructor(credentials = {}) {
        super(credentials);
        // Lazily created Node.js client (see _getNodeClient)
        Object.defineProperty(this, "nodeClient", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // The loaded meSpeak module, or null until ensureMeSpeakLoaded() succeeds
        Object.defineProperty(this, "meSpeak", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        // Set to true once the meSpeak module has been imported
        Object.defineProperty(this, "meSpeakReady", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        // Set a default voice for eSpeak TTS
        this.voiceId = "en"; // Default English voice
        // In Node.js environments, we'll lazily load the Node client when needed to avoid bundling it in browsers.
    }

    /**
     * Return the Node.js eSpeak client, importing and constructing it on first use.
     * Shared by synthToBytes() and _getVoices() so the lazy-loading logic lives in one place.
     * @returns Promise resolving to the cached Node client instance
     */
    async _getNodeClient() {
        if (!this.nodeClient) {
            const mod = await runtimeImport("./espeak");
            const EspeakNodeTTSClient = mod.EspeakNodeTTSClient || mod.default;
            this.nodeClient = new EspeakNodeTTSClient(this.credentials);
        }
        return this.nodeClient;
    }

    /**
     * Synthesize text to audio bytes.
     * @param text Text to synthesize
     * @param options Synthesis options; `rate` and `pitch` keywords are mapped to meSpeak numeric values in the browser
     * @returns Promise resolving to a Uint8Array of the audio data produced by the engine
     * @throws Error if meSpeak cannot be loaded or returns no audio data
     */
    async synthToBytes(text, options) {
        // Node.js: delegate to Node client
        if (!isBrowser()) {
            const nodeClient = await this._getNodeClient();
            return await nodeClient.synthToBytes(text, options);
        }
        // Browser: use meSpeak (UMD) with embedded config/voice JSONs
        await this.ensureMeSpeakLoaded();
        const meSpeak = this.meSpeak;
        if (!meSpeak)
            throw new Error("eSpeak-WASM: meSpeak failed to load");
        const voiceId = (this.voiceId || "en").toLowerCase();
        // pick meSpeak voice payload (limited set to keep bundle small)
        const voicePayload = await this.getVoicePayload(voiceId);
        if (!meSpeak.isConfigLoaded()) {
            const { default: configJson } = await runtimeImport("mespeak/src/mespeak_config.json");
            meSpeak.loadConfig(configJson);
        }
        if (voicePayload && !meSpeak.isVoiceLoaded(voicePayload.voice_id)) {
            meSpeak.loadVoice(voicePayload);
            meSpeak.setDefaultVoice(voicePayload.voice_id);
        }
        // Map SpeakOptions rate/pitch to meSpeak numeric speed/pitch
        const rateToSpeed = {
            "x-slow": 80,
            slow: 120,
            medium: 175,
            fast: 220,
            "x-fast": 300,
        };
        const pitchMap = {
            "x-low": 10,
            low: 25,
            medium: 50,
            high: 70,
            "x-high": 90,
        };
        // Unknown keywords fall back to the "medium" values
        const speed = rateToSpeed[(options?.rate || "medium")] ?? 175;
        const pitch = pitchMap[(options?.pitch || "medium")] ?? 50;
        // get raw WAV buffer from meSpeak
        const arr = meSpeak.speak(text, {
            rawdata: "array",
            // If the payload could not be loaded, pass the requested id through unchanged
            voice: voicePayload?.voice_id || voiceId,
            speed,
            pitch,
        });
        if (!arr || !arr.length)
            throw new Error("eSpeak-WASM: synthesis failed");
        return new Uint8Array(arr);
    }

    /**
     * Import the optional `mespeak` package once and cache it on the instance.
     * @throws Error (with the underlying failure attached as `cause`) when the package cannot be imported
     */
    async ensureMeSpeakLoaded() {
        if (this.meSpeakReady)
            return;
        try {
            // mespeak is optional and should only be resolved when this engine is actually used.
            const mod = await runtimeImport("mespeak");
            this.meSpeak = (mod && (mod.default || mod));
        }
        catch (error) {
            // Keep the original error reachable via `cause` so the real import failure is not lost
            throw new Error(`eSpeak-WASM requires the optional 'mespeak' package at runtime. ${error instanceof Error ? error.message : String(error)}`, { cause: error });
        }
        this.meSpeakReady = true;
    }

    /**
     * Load the meSpeak voice JSON for a voice id.
     * Only a small curated set of English voices is supported; any other id
     * falls back to the plain English voice.
     * @param voiceId Lower-cased voice id (e.g. "en", "en-us")
     * @returns Promise resolving to the voice payload, or null if it could not be imported
     */
    async getVoicePayload(voiceId) {
        // Load a small curated set of English voices inline to avoid URL/CORS
        const bundledVoices = new Set(["en", "en-us", "en-rp", "en-sc", "en-wm"]);
        // Fallback to plain English if requested voice not bundled
        const resolvedId = bundledVoices.has(voiceId) ? voiceId : "en";
        try {
            const { default: payload } = await runtimeImport(`mespeak/voices/en/${resolvedId}.json`);
            return payload;
        }
        catch {
            // Deliberate best-effort: the caller handles a null payload by passing the raw voice id to meSpeak
            return null;
        }
    }

    /**
     * Synthesize text to a byte stream (ReadableStream)
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to an object containing the audio stream and the word boundaries
     *          (estimated when `options.useWordBoundary` is set, otherwise an empty array).
     */
    async synthToBytestream(text, options) {
        const audioBytes = await this.synthToBytes(text, options);
        // Generate word boundaries if requested
        let wordBoundaries = [];
        if (options?.useWordBoundary) {
            // Create estimated word timings and store them
            this._createEstimatedWordTimings(text);
            // Convert internal timings to word boundary format
            // NOTE(review): the original comment says the x10000 scaling converts to
            // 100-nanosecond units; the unit of this.timings is defined by the base class - confirm.
            wordBoundaries = this.timings.map(([start, end, word]) => ({
                text: word,
                offset: Math.round(start * 10000),
                duration: Math.round((end - start) * 10000),
            }));
        }
        // "Fake" streaming by wrapping full audio in a ReadableStream
        const audioStream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        return { audioStream, wordBoundaries };
    }

    /**
     * Return available voices for eSpeak WASM
     *
     * In Node.js the list comes from the Node eSpeak client with "(eSpeak)" in
     * each name replaced by "(eSpeak WASM)". In the browser a fixed list is returned.
     *
     * NOTE(review): the browser list advertises many non-English ids, but
     * getVoicePayload() only loads en / en-us / en-rp / en-sc / en-wm and falls
     * back to "en" for everything else; "en-wm" is loadable but not listed here.
     */
    async _getVoices() {
        // For Node.js environments, delegate to the regular eSpeak client (lazy loaded)
        if (!isBrowser()) {
            const nodeClient = await this._getNodeClient();
            const nodeVoices = await nodeClient._getVoices();
            // Rename them to indicate they're from eSpeak WASM (but actually using Node.js fallback)
            return nodeVoices.map((voice) => ({
                ...voice,
                name: voice.name.replace("(eSpeak)", "(eSpeak WASM)"),
            }));
        }
        // meSpeak supports many languages, here's a subset of common ones
        const commonVoices = [
            { id: "en", name: "English", language: "English" },
            { id: "en-us", name: "English (US)", language: "English" },
            { id: "en-rp", name: "English (RP)", language: "English" },
            { id: "en-sc", name: "English (Scottish)", language: "English" },
            { id: "es", name: "Spanish", language: "Spanish" },
            { id: "es-la", name: "Spanish (Latin America)", language: "Spanish" },
            { id: "fr", name: "French", language: "French" },
            { id: "de", name: "German", language: "German" },
            { id: "it", name: "Italian", language: "Italian" },
            { id: "pt", name: "Portuguese (Brazil)", language: "Portuguese" },
            { id: "pt-pt", name: "Portuguese (European)", language: "Portuguese" },
            { id: "ru", name: "Russian", language: "Russian" },
            { id: "zh", name: "Chinese (Mandarin)", language: "Chinese" },
            { id: "zh-yue", name: "Chinese (Cantonese)", language: "Chinese" },
            { id: "ja", name: "Japanese", language: "Japanese" },
            { id: "ko", name: "Korean", language: "Korean" },
            { id: "ar", name: "Arabic", language: "Arabic" },
            { id: "hi", name: "Hindi", language: "Hindi" },
            { id: "nl", name: "Dutch", language: "Dutch" },
            { id: "sv", name: "Swedish", language: "Swedish" },
            { id: "da", name: "Danish", language: "Danish" },
            { id: "no", name: "Norwegian", language: "Norwegian" },
            { id: "fi", name: "Finnish", language: "Finnish" },
            { id: "pl", name: "Polish", language: "Polish" },
            { id: "cs", name: "Czech", language: "Czech" },
            { id: "hu", name: "Hungarian", language: "Hungarian" },
            { id: "tr", name: "Turkish", language: "Turkish" },
            { id: "he", name: "Hebrew", language: "Hebrew" },
            { id: "th", name: "Thai", language: "Thai" },
            { id: "vi", name: "Vietnamese", language: "Vietnamese" },
        ];
        const voices = commonVoices.map((voice) => ({
            id: voice.id,
            name: `${voice.name} (eSpeak WASM)`,
            gender: "Unknown", // meSpeak doesn't typically provide gender info
            provider: "espeak-ng",
            languageCodes: [
                {
                    bcp47: voice.id.split("-")[0], // Use the base language code
                    iso639_3: "", // Would need mapping
                    display: voice.language,
                },
            ],
        }));
        return voices;
    }

    /**
     * Get the list of required credential types for this engine
     * @returns Array of required credential field names
     */
    getRequiredCredentials() {
        return []; // eSpeak doesn't require any credentials
    }

    /**
     * Check if credentials are valid (eSpeak doesn't need credentials)
     * @returns Promise resolving to true unconditionally
     */
    async checkCredentials() {
        // eSpeak doesn't need credentials and we have fallbacks for both environments
        return true;
    }

    /**
     * Get detailed credential validation info
     * @returns Promise resolving to an always-valid result describing the detected environment and engine
     */
    async checkCredentialsAdvanced() {
        return {
            valid: true,
            message: "eSpeak WASM is available with environment-specific fallbacks",
            details: {
                environment: isBrowser() ? "browser" : "node",
                engine: isBrowser() ? "meSpeak" : "text2wav",
                note: "Credentials not required for eSpeak",
            },
        };
    }
}
exports.EspeakBrowserTTSClient = EspeakBrowserTTSClient;
exports.EspeakWasmTTSClient = EspeakBrowserTTSClient;