UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

207 lines (206 loc) 9.45 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers (compiler output, left untouched).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.EspeakTTSClient = exports.EspeakNodeTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");

// Cached text2wav entry point; stays null until the first successful load.
let text2wav = null;

/**
 * Lazily load the `text2wav` package and cache it for later calls.
 *
 * The module is required at call time rather than at module load, so this
 * file can be imported even when `text2wav` is not installed.
 *
 * @returns {Promise<*>} The loaded module, unwrapped to its `default` export
 *   when one is present.
 * @throws {Error} When not running under Node.js or when `text2wav` cannot be
 *   loaded. The underlying failure is attached as `cause`.
 */
async function loadText2Wav() {
    if (text2wav) {
        return text2wav;
    }
    try {
        // Check if we're in a Node.js environment
        if (typeof process === "undefined" || !process.versions || !process.versions.node) {
            throw new Error("EspeakNodeTTSClient is only supported in Node.js environments");
        }
        // Detect Next.js environment (only used to pick a more helpful error message)
        const isNextJS = process.env.NEXT_RUNTIME || process.env.__NEXT_PRIVATE_ORIGIN;
        try {
            // Deferred require (compiled form of a dynamic import)
            text2wav = await Promise.resolve(`${"text2wav"}`).then(s => __importStar(require(s)));
            // Handle both default and named exports
            if (text2wav.default) {
                text2wav = text2wav.default;
            }
            return text2wav;
        }
        catch (importError) {
            // Fallback for environments where dynamic import might fail
            if (isNextJS) {
                throw new Error("text2wav package not found in Next.js environment. " +
                    "This may be due to Next.js bundling restrictions. " +
                    "Consider using EspeakBrowserTTSClient for browser environments or " +
                    "ensure text2wav is properly installed: npm install text2wav");
            }
            throw importError;
        }
    }
    catch (err) {
        console.error("Error loading text2wav:", err);
        const errorMessage = err instanceof Error ? err.message : String(err);
        // Keep the original error reachable for callers that inspect `cause`.
        throw new Error(`text2wav package not found. ${errorMessage}. Please install it with: npm install text2wav`, { cause: err });
    }
}

/**
 * Clamp a numeric option into [inMin, inMax] and map it linearly onto
 * [outMin, outMax], rounded to the nearest integer.
 *
 * @param {number|string} raw Option value; strings are parsed with `Number.parseFloat`.
 * @param {number} inMin Lower bound of the accepted input range.
 * @param {number} inMax Upper bound of the accepted input range.
 * @param {number} outMin Output value produced for `inMin`.
 * @param {number} outMax Output value produced for `inMax`.
 * @returns {number|undefined} The scaled integer, or `undefined` when `raw`
 *   is not numeric (e.g. a keyword such as "slow"), so callers can fall back
 *   to the engine default instead of forwarding NaN.
 */
function scaleOptionToRange(raw, inMin, inMax, outMin, outMax) {
    const numeric = typeof raw === "string" ? Number.parseFloat(raw) : raw;
    const clamped = Math.max(inMin, Math.min(inMax, numeric));
    const scaled = Math.round(outMin + ((clamped - inMin) * (outMax - outMin)) / (inMax - inMin));
    // Math.min/Math.max propagate NaN, so one check here covers every non-numeric input.
    return Number.isNaN(scaled) ? undefined : scaled;
}

/**
 * eSpeak TTS Client for Node.js environments
 *
 * This client uses the text2wav package for server-side eSpeak TTS synthesis.
 * For browser environments, use EspeakBrowserTTSClient instead.
 */
class EspeakNodeTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * @param credentials Passed through to the base client; eSpeak itself needs none.
     */
    constructor(credentials = {}) {
        super(credentials);
        // Set a default voice for eSpeak TTS
        this.voiceId = "en"; // Default English voice
    }
    /**
     * eSpeak does not require credentials in Node.js
     * @returns Promise resolving to `true`
     */
    async checkCredentials() {
        return true;
    }
    /**
     * Synthesize text to audio bytes (Uint8Array)
     * @param text Text to synthesize
     * @param options Synthesis options (`voice`, `rate`, `pitch` are read)
     * @returns Promise resolving to audio bytes
     * @throws {Error} When loading text2wav or synthesis fails; the underlying
     *   error is attached as `cause`.
     */
    async synthToBytes(text, options) {
        try {
            // Load the text2wav module
            const text2wavModule = await loadText2Wav();
            // Use voice from options or the default voice
            const text2wavOptions = {
                voice: options?.voice || this.voiceId || "en",
            };
            if (options?.rate) {
                // text2wav uses speed in words per minute, default is 175
                // Convert from rate (0.1-10) to WPM (50-400)
                const speed = scaleOptionToRange(options.rate, 0.1, 10, 50, 400);
                if (speed !== undefined) {
                    text2wavOptions.speed = speed;
                }
            }
            if (options?.pitch) {
                // text2wav uses pitch 0-99, default is 50
                // Convert from pitch (0.1-2) to 0-99
                const pitch = scaleOptionToRange(options.pitch, 0.1, 2, 0, 99);
                if (pitch !== undefined) {
                    text2wavOptions.pitch = pitch;
                }
            }
            // Call text2wav to generate audio; its result is returned unchanged
            // (expected to be a Uint8Array).
            return await text2wavModule(text, text2wavOptions);
        }
        catch (err) {
            console.error("eSpeak TTS synthesis error:", err);
            throw new Error(`Failed to synthesize speech with eSpeak: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
        }
    }
    /**
     * Synthesize text to a byte stream (ReadableStream)
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to an object containing the audio stream and an empty word boundaries array.
     */
    async synthToBytestream(text, options) {
        const audioBytes = await this.synthToBytes(text, options);
        // "Fake" streaming by wrapping full audio in a ReadableStream
        const audioStream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        return { audioStream, wordBoundaries: [] };
    }
    // TODO: Add voice/language/rate/pitch options, browser WASM loader, etc.
    /**
     * Return available voices for eSpeak
     * @returns Promise resolving to a fixed list of voice descriptors
     */
    async _getVoices() {
        // eSpeak supports many languages, here's a subset of common ones
        // text2wav uses voice files from espeak-ng-data directory
        const commonVoices = [
            { id: "en", name: "English", language: "English" },
            { id: "en+f3", name: "English (Female 3)", language: "English" },
            { id: "en+m3", name: "English (Male 3)", language: "English" },
            { id: "en+whisper", name: "English (Whisper)", language: "English" },
            { id: "es", name: "Spanish", language: "Spanish" },
            { id: "fr", name: "French", language: "French" },
            { id: "de", name: "German", language: "German" },
            { id: "it", name: "Italian", language: "Italian" },
            { id: "pt", name: "Portuguese", language: "Portuguese" },
            { id: "ru", name: "Russian", language: "Russian" },
            { id: "zh", name: "Chinese (Mandarin)", language: "Chinese" },
            { id: "ja", name: "Japanese", language: "Japanese" },
            { id: "ko", name: "Korean", language: "Korean" },
            { id: "ar", name: "Arabic", language: "Arabic" },
            { id: "hi", name: "Hindi", language: "Hindi" },
            { id: "nl", name: "Dutch", language: "Dutch" },
            { id: "sv", name: "Swedish", language: "Swedish" },
            { id: "da", name: "Danish", language: "Danish" },
            { id: "no", name: "Norwegian", language: "Norwegian" },
            { id: "fi", name: "Finnish", language: "Finnish" },
            { id: "pl", name: "Polish", language: "Polish" },
            { id: "cs", name: "Czech", language: "Czech" },
            { id: "hu", name: "Hungarian", language: "Hungarian" },
            { id: "tr", name: "Turkish", language: "Turkish" },
            { id: "he", name: "Hebrew", language: "Hebrew" },
            { id: "th", name: "Thai", language: "Thai" },
            { id: "vi", name: "Vietnamese", language: "Vietnamese" },
        ];
        const voices = commonVoices.map((voice) => ({
            id: voice.id,
            name: `${voice.name} (eSpeak)`,
            gender: "Unknown", // eSpeak doesn't typically provide gender info
            provider: "espeak-ng",
            languageCodes: [
                {
                    bcp47: voice.id.split("+")[0], // Use the base language code
                    iso639_3: "", // Would need mapping
                    display: voice.language,
                },
            ],
        }));
        return voices;
    }
}
exports.EspeakNodeTTSClient = EspeakNodeTTSClient;
// Alias under the generic name; both exports reference the same class.
exports.EspeakTTSClient = EspeakNodeTTSClient;