UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

517 lines (516 loc) 20.4 kB
"use strict"; /** * SherpaOnnx WebAssembly TTS Client * * This client uses the WebAssembly build of SherpaOnnx for browser environments * where native modules cannot be used. */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.SherpaOnnxWasmTTSClient = void 0; const abstract_tts_1 = require("../core/abstract-tts"); const word_timing_estimator_1 = require("../utils/word-timing-estimator"); const fs = __importStar(require("node:fs")); const path = __importStar(require("node:path")); /** * SherpaOnnx WebAssembly TTS Client * * This client uses the WebAssembly build of SherpaOnnx for browser environments * where native modules cannot be used. */ class SherpaOnnxWasmTTSClient extends abstract_tts_1.AbstractTTSClient { /** * Create a new SherpaOnnx WebAssembly TTS client * @param credentials Optional credentials object */ constructor(credentials = {}) { super(credentials); Object.defineProperty(this, "wasmModule", { enumerable: true, configurable: true, writable: true, value: null }); Object.defineProperty(this, "tts", { enumerable: true, configurable: true, writable: true, value: 0 }); Object.defineProperty(this, "sampleRate", { enumerable: true, configurable: true, writable: true, value: 16000 }); // This property is used in the full implementation // @ts-ignore Object.defineProperty(this, "baseDir", { enumerable: true, configurable: true, writable: true, value: "" }); Object.defineProperty(this, "wasmPath", { enumerable: true, configurable: true, writable: true, value: "" }); Object.defineProperty(this, "wasmLoaded", { enumerable: true, configurable: true, writable: true, value: false }); // Set default base directory for models this.baseDir = credentials.baseDir || this._getDefaultModelsDir(); // Set default WebAssembly path this.wasmPath = credentials.wasmPath || ""; } /** * Get the default models directory * @returns Path to the default models directory */ _getDefaultModelsDir() { // In browser environments, use a relative path if (typeof window !== "undefined") { return "./models"; } // In Node.js, use the home directory const homeDir = process.env.HOME || process.env.USERPROFILE || "."; return path.join(homeDir, ".js-tts-wrapper", "models"); } /** * Check if the credentials are valid * @returns Promise resolving to true if credentials are valid */ async checkCredentials() { try { // In a browser environment, we can't check if the WASM file exists // so we'll just assume it's valid and will be loaded later if (typeof window !== "undefined") { return true; } // In Node.js, check if the WASM file exists if (this.wasmPath && fs.existsSync(this.wasmPath)) { return true; } // If no WASM path is provided, assume it will be loaded later if (!this.wasmPath) { console.warn("No WASM path provided. SherpaOnnx WebAssembly TTS will need to be initialized manually."); return true; } console.warn(`WASM file not found at ${this.wasmPath}`); return false; } catch (error) { console.error("Error checking SherpaOnnx WebAssembly credentials:", error); return false; } } /** * Get available voices * @returns Promise resolving to an array of unified voice objects */ async _getVoices() { try { // Load the voice models JSON file let voiceModels = []; try { // In Node.js, read from the file system if (typeof window === "undefined") { const modelsJsonPath = path.join(__dirname, "..", "data", "merged_models.json"); if (fs.existsSync(modelsJsonPath)) { const modelsJson = fs.readFileSync(modelsJsonPath, "utf-8"); voiceModels = JSON.parse(modelsJson); } } else { // In browser environments, fetch from a URL // This would need to be implemented by the application console.warn("Voice models JSON file not available in browser environment."); // Return a default voice for testing return [{ id: "piper_en_US", name: "Piper English (US)", gender: "Unknown", provider: "sherpaonnx-wasm", languageCodes: [ { bcp47: "en-US", iso639_3: "eng", display: "English (US)" } ] }]; } } catch (error) { console.error("Error loading voice models:", error); } // Filter for SherpaOnnx models and map to unified format const sherpaOnnxModels = voiceModels.filter(model => model.engine === "sherpaonnx" || model.engine === "sherpaonnx-wasm"); return sherpaOnnxModels.map(model => ({ id: model.id, name: model.name, gender: model.gender || "Unknown", provider: "sherpaonnx-wasm", languageCodes: [ { bcp47: model.language || "en-US", iso639_3: model.language ? model.language.split("-")[0] : "eng", display: model.language_display || "English (US)" } ] })); } catch (error) { console.error("Error getting SherpaOnnx WebAssembly voices:", error); return []; } } /** * Initialize the WebAssembly module * @param wasmUrl URL to the WebAssembly file * @returns Promise resolving when the module is initialized */ async initializeWasm(_wasmUrl) { if (this.wasmLoaded) { return; } try { // In browser environments, load the WebAssembly module if (typeof window !== "undefined") { // This would need to be implemented by the application console.warn("WebAssembly loading not implemented for browser environments."); this.wasmLoaded = false; return; } // In Node.js, we can't directly use WebAssembly in the same way console.warn("WebAssembly loading not implemented for Node.js environments."); this.wasmLoaded = false; } catch (error) { console.error("Error initializing WebAssembly:", error); this.wasmLoaded = false; } } /** * Synthesize text to speech and return the audio as a byte array * @param text Text to synthesize * @param options Options for synthesis * @returns Promise resolving to a byte array of audio data */ async synthToBytes(text, _options) { // If WebAssembly is not loaded, return a mock implementation if (!this.wasmLoaded || !this.wasmModule || this.tts === 0) { console.warn("SherpaOnnx WebAssembly TTS is not initialized. Using mock implementation for example."); return this._mockSynthToBytes(); } try { // Allocate memory for the text string const textPtr = this.wasmModule._malloc(text.length + 1); // Write the text string to memory this.wasmModule.stringToUTF8(text, textPtr, text.length + 1); // Generate the audio const result = this.wasmModule._ttsGenerateWithOffline(this.tts, textPtr); // Free the text string memory this.wasmModule._free(textPtr); // Check if generation was successful if (result !== 0) { throw new Error(`Failed to generate audio: ${result}`); } // Get the number of samples const numSamples = this.wasmModule._ttsNumSamplesWithOffline(this.tts); // Allocate memory for the samples const samplesPtr = this.wasmModule._malloc(numSamples * 4); // 4 bytes per float // Get the samples this.wasmModule._ttsGetSamplesWithOffline(this.tts, samplesPtr); // Create a Float32Array view of the samples const samplesView = new Float32Array(this.wasmModule.HEAPF32.buffer, samplesPtr, numSamples); // Copy the samples to a new array const samples = new Float32Array(samplesView); // Free the samples memory this.wasmModule._free(samplesPtr); // Convert the samples to the requested format const audioBytes = this._convertAudioFormat(samples); return audioBytes; } catch (error) { console.error("Error synthesizing text:", error); return this._mockSynthToBytes(); } } /** * Convert audio samples to the requested format * @param samples Float32Array of audio samples * @returns Uint8Array of audio data in the requested format */ _convertAudioFormat(samples) { // For now, we'll just return a WAV file // In a real implementation, we would use a library like audioEncoder // to convert to the requested format // Convert Float32Array to Int16Array const int16Samples = new Int16Array(samples.length); for (let i = 0; i < samples.length; i++) { // Scale to 16-bit range and clamp const sample = Math.max(-1, Math.min(1, samples[i])); int16Samples[i] = Math.floor(sample * 32767); } // Create a WAV file header const wavHeader = new ArrayBuffer(44); const view = new DataView(wavHeader); // "RIFF" chunk descriptor view.setUint8(0, "R".charCodeAt(0)); view.setUint8(1, "I".charCodeAt(0)); view.setUint8(2, "F".charCodeAt(0)); view.setUint8(3, "F".charCodeAt(0)); // Chunk size (file size - 8) view.setUint32(4, 36 + int16Samples.length * 2, true); // Format ("WAVE") view.setUint8(8, "W".charCodeAt(0)); view.setUint8(9, "A".charCodeAt(0)); view.setUint8(10, "V".charCodeAt(0)); view.setUint8(11, "E".charCodeAt(0)); // "fmt " sub-chunk view.setUint8(12, "f".charCodeAt(0)); view.setUint8(13, "m".charCodeAt(0)); view.setUint8(14, "t".charCodeAt(0)); view.setUint8(15, " ".charCodeAt(0)); // Sub-chunk size (16 for PCM) view.setUint32(16, 16, true); // Audio format (1 for PCM) view.setUint16(20, 1, true); // Number of channels (1 for mono) view.setUint16(22, 1, true); // Sample rate view.setUint32(24, this.sampleRate, true); // Byte rate (sample rate * channels * bytes per sample) view.setUint32(28, this.sampleRate * 1 * 2, true); // Block align (channels * bytes per sample) view.setUint16(32, 1 * 2, true); // Bits per sample view.setUint16(34, 16, true); // "data" sub-chunk view.setUint8(36, "d".charCodeAt(0)); view.setUint8(37, "a".charCodeAt(0)); view.setUint8(38, "t".charCodeAt(0)); view.setUint8(39, "a".charCodeAt(0)); // Sub-chunk size (number of samples * channels * bytes per sample) view.setUint32(40, int16Samples.length * 1 * 2, true); // Combine the header and the samples const wavBytes = new Uint8Array(wavHeader.byteLength + int16Samples.length * 2); wavBytes.set(new Uint8Array(wavHeader), 0); // Convert Int16Array to Uint8Array const samplesBytes = new Uint8Array(int16Samples.buffer); wavBytes.set(samplesBytes, wavHeader.byteLength); return wavBytes; } /** * Mock implementation for synthToBytes * @returns Promise resolving to a byte array of audio data */ _mockSynthToBytes() { // Generate a simple sine wave as a placeholder const sampleRate = 16000; const duration = 2; // seconds const numSamples = sampleRate * duration; const samples = new Float32Array(numSamples); // Generate a 440 Hz sine wave for (let i = 0; i < numSamples; i++) { samples[i] = Math.sin(2 * Math.PI * 440 * i / sampleRate) * 0.5; } // Convert to WAV return this._convertAudioFormat(samples); } /** * Synthesize text to speech and stream the audio * @param text Text to synthesize * @param onAudioBuffer Callback for audio buffers * @param onStart Callback for when synthesis starts * @param onEnd Callback for when synthesis ends * @param onWord Callback for word boundary events * @param options Options for synthesis * @returns Promise resolving when synthesis is complete */ async synthToStream(text, onAudioBuffer, onStart, onEnd, onWord, options) { try { // Call onStart callback if (onStart) { onStart(); } // Synthesize the entire audio const audioBytes = await this.synthToBytes(text, options); // Estimate word boundaries if (onWord) { const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text); // Schedule word boundary events for (const boundary of wordBoundaries) { setTimeout(() => { onWord(boundary.word, boundary.start, boundary.end); }, boundary.start * 1000); } } // Send the audio buffer onAudioBuffer(audioBytes); // Call onEnd callback if (onEnd) { onEnd(); } } catch (error) { console.error("Error synthesizing text to stream:", error); // Call onEnd callback even if there's an error if (onEnd) { onEnd(); } } } /** * Synthesize text to speech and save to a file * @param text Text to synthesize * @param filename Filename to save as * @param format Audio format (mp3 or wav) * @param options Options for synthesis * @returns Promise resolving when synthesis is complete */ async synthToFile(text, filename, format = "wav", options) { try { // Synthesize the audio const audioBytes = await this.synthToBytes(text, { ...options, format }); // Check if we're in a browser environment if (typeof window !== "undefined" && typeof document !== "undefined") { // Create blob with appropriate MIME type const mimeType = format === "mp3" ? "audio/mpeg" : "audio/wav"; const blob = new Blob([audioBytes], { type: mimeType }); // Create download link const url = URL.createObjectURL(blob); const a = document.createElement("a"); a.href = url; a.download = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`; // Trigger download document.body.appendChild(a); a.click(); // Clean up setTimeout(() => { if (document && document.body) { document.body.removeChild(a); } URL.revokeObjectURL(url); }, 100); } else { // In Node.js, use the file system if (typeof fs !== "undefined") { const outputPath = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`; fs.writeFileSync(outputPath, Buffer.from(audioBytes)); } else { console.warn("File saving not implemented for this environment."); } } } catch (error) { console.error("Error synthesizing text to file:", error); throw error; } } /** * Get a property value * @param property Property name * @returns Property value */ getProperty(property) { switch (property) { case "voice": return this.voiceId; case "sampleRate": return this.sampleRate; case "wasmLoaded": return this.wasmLoaded; case "wasmPath": return this.wasmPath; default: return super.getProperty(property); } } /** * Set a property value * @param property Property name * @param value Property value */ setProperty(property, value) { switch (property) { case "voice": this.setVoice(value); break; case "wasmPath": this.wasmPath = value; break; default: super.setProperty(property, value); break; } } /** * Clean up resources */ dispose() { if (this.wasmModule && this.tts !== 0) { this.wasmModule._ttsDestroyOffline(this.tts); this.tts = 0; } } /** * Synthesize text to a byte stream * @param text Text to synthesize * @param options Options for synthesis * @returns Promise resolving to a readable stream of audio bytes */ async synthToBytestream(text, options) { // This is a simplified implementation that doesn't actually stream // In a real implementation, you would use a ReadableStream const audioBytes = await this.synthToBytes(text, options); // Create a ReadableStream from the audio bytes return new ReadableStream({ start(controller) { controller.enqueue(audioBytes); controller.close(); } }); } } exports.SherpaOnnxWasmTTSClient = SherpaOnnxWasmTTSClient;