js-tts-wrapper
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
1,111 lines • 50.6 kB
JavaScript
"use strict";
/**
* SherpaOnnx WebAssembly TTS Client
*
* Enhanced version with multi-model support for browser environments.
* Supports dynamic loading of Kokoro, Matcha, and VITS models.
*
* BACKWARD COMPATIBILITY: Maintains full compatibility with existing API.
* New multi-model features are opt-in via constructor options.
*/
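// Usage sketch (illustrative, not part of this module): constructing the
// client in legacy mode and in opt-in multi-model mode. The import path
// assumes the package root re-exports this class, and the wasmPath URL is a
// placeholder for wherever the sherpa-onnx WASM assets are hosted.
//
//   const { SherpaOnnxWasmTTSClient } = require("js-tts-wrapper");
//   // Legacy single-model mode (default):
//   const legacy = new SherpaOnnxWasmTTSClient({ wasmPath: "./sherpa-onnx.js" });
//   // Multi-model mode, keeping at most two models in WASM memory:
//   const multi = new SherpaOnnxWasmTTSClient(
//     { wasmPath: "./sherpa-onnx.js" },
//     { enableMultiModel: true, maxCachedModels: 2 }
//   );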
Object.defineProperty(exports, "__esModule", { value: true });
exports.SherpaOnnxWasmTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const environment_1 = require("../utils/environment");
const word_timing_estimator_1 = require("../utils/word-timing-estimator");
/**
* Enhanced SherpaOnnx WebAssembly TTS Client
*
* Supports both legacy single-model mode and new multi-model mode.
* Maintains full backward compatibility with existing API.
*/
class SherpaOnnxWasmTTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new SherpaOnnx WebAssembly TTS client
* @param credentials Optional credentials object
* @param enhancedOptions Optional enhanced options for multi-model support
*/
constructor(credentials = {}, enhancedOptions = {}) {
super(credentials);
Object.defineProperty(this, "wasmModule", {
enumerable: true,
configurable: true,
writable: true,
value: null
});
Object.defineProperty(this, "tts", {
enumerable: true,
configurable: true,
writable: true,
value: null
});
Object.defineProperty(this, "wasmPath", {
enumerable: true,
configurable: true,
writable: true,
value: ""
});
Object.defineProperty(this, "wasmLoaded", {
enumerable: true,
configurable: true,
writable: true,
value: false
});
// Enhanced multi-model support
Object.defineProperty(this, "enhancedOptions", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "modelRepository", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "modelManager", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "currentVoiceId", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
// Set default sample rate for the Piper model
this.sampleRate = 22050;
// Note: baseDir from credentials is accepted for backward compatibility but not used
// Set default WebAssembly path
this.wasmPath = credentials.wasmPath || "";
// Enhanced options with defaults for backward compatibility
this.enhancedOptions = {
enableMultiModel: false, // Disabled by default for backward compatibility
maxCachedModels: 3,
...enhancedOptions
};
// Initialize multi-model components if enabled
if (this.enhancedOptions.enableMultiModel) {
this.modelRepository = new ModelRepository();
}
}
/**
* Check if the credentials are valid
* @returns Promise resolving to true if credentials are valid
*/
async checkCredentials() {
try {
// In a browser environment, we can't check if the WASM file exists
// so we'll just assume it's valid and will be loaded later
if (typeof window !== "undefined") {
return true;
}
// In Node.js, check if the WASM file exists
if (environment_1.isNode && this.wasmPath && environment_1.fileSystem.existsSync(this.wasmPath)) {
return true;
}
// If no WASM path is provided, assume it will be loaded later
if (!this.wasmPath) {
console.warn("No WASM path provided. SherpaOnnx WebAssembly TTS will need to be initialized manually.");
return true;
}
console.warn(`WASM file not found at ${this.wasmPath}`);
return false;
}
catch (error) {
console.error("Error checking SherpaOnnx WebAssembly credentials:", error);
return false;
}
}
/**
* Get available voices
* @returns Promise resolving to an array of unified voice objects
*/
async _getVoices() {
try {
// Enhanced multi-model support
if (this.enhancedOptions.enableMultiModel && this.modelRepository) {
console.log("Using enhanced multi-model voice repository");
try {
const models = this.modelRepository.getAvailableModels();
return models.map((model) => ({
id: model.id,
name: model.name,
gender: model.gender,
provider: "sherpaonnx-wasm",
languageCodes: [
{
bcp47: model.language,
iso639_3: model.language.split("-")[0],
display: model.language,
},
],
}));
}
catch (error) {
console.error("Error getting voices from enhanced repository:", error);
// Fall through to legacy mode
}
}
// Legacy voice loading (backward compatibility)
console.log("Using legacy voice loading mode");
// Load the voice models JSON file
let voiceModels = [];
try {
// In Node.js, read from the file system
if (environment_1.isNode) {
const modelsJsonPath = environment_1.pathUtils.join(__dirname, "..", "data", "merged_models.json");
if (environment_1.fileSystem.existsSync(modelsJsonPath)) {
const modelsJson = environment_1.fileSystem.readFileSync(modelsJsonPath);
const parsedModels = JSON.parse(modelsJson);
// merged_models.json may ship as an object keyed by model id (see
// ModelRepository.loadModelsIndex below), so normalize to an array.
voiceModels = Array.isArray(parsedModels) ? parsedModels : Object.values(parsedModels);
}
}
else {
// In browser environments, try to fetch from a URL
try {
const response = await fetch("./data/merged_models.json");
if (response.ok) {
const modelsJson = await response.text();
const parsedModels = JSON.parse(modelsJson);
// Normalize: the JSON may be an array or an object keyed by model id.
voiceModels = Array.isArray(parsedModels) ? parsedModels : Object.values(parsedModels);
}
else {
console.warn("Voice models JSON file not available in browser environment.");
// Return a default voice for testing
return [
{
id: "piper_en_US",
name: "Piper English (US)",
gender: "Unknown",
provider: "sherpaonnx-wasm",
languageCodes: [
{
bcp47: "en-US",
iso639_3: "eng",
display: "English (US)",
},
],
},
];
}
}
catch (fetchError) {
console.warn("Failed to fetch voice models JSON file:", fetchError);
// Return a default voice for testing
return [
{
id: "piper_en_US",
name: "Piper English (US)",
gender: "Unknown",
provider: "sherpaonnx-wasm",
languageCodes: [
{
bcp47: "en-US",
iso639_3: "eng",
display: "English (US)",
},
],
},
];
}
}
}
catch (error) {
console.error("Error loading voice models:", error);
}
// Filter for SherpaOnnx models and map to unified format
const sherpaOnnxModels = voiceModels.filter((model) => model.engine === "sherpaonnx" || model.engine === "sherpaonnx-wasm");
console.log("Found SherpaOnnx models:", sherpaOnnxModels);
const voices = sherpaOnnxModels.map((model) => ({
id: model.id,
name: model.name,
gender: model.gender || "Unknown",
provider: "sherpaonnx-wasm",
languageCodes: [
{
bcp47: model.language || "en-US",
iso639_3: model.language ? model.language.split("-")[0] : "eng",
display: model.language_display || "English (US)",
},
],
}));
// If no voices found, return a default voice for backward compatibility
if (voices.length === 0) {
return [
{
id: "piper_en_US",
name: "Piper English (US)",
gender: "Unknown",
provider: "sherpaonnx-wasm",
languageCodes: [
{
bcp47: "en-US",
iso639_3: "eng",
display: "English (US)",
},
],
},
];
}
return voices;
}
catch (error) {
console.error("Error getting SherpaOnnx WebAssembly voices:", error);
return [];
}
}
/**
* Initialize the WebAssembly module
* @param wasmUrl URL to the WebAssembly file
* @returns Promise resolving when the module is initialized
*/
async initializeWasm(wasmUrl) {
if (this.wasmLoaded) {
return;
}
try {
// In browser environments, load the WebAssembly module
if (environment_1.isBrowser) {
if (!wasmUrl) {
console.warn("No WebAssembly URL provided for browser environment.");
this.wasmLoaded = false;
return;
}
console.log("Loading WebAssembly module from", wasmUrl);
console.log(`Current state: wasmLoaded=${this.wasmLoaded}, wasmModule=${!!this.wasmModule}`);
try {
// Store the URL for later use
this.wasmPath = wasmUrl;
console.log("Setting wasmPath to:", this.wasmPath);
// We don't need to load the scripts here, as they should already be loaded in the HTML file
console.log("Checking if createOfflineTts is already available:", typeof window.createOfflineTts === "function");
// Wait for the createOfflineTts function to be available
await new Promise((resolve) => {
const checkCreateOfflineTts = () => {
if (typeof window.createOfflineTts === "function" &&
typeof window.Module !== "undefined" &&
window.Module.calledRun) {
console.log("createOfflineTts and Module are available and initialized");
resolve();
}
else {
console.log("Waiting for createOfflineTts and Module to be available and initialized...");
console.log("createOfflineTts available:", typeof window.createOfflineTts === "function");
console.log("Module available:", typeof window.Module !== "undefined");
console.log("Module.calledRun:", window.Module?.calledRun);
setTimeout(checkCreateOfflineTts, 500);
}
};
checkCreateOfflineTts();
});
// Now that we know createOfflineTts and Module are available, store them
console.log("Storing Module and createOfflineTts");
this.wasmModule = window.Module;
this.wasmLoaded = true;
// Store the createOfflineTts function
if (this.wasmModule && !this.wasmModule.createOfflineTts) {
this.wasmModule.createOfflineTts = window.createOfflineTts;
}
// Initialize multi-model support if enabled
if (this.enhancedOptions.enableMultiModel && this.modelRepository) {
console.log("Initializing enhanced multi-model support...");
try {
// Load models index
await this.modelRepository.loadModelsIndex();
// Initialize model manager
if (this.wasmModule) {
this.modelManager = new WasmModelManager(this.wasmModule, this.enhancedOptions.maxCachedModels);
}
console.log("Enhanced multi-model support initialized successfully");
}
catch (error) {
console.error("Error initializing multi-model support:", error);
console.log("Falling back to legacy single-model mode");
this.enhancedOptions.enableMultiModel = false;
}
}
console.log("WebAssembly module initialized successfully");
}
catch (error) {
console.error("Error initializing WebAssembly:", error);
this.wasmLoaded = false;
}
}
else {
// In Node.js, we can't directly use WebAssembly in the same way
console.warn("WebAssembly loading not implemented for Node.js environments.");
this.wasmLoaded = false;
}
}
catch (error) {
console.error("Error initializing WebAssembly:", error);
this.wasmLoaded = false;
}
console.log("End of initializeWasm method. wasmLoaded:", this.wasmLoaded, "wasmModule:", !!this.wasmModule);
console.log("createOfflineTts available at end of initializeWasm:", typeof window.createOfflineTts === "function");
console.log("window.Module available at end of initializeWasm:", typeof window.Module !== "undefined");
if (typeof window.Module !== "undefined") {
console.log("window.Module.calledRun at end of initializeWasm:", window.Module.calledRun);
}
}
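// Example (browser; sketch only): the sherpa-onnx glue script must already
// be on the page so that window.createOfflineTts and window.Module exist;
// initializeWasm only waits for them and records the URL. The script name
// below is illustrative, not a shipped default.
//
//   <script src="sherpa-onnx-wasm-main.js"></script>   (in the host HTML)
//
//   const client = new SherpaOnnxWasmTTSClient({});
//   await client.initializeWasm("./sherpa-onnx-wasm-main.js");
//   console.log(client.getProperty("wasmLoaded")); // true once Module has run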
/**
* Synthesize text to speech and return the audio as a byte array
* @param text Text to synthesize
* @param options Options for synthesis
* @returns Promise resolving to a byte array of audio data
*/
async synthToBytes(text, _options) {
console.log("synthToBytes called with text:", text);
// Enhanced multi-model synthesis
if (this.enhancedOptions.enableMultiModel && this.wasmModule && this.currentVoiceId) {
console.log(`Using enhanced multi-model synthesis for voice ${this.currentVoiceId}`);
try {
if (!this.wasmModule._GenerateAudio) {
throw new Error('Enhanced WASM module not loaded - _GenerateAudio not available');
}
// Generate audio using the enhanced WASM interface
const result = this.wasmModule._GenerateAudio(text, 0, 1.0); // text, speaker_id, speed
if (!result || !result.samples) {
throw new Error('Failed to generate audio with enhanced interface');
}
console.log(`Enhanced synthesis generated ${result.samples.length} samples at ${result.sampleRate}Hz`);
// Update sample rate if provided
if (result.sampleRate) {
this.sampleRate = result.sampleRate;
}
// Convert to WAV format
return this._convertAudioFormat(result.samples);
}
catch (error) {
console.error('Error with enhanced multi-model synthesis:', error);
console.log('Falling back to legacy synthesis mode');
// Fall through to legacy mode
}
}
// Legacy synthesis mode (backward compatibility)
console.log("Using legacy synthesis mode");
// IMPORTANT: resolve the global object explicitly. When this code is
// bundled, the bare `window` identifier may not be reachable in the same
// way, so fall back to `global` (Node.js) or an empty object.
const globalWindow = typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : {};
console.log("Global window type:", typeof globalWindow);
// Check if we're in a browser environment
if (typeof globalWindow !== "undefined" && typeof document !== "undefined") {
console.log("Browser environment detected");
// Check if createOfflineTts is available in the global scope
const createOfflineTtsFn = globalWindow.createOfflineTts;
const moduleObj = globalWindow.Module;
console.log("createOfflineTts available in global scope:", typeof createOfflineTtsFn === "function");
console.log("Module available in global scope:", typeof moduleObj !== "undefined");
console.log("Module.calledRun:", moduleObj?.calledRun);
// Try to use the global createOfflineTts function directly
if (typeof createOfflineTtsFn === "function" &&
typeof moduleObj !== "undefined" &&
moduleObj.calledRun) {
console.log("Using global createOfflineTts function directly");
try {
// Create a new TTS instance directly
console.log("About to call createOfflineTts...");
const directTts = createOfflineTtsFn(moduleObj);
console.log("createOfflineTts call successful, tts object:", directTts);
console.log("TTS initialized with default configuration");
console.log(`Sample rate: ${directTts?.sampleRate}`);
console.log(`Number of speakers: ${directTts?.numSpeakers}`);
// Update the sample rate from the TTS engine
if (directTts && typeof directTts.sampleRate === "number") {
this.sampleRate = directTts.sampleRate;
console.log(`Updated sample rate to ${this.sampleRate}`);
}
else {
console.warn("Could not update sample rate, using default");
}
// Generate audio
console.log("Generating audio directly...");
const result = directTts.generate({ text, sid: 0, speed: 1.0 });
console.log("Audio generated directly:", result);
console.log(`Generated ${result?.samples?.length} samples at ${result?.sampleRate}Hz`);
// Convert to WAV
const audioBytes = this._convertAudioFormat(result.samples);
console.log("Converted audio to WAV format, returning bytes");
return audioBytes;
}
catch (directError) {
console.error("Error using direct approach:", directError);
console.log("Falling back to standard approach");
}
}
else {
console.log("Direct approach not available, reason:");
if (typeof createOfflineTtsFn !== "function")
console.log("- createOfflineTts is not a function");
if (typeof moduleObj === "undefined")
console.log("- Module is undefined");
if (moduleObj && !moduleObj.calledRun)
console.log("- Module.calledRun is false");
}
}
else {
console.log("Not in a browser environment, skipping direct approach");
}
// If direct approach failed or not available, try the standard approach
console.log("Using standard approach");
console.log("Current state - wasmLoaded:", this.wasmLoaded, "wasmModule:", !!this.wasmModule);
console.log("createOfflineTts available:", typeof globalWindow.createOfflineTts === "function");
// If WebAssembly is not loaded or createOfflineTts is not available, return a mock implementation
if (!this.wasmLoaded ||
!this.wasmModule ||
typeof globalWindow.createOfflineTts !== "function") {
console.warn("SherpaOnnx WebAssembly TTS is not initialized. Using mock implementation for example.");
console.warn("Reason for fallback:");
if (!this.wasmLoaded)
console.warn("- wasmLoaded is false");
if (!this.wasmModule)
console.warn("- wasmModule is null");
if (typeof globalWindow.createOfflineTts !== "function")
console.warn("- createOfflineTts is not a function");
return this._mockSynthToBytes();
}
try {
// Use the SherpaOnnx WebAssembly API to generate audio
console.log("Using SherpaOnnx WebAssembly to generate audio");
// Create a TTS instance if it doesn't exist
if (!this.tts) {
console.log("Creating TTS instance");
try {
// Create the TTS instance
if (typeof window.createOfflineTts === "function") {
// Using the sherpa-onnx-tts.js API
console.log("Using createOfflineTts API from global scope");
console.log("createOfflineTts:", window.createOfflineTts);
console.log("Module:", window.Module);
try {
// Create the TTS instance with default configuration
console.log("About to call createOfflineTts...");
this.tts = window.createOfflineTts(window.Module);
console.log("createOfflineTts call successful, tts object:", this.tts);
console.log("TTS initialized with default configuration");
console.log(`Sample rate: ${this.tts?.sampleRate}`);
console.log(`Number of speakers: ${this.tts?.numSpeakers}`);
// Update the sample rate from the TTS engine
if (this.tts && typeof this.tts.sampleRate === "number") {
this.sampleRate = this.tts.sampleRate;
console.log(`Updated sample rate to ${this.sampleRate}`);
}
else {
console.warn("Could not update sample rate, using default");
}
}
catch (error) {
console.error("Error creating TTS instance with createOfflineTts:", error);
throw error;
}
}
else if (this.wasmModule?.OfflineTts) {
// Using the Module.OfflineTts API
console.log("Using Module.OfflineTts API");
this.tts = new this.wasmModule.OfflineTts();
}
else {
throw new Error("No compatible TTS API found");
}
console.log("TTS instance created successfully");
}
catch (error) {
console.error("Error creating TTS instance:", error);
console.warn("Falling back to mock implementation");
return this._mockSynthToBytes();
}
}
// Generate the audio
console.log("Generating audio for text:", text);
let samples;
if (typeof this.tts.generate === "function") {
// Using the generate method from sherpa-onnx-tts.js
console.log("Using generate method");
console.log("this.tts.generate:", this.tts.generate);
try {
console.log("Calling generate with:", { text, sid: 0, speed: 1.0 });
const result = this.tts.generate({ text, sid: 0, speed: 1.0 });
console.log("Generate call successful, result:", result);
samples = result.samples;
console.log(`Generated audio with sample rate: ${result.sampleRate} and samples: ${samples.length}`);
}
catch (error) {
console.error("Error calling generate:", error);
throw error;
}
}
else if (typeof this.tts.generateWithText === "function") {
// Using the generateWithText method
console.log("Using generateWithText method");
console.log("this.tts.generateWithText:", this.tts.generateWithText);
try {
console.log("Calling generateWithText with:", text);
samples = this.tts.generateWithText(text);
console.log(`Generated audio with samples: ${samples.length}`);
}
catch (error) {
console.error("Error calling generateWithText:", error);
throw error;
}
}
else {
console.error("No compatible generate method found");
console.log("Available methods on this.tts:", Object.keys(this.tts).filter((key) => typeof this.tts[key] === "function"));
throw new Error("No compatible generate method found");
}
console.log("Audio generated successfully, samples:", samples.length);
// Convert the samples to the requested format
const audioBytes = this._convertAudioFormat(samples);
return audioBytes;
}
catch (error) {
console.error("Error synthesizing text:", error);
console.warn("Falling back to mock implementation");
return this._mockSynthToBytes();
}
}
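// Example: synthToBytes resolves to a complete WAV file as a Uint8Array,
// which can be played in the browser with standard Web APIs (sketch only,
// inside an async function):
//
//   const wavBytes = await client.synthToBytes("Hello from SherpaOnnx");
//   const url = URL.createObjectURL(new Blob([wavBytes], { type: "audio/wav" }));
//   new Audio(url).play();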
/**
* Convert audio samples to the requested format
* @param samples Float32Array of audio samples
* @returns Uint8Array of audio data in the requested format
*/
_convertAudioFormat(samples) {
// For now this always produces a 16-bit mono PCM WAV file; a real
// implementation would use an audio encoder library to honor other
// requested formats.
// Convert Float32Array to Int16Array
const int16Samples = new Int16Array(samples.length);
for (let i = 0; i < samples.length; i++) {
// Scale to 16-bit range and clamp
const sample = Math.max(-1, Math.min(1, samples[i]));
int16Samples[i] = Math.floor(sample * 32767);
}
// Create a WAV file header
const wavHeader = new ArrayBuffer(44);
const view = new DataView(wavHeader);
// "RIFF" chunk descriptor
view.setUint8(0, "R".charCodeAt(0));
view.setUint8(1, "I".charCodeAt(0));
view.setUint8(2, "F".charCodeAt(0));
view.setUint8(3, "F".charCodeAt(0));
// Chunk size (file size - 8)
view.setUint32(4, 36 + int16Samples.length * 2, true);
// Format ("WAVE")
view.setUint8(8, "W".charCodeAt(0));
view.setUint8(9, "A".charCodeAt(0));
view.setUint8(10, "V".charCodeAt(0));
view.setUint8(11, "E".charCodeAt(0));
// "fmt " sub-chunk
view.setUint8(12, "f".charCodeAt(0));
view.setUint8(13, "m".charCodeAt(0));
view.setUint8(14, "t".charCodeAt(0));
view.setUint8(15, " ".charCodeAt(0));
// Sub-chunk size (16 for PCM)
view.setUint32(16, 16, true);
// Audio format (1 for PCM)
view.setUint16(20, 1, true);
// Number of channels (1 for mono)
view.setUint16(22, 1, true);
// Sample rate
view.setUint32(24, this.sampleRate, true);
// Byte rate (sample rate * channels * bytes per sample)
view.setUint32(28, this.sampleRate * 1 * 2, true);
// Block align (channels * bytes per sample)
view.setUint16(32, 1 * 2, true);
// Bits per sample
view.setUint16(34, 16, true);
// "data" sub-chunk
view.setUint8(36, "d".charCodeAt(0));
view.setUint8(37, "a".charCodeAt(0));
view.setUint8(38, "t".charCodeAt(0));
view.setUint8(39, "a".charCodeAt(0));
// Sub-chunk size (number of samples * channels * bytes per sample)
view.setUint32(40, int16Samples.length * 1 * 2, true);
// Combine the header and the samples
const wavBytes = new Uint8Array(wavHeader.byteLength + int16Samples.length * 2);
wavBytes.set(new Uint8Array(wavHeader), 0);
// Convert Int16Array to Uint8Array
const samplesBytes = new Uint8Array(int16Samples.buffer);
wavBytes.set(samplesBytes, wavHeader.byteLength);
return wavBytes;
}
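// The 44-byte header written above follows the canonical PCM WAV layout:
//
//   offset  size  field
//   0       4     "RIFF"
//   4       4     chunk size = 36 + dataSize
//   8       4     "WAVE"
//   12      4     "fmt "
//   16      4     fmt chunk size (16 for PCM)
//   20      2     audio format (1 = PCM)
//   22      2     channels (1 = mono)
//   24      4     sample rate
//   28      4     byte rate = sampleRate * channels * 2
//   32      2     block align = channels * 2
//   34      2     bits per sample (16)
//   36      4     "data"
//   40      4     dataSize = numSamples * channels * 2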
/**
* Mock implementation for synthToBytes
* @returns Uint8Array of WAV audio data (a 440 Hz sine-wave placeholder)
*/
_mockSynthToBytes() {
// Generate a simple sine wave as a placeholder
const sampleRate = this.sampleRate;
const duration = 2; // seconds
const numSamples = sampleRate * duration;
const samples = new Float32Array(numSamples);
// Generate a 440 Hz sine wave
for (let i = 0; i < numSamples; i++) {
samples[i] = Math.sin((2 * Math.PI * 440 * i) / sampleRate) * 0.5;
}
// Convert to WAV
return this._convertAudioFormat(samples);
}
/**
* Synthesize text to speech and stream the audio
* @param text Text to synthesize
* @param onAudioBuffer Callback for audio buffers
* @param onStart Callback for when synthesis starts
* @param onEnd Callback for when synthesis ends
* @param onWord Callback for word boundary events
* @param options Options for synthesis
* @returns Promise resolving when synthesis is complete
*/
async synthToStream(text, onAudioBuffer, onStart, onEnd, onWord, options) {
try {
// Call onStart callback
if (onStart) {
onStart();
}
// Synthesize the entire audio
const audioBytes = await this.synthToBytes(text, options);
// Estimate word boundaries
if (onWord) {
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
// Schedule word boundary events
for (const boundary of wordBoundaries) {
setTimeout(() => {
onWord(boundary.word, boundary.start, boundary.end);
}, boundary.start * 1000);
}
}
// Send the audio buffer
onAudioBuffer(audioBytes);
// Call onEnd callback
if (onEnd) {
onEnd();
}
}
catch (error) {
console.error("Error synthesizing text to stream:", error);
// Call onEnd callback even if there's an error
if (onEnd) {
onEnd();
}
}
}
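// Example: wiring up the streaming callbacks. Word timings come from the
// estimator, not the engine, so treat them as approximate (sketch only):
//
//   await client.synthToStream(
//     "Hello world",
//     (audio) => { /* receives the full WAV buffer in one call */ },
//     () => console.log("started"),
//     () => console.log("finished"),
//     (word, start, end) => console.log(word, start, end)
//   );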
/**
* Synthesize text to speech and save to a file
* @param text Text to synthesize
* @param filename Filename to save as
* @param format Audio format (only "wav" is supported; other values fall back to WAV)
* @param options Options for synthesis
* @returns Promise resolving when synthesis is complete
*/
// Overrides the base class signature: Sherpa-ONNX output is WAV-only.
async synthToFile(text, filename, format = "wav", options) {
try {
let outputFormat = format;
// Sherpa-ONNX only supports WAV output
if (outputFormat !== "wav") {
console.warn("SherpaOnnx WebAssembly TTS only supports WAV output. Using WAV instead of", outputFormat);
outputFormat = "wav";
}
// Use the base class's file saving logic (which detects Node/Browser)
await super.synthToFile(text, filename, outputFormat, options);
}
catch (error) {
console.error("Error synthesizing text to file:", error);
throw error;
}
}
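// Example: requesting "mp3" logs a warning and writes WAV anyway; the
// actual file saving (Node vs. browser download) is handled by the base
// class (sketch only):
//
//   await client.synthToFile("Hello world", "hello", "wav");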
/**
* Get a property value
* @param property Property name
* @returns Property value
*/
getProperty(property) {
switch (property) {
case "voice":
return this.currentVoiceId || this.voiceId || undefined;
case "sampleRate":
return this.sampleRate;
case "wasmLoaded":
return this.wasmLoaded;
case "wasmPath":
return this.wasmPath;
case "multiModelEnabled":
return this.enhancedOptions.enableMultiModel;
case "maxCachedModels":
return this.enhancedOptions.maxCachedModels;
case "loadedModels":
return this.modelManager ? Array.from(this.modelManager['loadedModels'].keys()) : [];
case "currentModel":
return this.modelManager?.getCurrentModel();
case "availableModels":
return this.modelRepository?.getAvailableModels() || [];
default:
return super.getProperty(property);
}
}
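// Example: the multi-model introspection properties handled above
// (values shown are illustrative):
//
//   client.getProperty("multiModelEnabled"); // false unless opted in
//   client.getProperty("loadedModels");      // e.g. ["piper-en-amy-medium"]
//   client.getProperty("currentModel");      // id of the active model, if any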
/**
* Set a property value
* @param property Property name
* @param value Property value
*/
setProperty(property, value) {
switch (property) {
case "voice":
this.setVoice(value);
break;
case "wasmPath":
this.wasmPath = value;
break;
default:
super.setProperty(property, value);
break;
}
}
/**
* Set the voice to use for synthesis
* Enhanced with multi-model support while maintaining backward compatibility
* @param voiceId Voice ID to use
*/
async setVoice(voiceId) {
// Call the parent method to set the voiceId
super.setVoice(voiceId);
console.log(`Setting voice to ${voiceId}`);
// Enhanced multi-model support
if (this.enhancedOptions.enableMultiModel && this.modelRepository && this.modelManager) {
console.log(`Using enhanced multi-model mode for voice ${voiceId}`);
// Check if model is already loaded and active
if (this.modelManager.getCurrentModel() === voiceId) {
this.currentVoiceId = voiceId;
return;
}
// Load model if not already loaded
if (!this.modelManager.isModelLoaded(voiceId)) {
const files = await this.modelRepository.downloadModelFiles(voiceId);
const config = this.modelRepository.getModelConfig(voiceId);
await this.modelManager.loadModel(voiceId, files, config);
}
// Switch to the model
await this.modelManager.switchToModel(voiceId);
this.currentVoiceId = voiceId;
console.log(`Successfully switched to voice ${voiceId} using multi-model system`);
return;
}
// Legacy single-model mode (backward compatibility)
console.log(`Using legacy single-model mode for voice ${voiceId}`);
// Reset the TTS instance so it will be recreated with the new voice
if (this.tts) {
console.log("Resetting TTS instance for new voice");
this.tts = null;
}
}
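// Example: in multi-model mode setVoice downloads, loads, and switches in a
// single call; in legacy mode it only resets the cached TTS instance so the
// next synthesis recreates it. The voice id is illustrative:
//
//   await client.setVoice("piper-en-amy-medium"); // an id from _getVoices()
//   const wav = await client.synthToBytes("Voice switched");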
/**
* Clean up resources
* Enhanced to handle multi-model cleanup
*/
dispose() {
// Clean up multi-model resources
if (this.modelManager) {
console.log("Disposing multi-model resources");
this.modelManager.dispose();
this.modelManager = undefined;
}
// Clean up legacy TTS instance
if (this.wasmModule && this.tts) {
if (typeof this.wasmModule._ttsDestroyOffline === "function") {
this.wasmModule._ttsDestroyOffline(this.tts);
}
this.tts = null;
}
// Reset state
this.currentVoiceId = undefined;
this.wasmLoaded = false;
this.wasmModule = null;
}
/**
* Synthesize text to a byte stream
* @param text Text to synthesize
* @param options Options for synthesis
* @returns Promise resolving to an object containing the audio stream and an empty word boundaries array
*/
async synthToBytestream(text, options) {
// Simplified implementation: the full audio is synthesized up front and
// wrapped in a single-chunk ReadableStream rather than streamed incrementally.
const audioBytes = await this.synthToBytes(text, options);
// Create a ReadableStream from the audio bytes
return {
audioStream: new ReadableStream({
start(controller) {
controller.enqueue(audioBytes);
controller.close();
},
}),
wordBoundaries: [],
};
}
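// Example: draining the single-chunk stream returned above with a standard
// ReadableStream reader (sketch only, inside an async function):
//
//   const { audioStream } = await client.synthToBytestream("Hello");
//   const reader = audioStream.getReader();
//   const chunks = [];
//   for (;;) {
//     const { done, value } = await reader.read();
//     if (done) break;
//     chunks.push(value);
//   }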
}
exports.SherpaOnnxWasmTTSClient = SherpaOnnxWasmTTSClient;
/**
* Model Repository Manager
* Uses existing merged_models.json infrastructure for multi-model support
*/
class ModelRepository {
constructor() {
Object.defineProperty(this, "modelsIndex", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
// No baseUrl needed - we use the existing merged_models.json infrastructure
}
async loadModelsIndex() {
try {
// Use the existing merged_models.json loading logic from _getVoices
let voiceModels = [];
if (environment_1.isNode) {
const modelsJsonPath = environment_1.pathUtils.join(__dirname, "..", "engines", "sherpaonnx", "merged_models.json");
if (environment_1.fileSystem.existsSync(modelsJsonPath)) {
const modelsJson = environment_1.fileSystem.readFileSync(modelsJsonPath);
const modelsData = JSON.parse(modelsJson);
voiceModels = Object.values(modelsData);
}
}
else {
// In browser, try to fetch from the existing location
try {
const response = await fetch("./data/merged_models.json");
if (response.ok) {
const modelsData = await response.json();
voiceModels = Object.values(modelsData);
}
}
catch (fetchError) {
console.warn("Failed to fetch merged_models.json:", fetchError);
}
}
// Convert to our ModelConfig format
this.modelsIndex = voiceModels
.filter(model => ['kokoro', 'matcha', 'vits'].includes(model.model_type))
.map(model => ({
id: model.id,
type: model.model_type,
name: model.name,
language: model.language?.[0]?.lang_code || 'en',
gender: 'unknown', // Not specified in merged_models.json
sampleRate: model.sample_rate || 22050,
files: {
model: 'model.onnx',
tokens: 'tokens.txt',
voices: model.model_type === 'kokoro' ? 'voices.bin' : undefined,
vocoder: model.model_type === 'matcha' ? 'vocoder.onnx' : undefined
},
size: Math.round((model.filesize_mb || 64) * 1024 * 1024)
}));
console.log(`Loaded ${this.modelsIndex.length} compatible models from merged_models.json`);
}
catch (error) {
console.error('Error loading models index:', error);
// Fallback to default models for backward compatibility
this.modelsIndex = this.getDefaultModels();
}
}
getAvailableModels() {
return this.modelsIndex;
}
getModelConfig(modelId) {
return this.modelsIndex.find(model => model.id === modelId);
}
async downloadModelFiles(modelId) {
const config = this.getModelConfig(modelId);
if (!config) {
throw new Error(`Model ${modelId} not found in repository`);
}
console.log(`Downloading model files for ${modelId}...`);
// For now, return mock files since we don't have the actual WASM build yet
// In the real implementation, this would download from the model's URL
console.warn(`Mock implementation: returning placeholder files for ${modelId}`);
const mockModelData = new ArrayBuffer(1000);
const mockTokensData = new ArrayBuffer(500);
const files = {
model: mockModelData,
tokens: mockTokensData
};
if (config.type === 'kokoro') {
files.voices = new ArrayBuffer(200);
}
else if (config.type === 'matcha') {
files.vocoder = new ArrayBuffer(800);
}
console.log(`Mock download completed for ${modelId}`);
return files;
}
getDefaultModels() {
return [
{
id: 'piper-en-amy-medium',
type: 'vits',
name: 'Piper Amy (Medium)',
language: 'en-US',
gender: 'female',
sampleRate: 22050,
files: {
model: 'model.onnx',
tokens: 'tokens.txt'
},
size: 15000000 // ~15MB
}
];
}
}
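// Example: ModelRepository is internal to this module, but its flow mirrors
// what setVoice does in multi-model mode. Note downloadModelFiles currently
// returns mock buffers, as documented above (sketch only):
//
//   const repo = new ModelRepository();
//   await repo.loadModelsIndex();
//   const config = repo.getModelConfig("piper-en-amy-medium");
//   const files = await repo.downloadModelFiles("piper-en-amy-medium");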
/**
* WASM Model Manager
* Handles loading models into WebAssembly memory for multi-model support
*/
class WasmModelManager {
constructor(wasmModule, maxCachedModels = 3) {
Object.defineProperty(this, "wasmModule", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "loadedModels", {
enumerable: true,
configurable: true,
writable: true,
value: new Map()
});
Object.defineProperty(this, "currentModel", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "maxCachedModels", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.wasmModule = wasmModule;
this.maxCachedModels = maxCachedModels;
}
async loadModel(modelId, files, config) {
// Check if already loaded
if (this.loadedModels.has(modelId)) {
const model = this.loadedModels.get(modelId);
model.lastUsed = Date.now();
return model.handle;
}
// Free memory if needed
await this.ensureMemoryAvailable();
console.log(`Loading ${config.type} model ${modelId} into WASM...`);
// Allocate memory for model files
const modelPtr = this.wasmModule._malloc(files.model.byteLength);
const tokensPtr = this.wasmModule._malloc(files.tokens.byteLength);
let voicesPtr = 0;
let vocoderPtr = 0;
try {
// Copy model data to WASM memory
this.wasmModule.HEAPU8.set(new Uint8Array(files.model), modelPtr);
this.wasmModule.HEAPU8.set(new Uint8Array(files.tokens), tokensPtr);
// Handle model-specific files
if (files.voices && config.type === 'kokoro') {
voicesPtr = this.wasmModule._malloc(files.voices.byteLength);
this.wasmModule.HEAPU8.set(new Uint8Array(files.voices), voicesPtr);
}
if (files.vocoder && config.type === 'matcha') {
vocoderPtr = this.wasmModule._malloc(files.vocoder.byteLength);
this.wasmModule.HEAPU8.set(new Uint8Array(files.vocoder), vocoderPtr);
}
// Load model based on type
let modelHandle;
switch (config.type) {
case 'kokoro':
if (!this.wasmModule._LoadKokoroModel) {
throw new Error('Kokoro model loading not supported in this WASM build');
}
modelHandle = this.wasmModule._LoadKokoroModel(modelPtr, files.model.byteLength, tokensPtr, files.tokens.byteLength, voicesPtr, files.voices?.byteLength || 0);
break;
case 'matcha':
if (!this.wasmModule._LoadMatchaModel) {
throw new Error('Matcha model loading not supported in this WASM build');
}
modelHandle = this.wasmModule._LoadMatchaModel(modelPtr, files.model.byteLength, tokensPtr, files.tokens.byteLength, vocoderPtr, files.vocoder?.byteLength || 0);
break;
case 'vits':
default:
if (!this.wasmModule._LoadVitsModel) {
throw new Error('VITS model loading not supported in this WASM build');
}
modelHandle = this.wasmModule._LoadVitsModel(modelPtr, files.model.byteLength, tokensPtr, files.tokens.byteLength);
break;
}
if (modelHandle <= 0) {
throw new Error(`Failed to load ${config.type} model: ${modelHandle}`);
}
// Store loaded model info
this.loadedModels.set(modelId, {
config,
handle: modelHandle,
loaded: true,
lastUsed: Date.now()
});
console.log(`Successfully loaded ${config.type} model ${modelId} with handle ${modelHandle}`);
return modelHandle;
}
finally {
// Free temporary memory
this.wasmModule._free(modelPtr);
this.wasmModule._free(tokensPtr);
if (voicesPtr)
this.wasmModule._free(voicesPtr);
if (vocoderPtr)
this.wasmModule._free(vocoderPtr);
}
}
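// The malloc/HEAPU8/free dance above is the standard Emscripten pattern for
// handing a JS ArrayBuffer to native code: allocate WASM heap memory, copy
// the bytes in, call the export, then free the staging buffer. Minimal
// sketch, where Module is any Emscripten module and someExport is a
// placeholder for a compiled-in function:
//
//   const ptr = Module._malloc(buf.byteLength);
//   Module.HEAPU8.set(new Uint8Array(buf), ptr);
//   try {
//     someExport(ptr, buf.byteLength);
//   } finally {
//     Module._free(ptr);
//   }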
async switchToModel(modelId) {
const model = this.loadedModels.get(modelId);
if (!model) {
throw new Error(`Model ${modelId} is not loaded`);
}
if (!this.wasmModule._SwitchToModel) {
throw new Error('Model switching not supported in this WASM build');
}
console.log(`Switching to model ${modelId} (handle: ${model.handle})`);
const result = this.wasmModule._SwitchToModel(model.handle);
if (result !== 0) {
throw new Error(`Failed to switch to model ${modelId}: ${result}`);
}
this.currentModel = modelId;
model.lastUsed = Date.now();
}
getCurrentModel() {
return this.currentModel;
}
isModelLoaded(modelId) {
return this.loadedModels.has(modelId);
}
async ensureMemoryAvailable() {
if (this.loadedModels.size < this.maxCachedModels) {
return;
}
// Find least recently used model
let oldestModel;
let oldestTime = Date.now();
for (const [modelId, model] of this.loadedModels) {
if (model.lastUsed < oldestTime && modelId !== this.currentModel) {
oldestTime = model.lastUsed;
oldestModel = modelId;
}
}
if (oldestModel) {
console.log(`Unloading least recently used model: ${oldestModel}`);
await this.unloadModel(oldestModel);
}
}
async unloadModel(modelId) {
const model = this.loadedModels.get(modelId);
if (!model) {
return;
}
// Assumption: the enhanced WASM build exposes an _UnloadModel export that
// mirrors _SwitchToModel/_Load*Model; if it is absent, only the JS-side
// bookkeeping is dropped.
if (typeof this.wasmModule._UnloadModel === "function") {
this.wasmModule._UnloadModel(model.handle);
}
this.loadedModels.delete(modelId);
if (this.currentModel === modelId) {
this.currentModel = undefined;
}
console.log(`Unloaded model ${modelId}`);
}
dispose() {
// Called from SherpaOnnxWasmTTSClient.dispose(); drop all cached models.
for (const modelId of Array.from(this.loadedModels.keys())) {
void this.unloadModel(modelId);
}
this.loadedModels.clear();
this.currentModel = undefined;
}
}