js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
271 lines (270 loc) • 11.5 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.EspeakWasmTTSClient = exports.EspeakBrowserTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
/**
 * Report whether the code is running in a browser-like environment.
 *
 * @returns {boolean} `true` only when both the `window` and `document`
 *   globals are defined; `false` otherwise.
 */
function isBrowser() {
    // `typeof` never throws on an undeclared global, so this is safe everywhere.
    if (typeof window === "undefined") {
        return false;
    }
    return typeof document !== "undefined";
}
/**
 * Dynamically import a module using a specifier that is only known at runtime.
 *
 * The `import()` call is built through the `Function` constructor instead of
 * being written inline. NOTE(review): presumably this keeps bundlers and
 * transpilers from statically resolving or rewriting the import — confirm.
 * The function body is a fixed string and the specifier is passed as an
 * argument, so no caller-supplied text is ever compiled as code.
 *
 * @param {string} specifier Module specifier handed to `import()`.
 * @returns {Promise<any>} Promise for the imported module namespace.
 */
function runtimeImport(specifier) {
    const importAtRuntime = new Function("m", "return import(m)");
    return importAtRuntime(specifier);
}
// Removed meSpeak interface - no longer used
/**
 * eSpeak TTS client usable from both browsers and Node.js.
 *
 * - In a browser (both `window` and `document` are defined) audio is produced
 *   by the optional `mespeak` package, which is imported lazily on first use.
 * - Everywhere else, synthesis and voice listing are delegated to the
 *   `EspeakNodeTTSClient` exported by the sibling `./espeak` module, which is
 *   also loaded lazily so it is never pulled in for browser-only use.
 *
 * For Node.js-only environments with better performance, use EspeakNodeTTSClient instead.
 */
class EspeakBrowserTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * @param credentials Passed to the base class and, outside browsers, to
     *   the delegated Node client. eSpeak itself needs no credentials.
     */
    constructor(credentials = {}) {
        super(credentials);
        // Lazily created Node.js delegate (see _getNodeClient).
        Object.defineProperty(this, "nodeClient", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        // The imported meSpeak module, or null until ensureMeSpeakLoaded() succeeds.
        Object.defineProperty(this, "meSpeak", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        // Set once the meSpeak module has been imported successfully.
        Object.defineProperty(this, "meSpeakReady", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        // Set a default voice for eSpeak TTS
        this.voiceId = "en"; // Default English voice
    }
    /**
     * Return the Node.js delegate client, importing `./espeak` and creating
     * the instance on first use. Shared by synthToBytes() and _getVoices().
     * @returns Promise resolving to the cached delegate instance.
     */
    async _getNodeClient() {
        if (!this.nodeClient) {
            const mod = await runtimeImport("./espeak");
            const EspeakNodeTTSClient = mod.EspeakNodeTTSClient || mod.default;
            this.nodeClient = new EspeakNodeTTSClient(this.credentials);
        }
        return this.nodeClient;
    }
    /**
     * Synthesize text to audio bytes.
     * @param text Text to synthesize
     * @param options Synthesis options; `rate` and `pitch` keywords are
     *   honoured on the browser path, everything is forwarded on the Node path.
     * @returns Promise resolving to a Uint8Array on the browser path, or to
     *   whatever the delegated Node client returns otherwise.
     * @throws Error if meSpeak cannot be loaded or produces no audio.
     */
    async synthToBytes(text, options) {
        // Node.js: delegate to Node client
        if (!isBrowser()) {
            const nodeClient = await this._getNodeClient();
            return await nodeClient.synthToBytes(text, options);
        }
        // Browser: use meSpeak with its config/voice JSON payloads
        await this.ensureMeSpeakLoaded();
        const meSpeak = this.meSpeak;
        if (!meSpeak) {
            throw new Error("eSpeak-WASM: meSpeak failed to load");
        }
        const voiceId = (this.voiceId || "en").toLowerCase();
        // pick meSpeak voice payload (limited set to keep bundle small)
        const voicePayload = await this.getVoicePayload(voiceId);
        if (!meSpeak.isConfigLoaded()) {
            const { default: configJson } = await runtimeImport("mespeak/src/mespeak_config.json");
            meSpeak.loadConfig(configJson);
        }
        if (voicePayload && !meSpeak.isVoiceLoaded(voicePayload.voice_id)) {
            meSpeak.loadVoice(voicePayload);
            meSpeak.setDefaultVoice(voicePayload.voice_id);
        }
        // Map SpeakOptions rate/pitch keywords to meSpeak numeric speed/pitch.
        // Maps (not object literals) so that a keyword such as "constructor"
        // cannot resolve to an inherited Object.prototype member.
        const rateToSpeed = new Map([
            ["x-slow", 80],
            ["slow", 120],
            ["medium", 175],
            ["fast", 220],
            ["x-fast", 300],
        ]);
        const pitchMap = new Map([
            ["x-low", 10],
            ["low", 25],
            ["medium", 50],
            ["high", 70],
            ["x-high", 90],
        ]);
        // Unknown keywords fall back to the "medium" values.
        const speed = rateToSpeed.get(options?.rate || "medium") ?? 175;
        const pitch = pitchMap.get(options?.pitch || "medium") ?? 50;
        // get raw audio data from meSpeak as a plain array of byte values
        const arr = meSpeak.speak(text, {
            rawdata: "array",
            voice: voicePayload?.voice_id || voiceId,
            speed,
            pitch,
        });
        if (!arr || !arr.length) {
            throw new Error("eSpeak-WASM: synthesis failed");
        }
        return new Uint8Array(arr);
    }
    /**
     * Import the optional `mespeak` package once and cache it on the instance.
     * @throws Error when the package cannot be imported; the underlying
     *   failure is preserved as the error's `cause`.
     */
    async ensureMeSpeakLoaded() {
        if (this.meSpeakReady) {
            return;
        }
        try {
            // mespeak is optional and should only be resolved when this engine is actually used.
            const mod = await runtimeImport("mespeak");
            this.meSpeak = (mod && (mod.default || mod));
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            // Keep the original error reachable for debugging; runtimes that
            // predate Error `cause` simply ignore the second argument.
            throw new Error(`eSpeak-WASM requires the optional 'mespeak' package at runtime. ${reason}`, { cause: error });
        }
        this.meSpeakReady = true;
    }
    /**
     * Load the meSpeak voice definition for a voice id.
     *
     * Only a small curated set of English voices is supported; any other id
     * falls back to plain English ("en").
     * @param voiceId Lower-cased voice id
     * @returns Promise resolving to the voice JSON, or null if it could not be
     *   imported (best effort: the caller then synthesizes without loading a voice).
     */
    async getVoicePayload(voiceId) {
        const bundledVoices = ["en", "en-us", "en-rp", "en-sc", "en-wm"];
        // Fallback to plain English if requested voice not bundled
        const voiceFile = bundledVoices.includes(voiceId) ? voiceId : "en";
        try {
            const { default: payload } = await runtimeImport(`mespeak/voices/en/${voiceFile}.json`);
            return payload;
        }
        catch {
            return null;
        }
    }
    /**
     * Synthesize text to a byte stream (ReadableStream)
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to an object containing the audio stream and
     *   the word boundaries: estimated timings when `options.useWordBoundary`
     *   is truthy, otherwise an empty array.
     */
    async synthToBytestream(text, options) {
        const audioBytes = await this.synthToBytes(text, options);
        // Generate word boundaries if requested
        let wordBoundaries = [];
        if (options?.useWordBoundary) {
            // Create estimated word timings and store them on this.timings
            this._createEstimatedWordTimings(text);
            // Convert internal [start, end, word] timings to word boundary format.
            // NOTE(review): the x10000 factor yields 100-nanosecond units only
            // if the timings are in milliseconds — confirm the unit produced
            // by _createEstimatedWordTimings.
            wordBoundaries = this.timings.map(([start, end, word]) => ({
                text: word,
                offset: Math.round(start * 10000),
                duration: Math.round((end - start) * 10000),
            }));
        }
        // "Fake" streaming by wrapping full audio in a ReadableStream
        const audioStream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        return { audioStream, wordBoundaries };
    }
    /**
     * Return available voices for eSpeak WASM
     * @returns Promise resolving to the voice list: relabelled voices from the
     *   Node client outside browsers, or a fixed list of common languages in browsers.
     */
    async _getVoices() {
        // For Node.js environments, delegate to the regular eSpeak client (lazy loaded)
        if (!isBrowser()) {
            const nodeClient = await this._getNodeClient();
            const nodeVoices = await nodeClient._getVoices();
            // Rename them to indicate they're from eSpeak WASM (but actually using Node.js fallback)
            return nodeVoices.map((voice) => ({
                ...voice,
                name: voice.name.replace("(eSpeak)", "(eSpeak WASM)"),
            }));
        }
        // meSpeak supports many languages, here's a subset of common ones.
        // NOTE(review): getVoicePayload() only loads English payloads and maps
        // every other id to "en", so the non-English entries below are
        // currently synthesized with the English voice; "en-wm" is loadable
        // but not listed here. Confirm whether this list should be narrowed.
        const commonVoices = [
            { id: "en", name: "English", language: "English" },
            { id: "en-us", name: "English (US)", language: "English" },
            { id: "en-rp", name: "English (RP)", language: "English" },
            { id: "en-sc", name: "English (Scottish)", language: "English" },
            { id: "es", name: "Spanish", language: "Spanish" },
            { id: "es-la", name: "Spanish (Latin America)", language: "Spanish" },
            { id: "fr", name: "French", language: "French" },
            { id: "de", name: "German", language: "German" },
            { id: "it", name: "Italian", language: "Italian" },
            { id: "pt", name: "Portuguese (Brazil)", language: "Portuguese" },
            { id: "pt-pt", name: "Portuguese (European)", language: "Portuguese" },
            { id: "ru", name: "Russian", language: "Russian" },
            { id: "zh", name: "Chinese (Mandarin)", language: "Chinese" },
            { id: "zh-yue", name: "Chinese (Cantonese)", language: "Chinese" },
            { id: "ja", name: "Japanese", language: "Japanese" },
            { id: "ko", name: "Korean", language: "Korean" },
            { id: "ar", name: "Arabic", language: "Arabic" },
            { id: "hi", name: "Hindi", language: "Hindi" },
            { id: "nl", name: "Dutch", language: "Dutch" },
            { id: "sv", name: "Swedish", language: "Swedish" },
            { id: "da", name: "Danish", language: "Danish" },
            { id: "no", name: "Norwegian", language: "Norwegian" },
            { id: "fi", name: "Finnish", language: "Finnish" },
            { id: "pl", name: "Polish", language: "Polish" },
            { id: "cs", name: "Czech", language: "Czech" },
            { id: "hu", name: "Hungarian", language: "Hungarian" },
            { id: "tr", name: "Turkish", language: "Turkish" },
            { id: "he", name: "Hebrew", language: "Hebrew" },
            { id: "th", name: "Thai", language: "Thai" },
            { id: "vi", name: "Vietnamese", language: "Vietnamese" },
        ];
        const voices = commonVoices.map((voice) => ({
            id: voice.id,
            name: `${voice.name} (eSpeak WASM)`,
            gender: "Unknown", // meSpeak doesn't typically provide gender info
            provider: "espeak-ng",
            languageCodes: [
                {
                    bcp47: voice.id.split("-")[0], // Use the base language code
                    iso639_3: "", // Would need mapping
                    display: voice.language,
                },
            ],
        }));
        return voices;
    }
    /**
     * Get the list of required credential types for this engine
     * @returns Array of required credential field names
     */
    getRequiredCredentials() {
        return []; // eSpeak doesn't require any credentials
    }
    /**
     * Check if credentials are valid (eSpeak doesn't need credentials)
     * @returns Promise resolving to true, unconditionally
     */
    async checkCredentials() {
        // eSpeak doesn't need credentials and we have fallbacks for both environments
        return true;
    }
    /**
     * Get detailed credential validation info
     * @returns Promise resolving to an always-valid result that also reports
     *   which environment and engine label apply.
     */
    async checkCredentialsAdvanced() {
        const inBrowser = isBrowser();
        return {
            valid: true,
            message: "eSpeak WASM is available with environment-specific fallbacks",
            details: {
                environment: inBrowser ? "browser" : "node",
                engine: inBrowser ? "meSpeak" : "text2wav",
                note: "Credentials not required for eSpeak",
            },
        };
    }
}
// Publish the class under both export names; `EspeakWasmTTSClient` is an
// alias bound to the very same constructor as `EspeakBrowserTTSClient`.
exports.EspeakBrowserTTSClient = EspeakBrowserTTSClient;
exports.EspeakWasmTTSClient = EspeakBrowserTTSClient;