js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
178 lines (177 loc) • 8.12 kB
JavaScript
import { AbstractTTSClient } from "../core/abstract-tts.js";
// Cached text2wav export; stays null until the first successful load.
let text2wav = null;
/**
 * Lazily import the `text2wav` package and cache its export for later calls.
 *
 * The module's default export is used when present; otherwise the imported
 * module namespace itself is cached and returned.
 *
 * @returns {Promise<*>} The cached text2wav export.
 * @throws {Error} When not running under Node.js, or when importing `text2wav`
 *   fails (the underlying import error is attached as `cause`).
 */
async function loadText2Wav() {
    if (text2wav) {
        return text2wav;
    }
    // Checked outside the import error handling on purpose: an unsupported
    // runtime must not be reported as "package not found ... npm install".
    if (typeof process === "undefined" || !process.versions || !process.versions.node) {
        throw new Error("EspeakNodeTTSClient is only supported in Node.js environments");
    }
    try {
        const imported = await import("text2wav");
        // Handle both default and named exports.
        text2wav = imported.default ? imported.default : imported;
        return text2wav;
    }
    catch (importError) {
        console.error("Error loading text2wav:", importError);
        // Next.js is detected through its environment variables so the message
        // can point at bundling restrictions instead of the raw import error.
        const isNextJS = process.env.NEXT_RUNTIME || process.env.__NEXT_PRIVATE_ORIGIN;
        const detail = isNextJS
            ? "text2wav package not found in Next.js environment. " +
                "This may be due to Next.js bundling restrictions. " +
                "Consider using EspeakBrowserTTSClient for browser environments or " +
                "ensure text2wav is properly installed: npm install text2wav"
            : importError instanceof Error
                ? importError.message
                : String(importError);
        // Keep the original failure reachable (stack, error code) via `cause`;
        // runtimes without Error-cause support ignore the second argument.
        throw new Error(`text2wav package not found. ${detail}. Please install it with: npm install text2wav`, { cause: importError });
    }
}
/**
 * eSpeak TTS client for Node.js environments.
 *
 * Synthesis is delegated to the text2wav package, which is loaded on demand
 * through loadText2Wav(). For browser environments, use
 * EspeakBrowserTTSClient instead.
 */
export class EspeakNodeTTSClient extends AbstractTTSClient {
    /**
     * @param credentials Passed through to the base class; eSpeak itself needs none.
     */
    constructor(credentials = {}) {
        super(credentials);
        // Default English voice, used when a synthesis call does not name one.
        this.voiceId = "en";
    }
    /**
     * Get the list of required credential types for this engine.
     * @returns Array of required credential field names (always empty).
     */
    getRequiredCredentials() {
        return []; // eSpeak doesn't require any credentials
    }
    /**
     * eSpeak does not require credentials in Node.js.
     * @returns Promise resolving to true.
     */
    async checkCredentials() {
        return true;
    }
    /**
     * Synthesize text to audio bytes.
     *
     * `options.voice` overrides the client's voice. `options.rate` and
     * `options.pitch` may be numbers or numeric strings; values that do not
     * parse to a number (for example a named preset) are ignored so the
     * engine default applies instead of NaN being sent to text2wav.
     *
     * @param text Text to synthesize
     * @param options Synthesis options (voice, rate, pitch)
     * @returns Promise resolving to whatever text2wav returns (the audio bytes)
     * @throws {Error} Wrapping any load or synthesis failure; the original error is attached as `cause`.
     */
    async synthToBytes(text, options) {
        // Parse a number or numeric string and clamp it; yields NaN when the
        // input is not numeric, which callers below treat as "not provided".
        const clampNumeric = (value, min, max) => {
            const parsed = typeof value === "string" ? Number.parseFloat(value) : value;
            return Math.max(min, Math.min(max, parsed));
        };
        try {
            const text2wavModule = await loadText2Wav();
            const text2wavOptions = {
                // Voice from options, else the client's voice, else English.
                voice: options?.voice || this.voiceId || "en",
            };
            if (options?.rate) {
                // Linear map from rate (0.1-10) to words per minute (50-400).
                // NOTE(review): with this mapping a rate of 1 becomes ~82 WPM,
                // well below the 175 WPM default mentioned for text2wav - confirm intended.
                const rate = clampNumeric(options.rate, 0.1, 10);
                if (!Number.isNaN(rate)) {
                    text2wavOptions.speed = Math.round(50 + ((rate - 0.1) * (400 - 50)) / (10 - 0.1));
                }
            }
            if (options?.pitch) {
                // Linear map from pitch (0.1-2) to the 0-99 scale.
                const pitch = clampNumeric(options.pitch, 0.1, 2);
                if (!Number.isNaN(pitch)) {
                    text2wavOptions.pitch = Math.round(((pitch - 0.1) * 99) / (2 - 0.1));
                }
            }
            // The text2wav result is returned unchanged.
            return await text2wavModule(text, text2wavOptions);
        }
        catch (err) {
            console.error("eSpeak TTS synthesis error:", err);
            const reason = err instanceof Error ? err.message : String(err);
            // `cause` keeps the original stack available to callers.
            throw new Error(`Failed to synthesize speech with eSpeak: ${reason}`, { cause: err });
        }
    }
    /**
     * Synthesize text to a byte stream (ReadableStream).
     *
     * The audio is fully synthesized first and then emitted as a single chunk.
     *
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to an object containing the audio stream and an empty word boundaries array.
     */
    async synthToBytestream(text, options) {
        const audioBytes = await this.synthToBytes(text, options);
        // "Fake" streaming by wrapping full audio in a ReadableStream
        const audioStream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        return { audioStream, wordBoundaries: [] };
    }
    // TODO: Add voice/language/rate/pitch options, browser WASM loader, etc.
    /**
     * Return a fixed list of common eSpeak voices in the unified voice shape.
     * @returns Promise resolving to an array of voice descriptors.
     */
    async _getVoices() {
        // eSpeak supports many languages, here's a subset of common ones
        const commonVoices = [
            { id: "en", name: "English", language: "English" },
            { id: "en+f3", name: "English (Female 3)", language: "English" },
            { id: "en+m3", name: "English (Male 3)", language: "English" },
            { id: "en+whisper", name: "English (Whisper)", language: "English" },
            { id: "es", name: "Spanish", language: "Spanish" },
            { id: "fr", name: "French", language: "French" },
            { id: "de", name: "German", language: "German" },
            { id: "it", name: "Italian", language: "Italian" },
            { id: "pt", name: "Portuguese", language: "Portuguese" },
            { id: "ru", name: "Russian", language: "Russian" },
            { id: "zh", name: "Chinese (Mandarin)", language: "Chinese" },
            { id: "ja", name: "Japanese", language: "Japanese" },
            { id: "ko", name: "Korean", language: "Korean" },
            { id: "ar", name: "Arabic", language: "Arabic" },
            { id: "hi", name: "Hindi", language: "Hindi" },
            { id: "nl", name: "Dutch", language: "Dutch" },
            { id: "sv", name: "Swedish", language: "Swedish" },
            { id: "da", name: "Danish", language: "Danish" },
            { id: "no", name: "Norwegian", language: "Norwegian" },
            { id: "fi", name: "Finnish", language: "Finnish" },
            { id: "pl", name: "Polish", language: "Polish" },
            { id: "cs", name: "Czech", language: "Czech" },
            { id: "hu", name: "Hungarian", language: "Hungarian" },
            { id: "tr", name: "Turkish", language: "Turkish" },
            { id: "he", name: "Hebrew", language: "Hebrew" },
            { id: "th", name: "Thai", language: "Thai" },
            { id: "vi", name: "Vietnamese", language: "Vietnamese" },
        ];
        return commonVoices.map((voice) => ({
            id: voice.id,
            name: `${voice.name} (eSpeak)`,
            gender: "Unknown", // gender is not derived from the voice id here
            provider: "espeak-ng",
            languageCodes: [
                {
                    bcp47: voice.id.split("+")[0], // base language code, variant suffix dropped
                    iso639_3: "", // Would need mapping
                    display: voice.language,
                },
            ],
        }));
    }
}
// Backward-compatibility alias: re-exports EspeakNodeTTSClient under the name EspeakTTSClient.
export { EspeakNodeTTSClient as EspeakTTSClient };