// js-tts-wrapper
// A JavaScript/TypeScript library that provides a unified API for working with
// multiple cloud-based Text-to-Speech (TTS) services.
"use strict";
// TypeScript emit helper: re-export a property from module `source` onto
// `target`, preferring a live getter so later mutations remain visible.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(target, source, sourceKey, targetKey) {
    if (targetKey === undefined) targetKey = sourceKey;
    var descriptor = Object.getOwnPropertyDescriptor(source, sourceKey);
    // Replace the descriptor with a getter unless the source already exposes a
    // non-writable, non-configurable accessor from an ES-module namespace.
    var needsGetter = !descriptor ||
        ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
    if (needsGetter) {
        descriptor = {
            enumerable: true,
            get: function() { return source[sourceKey]; },
        };
    }
    Object.defineProperty(target, targetKey, descriptor);
}) : (function(target, source, sourceKey, targetKey) {
    // Legacy engines without Object.create: plain value copy (no live binding).
    if (targetKey === undefined) targetKey = sourceKey;
    target[targetKey] = source[sourceKey];
}));
// TypeScript emit helper: attach the original module object as the
// non-writable `default` export of a synthesized namespace object.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(target, value) {
    Object.defineProperty(target, "default", { enumerable: true, value: value });
}) : function(target, value) {
    // Fallback for engines without Object.create: plain assignment.
    target["default"] = value;
});
// TypeScript emit helper: wrap a CommonJS module in an ES-module-style
// namespace object (every own key re-exported, original module as `default`).
var __importStar = (this && this.__importStar) || (function () {
    // Lazily pick the key-enumeration strategy on first call, then memoize it
    // by overwriting `ownKeys` with the chosen implementation.
    var ownKeys = function(obj) {
        ownKeys = Object.getOwnPropertyNames || function (obj) {
            var keys = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) keys[keys.length] = key;
            }
            return keys;
        };
        return ownKeys(obj);
    };
    return function (mod) {
        // Real ES modules pass through untouched.
        if (mod && mod.__esModule) return mod;
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var i = 0; i < keys.length; i++) {
                if (keys[i] !== "default") __createBinding(namespace, mod, keys[i]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Mark this CommonJS module as transpiled from an ES module and pre-declare
// the named export so circular requires observe a defined (if undefined) slot.
Object.defineProperty(exports, "__esModule", { value: true });
exports.AbstractTTSClient = void 0;
// Runtime dependencies (compiled from TypeScript `import` statements).
const builder_1 = require("../ssml/builder");
const language_utils_1 = require("./language-utils");
const SSMLUtils = __importStar(require("./ssml-utils"));
const environment_1 = require("../utils/environment");
const node_audio_1 = require("../utils/node-audio");
/**
* Abstract base class for all TTS clients
* This provides a unified interface for all TTS providers
*/
class AbstractTTSClient {
    /**
     * Creates a new TTS client
     * @param credentials Provider-specific credentials
     */
    constructor(credentials) {
        Object.defineProperty(this, "credentials", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: credentials
        });
        /**
         * Currently selected voice ID
         */
        Object.defineProperty(this, "voiceId", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: null
        });
        /**
         * Currently selected language
         */
        Object.defineProperty(this, "lang", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: "en-US"
        });
        /**
         * Event callbacks, keyed by event name ("start", "end", "boundary", ...)
         */
        Object.defineProperty(this, "callbacks", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {}
        });
        /**
         * SSML builder instance
         */
        Object.defineProperty(this, "ssml", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Audio playback properties
         */
        Object.defineProperty(this, "audio", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * TTS properties (rate, pitch, volume)
         */
        Object.defineProperty(this, "properties", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {
                volume: 100,
                rate: "medium",
                pitch: "medium",
            }
        });
        /**
         * Word timings for the current audio, as [startSeconds, endSeconds, word] tuples
         */
        Object.defineProperty(this, "timings", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: []
        });
        /**
         * Audio sample rate in Hz
         * This is used for playback and word timing estimation
         * Default is 24000 Hz, but engines can override this
         */
        Object.defineProperty(this, "sampleRate", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 24000
        });
        this.ssml = new builder_1.SSMLBuilder();
        this.audio = {
            isPlaying: false,
            isPaused: false,
            audioElement: null,
            position: 0,
            duration: 0,
        };
    }
    /**
     * Get available voices from the provider with normalized language codes
     * @returns Promise resolving to an array of unified voice objects
     */
    async getVoices() {
        // Get raw voices from the engine-specific implementation
        const rawVoices = await this._getVoices();
        // Process and normalize the voices
        // Each engine should implement _mapVoiceToUnified to convert its raw voice format
        // to a partially filled UnifiedVoice object
        const voices = await this._mapVoicesToUnified(rawVoices);
        // Normalize language codes for all voices
        return voices.map((voice) => {
            // Normalize language codes for each language
            const normalizedLanguageCodes = voice.languageCodes.map((lang) => {
                const normalized = language_utils_1.LanguageNormalizer.normalize(lang.bcp47);
                return {
                    bcp47: normalized.bcp47,
                    iso639_3: normalized.iso639_3,
                    display: normalized.display,
                };
            });
            // Return the voice with normalized language codes
            return {
                ...voice,
                languageCodes: normalizedLanguageCodes,
            };
        });
    }
    // --- Optional overrides ---
    /**
     * Map provider-specific voice objects to unified format
     * @param rawVoices Array of provider-specific voice objects
     * @returns Promise resolving to an array of partially unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        // Default implementation that assumes rawVoices are already in UnifiedVoice format
        // Engine-specific implementations should override this method
        return rawVoices;
    }
    /**
     * Speak text using the default audio output, or play audio from file/bytes/stream
     * @param input Text to speak, or audio input (filename, audioBytes, or audioStream)
     * @param options Synthesis options
     * @returns Promise resolving when audio playback starts
     */
    async speak(input, options) {
        // Trigger onStart callback
        this.emit("start");
        try {
            let audioBytes;
            let mimeType;
            // Handle different input types
            if (typeof input === "string") {
                // Traditional text input
                audioBytes = await this.synthToBytes(input, options);
                // Determine MIME type based on options or engine default
                mimeType = "audio/wav"; // default to WAV
                if (options?.format === "mp3") {
                    mimeType = "audio/mpeg";
                }
                else if (options?.format === "ogg") {
                    mimeType = "audio/ogg";
                }
            }
            else {
                // Audio input (file, bytes, or stream)
                const { processAudioInput } = await Promise.resolve().then(() => __importStar(require("../utils/audio-input")));
                const result = await processAudioInput(input);
                audioBytes = result.audioBytes;
                mimeType = result.mimeType;
            }
            // Check if we're in a browser environment
            if (environment_1.isBrowser) {
                // Create audio blob and URL with the correct MIME type
                const blob = new Blob([audioBytes], { type: mimeType });
                const url = URL.createObjectURL(blob);
                // Create and play audio element
                const audio = new Audio();
                // Set up event handlers before setting the source
                audio.oncanplay = async () => {
                    try {
                        this.audio.audioElement = audio;
                        this.audio.isPlaying = true;
                        this.audio.isPaused = false;
                        // Create estimated word timings if needed (only for text input)
                        if (typeof input === "string") {
                            this._createEstimatedWordTimings(input);
                        }
                        // Play the audio
                        await audio.play();
                    }
                    catch (playError) {
                        console.error("Error playing audio:", playError);
                        this.emit("end");
                    }
                };
                audio.onerror = (e) => {
                    console.error("Audio playback error:", e);
                    this.emit("end");
                    URL.revokeObjectURL(url);
                };
                audio.onended = () => {
                    this.emit("end");
                    this.audio.isPlaying = false;
                    URL.revokeObjectURL(url); // Clean up the URL
                };
                // Set the source after setting up event handlers
                audio.src = url;
            }
            else if (environment_1.isNode) {
                // In Node.js environment, try to use sound-play
                try {
                    // Check if Node.js audio playback is available
                    const audioAvailable = await (0, node_audio_1.isNodeAudioAvailable)();
                    if (audioAvailable) {
                        // Play audio using our node-audio utility.
                        // "start" was already emitted at the top of this method, so it is
                        // intentionally not re-emitted here (previously it fired twice).
                        // Pass the engine name to handle Polly audio differently
                        await (0, node_audio_1.playAudioInNode)(audioBytes, this.sampleRate, this.constructor.name.replace('TTSClient', '').toLowerCase());
                        // Emit end event
                        this.emit("end");
                    }
                    else {
                        console.log("Audio playback in Node.js requires the sound-play package.");
                        console.log("Install it with: npm install js-tts-wrapper[node-audio]");
                        console.log("Or use synthToFile() to save audio to a file and play it with an external player.");
                        this.emit("end");
                    }
                }
                catch (nodeAudioError) {
                    console.error("Error playing audio in Node.js:", nodeAudioError);
                    this.emit("end");
                }
            }
            else {
                // Unknown environment
                console.log("Audio playback is not supported in this environment.");
                console.log("Use synthToFile() to save audio to a file and play it with an external player.");
                this.emit("end");
            }
        }
        catch (error) {
            console.error("Error in speak method:", error);
            this.emit("end"); // Ensure end event is triggered even on error
            throw error;
        }
    }
    /**
     * Speak text using streaming synthesis, or play audio from file/bytes/stream
     * @param input Text to speak, or audio input (filename, audioBytes, or audioStream)
     * @param options Synthesis options
     * @returns Promise resolving when audio playback starts
     */
    async speakStreamed(input, options) {
        // Trigger onStart callback
        this.emit("start");
        try {
            let audioBytes;
            let mimeType;
            let wordBoundaries = [];
            let text = "";
            // Handle different input types
            if (typeof input === "string") {
                // Traditional text input - use streaming synthesis
                text = input;
                const streamResult = await this.synthToBytestream(text, options);
                // Get audio stream and word boundaries
                const audioStream = streamResult.audioStream;
                wordBoundaries = streamResult.wordBoundaries;
                const reader = audioStream.getReader();
                const chunks = [];
                // Read all chunks from the stream
                let result = await reader.read();
                while (!result.done) {
                    chunks.push(result.value);
                    result = await reader.read();
                }
                // Combine chunks into a single audio buffer
                const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0);
                audioBytes = new Uint8Array(totalLength);
                let offset = 0;
                for (const chunk of chunks) {
                    audioBytes.set(chunk, offset);
                    offset += chunk.length;
                }
                // Determine MIME type based on options or engine default
                mimeType = "audio/wav"; // default to WAV
                if (options?.format === "mp3") {
                    mimeType = "audio/mpeg";
                }
                else if (options?.format === "ogg") {
                    mimeType = "audio/ogg";
                }
            }
            else {
                // Audio input (file, bytes, or stream)
                const { processAudioInput } = await Promise.resolve().then(() => __importStar(require("../utils/audio-input")));
                const result = await processAudioInput(input);
                audioBytes = result.audioBytes;
                mimeType = result.mimeType;
                // For audio input, we don't have word boundaries or text
                text = ""; // No text available for audio input
            }
            // Use actual word boundaries if available, otherwise create estimated ones
            if (wordBoundaries.length > 0) {
                // Convert the word boundaries to our internal format.
                // NOTE(review): dividing 100-nanosecond units by 10000 yields
                // milliseconds, not seconds as originally claimed — confirm the unit
                // each engine actually reports before changing the divisor.
                this.timings = wordBoundaries.map((wb) => [
                    wb.offset / 10000,
                    (wb.offset + wb.duration) / 10000,
                    wb.text,
                ]);
            }
            else if (text) {
                // Create estimated word timings only if we have text
                this._createEstimatedWordTimings(text);
            }
            else {
                // No text available (audio input), clear timings
                this.timings = [];
            }
            // Check if we're in a browser environment
            if (environment_1.isBrowser) {
                // Create audio blob and URL with the correct MIME type
                const blob = new Blob([audioBytes], { type: mimeType });
                const url = URL.createObjectURL(blob);
                // Create and play audio element
                const audio = new Audio();
                // Set up event handlers before setting the source
                audio.oncanplay = async () => {
                    try {
                        this.audio.audioElement = audio;
                        this.audio.isPlaying = true;
                        this.audio.isPaused = false;
                        // Play the audio
                        await audio.play();
                    }
                    catch (playError) {
                        console.error("Error playing audio:", playError);
                        this.emit("end");
                    }
                };
                audio.onerror = (e) => {
                    console.error("Audio playback error:", e);
                    this.emit("end");
                    URL.revokeObjectURL(url);
                };
                audio.onended = () => {
                    this.emit("end");
                    this.audio.isPlaying = false;
                    URL.revokeObjectURL(url);
                };
                // Set the source after setting up event handlers
                audio.src = url;
            }
            else if (environment_1.isNode) {
                // In Node.js environment, try to use sound-play
                try {
                    // Check if Node.js audio playback is available
                    const audioAvailable = await (0, node_audio_1.isNodeAudioAvailable)();
                    // this.timings was already populated above (real engine word
                    // boundaries when available, estimates otherwise). The previous
                    // implementation re-estimated here, clobbering real boundaries.
                    if (audioAvailable) {
                        // Schedule word boundary callbacks
                        this._scheduleWordBoundaryCallbacks();
                        // Play audio using our node-audio utility with the engine's sample rate
                        // Pass the engine name to handle Polly audio differently
                        await (0, node_audio_1.playAudioInNode)(audioBytes, this.sampleRate, this.constructor.name.replace('TTSClient', '').toLowerCase());
                        // Emit end event
                        this.emit("end");
                    }
                    else {
                        console.log("Audio playback in Node.js requires the sound-play package.");
                        console.log("Install it with: npm install js-tts-wrapper[node-audio]");
                        console.log("Or use synthToFile() to save audio to a file and play it with an external player.");
                        // Fire word boundary callbacks immediately
                        this._fireWordBoundaryCallbacks();
                        this.emit("end");
                    }
                }
                catch (nodeAudioError) {
                    console.error("Error playing audio in Node.js:", nodeAudioError);
                    this._fireWordBoundaryCallbacks();
                    this.emit("end");
                }
            }
            else {
                // Unknown environment
                console.log("Audio playback is not supported in this environment.");
                console.log("Use synthToFile() to save audio to a file and play it with an external player.");
                // Timings were already computed above; fire the boundary callbacks
                // asynchronously so listeners registered right after this call still run.
                setTimeout(() => {
                    this._fireWordBoundaryCallbacks();
                    this.emit("end");
                }, 100);
            }
        }
        catch (error) {
            console.error("Error in streaming synthesis:", error);
            this.emit("end"); // Ensure end event is triggered even on error
            throw error;
        }
    }
    /**
     * Synthesize text to audio and save it to a file (browser download or Node.js file write)
     * @param text Text or SSML to synthesize
     * @param filename Filename to save as; the format extension is appended if missing
     * @param format Audio format (mp3 or wav)
     * @param options Synthesis options
     */
    async synthToFile(text, filename, format = "wav", options) {
        // Convert text to audio bytes with the specified format
        const audioBytes = await this.synthToBytes(text, { ...options, format });
        if (environment_1.isBrowser) {
            // Create blob with appropriate MIME type
            const mimeType = format === "mp3" ? "audio/mpeg" : "audio/wav";
            const blob = new Blob([audioBytes], { type: mimeType });
            // Create download link
            const url = URL.createObjectURL(blob);
            const a = document.createElement("a");
            a.href = url;
            // Append the extension unless the caller already included it.
            // (Previously a garbled template produced files literally named "$(unknown).<ext>".)
            a.download = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
            // Trigger download
            document.body.appendChild(a);
            a.click();
            // Clean up: Use requestAnimationFrame for potentially smoother cleanup
            requestAnimationFrame(() => {
                if (document?.body?.contains(a)) {
                    document.body.removeChild(a);
                }
                URL.revokeObjectURL(url);
            });
        }
        else if (environment_1.isNode) {
            // In Node.js, use the file system
            const outputPath = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
            const fs = await Promise.resolve().then(() => __importStar(require('node:fs')));
            fs.writeFileSync(outputPath, Buffer.from(audioBytes));
        }
        else {
            console.warn("File saving not implemented for this environment.");
        }
    }
    /**
     * Set the voice to use for synthesis
     * @param voiceId Voice ID to use
     * @param lang Language code (optional)
     */
    setVoice(voiceId, lang) {
        this.voiceId = voiceId;
        if (lang) {
            this.lang = lang;
        }
    }
    // --- Playback control methods ---
    /**
     * Pause audio playback
     */
    pause() {
        if (environment_1.isBrowser) {
            // Browser environment - use HTML5 Audio element
            if (this.audio.audioElement && this.audio.isPlaying && !this.audio.isPaused) {
                this.audio.audioElement.pause();
                this.audio.isPaused = true;
            }
        }
        else if (environment_1.isNode) {
            // Node.js environment - use node-speaker
            try {
                // Import dynamically to avoid circular dependencies
                Promise.resolve().then(() => __importStar(require('./node-audio-control'))).then(nodeAudio => {
                    const paused = nodeAudio.pauseAudioPlayback();
                    if (paused) {
                        this.audio.isPaused = true;
                    }
                }).catch(error => {
                    console.error("Error importing node-audio-control:", error);
                });
            }
            catch (error) {
                console.error("Error pausing audio in Node.js:", error);
            }
        }
    }
    /**
     * Resume audio playback
     */
    resume() {
        if (environment_1.isBrowser) {
            // Browser environment - use HTML5 Audio element
            if (this.audio.audioElement && this.audio.isPlaying && this.audio.isPaused) {
                // play() returns a promise; attach a handler so a rejection
                // (e.g. autoplay policy) does not surface as an unhandled rejection.
                this.audio.audioElement.play().catch((error) => {
                    console.error("Error resuming audio:", error);
                });
                this.audio.isPaused = false;
            }
        }
        else if (environment_1.isNode) {
            // Node.js environment - use node-speaker
            try {
                // Import dynamically to avoid circular dependencies
                Promise.resolve().then(() => __importStar(require('./node-audio-control'))).then(nodeAudio => {
                    const resumed = nodeAudio.resumeAudioPlayback();
                    if (resumed) {
                        this.audio.isPaused = false;
                    }
                }).catch(error => {
                    console.error("Error importing node-audio-control:", error);
                });
            }
            catch (error) {
                console.error("Error resuming audio in Node.js:", error);
            }
        }
    }
    /**
     * Stop audio playback
     */
    stop() {
        if (environment_1.isBrowser) {
            // Browser environment - use HTML5 Audio element
            if (this.audio.audioElement) {
                this.audio.audioElement.pause();
                this.audio.audioElement.currentTime = 0;
                this.audio.isPlaying = false;
                this.audio.isPaused = false;
            }
        }
        else if (environment_1.isNode) {
            // Node.js environment - use node-speaker
            try {
                // Import dynamically to avoid circular dependencies
                Promise.resolve().then(() => __importStar(require('./node-audio-control'))).then(nodeAudio => {
                    const stopped = nodeAudio.stopAudioPlayback();
                    if (stopped) {
                        this.audio.isPlaying = false;
                        this.audio.isPaused = false;
                    }
                }).catch(error => {
                    console.error("Error importing node-audio-control:", error);
                });
            }
            catch (error) {
                console.error("Error stopping audio in Node.js:", error);
            }
        }
    }
    /**
     * Create estimated word timings for non-streaming engines
     * @param text Text to create timings for
     */
    _createEstimatedWordTimings(text) {
        // Extract plain text from SSML if needed
        const plainText = this._isSSML(text) ? this._stripSSML(text) : text;
        // Split into words
        const words = plainText.split(/\s+/).filter((word) => word.length > 0);
        if (!words.length)
            return;
        // Assume an average speaking rate of ~300ms per word.
        // (The original computed words.length * 0.3 / words.length, which is
        // always exactly 0.3.)
        const wordDuration = 0.3;
        // Create evenly-spaced word timings
        this.timings = [];
        for (let i = 0; i < words.length; i++) {
            const startTime = i * wordDuration;
            const endTime = (i + 1) * wordDuration;
            this.timings.push([startTime, endTime, words[i]]);
        }
    }
    /**
     * Fire word boundary callbacks immediately based on timing data
     */
    _fireWordBoundaryCallbacks() {
        if (!this.timings.length)
            return;
        // Get all boundary callbacks
        const callbacks = this.callbacks["boundary"] || [];
        if (!callbacks.length)
            return;
        // Fire callbacks for each word
        for (const [start, end, word] of this.timings) {
            for (const callback of callbacks) {
                callback(word, start, end);
            }
        }
    }
    /**
     * Schedule word boundary callbacks based on timing information
     * This is used when we have audio playback but need to schedule callbacks
     */
    _scheduleWordBoundaryCallbacks() {
        if (!this.timings.length)
            return;
        // Get all boundary callbacks
        const callbacks = this.callbacks["boundary"] || [];
        if (!callbacks.length)
            return;
        // Schedule callbacks for each word, offset by its start time
        for (const [start, end, word] of this.timings) {
            setTimeout(() => {
                for (const callback of callbacks) {
                    callback(word, start, end);
                }
            }, start * 1000);
        }
    }
    /**
     * Check if text is SSML
     * @param text Text to check
     * @returns True if text is SSML
     */
    _isSSML(text) {
        return SSMLUtils.isSSML(text);
    }
    /**
     * Strip SSML tags from text
     * @param ssml SSML text
     * @returns Plain text without SSML tags
     */
    _stripSSML(ssml) {
        return SSMLUtils.stripSSML(ssml);
    }
    // --- Event system ---
    /**
     * Register a callback for an event
     * @param event Event type
     * @param fn Callback function
     */
    on(event, fn) {
        this.callbacks[event] = this.callbacks[event] || [];
        this.callbacks[event].push(fn);
    }
    /**
     * Emit an event to all registered callbacks
     * @param event Event type
     * @param args Event arguments
     */
    emit(event, ...args) {
        for (const fn of this.callbacks[event] || []) {
            fn(...args);
        }
    }
    /**
     * Start playback with word boundary callbacks
     * Note: in Node.js, speak() resolves after playback completes, so the
     * scheduled callbacks may fire after the audio has finished.
     * @param text Text or SSML to speak
     * @param callback Callback function for word boundaries
     * @param options Synthesis options
     */
    async startPlaybackWithCallbacks(text, callback, options) {
        // Speak the text
        await this.speak(text, options);
        // Use the timings to schedule callbacks
        for (const [start, end, word] of this.timings) {
            setTimeout(() => {
                callback(word, start, end);
            }, start * 1000);
        }
    }
    /**
     * Connect a callback to an event
     * @param event Event name ("onStart" or "onEnd"; other names are ignored)
     * @param callback Callback function
     */
    connect(event, callback) {
        if (event === "onStart") {
            this.on("start", callback);
        }
        else if (event === "onEnd") {
            this.on("end", callback);
        }
    }
    /**
     * Get the value of a property
     * @param propertyName Property name
     * @returns Property value
     */
    getProperty(propertyName) {
        return this.properties[propertyName];
    }
    /**
     * Set a property value
     * @param propertyName Property name
     * @param value Property value
     */
    setProperty(propertyName, value) {
        this.properties[propertyName] = value;
    }
    /**
     * Create a prosody tag with the current properties
     * @param text Text to wrap with prosody
     * @returns Text with prosody tag (or unchanged text when no properties are set)
     */
    constructProsodyTag(text) {
        const attrs = [];
        if (this.properties.rate) {
            attrs.push(`rate="${this.properties.rate}"`);
        }
        if (this.properties.pitch) {
            attrs.push(`pitch="${this.properties.pitch}"`);
        }
        if (this.properties.volume) {
            attrs.push(`volume="${this.properties.volume}%"`);
        }
        if (attrs.length === 0) {
            return text;
        }
        return `<prosody ${attrs.join(" ")}>${text}</prosody>`;
    }
    /**
     * Check if credentials are valid
     * @returns Promise resolving to true if credentials are valid
     */
    async checkCredentials() {
        try {
            const voices = await this._getVoices();
            return voices.length > 0;
        }
        catch (error) {
            console.error("Error checking credentials:", error);
            return false;
        }
    }
    /**
     * Check if credentials are valid with detailed response
     * @returns Promise resolving to an object with success flag and optional error message
     */
    async checkCredentialsDetailed() {
        try {
            const voices = await this._getVoices();
            return {
                success: voices.length > 0,
                voiceCount: voices.length
            };
        }
        catch (error) {
            console.error("Error checking credentials:", error);
            return {
                success: false,
                error: error instanceof Error ? error.message : String(error)
            };
        }
    }
    /**
     * Get available voices for a specific language
     * @param language Language code (BCP-47 format, e.g., 'en-US')
     * @returns Promise resolving to an array of available voices for the specified language
     */
    async getVoicesByLanguage(language) {
        // Normalize the input language code
        const normalizedLanguage = language_utils_1.LanguageNormalizer.normalize(language);
        // Get all voices
        const voices = await this.getVoices();
        // Filter voices by language
        return voices.filter((voice) => voice.languageCodes.some((lang) =>
        // Match by BCP-47 code
        lang.bcp47 === normalizedLanguage.bcp47 ||
            // Or by ISO 639-3 code
            lang.iso639_3 === normalizedLanguage.iso639_3));
    }
}
exports.AbstractTTSClient = AbstractTTSClient;