js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
556 lines (555 loc) • 19.3 kB
JavaScript
"use strict";
// TypeScript-emitted interop helper; an implementation already present on `this` is reused.
// Exposes property `k` of module `m` on object `o` under the name `k2` (defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Use a forwarding getter (so `o[k2]` always reads the current `m[k]`) when there is no
// own descriptor, when `m[k]` is an accessor on a non-ES module, or when it is a writable
// or configurable data property; otherwise the source descriptor is reused as-is.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// TypeScript-emitted interop helper; an implementation already present on `this` is reused.
// Stores `v` as the "default" export of the namespace object `o`.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
// Defined as an enumerable data property (not writable, not configurable).
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
// Fallback when Object.create is unavailable: plain assignment.
o["default"] = v;
});
// TypeScript-emitted interop helper; an implementation already present on `this` is reused.
// Builds the namespace object used for `import * as ns from "..."` on a required module.
var __importStar = (this && this.__importStar) || (function () {
// Lists the own property names of `o`. On first use the function replaces itself with
// Object.getOwnPropertyNames, or with a for-in/hasOwnProperty fallback when that is missing.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged as ES modules are returned unchanged.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key except "default" onto the result; null/undefined modules contribute nothing.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// The original module value itself becomes the "default" export.
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.AbstractTTSClient = void 0;
const builder_1 = require("../ssml/builder");
const language_utils_1 = require("./language-utils");
const SSMLUtils = __importStar(require("./ssml-utils"));
/**
 * Abstract base class for all TTS clients.
 *
 * Holds the provider-independent state (selected voice, language, prosody
 * properties, word timings, playback state) and implements playback, the event
 * system and voice normalization on top of three methods that this class calls
 * but does not define: `_getVoices()`, `synthToBytes(text, options)` and
 * `synthToBytestream(text, options)`. Concrete engines must supply them.
 */
class AbstractTTSClient {
    /**
     * Creates a new TTS client
     * @param credentials Provider-specific credentials
     */
    constructor(credentials) {
        // Initial value of every instance field, in definition order.
        const initialFields = {
            // Provider-specific credentials, stored as given
            credentials,
            // Currently selected voice ID
            voiceId: null,
            // Currently selected language
            lang: "en-US",
            // Event callbacks, keyed by event name; each value is an array of functions
            callbacks: {},
            // SSML builder instance
            ssml: new builder_1.SSMLBuilder(),
            // Audio playback properties
            audio: {
                isPlaying: false,
                isPaused: false,
                audioElement: null,
                position: 0,
                duration: 0,
            },
            // TTS properties (rate, pitch, volume)
            properties: {
                volume: 100,
                rate: "medium",
                pitch: "medium",
            },
            // Word timings for the current audio: [startSeconds, endSeconds, word]
            timings: [],
            // Audio sample rate
            audioRate: 24000,
        };
        // Fields are *defined* (not assigned) so that accessors on a subclass
        // prototype are shadowed rather than invoked.
        for (const [name, value] of Object.entries(initialFields)) {
            Object.defineProperty(this, name, {
                enumerable: true,
                configurable: true,
                writable: true,
                value,
            });
        }
    }
    /**
     * Get available voices from the provider with normalized language codes
     * @returns Promise resolving to an array of unified voice objects
     */
    async getVoices() {
        // Raw voices come from the engine-specific implementation
        const rawVoices = await this._getVoices();
        // Engines override _mapVoicesToUnified to convert their raw format
        const voices = await this._mapVoicesToUnified(rawVoices);
        return voices.map((voice) => ({
            ...voice,
            // A voice without language codes yields an empty list instead of throwing
            languageCodes: (voice.languageCodes || []).map((lang) => {
                const normalized = language_utils_1.LanguageNormalizer.normalize(lang.bcp47);
                return {
                    bcp47: normalized.bcp47,
                    iso639_3: normalized.iso639_3,
                    display: normalized.display,
                };
            }),
        }));
    }
    // --- Optional overrides ---
    /**
     * Map provider-specific voice objects to unified format
     * @param rawVoices Array of provider-specific voice objects
     * @returns Promise resolving to an array of partially unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        // Default implementation assumes rawVoices are already in unified format;
        // engine-specific implementations should override this method.
        return rawVoices;
    }
    /**
     * Speak text using the default audio output.
     *
     * Emits "start" immediately. "end" is emitted when playback finishes, or
     * right away when the runtime cannot play audio, or when synthesis or
     * playback fails (the error is then re-thrown).
     * @param text Text or SSML to speak
     * @param options Synthesis options
     * @returns Promise resolving when audio playback starts
     */
    async speak(text, options) {
        this.emit("start");
        let playbackStarted;
        try {
            const audioBytes = await this.synthToBytes(text, options);
            // This path has no real word boundaries, so timings are estimated
            this._createEstimatedWordTimings(text);
            playbackStarted = await this._playAudioBytes(audioBytes);
        }
        catch (error) {
            // Same guarantee as speakStreamed: every "start" is followed by an "end"
            this.emit("end");
            throw error;
        }
        if (!playbackStarted) {
            // No audio output available (e.g. Node.js): nothing will ever fire onended
            this.emit("end");
        }
    }
    /**
     * Play audio bytes through an `Audio` element when the runtime provides
     * `Blob`, `URL` and `Audio`. Internal helper shared by speak() and
     * speakStreamed().
     *
     * The object URL is only created when playback is actually possible and is
     * revoked when playback ends or when play() is rejected.
     * @param audioBytes Audio data to play (wrapped in an "audio/wav" blob)
     * @returns Promise resolving to true once playback has started, or to false
     * when the runtime cannot play audio (nothing is created in that case)
     */
    async _playAudioBytes(audioBytes) {
        const canPlay = typeof Blob !== "undefined" &&
            typeof URL !== "undefined" &&
            typeof Audio !== "undefined";
        if (!canPlay) {
            return false;
        }
        const blob = new Blob([audioBytes], { type: "audio/wav" }); // default to WAV
        const url = URL.createObjectURL(blob);
        const audio = new Audio(url);
        this.audio.audioElement = audio;
        this.audio.isPlaying = true;
        this.audio.isPaused = false;
        audio.onended = () => {
            this.emit("end");
            this.audio.isPlaying = false;
            URL.revokeObjectURL(url); // Clean up the URL
        };
        try {
            await audio.play();
        }
        catch (error) {
            // play() can be rejected (e.g. by an autoplay policy); onended will then
            // never run, so undo the state change and release the URL here.
            if (this.audio.audioElement === audio) {
                this.audio.isPlaying = false;
            }
            URL.revokeObjectURL(url);
            throw error;
        }
        return true;
    }
    /**
     * Speak text using streaming synthesis.
     *
     * The stream is read to completion before playback starts. Word boundaries
     * delivered with the stream replace the estimated timings.
     * @param text Text or SSML to speak
     * @param options Synthesis options
     * @returns Promise resolving when audio playback starts
     */
    async speakStreamed(text, options) {
        // Word boundary offsets/durations are documented as 100-nanosecond units
        const TICKS_PER_SECOND = 10000000;
        this.emit("start");
        try {
            const streamResult = await this.synthToBytestream(text, options);
            // The result is either a bare stream or { audioStream, wordBoundaries }
            let audioStream;
            let wordBoundaries = [];
            if ("audioStream" in streamResult) {
                audioStream = streamResult.audioStream;
                wordBoundaries = streamResult.wordBoundaries || [];
            }
            else {
                audioStream = streamResult;
            }
            // Read all chunks from the stream
            const reader = audioStream.getReader();
            const chunks = [];
            let result = await reader.read();
            while (!result.done) {
                chunks.push(result.value);
                result = await reader.read();
            }
            // Combine chunks into a single audio buffer
            const totalLength = chunks.reduce((acc, chunk) => acc + chunk.length, 0);
            const audioBytes = new Uint8Array(totalLength);
            let offset = 0;
            for (const chunk of chunks) {
                audioBytes.set(chunk, offset);
                offset += chunk.length;
            }
            if (wordBoundaries.length > 0) {
                // Timings are kept in seconds, the unit used by the estimated timings
                // and by startPlaybackWithCallbacks (which multiplies by 1000 for ms).
                this.timings = wordBoundaries.map((wb) => [
                    wb.offset / TICKS_PER_SECOND,
                    (wb.offset + wb.duration) / TICKS_PER_SECOND,
                    wb.text,
                ]);
            }
            else {
                this._createEstimatedWordTimings(text);
            }
            const playbackStarted = await this._playAudioBytes(audioBytes);
            if (!playbackStarted) {
                // No audio output (e.g. Node.js): report the boundaries and the end
                // shortly after returning instead of during playback.
                setTimeout(() => {
                    this._fireWordBoundaryCallbacks();
                    this.emit("end");
                }, 100);
            }
        }
        catch (error) {
            console.error("Error in streaming synthesis:", error);
            this.emit("end"); // Ensure end event is triggered even on error
            throw error;
        }
    }
    /**
     * Synthesize text to audio and save it to a file (browser download)
     * @param text Text or SSML to synthesize
     * @param filename Filename to save as; the format extension is appended when missing
     * @param format Audio format (mp3 or wav)
     * @param options Synthesis options
     * @throws Error when `document`, `Blob` or `URL` is unavailable
     */
    async synthToFile(text, filename, format = "wav", options) {
        // Fail before synthesizing: the download needs DOM APIs, and without this
        // check the call would run a full synthesis and then die on `document`.
        if (typeof document === "undefined" ||
            typeof Blob === "undefined" ||
            typeof URL === "undefined") {
            throw new Error("synthToFile requires a browser environment (document, Blob and URL)");
        }
        const audioBytes = await this.synthToBytes(text, options);
        // Create blob with appropriate MIME type
        const mimeType = format === "mp3" ? "audio/mpeg" : "audio/wav";
        const blob = new Blob([audioBytes], { type: mimeType });
        // Create download link
        const url = URL.createObjectURL(blob);
        const a = document.createElement("a");
        a.href = url;
        a.download = filename.endsWith(`.${format}`) ? filename : `${filename}.${format}`;
        // Trigger download
        document.body.appendChild(a);
        a.click();
        // Clean up after the click has been dispatched
        setTimeout(() => {
            document.body.removeChild(a);
            URL.revokeObjectURL(url);
        }, 100);
    }
    /**
     * Set the voice to use for synthesis
     * @param voiceId Voice ID to use
     * @param lang Language code (optional); the current language is kept when omitted
     */
    setVoice(voiceId, lang) {
        this.voiceId = voiceId;
        if (lang) {
            this.lang = lang;
        }
    }
    // --- Playback control methods ---
    /**
     * Pause audio playback (no-op unless audio is playing and not already paused)
     */
    pause() {
        if (this.audio.audioElement && this.audio.isPlaying && !this.audio.isPaused) {
            this.audio.audioElement.pause();
            this.audio.isPaused = true;
        }
    }
    /**
     * Resume audio playback (no-op unless audio is playing and currently paused)
     */
    resume() {
        if (this.audio.audioElement && this.audio.isPlaying && this.audio.isPaused) {
            this.audio.audioElement.play();
            this.audio.isPaused = false;
        }
    }
    /**
     * Stop audio playback and rewind the audio element to the beginning
     */
    stop() {
        if (this.audio.audioElement) {
            this.audio.audioElement.pause();
            this.audio.audioElement.currentTime = 0;
            this.audio.isPlaying = false;
            this.audio.isPaused = false;
        }
    }
    /**
     * Create estimated word timings for non-streaming engines.
     *
     * Replaces `this.timings` with evenly spaced [start, end, word] entries in
     * seconds. Text without any words clears the timings, so that timings of a
     * previous utterance are never reused.
     * @param text Text (or SSML) to create timings for
     */
    _createEstimatedWordTimings(text) {
        // Extract plain text from SSML if needed
        const plainText = this._isSSML(text) ? this._stripSSML(text) : text;
        const words = plainText.split(/\s+/).filter((word) => word.length > 0);
        this.timings = [];
        if (!words.length) {
            return;
        }
        // Estimate duration (assuming average speaking rate)
        const estimatedDuration = words.length * 0.3; // ~300ms per word
        const wordDuration = estimatedDuration / words.length;
        for (let i = 0; i < words.length; i++) {
            this.timings.push([i * wordDuration, (i + 1) * wordDuration, words[i]]);
        }
    }
    /**
     * Fire word boundary callbacks based on timing data.
     *
     * Calls every "boundary" callback once per timing entry, synchronously and
     * in order, as callback(word, start, end).
     */
    _fireWordBoundaryCallbacks() {
        if (!this.timings.length) {
            return;
        }
        const callbacks = this.callbacks["boundary"] || [];
        if (!callbacks.length) {
            return;
        }
        for (const [start, end, word] of this.timings) {
            for (const callback of callbacks) {
                callback(word, start, end);
            }
        }
    }
    /**
     * Check if text is SSML
     * @param text Text to check
     * @returns True if text is SSML
     */
    _isSSML(text) {
        return SSMLUtils.isSSML(text);
    }
    /**
     * Strip SSML tags from text
     * @param ssml SSML text
     * @returns Plain text without SSML tags
     */
    _stripSSML(ssml) {
        return SSMLUtils.stripSSML(ssml);
    }
    // --- Event system ---
    /**
     * Register a callback for an event.
     * Events used by this class are "start", "end" and "boundary".
     * @param event Event type
     * @param fn Callback function
     */
    on(event, fn) {
        this.callbacks[event] = this.callbacks[event] || [];
        this.callbacks[event].push(fn);
    }
    /**
     * Emit an event to all registered callbacks, in registration order
     * @param event Event type
     * @param args Event arguments, passed through to each callback
     */
    emit(event, ...args) {
        for (const fn of this.callbacks[event] || []) {
            fn(...args);
        }
    }
    /**
     * Start playback with word boundary callbacks.
     *
     * After speak() resolves, one timer per timing entry is scheduled at the
     * word's start time (timings are in seconds, timers in milliseconds).
     * @param text Text or SSML to speak
     * @param callback Callback function for word boundaries, called as callback(word, start, end)
     * @param options Synthesis options
     */
    async startPlaybackWithCallbacks(text, callback, options) {
        await this.speak(text, options);
        for (const [start, end, word] of this.timings) {
            setTimeout(() => {
                callback(word, start, end);
            }, start * 1000);
        }
    }
    /**
     * Connect a callback to an event using the "onXxx" naming
     * @param event Event name: "onStart", "onEnd" or "onBoundary"; other names are ignored
     * @param callback Callback function
     */
    connect(event, callback) {
        switch (event) {
            case "onStart":
                this.on("start", callback);
                break;
            case "onEnd":
                this.on("end", callback);
                break;
            case "onBoundary":
                // Matches the "boundary" key read by _fireWordBoundaryCallbacks
                this.on("boundary", callback);
                break;
            default:
                break;
        }
    }
    /**
     * Get the value of a property
     * @param propertyName Property name
     * @returns Property value
     */
    getProperty(propertyName) {
        return this.properties[propertyName];
    }
    /**
     * Set a property value
     * @param propertyName Property name
     * @param value Property value
     */
    setProperty(propertyName, value) {
        this.properties[propertyName] = value;
    }
    /**
     * Create a prosody tag with the current properties.
     * Unset properties are left out; a volume of 0 is kept.
     * @param text Text to wrap with prosody
     * @returns Text with prosody tag, or the text unchanged when no property is set
     */
    constructProsodyTag(text) {
        const { rate, pitch, volume } = this.properties;
        const attrs = [];
        if (rate) {
            attrs.push(`rate="${rate}"`);
        }
        if (pitch) {
            attrs.push(`pitch="${pitch}"`);
        }
        // 0 is a meaningful volume and must not be dropped by a truthiness test
        if (volume || volume === 0) {
            attrs.push(`volume="${volume}%"`);
        }
        if (attrs.length === 0) {
            return text;
        }
        return `<prosody ${attrs.join(" ")}>${text}</prosody>`;
    }
    /**
     * Check if credentials are valid by requesting the provider's voice list
     * @returns Promise resolving to true if at least one voice is returned, false
     * if the list is empty or the request throws (the error is logged)
     */
    async checkCredentials() {
        try {
            const voices = await this._getVoices();
            return voices.length > 0;
        }
        catch (error) {
            console.error("Error checking credentials:", error);
            return false;
        }
    }
    /**
     * Get available voices for a specific language
     * @param language Language code (BCP-47 format, e.g., 'en-US')
     * @returns Promise resolving to an array of available voices for the specified language
     */
    async getVoicesByLanguage(language) {
        // Normalize the input so it is compared in the same form as the voices
        const normalizedLanguage = language_utils_1.LanguageNormalizer.normalize(language);
        const voices = await this.getVoices();
        return voices.filter((voice) => voice.languageCodes.some((lang) =>
        // Match by BCP-47 code
        lang.bcp47 === normalizedLanguage.bcp47 ||
            // Or by ISO 639-3 code
            lang.iso639_3 === normalizedLanguage.iso639_3));
    }
}
exports.AbstractTTSClient = AbstractTTSClient;