js-tts-wrapper
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
JavaScript
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.AzureTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
/**
* Azure TTS client for the Microsoft Cognitive Services Text-to-Speech service.
* Uses the REST API by default, with the optional Speech SDK (Node.js only) for word boundary events.
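*
* A minimal usage sketch; assumes valid Azure credentials and an async calling
* context ("eastus" is a placeholder region):
* @example
* const tts = new AzureTTSClient({ subscriptionKey: "<your-key>", region: "eastus" });
* const bytes = await tts.synthToBytes("Hello world", { format: "mp3" });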
*/
class AzureTTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new Azure TTS client
* @param credentials Azure credentials object with subscriptionKey and region
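* @example
* // Sketch with placeholder credentials; both fields are required:
* const tts = new AzureTTSClient({ subscriptionKey: "<your-key>", region: "eastus" });
* // The default voice is "en-US-AriaNeural" until another voice is selected.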
*/
constructor(credentials) {
super(credentials);
Object.defineProperty(this, "subscriptionKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "region", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "sdk", {
enumerable: true,
configurable: true,
writable: true,
value: null
}); // Store loaded SDK instance
Object.defineProperty(this, "sdkLoadingPromise", {
enumerable: true,
configurable: true,
writable: true,
value: null
}); // Track loading
// Credentials are expected to match the AzureTTSCredentials interface (subscriptionKey and region)
this.subscriptionKey = credentials.subscriptionKey;
this.region = credentials.region;
// Set a default voice for Azure TTS
this.voiceId = "en-US-AriaNeural";
}
/**
* Get raw voices from Azure
* @returns Promise resolving to an array of raw Azure voice objects
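* @example
* // Shape of one raw entry (only the fields this client reads are shown):
* // { ShortName: "en-US-AriaNeural", DisplayName: "Aria", Gender: "Female",
* //   Locale: "en-US", LocaleName: "English (United States)" }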
*/
async _getVoices() {
try {
const response = await fetch(`https://${this.region}.tts.speech.microsoft.com/cognitiveservices/voices/list`, {
method: "GET",
headers: {
"Ocp-Apim-Subscription-Key": this.subscriptionKey,
},
});
if (!response.ok) {
throw new Error(`Failed to fetch voices: ${response.statusText}`);
}
return await response.json();
}
catch (error) {
console.error("Error fetching Azure voices:", error);
return [];
}
}
/**
* Map Azure voice objects to unified format
* @param rawVoices Array of Azure voice objects
* @returns Promise resolving to an array of unified voice objects
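* @example
* // A raw entry with ShortName "en-US-AriaNeural" maps to:
* // { id: "en-US-AriaNeural", name: "Aria", gender: "Female", provider: "azure",
* //   languageCodes: [{ bcp47: "en-US", iso639_3: "en", display: "English (United States)" }] }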
*/
async _mapVoicesToUnified(rawVoices) {
// Transform Azure voices to unified format
return rawVoices.map((voice) => ({
id: voice.ShortName,
name: voice.DisplayName,
gender: voice.Gender === "Female" ? "Female" : voice.Gender === "Male" ? "Male" : "Unknown",
provider: "azure",
languageCodes: [
{
bcp47: voice.Locale,
iso639_3: voice.Locale.split("-")[0], // Primary language subtag (e.g. "en"); not a true ISO 639-3 code
display: voice.LocaleName,
},
],
}));
}
/**
* Synthesize text to audio bytes
* @param text Text or SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
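* @example
* // Sketch: synthesize and save to disk in Node.js:
* const bytes = await tts.synthToBytes("Hello world", { format: "mp3" });
* require("node:fs").writeFileSync("hello.mp3", bytes);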
*/
async synthToBytes(text, options) {
const ssml = await this.prepareSSML(text, options);
try {
const response = await fetch(`https://${this.region}.tts.speech.microsoft.com/cognitiveservices/v1`, {
method: "POST",
headers: {
"Ocp-Apim-Subscription-Key": this.subscriptionKey,
"Content-Type": "application/ssml+xml",
"X-Microsoft-OutputFormat": options?.format === "mp3"
? "audio-24khz-96kbitrate-mono-mp3"
: "riff-24khz-16bit-mono-pcm",
"User-Agent": "js-tts-wrapper",
},
body: ssml,
});
if (!response.ok) {
throw new Error(`Failed to synthesize speech: ${response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
return new Uint8Array(arrayBuffer);
}
catch (error) {
console.error("Error synthesizing speech:", error);
throw error;
}
}
/**
* Synthesize text to a byte stream with word boundary information
* @param text Text or SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and word boundary information
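* @example
* // Sketch: drain the stream (both code paths return a Web ReadableStream):
* const { audioStream, wordBoundaries } = await tts.synthToBytestream("Hello world");
* const reader = audioStream.getReader();
* for (let chunk = await reader.read(); !chunk.done; chunk = await reader.read()) {
*   // chunk.value is a Uint8Array of audio data
* }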
*/
async synthToBytestream(text, options) {
const ssml = await this.prepareSSML(text, options);
const useWordBoundary = options?.useWordBoundary !== false; // Default to true
// Attempt to load SDK if needed for word boundaries in Node.js
let sdkInstance = null;
if (useWordBoundary && typeof window === "undefined") {
sdkInstance = await this.loadSDK();
}
// If the SDK is available and word boundary information is requested, use the SDK
if (sdkInstance && useWordBoundary) {
return this.synthToBytestreamWithSDK(ssml, options, sdkInstance);
}
// Otherwise, fall back to the REST API (which doesn't provide word boundaries)
return this.synthToBytestreamWithREST(ssml, options);
}
/**
* Load the Microsoft Speech SDK dynamically.
* @returns A promise resolving to the SDK module, or null if loading fails or not applicable.
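*
* The SDK package is optional; if it is not installed, the client falls back
* to the REST API (without word boundaries). To enable it in Node.js:
* @example
* // npm install microsoft-cognitiveservices-speech-sdk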
*/
async loadSDK() {
if (this.sdk) {
return this.sdk;
}
if (this.sdkLoadingPromise) {
return this.sdkLoadingPromise;
}
// Only attempt dynamic import in Node.js environment
if (typeof window !== "undefined") {
console.warn("Microsoft Speech SDK dynamic import skipped in browser environment.");
return null;
}
// @ts-ignore - Suppress module not found error for SDK types during build
this.sdkLoadingPromise = Promise.resolve()
.then(() => __importStar(require("microsoft-cognitiveservices-speech-sdk")))
.then((sdkModule) => {
this.sdk = sdkModule;
this.sdkLoadingPromise = null; // Reset promise after successful load
console.log("Microsoft Speech SDK loaded successfully.");
return this.sdk;
})
.catch(_error => {
// Log the actual error for debugging if needed: console.error("SDK Load Error:", _error);
console.warn("microsoft-cognitiveservices-speech-sdk not found or failed to load, using REST API fallback for word boundaries.");
this.sdkLoadingPromise = null; // Reset promise on error
this.sdk = null; // Ensure SDK is null if loading failed
return null; // Indicate SDK load failed
});
return this.sdkLoadingPromise;
}
/**
* Synthesize speech using the Microsoft Cognitive Services Speech SDK
* @param ssml SSML to synthesize
* @param options Synthesis options
* @param sdkInstance The loaded SDK instance.
* @returns Promise resolving to an object containing the audio stream and word boundary information
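* @example
* // Shape of each word boundary entry (offset and duration in milliseconds):
* // { text: "Hello", offset: 50, duration: 120 }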
*/
async synthToBytestreamWithSDK(ssml, options, sdkInstance) {
try {
if (!sdkInstance) { // Should not happen if called correctly, but good practice
throw new Error("Attempted to use SDK method, but SDK instance is missing.");
}
// Create a speech config
const speechConfig = sdkInstance.SpeechConfig.fromSubscription(this.subscriptionKey, this.region);
// Set the output format
speechConfig.speechSynthesisOutputFormat = options?.format === "mp3"
? sdkInstance.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3
: sdkInstance.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm;
// Set the voice
if (this.voiceId) {
speechConfig.speechSynthesisVoiceName = this.voiceId;
}
// Create a synthesizer
const synthesizer = new sdkInstance.SpeechSynthesizer(speechConfig);
// Create a promise that will resolve with the audio data and word boundaries
return new Promise((resolve, reject) => {
const wordBoundaries = [];
const audioChunks = [];
// Set up the word boundary event handler
synthesizer.wordBoundary = (_s, e) => {
wordBoundaries.push({
text: e.text,
offset: e.audioOffset / 10000, // Convert from 100-nanosecond ticks to milliseconds
duration: 0, // Duration is not provided by the SDK
});
};
// Set up the synthesizing event handler to collect audio chunks
synthesizer.synthesizing = (_s, e) => {
if (e.result.reason === sdkInstance.ResultReason.SynthesizingAudio) {
audioChunks.push(new Uint8Array(e.result.audioData));
}
};
// Start the synthesis
synthesizer.speakSsmlAsync(ssml, (result) => {
// Synthesis completed
synthesizer.close();
if (result.reason === sdkInstance.ResultReason.SynthesizingAudioCompleted) {
// Add the final audio chunk
audioChunks.push(new Uint8Array(result.audioData));
// Create a readable stream from the audio chunks
const stream = new ReadableStream({
start(controller) {
for (const chunk of audioChunks) {
controller.enqueue(chunk);
}
controller.close();
},
});
// Calculate durations for word boundaries: each word lasts until the next one starts
for (let i = 0; i < wordBoundaries.length - 1; i++) {
wordBoundaries[i].duration =
wordBoundaries[i + 1].offset - wordBoundaries[i].offset;
}
// Estimate duration for the last (or only) word, which the SDK does not report
if (wordBoundaries.length > 0) {
const lastWord = wordBoundaries[wordBoundaries.length - 1];
lastWord.duration = 500; // Estimate 500ms for the last word
}
resolve({
audioStream: stream,
wordBoundaries,
});
}
else {
reject(new Error(`Synthesis failed: ${result.errorDetails}`));
}
}, (error) => {
// Synthesis error
synthesizer.close();
reject(error);
});
});
}
catch (error) {
console.error("Error synthesizing speech with SDK:", error);
throw error;
}
}
/**
* Synthesize speech using the REST API (no word boundaries)
* @param ssml SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and an empty word boundary array
*/
async synthToBytestreamWithREST(ssml, options) {
try {
// Use the standard endpoint
const endpoint = `https://${this.region}.tts.speech.microsoft.com/cognitiveservices/v1`;
const response = await fetch(endpoint, {
method: "POST",
headers: {
"Ocp-Apim-Subscription-Key": this.subscriptionKey,
"Content-Type": "application/ssml+xml",
"X-Microsoft-OutputFormat": options?.format === "mp3"
? "audio-24khz-96kbitrate-mono-mp3"
: "riff-24khz-16bit-mono-pcm",
"User-Agent": "js-tts-wrapper",
},
body: ssml,
});
if (!response.ok) {
throw new Error(`Failed to synthesize speech: ${response.statusText}`);
}
// No word boundary information is available with the REST API
const wordBoundaries = [];
return {
audioStream: response.body,
wordBoundaries,
};
}
catch (error) {
console.error("Error synthesizing speech with REST API:", error);
throw error;
}
}
/**
* Start playback with word boundary callbacks
* @param text Text or SSML to speak
* @param callback Callback function for word boundaries
* @param options Synthesis options
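* @example
* // Sketch: log each word as it is spoken (start/end are in milliseconds):
* await tts.startPlaybackWithCallbacks("Hello world", (word, start, end) => {
*   console.log(`${word}: ${start}ms -> ${end}ms`);
* });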
*/
async startPlaybackWithCallbacks(text, callback, options) {
// If the SDK has already been loaded (e.g. by a prior synthToBytestream call), use it for better word boundary support
if (this.sdk) {
await this.startPlaybackWithCallbacksSDK(text, callback, options);
}
else {
// Fall back to the abstract implementation
// Register the callback
this.on("boundary", callback);
// Enable word boundary information
const enhancedOptions = { ...options, useWordBoundary: true };
// Start playback with word boundary information
await this.speakStreamed(text, enhancedOptions);
}
}
/**
* Start playback with word boundary callbacks using the SDK
* @param text Text or SSML to speak
* @param callback Callback function for word boundaries
* @param options Synthesis options
*/
async startPlaybackWithCallbacksSDK(text, callback, options) {
const ssml = await this.prepareSSML(text, options);
try {
// Create a speech config
const speechConfig = this.sdk.SpeechConfig.fromSubscription(this.subscriptionKey, this.region);
// Set the output format
speechConfig.speechSynthesisOutputFormat = options?.format === "mp3"
? this.sdk.SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3
: this.sdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm;
// Set the voice
if (this.voiceId) {
speechConfig.speechSynthesisVoiceName = this.voiceId;
}
// Create an audio config for playback
const audioConfig = this.sdk.AudioConfig.fromDefaultSpeakerOutput();
// Create a synthesizer
const synthesizer = new this.sdk.SpeechSynthesizer(speechConfig, audioConfig);
// Emit the start event
this.emit("start");
// Set up the word boundary event handler
synthesizer.wordBoundary = (_s, e) => {
// Call the callback with the word boundary information
const offset = e.audioOffset / 10000; // Convert from 100-nanosecond ticks to milliseconds
const duration = 500; // Estimate 500ms for each word
// Store the word boundary information for internal use
this.timings.push([offset, offset + duration, e.text]);
// Call the callback with the word boundary information
callback(e.text, offset, offset + duration);
};
// Set up the synthesis completed event handler
synthesizer.synthesisCompleted = (_s, _e) => {
// Emit the end event
this.emit("end");
// Close the synthesizer
synthesizer.close();
};
// Start the synthesis
await new Promise((resolve, reject) => {
synthesizer.speakSsmlAsync(ssml, () => {
resolve();
}, (error) => {
reject(error);
});
});
}
catch (error) {
console.error("Error starting playback with callbacks:", error);
throw error;
}
}
/**
* Prepare SSML for synthesis
* @param text Text or SSML to prepare
* @param options Synthesis options
* @returns SSML ready for synthesis
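* @example
* // Roughly, plain text "Hello" with the default voice becomes:
* // <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
* //   <voice name="en-US-AriaNeural">Hello</voice>
* // </speak>
* // (xml:lang comes from this.lang; "en-US" here is illustrative)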
*/
async prepareSSML(text, options) {
// Convert from Speech Markdown if requested
if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(text)) {
const ssmlText = await SpeechMarkdown.toSSML(text, "microsoft-azure");
text = ssmlText;
}
// Ensure text is wrapped in SSML
let ssml = SSMLUtils.isSSML(text) ? text : SSMLUtils.wrapWithSpeakTags(text);
// Use voice from options or the default voice
const voiceId = options?.voice || this.voiceId;
// Always add the required SSML attributes for Azure (assumes the <speak> tag carries no attributes of its own, as produced by wrapWithSpeakTags)
ssml = ssml.replace("<speak", `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${this.lang}"`);
// Add voice selection; a voice is always available because the constructor sets a default
if (voiceId) {
// Insert voice tag before the content
ssml = ssml.replace(">", `><voice name="${voiceId}">`);
// Close voice tag before </speak>
ssml = ssml.replace("</speak>", "</voice></speak>");
}
// Add prosody if properties are set
if (this.properties.rate || this.properties.pitch || this.properties.volume) {
// Extract content between voice tags or speak tags
let content = "";
if (ssml.includes("<voice")) {
const match = ssml.match(/<voice[^>]*>(.*?)<\/voice>/s);
if (match) {
content = match[1];
const prosodyContent = this.constructProsodyTag(content);
ssml = ssml.replace(content, prosodyContent);
}
}
else {
const match = ssml.match(/<speak[^>]*>(.*?)<\/speak>/s);
if (match) {
content = match[1];
const prosodyContent = this.constructProsodyTag(content);
ssml = ssml.replace(content, prosodyContent);
}
}
}
// Also add prosody from options if provided
if (options?.rate || options?.pitch || options?.volume !== undefined) {
// Create prosody attributes
const attrs = [];
if (options.rate)
attrs.push(`rate="${options.rate}"`);
if (options.pitch)
attrs.push(`pitch="${options.pitch}"`);
if (options.volume !== undefined)
attrs.push(`volume="${options.volume}%"`);
if (attrs.length > 0) {
// Extract content
const match = ssml.match(/<speak[^>]*>(.*?)<\/speak>/s);
if (match) {
const content = match[1];
const prosodyContent = `<prosody ${attrs.join(" ")}>${content}</prosody>`;
ssml = ssml.replace(content, prosodyContent);
}
}
}
return ssml;
}
}
exports.AzureTTSClient = AzureTTSClient;