js-tts-wrapper
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.WatsonTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
/**
* IBM Watson TTS Client
*/
class WatsonTTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new IBM Watson TTS client
* @param credentials Watson credentials object with apiKey, region, and instanceId
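* @example
* // Illustrative sketch; the region and instanceId values are placeholders
* // taken from the service credentials page in IBM Cloud.
* const client = new WatsonTTSClient({
*   apiKey: "YOUR_IAM_API_KEY",
*   region: "us-south",
*   instanceId: "your-service-instance-id",
* });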
*/
constructor(credentials) {
super(credentials);
Object.defineProperty(this, "apiKey", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "region", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "instanceId", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
// Word boundaries from the last synthesis
Object.defineProperty(this, "wordBoundaries", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
Object.defineProperty(this, "iamToken", {
enumerable: true,
configurable: true,
writable: true,
value: null
});
Object.defineProperty(this, "wsUrl", {
enumerable: true,
configurable: true,
writable: true,
value: null
});
this.apiKey = credentials.apiKey;
this.region = credentials.region;
this.instanceId = credentials.instanceId;
// SSL verification can be disabled via the credentials, but that option is not used directly in the browser
this.sampleRate = 22050; // Default sample rate for Watson TTS
}
/**
* Get raw voices from Watson
* @returns Promise resolving to an array of unified voice objects
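* @example
* // The /v1/voices endpoint responds with { "voices": [ ... ] }; each raw entry
* // is later passed through _mapVoicesToUnified (see its example for the assumed shape).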
*/
async _getVoices() {
try {
// Ensure we have a valid IAM token
await this._refreshIAMToken();
const response = await fetch(`https://api.${this.region}.text-to-speech.watson.cloud.ibm.com/v1/voices`, {
method: "GET",
headers: {
Authorization: `Bearer ${this.iamToken}`,
"Content-Type": "application/json",
},
});
if (!response.ok) {
throw new Error(`Failed to fetch voices: ${response.statusText}`);
}
const data = await response.json();
return data.voices || [];
}
catch (error) {
console.error("Error fetching Watson voices:", error);
return [];
}
}
/**
* Map Watson voice objects to unified format
* @param rawVoices Array of Watson voice objects
* @returns Promise resolving to an array of unified voice objects
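* @example
* // Illustrative mapping (input shape assumed from typical Watson voice objects):
* // { name: "en-US_AllisonV3Voice", language: "en-US", gender: "female", description: "Allison: American English female voice." }
* // becomes
* // { id: "en-US_AllisonV3Voice", name: "Allison", gender: "Female", provider: "ibm",
* //   languageCodes: [{ bcp47: "en-US", iso639_3: "en", display: "Allison: American English female voice." }] }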
*/
async _mapVoicesToUnified(rawVoices) {
// Transform Watson voices to unified format
return rawVoices.map((voice) => ({
id: voice.name,
// Derive a display name from IDs like "en-US_AllisonV3Voice"; fall back to the full ID if the pattern differs
name: (voice.name.split("_")[1] || voice.name).replace("V3Voice", ""),
gender: voice.gender === "female" ? "Female" : voice.gender === "male" ? "Male" : "Unknown",
provider: "ibm",
languageCodes: [
{
bcp47: voice.language,
iso639_3: voice.language.split("-")[0], // Simple extraction of language code
display: voice.description || voice.language,
},
],
}));
}
/**
* Refresh the IAM token for Watson API
* @returns Promise resolving when token is refreshed
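* @remarks IAM access tokens are short-lived (commonly about an hour), so the
* synthesis methods refresh the token before each request rather than caching it.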
*/
async _refreshIAMToken() {
try {
const response = await fetch("https://iam.cloud.ibm.com/identity/token", {
method: "POST",
headers: {
"Content-Type": "application/x-www-form-urlencoded",
},
body: new URLSearchParams({
apikey: this.apiKey,
grant_type: "urn:ibm:params:oauth:grant-type:apikey",
}),
});
if (!response.ok) {
throw new Error(`Failed to refresh IAM token: ${response.statusText}`);
}
const data = await response.json();
this.iamToken = data.access_token;
// Construct the WebSocket URL for streaming
this.wsUrl = `wss://api.${this.region}.text-to-speech.watson.cloud.ibm.com/instances/${this.instanceId}/v1/synthesize`;
}
catch (error) {
console.error("Error refreshing IAM token:", error);
throw error;
}
}
/**
* Prepare SSML for synthesis
* @param text Text or SSML to prepare
* @param options Synthesis options
* @returns SSML string ready for synthesis
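* @example
* // Illustrative result (sketch; the exact wrapper output depends on the SSML builder):
* // await client.prepareSSML("Hello world", { rate: "slow" })
* // => '<speak><voice name="en-US_AllisonV3Voice"><prosody rate="slow">Hello world</prosody></voice></speak>'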
*/
async prepareSSML(text, options) {
// Use the provided voice or the one set with setVoice
const voice = options?.voice || this.voiceId;
// Check if the input is already SSML
const isSSML = SSMLUtils.isSSML(text);
let processedText = text;
// If the input is SpeechMarkdown and useSpeechMarkdown is enabled, convert it to SSML
if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
processedText = await SpeechMarkdown.toSSML(processedText);
}
// If the input is already SSML, use it directly
if (isSSML) {
return processedText;
}
// Otherwise, create SSML from plain text
this.ssml.clearSSML();
// Create SSML with voice and prosody
let ssmlContent = processedText;
// Apply prosody settings if specified (volume can legitimately be 0, so test against undefined)
if (options?.rate || options?.pitch || options?.volume !== undefined) {
const prosodyAttrs = [];
if (options.rate)
prosodyAttrs.push(`rate="${options.rate}"`);
if (options.pitch)
prosodyAttrs.push(`pitch="${options.pitch}"`);
if (options.volume !== undefined)
prosodyAttrs.push(`volume="${options.volume}%"`);
ssmlContent = `<prosody ${prosodyAttrs.join(" ")}>${ssmlContent}</prosody>`;
}
// Add voice tag
ssmlContent = `<voice name="${voice || "en-US_AllisonV3Voice"}">${ssmlContent}</voice>`;
// Wrap with speak tags
return this.ssml.wrapWithSpeak(ssmlContent);
}
// Using the checkCredentials method from AbstractTTSClient
/**
* Synthesize text to audio bytes
* @param text Text or SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
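* @example
* // Sketch: save the synthesized WAV audio to a file in Node.js.
* const bytes = await client.synthToBytes("Hello world", { voice: "en-US_AllisonV3Voice" });
* require("node:fs").writeFileSync("hello.wav", Buffer.from(bytes));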
*/
async synthToBytes(text, options) {
try {
// Ensure we have a valid IAM token
await this._refreshIAMToken();
// Prepare SSML for synthesis
const ssml = await this.prepareSSML(text, options);
// Use provided voice_id or the one set with setVoice
const voice = options?.voice || this.voiceId || "en-US_AllisonV3Voice";
const response = await fetch(`https://api.${this.region}.text-to-speech.watson.cloud.ibm.com/v1/synthesize`, {
method: "POST",
headers: {
Authorization: `Bearer ${this.iamToken}`,
"Content-Type": "application/json",
Accept: "audio/wav",
},
body: JSON.stringify({
text: ssml,
voice: voice,
accept: "audio/wav",
}),
});
if (!response.ok) {
throw new Error(`Failed to synthesize speech: ${response.statusText}`);
}
const arrayBuffer = await response.arrayBuffer();
return new Uint8Array(arrayBuffer);
}
catch (error) {
console.error("Error synthesizing speech:", error);
throw error;
}
}
/**
* Synthesize text to a byte stream with word boundary information
* @param text Text or SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and word boundary information
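* @example
* // Sketch: drain the stream with a reader and collect the audio chunks.
* const { audioStream, wordBoundaries } = await client.synthToBytestream("Hello world");
* const reader = audioStream.getReader();
* const chunks = [];
* for (;;) {
*   const { done, value } = await reader.read();
*   if (done) break;
*   chunks.push(value);
* }
* console.log(wordBoundaries.length, "word boundaries");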
*/
async synthToBytestream(text, options) {
// Ensure we have a valid IAM token
await this._refreshIAMToken();
// Prepare SSML for synthesis
const ssml = await this.prepareSSML(text, options);
// Use provided voice_id or the one set with setVoice
const voice = options?.voice || this.voiceId || "en-US_AllisonV3Voice";
// Reset word boundaries
this.wordBoundaries = [];
// Check if we're in a browser environment
if (typeof window !== "undefined" && "WebSocket" in window) {
return this._synthToBytestreamWithBrowserWebSocket(ssml, voice);
}
// In Node.js environment, use the REST API
return this._synthToBytestreamWithREST(ssml, options);
}
/**
* Synthesize text to a byte stream using the WebSocket API in browser
* @param ssml SSML to synthesize
* @param voice Voice to use
* @returns Promise resolving to an object containing the audio stream and word boundary information
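* @example
* // A timings message from the service is parsed below as
* // { "words": [["Hello", 0.0, 0.32], ["world", 0.32, 0.61]] }
* // i.e. (word, start seconds, end seconds); shape inferred from the handler's indexing.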
*/
async _synthToBytestreamWithBrowserWebSocket(ssml, voice) {
return new Promise((resolve, reject) => {
if (!this.wsUrl || !this.iamToken) {
reject(new Error("WebSocket URL or IAM token not available"));
return;
}
const ws = new WebSocket(`${this.wsUrl}?access_token=${this.iamToken}&voice=${voice}`);
const chunks = [];
const wordTimings = [];
ws.binaryType = "arraybuffer";
ws.onopen = () => {
const message = {
text: ssml,
accept: "audio/wav",
voice: voice,
timings: ["words"],
};
ws.send(JSON.stringify(message));
};
ws.onmessage = (event) => {
if (event.data instanceof ArrayBuffer) {
// Audio data
chunks.push(new Uint8Array(event.data));
}
else {
// Word timing data
try {
const data = JSON.parse(event.data);
if (data.words) {
for (const timing of data.words) {
wordTimings.push({
text: timing[0],
offset: timing[1] * 1000, // Convert to milliseconds
duration: (timing[2] - timing[1]) * 1000, // Convert to milliseconds
});
}
this.wordBoundaries = wordTimings;
}
}
catch (e) {
console.error("Error parsing WebSocket message:", e);
}
}
};
ws.onerror = () => {
// WebSocket error events carry no useful message, so reject with a real Error
reject(new Error("WebSocket error during Watson synthesis"));
};
ws.onclose = () => {
// Store word boundaries for later use
this.wordBoundaries = wordTimings;
// Create a ReadableStream from the collected chunks
const audioStream = new ReadableStream({
start(controller) {
for (const chunk of chunks) {
controller.enqueue(chunk);
}
controller.close();
},
});
resolve({
audioStream,
wordBoundaries: wordTimings,
});
};
});
}
/**
* Synthesize text to a byte stream using the REST API
* @param ssml SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and word boundary information
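* @example
* // The REST path has no real timing data, so boundaries are estimated at a flat
* // 0.3 s per word; e.g. "Hello world" yields roughly:
* // [{ text: "Hello", offset: 0, duration: 300 }, { text: "world", offset: 300, duration: 300 }]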
*/
async _synthToBytestreamWithREST(ssml, options) {
try {
// Use provided voice_id or the one set with setVoice
const voice = options?.voice || this.voiceId || "en-US_AllisonV3Voice";
const response = await fetch(`https://api.${this.region}.text-to-speech.watson.cloud.ibm.com/v1/synthesize`, {
method: "POST",
headers: {
Authorization: `Bearer ${this.iamToken}`,
"Content-Type": "application/json",
Accept: "audio/wav",
},
body: JSON.stringify({
text: ssml,
voice: voice,
accept: "audio/wav",
}),
});
if (!response.ok) {
throw new Error(`Failed to synthesize speech: ${response.statusText}`);
}
if (!response.body) {
throw new Error("Synthesis response did not include a readable body stream");
}
// Create estimated word timings based on text length
const words = ssml.replace(/<[^>]*>/g, "").split(/\s+/);
const estimatedDuration = 0.3; // Estimated duration per word in seconds
const wordBoundaries = [];
let currentTime = 0;
for (const word of words) {
if (word.trim()) {
wordBoundaries.push({
text: word,
offset: currentTime * 1000, // Convert to milliseconds
duration: estimatedDuration * 1000, // Convert to milliseconds
});
currentTime += estimatedDuration;
}
}
// Store word boundaries for later use
this.wordBoundaries = wordBoundaries;
return {
audioStream: response.body,
wordBoundaries,
};
}
catch (error) {
console.error("Error synthesizing speech:", error);
throw error;
}
}
/**
* Set the voice to use for synthesis
* @param voiceId Voice ID to use
* @param lang Optional language code; stored on the client when provided
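* @example
* // "en-GB_KateV3Voice" is a real Watson voice ID at the time of writing.
* client.setVoice("en-GB_KateV3Voice", "en-GB");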
*/
setVoice(voiceId, lang) {
this.voiceId = voiceId;
if (lang) {
this.lang = lang;
}
}
}
exports.WatsonTTSClient = WatsonTTSClient;
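/*
 * End-to-end usage (illustrative sketch; assumes the package's root export
 * exposes WatsonTTSClient and that the credentials are valid). Run inside an
 * async function:
 *
 * const { WatsonTTSClient } = require("js-tts-wrapper");
 * const tts = new WatsonTTSClient({ apiKey: "...", region: "us-south", instanceId: "..." });
 * tts.setVoice("en-US_AllisonV3Voice");
 * const audio = await tts.synthToBytes("Hello from Watson");
 */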