/**
 * js-tts-wrapper
 *
 * A JavaScript/TypeScript library that provides a unified API for working with
 * multiple cloud-based Text-to-Speech (TTS) services.
 */
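/*
 * Usage sketch (not part of the compiled module). A minimal example of driving
 * this client from an async context, assuming WatsonTTSClient is re-exported
 * from the package root; the credential values are hypothetical placeholders.
 *
 *   const { WatsonTTSClient } = require("js-tts-wrapper");
 *
 *   const tts = new WatsonTTSClient({
 *     apiKey: "<IBM_CLOUD_API_KEY>",       // placeholder
 *     region: "us-south",                  // placeholder
 *     instanceId: "<SERVICE_INSTANCE_ID>", // placeholder
 *   });
 *   tts.setVoice("en-US_AllisonV3Voice");
 *   const wavBytes = await tts.synthToBytes("Hello from Watson"); // Uint8Array of WAV data
 */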
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.WatsonTTSClient = void 0; const abstract_tts_1 = require("../core/abstract-tts"); const SSMLUtils = __importStar(require("../core/ssml-utils")); const SpeechMarkdown = __importStar(require("../markdown/converter")); /** * IBM Watson TTS Client */ class WatsonTTSClient extends abstract_tts_1.AbstractTTSClient { /** * Create a new IBM Watson TTS client * @param credentials Watson credentials object with apiKey, region, and instanceId */ constructor(credentials) { super(credentials); Object.defineProperty(this, "apiKey", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "region", { enumerable: true, configurable: true, writable: true, value: void 0 }); Object.defineProperty(this, "instanceId", { enumerable: true, configurable: true, writable: true, value: void 0 }); // Word boundaries from the last synthesis Object.defineProperty(this, "wordBoundaries", { enumerable: true, configurable: true, writable: true, value: [] }); Object.defineProperty(this, "iamToken", { enumerable: true, configurable: true, writable: true, value: null }); Object.defineProperty(this, "wsUrl", { enumerable: true, configurable: true, writable: true, value: null }); this.apiKey = credentials.apiKey; this.region = credentials.region; this.instanceId = credentials.instanceId; // SSL verification can be disabled but we don't use it directly in the browser this.sampleRate = 22050; // Default sample rate for Watson TTS } /** * Get raw voices from Watson * @returns Promise resolving to an array of unified voice objects */ async _getVoices() { try { // Ensure we have a valid IAM token await this._refreshIAMToken(); const response = await fetch(`https://api.${this.region}.text-to-speech.watson.cloud.ibm.com/v1/voices`, { method: "GET", headers: { Authorization: `Bearer ${this.iamToken}`, "Content-Type": "application/json", }, }); if (!response.ok) { throw new Error(`Failed to fetch voices: ${response.statusText}`); } const data = await response.json(); return data.voices || []; } catch (error) { console.error("Error fetching Watson voices:", error); return []; } } /** * Map Watson voice objects to unified format * @param rawVoices Array of Watson voice objects * @returns Promise resolving to an array of unified 
voice objects */ async _mapVoicesToUnified(rawVoices) { // Transform Watson voices to unified format return rawVoices.map((voice) => ({ id: voice.name, name: voice.name.split("_")[1].replace("V3Voice", ""), gender: voice.gender === "female" ? "Female" : voice.gender === "male" ? "Male" : "Unknown", provider: "ibm", languageCodes: [ { bcp47: voice.language, iso639_3: voice.language.split("-")[0], // Simple extraction of language code display: voice.description || voice.language, }, ], })); } /** * Refresh the IAM token for Watson API * @returns Promise resolving when token is refreshed */ async _refreshIAMToken() { try { const response = await fetch("https://iam.cloud.ibm.com/identity/token", { method: "POST", headers: { "Content-Type": "application/x-www-form-urlencoded", }, body: new URLSearchParams({ apikey: this.apiKey, grant_type: "urn:ibm:params:oauth:grant-type:apikey", }), }); if (!response.ok) { throw new Error(`Failed to refresh IAM token: ${response.statusText}`); } const data = await response.json(); this.iamToken = data.access_token; // Construct the WebSocket URL for streaming this.wsUrl = `wss://api.${this.region}.text-to-speech.watson.cloud.ibm.com/instances/${this.instanceId}/v1/synthesize`; } catch (error) { console.error("Error refreshing IAM token:", error); throw error; } } /** * Prepare SSML for synthesis * @param text Text or SSML to prepare * @param options Synthesis options * @returns SSML string ready for synthesis */ async prepareSSML(text, options) { // Use the provided voice or the one set with setVoice const voice = options?.voice || this.voiceId; // Check if the input is already SSML const isSSML = SSMLUtils.isSSML(text); let processedText = text; // If the input is SpeechMarkdown and useSpeechMarkdown is enabled, convert it to SSML if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { processedText = await SpeechMarkdown.toSSML(processedText); } // If the input is already SSML, use it directly if (isSSML) { return processedText; } // Otherwise, create SSML from plain text this.ssml.clearSSML(); // Create SSML with voice and prosody let ssmlContent = processedText; // Apply prosody settings if specified if (options?.rate || options?.pitch || options?.volume) { const prosodyAttrs = []; if (options.rate) prosodyAttrs.push(`rate="${options.rate}"`); if (options.pitch) prosodyAttrs.push(`pitch="${options.pitch}"`); if (options.volume !== undefined) prosodyAttrs.push(`volume="${options.volume}%"`); ssmlContent = `<prosody ${prosodyAttrs.join(" ")}>${ssmlContent}</prosody>`; } // Add voice tag ssmlContent = `<voice name="${voice || "en-US_AllisonV3Voice"}">${ssmlContent}</voice>`; // Wrap with speak tags return this.ssml.wrapWithSpeak(ssmlContent); } // Using the checkCredentials method from AbstractTTSClient /** * Synthesize text to audio bytes * @param text Text or SSML to synthesize * @param options Synthesis options * @returns Promise resolving to audio bytes */ async synthToBytes(text, options) { try { // Ensure we have a valid IAM token await this._refreshIAMToken(); // Prepare SSML for synthesis const ssml = await this.prepareSSML(text, options); // Use provided voice_id or the one set with setVoice const voice = options?.voice || this.voiceId || "en-US_AllisonV3Voice"; const response = await fetch(`https://api.${this.region}.text-to-speech.watson.cloud.ibm.com/v1/synthesize`, { method: "POST", headers: { Authorization: `Bearer ${this.iamToken}`, "Content-Type": "application/json", Accept: "audio/wav", }, body: JSON.stringify({ 
text: ssml, voice: voice, accept: "audio/wav", }), }); if (!response.ok) { throw new Error(`Failed to synthesize speech: ${response.statusText}`); } const arrayBuffer = await response.arrayBuffer(); return new Uint8Array(arrayBuffer); } catch (error) { console.error("Error synthesizing speech:", error); throw error; } } /** * Synthesize text to a byte stream with word boundary information * @param text Text or SSML to synthesize * @param options Synthesis options * @returns Promise resolving to an object containing the audio stream and word boundary information */ async synthToBytestream(text, options) { // Ensure we have a valid IAM token await this._refreshIAMToken(); // Prepare SSML for synthesis const ssml = await this.prepareSSML(text, options); // Use provided voice_id or the one set with setVoice const voice = options?.voice || this.voiceId || "en-US_AllisonV3Voice"; // Reset word boundaries this.wordBoundaries = []; // Check if we're in a browser environment if (typeof window !== "undefined" && "WebSocket" in window) { return this._synthToBytestreamWithBrowserWebSocket(ssml, voice); } // In Node.js environment, use the REST API return this._synthToBytestreamWithREST(ssml, options); } /** * Synthesize text to a byte stream using the WebSocket API in browser * @param ssml SSML to synthesize * @param voice Voice to use * @returns Promise resolving to an object containing the audio stream and word boundary information */ async _synthToBytestreamWithBrowserWebSocket(ssml, voice) { return new Promise((resolve, reject) => { if (!this.wsUrl || !this.iamToken) { reject(new Error("WebSocket URL or IAM token not available")); return; } const ws = new WebSocket(`${this.wsUrl}?access_token=${this.iamToken}&voice=${voice}`); const chunks = []; const wordTimings = []; ws.binaryType = "arraybuffer"; ws.onopen = () => { const message = { text: ssml, accept: "audio/wav", voice: voice, timings: ["words"], }; ws.send(JSON.stringify(message)); }; ws.onmessage = (event) => { if (event.data instanceof ArrayBuffer) { // Audio data chunks.push(new Uint8Array(event.data)); } else { // Word timing data try { const data = JSON.parse(event.data); if (data.words) { for (const timing of data.words) { wordTimings.push({ text: timing[0], offset: timing[1] * 1000, // Convert to milliseconds duration: (timing[2] - timing[1]) * 1000, // Convert to milliseconds }); } this.wordBoundaries = wordTimings; } } catch (e) { console.error("Error parsing WebSocket message:", e); } } }; ws.onerror = (error) => { reject(error); }; ws.onclose = () => { // Store word boundaries for later use this.wordBoundaries = wordTimings; // Create a ReadableStream from the collected chunks const audioStream = new ReadableStream({ start(controller) { for (const chunk of chunks) { controller.enqueue(chunk); } controller.close(); }, }); resolve({ audioStream, wordBoundaries: wordTimings, }); }; }); } /** * Synthesize text to a byte stream using the REST API * @param ssml SSML to synthesize * @param options Synthesis options * @returns Promise resolving to an object containing the audio stream and word boundary information */ async _synthToBytestreamWithREST(ssml, options) { try { // Use provided voice_id or the one set with setVoice const voice = options?.voice || this.voiceId || "en-US_AllisonV3Voice"; const response = await fetch(`https://api.${this.region}.text-to-speech.watson.cloud.ibm.com/v1/synthesize`, { method: "POST", headers: { Authorization: `Bearer ${this.iamToken}`, "Content-Type": "application/json", Accept: "audio/wav", }, body: 
JSON.stringify({ text: ssml, voice: voice, accept: "audio/wav", }), }); if (!response.ok) { throw new Error(`Failed to synthesize speech: ${response.statusText}`); } // Create estimated word timings based on text length const words = ssml.replace(/<[^>]*>/g, "").split(/\s+/); const estimatedDuration = 0.3; // Estimated duration per word in seconds const wordBoundaries = []; let currentTime = 0; for (const word of words) { if (word.trim()) { wordBoundaries.push({ text: word, offset: currentTime * 1000, // Convert to milliseconds duration: estimatedDuration * 1000, // Convert to milliseconds }); currentTime += estimatedDuration; } } // Store word boundaries for later use this.wordBoundaries = wordBoundaries; return { audioStream: response.body, wordBoundaries, }; } catch (error) { console.error("Error synthesizing speech:", error); throw error; } } /** * Set the voice to use for synthesis * @param voiceId Voice ID to use * @param lang Language code (not used in Watson) */ setVoice(voiceId, lang) { this.voiceId = voiceId; if (lang) { this.lang = lang; } } } exports.WatsonTTSClient = WatsonTTSClient;
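/*
 * Streaming sketch (illustrative only; handleAudioChunk is a hypothetical
 * sink). Shows how the result of synthToBytestream could be consumed from an
 * async function: WAV bytes arrive as Uint8Array chunks, while wordBoundaries
 * carries the timings this module collects (WebSocket path) or estimates at
 * 0.3 s per word (REST path).
 *
 *   const { audioStream, wordBoundaries } = await tts.synthToBytestream("Hello world");
 *   const reader = audioStream.getReader();
 *   for (let part = await reader.read(); !part.done; part = await reader.read()) {
 *     handleAudioChunk(part.value); // each part.value is a Uint8Array of WAV data
 *   }
 *   for (const { text, offset, duration } of wordBoundaries) {
 *     console.log(`${text} @ ${offset}ms for ${duration}ms`);
 *   }
 */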