UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

343 lines (342 loc) 13.3 kB
"use strict";
/* --- TypeScript-emitted CommonJS interop helpers (unchanged semantics) --- */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.WitAITTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
/**
 * Wit.ai Text-to-Speech client.
 *
 * Wraps the Wit.ai HTTP API (`/voices`, `/synthesize`) behind the unified
 * AbstractTTSClient interface. Wit.ai does not support SSML, so any
 * SSML / Speech Markdown input is converted to plain text before synthesis.
 */
class WitAITTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * Create a new Wit.ai TTS client.
     * @param credentials Credentials object; must contain a `token` string.
     * @throws {Error} If no API token is provided.
     */
    constructor(credentials) {
        super(credentials);
        if (!credentials.token) {
            throw new Error("An API token for Wit.ai must be provided");
        }
        this.token = credentials.token;
        this.baseUrl = "https://api.wit.ai";
        this.apiVersion = "20240601";
        // Default sample rate for Wit.ai audio output.
        this.sampleRate = 24000;
        // NOTE: these headers carry the bearer token — never log them.
        this.headers = {
            Authorization: `Bearer ${this.token}`,
            "Content-Type": "application/json",
        };
    }
    /**
     * Fetch the raw voice list from Wit.ai and normalize it.
     * Best-effort: network/API failures are logged and yield an empty list.
     * @returns Promise resolving to an array of standardized voice objects.
     */
    async _getVoices() {
        try {
            const response = await fetch(`${this.baseUrl}/voices?v=${this.apiVersion}`, {
                method: "GET",
                headers: this.headers,
            });
            if (!response.ok) {
                throw new Error(`Failed to fetch voices: ${response.statusText}`);
            }
            // Response shape: { "en_US": [ { name, gender, styles }, ... ], ... }
            const voicesByLocale = await response.json();
            const standardizedVoices = [];
            for (const localeKey of Object.keys(voicesByLocale)) {
                // Wit.ai uses underscore locales ("en_US"); normalize to BCP-47 ("en-US").
                const locale = localeKey.replace("_", "-");
                for (const voice of voicesByLocale[localeKey]) {
                    standardizedVoices.push({
                        id: voice.name,
                        languageCodes: [locale],
                        // Voice names look like "wit$Rebecca"; display the part after "$".
                        name: voice.name.split("$")[1] || voice.name,
                        gender: voice.gender,
                        styles: voice.styles || [],
                    });
                }
            }
            return standardizedVoices;
        }
        catch (error) {
            // Callers treat an empty array as "no voices available".
            console.error("Error fetching WitAI voices:", error);
            return [];
        }
    }
    /**
     * Map standardized Wit.ai voice objects to the library's unified format.
     * @param rawVoices Array of voice objects produced by `_getVoices`.
     * @returns Promise resolving to an array of unified voice objects.
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => ({
            id: voice.id,
            name: voice.name,
            gender: voice.gender === "female" ? "Female" : voice.gender === "male" ? "Male" : "Unknown",
            provider: "witai",
            languageCodes: voice.languageCodes.map((locale) => {
                const [language, region] = locale.split("-");
                return {
                    bcp47: locale,
                    // Simple extraction of the primary language subtag.
                    iso639_3: language,
                    display: `${language.toUpperCase()} (${region || language})`,
                };
            }),
        }));
    }
    /**
     * Prepare text for synthesis: Wit.ai accepts plain text only, so
     * Speech Markdown and SSML inputs are reduced to plain text.
     * @param text Text to prepare.
     * @param options Synthesis options (`useSpeechMarkdown` is honored).
     * @returns Promise resolving to the plain-text string to synthesize.
     */
    async prepareText(text, options) {
        let processedText = text;
        if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
            // Convert Speech Markdown to SSML first, then strip the SSML tags.
            const ssml = await SpeechMarkdown.toSSML(processedText);
            processedText = SSMLUtils.stripSSML(ssml);
        }
        if (SSMLUtils.isSSML(processedText)) {
            processedText = SSMLUtils.stripSSML(processedText);
        }
        return processedText;
    }
    /**
     * Map a format option to the Accept header MIME type Wit.ai expects.
     * @param format One of "pcm" | "mp3" | "wav" (anything else falls back to PCM).
     * @returns MIME type string.
     */
    getAcceptHeader(format) {
        const formats = {
            pcm: "audio/raw",
            mp3: "audio/mpeg",
            wav: "audio/wav",
        };
        return formats[format || ""] || "audio/raw"; // Default to raw PCM.
    }
    /**
     * Resolve the voice to synthesize with: explicit option, previously set
     * voice, or the first available voice (which is then remembered).
     * @param options Synthesis options (may carry `voice`).
     * @returns Promise resolving to a voice ID.
     * @throws {Error} If no voice is set and none can be fetched.
     */
    async _resolveVoice(options) {
        let voice = options?.voice || this.voiceId;
        if (!voice) {
            const voices = await this._getVoices();
            if (voices.length === 0) {
                throw new Error("No voice ID provided and no default voice available");
            }
            voice = voices[0].id;
            this.voiceId = voice;
        }
        return voice;
    }
    /**
     * Shared POST /synthesize request used by both `synthToBytes` and
     * `synthToBytestream`.
     * @param text Raw input text (SSML / Speech Markdown allowed).
     * @param options Synthesis options.
     * @returns Promise resolving to `{ response, preparedText }` on HTTP success.
     * @throws {Error} With status text and response body details on HTTP failure.
     */
    async _requestSynthesis(text, options) {
        const preparedText = await this.prepareText(text, options);
        const voice = await this._resolveVoice(options);
        // Do NOT log these headers: they include the bearer token.
        const headers = {
            ...this.headers,
            Accept: this.getAcceptHeader(options?.format),
        };
        const response = await fetch(`${this.baseUrl}/synthesize?v=${this.apiVersion}`, {
            method: "POST",
            headers,
            body: JSON.stringify({ q: preparedText, voice, style: "default" }),
        });
        if (!response.ok) {
            let errorMessage = `Failed to synthesize speech: ${response.statusText}`;
            try {
                const errorData = await response.text();
                console.error("WitAI TTS Error Response:", errorData);
                errorMessage += ` - ${errorData}`;
            }
            catch (_e) {
                // Ignore failures while reading the error body.
            }
            throw new Error(errorMessage);
        }
        return { response, preparedText };
    }
    /**
     * Synthesize text to audio bytes.
     * @param text Text to synthesize.
     * @param options Synthesis options.
     * @returns Promise resolving to the audio as a Uint8Array.
     */
    async synthToBytes(text, options) {
        try {
            const { response } = await this._requestSynthesis(text, options);
            const arrayBuffer = await response.arrayBuffer();
            return new Uint8Array(arrayBuffer);
        }
        catch (error) {
            console.error("Error synthesizing speech with WitAI:", error);
            throw error;
        }
    }
    /**
     * Synthesize text to a byte stream plus estimated word boundaries.
     * Wit.ai provides no timing data, so boundaries are estimated at a
     * fixed 0.3 s per word.
     * @param text Text to synthesize.
     * @param options Synthesis options.
     * @returns Promise resolving to `{ audioStream, wordBoundaries }`.
     */
    async synthToBytestream(text, options) {
        try {
            const { response, preparedText } = await this._requestSynthesis(text, options);
            // Per the Fetch spec, `body` can be null — guard before returning it.
            if (!response.body) {
                throw new Error("WitAI synthesize response contained no body stream");
            }
            const estimatedDuration = 0.3; // Estimated seconds per word.
            const wordBoundaries = [];
            let currentTime = 0;
            for (const word of preparedText.split(/\s+/)) {
                if (word.trim()) {
                    wordBoundaries.push({
                        text: word,
                        offset: currentTime * 1000, // ms
                        duration: estimatedDuration * 1000, // ms
                    });
                    currentTime += estimatedDuration;
                }
            }
            return { audioStream: response.body, wordBoundaries };
        }
        catch (error) {
            console.error("Error synthesizing speech with WitAI:", error);
            throw error;
        }
    }
    /**
     * Set the voice (and optionally language) for subsequent synthesis.
     * @param voiceId Voice ID to use.
     * @param lang Optional language code (not used by the Wit.ai API itself).
     */
    setVoice(voiceId, lang) {
        this.voiceId = voiceId;
        if (lang) {
            this.lang = lang;
        }
    }
}
exports.WitAITTSClient = WitAITTSClient;