UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

github.com/willwade/js-tts-wrapper

willwade/js-tts-wrapper

521 lines (520 loc) • 19.4 kB

JavaScript

// Node-only imports moved inside Node-only code paths below for browser compatibility.
import { AbstractTTSClient } from "../core/abstract-tts.js";
import * as SpeechMarkdown from "../markdown/converter.js";
import * as SSMLUtils from "../core/ssml-utils.js";
import { estimateWordBoundaries } from "../utils/word-timing-estimator.js";

/**
 * Define an enumerable, configurable, writable data property on `target`.
 * Mirrors the descriptor emitted for compiled class fields so instances keep
 * the same property shape.
 * @param target Object receiving the property
 * @param name Property name
 * @param value Initial value
 */
function defineField(target, name, value) {
    Object.defineProperty(target, name, {
        enumerable: true,
        configurable: true,
        writable: true,
        value,
    });
}

/**
 * Mock implementation of the OpenAI class.
 *
 * Used whenever the real SDK is not loaded (browser environment, or the
 * dynamic import/construction failed). `models.list()` resolves to an empty
 * list and `audio.speech.create()` resolves to an empty audio payload.
 */
class MockOpenAI {
    /**
     * @param _options Accepted for signature compatibility with the real SDK; not used
     */
    constructor(_options) {
        defineField(this, "models", {
            list: async () => ({ data: [] }),
        });
        defineField(this, "audio", {
            speech: {
                create: async () => ({
                    arrayBuffer: async () => new ArrayBuffer(0),
                    // An already-closed stream: readers see `done` immediately.
                    body: new ReadableStream({
                        start(controller) {
                            controller.close();
                        },
                    }),
                }),
            },
        });
    }
}

/**
 * OpenAI TTS Client
 *
 * This client uses the OpenAI API to convert text to speech.
 * It supports streaming audio but does not support SSML.
 * Word boundaries are estimated since OpenAI doesn't provide word events.
 */
export class OpenAITTSClient extends AbstractTTSClient {
    /**
     * Get the last word boundaries
     * @returns Array of word boundary objects
     */
    getLastWordBoundaries() {
        return this.lastWordBoundaries;
    }

    /**
     * Set the last word boundaries
     * @param wordBoundaries Array of word boundary objects
     */
    setLastWordBoundaries(wordBoundaries) {
        this.lastWordBoundaries = wordBoundaries;
    }

    /**
     * Create a new OpenAI TTS Client
     *
     * The SDK client is not created here; it is loaded on demand by
     * `loadClient()`.
     * @param credentials OpenAI API credentials (`apiKey`, `baseURL`, `organization` are read when the SDK client is created)
     */
    constructor(credentials = {}) {
        super(credentials);
        // SDK client (real or mock) and the in-flight load, if any.
        defineField(this, "client", null);
        defineField(this, "clientLoadingPromise", null);
        defineField(this, "credentials", credentials);
        // Synthesis defaults.
        defineField(this, "model", "tts-1");
        defineField(this, "voice", "alloy");
        defineField(this, "instructions", "");
        defineField(this, "responseFormat", "mp3");
        defineField(this, "lastWordBoundaries", []);
    }

    /**
     * Load the OpenAI SDK dynamically.
     *
     * The result is cached on `this.client`; concurrent callers share a single
     * in-flight import through `this.clientLoadingPromise`. In a browser
     * (`window` defined), or when importing/constructing the SDK client fails,
     * a `MockOpenAI` instance is used instead.
     * @returns Promise resolving to the initialized client (real or mock)
     */
    async loadClient() {
        if (this.client) {
            return this.client;
        }

        if (this.clientLoadingPromise) {
            const pendingClient = await this.clientLoadingPromise;
            if (pendingClient) {
                return pendingClient;
            }
            console.warn("Client loading promise resolved unexpectedly to null, using mock.");
            this.client = new MockOpenAI();
            return this.client;
        }

        // Only attempt dynamic import in Node.js environment
        if (typeof window !== "undefined") {
            console.warn("OpenAI SDK dynamic import skipped in browser environment, using mock.");
            this.client = new MockOpenAI();
            return this.client;
        }

        this.clientLoadingPromise = import("openai")
            .then((openaiModule) => {
                const OpenAIClass = openaiModule.OpenAI;
                this.client = new OpenAIClass({
                    apiKey: this.credentials.apiKey || process.env.OPENAI_API_KEY,
                    baseURL: this.credentials.baseURL,
                    organization: this.credentials.organization,
                });
                console.log("OpenAI SDK loaded successfully.");
                return this.client;
            })
            .catch((error) => {
                // Surface the underlying cause: this branch is reached for any
                // failure while importing the package or constructing the client.
                console.warn("OpenAI package not found or failed to load, using mock implementation.", error);
                this.client = new MockOpenAI();
                return this.client; // Return the mock client
            })
            .finally(() => {
                this.clientLoadingPromise = null; // Clear promise once settled (success or fail)
            });

        // Resolves to the client (could be real or mock)
        return this.clientLoadingPromise;
    }

    /**
     * Check if the credentials are valid
     *
     * Validation is done by listing models through the real SDK client. With
     * the mock client nothing can be validated, so the result is `false`.
     * @returns Promise resolving to true if credentials are valid, false otherwise
     */
    async checkCredentials() {
        try {
            const client = await this.loadClient();
            if (client instanceof MockOpenAI) {
                console.warn("Cannot check credentials with mock OpenAI client.");
                return false; // Cannot validate with mock
            }
            // Try to list models to check if the real API key is valid
            await client.models.list();
            return true;
        }
        catch (error) {
            console.error("Error checking OpenAI credentials:", error);
            return false;
        }
    }

    /**
     * Get the list of required credential types for this engine
     * @returns Array of required credential field names
     */
    getRequiredCredentials() {
        return ['apiKey'];
    }

    /**
     * Get available voices
     *
     * Credentials are checked first; when they cannot be validated an empty
     * array is returned.
     * @returns Promise resolving to an array of unified voice objects
     */
    async _getVoices() {
        // Validate credentials first by checking if we can access the API
        const credentialsValid = await this.checkCredentials();
        if (!credentialsValid) {
            // If credentials are invalid, return empty array to signal test should be skipped
            return [];
        }

        // OpenAI has a fixed set of voices
        const voices = [
            { id: "alloy", name: "Alloy", gender: "Unknown" },
            { id: "ash", name: "Ash", gender: "Male" },
            { id: "ballad", name: "Ballad", gender: "Male" },
            { id: "coral", name: "Coral", gender: "Female" },
            { id: "echo", name: "Echo", gender: "Male" },
            { id: "fable", name: "Fable", gender: "Female" },
            { id: "onyx", name: "Onyx", gender: "Male" },
            { id: "nova", name: "Nova", gender: "Female" },
            { id: "sage", name: "Sage", gender: "Male" },
            { id: "shimmer", name: "Shimmer", gender: "Female" },
        ];
        return this._mapVoicesToUnified(voices);
    }

    /**
     * Map OpenAI voice objects to unified format
     *
     * Every voice is tagged with provider "openai" and a single en-US
     * language code entry.
     * @param rawVoices Array of OpenAI voice objects (`id`, `name`, `gender`)
     * @returns Promise resolving to an array of unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => {
            // Create language code object
            const languageCode = {
                bcp47: "en-US",
                iso639_3: "eng",
                display: "English (US)",
            };
            return {
                id: voice.id,
                name: voice.name,
                gender: voice.gender,
                provider: "openai",
                languageCodes: [languageCode],
            };
        });
    }

    /**
     * Set the voice to use for synthesis
     * @param voiceId Voice ID to use
     */
    setVoice(voiceId) {
        this.voice = voiceId;
    }

    /**
     * Set the model to use for synthesis
     * @param model Model ID to use
     */
    setModel(model) {
        this.model = model;
    }

    /**
     * Set instructions for the TTS engine
     * @param instructions Instructions for the TTS engine
     */
    setInstructions(instructions) {
        this.instructions = instructions;
    }

    /**
     * Set the response format
     * @param format Response format (mp3, opus, aac, flac, wav, pcm)
     */
    setResponseFormat(format) {
        this.responseFormat = format;
    }

    /**
     * Get a property value
     *
     * Handles the OpenAI-specific properties and defers everything else to
     * the base class.
     * @param property Property name
     * @returns Property value
     */
    getProperty(property) {
        switch (property) {
            case "model":
                return this.model;
            case "voice":
                return this.voice;
            case "instructions":
                return this.instructions;
            case "responseFormat":
                return this.responseFormat;
            default:
                return super.getProperty(property);
        }
    }

    /**
     * Set a property value
     *
     * Handles the OpenAI-specific properties and defers everything else to
     * the base class.
     * @param property Property name
     * @param value Property value
     */
    setProperty(property, value) {
        switch (property) {
            case "model":
                this.setModel(value);
                break;
            case "voice":
                this.setVoice(value);
                break;
            case "instructions":
                this.setInstructions(value);
                break;
            case "responseFormat":
                this.setResponseFormat(value);
                break;
            default:
                super.setProperty(property, value);
                break;
        }
    }

    /**
     * Convert text to speech
     *
     * Node.js only: synthesizes via `synthToBytes` and writes the audio to
     * `options.outputDir` / `options.outputFile` (defaults: "." and
     * `openai-output.<responseFormat>`).
     * @param text Text to convert to speech
     * @param options TTS options
     * @returns Promise resolving to the path of the generated audio file
     * @throws Error when called in a browser environment; synthesis and file errors are logged and rethrown
     */
    async textToSpeech(text, options = {}) {
        if (typeof window !== "undefined") {
            throw new Error("textToSpeech with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead.");
        }
        // Node.js only
        const fs = await import("node:fs");
        const path = await import("node:path");
        try {
            // Create output directory if it doesn't exist
            const outputDir = options.outputDir || ".";
            if (!fs.existsSync(outputDir)) {
                fs.mkdirSync(outputDir, { recursive: true });
            }

            // Generate output file path
            const outputFile = options.outputFile || `openai-output.${this.responseFormat}`;
            const outputPath = path.join(outputDir, outputFile);

            // Synthesize audio
            const audioBytes = await this.synthToBytes(text, options);

            // Write audio to file
            fs.writeFileSync(outputPath, audioBytes);

            // Estimate word boundaries
            if (options.returnWordBoundaries) {
                const wordBoundaries = estimateWordBoundaries(text);
                this.setLastWordBoundaries(wordBoundaries);
            }

            // Call onEnd callback
            if (options.onEnd) {
                options.onEnd();
            }
            return outputPath;
        }
        catch (error) {
            console.error("Error converting text to speech:", error);
            throw error;
        }
    }

    /**
     * Convert text to speech with streaming
     *
     * Node.js only: requests speech from the API and copies the response body
     * chunk by chunk into `options.outputDir` / `options.outputFile`
     * (defaults: "." and `openai-streaming-output.<responseFormat>`). After
     * the file is written, estimated word boundaries are reported through
     * `options.onWord` and/or stored when `options.returnWordBoundaries` is set.
     * @param text Text to convert to speech
     * @param options TTS options
     * @returns Promise resolving to the path of the generated audio file
     * @throws Error when called in a browser environment; API and file errors are logged and rethrown
     */
    async textToSpeechStreaming(text, options = {}) {
        if (typeof window !== "undefined") {
            throw new Error("textToSpeechStreaming with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead.");
        }
        const fs = await import("node:fs");
        const path = await import("node:path");
        try {
            // Create output directory if it doesn't exist
            const outputDir = options.outputDir || ".";
            if (!fs.existsSync(outputDir)) {
                fs.mkdirSync(outputDir, { recursive: true });
            }

            // Generate output file path
            const outputFile = options.outputFile || `openai-streaming-output.${this.responseFormat}`;
            const outputPath = path.join(outputDir, outputFile);

            // `this.client` stays null until loadClient() has run, so always go
            // through the loader instead of reading the field directly.
            const client = await this.loadClient();

            // Create speech with streaming
            // NOTE(review): unlike synthToBytes, the text is sent as-is here
            // (no SSML / Speech Markdown stripping) — confirm this is intended.
            const response = await client.audio.speech.create({
                model: this.model,
                voice: this.voice,
                input: text,
                instructions: this.instructions || undefined,
                response_format: this.responseFormat,
            });

            // Get the stream
            const stream = response.body;

            // Create a writable stream to the output file
            const writer = fs.createWriteStream(outputPath);

            // Register completion/error handlers before the first write so an
            // early file error rejects this promise instead of being emitted
            // without a listener.
            const written = new Promise((resolve, reject) => {
                writer.on("finish", resolve);
                writer.on("error", reject);
            });
            // If the read loop below throws first, `written` is never awaited;
            // mark it handled so a later write error is not an unhandled rejection.
            written.catch(() => { });

            // Copy the stream to the file
            const reader = stream.getReader();
            try {
                while (true) {
                    const { done, value } = await reader.read();
                    if (done)
                        break;
                    writer.write(value);
                }
            }
            finally {
                reader.releaseLock();
                // Always close the writer, including when reading failed.
                writer.end();
            }

            // Wait for the file to be written
            await written;

            // Estimate word boundaries
            if (options.onWord || options.returnWordBoundaries) {
                const wordBoundaries = estimateWordBoundaries(text);

                // Call onWord callback for each word
                if (options.onWord) {
                    for (const wb of wordBoundaries) {
                        options.onWord(wb);
                    }
                }

                // Store word boundaries if requested
                if (options.returnWordBoundaries) {
                    this.setLastWordBoundaries(wordBoundaries);
                }
            }

            // Call onEnd callback
            if (options.onEnd) {
                options.onEnd();
            }
            return outputPath;
        }
        catch (error) {
            console.error("Error converting text to speech with streaming:", error);
            throw error;
        }
    }

    /**
     * Convert SSML to speech (not supported by OpenAI)
     * @param _ssml SSML to convert to speech (unused)
     * @param _options TTS options (unused)
     * @throws Error always, because SSML is not supported
     */
    async ssmlToSpeech(_ssml, _options = {}) {
        throw new Error("SSML is not supported by OpenAI TTS");
    }

    /**
     * Convert SSML to speech with streaming (not supported by OpenAI)
     * @param _ssml SSML to convert to speech (unused)
     * @param _options TTS options (unused)
     * @throws Error always, because SSML is not supported
     */
    async ssmlToSpeechStreaming(_ssml, _options = {}) {
        throw new Error("SSML is not supported by OpenAI TTS");
    }

    /**
     * Synthesize text to audio bytes
     *
     * Non-string input is joined with spaces. Speech Markdown (when
     * `options.useSpeechMarkdown` is set) is converted to SSML, and any SSML
     * is stripped to plain text before the request is sent. `options.model`,
     * `options.voice` and `options.format` override the instance defaults.
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to audio bytes (Uint8Array)
     * @throws Rethrows any error from text preparation or the API call after logging it
     */
    async synthToBytes(text, options = {}) {
        try {
            // Prepare text for synthesis (handle Speech Markdown and SSML)
            let processedText = typeof text === "string" ? text : text.join(" ");

            // Convert from Speech Markdown if requested
            if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
                // Convert to SSML first, then strip SSML tags since OpenAI doesn't support SSML
                const ssml = await SpeechMarkdown.toSSML(processedText);
                processedText = SSMLUtils.stripSSML(ssml);
            }

            // If text is SSML, strip the tags as OpenAI doesn't support SSML
            if (SSMLUtils.isSSML(processedText)) {
                processedText = SSMLUtils.stripSSML(processedText);
            }

            const client = await this.loadClient();
            const params = {
                model: options.model || this.model,
                voice: options.voice || this.voice,
                input: processedText,
                instructions: this.instructions || undefined,
                response_format: options.format || this.responseFormat,
                // Map rate to speed if provided (options.speed takes precedence over options.rate)
                // NOTE(review): options.rate is forwarded unchanged — confirm callers
                // pass a value the API accepts as `speed`.
                speed: options.speed ?? options.rate,
            };

            // Use the initialized client (could be mock or real)
            const response = await client.audio.speech.create(params);
            const arrayBuffer = await response.arrayBuffer();
            return new Uint8Array(arrayBuffer);
        }
        catch (error) {
            console.error("Error converting text to speech bytes:", error);
            throw error;
        }
    }

    /**
     * Synthesize text to a byte stream using OpenAI API.
     * @param text Text to synthesize (sent as-is; no SSML / Speech Markdown processing is applied here).
     * @param _options Synthesis options (currently unused for streaming, uses defaults).
     * @returns Promise resolving to an object containing the audio stream and an empty word boundaries array.
     * @throws Rethrows any error from loading the client or the API call after logging it
     */
    async synthToBytestream(text, _options) {
        try {
            const client = await this.loadClient();

            // Use the initialized client (could be mock or real)
            const response = await client.audio.speech.create({
                model: this.model,
                voice: this.voice,
                input: text,
                instructions: this.instructions || undefined,
                response_format: this.responseFormat,
            });

            // Get the stream
            const stream = response.body;

            // Return the stream and an empty word boundaries array
            return { audioStream: stream, wordBoundaries: [] };
        }
        catch (error) {
            console.error("Error converting text to speech stream:", error);
            throw error;
        }
    }
}