UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

github.com/willwade/js-tts-wrapper

willwade/js-tts-wrapper

490 lines (489 loc) • 17.1 kB

JavaScript

"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers (used below by `__importStar`
// to load `node:fs` and `node:path`). Kept exactly as the compiler wrote them.
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAITTSClient = void 0;
const fs = __importStar(require("node:fs"));
const path = __importStar(require("node:path"));
const abstract_tts_1 = require("../core/abstract-tts");
const word_timing_estimator_1 = require("../utils/word-timing-estimator");
/**
 * Mock implementation of the OpenAI client class.
 *
 * Exposes only the members this module touches: `models.list()` resolves to
 * an empty model list, and `audio.speech.create()` resolves to a response
 * whose `arrayBuffer()` is empty and whose `body` is an already-closed
 * `ReadableStream`.
 */
class MockOpenAI {
    /**
     * @param _options Accepted for signature compatibility; not used.
     */
    constructor(_options) {
        Object.defineProperty(this, "models", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {
                list: async () => ({ data: [] }),
            }
        });
        Object.defineProperty(this, "audio", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {
                speech: {
                    create: async () => ({
                        arrayBuffer: async () => new ArrayBuffer(0),
                        body: new ReadableStream({
                            start(controller) {
                                controller.close();
                            },
                        }),
                    }),
                },
            }
        });
    }
}
// Resolved lazily by getOpenAIClass(); holds either the real OpenAI class or MockOpenAI.
let OpenAIClass;
let openaiPackageLoaded = false;
/**
 * Load the OpenAI client class on demand.
 *
 * The first call tries `require("openai")`; if that throws, a warning is
 * logged and `MockOpenAI` is used instead. The outcome is cached, so the
 * `require` is attempted at most once.
 * @returns The class to instantiate as the API client
 */
function getOpenAIClass() {
    if (!openaiPackageLoaded) {
        try {
            // eslint-disable-next-line @typescript-eslint/no-var-requires
            OpenAIClass = require("openai").OpenAI;
            openaiPackageLoaded = true;
        }
        catch (_error) {
            console.warn("OpenAI package not found, using mock implementation");
            OpenAIClass = MockOpenAI;
            openaiPackageLoaded = true;
        }
    }
    return OpenAIClass;
}
/**
 * Collapse the accepted text input into the single string sent to the API.
 * @param text A string, or an array of strings to be joined with spaces
 * @returns The text as one string (non-array values are returned unchanged)
 */
function normalizeInput(text) {
    return Array.isArray(text) ? text.join(" ") : text;
}
/**
 * Make sure the output directory exists and build the output file path.
 * @param options TTS options (`outputDir` defaults to ".", `outputFile` to `defaultFileName`)
 * @param defaultFileName File name used when `options.outputFile` is not set
 * @returns Path of the file the audio should be written to
 */
function prepareOutputPath(options, defaultFileName) {
    const outputDir = options.outputDir || ".";
    // `recursive: true` is a no-op for an existing directory, so no separate
    // existence check (and no check-then-create race) is needed.
    fs.mkdirSync(outputDir, { recursive: true });
    return path.join(outputDir, options.outputFile || defaultFileName);
}
/**
 * Issue the speech request using the client's current settings.
 * @param ttsClient The OpenAITTSClient whose model/voice/instructions/format are used
 * @param input Text to synthesize
 * @returns Promise resolving to the API response
 */
function requestSpeech(ttsClient, input) {
    return ttsClient.client.audio.speech.create({
        model: ttsClient.model,
        voice: ttsClient.voice,
        input,
        // An empty instruction string is omitted from the request.
        instructions: ttsClient.instructions || undefined,
        response_format: ttsClient.responseFormat,
    });
}
/**
 * Run the post-synthesis callbacks shared by the file-producing methods.
 *
 * Word boundaries are estimated from the text only when a caller asked for
 * them (`onWord` or `returnWordBoundaries`); `onEnd` is invoked last.
 * @param ttsClient Client on which estimated boundaries are stored
 * @param text Text that was synthesized
 * @param options TTS options carrying the optional callbacks
 */
function reportCompletion(ttsClient, text, options) {
    if (options.onWord || options.returnWordBoundaries) {
        const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
        if (options.onWord) {
            for (const wb of wordBoundaries) {
                options.onWord(wb);
            }
        }
        if (options.returnWordBoundaries) {
            ttsClient.setLastWordBoundaries(wordBoundaries);
        }
    }
    if (options.onEnd) {
        options.onEnd();
    }
}
/**
 * Copy a web `ReadableStream` into a file, chunk by chunk.
 *
 * @param stream Stream exposing `getReader()`
 * @param outputPath Destination file path
 * @returns Promise that resolves once the file stream has emitted "finish"
 * @throws Rejects with the read error or the file stream's error
 */
async function writeStreamToFile(stream, outputPath) {
    // Acquire the reader first so an unusable body fails before a file is created.
    const reader = stream.getReader();
    const writer = fs.createWriteStream(outputPath);
    // Listen for completion/failure BEFORE writing anything: an "error" event
    // emitted while no listener is attached would be thrown as an uncaught
    // exception instead of rejecting this call.
    const finished = new Promise((resolve, reject) => {
        writer.on("finish", resolve);
        writer.on("error", reject);
    });
    // The rejection is observed via the awaits below; this no-op handler only
    // prevents an "unhandled rejection" if the file fails while we are still
    // waiting on reader.read().
    finished.catch(() => { });
    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done)
                break;
            if (!writer.write(value)) {
                // Respect backpressure. Racing against `finished` ensures a
                // failed file stream rejects here rather than waiting forever
                // for a "drain" that will never come.
                await Promise.race([
                    new Promise((resolve) => writer.once("drain", resolve)),
                    finished,
                ]);
            }
        }
    }
    catch (error) {
        // Stop the producer and close the file handle before propagating.
        try {
            await reader.cancel(error);
        }
        catch (_cancelError) {
            // The original error is the one worth reporting.
        }
        writer.destroy();
        throw error;
    }
    finally {
        reader.releaseLock();
    }
    writer.end();
    await finished;
}
/**
 * OpenAI TTS Client
 *
 * This client uses the OpenAI API to convert text to speech.
 * It supports streaming audio but does not support SSML.
 * Word boundaries are estimated since OpenAI doesn't provide word events.
 */
class OpenAITTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * Get the last word boundaries
     * @returns Array of word boundary objects
     */
    getLastWordBoundaries() {
        return this.lastWordBoundaries;
    }
    /**
     * Set the last word boundaries
     * @param wordBoundaries Array of word boundary objects
     */
    setLastWordBoundaries(wordBoundaries) {
        this.lastWordBoundaries = wordBoundaries;
    }
    /**
     * Create a new OpenAI TTS Client
     * @param credentials OpenAI API credentials (`apiKey`, `baseURL`, `organization`);
     *   when `apiKey` is not given, the `OPENAI_API_KEY` environment variable is used
     */
    constructor(credentials = {}) {
        super(credentials);
        Object.defineProperty(this, "client", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "model", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "voice", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "instructions", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "responseFormat", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "lastWordBoundaries", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: []
        });
        // Initialize OpenAI client
        const OpenAIClass = getOpenAIClass();
        this.client = new OpenAIClass({
            apiKey: credentials.apiKey || process.env.OPENAI_API_KEY,
            baseURL: credentials.baseURL,
            organization: credentials.organization,
        });
        // Set default values
        this.model = "gpt-4o-mini-tts";
        this.voice = "coral";
        this.instructions = "";
        this.responseFormat = "mp3";
    }
    /**
     * Check if the credentials are valid by listing the available models
     * @returns Promise resolving to true if the call succeeds, false (after logging) otherwise
     */
    async checkCredentials() {
        try {
            await this.client.models.list();
            return true;
        }
        catch (error) {
            console.error("Error checking OpenAI credentials:", error);
            return false;
        }
    }
    /**
     * Get available voices
     * @returns Promise resolving to an array of unified voice objects
     */
    async _getVoices() {
        // The voice list is hard-coded here; it is not fetched from the API.
        const voices = [
            { id: "alloy", name: "Alloy", gender: "Unknown" },
            { id: "ash", name: "Ash", gender: "Male" },
            { id: "ballad", name: "Ballad", gender: "Male" },
            { id: "coral", name: "Coral", gender: "Female" },
            { id: "echo", name: "Echo", gender: "Male" },
            { id: "fable", name: "Fable", gender: "Female" },
            { id: "onyx", name: "Onyx", gender: "Male" },
            { id: "nova", name: "Nova", gender: "Female" },
            { id: "sage", name: "Sage", gender: "Male" },
            { id: "shimmer", name: "Shimmer", gender: "Female" },
        ];
        return this._mapVoicesToUnified(voices);
    }
    /**
     * Map OpenAI voice objects to unified format
     *
     * Every voice is tagged with provider "openai" and a single en-US language entry.
     * @param rawVoices Array of OpenAI voice objects (`id`, `name`, `gender`)
     * @returns Promise resolving to an array of unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => {
            const languageCode = {
                bcp47: "en-US",
                iso639_3: "eng",
                display: "English (US)",
            };
            return {
                id: voice.id,
                name: voice.name,
                gender: voice.gender,
                provider: "openai",
                languageCodes: [languageCode],
            };
        });
    }
    /**
     * Set the voice to use for synthesis
     * @param voiceId Voice ID to use
     */
    setVoice(voiceId) {
        this.voice = voiceId;
    }
    /**
     * Set the model to use for synthesis
     * @param model Model ID to use
     */
    setModel(model) {
        this.model = model;
    }
    /**
     * Set instructions for the TTS engine
     * @param instructions Instructions for the TTS engine
     */
    setInstructions(instructions) {
        this.instructions = instructions;
    }
    /**
     * Set the response format
     * @param format Response format (mp3, opus, aac, flac, wav, pcm)
     */
    setResponseFormat(format) {
        this.responseFormat = format;
    }
    /**
     * Get a property value
     * @param property Property name; "model", "voice", "instructions" and
     *   "responseFormat" are answered here, anything else by the base class
     * @returns Property value
     */
    getProperty(property) {
        switch (property) {
            case "model":
                return this.model;
            case "voice":
                return this.voice;
            case "instructions":
                return this.instructions;
            case "responseFormat":
                return this.responseFormat;
            default:
                return super.getProperty(property);
        }
    }
    /**
     * Set a property value
     * @param property Property name; "model", "voice", "instructions" and
     *   "responseFormat" are handled here, anything else by the base class
     * @param value Property value
     */
    setProperty(property, value) {
        switch (property) {
            case "model":
                this.setModel(value);
                break;
            case "voice":
                this.setVoice(value);
                break;
            case "instructions":
                this.setInstructions(value);
                break;
            case "responseFormat":
                this.setResponseFormat(value);
                break;
            default:
                super.setProperty(property, value);
                break;
        }
    }
    /**
     * Convert text to speech
     * @param text Text to convert to speech
     * @param options TTS options (`outputDir`, `outputFile`, `onWord`,
     *   `returnWordBoundaries`, `onEnd`)
     * @returns Promise resolving to the path of the generated audio file
     * @throws Re-throws any error after logging it
     */
    async textToSpeech(text, options = {}) {
        try {
            const outputPath = prepareOutputPath(options, `openai-output.${this.responseFormat}`);
            const response = await requestSpeech(this, text);
            // Save to file
            const buffer = Buffer.from(await response.arrayBuffer());
            fs.writeFileSync(outputPath, buffer);
            reportCompletion(this, text, options);
            return outputPath;
        }
        catch (error) {
            console.error("Error converting text to speech:", error);
            throw error;
        }
    }
    /**
     * Convert text to speech with streaming
     *
     * The response body is written to the output file chunk by chunk instead
     * of being buffered in memory first.
     * @param text Text to convert to speech
     * @param options TTS options (`outputDir`, `outputFile`, `onWord`,
     *   `returnWordBoundaries`, `onEnd`)
     * @returns Promise resolving to the path of the generated audio file
     * @throws Re-throws any error (request, read or file write) after logging it
     */
    async textToSpeechStreaming(text, options = {}) {
        try {
            const outputPath = prepareOutputPath(options, `openai-streaming-output.${this.responseFormat}`);
            const response = await requestSpeech(this, text);
            await writeStreamToFile(response.body, outputPath);
            reportCompletion(this, text, options);
            return outputPath;
        }
        catch (error) {
            console.error("Error converting text to speech with streaming:", error);
            throw error;
        }
    }
    /**
     * Convert SSML to speech (not supported by OpenAI)
     * @param _ssml SSML to convert to speech (unused)
     * @param _options TTS options (unused)
     * @throws Always, because SSML is not supported
     */
    async ssmlToSpeech(_ssml, _options = {}) {
        throw new Error("SSML is not supported by OpenAI TTS");
    }
    /**
     * Convert SSML to speech with streaming (not supported by OpenAI)
     * @param _ssml SSML to convert to speech (unused)
     * @param _options TTS options (unused)
     * @throws Always, because SSML is not supported
     */
    async ssmlToSpeechStreaming(_ssml, _options = {}) {
        throw new Error("SSML is not supported by OpenAI TTS");
    }
    /**
     * Synthesize text to audio bytes
     * @param text Text to synthesize (a string, or an array of strings joined with spaces)
     * @param _options Synthesis options (unused)
     * @returns Promise resolving to audio bytes
     * @throws Re-throws any error after logging it
     */
    async synthToBytes(text, _options = {}) {
        try {
            const response = await requestSpeech(this, normalizeInput(text));
            // Wrap the ArrayBuffer directly; no intermediate Buffer copy is needed.
            return new Uint8Array(await response.arrayBuffer());
        }
        catch (error) {
            console.error("Error converting text to speech bytes:", error);
            throw error;
        }
    }
    /**
     * Synthesize text to audio byte stream
     * @param text Text to synthesize (a string, or an array of strings joined with spaces)
     * @param _options Synthesis options (unused)
     * @returns Promise resolving to audio byte stream (the response body)
     * @throws Re-throws any error after logging it
     */
    async synthToBytestream(text, _options = {}) {
        try {
            const response = await requestSpeech(this, normalizeInput(text));
            return response.body;
        }
        catch (error) {
            console.error("Error converting text to speech stream:", error);
            throw error;
        }
    }
}
exports.OpenAITTSClient = OpenAITTSClient;