// js-tts-wrapper: a JavaScript/TypeScript library that provides a unified API
// for working with multiple cloud-based Text-to-Speech (TTS) services.
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.OpenAITTSClient = void 0;
// Node-only imports moved inside Node-only code paths below for browser compatibility.
const abstract_tts_1 = require("../core/abstract-tts");
const word_timing_estimator_1 = require("../utils/word-timing-estimator");
// Mock implementation of the OpenAI client, used as a fallback when the real
// SDK cannot be loaded (browser environments or a missing "openai" package).
class MockOpenAI {
// Constructor accepts options but doesn't use them
constructor(_options) {
Object.defineProperty(this, "models", {
enumerable: true,
configurable: true,
writable: true,
value: {
list: async () => ({ data: [] }),
}
});
Object.defineProperty(this, "audio", {
enumerable: true,
configurable: true,
writable: true,
value: {
speech: {
create: async () => ({
arrayBuffer: async () => new ArrayBuffer(0),
body: new ReadableStream({
start(controller) {
controller.close();
},
}),
}),
},
}
});
}
}
/**
* OpenAI TTS Client
*
* This client uses the OpenAI API to convert text to speech.
* It supports streaming audio but does not support SSML.
* Word boundaries are estimated since OpenAI doesn't provide word events.
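 *
 * A minimal usage sketch (an assumption: that this client is re-exported from
 * the package root; run inside an async function):
 *
 * @example
 * const { OpenAITTSClient } = require("js-tts-wrapper");
 * const tts = new OpenAITTSClient({ apiKey: process.env.OPENAI_API_KEY });
 * tts.setVoice("nova");
 * const bytes = await tts.synthToBytes("Hello world"); // Uint8Array, mp3 by default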
*/
class OpenAITTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Get the last word boundaries
* @returns Array of word boundary objects
*/
getLastWordBoundaries() {
return this.lastWordBoundaries;
}
/**
* Set the last word boundaries
* @param wordBoundaries Array of word boundary objects
*/
setLastWordBoundaries(wordBoundaries) {
this.lastWordBoundaries = wordBoundaries;
}
/**
* Create a new OpenAI TTS Client
* @param credentials OpenAI API credentials
*/
constructor(credentials = {}) {
super(credentials);
// Use 'any' for client to accommodate both real and mock SDK types easily
Object.defineProperty(this, "client", {
enumerable: true,
configurable: true,
writable: true,
value: null
});
Object.defineProperty(this, "clientLoadingPromise", {
enumerable: true,
configurable: true,
writable: true,
value: null
});
// Make credentials protected to match base class expectations
Object.defineProperty(this, "credentials", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "model", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "voice", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "instructions", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "responseFormat", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "lastWordBoundaries", {
enumerable: true,
configurable: true,
writable: true,
value: []
});
this.credentials = credentials;
// Don't initialize client here, load it on demand
// Set default values
this.model = "tts-1"; // Default model
this.voice = "alloy"; // Default voice
this.instructions = "";
this.responseFormat = "mp3"; // Default format
}
/**
* Load the OpenAI SDK dynamically.
* Returns the initialized client (real or mock).
*/
async loadClient() {
if (this.client) {
return this.client;
}
if (this.clientLoadingPromise) {
const client = await this.clientLoadingPromise;
if (client)
return client;
console.warn("Client loading promise resolved unexpectedly to null, using mock.");
this.client = new MockOpenAI();
return this.client;
}
// Only attempt dynamic import in Node.js environment
if (typeof window !== "undefined") {
console.warn("OpenAI SDK dynamic import skipped in browser environment, using mock.");
this.client = new MockOpenAI();
return this.client;
}
this.clientLoadingPromise = Promise.resolve().then(() => __importStar(require("openai"))).then((openaiModule) => {
const OpenAIClass = openaiModule.OpenAI;
this.client = new OpenAIClass({
apiKey: this.credentials.apiKey || process.env.OPENAI_API_KEY,
baseURL: this.credentials.baseURL,
organization: this.credentials.organization,
});
console.log("OpenAI SDK loaded successfully.");
return this.client;
})
.catch((_error) => {
console.warn("OpenAI package not found or failed to load, using mock implementation.");
this.client = new MockOpenAI();
return this.client; // Return the mock client
})
.finally(() => {
this.clientLoadingPromise = null; // Clear promise once settled (success or fail)
});
// Wait for the promise to resolve and return the client (could be real or mock)
return this.clientLoadingPromise;
}
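// Note: when the mock client is in use, speech calls return empty audio
// (a zero-length ArrayBuffer / an immediately-closed stream) and
// checkCredentials() below returns false, so callers can detect the fallback.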
/**
* Check if the credentials are valid
* @returns Promise resolving to true if credentials are valid, false otherwise
*/
async checkCredentials() {
try {
const client = await this.loadClient();
if (client instanceof MockOpenAI) {
console.warn("Cannot check credentials with mock OpenAI client.");
return false; // Cannot validate with mock
}
// Try to list models to check if the real API key is valid
await client.models.list();
return true;
}
catch (error) {
console.error("Error checking OpenAI credentials:", error);
return false;
}
}
/**
* Get available voices
* @returns Promise resolving to an array of unified voice objects
*/
async _getVoices() {
// OpenAI has a fixed set of voices
const voices = [
{ id: "alloy", name: "Alloy", gender: "Unknown" },
{ id: "ash", name: "Ash", gender: "Male" },
{ id: "ballad", name: "Ballad", gender: "Male" },
{ id: "coral", name: "Coral", gender: "Female" },
{ id: "echo", name: "Echo", gender: "Male" },
{ id: "fable", name: "Fable", gender: "Female" },
{ id: "onyx", name: "Onyx", gender: "Male" },
{ id: "nova", name: "Nova", gender: "Female" },
{ id: "sage", name: "Sage", gender: "Male" },
{ id: "shimmer", name: "Shimmer", gender: "Female" },
];
return this._mapVoicesToUnified(voices);
}
/**
* Map OpenAI voice objects to unified format
* @param rawVoices Array of OpenAI voice objects
* @returns Promise resolving to an array of unified voice objects
*/
async _mapVoicesToUnified(rawVoices) {
return rawVoices.map((voice) => {
// Create language code object
const languageCode = {
bcp47: "en-US",
iso639_3: "eng",
display: "English (US)",
};
return {
id: voice.id,
name: voice.name,
gender: voice.gender,
provider: "openai",
languageCodes: [languageCode],
};
});
}
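// Shape of one mapped entry, for reference (values from the fixed list above):
// { id: "alloy", name: "Alloy", gender: "Unknown", provider: "openai",
//   languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }] }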
/**
* Set the voice to use for synthesis
* @param voiceId Voice ID to use
*/
setVoice(voiceId) {
this.voice = voiceId;
}
/**
* Set the model to use for synthesis
* @param model Model ID to use
*/
setModel(model) {
this.model = model;
}
/**
* Set instructions for the TTS engine
* @param instructions Instructions for the TTS engine
*/
setInstructions(instructions) {
this.instructions = instructions;
}
/**
* Set the response format
* @param format Response format (mp3, opus, aac, flac, wav, pcm)
*/
setResponseFormat(format) {
this.responseFormat = format;
}
/**
* Get a property value
* @param property Property name
* @returns Property value
*/
getProperty(property) {
switch (property) {
case "model":
return this.model;
case "voice":
return this.voice;
case "instructions":
return this.instructions;
case "responseFormat":
return this.responseFormat;
default:
return super.getProperty(property);
}
}
/**
* Set a property value
* @param property Property name
* @param value Property value
*/
setProperty(property, value) {
switch (property) {
case "model":
this.setModel(value);
break;
case "voice":
this.setVoice(value);
break;
case "instructions":
this.setInstructions(value);
break;
case "responseFormat":
this.setResponseFormat(value);
break;
default:
super.setProperty(property, value);
break;
}
}
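// Sketch: the generic property API mirrors the dedicated setters above.
// tts.setProperty("responseFormat", "wav");
// tts.getProperty("voice"); // "alloy" unless setVoice() has been called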
/**
* Convert text to speech
* @param text Text to convert to speech
* @param options TTS options
* @returns Promise resolving to the path of the generated audio file
*/
async textToSpeech(text, options = {}) {
if (typeof window !== "undefined") {
throw new Error("textToSpeech with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead.");
}
// Node.js only
const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
const path = await Promise.resolve().then(() => __importStar(require("node:path")));
try {
// Create output directory if it doesn't exist
const outputDir = options.outputDir || ".";
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
// Generate output file path
const outputFile = options.outputFile || `openai-output.${this.responseFormat}`;
const outputPath = path.join(outputDir, outputFile);
// Synthesize audio
const audioBytes = await this.synthToBytes(text, options);
// Write audio to file
fs.writeFileSync(outputPath, audioBytes);
// Estimate word boundaries
if (options.returnWordBoundaries) {
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
this.setLastWordBoundaries(wordBoundaries);
}
// Call onEnd callback
if (options.onEnd) {
options.onEnd();
}
return outputPath;
}
catch (error) {
console.error("Error converting text to speech:", error);
throw error;
}
}
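// Sketch (Node-only): writes out/hello.mp3 and resolves with that path.
// const file = await tts.textToSpeech("Hello", {
//   outputDir: "out",
//   outputFile: "hello.mp3",
//   returnWordBoundaries: true, // estimates retrievable via getLastWordBoundaries()
// });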
/**
* Convert text to speech with streaming
* @param text Text to convert to speech
* @param options TTS options
* @returns Promise resolving to the path of the generated audio file
*/
async textToSpeechStreaming(text, options = {}) {
if (typeof window !== "undefined") {
throw new Error("textToSpeechStreaming with file output is not supported in the browser. Use synthToBytes or synthToBytestream instead.");
}
const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
const path = await Promise.resolve().then(() => __importStar(require("node:path")));
try {
// Create output directory if it doesn't exist
const outputDir = options.outputDir || ".";
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
// Generate output file path
const outputFile = options.outputFile || `openai-streaming-output.${this.responseFormat}`;
const outputPath = path.join(outputDir, outputFile);
// Create speech with streaming (load the client first; this.client may still be null)
const client = await this.loadClient();
const response = await client.audio.speech.create({
model: this.model,
voice: this.voice,
input: text,
instructions: this.instructions || undefined,
response_format: this.responseFormat,
});
// Get the stream
const stream = response.body;
// Create a writable stream to the output file
const writer = fs.createWriteStream(outputPath);
// Pipe the stream to the file
const reader = stream.getReader();
try {
while (true) {
const { done, value } = await reader.read();
if (done)
break;
writer.write(value);
}
}
finally {
reader.releaseLock();
}
// Close the writer
writer.end();
// Wait for the file to be written
await new Promise((resolve, reject) => {
writer.on("finish", resolve);
writer.on("error", reject);
});
// Estimate word boundaries
if (options.onWord || options.returnWordBoundaries) {
const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
// Call onWord callback for each word
if (options.onWord) {
for (const wb of wordBoundaries) {
options.onWord(wb);
}
}
// Store word boundaries if requested
if (options.returnWordBoundaries) {
this.setLastWordBoundaries(wordBoundaries);
}
}
// Call onEnd callback
if (options.onEnd) {
options.onEnd();
}
return outputPath;
}
catch (error) {
console.error("Error converting text to speech with streaming:", error);
throw error;
}
}
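// Sketch: streaming to file with estimated word callbacks (OpenAI provides
// no real word events, so timings come from the estimator).
// await tts.textToSpeechStreaming("Hello world", {
//   onWord: (wb) => console.log(wb), // one call per estimated boundary
//   onEnd: () => console.log("done"),
// });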
/**
* Convert SSML to speech (not supported by OpenAI)
* @param _ssml SSML to convert to speech (unused)
* @param _options TTS options (unused)
* @throws Error because SSML is not supported by the OpenAI API
*/
async ssmlToSpeech(_ssml, _options = {}) {
throw new Error("SSML is not supported by OpenAI TTS");
}
/**
* Convert SSML to speech with streaming (not supported by OpenAI)
* @param _ssml SSML to convert to speech (unused)
* @param _options TTS options (unused)
* @throws Error because SSML is not supported by the OpenAI API
*/
async ssmlToSpeechStreaming(_ssml, _options = {}) {
throw new Error("SSML is not supported by OpenAI TTS");
}
/**
* Synthesize text to audio bytes
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
*/
async synthToBytes(text, options = {}) {
try {
const client = await this.loadClient();
const params = {
model: options.model || this.model,
voice: options.voice || this.voice,
input: typeof text === "string" ? text : text.join(" "),
instructions: this.instructions || undefined,
response_format: options.format || this.responseFormat,
// Map rate to speed if provided (options.speed takes precedence over options.rate)
speed: options.speed ?? options.rate,
};
// Use the initialized client (could be mock or real)
const response = await client.audio.speech.create(params);
const arrayBuffer = await response.arrayBuffer();
return new Uint8Array(arrayBuffer);
}
catch (error) {
console.error("Error converting text to speech bytes:", error);
throw error;
}
}
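// Sketch: per-call overrides; when both are set, `speed` takes precedence over `rate`.
// const wav = await tts.synthToBytes("Hi there", { voice: "echo", format: "wav", speed: 1.25 });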
/**
* Synthesize text to a byte stream using OpenAI API.
* @param text Text to synthesize.
* @param _options Synthesis options (currently unused for streaming, uses defaults).
* @returns Promise resolving to an object containing the audio stream and an empty word boundaries array.
*/
async synthToBytestream(text, _options) {
try {
const client = await this.loadClient();
// Use the initialized client (could be mock or real)
const response = await client.audio.speech.create({
model: this.model,
voice: this.voice,
input: text,
instructions: this.instructions || undefined,
response_format: this.responseFormat,
});
// Get the stream
const stream = response.body;
// Return the stream and an empty word boundaries array
return { audioStream: stream, wordBoundaries: [] };
}
catch (error) {
console.error("Error converting text to speech stream:", error);
throw error;
}
}
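// Sketch: consuming the returned stream (the SDK's response.body, a WHATWG
// ReadableStream) with a reader, mirroring textToSpeechStreaming above.
// const { audioStream } = await tts.synthToBytestream("Hello");
// const reader = audioStream.getReader();
// for (let r = await reader.read(); !r.done; r = await reader.read()) {
//   /* r.value is a Uint8Array chunk */
// }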
}
exports.OpenAITTSClient = OpenAITTSClient;