js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
490 lines (489 loc) • 17.1 kB
JavaScript
"use strict";
// Module-interop helper (looks like the standard TypeScript-emitted `__createBinding`;
// an already-present `this.__createBinding` is reused instead of redefining it).
// Exposes property `k` of module `m` on object `o` under the name `k2` (defaults to `k`).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Fall back to a live getter when `m` has no own descriptor for `k`, when the descriptor
// is an accessor but `m` is not flagged `__esModule`, or when it is a writable or
// configurable data property. The getter re-reads `m[k]` on every access.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback used when `Object.create` is unavailable: copy the current value once.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores `v` as the `default` property of `o`.
// An already-present `this.__setModuleDefault` is reused instead of redefining it.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
// Defined as an enumerable property; writable/configurable are left at their
// `defineProperty` defaults.
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
// Fallback used when `Object.create` is unavailable: plain assignment.
o["default"] = v;
});
// Module-interop helper: returns a namespace-style object for a required module.
// An already-present `this.__importStar` is reused instead of redefining it.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call with `Object.getOwnPropertyNames`, or with a
// `for...in` + `hasOwnProperty` fallback when that function is missing, then delegates.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged `__esModule` are returned untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key except "default" onto the result, then expose the module
// itself as `result.default`.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Flag the CommonJS `exports` object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export slot up front; it is assigned the class at the end of the file.
exports.OpenAITTSClient = void 0;
const fs = __importStar(require("node:fs"));
const path = __importStar(require("node:path"));
const abstract_tts_1 = require("../core/abstract-tts");
const word_timing_estimator_1 = require("../utils/word-timing-estimator");
/**
 * Stand-in for the `OpenAI` client class.
 *
 * Exposes only `models.list()` and `audio.speech.create()`. Both resolve to
 * empty results: an empty model list, a zero-length ArrayBuffer and an
 * already-closed ReadableStream body.
 */
class MockOpenAI {
    /**
     * @param _options Accepted for signature compatibility; never read
     */
    constructor(_options) {
        this.models = {
            async list() {
                return { data: [] };
            },
        };
        this.audio = {
            speech: {
                async create() {
                    // A new, immediately closed stream is built for every call
                    const emptyBody = new ReadableStream({
                        start(controller) {
                            controller.close();
                        },
                    });
                    return {
                        async arrayBuffer() {
                            return new ArrayBuffer(0);
                        },
                        body: emptyBody,
                    };
                },
            },
        };
    }
}
// Cached result of the lookup below: the real `OpenAI` export, or MockOpenAI
// when requiring the openai package throws.
let OpenAIClass;
// Flips to true after the first lookup so the require is attempted only once.
let openaiPackageLoaded = false;
/**
 * Resolve the OpenAI client class on first use and cache it.
 * @returns The `OpenAI` export of the openai package, or MockOpenAI if requiring it throws
 */
function getOpenAIClass() {
    if (openaiPackageLoaded) {
        return OpenAIClass;
    }
    try {
        // eslint-disable-next-line @typescript-eslint/no-var-requires
        OpenAIClass = require("openai").OpenAI;
    }
    catch (_error) {
        console.warn("OpenAI package not found, using mock implementation");
        OpenAIClass = MockOpenAI;
    }
    openaiPackageLoaded = true;
    return OpenAIClass;
}
/**
 * Resolve the file path that synthesized audio is written to, creating the
 * output directory first when it does not exist yet.
 * @param options TTS options; `outputDir` (default ".") and `outputFile` are read
 * @param defaultFileName File name used when `options.outputFile` is not set
 * @returns Output directory joined with the output file name
 */
function resolveOutputPath(options, defaultFileName) {
    const outputDir = options.outputDir || ".";
    if (!fs.existsSync(outputDir)) {
        fs.mkdirSync(outputDir, { recursive: true });
    }
    return path.join(outputDir, options.outputFile || defaultFileName);
}
/**
 * Build the payload passed to `client.audio.speech.create`.
 * @param ttsClient Client whose model, voice, instructions and response format are used
 * @param input Text to synthesize
 * @returns Request payload object
 */
function buildSpeechRequest(ttsClient, input) {
    return {
        model: ttsClient.model,
        voice: ttsClient.voice,
        input,
        // An empty instruction string is sent as `undefined`
        instructions: ttsClient.instructions || undefined,
        response_format: ttsClient.responseFormat,
    };
}
/**
 * Estimate word boundaries for `text` and deliver them as requested by `options`.
 * Nothing is computed unless `options.onWord` or `options.returnWordBoundaries` is set.
 * @param ttsClient Client that stores the boundaries when `returnWordBoundaries` is set
 * @param text Text that was synthesized
 * @param options TTS options; `onWord` and `returnWordBoundaries` are read
 */
function notifyWordBoundaries(ttsClient, text, options) {
    if (!options.onWord && !options.returnWordBoundaries) {
        return;
    }
    const wordBoundaries = (0, word_timing_estimator_1.estimateWordBoundaries)(text);
    if (options.onWord) {
        for (const boundary of wordBoundaries) {
            options.onWord(boundary);
        }
    }
    if (options.returnWordBoundaries) {
        ttsClient.setLastWordBoundaries(wordBoundaries);
    }
}
/**
 * Copy every chunk of a stream to a file and wait until the file is flushed.
 * @param stream Stream exposing `getReader()` (the response body)
 * @param outputPath Path of the file to write
 * @returns Promise resolving once the write stream has finished
 * @throws The read error or the write-stream error, whichever occurs first
 */
async function writeStreamToFile(stream, outputPath) {
    const writer = fs.createWriteStream(outputPath);
    // Listeners are attached before the first write: the write stream can emit
    // 'error' (e.g. the file cannot be opened) while chunks are still being read,
    // and an 'error' event without a listener is thrown as an uncaught exception.
    const completion = new Promise((resolve, reject) => {
        writer.on("finish", resolve);
        writer.on("error", reject);
    });
    // Mark the promise as handled so an early write failure is not reported as an
    // unhandled rejection; the failure is still surfaced by the awaits below.
    completion.catch(() => { });
    const reader = stream.getReader();
    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done) {
                break;
            }
            if (!writer.write(value)) {
                // Internal buffer is full: wait for it to drain, or for a write failure
                await Promise.race([
                    new Promise((resolve) => writer.once("drain", resolve)),
                    completion,
                ]);
            }
        }
        writer.end();
    }
    catch (error) {
        // Release the file handle instead of leaving the write stream open
        writer.destroy();
        throw error;
    }
    finally {
        reader.releaseLock();
    }
    await completion;
}
/**
 * OpenAI TTS Client
 *
 * Converts text to speech through `client.audio.speech.create`.
 * Defaults: model "gpt-4o-mini-tts", voice "coral", response format "mp3".
 * SSML is not supported (the SSML methods always throw).
 * Word boundaries are estimated from the text rather than reported by the API.
 */
class OpenAITTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * Create a new OpenAI TTS Client
     * @param credentials OpenAI API credentials (`apiKey`, `baseURL`, `organization`);
     *   when `apiKey` is missing or empty, `process.env.OPENAI_API_KEY` is used
     */
    constructor(credentials = {}) {
        super(credentials);
        // Define the instance fields as own data properties, in a fixed order
        for (const field of ["client", "model", "voice", "instructions", "responseFormat"]) {
            Object.defineProperty(this, field, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0
            });
        }
        Object.defineProperty(this, "lastWordBoundaries", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: []
        });
        // Initialize OpenAI client
        const OpenAIClass = getOpenAIClass();
        this.client = new OpenAIClass({
            apiKey: credentials.apiKey || process.env.OPENAI_API_KEY,
            baseURL: credentials.baseURL,
            organization: credentials.organization,
        });
        // Set default values
        this.model = "gpt-4o-mini-tts";
        this.voice = "coral";
        this.instructions = "";
        this.responseFormat = "mp3";
    }
    /**
     * Get the last word boundaries
     * @returns Array of word boundary objects stored by the most recent call
     *   that requested `returnWordBoundaries` (empty array initially)
     */
    getLastWordBoundaries() {
        return this.lastWordBoundaries;
    }
    /**
     * Set the last word boundaries
     * @param wordBoundaries Array of word boundary objects
     */
    setLastWordBoundaries(wordBoundaries) {
        this.lastWordBoundaries = wordBoundaries;
    }
    /**
     * Check if the credentials are valid by listing the available models
     * @returns Promise resolving to true if the request succeeds, false otherwise
     */
    async checkCredentials() {
        try {
            await this.client.models.list();
            return true;
        }
        catch (error) {
            console.error("Error checking OpenAI credentials:", error);
            return false;
        }
    }
    /**
     * Get available voices
     * @returns Promise resolving to an array of unified voice objects
     */
    async _getVoices() {
        // The voice list is hard-coded here; it is not fetched from the API
        const voices = [
            { id: "alloy", name: "Alloy", gender: "Unknown" },
            { id: "ash", name: "Ash", gender: "Male" },
            { id: "ballad", name: "Ballad", gender: "Male" },
            { id: "coral", name: "Coral", gender: "Female" },
            { id: "echo", name: "Echo", gender: "Male" },
            { id: "fable", name: "Fable", gender: "Female" },
            { id: "onyx", name: "Onyx", gender: "Male" },
            { id: "nova", name: "Nova", gender: "Female" },
            { id: "sage", name: "Sage", gender: "Male" },
            { id: "shimmer", name: "Shimmer", gender: "Female" },
        ];
        return this._mapVoicesToUnified(voices);
    }
    /**
     * Map OpenAI voice objects to unified format
     * @param rawVoices Array of OpenAI voice objects (`id`, `name`, `gender`)
     * @returns Promise resolving to an array of unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => {
            // Every voice is reported with the same single language entry
            const languageCode = {
                bcp47: "en-US",
                iso639_3: "eng",
                display: "English (US)",
            };
            return {
                id: voice.id,
                name: voice.name,
                gender: voice.gender,
                provider: "openai",
                languageCodes: [languageCode],
            };
        });
    }
    /**
     * Set the voice to use for synthesis
     * @param voiceId Voice ID to use
     */
    setVoice(voiceId) {
        this.voice = voiceId;
    }
    /**
     * Set the model to use for synthesis
     * @param model Model ID to use
     */
    setModel(model) {
        this.model = model;
    }
    /**
     * Set instructions for the TTS engine
     * @param instructions Instructions for the TTS engine
     */
    setInstructions(instructions) {
        this.instructions = instructions;
    }
    /**
     * Set the response format
     * @param format Response format (mp3, opus, aac, flac, wav, pcm)
     */
    setResponseFormat(format) {
        this.responseFormat = format;
    }
    /**
     * Get a property value
     * @param property Property name; "model", "voice", "instructions" and
     *   "responseFormat" are handled here, anything else by the base class
     * @returns Property value
     */
    getProperty(property) {
        switch (property) {
            case "model":
                return this.model;
            case "voice":
                return this.voice;
            case "instructions":
                return this.instructions;
            case "responseFormat":
                return this.responseFormat;
            default:
                return super.getProperty(property);
        }
    }
    /**
     * Set a property value
     * @param property Property name; "model", "voice", "instructions" and
     *   "responseFormat" are handled here, anything else by the base class
     * @param value Property value
     */
    setProperty(property, value) {
        switch (property) {
            case "model":
                this.setModel(value);
                break;
            case "voice":
                this.setVoice(value);
                break;
            case "instructions":
                this.setInstructions(value);
                break;
            case "responseFormat":
                this.setResponseFormat(value);
                break;
            default:
                super.setProperty(property, value);
                break;
        }
    }
    /**
     * Convert text to speech and write the audio to a file
     * @param text Text to convert to speech
     * @param options TTS options (`outputDir`, `outputFile`, `onWord`,
     *   `returnWordBoundaries`, `onEnd`)
     * @returns Promise resolving to the path of the generated audio file
     * @throws Rethrows any error raised while requesting or writing the audio
     */
    async textToSpeech(text, options = {}) {
        try {
            const outputPath = resolveOutputPath(options, `openai-output.${this.responseFormat}`);
            // Create speech
            const response = await this.client.audio.speech.create(buildSpeechRequest(this, text));
            // Save to file
            fs.writeFileSync(outputPath, Buffer.from(await response.arrayBuffer()));
            // Word boundaries are reported only after the whole file has been written
            notifyWordBoundaries(this, text, options);
            if (options.onEnd) {
                options.onEnd();
            }
            return outputPath;
        }
        catch (error) {
            console.error("Error converting text to speech:", error);
            throw error;
        }
    }
    /**
     * Convert text to speech, writing the response body to a file chunk by chunk
     * @param text Text to convert to speech
     * @param options TTS options (`outputDir`, `outputFile`, `onWord`,
     *   `returnWordBoundaries`, `onEnd`)
     * @returns Promise resolving to the path of the generated audio file
     * @throws Rethrows any error raised while requesting, reading or writing the audio
     */
    async textToSpeechStreaming(text, options = {}) {
        try {
            const outputPath = resolveOutputPath(options, `openai-streaming-output.${this.responseFormat}`);
            // Create speech with streaming
            const response = await this.client.audio.speech.create(buildSpeechRequest(this, text));
            // Copy the body to disk; read and write failures both reject here
            await writeStreamToFile(response.body, outputPath);
            notifyWordBoundaries(this, text, options);
            if (options.onEnd) {
                options.onEnd();
            }
            return outputPath;
        }
        catch (error) {
            console.error("Error converting text to speech with streaming:", error);
            throw error;
        }
    }
    /**
     * Convert SSML to speech (not supported by OpenAI)
     * @param _ssml SSML to convert to speech (ignored)
     * @param _options TTS options (ignored)
     * @throws Always, because SSML is not supported
     */
    async ssmlToSpeech(_ssml, _options = {}) {
        throw new Error("SSML is not supported by OpenAI TTS");
    }
    /**
     * Convert SSML to speech with streaming (not supported by OpenAI)
     * @param _ssml SSML to convert to speech (ignored)
     * @param _options TTS options (ignored)
     * @throws Always, because SSML is not supported
     */
    async ssmlToSpeechStreaming(_ssml, _options = {}) {
        throw new Error("SSML is not supported by OpenAI TTS");
    }
    /**
     * Synthesize text to audio bytes
     * @param text Text to synthesize; a non-string value is joined with spaces
     * @param _options Synthesis options (currently unused)
     * @returns Promise resolving to audio bytes as a Uint8Array
     * @throws Rethrows any error raised while requesting the audio
     */
    async synthToBytes(text, _options = {}) {
        try {
            const input = typeof text === "string" ? text : text.join(" ");
            // Create speech
            const response = await this.client.audio.speech.create(buildSpeechRequest(this, input));
            // Convert to bytes
            const buffer = Buffer.from(await response.arrayBuffer());
            return new Uint8Array(buffer);
        }
        catch (error) {
            console.error("Error converting text to speech bytes:", error);
            throw error;
        }
    }
    /**
     * Synthesize text to audio byte stream
     * @param text Text to synthesize; an array is joined with spaces, matching synthToBytes
     * @param _options Synthesis options (currently unused)
     * @returns Promise resolving to the response body stream
     * @throws Rethrows any error raised while requesting the audio
     */
    async synthToBytestream(text, _options = {}) {
        try {
            const input = Array.isArray(text) ? text.join(" ") : text;
            // Create speech with streaming
            const response = await this.client.audio.speech.create(buildSpeechRequest(this, input));
            // Return the stream
            return response.body;
        }
        catch (error) {
            console.error("Error converting text to speech stream:", error);
            throw error;
        }
    }
}
// Publish the client class as the module's `OpenAITTSClient` export.
exports.OpenAITTSClient = OpenAITTSClient;