js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
511 lines (510 loc) • 21.5 kB
JavaScript
/**
 * Re-export helper: exposes `source[key]` on `target` under `alias`
 * (or under `key` when no alias is given).
 *
 * When property descriptors are usable, the binding is installed with
 * Object.defineProperty. An existing accessor on an `__esModule` source, or a
 * data property that is neither writable nor configurable, is copied as-is;
 * every other case gets a getter that reads `source[key]` on each access.
 * Without Object.create the value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        const needsGetter = !descriptor
            || ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                },
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        target[alias === undefined ? key : alias] = source[key];
    });
/**
 * Interop helper: stores `value` as the `default` member of `target`.
 * Uses an enumerable defineProperty when Object.create exists, and a plain
 * assignment otherwise.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        const descriptor = { enumerable: true, value: value };
        Object.defineProperty(target, "default", descriptor);
    }
    : function (target, value) {
        target["default"] = value;
    });
/**
 * Interop helper used for `import * as ns` style imports.
 *
 * A module flagged with `__esModule` is returned unchanged. Anything else is
 * wrapped in a fresh namespace object: every own key except "default" is
 * bound through __createBinding, and the module itself is stored as `default`
 * through __setModuleDefault.
 */
var __importStar = (this && this.__importStar) || (function () {
    // The key-listing function is chosen on first use and then reused.
    let resolvedOwnKeys = null;
    const listOwnKeys = function (obj) {
        if (resolvedOwnKeys === null) {
            resolvedOwnKeys = Object.getOwnPropertyNames || function (o) {
                var collected = [];
                for (var name in o) {
                    if (Object.prototype.hasOwnProperty.call(o, name)) {
                        collected[collected.length] = name;
                    }
                }
                return collected;
            };
        }
        return resolvedOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const namespace = {};
        if (mod != null) {
            for (const key of listOwnKeys(mod)) {
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Flag this CommonJS module as a transpiled ES module; __importStar returns
// modules carrying this flag unchanged instead of wrapping them.
Object.defineProperty(exports, "__esModule", { value: true });
// Reserve the export slot; the class is assigned to it at the end of the file.
exports.GoogleTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
/**
 * Google Cloud Text-to-Speech client.
 *
 * In Node.js the `@google-cloud/text-to-speech` package is loaded lazily; when
 * it cannot be loaded the instance stays usable but `client` remains `null`,
 * voice listing returns an empty array and synthesis throws. When the package
 * exposes `v1beta1`, SSML `<mark>` tags are inserted before each word so that
 * the beta API can report word timepoints; otherwise word timings are
 * estimated by the base class.
 */
class GoogleTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * Create a new Google TTS client
     * @param credentials Google Cloud credentials (`projectId`, `credentials`
     *   and/or `keyFilename` are forwarded to the Google client)
     */
    constructor(credentials) {
        super(credentials);
        /**
         * Google Cloud Text-to-Speech client (null until initialised, or when unavailable)
         */
        Object.defineProperty(this, "client", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Whether to use the beta API for word timings
         */
        Object.defineProperty(this, "useBetaApi", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false
        });
        /**
         * Google Cloud credentials
         */
        Object.defineProperty(this, "googleCredentials", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        /**
         * Cached v1beta1 client, created on first use.
         * Non-enumerable so the set of enumerable instance keys is unchanged.
         */
        Object.defineProperty(this, "betaClient", {
            enumerable: false,
            configurable: true,
            writable: true,
            value: null
        });
        /**
         * Promise that settles once client initialisation has finished.
         * Public methods await it so that calls made right after construction
         * do not observe a not-yet-initialised client.
         */
        Object.defineProperty(this, "clientReady", {
            enumerable: false,
            configurable: true,
            writable: true,
            value: void 0
        });
        // Store the credentials for later use
        this.googleCredentials = credentials;
        this.client = null;
        // Initialise the client only in a Node.js environment; in a browser the
        // client stays null (REST support to be implemented as needed).
        this.clientReady = typeof window === "undefined"
            ? this.initializeClient(credentials)
            : Promise.resolve();
    }
    /**
     * Initialize the Google TTS client.
     *
     * Never rejects: any failure is logged and leaves `client` set to null.
     * @param credentials Google TTS credentials
     */
    async initializeClient(credentials) {
        // A cached beta client may belong to a previous initialisation.
        this.betaClient = null;
        try {
            // Try to load the Google Cloud Text-to-Speech client (Node.js only)
            const ttsModule = await Promise.resolve().then(() => __importStar(require("@google-cloud/text-to-speech")));
            const { TextToSpeechClient } = ttsModule;
            this.client = new TextToSpeechClient({
                projectId: credentials.projectId,
                credentials: credentials.credentials,
                keyFilename: credentials.keyFilename,
            });
            // The beta API is needed for word timings
            if (ttsModule.v1beta1) {
                this.useBetaApi = true;
            }
            else {
                console.warn("Google Cloud Text-to-Speech beta API not available. Word timing will be estimated.");
            }
        }
        catch (error) {
            // In test mode, we'll just log a warning instead of an error
            if (process.env.NODE_ENV === "test") {
                console.warn("Google TTS client not initialized in test mode. Some tests may be skipped.");
            }
            else {
                console.error("Error initializing Google TTS client:", error);
                console.warn("Google TTS will not be available. Install @google-cloud/text-to-speech to use this engine.");
            }
            this.client = null;
        }
    }
    /**
     * Return the cached v1beta1 client, creating it on first use.
     * @returns Promise resolving to the beta TextToSpeechClient
     * @throws When the package or its `v1beta1` namespace cannot be loaded
     */
    async _getBetaClient() {
        if (!this.betaClient) {
            const ttsModule = await Promise.resolve().then(() => __importStar(require("@google-cloud/text-to-speech")));
            this.betaClient = new ttsModule.v1beta1.TextToSpeechClient({
                projectId: this.googleCredentials.projectId,
                credentials: this.googleCredentials.credentials,
                keyFilename: this.googleCredentials.keyFilename,
            });
        }
        return this.betaClient;
    }
    /**
     * Get available voices from the provider
     * @returns Promise resolving to an array of voice objects; an empty array
     *   when the client is unavailable or the request fails
     */
    async _getVoices() {
        await this.clientReady;
        // If the client is not available, return an empty array
        if (!this.client) {
            return [];
        }
        try {
            const [response] = await this.client.listVoices({});
            return response.voices || [];
        }
        catch (error) {
            console.error("Error getting voices:", error);
            return [];
        }
    }
    /**
     * Convert text to audio bytes
     * @param text Text or SSML to synthesize
     * @param options Synthesis options (`voice`, `format`, `useWordBoundary`,
     *   `useSpeechMarkdown` are read here or in prepareSSML)
     * @returns Promise resolving to audio bytes
     * @throws Error when the client is unavailable; synthesis errors are logged and rethrown
     */
    async synthToBytes(text, options) {
        await this.clientReady;
        // If the client is not available, throw an error
        if (!this.client) {
            throw new Error("Google TTS client is not available. Install @google-cloud/text-to-speech to use this engine.");
        }
        try {
            // Prepare SSML if needed
            const ssml = await this.prepareSSML(text, options);
            // Determine if we should use the beta API for word timings
            const useWordTimings = options?.useWordBoundary && this.useBetaApi;
            const voiceName = options?.voice || this.voiceId;
            // Only voices whose name contains "Standard" or "Wavenet" (or no
            // explicit voice) are sent SSML; everything else gets plain text.
            const supportsSSML = !voiceName || (voiceName.includes("Standard") || voiceName.includes("Wavenet"));
            const inputIsSSML = SSMLUtils.isSSML(ssml);
            // Derive the language code from the voice name when possible
            // (e.g., en-AU-Chirp-HD-D -> en-AU)
            let languageCode = this.lang || "en-US";
            if (voiceName) {
                const parts = voiceName.split("-");
                if (parts.length >= 2) {
                    languageCode = `${parts[0]}-${parts[1]}`;
                }
            }
            let input;
            if (supportsSSML && inputIsSSML) {
                input = { ssml };
            }
            else {
                input = { text: inputIsSSML ? SSMLUtils.stripSSML(ssml) : ssml };
            }
            // Prepare the request
            const request = {
                input,
                voice: {
                    languageCode: languageCode,
                    name: voiceName,
                },
                audioConfig: {
                    audioEncoding: options?.format === "mp3" ? "MP3" : "LINEAR16",
                },
            };
            // Log a warning if SSML is being stripped
            if (!supportsSSML && inputIsSSML) {
                console.warn(`Voice ${voiceName} does not support SSML. Falling back to plain text.`);
            }
            // Add voice gender if no specific voice is set
            if (!options?.voice && !this.voiceId) {
                request.voice.ssmlGender = "NEUTRAL";
            }
            // Synthesize speech
            let response;
            if (useWordTimings) {
                // Use beta API for word timings
                try {
                    const betaClient = await this._getBetaClient();
                    // Timepoints are requested for the <mark> tags added by
                    // addWordTimingMarks; the field is only sent to the beta
                    // client, so the fallback request stays a plain v1 request.
                    [response] = await betaClient.synthesizeSpeech({
                        ...request,
                        enableTimePointing: ["SSML_MARK"],
                    });
                }
                catch (error) {
                    console.warn("Error using beta API for word timings, falling back to standard API:", error);
                    [response] = await this.client.synthesizeSpeech(request);
                }
            }
            else {
                // Use standard API
                [response] = await this.client.synthesizeSpeech(request);
            }
            // Use real timepoints when the response carries any; otherwise
            // (including an empty list, e.g. when the marks were stripped for a
            // non-SSML voice) fall back to estimated timings.
            const timepoints = useWordTimings && response && 'timepoints' in response ? response.timepoints : null;
            if (Array.isArray(timepoints) && timepoints.length > 0) {
                this.processTimepoints(timepoints, text);
            }
            else {
                this._createEstimatedWordTimings(text);
            }
            // Return audio content, ensuring it's a Uint8Array
            return response && response.audioContent ?
                new Uint8Array(response.audioContent) :
                new Uint8Array(0);
        }
        catch (error) {
            console.error("Error synthesizing speech:", error);
            throw error;
        }
    }
    /**
     * Synthesize text to a byte stream
     * @param text Text to synthesize
     * @param options Synthesis options
     * @returns Promise resolving to an object containing the audio stream and word boundaries
     * @throws Error when the client is unavailable; synthesis errors are logged and rethrown
     */
    async synthToBytestream(text, options) {
        await this.clientReady;
        // If the client is not available, throw an error
        if (!this.client) {
            throw new Error("Google TTS client is not available. Install @google-cloud/text-to-speech to use this engine.");
        }
        try {
            // The whole audio is synthesized first and then wrapped in a
            // single-chunk stream.
            const audioBytes = await this.synthToBytes(text, options);
            const stream = new ReadableStream({
                start(controller) {
                    controller.enqueue(audioBytes);
                    controller.close();
                },
            });
            // Always return the structure, populate boundaries only if requested.
            // NOTE(review): the original comment calls the scaled values
            // "100-nanosecond units"; confirm the unit of this.timings against consumers.
            const finalBoundaries = options?.useWordBoundary ? this.timings.map(([start, end, word]) => ({
                text: word,
                offset: Math.round(start * 10000),
                duration: Math.round((end - start) * 10000),
            })) : [];
            return {
                audioStream: stream,
                wordBoundaries: finalBoundaries
            };
        }
        catch (error) {
            console.error("Error synthesizing speech stream:", error);
            throw error;
        }
    }
    /**
     * Start playback with word boundary callbacks
     * @param text Text to speak
     * @param callback Callback function for word boundaries
     * @param options Synthesis options
     */
    async startPlaybackWithCallbacks(text, callback, options) {
        // Register the callback
        this.on("boundary", callback);
        // Enable word boundary information
        const enhancedOptions = { ...options, useWordBoundary: true };
        // Start playback with word boundary information
        await this.speakStreamed(text, enhancedOptions);
    }
    /**
     * Map Google voice objects to unified format
     * @param rawVoices Array of Google voice objects
     * @returns Promise resolving to an array of unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        // Convert Google voices to unified format
        return rawVoices.map((voice) => ({
            id: voice.name,
            name: voice.name || 'Unknown',
            gender: voice.ssmlGender?.toLowerCase() || undefined,
            languageCodes: voice.languageCodes,
            provider: 'google',
            raw: voice, // Keep the original raw voice data
        }));
    }
    /**
     * Prepare SSML for synthesis
     * @param text Text, Speech Markdown or SSML to prepare
     * @param options Synthesis options
     * @returns SSML ready for synthesis
     */
    async prepareSSML(text, options) {
        // Convert from Speech Markdown if requested
        if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(text)) {
            text = await SpeechMarkdown.toSSML(text, "google");
        }
        // If text is already SSML, only the word timing marks are added
        if (SSMLUtils.isSSML(text)) {
            return this.addWordTimingMarks(text);
        }
        // Create SSML from plain text
        let ssml = SSMLUtils.wrapWithSpeakTags(text);
        // Collect prosody attributes for the properties that are set
        const attrs = [];
        if (this.properties.rate) {
            attrs.push(`rate="${this.properties.rate}"`);
        }
        if (this.properties.pitch) {
            attrs.push(`pitch="${this.properties.pitch}"`);
        }
        if (this.properties.volume) {
            attrs.push(`volume="${this.properties.volume}dB"`);
        }
        if (attrs.length > 0) {
            // Wrap exactly the content between the speak tags. A replacer
            // function is used so that the content is located by position
            // (not by searching for its text, which could also match inside
            // the tags) and "$" sequences in the text are kept literally.
            ssml = ssml.replace(/(<speak[^>]*>)(.*?)(<\/speak>)/s, (_match, openTag, content, closeTag) => `${openTag}<prosody ${attrs.join(" ")}>${content}</prosody>${closeTag}`);
        }
        // Add word timing marks
        return this.addWordTimingMarks(ssml);
    }
    /**
     * Add SSML mark tags for word timing.
     *
     * Every whitespace-separated word of the plain text gets a preceding
     * `<mark name="word_N"/>`, numbered from 0 in reading order. For SSML
     * input the marked text is substituted only into a text node that equals
     * the whole plain text; SSML with several text nodes is returned unmarked.
     * @param ssml SSML to add mark tags to
     * @returns SSML with mark tags (the input unchanged when the beta API is
     *   not in use or there are no words)
     */
    addWordTimingMarks(ssml) {
        // Only add marks if using beta API
        if (!this.useBetaApi) {
            return ssml;
        }
        // Extract plain text from SSML
        const plainText = SSMLUtils.stripSSML(ssml);
        // Mark the words in one left-to-right pass. Matching on /\S+/g yields
        // the same words as split(/\s+/) and needs no per-word regular
        // expression, so punctuation, regex metacharacters and repeated words
        // are all handled and already-inserted tags are never re-matched.
        let wordCount = 0;
        const markedText = plainText.replace(/\S+/g, (word) => `<mark name="word_${wordCount++}"/>${word}`);
        // If no words, return original SSML
        if (wordCount === 0) {
            return ssml;
        }
        // Replace content in SSML
        if (SSMLUtils.isSSML(ssml)) {
            const target = plainText.trim();
            return ssml.replace(/>([^<]+)</g, (match, content) => {
                if (content.trim() === target) {
                    return `>${markedText}<`;
                }
                return match;
            });
        }
        // Wrap with speak tags if not already SSML
        return `<speak>${markedText}</speak>`;
    }
    /**
     * Process timepoints from Google TTS response into `this.timings`
     * (`[startTime, endTime, word]` tuples sorted by start time).
     *
     * Only marks named exactly `word_<index>` with an index inside the word
     * list are used; other timepoints are ignored.
     * @param timepoints Timepoints from Google TTS response
     * @param text Original text
     */
    processTimepoints(timepoints, text) {
        // Extract plain text from SSML if needed
        const plainText = SSMLUtils.isSSML(text) ? SSMLUtils.stripSSML(text) : text;
        // Split into words
        const words = plainText.split(/\s+/).filter((word) => word.length > 0);
        const markPattern = /^word_(\d+)$/;
        // Create word timings from timepoints
        this.timings = [];
        for (let i = 0; i < timepoints.length; i++) {
            const timepoint = timepoints[i];
            const parsed = markPattern.exec(String(timepoint.markName ?? ""));
            if (!parsed) {
                continue;
            }
            const wordIndex = Number.parseInt(parsed[1], 10);
            if (wordIndex >= words.length) {
                continue;
            }
            const word = words[wordIndex];
            const startTime = timepoint.timeSeconds;
            // End time is the next timepoint's start, or for the last
            // timepoint an estimate of 0.1s per character.
            const endTime = i < timepoints.length - 1
                ? timepoints[i + 1].timeSeconds
                : startTime + word.length * 0.1;
            this.timings.push([startTime, endTime, word]);
        }
        // Sort timings by start time
        this.timings.sort((a, b) => a[0] - b[0]);
    }
    /**
     * Check if credentials are valid
     *
     * Without an initialised client this only checks that a credentials file
     * exists (configured `keyFilename`, `GOOGLE_APPLICATION_CREDENTIALS` or
     * `GOOGLE_SA_PATH`); with a client it defers to the base implementation.
     * @returns Promise resolving to true if credentials are valid
     */
    async checkCredentials() {
        await this.clientReady;
        // If the client is not available, check if the credentials file exists
        if (!this.client) {
            try {
                // Only import fs in Node.js environment
                if (typeof window === "undefined") {
                    const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
                    const credentials = this.credentials;
                    // Check if the keyFilename exists
                    if (credentials.keyFilename && fs.existsSync(credentials.keyFilename)) {
                        return true;
                    }
                    // Check if the GOOGLE_APPLICATION_CREDENTIALS environment variable is set
                    if (process.env.GOOGLE_APPLICATION_CREDENTIALS &&
                        fs.existsSync(process.env.GOOGLE_APPLICATION_CREDENTIALS)) {
                        return true;
                    }
                    // Check if the GOOGLE_SA_PATH environment variable is set
                    if (process.env.GOOGLE_SA_PATH && fs.existsSync(process.env.GOOGLE_SA_PATH)) {
                        return true;
                    }
                }
                else {
                    // In browser environment, we can't check file existence
                    console.warn("Cannot check Google credentials file existence in browser environment");
                    return false;
                }
            }
            catch (error) {
                console.error("Error checking Google credentials:", error);
            }
            return false;
        }
        // Use the default implementation if client is available
        return super.checkCredentials();
    }
    /**
     * Check if credentials are valid with detailed response
     *
     * Mirrors checkCredentials but reports which check matched. Note that the
     * file-exists cases return `success: true` together with an explanatory
     * `error` string.
     * @returns Promise resolving to an object with success flag and optional error message
     */
    async checkCredentialsDetailed() {
        await this.clientReady;
        // If the client is not available, check if the credentials file exists
        if (!this.client) {
            try {
                // Only import fs in Node.js environment
                if (typeof window === "undefined") {
                    const fs = await Promise.resolve().then(() => __importStar(require("node:fs")));
                    const credentials = this.credentials;
                    // Check if the keyFilename exists
                    if (credentials.keyFilename && fs.existsSync(credentials.keyFilename)) {
                        return { success: true, error: "Credentials file exists but client not initialized" };
                    }
                    // Check if the GOOGLE_APPLICATION_CREDENTIALS environment variable is set
                    if (process.env.GOOGLE_APPLICATION_CREDENTIALS &&
                        fs.existsSync(process.env.GOOGLE_APPLICATION_CREDENTIALS)) {
                        return { success: true, error: "GOOGLE_APPLICATION_CREDENTIALS file exists but client not initialized" };
                    }
                    // Check if the GOOGLE_SA_PATH environment variable is set
                    if (process.env.GOOGLE_SA_PATH && fs.existsSync(process.env.GOOGLE_SA_PATH)) {
                        return { success: true, error: "GOOGLE_SA_PATH file exists but client not initialized" };
                    }
                    return {
                        success: false,
                        error: "No valid credentials file found"
                    };
                }
                else {
                    // In browser environment, we can't check file existence
                    return {
                        success: false,
                        error: "Cannot check Google credentials file existence in browser environment"
                    };
                }
            }
            catch (error) {
                console.error("Error checking Google credentials:", error);
                return {
                    success: false,
                    error: error instanceof Error ? error.message : String(error)
                };
            }
        }
        // Use the default implementation if client is available
        return super.checkCredentialsDetailed();
    }
}
// Publish the class on the CommonJS exports object.
exports.GoogleTTSClient = GoogleTTSClient;
;