js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
325 lines (324 loc) • 12.8 kB
JavaScript
;
/**
 * TypeScript module-interop helper: exposes property `k` of module `m` on
 * object `o` under the name `k2` (falls back to `k` when `k2` is undefined).
 *
 * An already-installed `this.__createBinding` is reused when present.
 * Otherwise, when `Object.create` exists, the property is defined through a
 * descriptor: the source's own descriptor is reused unless it is missing, is
 * an accessor on a non-`__esModule` source, or is a writable/configurable data
 * property, in which cases an enumerable getter that reads `m[k]` on every
 * access is installed instead. Without `Object.create` the current value is
 * simply copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    function bindWithDescriptor(target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var useLiveGetter;
        if (!descriptor) {
            useLiveGetter = true;
        }
        else if ("get" in descriptor) {
            useLiveGetter = !source.__esModule;
        }
        else {
            useLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (useLiveGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    function bindByCopy(target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        target[exportName] = source[key];
    }
    return Object.create ? bindWithDescriptor : bindByCopy;
})();
/**
 * TypeScript module-interop helper: attaches `v` to `o` as its `default`
 * member.
 *
 * An already-installed `this.__setModuleDefault` is reused when present.
 * With `Object.create` available the member is defined as an enumerable
 * property via `Object.defineProperty` (no `writable`/`configurable` flags
 * are passed); otherwise it is assigned directly.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    function defineDefault(target, value) {
        Object.defineProperty(target, "default", { enumerable: true, value: value });
    }
    function assignDefault(target, value) {
        target["default"] = value;
    }
    return Object.create ? defineDefault : assignDefault;
})();
/**
 * TypeScript module-interop helper backing `import * as ns from "..."`.
 *
 * An already-installed `this.__importStar` is reused when present. The
 * returned function hands back `mod` untouched when it is truthy and flagged
 * `__esModule`. Otherwise it builds a fresh object, binds every own property
 * of `mod` except `"default"` onto it through `__createBinding`, and finally
 * stores `mod` itself as the `default` member through `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use: swaps itself for Object.getOwnPropertyNames, or
    // for a for-in/hasOwnProperty fallback when that built-in is missing.
    var listOwnNames = function (subject) {
        listOwnNames = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    names[names.length] = name;
                }
            }
            return names;
        };
        return listOwnNames(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            listOwnNames(mod)
                .filter(function (name) { return name !== "default"; })
                .forEach(function (name) { __createBinding(namespace, mod, name); });
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.PollyTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SpeechMarkdown = __importStar(require("../markdown/converter"));
// Import AWS SDK v3 Polly client
const client_polly_1 = require("@aws-sdk/client-polly");
/** Error message used whenever a request is attempted without an initialized Polly client. */
const POLLY_CLIENT_UNAVAILABLE_MESSAGE = "AWS Polly client is not available. Make sure you have valid AWS credentials.";
/**
 * Wrap an already-synthesized audio buffer in a ReadableStream that emits the
 * buffer as a single chunk and then closes.
 * @param audioBytes Audio bytes to emit
 * @returns ReadableStream yielding `audioBytes` once
 */
function createSingleChunkStream(audioBytes) {
    return new ReadableStream({
        start(controller) {
            controller.enqueue(audioBytes);
            controller.close();
        },
    });
}
/**
 * Convert parsed Polly word speech marks into word-boundary records.
 *
 * In a Polly speech mark, `time` is the timestamp in milliseconds from the
 * start of the audio, while `start`/`end` are byte offsets into the input
 * text. `end` therefore cannot be used to derive a duration. Each word's
 * duration is taken as the gap until the next word starts; the last word has
 * no following mark, so its duration is reported as 0.
 * @param speechMarks Parsed speech mark objects in the order Polly returned them
 * @returns Array of `{ text, offset, duration }` records
 */
function speechMarksToWordBoundaries(speechMarks) {
    return speechMarks.map((mark, index) => {
        const nextMark = speechMarks[index + 1];
        return {
            text: mark.value,
            offset: mark.time,
            // Clamp so an out-of-order mark can never yield a negative duration.
            duration: nextMark ? Math.max(0, nextMark.time - mark.time) : 0,
        };
    });
}
/**
 * AWS Polly TTS client.
 *
 * Wraps the AWS SDK v3 `PollyClient` behind the `AbstractTTSClient`
 * interface: voice listing, SSML preparation, audio synthesis and word
 * boundary (speech mark) retrieval.
 */
class PollyTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * Create a new AWS Polly TTS client.
     *
     * Construction never throws because of the Polly SDK: if the SDK client
     * cannot be created the error is logged and `this.client` stays
     * undefined, which the request methods check for.
     * @param credentials AWS credentials (`region`, `accessKeyId`, `secretAccessKey`)
     */
    constructor(credentials) {
        super(credentials);
        /**
         * AWS Polly client (undefined when initialization failed)
         */
        Object.defineProperty(this, "client", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        try {
            this.client = new client_polly_1.PollyClient({
                region: credentials.region,
                credentials: {
                    accessKeyId: credentials.accessKeyId,
                    secretAccessKey: credentials.secretAccessKey,
                },
            });
        }
        catch (error) {
            console.error("Error initializing AWS Polly client:", error);
            console.warn("AWS Polly TTS will not be available. Make sure you have valid AWS credentials.");
        }
    }
    /**
     * Get available voices from the provider.
     *
     * Best-effort: a missing client or a failed request yields an empty list
     * (the failure is logged) instead of a rejection.
     * @returns Promise resolving to an array of raw Polly voice objects
     */
    async _getVoices() {
        if (!this.client) {
            return [];
        }
        try {
            const response = await this.client.send(new client_polly_1.DescribeVoicesCommand({}));
            return response.Voices || [];
        }
        catch (error) {
            console.error("Error getting voices:", error);
            return [];
        }
    }
    /**
     * Map AWS Polly voice objects to unified format.
     * @param rawVoices Array of AWS Polly voice objects
     * @returns Promise resolving to an array of unified voice objects
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => {
            // Anything other than the two known values is reported as "Unknown".
            const gender = voice.Gender === "Female" || voice.Gender === "Male" ? voice.Gender : "Unknown";
            // Voices without a language code are reported as US English.
            const langCode = voice.LanguageCode || "en-US";
            const languageCode = {
                bcp47: langCode,
                // NOTE(review): this is the primary subtag of the BCP-47 code
                // (e.g. "en"), not necessarily a three-letter ISO 639-3 code —
                // confirm what consumers of `iso639_3` expect.
                iso639_3: langCode.split("-")[0],
                display: voice.LanguageName || langCode,
            };
            return {
                id: voice.Id,
                name: voice.Name,
                gender,
                provider: "polly",
                languageCodes: [languageCode],
            };
        });
    }
    /**
     * Prepare SSML for AWS Polly.
     *
     * Steps: optional Speech Markdown conversion; wrapping of non-SSML text in
     * `<speak>`; for SSML input, adding the synthesis `xmlns` to a bare
     * `<speak>` tag, expanding self-closing `<break .../>` tags, and wrapping
     * the `<speak>` content in a prosody tag when rate/pitch/volume differ
     * from their defaults.
     * @param text Text or SSML to prepare
     * @param options Synthesis options (`useSpeechMarkdown` is read here)
     * @returns Prepared SSML
     */
    prepareSSML(text, options) {
        let ssml = text;
        // Convert from Speech Markdown if requested
        if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(ssml)) {
            ssml = SpeechMarkdown.toSSML(ssml, "amazon-polly");
        }
        // Plain text is wrapped as-is and returned immediately; none of the
        // SSML fix-ups or prosody settings below are applied to it.
        if (!this._isSSML(ssml)) {
            return `<speak>${ssml}</speak>`;
        }
        // 1. Add the synthesis namespace when it is absent. Only a bare
        //    `<speak>` tag (no attributes) is rewritten.
        if (!ssml.includes('xmlns="http://www.w3.org/2001/10/synthesis"')) {
            ssml = ssml.replace(/<speak>/i, '<speak xmlns="http://www.w3.org/2001/10/synthesis">');
        }
        // 2. Expand self-closing break tags into explicit open/close pairs
        ssml = ssml.replace(/<break\s+([^>]+)\/>/gi, '<break $1></break>');
        // 3. Apply prosody settings if any differ from the defaults
        if (this.properties.rate !== "medium" ||
            this.properties.pitch !== "medium" ||
            this.properties.volume !== 100) {
            // Capture the real <speak ...> opening tag together with its
            // content. Taking "everything up to the first '>'" instead would
            // pick up an <?xml ...?> prolog and drop the <speak> element.
            const speakMatch = /(<speak[^>]*>)(.*?)<\/speak>/s.exec(ssml);
            if (speakMatch && speakMatch[2]) {
                const leadingText = ssml.slice(0, speakMatch.index);
                const openingTag = speakMatch[1];
                const prosodyContent = this.constructProsodyTag(speakMatch[2]);
                ssml = `${leadingText}${openingTag}${prosodyContent}</speak>`;
            }
        }
        return ssml;
    }
    /**
     * Convert text to audio bytes.
     * @param text Text or SSML to synthesize
     * @param options Synthesis options (`format`, `voice`, plus those read by `prepareSSML`)
     * @returns Promise resolving to audio bytes
     * @throws Error when the Polly client is unavailable or no audio is
     * returned; SDK errors are logged and rethrown
     */
    async synthToBytes(text, options) {
        if (!this.client) {
            throw new Error(POLLY_CLIENT_UNAVAILABLE_MESSAGE);
        }
        try {
            const ssml = this.prepareSSML(text, options);
            // Anything other than an explicit "mp3" request is synthesized as PCM.
            const outputFormat = options?.format === "mp3" ? client_polly_1.OutputFormat.MP3 : client_polly_1.OutputFormat.PCM;
            // Use voice from options, then the client's voice, then "Joanna"
            const voiceId = (options?.voice || this.voiceId || "Joanna");
            const command = new client_polly_1.SynthesizeSpeechCommand({
                Text: ssml,
                TextType: "ssml",
                OutputFormat: outputFormat,
                VoiceId: voiceId,
                Engine: "neural", // Use neural engine for better quality
            });
            const response = await this.client.send(command);
            if (!response.AudioStream) {
                throw new Error("No audio data returned from AWS Polly");
            }
            const audioData = await response.AudioStream.transformToByteArray();
            return new Uint8Array(audioData);
        }
        catch (error) {
            console.error("Error synthesizing speech:", error);
            throw error;
        }
    }
    /**
     * Synthesize text to a byte stream with word boundaries.
     *
     * The audio is fully synthesized first and then exposed as a
     * single-chunk stream.
     * @param text Text or SSML to synthesize
     * @param options Synthesis options; word boundaries are fetched unless
     * `useWordBoundary` is exactly `false`
     * @returns Promise resolving to `{ audioStream, wordBoundaries }` when
     * word boundaries are requested, otherwise to the audio ReadableStream
     * alone
     * @throws Error when the Polly client is unavailable; synthesis errors are
     * logged and rethrown
     */
    async synthToBytestream(text, options) {
        if (!this.client) {
            throw new Error(POLLY_CLIENT_UNAVAILABLE_MESSAGE);
        }
        try {
            const useWordBoundary = options?.useWordBoundary !== false;
            if (!useWordBoundary) {
                const audioOnly = await this.synthToBytes(text, options);
                return createSingleChunkStream(audioOnly);
            }
            // The audio and speech-mark requests are independent of each
            // other, so issue them concurrently instead of back to back.
            const [audioBytes, wordBoundaries] = await Promise.all([
                this.synthToBytes(text, options),
                this.getSpeechMarks(text, options),
            ]);
            return {
                audioStream: createSingleChunkStream(audioBytes),
                wordBoundaries,
            };
        }
        catch (error) {
            console.error("Error synthesizing speech stream:", error);
            throw error;
        }
    }
    /**
     * Get speech marks (word boundaries) for text.
     * @param inputText Text or SSML to get speech marks for
     * @param options Synthesis options (`voice`, plus those read by `prepareSSML`)
     * @returns Promise resolving to an array of `{ text, offset, duration }`
     * word boundaries, where `offset` is the mark's `time` and `duration` is
     * the gap to the next mark (0 for the last one); empty when Polly returns
     * no stream
     * @throws Error when the Polly client is unavailable
     */
    async getSpeechMarks(inputText, options) {
        if (!this.client) {
            throw new Error(POLLY_CLIENT_UNAVAILABLE_MESSAGE);
        }
        const ssml = this.prepareSSML(inputText, options);
        // Use voice from options, then the client's voice, then "Joanna"
        const voiceId = (options?.voice || this.voiceId || "Joanna");
        const command = new client_polly_1.SynthesizeSpeechCommand({
            Text: ssml,
            TextType: "ssml",
            OutputFormat: client_polly_1.OutputFormat.JSON,
            VoiceId: voiceId,
            Engine: "neural", // Use neural engine for better quality
            SpeechMarkTypes: [client_polly_1.SpeechMarkType.WORD],
        });
        const response = await this.client.send(command);
        if (!response.AudioStream) {
            return [];
        }
        // With JSON output the "audio" stream carries text, not audio
        const rawBytes = await response.AudioStream.transformToByteArray();
        const content = new TextDecoder().decode(new Uint8Array(rawBytes));
        // Each non-blank line is parsed as one JSON speech mark
        const speechMarks = content
            .split("\n")
            .filter((line) => line.trim())
            .map((line) => JSON.parse(line));
        return speechMarksToWordBoundaries(speechMarks);
    }
    /**
     * Check if credentials are valid by listing voices.
     * @returns Promise resolving to true when the request succeeds and
     * returns at least one voice; false when the client is missing, the
     * request fails (logged), or no voices come back
     */
    async checkCredentials() {
        if (!this.client) {
            return false;
        }
        try {
            const response = await this.client.send(new client_polly_1.DescribeVoicesCommand({}));
            return Array.isArray(response.Voices) && response.Voices.length > 0;
        }
        catch (error) {
            console.error("Error checking AWS Polly credentials:", error);
            return false;
        }
    }
}
exports.PollyTTSClient = PollyTTSClient;