js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
597 lines (596 loc) • 27.6 kB
JavaScript
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.PollyTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SpeechMarkdown = __importStar(require("../markdown/converter"));
const stream_utils_1 = require("../utils/stream-utils");
/**
* AWS Polly TTS client
*/
class PollyTTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new AWS Polly TTS client
* @param credentials AWS credentials
*/
constructor(credentials) {
super(credentials);
/**
* AWS Polly client
*/
Object.defineProperty(this, "client", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
}); // PollyClient type is only available at runtime in Node
Object.defineProperty(this, "_pollyModule", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
// Set the default sample rate for PCM format to match the Python implementation
// The Python implementation uses wav.setparams((1, 2, 16000, 0, "NONE", "NONE"))
this.sampleRate = 16000; // Default sample rate for Polly PCM format
if (typeof window !== "undefined") {
throw new Error("AWS Polly is not supported in the browser. Use synthToBytes or synthToBytestream if available.");
}
try {
// Do not import here, only store credentials. Actual import is done in each async method.
this._pollyModule = null;
this.client = null;
this.credentials = credentials;
}
catch (error) {
console.error("Error initializing AWS Polly client:", error);
console.warn("AWS Polly TTS will not be available. Make sure you have valid AWS credentials.");
}
}
/**
* Get available voices from the provider
* @returns Promise resolving to an array of voice objects
*/
async _getVoices() {
try {
const pollyModule = this._pollyModule || (await Promise.resolve().then(() => __importStar(require("@aws-sdk/client-polly"))));
if (!this.client) {
const PollyClient = pollyModule.PollyClient;
this.client = new PollyClient({
region: this.credentials.region, // Reverted: Directly use credentials
credentials: {
accessKeyId: this.credentials.accessKeyId,
secretAccessKey: this.credentials.secretAccessKey,
},
});
this._pollyModule = pollyModule;
}
const DescribeVoicesCommand = pollyModule.DescribeVoicesCommand;
const command = new DescribeVoicesCommand({});
const response = await this.client.send(command);
return response.Voices || [];
}
catch (error) {
console.error("Error getting voices:", error);
return [];
}
}
/**
* Map AWS Polly voice objects to unified format
* @param rawVoices Array of AWS Polly voice objects
* @returns Promise resolving to an array of unified voice objects
*/
async _mapVoicesToUnified(rawVoices) {
return rawVoices.map((voice) => {
// Map gender
let gender = "Unknown";
if (voice.Gender === "Female") {
gender = "Female";
}
else if (voice.Gender === "Male") {
gender = "Male";
}
// Get language code
const langCode = voice.LanguageCode || "en-US";
// Create language code object
const languageCode = {
bcp47: langCode,
iso639_3: langCode.split("-")[0],
display: voice.LanguageName || langCode,
};
return {
id: voice.Id,
name: voice.Name,
gender,
provider: "polly",
languageCodes: [languageCode],
};
});
}
/**
* Check if a voice is a neural voice
* @param voiceId Voice ID to check
* @returns True if the voice is a neural voice
*/
async isNeuralVoice(voiceId) {
// If no voice ID is provided, use the current voice
const voice = voiceId || this.voiceId || "";
// If the voice name includes "Neural", it's definitely a neural voice
if (voice.includes("Neural")) {
return true;
}
try {
// Get the raw voices from Polly to check the SupportedEngines property
const rawVoices = await this._getVoices();
const voiceDetails = rawVoices.find(v => v.Id === voice);
// Check if the voice supports the neural engine and we're using the neural engine
return voiceDetails?.SupportedEngines?.includes("neural") || false;
}
catch (error) {
console.warn(`Error checking if voice ${voice} is neural:`, error);
// Default to false if we can't determine
return false;
}
}
/**
* Prepare SSML for AWS Polly
* @param text Text or SSML to prepare
* @param options Synthesis options
* @returns Promise resolving to prepared SSML or plain text
*/
async prepareSSML(text, options) {
// Get the voice ID from options or the current voice
const voiceId = options?.voice || this.voiceId || "";
// Convert from Speech Markdown if requested
if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(text)) {
const ssmlText = await SpeechMarkdown.toSSML(text, "amazon-polly");
text = ssmlText;
}
// Check if the voice is neural
const isNeural = await this.isNeuralVoice(voiceId);
// If using a neural voice and the text is SSML, strip SSML tags
// Neural voices don't support SSML
if (isNeural && this._isSSML(text)) {
console.warn(`Voice ${voiceId} is a neural voice and doesn't support SSML. Stripping SSML tags.`);
return this._stripSSML(text);
}
// If text is not SSML, wrap it in speak tags
if (!this._isSSML(text)) {
text = `<speak>${text}</speak>`;
return text;
}
// Fix common SSML issues for Polly (only for non-neural voices)
if (!isNeural) {
// 1. Make sure the speak tag has the correct xmlns attribute
// Polly requires the xmlns attribute to be present
if (!text.includes('xmlns="http://www.w3.org/2001/10/synthesis"')) {
text = text.replace(/<speak>/i, '<speak xmlns="http://www.w3.org/2001/10/synthesis">');
}
// 2. Fix any self-closing tags that Polly doesn't support
text = text.replace(/<break\s+([^>]+)\/>/gi, '<break $1></break>');
// 3. Apply prosody settings if needed
if (this.properties.rate !== "medium" ||
this.properties.pitch !== "medium" ||
this.properties.volume !== 100) {
// Extract the content inside the speak tags
const speakTagMatch = /<speak[^>]*>(.*?)<\/speak>/s.exec(text);
if (speakTagMatch && speakTagMatch[1]) {
const content = speakTagMatch[1];
// Wrap with prosody tag
const prosodyContent = this.constructProsodyTag(content);
// Put back inside speak tags with the original attributes
const openingTag = text.substring(0, text.indexOf('>') + 1);
text = `${openingTag}${prosodyContent}</speak>`;
}
}
}
return text;
}
/**
* Convert text to audio bytes
* @param text Text or SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
*/
async synthToBytes(text, options) {
try {
const pollyModule = this._pollyModule || (await Promise.resolve().then(() => __importStar(require("@aws-sdk/client-polly"))));
if (!this.client) {
const PollyClient = pollyModule.PollyClient;
this.client = new PollyClient({
region: this.credentials.region, // Reverted
credentials: {
accessKeyId: this.credentials.accessKeyId,
secretAccessKey: this.credentials.secretAccessKey,
},
});
this._pollyModule = pollyModule;
}
const { OutputFormat, SynthesizeSpeechCommand, VoiceId } = pollyModule;
// Determine the output format
// For Polly, we always request PCM for WAV (so we can add the header)
// and MP3/OGG directly for those formats
const requestedFormat = options?.format || "wav";
let outputFormat;
if (requestedFormat === "mp3") {
// Request MP3 directly from Polly
outputFormat = OutputFormat.MP3;
}
else if (requestedFormat === "ogg") {
// Request OGG directly from Polly
outputFormat = OutputFormat.OGG_VORBIS;
}
else {
// For WAV, request PCM and we'll add the WAV header
outputFormat = OutputFormat.PCM;
}
// Get the voice ID
const VoiceIdType = VoiceId; // Get the RUNTIME VoiceId enum/object
const voiceIdString = options?.voice || this.voiceId || "Joanna";
const voiceId = voiceIdString; // Cast using the runtime type
// Prepare text or SSML
const preparedText = await this.prepareSSML(text, options);
// Determine if the prepared text is SSML
const isSSML = this._isSSML(preparedText);
// Determine the engine to use based on the voice
// Standard voices: Geraint, Raveena, Aditi, etc.
// Neural voices: Joanna, Matthew, Lupe, etc.
const standardVoices = ['Geraint', 'Raveena', 'Aditi', 'Carmen', 'Maxim', 'Tatyana', 'Conchita', 'Enrique', 'Russell', 'Nicole', 'Amy', 'Brian', 'Emma', 'Gwyneth', 'Raveena', 'Ivy', 'Joanna', 'Kendra', 'Kimberly', 'Salli', 'Joey', 'Justin', 'Matthew'];
const engine = standardVoices.includes(voiceIdString) ? "standard" : "neural";
// Create input parameters
const input = {
Text: preparedText,
TextType: isSSML ? "ssml" : "text",
OutputFormat: outputFormat,
VoiceId: voiceId,
Engine: engine, // Use standard engine for standard voices, neural for neural voices
// Set sample rate based on format
// For PCM, always use 16000 Hz to match the Python implementation
// For MP3 and OGG, use 24000 Hz for better quality
SampleRate: outputFormat === OutputFormat.PCM ? "16000" : "24000",
};
// We use a fixed sample rate of 4000 Hz for playback
// This is set in the constructor and doesn't need to be updated here
// Create the command
const command = new SynthesizeSpeechCommand(input);
// Execute the command
const response = await this.client.send(command);
// Get audio data
if (!response.AudioStream) {
throw new Error("No audio data returned from AWS Polly");
}
// Convert audio stream to Uint8Array
const arrayBuffer = await response.AudioStream.transformToByteArray();
const audioData = new Uint8Array(arrayBuffer);
// If we requested WAV format but got PCM data, add a WAV header
if (options?.format === "wav" && outputFormat === OutputFormat.PCM) {
// Determine if this is for playback or file saving
const isForPlayback = !options?.filePath; // If no filePath, it's for playback
// Add the WAV header with the appropriate sample rate
// For playback, we use a much lower sample rate (4000 Hz)
// For file saving, we use the actual sample rate (16000 Hz)
return this.addWavHeader(audioData, 16000, isForPlayback);
}
return audioData;
}
catch (error) {
console.error("Error synthesizing speech:", error);
throw error;
}
}
/**
* Synthesize text to a byte stream with word boundaries
* @param text Text or SSML to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and word boundaries
*/
async synthToBytestream(text, options) {
try {
const pollyModule = this._pollyModule || (await Promise.resolve().then(() => __importStar(require("@aws-sdk/client-polly"))));
if (!this.client) {
const PollyClient = pollyModule.PollyClient;
this.client = new PollyClient({
region: this.credentials.region, // Reverted
credentials: {
accessKeyId: this.credentials.accessKeyId,
secretAccessKey: this.credentials.secretAccessKey,
},
});
this._pollyModule = pollyModule;
}
const { OutputFormat, SynthesizeSpeechCommandInput, SynthesizeSpeechCommand, VoiceId, SpeechMarkType } = pollyModule;
const VoiceIdType = VoiceId; // Get the RUNTIME VoiceId enum/object
const voiceIdString = options?.voice || this.voiceId || "Joanna";
const voiceId = voiceIdString; // Cast via unknown
// Prepare text or SSML
const preparedText = await this.prepareSSML(text, options);
// Determine if the prepared text is SSML
const textType = this._isSSML(preparedText) ? "ssml" : "text";
// Determine the engine to use based on the voice
// Standard voices: Geraint, Raveena, Aditi, etc.
// Neural voices: Joanna, Matthew, Lupe, etc.
const standardVoices = ['Geraint', 'Raveena', 'Aditi', 'Carmen', 'Maxim', 'Tatyana', 'Conchita', 'Enrique', 'Russell', 'Nicole', 'Amy', 'Brian', 'Emma', 'Gwyneth', 'Raveena', 'Ivy', 'Joanna', 'Kendra', 'Kimberly', 'Salli', 'Joey', 'Justin', 'Matthew'];
const engine = standardVoices.includes(voiceId) ? "standard" : "neural";
let wordBoundaries = [];
// Request Speech Marks (JSON)
try {
const marksParams = {
Text: preparedText,
VoiceId: voiceId,
OutputFormat: "json",
SpeechMarkTypes: [SpeechMarkType.WORD],
TextType: textType,
Engine: engine,
};
const marksCommand = new SynthesizeSpeechCommand(marksParams);
const marksResponse = await this.client.send(marksCommand);
if (marksResponse.AudioStream) {
const streamData = await (0, stream_utils_1.streamToBuffer)(marksResponse.AudioStream); // Use correct util
const marksJsonString = new TextDecoder().decode(streamData); // Decode Buffer/Uint8Array
const jsonLines = marksJsonString.trim().split("\n");
for (const line of jsonLines) {
try {
const mark = JSON.parse(line);
if (mark.type === "word") {
wordBoundaries.push({
text: mark.value,
offset: mark.time, // Use Polly's time (ms) as offset
duration: 0, // Polly doesn't provide duration for word marks
});
}
}
catch (parseError) {
console.warn(`Skipping invalid JSON line in speech marks: ${line}`, parseError);
}
}
}
else {
console.warn("No AudioStream received from Polly for speech marks");
}
}
catch (error) {
console.error("Error getting speech marks from Polly:", error);
// Don't throw here, allow audio synthesis to proceed if possible
// Caller should check wordBoundaries array length if marks are critical
}
// Request Audio Stream (PCM/MP3/OGG)
// For Polly, we always request PCM for WAV (so we can add the header)
// and MP3/OGG directly for those formats
const requestedFormat = options?.format || "wav";
let outputFormat;
if (requestedFormat === "mp3") {
// Request MP3 directly from Polly
outputFormat = OutputFormat.MP3;
}
else if (requestedFormat === "ogg") {
// Request OGG directly from Polly
outputFormat = OutputFormat.OGG_VORBIS;
}
else {
// For WAV, request PCM and we'll add the WAV header
outputFormat = OutputFormat.PCM;
}
const audioParams = {
Text: preparedText,
VoiceId: voiceId, // Use the same casted voiceId
OutputFormat: outputFormat,
TextType: textType,
Engine: engine,
// Set sample rate based on format
// For PCM, always use 16000 Hz to match the Python implementation
// For MP3 and OGG, use 24000 Hz for better quality
SampleRate: outputFormat === OutputFormat.PCM ? "16000" : "24000",
};
// We use a fixed sample rate of 4000 Hz for playback
// This is set in the constructor and doesn't need to be updated here
try {
const audioCommand = new SynthesizeSpeechCommand(audioParams);
const audioResponse = await this.client.send(audioCommand);
if (!audioResponse.AudioStream) {
throw new Error("No AudioStream received from Polly for audio data");
}
// Get the audio stream
let audioStream = audioResponse.AudioStream;
// If we requested WAV format but got PCM data, add a WAV header
if (requestedFormat === "wav" && outputFormat === OutputFormat.PCM) {
// For streaming, we'll need to convert the entire stream to a buffer first,
// add the WAV header, and then create a new stream
try {
// For streaming, we're always doing playback
const isForPlayback = true;
// Convert the stream to a buffer
const streamData = await (0, stream_utils_1.streamToBuffer)(audioResponse.AudioStream);
// Add WAV header to the PCM data with a fixed sample rate of 4000 Hz for playback
// This compensates for the sound-play library playing Polly audio too fast
const wavData = this.addWavHeader(new Uint8Array(streamData), 16000, isForPlayback);
// Create a new ReadableStream from the WAV data
audioStream = new ReadableStream({
start(controller) {
controller.enqueue(wavData);
controller.close();
}
});
}
catch (error) {
console.error("Error adding WAV header to PCM stream:", error);
// Fall back to the original stream if there's an error
}
}
// Return combined result
return {
audioStream: audioStream,
wordBoundaries: wordBoundaries,
};
}
catch (error) {
console.error("Error synthesizing audio stream from Polly:", error);
throw error; // Re-throw the audio synthesis error
}
}
catch (error) {
console.error("Error initializing Polly client:", error);
throw error;
}
}
/**
* Strip SSML tags from text
* @param text Text with SSML tags
* @returns Plain text without SSML tags
*/
_stripSSML(text) {
// If text is not SSML, return as is
if (!this._isSSML(text)) {
return text;
}
// Remove all XML tags
let plainText = text.replace(/<[^>]+>/g, "");
// Decode XML entities
plainText = plainText.replace(/</g, "<");
plainText = plainText.replace(/>/g, ">");
plainText = plainText.replace(/&/g, "&");
plainText = plainText.replace(/"/g, '"');
plainText = plainText.replace(/'/g, "'");
// Remove extra whitespace
plainText = plainText.replace(/\s+/g, " ").trim();
return plainText;
}
/**
* Add a WAV header to PCM audio data
* This matches the Python implementation using wave.setparams((1, 2, 16000, 0, "NONE", "NONE"))
* @param pcmData PCM audio data from AWS Polly (signed 16-bit, 1 channel, little-endian)
* @param sampleRate Sample rate in Hz (default: 16000)
* @returns PCM audio data with WAV header
*/
addWavHeader(pcmData, sampleRate = 16000, _isForPlayback = false) {
// Always use 16000 Hz for Polly PCM data to match the Python implementation
// The Python implementation uses wav.setparams((1, 2, 16000, 0, "NONE", "NONE"))
sampleRate = 16000;
// WAV header is 44 bytes
const headerSize = 44;
const wavData = new Uint8Array(headerSize + pcmData.length);
// Set up WAV header
// "RIFF" chunk descriptor
wavData[0] = 0x52; // 'R'
wavData[1] = 0x49; // 'I'
wavData[2] = 0x46; // 'F'
wavData[3] = 0x46; // 'F'
// Chunk size (file size - 8)
const fileSize = pcmData.length + headerSize - 8;
wavData[4] = fileSize & 0xFF;
wavData[5] = (fileSize >> 8) & 0xFF;
wavData[6] = (fileSize >> 16) & 0xFF;
wavData[7] = (fileSize >> 24) & 0xFF;
// "WAVE" format
wavData[8] = 0x57; // 'W'
wavData[9] = 0x41; // 'A'
wavData[10] = 0x56; // 'V'
wavData[11] = 0x45; // 'E'
// "fmt " sub-chunk
wavData[12] = 0x66; // 'f'
wavData[13] = 0x6D; // 'm'
wavData[14] = 0x74; // 't'
wavData[15] = 0x20; // ' '
// Sub-chunk size (16 for PCM)
wavData[16] = 16;
wavData[17] = 0;
wavData[18] = 0;
wavData[19] = 0;
// Audio format (1 for PCM)
wavData[20] = 1;
wavData[21] = 0;
// Number of channels (1 for mono)
wavData[22] = 1;
wavData[23] = 0;
// Sample rate (always 16000 Hz for Polly PCM)
wavData[24] = sampleRate & 0xFF;
wavData[25] = (sampleRate >> 8) & 0xFF;
wavData[26] = (sampleRate >> 16) & 0xFF;
wavData[27] = (sampleRate >> 24) & 0xFF;
// Byte rate (SampleRate * NumChannels * BitsPerSample/8)
const byteRate = sampleRate * 1 * 16 / 8;
wavData[28] = byteRate & 0xFF;
wavData[29] = (byteRate >> 8) & 0xFF;
wavData[30] = (byteRate >> 16) & 0xFF;
wavData[31] = (byteRate >> 24) & 0xFF;
// Block align (NumChannels * BitsPerSample/8)
wavData[32] = 2; // 1 * 16 / 8
wavData[33] = 0;
// Bits per sample
wavData[34] = 16;
wavData[35] = 0;
// "data" sub-chunk
wavData[36] = 0x64; // 'd'
wavData[37] = 0x61; // 'a'
wavData[38] = 0x74; // 't'
wavData[39] = 0x61; // 'a'
// Sub-chunk size (data size)
wavData[40] = pcmData.length & 0xFF;
wavData[41] = (pcmData.length >> 8) & 0xFF;
wavData[42] = (pcmData.length >> 16) & 0xFF;
wavData[43] = (pcmData.length >> 24) & 0xFF;
// Copy PCM data after header
wavData.set(pcmData, headerSize);
return wavData;
}
/**
* Check if credentials are valid
* @returns Promise resolving to true if credentials are valid
*/
async checkCredentials() {
try {
const pollyModule = this._pollyModule || (await Promise.resolve().then(() => __importStar(require("@aws-sdk/client-polly"))));
if (!this.client) {
const PollyClient = pollyModule.PollyClient;
this.client = new PollyClient({
region: this.credentials.region,
credentials: {
accessKeyId: this.credentials.accessKeyId,
secretAccessKey: this.credentials.secretAccessKey,
},
});
this._pollyModule = pollyModule;
}
const DescribeVoicesCommand = pollyModule.DescribeVoicesCommand;
const command = new DescribeVoicesCommand({});
const response = await this.client.send(command);
return Array.isArray(response.Voices) && response.Voices.length > 0;
}
catch (error) {
console.error("Error checking AWS Polly credentials:", error);
return false;
}
}
}
exports.PollyTTSClient = PollyTTSClient;
;