js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
508 lines (507 loc) • 23.3 kB
JavaScript
"use strict";
/**
 * Re-export helper emitted by the TypeScript compiler.
 *
 * Exposes `source[key]` on `target` under `alias` (or `key` when no alias is
 * given). An implementation already present on `this` is reused; otherwise one
 * is chosen once, at load time, depending on whether `Object.create` exists.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: a plain value copy is the best that can be done.
        return function (target, source, key, alias) {
            if (alias === undefined) alias = key;
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        if (alias === undefined) alias = key;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var useLiveGetter;
        if (!descriptor) {
            // Nothing defined on the source (yet): always read through.
            useLiveGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessors are reused as-is only for transpiled ES modules.
            useLiveGetter = !source.__esModule;
        }
        else {
            // Data properties that can still change need a live view.
            useLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (useLiveGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, alias, descriptor);
    };
})();
/**
 * Interop helper emitted by the TypeScript compiler: stores `defaultValue` as
 * the `default` member of `namespace`. An implementation already present on
 * `this` is reused; otherwise one is chosen once depending on whether
 * `Object.create` exists.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (namespace, defaultValue) {
            // Defined (not assigned): enumerable, and read-only by default.
            Object.defineProperty(namespace, "default", { value: defaultValue, enumerable: true });
        };
    }
    return function (namespace, defaultValue) {
        namespace["default"] = defaultValue;
    };
})();
/**
 * Interop helper emitted by the TypeScript compiler for `import * as ns`.
 *
 * Modules flagged `__esModule` are returned untouched. Anything else is
 * wrapped in a fresh namespace object: every own key except "default" is
 * bound through `__createBinding`, and the module itself becomes `default`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use so the lookup of Object.getOwnPropertyNames
    // happens at call time, not at module load.
    var keyLister = null;
    function listOwnKeys(target) {
        if (keyLister === null) {
            keyLister = Object.getOwnPropertyNames || function (obj) {
                var names = [];
                for (var name in obj) {
                    if (Object.prototype.hasOwnProperty.call(obj, name)) {
                        names[names.length] = name;
                    }
                }
                return names;
            };
        }
        return keyLister(target);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        // Loose comparison on purpose: skips both null and undefined.
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var index = 0; index < keys.length; index++) {
                if (keys[index] !== "default") {
                    __createBinding(namespace, mod, keys[index]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Flag this CommonJS module as transpiled ES-module output; interop helpers
// such as __importStar return modules carrying this flag unchanged.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the named export up front; the class itself is assigned to it
// after its definition further down in this file.
exports.GeminiTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
const fetch_utils_1 = require("../utils/fetch-utils");
const language_utils_1 = require("../utils/language-utils");
/**
 * Languages advertised for Gemini TTS, as `{ bcp47, display, readiness }`
 * records. The table is written as `[bcp47, display]` pairs grouped by
 * readiness level and expanded into records below; group order and row order
 * define the order of the resulting array.
 */
const GEMINI_SUPPORTED_LANGUAGES = [
    ["GA", [
        ["ar-EG", "Arabic (Egypt)"],
        ["bn-BD", "Bangla (Bangladesh)"],
        ["nl-NL", "Dutch (Netherlands)"],
        ["en-IN", "English (India)"],
        ["en-US", "English (United States)"],
        ["fr-FR", "French (France)"],
        ["de-DE", "German (Germany)"],
        ["hi-IN", "Hindi (India)"],
        ["id-ID", "Indonesian (Indonesia)"],
        ["it-IT", "Italian (Italy)"],
        ["ja-JP", "Japanese (Japan)"],
        ["ko-KR", "Korean (South Korea)"],
        ["mr-IN", "Marathi (India)"],
        ["pl-PL", "Polish (Poland)"],
        ["pt-BR", "Portuguese (Brazil)"],
        ["ro-RO", "Romanian (Romania)"],
        ["ru-RU", "Russian (Russia)"],
        ["es-ES", "Spanish (Spain)"],
        ["ta-IN", "Tamil (India)"],
        ["te-IN", "Telugu (India)"],
        ["th-TH", "Thai (Thailand)"],
        ["tr-TR", "Turkish (Turkey)"],
        ["uk-UA", "Ukrainian (Ukraine)"],
        ["vi-VN", "Vietnamese (Vietnam)"],
    ]],
    ["Preview", [
        ["af-ZA", "Afrikaans (South Africa)"],
        ["sq-AL", "Albanian (Albania)"],
        ["am-ET", "Amharic (Ethiopia)"],
        ["ar-001", "Arabic (World)"],
        ["hy-AM", "Armenian (Armenia)"],
        ["az-AZ", "Azerbaijani (Azerbaijan)"],
        ["eu-ES", "Basque (Spain)"],
        ["be-BY", "Belarusian (Belarus)"],
        ["bg-BG", "Bulgarian (Bulgaria)"],
        ["my-MM", "Burmese (Myanmar)"],
        ["ca-ES", "Catalan (Spain)"],
        ["ceb-PH", "Cebuano (Philippines)"],
        ["cmn-CN", "Chinese, Mandarin (China)"],
        ["cmn-TW", "Chinese, Mandarin (Taiwan)"],
        ["hr-HR", "Croatian (Croatia)"],
        ["cs-CZ", "Czech (Czech Republic)"],
        ["da-DK", "Danish (Denmark)"],
        ["en-AU", "English (Australia)"],
        ["en-GB", "English (United Kingdom)"],
        ["et-EE", "Estonian (Estonia)"],
        ["fil-PH", "Filipino (Philippines)"],
        ["fi-FI", "Finnish (Finland)"],
        ["fr-CA", "French (Canada)"],
        ["gl-ES", "Galician (Spain)"],
        ["ka-GE", "Georgian (Georgia)"],
        ["el-GR", "Greek (Greece)"],
        ["gu-IN", "Gujarati (India)"],
        ["ht-HT", "Haitian Creole (Haiti)"],
        ["he-IL", "Hebrew (Israel)"],
        ["hu-HU", "Hungarian (Hungary)"],
        ["is-IS", "Icelandic (Iceland)"],
        ["jv-JV", "Javanese (Java)"],
        ["kn-IN", "Kannada (India)"],
        ["kok-IN", "Konkani (India)"],
        ["lo-LA", "Lao (Laos)"],
        ["la-VA", "Latin (Vatican City)"],
        ["lv-LV", "Latvian (Latvia)"],
        ["lt-LT", "Lithuanian (Lithuania)"],
        ["lb-LU", "Luxembourgish (Luxembourg)"],
        ["mk-MK", "Macedonian (North Macedonia)"],
        ["mai-IN", "Maithili (India)"],
        ["mg-MG", "Malagasy (Madagascar)"],
        ["ms-MY", "Malay (Malaysia)"],
        ["ml-IN", "Malayalam (India)"],
        ["mn-MN", "Mongolian (Mongolia)"],
        ["ne-NP", "Nepali (Nepal)"],
        ["nb-NO", "Norwegian, Bokmal (Norway)"],
        ["nn-NO", "Norwegian, Nynorsk (Norway)"],
        ["or-IN", "Odia (India)"],
        ["ps-AF", "Pashto (Afghanistan)"],
        ["fa-IR", "Persian (Iran)"],
        ["pt-PT", "Portuguese (Portugal)"],
        ["pa-IN", "Punjabi (India)"],
        ["sr-RS", "Serbian (Serbia)"],
        ["sd-IN", "Sindhi (India)"],
        ["si-LK", "Sinhala (Sri Lanka)"],
        ["sk-SK", "Slovak (Slovakia)"],
        ["sl-SI", "Slovenian (Slovenia)"],
        ["es-419", "Spanish (Latin America)"],
        ["es-MX", "Spanish (Mexico)"],
        ["sw-KE", "Swahili (Kenya)"],
        ["sv-SE", "Swedish (Sweden)"],
        ["ur-PK", "Urdu (Pakistan)"],
    ]],
].flatMap(([readiness, rows]) => rows.map(([bcp47, display]) => ({ bcp47, display, readiness })));
/** BCP-47 codes of every supported language, in table order. */
const GEMINI_SUPPORTED_LANGUAGE_CODES = GEMINI_SUPPORTED_LANGUAGES.map(({ bcp47 }) => bcp47);
/** Lookup from BCP-47 code to its readiness level ("GA" or "Preview"). */
const GEMINI_LANGUAGE_READINESS = Object.fromEntries(GEMINI_SUPPORTED_LANGUAGES.map(({ bcp47, readiness }) => [bcp47, readiness]));
/**
 * Reduces a Gemini model reference to its bare id.
 *
 * The models listing reports names as `models/<id>`, while request URLs are
 * built as `<base>/models/<id>:...`; stripping the prefix lets callers
 * configure either form without producing `/models/models/<id>`.
 *
 * @param {*} model - Model id or resource name; coerced with `String()`.
 * @returns {string} The id without a leading "models/".
 */
function toGeminiModelId(model) {
    const id = String(model);
    const resourcePrefix = "models/";
    return id.startsWith(resourcePrefix) ? id.slice(resourcePrefix.length) : id;
}
/**
 * Removes trailing "/" characters so path segments can be appended with a
 * single slash.
 *
 * @param {*} url - Base URL; coerced with `String()`.
 * @returns {string} The URL without trailing slashes.
 */
function trimTrailingSlashes(url) {
    const text = String(url);
    let end = text.length;
    while (end > 0 && text[end - 1] === "/") {
        end--;
    }
    return text.slice(0, end);
}
/**
 * Gemini Flash TTS client.
 *
 * Uses the Gemini generateContent REST API directly. Gemini TTS returns PCM audio;
 * this client wraps it as WAV by default so normal playback and conversion paths work.
 */
class GeminiTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * @param {object} [credentials] - Client configuration.
     *   - `apiKey`: API key; falls back to the `GEMINI_API_KEY` environment variable.
     *   - `baseURL` / `baseUrl`: REST endpoint root (defaults to the public v1beta endpoint).
     *   - `model`: model id (defaults to `GeminiTTSClient.DEFAULT_MODEL`).
     *   - `voice`: prebuilt voice name (defaults to `GeminiTTSClient.DEFAULT_VOICE`).
     *   - `properties` / `propertiesJson` / `propertiesJSON`: extra properties as an
     *     object or JSON string, applied through `setProperty` after the defaults.
     */
    constructor(credentials = {}) {
        super(credentials);
        // Define (rather than assign) the own fields, in declaration order,
        // matching the class-field semantics of the original TypeScript.
        for (const field of ["apiKey", "baseUrl", "model"]) {
            Object.defineProperty(this, field, {
                enumerable: true,
                configurable: true,
                writable: true,
                value: void 0
            });
        }
        this.apiKey = credentials.apiKey || this.getEnv("GEMINI_API_KEY");
        // Both spellings are accepted, matching getProperty/setProperty.
        this.baseUrl =
            credentials.baseURL ||
                credentials.baseUrl ||
                "https://generativelanguage.googleapis.com/v1beta";
        this.model = credentials.model || GeminiTTSClient.DEFAULT_MODEL;
        this.voiceId = credentials.voice || GeminiTTSClient.DEFAULT_VOICE;
        // Used for the WAV header written by synthToBytes.
        this.sampleRate = 24000;
        this.capabilities = { browserSupported: true, nodeSupported: true };
        this._models = [
            { id: "gemini-3.1-flash-tts-preview", features: ["audio-tags"] },
            { id: "gemini-2.5-flash-preview-tts", features: ["audio-tags"] },
        ];
        this.applyCredentialProperties(credentials);
    }
    /**
     * Reads an environment variable where `process` exists.
     *
     * @param {string} name - Variable name.
     * @returns {string} The value, or "" when unset, empty, or `process` is unavailable.
     */
    getEnv(name) {
        if (typeof process === "undefined") {
            return "";
        }
        return process.env?.[name] || "";
    }
    /**
     * Applies `properties` / `propertiesJson` / `propertiesJSON` from the
     * credentials through `setProperty`.
     *
     * This is best-effort: a JSON string that fails to parse, or a value that
     * is not a plain key/value object (array, primitive, null), is ignored
     * instead of being applied as numeric-keyed properties.
     *
     * @param {object} credentials - The credentials passed to the constructor.
     */
    applyCredentialProperties(credentials) {
        const rawProps = credentials.properties ??
            credentials.propertiesJson ??
            credentials.propertiesJSON;
        if (!rawProps) {
            return;
        }
        let parsed = rawProps;
        if (typeof rawProps === "string") {
            try {
                parsed = JSON.parse(rawProps);
            }
            catch {
                // Deliberately tolerant: malformed JSON leaves defaults in place.
                return;
            }
        }
        if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
            return;
        }
        for (const [key, value] of Object.entries(parsed)) {
            this.setProperty(key, value);
        }
    }
    /**
     * Converts input into plain text for the request: Speech Markdown is
     * converted to SSML first (only when `options.useSpeechMarkdown` is set),
     * and any SSML markup is then stripped.
     *
     * @param {string} text - Raw input text.
     * @param {object} [options] - Synthesis options.
     * @returns {Promise<string>} Text with SSML/Speech Markdown markup removed.
     */
    async prepareText(text, options) {
        let processedText = text;
        if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
            const ssml = await SpeechMarkdown.toSSML(processedText, "w3c");
            processedText = SSMLUtils.stripSSML(ssml);
        }
        if (SSMLUtils.isSSML(processedText)) {
            processedText = SSMLUtils.stripSSML(processedText);
        }
        return processedText;
    }
    /** @param {string} model - Model id used for subsequent requests. */
    setModel(model) {
        this.model = model;
    }
    /** @param {string} voiceId - Prebuilt voice name used for subsequent requests. */
    setVoice(voiceId) {
        this.voiceId = voiceId;
    }
    /**
     * Returns "model", "voice" and "baseURL"/"baseUrl" from this client and
     * defers every other property to the base class.
     *
     * @param {string} property - Property name.
     * @returns {*} The property value.
     */
    getProperty(property) {
        switch (property) {
            case "model":
                return this.model;
            case "voice":
                return this.voiceId;
            case "baseURL":
            case "baseUrl":
                return this.baseUrl;
            default:
                return super.getProperty(property);
        }
    }
    /**
     * Sets "model", "voice" and "baseURL"/"baseUrl" on this client and defers
     * every other property to the base class.
     *
     * @param {string} property - Property name.
     * @param {*} value - New value.
     */
    setProperty(property, value) {
        switch (property) {
            case "model":
                this.setModel(value);
                break;
            case "voice":
                this.setVoice(value);
                break;
            case "baseURL":
            case "baseUrl":
                this.baseUrl = value;
                break;
            default:
                super.setProperty(property, value);
                break;
        }
    }
    /** @returns {string[]} Names of the credentials this client requires. */
    getRequiredCredentials() {
        return ["apiKey"];
    }
    /**
     * Checks the API key by listing models and looking for the configured model.
     *
     * The listing is requested with `pageSize=1000` so a model that would fall
     * outside the default first page is not reported as missing. A successful
     * response without a `models` array counts as valid.
     *
     * @returns {Promise<boolean>} `false` when the key is missing, the request
     *   fails or is rejected, or the configured model is not in the listing.
     */
    async checkCredentials() {
        if (!this.apiKey) {
            return false;
        }
        try {
            const listUrl = `${trimTrailingSlashes(this.baseUrl)}/models?pageSize=1000`;
            const response = await (0, fetch_utils_1.getFetch)()(listUrl, {
                method: "GET",
                headers: {
                    "x-goog-api-key": this.apiKey,
                },
            });
            if (!response.ok) {
                return false;
            }
            const json = await response.json().catch(() => null);
            if (!json || !Array.isArray(json.models)) {
                return true;
            }
            // Compare bare ids so "x" and "models/x" match on either side.
            const wantedId = toGeminiModelId(this.model);
            return json.models.some((entry) => toGeminiModelId(entry?.name || entry?.id || "") === wantedId);
        }
        catch {
            return false;
        }
    }
    /**
     * Like `checkCredentials`, but returns a result object.
     *
     * @returns {Promise<{success: boolean, voiceCount?: number, error?: string}>}
     */
    async checkCredentialsDetailed() {
        try {
            const success = await this.checkCredentials();
            return success
                ? { success: true, voiceCount: GeminiTTSClient.VOICES.length }
                : {
                    success: false,
                    error: this.apiKey ? "Gemini credentials are invalid" : "Missing apiKey",
                };
        }
        catch (error) {
            return {
                success: false,
                error: error instanceof Error ? error.message : String(error),
            };
        }
    }
    /** @returns {Promise<object[]>} The static voice table (`GeminiTTSClient.VOICES`). */
    async _getVoices() {
        return GeminiTTSClient.VOICES;
    }
    /**
     * Maps raw voice records to the unified voice shape. Every voice is listed
     * with the full supported-language table; style and language readiness
     * travel in `metadata`.
     *
     * @param {object[]} rawVoices - Records as returned by `_getVoices`.
     * @returns {Promise<object[]>} Unified voice descriptions.
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => ({
            id: voice.id,
            name: voice.name,
            gender: voice.gender,
            provider: "gemini",
            languageCodes: GEMINI_SUPPORTED_LANGUAGES.map((language) => ({
                bcp47: language.bcp47,
                iso639_3: (0, language_utils_1.toIso639_3)(language.bcp47),
                display: language.display,
            })),
            metadata: {
                style: voice.style,
                genderSource: "google-cloud-gemini-tts",
                // Copies, so callers cannot mutate the shared module tables.
                supportedLanguageCodes: [...GEMINI_SUPPORTED_LANGUAGE_CODES],
                languageReadiness: { ...GEMINI_LANGUAGE_READINESS },
            },
        }));
    }
    /**
     * Synthesizes `text` with a single generateContent request.
     *
     * @param {string} text - Text, SSML or Speech Markdown (markup is stripped).
     * @param {object} [options] - Per-call overrides: `model`, `voice`,
     *   `providerOptions` (merged into `generationConfig`; the audio modality
     *   and voice settings always win), `format` ("pcm" returns raw PCM),
     *   `useSpeechMarkdown`.
     * @returns {Promise<Uint8Array>} WAV bytes, or raw PCM when `format` is "pcm".
     * @throws {Error} When the API key is missing, the API responds with an
     *   error status, or the response carries no audio.
     */
    async synthToBytes(text, options = {}) {
        if (!this.apiKey) {
            throw new Error("Gemini TTS API key is required. Set apiKey or GEMINI_API_KEY.");
        }
        const preparedText = await this.prepareText(text, options);
        // Accept both "x" and "models/x"; the URL below adds the prefix itself.
        const model = toGeminiModelId(options.model || this.model);
        const voiceName = options.voice || this.voiceId || GeminiTTSClient.DEFAULT_VOICE;
        const generationConfig = {
            ...options.providerOptions,
            responseModalities: ["AUDIO"],
            speechConfig: {
                voiceConfig: {
                    prebuiltVoiceConfig: {
                        voiceName,
                    },
                },
            },
        };
        const request = {
            contents: [
                {
                    parts: [
                        {
                            text: preparedText,
                        },
                    ],
                },
            ],
            generationConfig,
            model,
        };
        const endpoint = `${trimTrailingSlashes(this.baseUrl)}/models/${model}:generateContent`;
        const response = await (0, fetch_utils_1.getFetch)()(endpoint, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "x-goog-api-key": this.apiKey,
            },
            body: JSON.stringify(request),
        });
        if (!response.ok) {
            const errorText = await response.text().catch(() => "");
            throw new Error(`Gemini TTS API error: ${response.status} ${response.statusText} - ${errorText}`);
        }
        const json = await response.json();
        const pcmBytes = this.extractAudioBytes(json);
        this._createEstimatedWordTimings(preparedText);
        if (options.format === "pcm") {
            return pcmBytes;
        }
        // NOTE(review): assumes the payload is 16-bit mono PCM at this.sampleRate;
        // the mimeType reported by the API is not inspected.
        return this.pcm16ToWav(pcmBytes, this.sampleRate, 1);
    }
    /**
     * Synthesizes `text` and exposes the complete audio as a one-chunk stream.
     *
     * @param {string} text - Input text.
     * @param {object} [options] - Same as `synthToBytes`; `useWordBoundary`
     *   additionally requests word boundaries derived from `this.timings`.
     * @returns {Promise<{audioStream: ReadableStream, wordBoundaries: object[]}>}
     */
    async synthToBytestream(text, options = {}) {
        const audioBytes = await this.synthToBytes(text, options);
        const stream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        // NOTE(review): timing values are scaled by 10000 here; the unit of
        // this.timings is set by the base class — confirm before relying on it.
        const wordBoundaries = options.useWordBoundary
            ? this.timings.map(([start, end, word]) => ({
                text: word,
                offset: Math.round(start * 10000),
                duration: Math.round((end - start) * 10000),
            }))
            : [];
        return { audioStream: stream, wordBoundaries };
    }
    /**
     * Returns the decoded bytes of the first non-empty inline audio part in a
     * generateContent response.
     *
     * @param {object} response - Parsed JSON response.
     * @returns {Uint8Array} Decoded audio bytes.
     * @throws {Error} When no part carries inline data; the message includes
     *   any finish reasons and text parts found, to aid diagnosis.
     */
    extractAudioBytes(response) {
        const candidates = Array.isArray(response?.candidates) ? response.candidates : [];
        const textParts = [];
        for (const candidate of candidates) {
            const parts = Array.isArray(candidate?.content?.parts) ? candidate.content.parts : [];
            for (const part of parts) {
                // Both camelCase and snake_case field spellings are accepted.
                const inlineData = part?.inlineData || part?.inline_data;
                if (typeof inlineData?.data === "string" && inlineData.data.length > 0) {
                    return this.base64ToBytes(inlineData.data);
                }
                if (typeof part?.text === "string") {
                    textParts.push(part.text);
                }
            }
        }
        const finishReasons = candidates
            .map((candidate) => candidate?.finishReason || candidate?.finish_reason)
            .filter(Boolean)
            .join(", ");
        const details = [
            finishReasons ? `finish reason: ${finishReasons}` : "",
            textParts.length ? `text parts: ${textParts.join(" ")}` : "",
        ]
            .filter(Boolean)
            .join("; ");
        throw new Error(`Gemini TTS response did not include audio data${details ? ` (${details})` : ""}.`);
    }
    /**
     * Decodes base64 using `Buffer` when available and `atob` otherwise.
     *
     * @param {string} base64 - Base64-encoded data.
     * @returns {Uint8Array} Decoded bytes.
     * @throws {Error} When decoding fails.
     */
    base64ToBytes(base64) {
        try {
            if (typeof Buffer !== "undefined" && typeof Buffer.from === "function") {
                return new Uint8Array(Buffer.from(base64, "base64"));
            }
            const binary = atob(base64);
            const bytes = new Uint8Array(binary.length);
            for (let i = 0; i < binary.length; i++) {
                bytes[i] = binary.charCodeAt(i);
            }
            return bytes;
        }
        catch (error) {
            throw new Error(`Failed to decode Gemini TTS audio data: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Prepends a 44-byte RIFF/WAVE header (format tag 1, 16 bits per sample)
     * to raw PCM bytes.
     *
     * @param {Uint8Array} pcmBytes - Raw PCM samples.
     * @param {number} [sampleRate=24000] - Samples per second.
     * @param {number} [channels=1] - Channel count.
     * @returns {Uint8Array} Header followed by the PCM bytes.
     */
    pcm16ToWav(pcmBytes, sampleRate = 24000, channels = 1) {
        const bitsPerSample = 16;
        const byteRate = (sampleRate * channels * bitsPerSample) / 8;
        const blockAlign = (channels * bitsPerSample) / 8;
        const headerSize = 44;
        const wavBytes = new Uint8Array(headerSize + pcmBytes.length);
        const view = new DataView(wavBytes.buffer);
        // All multi-byte header fields are written little-endian.
        this.writeAscii(wavBytes, 0, "RIFF");
        view.setUint32(4, 36 + pcmBytes.length, true); // RIFF chunk size: file size minus 8
        this.writeAscii(wavBytes, 8, "WAVE");
        this.writeAscii(wavBytes, 12, "fmt ");
        view.setUint32(16, 16, true); // fmt chunk size
        view.setUint16(20, 1, true); // audio format tag
        view.setUint16(22, channels, true);
        view.setUint32(24, sampleRate, true);
        view.setUint32(28, byteRate, true);
        view.setUint16(32, blockAlign, true);
        view.setUint16(34, bitsPerSample, true);
        this.writeAscii(wavBytes, 36, "data");
        view.setUint32(40, pcmBytes.length, true); // data chunk size
        wavBytes.set(pcmBytes, headerSize);
        return wavBytes;
    }
    /**
     * Writes the UTF-16 code units of `value` into `target` starting at `offset`.
     *
     * @param {Uint8Array} target - Destination buffer.
     * @param {number} offset - Start index in `target`.
     * @param {string} value - Text to write (callers pass ASCII tags).
     */
    writeAscii(target, offset, value) {
        for (let i = 0; i < value.length; i++) {
            target[offset + i] = value.charCodeAt(i);
        }
    }
}
// Publish the class as this module's named export.
exports.GeminiTTSClient = GeminiTTSClient;
// Static defaults, defined as ordinary (enumerable, configurable, writable)
// data properties on the class. DEFAULT_MODEL and DEFAULT_VOICE are the
// fallbacks used when the credentials carry no `model` / `voice`.
Object.defineProperties(GeminiTTSClient, {
    DEFAULT_MODEL: {
        value: "gemini-3.1-flash-tts-preview",
        writable: true,
        configurable: true,
        enumerable: true
    },
    DEFAULT_VOICE: {
        value: "Kore",
        writable: true,
        configurable: true,
        enumerable: true
    }
});
// Static table of the prebuilt voices. Each row is [id, style, gender]; the
// display name of every voice equals its id, so `name` is filled from `id`.
Object.defineProperty(GeminiTTSClient, "VOICES", {
    value: [
        ["Zephyr", "Bright", "Female"],
        ["Puck", "Upbeat", "Male"],
        ["Charon", "Informative", "Male"],
        ["Kore", "Firm", "Female"],
        ["Fenrir", "Excitable", "Male"],
        ["Leda", "Youthful", "Female"],
        ["Orus", "Firm", "Male"],
        ["Aoede", "Breezy", "Female"],
        ["Callirrhoe", "Easy-going", "Female"],
        ["Autonoe", "Bright", "Female"],
        ["Enceladus", "Breathy", "Male"],
        ["Iapetus", "Clear", "Male"],
        ["Umbriel", "Easy-going", "Male"],
        ["Algieba", "Smooth", "Male"],
        ["Despina", "Smooth", "Female"],
        ["Erinome", "Clear", "Female"],
        ["Algenib", "Gravelly", "Male"],
        ["Rasalgethi", "Informative", "Male"],
        ["Laomedeia", "Upbeat", "Female"],
        ["Achernar", "Soft", "Female"],
        ["Alnilam", "Firm", "Male"],
        ["Schedar", "Even", "Male"],
        ["Gacrux", "Mature", "Female"],
        ["Pulcherrima", "Forward", "Female"],
        ["Achird", "Friendly", "Male"],
        ["Zubenelgenubi", "Casual", "Male"],
        ["Vindemiatrix", "Gentle", "Female"],
        ["Sadachbia", "Lively", "Male"],
        ["Sadaltager", "Knowledgeable", "Male"],
        ["Sulafat", "Warm", "Female"],
    ].map(([id, style, gender]) => ({ id, name: id, style, gender })),
    writable: true,
    configurable: true,
    enumerable: true
});