// js-tts-wrapper
// Version:
// A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services.
// 681 lines (680 loc) • 25 kB
// JavaScript
;
// Module-interop helper: exposes `source[key]` on `target` under `alias`
// (falling back to `key`). Reuses an already-installed helper on `this` if any.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Engines without Object.create: plain value copy.
        return function (target, source, key, alias) {
            if (alias === undefined) {
                alias = key;
            }
            target[alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportedName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // The existing descriptor is reused only when it is an accessor on an
        // ES module, or a data property that is neither writable nor
        // configurable. Everything else gets a forwarding getter.
        var reuseDescriptor = false;
        if (descriptor) {
            reuseDescriptor = "get" in descriptor
                ? Boolean(source.__esModule)
                : !(descriptor.writable || descriptor.configurable);
        }
        if (!reuseDescriptor) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                },
            };
        }
        Object.defineProperty(target, exportedName, descriptor);
    };
})();
// Module-interop helper: installs `value` as the `default` member of `target`.
// Reuses an already-installed helper on `this` if any.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    // Engines without Object.create: plain assignment.
    return function (target, value) {
        target["default"] = value;
    };
})();
// Module-interop helper: returns ES modules untouched; for anything else builds
// a namespace object that re-exports every own key except "default" and sets
// the module itself as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use, then cached.
    var listOwnKeys = null;
    function resolveOwnKeys() {
        return Object.getOwnPropertyNames || function (obj) {
            var collected = [];
            for (var name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    collected[collected.length] = name;
                }
            }
            return collected;
        };
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            if (listOwnKeys === null) {
                listOwnKeys = resolveOwnKeys();
            }
            var names = listOwnKeys(mod);
            for (var index = 0; index < names.length; index++) {
                if (names[index] !== "default") {
                    __createBinding(namespace, mod, names[index]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.CereVoiceTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
const fetch_utils_1 = require("../utils/fetch-utils");
const language_utils_1 = require("../utils/language-utils");
// Three hours in milliseconds. login() and refreshAccessToken() add this to
// Date.now() when computing tokenExpiresAt.
const TOKEN_LIFETIME_MS = 3 * 60 * 60 * 1000;
// One minute in milliseconds, subtracted from the lifetime above so the cached
// token is treated as expired slightly early.
const TOKEN_EXPIRY_BUFFER_MS = 60 * 1000;
// Audio format identifiers accepted by isSupportedAudioFormat().
const SUPPORTED_AUDIO_FORMATS = new Set(["wav", "mp3", "ogg"]);
class CereVoiceTTSClient extends abstract_tts_1.AbstractTTSClient {
constructor(credentials = {}) {
super(credentials);
Object.defineProperty(this, "email", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "password", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "accessToken", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "refreshToken", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "baseUrl", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "audioFormat", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "outputSampleRate", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "language", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "accent", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "metadata", {
enumerable: true,
configurable: true,
writable: true,
value: false
});
Object.defineProperty(this, "tokenExpiresAt", {
enumerable: true,
configurable: true,
writable: true,
value: 0
});
this.email =
credentials.email ||
(typeof process !== "undefined" ? process.env.CEREVOICE_EMAIL || "" : "");
this.password =
credentials.password ||
(typeof process !== "undefined" ? process.env.CEREVOICE_PASSWORD || "" : "");
this.accessToken =
credentials.accessToken ||
(typeof process !== "undefined" ? process.env.CEREVOICE_ACCESS_TOKEN || "" : "");
this.refreshToken =
credentials.refreshToken ||
(typeof process !== "undefined" ? process.env.CEREVOICE_REFRESH_TOKEN || "" : "");
this.baseUrl = (credentials.baseURL || "https://api.cerevoice.com/v2").replace(/\/+$/, "");
this.voiceId = credentials.voice || "Heather";
this.audioFormat = credentials.audioFormat || "wav";
this.outputSampleRate = credentials.sampleRate;
if (this.outputSampleRate) {
this.sampleRate = this.outputSampleRate;
}
this.capabilities = {
browserSupported: true,
nodeSupported: true,
needsWasm: false,
};
this._models = [
{ id: "cerevoice-cloud-v2", features: ["streaming", "ssml", "word-boundary-events"] },
];
if (this.accessToken) {
this.tokenExpiresAt = Number.POSITIVE_INFINITY;
}
this.applyCredentialProperties(credentials);
}
applyCredentialProperties(credentials) {
const rawProps = credentials.properties ??
credentials.propertiesJson ??
credentials.propertiesJSON;
if (!rawProps) {
return;
}
let parsed = null;
if (typeof rawProps === "string") {
try {
parsed = JSON.parse(rawProps);
}
catch {
parsed = null;
}
}
else if (typeof rawProps === "object") {
parsed = rawProps;
}
if (!parsed) {
return;
}
for (const [key, value] of Object.entries(parsed)) {
this.setProperty(key, value);
}
}
setVoice(voiceId, lang) {
this.voiceId = voiceId;
if (lang) {
this.lang = lang;
}
}
getProperty(property) {
switch (property) {
case "voice":
return this.voiceId;
case "baseURL":
return this.baseUrl;
case "audioFormat":
return this.audioFormat;
case "sampleRate":
return this.outputSampleRate;
case "language":
return this.language;
case "accent":
return this.accent;
case "metadata":
return this.metadata;
default:
return super.getProperty(property);
}
}
setProperty(property, value) {
switch (property) {
case "voice":
this.setVoice(String(value));
break;
case "baseURL":
case "baseUrl":
this.baseUrl = String(value).replace(/\/+$/, "");
break;
case "audioFormat":
if (this.isSupportedAudioFormat(value)) {
this.audioFormat = value;
}
break;
case "sampleRate": {
const sampleRate = Number(value);
if (Number.isFinite(sampleRate) && sampleRate > 0) {
this.outputSampleRate = sampleRate;
this.sampleRate = sampleRate;
}
break;
}
case "language":
this.language = String(value);
break;
case "accent":
this.accent = String(value);
break;
case "metadata":
this.metadata = Boolean(value);
break;
default:
super.setProperty(property, value);
break;
}
}
async checkCredentials() {
if (!this.accessToken && !this.refreshToken && (!this.email || !this.password)) {
return false;
}
try {
const voices = await this._getVoices();
return voices.length > 0;
}
catch {
return false;
}
}
getRequiredCredentials() {
return ["email", "password"];
}
async _getVoices() {
try {
const response = await this.fetchWithAuth(this.buildUrl("/voices"));
if (!response.ok) {
return [];
}
const data = (await response.json());
return Array.isArray(data.voices) ? data.voices : [];
}
catch {
return [];
}
}
async _mapVoicesToUnified(rawVoices) {
return rawVoices.map((voice) => {
const language = voice.language_iso || "en";
const country = voice.country_iso || undefined;
const bcp47 = country ? `${language.toLowerCase()}-${country.toUpperCase()}` : language;
return {
id: voice.name || "unknown",
name: voice.name || "Unknown",
gender: this.mapGender(voice.gender),
provider: "cerevoice",
languageCodes: [
{
bcp47,
iso639_3: (0, language_utils_1.toIso639_3)(bcp47),
display: (0, language_utils_1.toLanguageDisplay)(bcp47),
},
],
metadata: {
sample_rate: voice.sample_rate,
accent_code: voice.accent_code,
accent: voice.accent,
country: voice.country,
region: voice.region,
language_iso: voice.language_iso,
country_iso: voice.country_iso,
language_ms: voice.language_ms,
language: voice.language,
},
};
});
}
async synthToBytes(text, options = {}) {
const prepared = await this.prepareInput(text, options);
const wantsMetadata = this.shouldRequestMetadata(options);
const response = await this.requestSynthesis(prepared, options, wantsMetadata);
const audioBytes = new Uint8Array(await response.arrayBuffer());
if (wantsMetadata) {
const wordBoundaries = await this.getWordBoundariesFromResponse(response);
if (wordBoundaries.length > 0) {
this.timings = wordBoundaries.map((wb) => [
wb.offset / 10000,
(wb.offset + wb.duration) / 10000,
wb.text,
]);
}
}
else {
this._createEstimatedWordTimings(prepared.plainText);
}
return audioBytes;
}
async synthToBytestream(text, options = {}) {
const prepared = await this.prepareInput(text, options);
const wantsMetadata = this.shouldRequestMetadata(options);
const response = await this.requestSynthesis(prepared, options, wantsMetadata);
const wordBoundaries = wantsMetadata ? await this.getWordBoundariesFromResponse(response) : [];
if (wordBoundaries.length > 0) {
this.timings = wordBoundaries.map((wb) => [
wb.offset / 10000,
(wb.offset + wb.duration) / 10000,
wb.text,
]);
}
if (response.body) {
return {
audioStream: response.body,
wordBoundaries,
};
}
const audioBytes = new Uint8Array(await response.arrayBuffer());
const audioStream = new ReadableStream({
start(controller) {
controller.enqueue(audioBytes);
controller.close();
},
});
return {
audioStream,
wordBoundaries,
};
}
async requestSynthesis(prepared, options, metadata) {
const audioFormat = this.resolveAudioFormat(options);
const providerOptions = options.providerOptions || {};
const url = this.buildUrl("/speak", {
voice: options.voice || this.voiceId || undefined,
audio_format: audioFormat,
sample_rate: options.sampleRate || this.outputSampleRate,
language: options.language || this.language,
accent: options.accent || this.accent,
metadata,
...providerOptions,
});
const response = await this.fetchWithAuth(url, {
method: "POST",
headers: {
Accept: this.acceptHeaderForFormat(audioFormat),
"Content-Type": prepared.contentType,
},
body: prepared.body,
});
if (!response.ok) {
const errorText = await this.safeReadErrorText(response);
throw new Error(`CereVoice API error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}`);
}
return response;
}
async prepareInput(text, options) {
let processedText = text;
if (options.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
processedText = await SpeechMarkdown.toSSML(processedText, "w3c");
}
if (options.rawSSML || this.isXmlLike(processedText)) {
const body = options.rawSSML && !this.isXmlLike(processedText)
? SSMLUtils.wrapWithSpeakTags(this.escapeXml(processedText))
: processedText;
return {
body,
contentType: "text/xml",
plainText: SSMLUtils.stripSSML(body),
};
}
if (this.shouldApplyProsody(options)) {
const attrs = [];
const rate = options.rate ?? this.properties.rate;
const pitch = options.pitch ?? this.properties.pitch;
const volume = options.volume ?? this.properties.volume;
if (rate && rate !== "medium") {
attrs.push(`rate="${rate}"`);
}
if (pitch && pitch !== "medium") {
attrs.push(`pitch="${pitch}"`);
}
if (volume !== undefined && volume !== 100) {
attrs.push(`volume="${volume}"`);
}
const escapedText = this.escapeXml(processedText);
const body = attrs.length > 0
? `<speak><prosody ${attrs.join(" ")}>${escapedText}</prosody></speak>`
: `<speak>${escapedText}</speak>`;
return {
body,
contentType: "text/xml",
plainText: processedText,
};
}
return {
body: processedText,
contentType: "text/plain",
plainText: processedText,
};
}
shouldApplyProsody(options) {
return (options.rate !== undefined ||
options.pitch !== undefined ||
options.volume !== undefined ||
this.properties.rate !== "medium" ||
this.properties.pitch !== "medium" ||
this.properties.volume !== 100);
}
shouldRequestMetadata(options) {
return Boolean(options.useWordBoundary || options.metadata || this.metadata);
}
async getWordBoundariesFromResponse(response) {
const metadataUrl = this.getHeader(response.headers, "X-CereVoice-Metadata");
if (!metadataUrl) {
return [];
}
try {
const metadataResponse = await (0, fetch_utils_1.getFetch)()(metadataUrl, {
method: "GET",
headers: {
Accept: "text/xml, application/xml, text/plain",
},
});
if (!metadataResponse.ok) {
return [];
}
return this.parseMetadataXml(await metadataResponse.text());
}
catch {
return [];
}
}
parseMetadataXml(xml) {
if (!xml.trim()) {
return [];
}
if (typeof DOMParser !== "undefined") {
try {
const document = new DOMParser().parseFromString(xml, "application/xml");
const words = Array.from(document.getElementsByTagName("word"));
const parsed = words
.map((word) => this.createWordBoundary(word.getAttribute("name"), word.getAttribute("start"), word.getAttribute("end")))
.filter((word) => Boolean(word));
if (parsed.length > 0) {
return this.fillMissingDurations(parsed);
}
}
catch {
return [];
}
}
const wordBoundaries = [];
const wordTagRegex = /<word\b([^>]*)\/?>/gi;
let wordMatch = wordTagRegex.exec(xml);
while (wordMatch !== null) {
const attributes = this.parseXmlAttributes(wordMatch[1]);
const boundary = this.createWordBoundary(attributes.name, attributes.start, attributes.end);
if (boundary) {
wordBoundaries.push(boundary);
}
wordMatch = wordTagRegex.exec(xml);
}
return this.fillMissingDurations(wordBoundaries);
}
fillMissingDurations(wordBoundaries) {
return wordBoundaries.map((boundary, index) => {
if (boundary.duration > 0) {
return boundary;
}
const next = wordBoundaries[index + 1];
const fallbackDuration = next ? Math.max(next.offset - boundary.offset, 0) : 5000;
return {
...boundary,
duration: fallbackDuration,
};
});
}
parseXmlAttributes(attributeText) {
const attributes = {};
const attrRegex = /([A-Za-z_:][\w:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
let attrMatch = attrRegex.exec(attributeText);
while (attrMatch !== null) {
attributes[attrMatch[1]] = this.decodeXmlEntities(attrMatch[2] ?? attrMatch[3] ?? "");
attrMatch = attrRegex.exec(attributeText);
}
return attributes;
}
createWordBoundary(name, start, end) {
if (!name || start === undefined || start === null || end === undefined || end === null) {
return null;
}
const startSeconds = Number(start);
const endSeconds = Number(end);
if (!Number.isFinite(startSeconds) ||
!Number.isFinite(endSeconds) ||
endSeconds < startSeconds) {
return null;
}
return {
text: name,
offset: Math.round(startSeconds * 10000),
duration: Math.round((endSeconds - startSeconds) * 10000),
};
}
async fetchWithAuth(url, options = {}, retry = true) {
const token = await this.ensureAccessToken();
const response = await (0, fetch_utils_1.getFetch)()(url, {
...options,
headers: {
...(options.headers || {}),
Authorization: `Bearer ${token}`,
},
});
if (response.status === 401 && retry) {
const refreshedToken = await this.ensureAccessToken(true);
return (0, fetch_utils_1.getFetch)()(url, {
...options,
headers: {
...(options.headers || {}),
Authorization: `Bearer ${refreshedToken}`,
},
});
}
return response;
}
async ensureAccessToken(forceRefresh = false) {
if (!forceRefresh && this.accessToken && Date.now() < this.tokenExpiresAt) {
return this.accessToken;
}
if (this.refreshToken) {
try {
await this.refreshAccessToken();
return this.accessToken;
}
catch {
if (!this.email || !this.password) {
throw new Error("CereVoice refresh token is invalid or expired");
}
}
}
if (!this.email || !this.password) {
throw new Error("CereVoice email and password are required for authentication");
}
await this.login();
return this.accessToken;
}
async login() {
const response = await (0, fetch_utils_1.getFetch)()(this.buildUrl("/auth"), {
method: "GET",
headers: {
Authorization: `Basic ${this.encodeBasicCredentials(`${this.email}:${this.password}`)}`,
},
});
if (!response.ok) {
const errorText = await this.safeReadErrorText(response);
throw new Error(`CereVoice auth error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}`);
}
const data = (await response.json());
if (!data.access_token) {
throw new Error("CereVoice auth response did not include an access token");
}
this.accessToken = data.access_token;
this.refreshToken = data.refresh_token || this.refreshToken;
this.tokenExpiresAt = Date.now() + TOKEN_LIFETIME_MS - TOKEN_EXPIRY_BUFFER_MS;
}
async refreshAccessToken() {
const response = await (0, fetch_utils_1.getFetch)()(this.buildUrl("/auth/refresh", { refresh_token: this.refreshToken }), {
method: "GET",
});
if (!response.ok) {
const errorText = await this.safeReadErrorText(response);
throw new Error(`CereVoice refresh error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}`);
}
const data = (await response.json());
if (!data.access_token) {
throw new Error("CereVoice refresh response did not include an access token");
}
this.accessToken = data.access_token;
this.tokenExpiresAt = Date.now() + TOKEN_LIFETIME_MS - TOKEN_EXPIRY_BUFFER_MS;
}
buildUrl(path, params = {}) {
const url = new URL(`${this.baseUrl}${path}`);
for (const [key, value] of Object.entries(params)) {
if (value !== undefined) {
url.searchParams.set(key, String(value));
}
}
return url.toString();
}
resolveAudioFormat(options) {
const requested = options.audioFormat || options.format || this.audioFormat;
return this.isSupportedAudioFormat(requested) ? requested : this.audioFormat;
}
isSupportedAudioFormat(value) {
return typeof value === "string" && SUPPORTED_AUDIO_FORMATS.has(value);
}
acceptHeaderForFormat(format) {
switch (format) {
case "mp3":
return "audio/mpeg";
case "ogg":
return "audio/ogg";
case "wav":
default:
return "audio/wav";
}
}
mapGender(gender) {
const normalized = gender?.toLowerCase();
if (normalized === "male") {
return "Male";
}
if (normalized === "female") {
return "Female";
}
return "Unknown";
}
isXmlLike(text) {
return /^\s*(<\?xml|<speak\b|<doc\b|<[A-Za-z][\w:.-]*(\s|>|\/>))/i.test(text);
}
escapeXml(text) {
return text
.replace(/&/g, "&")
.replace(/</g, "<")
.replace(/>/g, ">")
.replace(/"/g, """)
.replace(/'/g, "'");
}
decodeXmlEntities(text) {
return text
.replace(/'/g, "'")
.replace(/"/g, '"')
.replace(/>/g, ">")
.replace(/</g, "<")
.replace(/&/g, "&");
}
getHeader(headers, name) {
if (!headers) {
return null;
}
if (typeof headers.get === "function") {
return headers.get(name) || headers.get(name.toLowerCase());
}
const record = headers;
return record[name] || record[name.toLowerCase()] || null;
}
encodeBasicCredentials(value) {
if (typeof Buffer !== "undefined") {
return Buffer.from(value, "utf8").toString("base64");
}
const bytes = new TextEncoder().encode(value);
let binary = "";
for (const byte of bytes) {
binary += String.fromCharCode(byte);
}
return btoa(binary);
}
async safeReadErrorText(response) {
try {
return await response.text();
}
catch {
return "";
}
}
}
exports.CereVoiceTTSClient = CereVoiceTTSClient;