UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

681 lines (680 loc) 25 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers (compiler output).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.CereVoiceTTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
const fetch_utils_1 = require("../utils/fetch-utils");
const language_utils_1 = require("../utils/language-utils");

/** Tokens obtained via login()/refreshAccessToken() are treated as valid for this long. */
const TOKEN_LIFETIME_MS = 3 * 60 * 60 * 1000;
/** Safety margin subtracted from the lifetime so a token is renewed before it lapses. */
const TOKEN_EXPIRY_BUFFER_MS = 60 * 1000;
/** Audio formats accepted by setProperty("audioFormat") and resolveAudioFormat(). */
const SUPPORTED_AUDIO_FORMATS = new Set(["wav", "mp3", "ogg"]);
/** Word-boundary offsets/durations are stored as (parsed start/end value) * this scale. */
const BOUNDARY_SCALE = 10000;

/**
 * Defines an own, enumerable, configurable, writable data property on `target`.
 * Mirrors what the TypeScript compiler emits for class fields.
 */
function defineField(target, name, value) {
    Object.defineProperty(target, name, {
        enumerable: true,
        configurable: true,
        writable: true,
        value,
    });
}

/**
 * Converts word boundaries ({ text, offset, duration } at BOUNDARY_SCALE) into
 * `[start, end, text]` triples by dividing offset and offset+duration by the scale.
 */
function boundariesToTimings(wordBoundaries) {
    return wordBoundaries.map((wb) => [
        wb.offset / BOUNDARY_SCALE,
        (wb.offset + wb.duration) / BOUNDARY_SCALE,
        wb.text,
    ]);
}

/**
 * TTS client for the CereVoice Cloud HTTP API.
 *
 * Authenticates with email/password (Basic auth against `/auth`), a refresh
 * token (`/auth/refresh`), or a pre-supplied access token, and synthesizes
 * speech through `/speak`. Optionally retrieves word timing metadata from the
 * URL advertised in the `X-CereVoice-Metadata` response header.
 */
class CereVoiceTTSClient extends abstract_tts_1.AbstractTTSClient {
    /**
     * @param {object} [credentials] Supported keys: `email`, `password`,
     *   `accessToken`, `refreshToken`, `baseURL`, `voice`, `audioFormat`,
     *   `sampleRate`, and `properties` / `propertiesJson` / `propertiesJSON`
     *   (an object or JSON string of properties applied via setProperty()).
     *   Missing email/password/token values fall back to the
     *   `CEREVOICE_EMAIL`, `CEREVOICE_PASSWORD`, `CEREVOICE_ACCESS_TOKEN` and
     *   `CEREVOICE_REFRESH_TOKEN` environment variables when `process` exists.
     */
    constructor(credentials = {}) {
        super(credentials);
        // Declare instance fields in the same order the compiler emitted them.
        const unsetFields = [
            "email",
            "password",
            "accessToken",
            "refreshToken",
            "baseUrl",
            "audioFormat",
            "outputSampleRate",
            "language",
            "accent",
        ];
        for (const field of unsetFields) {
            defineField(this, field, void 0);
        }
        defineField(this, "metadata", false);
        defineField(this, "tokenExpiresAt", 0);

        this.email = credentials.email || (typeof process !== "undefined" ? process.env.CEREVOICE_EMAIL || "" : "");
        this.password = credentials.password || (typeof process !== "undefined" ? process.env.CEREVOICE_PASSWORD || "" : "");
        this.accessToken = credentials.accessToken || (typeof process !== "undefined" ? process.env.CEREVOICE_ACCESS_TOKEN || "" : "");
        this.refreshToken = credentials.refreshToken || (typeof process !== "undefined" ? process.env.CEREVOICE_REFRESH_TOKEN || "" : "");
        // Strip trailing slashes so buildUrl() can append "/path" directly.
        this.baseUrl = (credentials.baseURL || "https://api.cerevoice.com/v2").replace(/\/+$/, "");
        this.voiceId = credentials.voice || "Heather";
        this.audioFormat = credentials.audioFormat || "wav";
        this.outputSampleRate = credentials.sampleRate;
        if (this.outputSampleRate) {
            this.sampleRate = this.outputSampleRate;
        }
        this.capabilities = {
            browserSupported: true,
            nodeSupported: true,
            needsWasm: false,
        };
        this._models = [
            { id: "cerevoice-cloud-v2", features: ["streaming", "ssml", "word-boundary-events"] },
        ];
        if (this.accessToken) {
            // A caller-supplied token has no known expiry; it is only replaced
            // after the server answers 401 (see fetchWithAuth).
            this.tokenExpiresAt = Number.POSITIVE_INFINITY;
        }
        this.applyCredentialProperties(credentials);
    }

    /**
     * Applies `credentials.properties` (or `propertiesJson` / `propertiesJSON`)
     * through setProperty(). Accepts an object or a JSON string; an unparsable
     * string or an empty value is ignored.
     */
    applyCredentialProperties(credentials) {
        const rawProps = credentials.properties ?? credentials.propertiesJson ?? credentials.propertiesJSON;
        if (!rawProps) {
            return;
        }
        let parsed = null;
        if (typeof rawProps === "string") {
            try {
                parsed = JSON.parse(rawProps);
            }
            catch {
                // Deliberately best-effort: malformed JSON means "no properties".
                parsed = null;
            }
        }
        else if (typeof rawProps === "object") {
            parsed = rawProps;
        }
        if (!parsed) {
            return;
        }
        for (const [key, value] of Object.entries(parsed)) {
            this.setProperty(key, value);
        }
    }

    /**
     * Selects the voice used for synthesis.
     * @param {string} voiceId Voice name sent as the `voice` query parameter.
     * @param {string} [lang] When truthy, also stored on `this.lang`.
     */
    setVoice(voiceId, lang) {
        this.voiceId = voiceId;
        if (lang) {
            this.lang = lang;
        }
    }

    /**
     * Reads a client property. Handles the CereVoice-specific names and defers
     * everything else to the base class.
     */
    getProperty(property) {
        switch (property) {
            case "voice":
                return this.voiceId;
            case "baseURL":
                return this.baseUrl;
            case "audioFormat":
                return this.audioFormat;
            case "sampleRate":
                return this.outputSampleRate;
            case "language":
                return this.language;
            case "accent":
                return this.accent;
            case "metadata":
                return this.metadata;
            default:
                return super.getProperty(property);
        }
    }

    /**
     * Writes a client property. Unsupported audio formats and non-positive or
     * non-finite sample rates are silently ignored; unknown property names are
     * forwarded to the base class.
     */
    setProperty(property, value) {
        switch (property) {
            case "voice":
                this.setVoice(String(value));
                break;
            case "baseURL":
            case "baseUrl":
                this.baseUrl = String(value).replace(/\/+$/, "");
                break;
            case "audioFormat":
                if (this.isSupportedAudioFormat(value)) {
                    this.audioFormat = value;
                }
                break;
            case "sampleRate": {
                const sampleRate = Number(value);
                if (Number.isFinite(sampleRate) && sampleRate > 0) {
                    this.outputSampleRate = sampleRate;
                    this.sampleRate = sampleRate;
                }
                break;
            }
            case "language":
                this.language = String(value);
                break;
            case "accent":
                this.accent = String(value);
                break;
            case "metadata":
                this.metadata = Boolean(value);
                break;
            default:
                super.setProperty(property, value);
                break;
        }
    }

    /**
     * Returns true when some form of credential is configured AND the voice
     * list can be fetched and is non-empty. Never throws.
     * @returns {Promise<boolean>}
     */
    async checkCredentials() {
        const hasLogin = Boolean(this.email && this.password);
        if (!this.accessToken && !this.refreshToken && !hasLogin) {
            return false;
        }
        try {
            const voices = await this._getVoices();
            return voices.length > 0;
        }
        catch {
            return false;
        }
    }

    /** @returns {string[]} Names of the credential fields this engine expects. */
    getRequiredCredentials() {
        return ["email", "password"];
    }

    /**
     * Fetches the raw voice list from `/voices`.
     * Any failure (auth, network, non-2xx, unexpected payload) yields `[]`.
     * @returns {Promise<object[]>}
     */
    async _getVoices() {
        try {
            const response = await this.fetchWithAuth(this.buildUrl("/voices"));
            if (!response.ok) {
                return [];
            }
            const data = (await response.json());
            return Array.isArray(data.voices) ? data.voices : [];
        }
        catch {
            return [];
        }
    }

    /**
     * Maps raw CereVoice voice records to the wrapper's unified voice shape.
     * The BCP-47 tag is `<language_iso lower>-<country_iso UPPER>` when a
     * country is present, otherwise `language_iso` as received ("en" if absent).
     */
    async _mapVoicesToUnified(rawVoices) {
        return rawVoices.map((voice) => {
            const language = voice.language_iso || "en";
            const country = voice.country_iso || undefined;
            const bcp47 = country
                ? `${language.toLowerCase()}-${country.toUpperCase()}`
                : language;
            return {
                id: voice.name || "unknown",
                name: voice.name || "Unknown",
                gender: this.mapGender(voice.gender),
                provider: "cerevoice",
                languageCodes: [
                    {
                        bcp47,
                        iso639_3: (0, language_utils_1.toIso639_3)(bcp47),
                        display: (0, language_utils_1.toLanguageDisplay)(bcp47),
                    },
                ],
                metadata: {
                    sample_rate: voice.sample_rate,
                    accent_code: voice.accent_code,
                    accent: voice.accent,
                    country: voice.country,
                    region: voice.region,
                    language_iso: voice.language_iso,
                    country_iso: voice.country_iso,
                    language_ms: voice.language_ms,
                    language: voice.language,
                },
            };
        });
    }

    /**
     * Synthesizes `text` and returns the complete audio as bytes.
     *
     * Side effect: updates word timings. Real timings from the metadata
     * endpoint are used when requested and available; in every other case
     * (metadata not requested, header missing, fetch failed, no words parsed)
     * estimated timings are generated so stale timings from a previous call
     * are never left in place.
     *
     * @param {string} text Plain text, SSML, or Speech Markdown.
     * @param {object} [options] Per-call synthesis options.
     * @returns {Promise<Uint8Array>}
     * @throws {Error} When authentication fails or the API responds non-2xx.
     */
    async synthToBytes(text, options = {}) {
        const prepared = await this.prepareInput(text, options);
        const wantsMetadata = this.shouldRequestMetadata(options);
        const response = await this.requestSynthesis(prepared, options, wantsMetadata);
        const audioBytes = new Uint8Array(await response.arrayBuffer());
        let hasRealTimings = false;
        if (wantsMetadata) {
            const wordBoundaries = await this.getWordBoundariesFromResponse(response);
            if (wordBoundaries.length > 0) {
                this.timings = boundariesToTimings(wordBoundaries);
                hasRealTimings = true;
            }
        }
        if (!hasRealTimings) {
            this._createEstimatedWordTimings(prepared.plainText);
        }
        return audioBytes;
    }

    /**
     * Synthesizes `text` and returns the audio as a stream plus any word
     * boundaries obtained from the metadata endpoint.
     *
     * When the response exposes no `body` stream, the audio is buffered and
     * wrapped in a single-chunk ReadableStream.
     *
     * @param {string} text Plain text, SSML, or Speech Markdown.
     * @param {object} [options] Per-call synthesis options.
     * @returns {Promise<{audioStream: ReadableStream, wordBoundaries: object[]}>}
     * @throws {Error} When authentication fails or the API responds non-2xx.
     */
    async synthToBytestream(text, options = {}) {
        const prepared = await this.prepareInput(text, options);
        const wantsMetadata = this.shouldRequestMetadata(options);
        const response = await this.requestSynthesis(prepared, options, wantsMetadata);
        const wordBoundaries = wantsMetadata
            ? await this.getWordBoundariesFromResponse(response)
            : [];
        if (wordBoundaries.length > 0) {
            this.timings = boundariesToTimings(wordBoundaries);
        }
        if (response.body) {
            return {
                audioStream: response.body,
                wordBoundaries,
            };
        }
        const audioBytes = new Uint8Array(await response.arrayBuffer());
        const audioStream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        return {
            audioStream,
            wordBoundaries,
        };
    }

    /**
     * POSTs the prepared input to `/speak` and returns the successful response.
     *
     * Query parameters come from `options` with instance-level fallbacks;
     * entries in `options.providerOptions` are spread last and therefore
     * override the computed parameters.
     *
     * @param {{body: string, contentType: string}} prepared Output of prepareInput().
     * @param {object} options Per-call synthesis options.
     * @param {boolean} metadata Value sent as the `metadata` query parameter.
     * @returns {Promise<Response>}
     * @throws {Error} `CereVoice API error: ...` on a non-2xx response.
     */
    async requestSynthesis(prepared, options, metadata) {
        const audioFormat = this.resolveAudioFormat(options);
        const providerOptions = options.providerOptions || {};
        const url = this.buildUrl("/speak", {
            voice: options.voice || this.voiceId || undefined,
            audio_format: audioFormat,
            sample_rate: options.sampleRate || this.outputSampleRate,
            language: options.language || this.language,
            accent: options.accent || this.accent,
            metadata,
            ...providerOptions,
        });
        const response = await this.fetchWithAuth(url, {
            method: "POST",
            headers: {
                Accept: this.acceptHeaderForFormat(audioFormat),
                "Content-Type": prepared.contentType,
            },
            body: prepared.body,
        });
        if (!response.ok) {
            const errorText = await this.safeReadErrorText(response);
            throw new Error(`CereVoice API error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}`);
        }
        return response;
    }

    /**
     * Turns caller input into a request body.
     *
     * - Speech Markdown is converted to SSML when `options.useSpeechMarkdown`
     *   is set and the text is detected as Speech Markdown.
     * - XML-looking input (or `options.rawSSML`) is sent as `text/xml`; with
     *   `rawSSML` and non-XML text, the text is escaped and wrapped in speak tags.
     * - If prosody applies (see shouldApplyProsody), the escaped text is
     *   wrapped in `<speak>` and, when any attribute differs from its default,
     *   a `<prosody>` element.
     * - Otherwise the text is sent unchanged as `text/plain`.
     *
     * @returns {Promise<{body: string, contentType: string, plainText: string}>}
     */
    async prepareInput(text, options) {
        let processedText = text;
        if (options.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
            processedText = await SpeechMarkdown.toSSML(processedText, "w3c");
        }
        if (options.rawSSML || this.isXmlLike(processedText)) {
            const body = options.rawSSML && !this.isXmlLike(processedText)
                ? SSMLUtils.wrapWithSpeakTags(this.escapeXml(processedText))
                : processedText;
            return {
                body,
                contentType: "text/xml",
                plainText: SSMLUtils.stripSSML(body),
            };
        }
        if (this.shouldApplyProsody(options)) {
            const attrs = [];
            const rate = options.rate ?? this.properties.rate;
            const pitch = options.pitch ?? this.properties.pitch;
            const volume = options.volume ?? this.properties.volume;
            // Attribute values are escaped so a caller-supplied value containing
            // quotes or markup characters cannot produce malformed SSML.
            if (rate && rate !== "medium") {
                attrs.push(`rate="${this.escapeXml(String(rate))}"`);
            }
            if (pitch && pitch !== "medium") {
                attrs.push(`pitch="${this.escapeXml(String(pitch))}"`);
            }
            if (volume !== undefined && volume !== 100) {
                attrs.push(`volume="${this.escapeXml(String(volume))}"`);
            }
            const escapedText = this.escapeXml(processedText);
            const body = attrs.length > 0
                ? `<speak><prosody ${attrs.join(" ")}>${escapedText}</prosody></speak>`
                : `<speak>${escapedText}</speak>`;
            return {
                body,
                contentType: "text/xml",
                plainText: processedText,
            };
        }
        return {
            body: processedText,
            contentType: "text/plain",
            plainText: processedText,
        };
    }

    /**
     * True when the call specifies rate/pitch/volume, or the instance-level
     * properties differ from "medium" / "medium" / 100.
     */
    shouldApplyProsody(options) {
        return (options.rate !== undefined ||
            options.pitch !== undefined ||
            options.volume !== undefined ||
            this.properties.rate !== "medium" ||
            this.properties.pitch !== "medium" ||
            this.properties.volume !== 100);
    }

    /** True when word timing metadata is requested per call or on the instance. */
    shouldRequestMetadata(options) {
        return Boolean(options.useWordBoundary || options.metadata || this.metadata);
    }

    /**
     * Fetches and parses the metadata document whose URL is given in the
     * `X-CereVoice-Metadata` response header. The metadata request is sent
     * without an Authorization header. Best-effort: returns `[]` when the
     * header is absent, the request fails, or the response is non-2xx.
     * @returns {Promise<Array<{text: string, offset: number, duration: number}>>}
     */
    async getWordBoundariesFromResponse(response) {
        const metadataUrl = this.getHeader(response.headers, "X-CereVoice-Metadata");
        if (!metadataUrl) {
            return [];
        }
        try {
            const metadataResponse = await (0, fetch_utils_1.getFetch)()(metadataUrl, {
                method: "GET",
                headers: {
                    Accept: "text/xml, application/xml, text/plain",
                },
            });
            if (!metadataResponse.ok) {
                return [];
            }
            return this.parseMetadataXml(await metadataResponse.text());
        }
        catch {
            return [];
        }
    }

    /**
     * Extracts `<word name start end>` elements from metadata XML.
     *
     * Uses DOMParser when the runtime provides one; if that yields no words
     * or throws, falls back to a regex scan so both failure modes behave
     * the same way.
     *
     * @param {string} xml
     * @returns {Array<{text: string, offset: number, duration: number}>}
     */
    parseMetadataXml(xml) {
        if (!xml.trim()) {
            return [];
        }
        if (typeof DOMParser !== "undefined") {
            try {
                const xmlDoc = new DOMParser().parseFromString(xml, "application/xml");
                const words = Array.from(xmlDoc.getElementsByTagName("word"));
                const parsed = words
                    .map((word) => this.createWordBoundary(word.getAttribute("name"), word.getAttribute("start"), word.getAttribute("end")))
                    .filter((word) => Boolean(word));
                if (parsed.length > 0) {
                    return this.fillMissingDurations(parsed);
                }
            }
            catch {
                // Fall through to the regex-based parser below.
            }
        }
        const wordBoundaries = [];
        // The attribute capture may include a trailing "/" from self-closing
        // tags; parseXmlAttributes() only picks out name="value" pairs.
        const wordTagRegex = /<word\b([^>]*)\/?>/gi;
        let wordMatch = wordTagRegex.exec(xml);
        while (wordMatch !== null) {
            const attributes = this.parseXmlAttributes(wordMatch[1]);
            const boundary = this.createWordBoundary(attributes.name, attributes.start, attributes.end);
            if (boundary) {
                wordBoundaries.push(boundary);
            }
            wordMatch = wordTagRegex.exec(xml);
        }
        return this.fillMissingDurations(wordBoundaries);
    }

    /**
     * Gives zero-duration boundaries a usable duration: the gap to the next
     * boundary's offset (never negative), or 5000 scaled units for the last one.
     */
    fillMissingDurations(wordBoundaries) {
        return wordBoundaries.map((boundary, index) => {
            if (boundary.duration > 0) {
                return boundary;
            }
            const next = wordBoundaries[index + 1];
            const fallbackDuration = next
                ? Math.max(next.offset - boundary.offset, 0)
                : 5000;
            return {
                ...boundary,
                duration: fallbackDuration,
            };
        });
    }

    /**
     * Parses `name="value"` / `name='value'` pairs from the inside of a tag
     * and decodes the five predefined XML entities in each value.
     * @returns {Object<string, string>}
     */
    parseXmlAttributes(attributeText) {
        const attributes = {};
        const attrRegex = /([A-Za-z_:][\w:.-]*)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
        let attrMatch = attrRegex.exec(attributeText);
        while (attrMatch !== null) {
            attributes[attrMatch[1]] = this.decodeXmlEntities(attrMatch[2] ?? attrMatch[3] ?? "");
            attrMatch = attrRegex.exec(attributeText);
        }
        return attributes;
    }

    /**
     * Builds a word boundary from raw attribute values.
     *
     * `start`/`end` are presumably seconds (per the local naming; confirm
     * against the service's metadata format) and are stored multiplied by
     * 10000 and rounded.
     *
     * @returns {{text: string, offset: number, duration: number}|null} `null`
     *   when the name is empty, either time is missing, blank or non-numeric,
     *   or the end precedes the start.
     */
    createWordBoundary(name, start, end) {
        if (!name || start === undefined || start === null || end === undefined || end === null) {
            return null;
        }
        // Number("") and Number("  ") are 0, which would silently turn a blank
        // attribute into time zero; treat blanks as missing instead.
        if (String(start).trim() === "" || String(end).trim() === "") {
            return null;
        }
        const startSeconds = Number(start);
        const endSeconds = Number(end);
        if (!Number.isFinite(startSeconds) || !Number.isFinite(endSeconds) || endSeconds < startSeconds) {
            return null;
        }
        return {
            text: name,
            offset: Math.round(startSeconds * BOUNDARY_SCALE),
            duration: Math.round((endSeconds - startSeconds) * BOUNDARY_SCALE),
        };
    }

    /**
     * Performs a fetch with a Bearer token, renewing the token and retrying
     * once when the server answers 401.
     *
     * @param {string} url
     * @param {object} [options] Fetch init; its headers are preserved and
     *   `Authorization` is added/overridden.
     * @param {boolean} [retry=true] Set false to disable the 401 retry.
     * @returns {Promise<Response>}
     * @throws {Error} When no token can be obtained (see ensureAccessToken).
     */
    async fetchWithAuth(url, options = {}, retry = true) {
        const send = (bearer) => (0, fetch_utils_1.getFetch)()(url, {
            ...options,
            headers: {
                ...(options.headers || {}),
                Authorization: `Bearer ${bearer}`,
            },
        });
        const token = await this.ensureAccessToken();
        const response = await send(token);
        if (response.status === 401 && retry) {
            const refreshedToken = await this.ensureAccessToken(true);
            return send(refreshedToken);
        }
        return response;
    }

    /**
     * Returns a usable access token, obtaining one if necessary.
     *
     * Order of attempts: cached token (unless `forceRefresh` or expired),
     * then the refresh token, then email/password login.
     *
     * @param {boolean} [forceRefresh=false] Ignore the cached token.
     * @returns {Promise<string>}
     * @throws {Error} When refresh fails and no email/password is configured,
     *   when no credentials are configured at all, or when login fails.
     */
    async ensureAccessToken(forceRefresh = false) {
        if (!forceRefresh && this.accessToken && Date.now() < this.tokenExpiresAt) {
            return this.accessToken;
        }
        if (this.refreshToken) {
            try {
                await this.refreshAccessToken();
                return this.accessToken;
            }
            catch {
                // Refresh failed: fall back to a full login when possible.
                if (!this.email || !this.password) {
                    throw new Error("CereVoice refresh token is invalid or expired");
                }
            }
        }
        if (!this.email || !this.password) {
            throw new Error("CereVoice email and password are required for authentication");
        }
        await this.login();
        return this.accessToken;
    }

    /**
     * Logs in with HTTP Basic auth against `/auth` and stores the returned
     * access token (and refresh token, when provided).
     * @throws {Error} On a non-2xx response or a payload without `access_token`.
     */
    async login() {
        const response = await (0, fetch_utils_1.getFetch)()(this.buildUrl("/auth"), {
            method: "GET",
            headers: {
                Authorization: `Basic ${this.encodeBasicCredentials(`${this.email}:${this.password}`)}`,
            },
        });
        if (!response.ok) {
            const errorText = await this.safeReadErrorText(response);
            throw new Error(`CereVoice auth error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}`);
        }
        const data = (await response.json());
        if (!data.access_token) {
            throw new Error("CereVoice auth response did not include an access token");
        }
        this.accessToken = data.access_token;
        this.refreshToken = data.refresh_token || this.refreshToken;
        this.tokenExpiresAt = Date.now() + TOKEN_LIFETIME_MS - TOKEN_EXPIRY_BUFFER_MS;
    }

    /**
     * Exchanges the stored refresh token for a new access token via
     * `/auth/refresh` (the refresh token is sent as a query parameter).
     * @throws {Error} On a non-2xx response or a payload without `access_token`.
     */
    async refreshAccessToken() {
        const response = await (0, fetch_utils_1.getFetch)()(this.buildUrl("/auth/refresh", { refresh_token: this.refreshToken }), {
            method: "GET",
        });
        if (!response.ok) {
            const errorText = await this.safeReadErrorText(response);
            throw new Error(`CereVoice refresh error: ${response.status} ${response.statusText}${errorText ? ` - ${errorText}` : ""}`);
        }
        const data = (await response.json());
        if (!data.access_token) {
            throw new Error("CereVoice refresh response did not include an access token");
        }
        this.accessToken = data.access_token;
        this.tokenExpiresAt = Date.now() + TOKEN_LIFETIME_MS - TOKEN_EXPIRY_BUFFER_MS;
    }

    /**
     * Joins `path` onto the base URL and appends every param whose value is
     * not `undefined` (values are stringified, so `false` and `null` are sent).
     * @returns {string}
     */
    buildUrl(path, params = {}) {
        const url = new URL(`${this.baseUrl}${path}`);
        for (const [key, value] of Object.entries(params)) {
            if (value !== undefined) {
                url.searchParams.set(key, String(value));
            }
        }
        return url.toString();
    }

    /**
     * Picks the audio format for a call: `options.audioFormat`, then
     * `options.format`, then the instance default. An unsupported choice
     * falls back to the instance default.
     */
    resolveAudioFormat(options) {
        const requested = options.audioFormat || options.format || this.audioFormat;
        return this.isSupportedAudioFormat(requested)
            ? requested
            : this.audioFormat;
    }

    /** True when `value` is one of the strings in SUPPORTED_AUDIO_FORMATS. */
    isSupportedAudioFormat(value) {
        return typeof value === "string" && SUPPORTED_AUDIO_FORMATS.has(value);
    }

    /** Maps an audio format to the `Accept` header value; unknown formats map to WAV. */
    acceptHeaderForFormat(format) {
        switch (format) {
            case "mp3":
                return "audio/mpeg";
            case "ogg":
                return "audio/ogg";
            case "wav":
            default:
                return "audio/wav";
        }
    }

    /** Normalizes a gender string to "Male", "Female" or "Unknown" (case-insensitive). */
    mapGender(gender) {
        const normalized = gender?.toLowerCase();
        if (normalized === "male") {
            return "Male";
        }
        if (normalized === "female") {
            return "Female";
        }
        return "Unknown";
    }

    /**
     * True when the text, after optional leading whitespace, starts with an
     * XML declaration or something shaped like an opening tag.
     */
    isXmlLike(text) {
        return /^\s*(<\?xml|<speak\b|<doc\b|<[A-Za-z][\w:.-]*(\s|>|\/>))/i.test(text);
    }

    /** Escapes the five XML special characters (`&` first, to avoid double-escaping). */
    escapeXml(text) {
        return text
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&apos;");
    }

    /** Decodes the five predefined XML entities (`&amp;` last, to avoid double-decoding). */
    decodeXmlEntities(text) {
        return text
            .replace(/&apos;/g, "'")
            .replace(/&quot;/g, '"')
            .replace(/&gt;/g, ">")
            .replace(/&lt;/g, "<")
            .replace(/&amp;/g, "&");
    }

    /**
     * Reads a header from either an object exposing `get()` or a plain record,
     * trying the given name first and its lower-cased form second.
     * @returns {string|null|undefined} Falsy when the header is not present.
     */
    getHeader(headers, name) {
        if (!headers) {
            return null;
        }
        if (typeof headers.get === "function") {
            return headers.get(name) || headers.get(name.toLowerCase());
        }
        const record = headers;
        return record[name] || record[name.toLowerCase()] || null;
    }

    /**
     * Base64-encodes the UTF-8 bytes of `value`, using Buffer when available
     * and TextEncoder + btoa otherwise.
     */
    encodeBasicCredentials(value) {
        if (typeof Buffer !== "undefined") {
            return Buffer.from(value, "utf8").toString("base64");
        }
        const bytes = new TextEncoder().encode(value);
        let binary = "";
        for (const byte of bytes) {
            binary += String.fromCharCode(byte);
        }
        return btoa(binary);
    }

    /** Reads a response body as text for error messages; returns "" if that fails. */
    async safeReadErrorText(response) {
        try {
            return await response.text();
        }
        catch {
            return "";
        }
    }
}
exports.CereVoiceTTSClient = CereVoiceTTSClient;