js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services.
325 lines (324 loc) • 10.4 kB
JavaScript
import { AbstractTTSClient } from "../core/abstract-tts.js";
import * as SSMLUtils from "../core/ssml-utils.js";
import * as SpeechMarkdown from "../markdown/converter.js";
import { getFetch } from "../utils/fetch-utils.js";
import { toIso639_3, toLanguageDisplay } from "../utils/language-utils.js";
/**
 * Module-local `fetch` binding obtained from `getFetch()`. Inside this module
 * it shadows any global `fetch`.
 * NOTE(review): presumably `getFetch()` picks a platform-appropriate
 * implementation — confirm in ../utils/fetch-utils.js.
 */
const fetch = getFetch();
/**
 * Matches every square-bracketed span such as `[laughter]`: a `[`, one or more
 * characters that are not `]`, then a `]`.
 * The `g` flag makes `match`/`replace` see all occurrences; it also makes the
 * regex stateful for `test`/`exec`, so avoid those with this shared instance.
 */
const AUDIO_TAG_REGEX = /\[[^\]]+\]/g;
/**
 * Tag names (compared lowercased, without brackets) that `processAudioTags`
 * leaves in the text exactly as written.
 */
const CARTESIA_PASSTHROUGH_TAGS = ["laughter"];
/**
 * Tag names (compared lowercased, without brackets) that `processAudioTags`
 * rewrites to `<emotion value="NAME"/>`. Bracketed tags that appear in neither
 * this list nor the passthrough list are removed from the text.
 */
const CARTESIA_EMOTIONS = [
"neutral",
"angry",
"excited",
"content",
"sad",
"scared",
"happy",
"euphoric",
"anxious",
"panicked",
"calm",
"confident",
"curious",
"frustrated",
"sarcastic",
"melancholic",
"surprised",
"disgusted",
"contemplative",
"determined",
"proud",
"distant",
"skeptical",
"mysterious",
"anticipation",
"grateful",
"affectionate",
"sympathetic",
"nostalgic",
"wistful",
"apologetic",
"hesitant",
"insecure",
"confused",
"resigned",
"alarmed",
"bored",
"tired",
"rejected",
"hurt",
"disappointed",
"dejected",
"guilty",
"envious",
"contempt",
"threatened",
"agitated",
"outraged",
"mad",
"triumphant",
"amazed",
"flirtatious",
"joking/comedic",
"serene",
"peaceful",
"enthusiastic",
"elated",
"trust",
];
/** Value sent in the `Cartesia-Version` header of every request made by the client. */
const CARTESIA_API_VERSION = "2025-04-16";
/** Voice id used when neither the call options nor the client specify one. */
const CARTESIA_DEFAULT_VOICE_ID = "694f938dd2a74762ba554ff8e2a9d786";
/** The only model id for which bracketed audio tags are translated instead of stripped. */
const CARTESIA_AUDIO_TAG_MODEL = "sonic-3";

/**
 * Text-to-speech client that talks to the Cartesia HTTP API
 * (`GET /voices`, `POST /tts/bytes`) through the module-level `fetch`.
 *
 * Input text is reduced to plain text (Speech Markdown and SSML are stripped)
 * and bracketed audio tags such as `[happy]` are translated or removed before
 * the transcript is sent.
 */
export class CartesiaTTSClient extends AbstractTTSClient {
  /**
   * @param {object} [credentials] - Client configuration.
   * @param {string} [credentials.apiKey] - API key; falls back to the
   *   `CARTESIA_API_KEY` environment variable when `process` exists, else `""`.
   * @param {string} [credentials.baseURL] - API root (the `baseUrl` spelling is
   *   accepted too). Defaults to `https://api.cartesia.ai`.
   * @param {string} [credentials.model] - Model id; defaults to `sonic-3`.
   * @param {object|string} [credentials.properties] - Extra properties applied
   *   through `setProperty` (also read from `propertiesJson` / `propertiesJSON`).
   */
  constructor(credentials = {}) {
    super(credentials);
    // Create the client's own fields as plain writable data properties before
    // assigning, so an accessor of the same name further up the prototype
    // chain can never intercept the assignments below.
    for (const field of ["apiKey", "baseUrl", "model", "outputFormat"]) {
      Object.defineProperty(this, field, {
        enumerable: true,
        configurable: true,
        writable: true,
        value: undefined,
      });
    }
    // `process` does not exist in browsers; touching it unguarded would throw
    // a ReferenceError whenever no explicit apiKey is supplied.
    const envApiKey =
      typeof process !== "undefined" ? process.env?.CARTESIA_API_KEY : undefined;
    this.apiKey = credentials.apiKey || envApiKey || "";
    // Trailing slashes are dropped so `${baseUrl}/voices` never contains `//`.
    this.baseUrl = String(
      credentials.baseURL || credentials.baseUrl || "https://api.cartesia.ai",
    ).replace(/\/+$/, "");
    this.model = credentials.model || CARTESIA_AUDIO_TAG_MODEL;
    this.voiceId = CARTESIA_DEFAULT_VOICE_ID;
    this.outputFormat = {
      container: "wav",
      encoding: "pcm_f32le",
      sample_rate: 44100,
    };
    this._models = [
      { id: "sonic-3", features: ["streaming", "audio-tags", "inline-voice-cloning"] },
      { id: "sonic-2", features: ["streaming"] },
    ];
    // NOTE(review): sampleRate mirrors outputFormat.sample_rate here but is not
    // updated when outputFormat is replaced later — confirm whether the base
    // class expects the two to stay in sync.
    this.sampleRate = 44100;
    this.applyCredentialProperties(credentials);
  }

  /**
   * Applies `credentials.properties` (or `propertiesJson` / `propertiesJSON`)
   * through `setProperty`, one key at a time.
   *
   * The value may be an object or a JSON string. Unparseable JSON and values
   * that do not yield an object are ignored on purpose: these properties are
   * optional and must not prevent the client from being constructed.
   *
   * @param {object} credentials - The credentials passed to the constructor.
   */
  applyCredentialProperties(credentials) {
    const rawProps =
      credentials.properties ?? credentials.propertiesJson ?? credentials.propertiesJSON;
    if (!rawProps) {
      return;
    }
    let parsed = null;
    if (typeof rawProps === "string") {
      try {
        parsed = JSON.parse(rawProps);
      } catch {
        // Best effort: malformed JSON leaves the defaults untouched.
        return;
      }
    } else if (typeof rawProps === "object") {
      parsed = rawProps;
    }
    // JSON such as `"abc"` or `5` parses fine but is not a property bag.
    if (parsed === null || typeof parsed !== "object") {
      return;
    }
    for (const [key, value] of Object.entries(parsed)) {
      this.setProperty(key, value);
    }
  }

  /**
   * Translates or removes bracketed audio tags (`[name]`).
   *
   * For any model other than `sonic-3` every tag is removed. For `sonic-3`:
   * passthrough tags are kept verbatim, known emotion names become
   * `<emotion value="name"/>`, and all other tags are removed. Whitespace is
   * collapsed and trimmed whenever at least one tag was present; text without
   * tags is returned unchanged for `sonic-3`.
   *
   * @param {string} text - Plain text that may contain bracketed tags.
   * @param {string} [model] - Model the text will be sent to; defaults to the
   *   client's model.
   * @returns {string} The processed text.
   */
  processAudioTags(text, model = this.model) {
    if (model !== CARTESIA_AUDIO_TAG_MODEL) {
      return text.replace(AUDIO_TAG_REGEX, "").replace(/\s+/g, " ").trim();
    }
    let sawTag = false;
    // One pass with a replacer callback instead of re-scanning per tag.
    const translated = text.replace(AUDIO_TAG_REGEX, (tag) => {
      sawTag = true;
      const inner = tag.slice(1, -1).toLowerCase();
      if (CARTESIA_PASSTHROUGH_TAGS.includes(inner)) {
        return tag;
      }
      return CARTESIA_EMOTIONS.includes(inner) ? `<emotion value="${inner}"/>` : "";
    });
    return sawTag ? translated.replace(/\s+/g, " ").trim() : text;
  }

  /**
   * Produces the transcript sent to the API: converts Speech Markdown (when
   * requested and detected) and SSML to plain text, then processes audio tags
   * for the model that will actually receive the request.
   *
   * @param {string} text - Raw input text.
   * @param {object} [options] - Synthesis options (`useSpeechMarkdown`, `model`).
   * @returns {Promise<string>} The prepared transcript.
   */
  async prepareText(text, options) {
    let processedText = text;
    if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
      const ssml = await SpeechMarkdown.toSSML(processedText, "w3c");
      processedText = SSMLUtils.stripSSML(ssml);
    }
    if (SSMLUtils.isSSML(processedText)) {
      processedText = SSMLUtils.stripSSML(processedText);
    }
    // Use the same model resolution as the request body so a per-call model
    // override gets the matching tag handling.
    return this.processAudioTags(processedText, options?.model || this.model);
  }

  /**
   * @param {string} model - Model id used for subsequent requests.
   */
  setModel(model) {
    this.model = model;
  }

  /**
   * @param {string} voiceId - Voice id used for subsequent requests.
   */
  setVoice(voiceId) {
    this.voiceId = voiceId;
  }

  /**
   * Reads `model`, `voice` or `outputFormat` from this client; any other
   * property name is delegated to the base class.
   *
   * @param {string} property - Property name.
   * @returns {*} The current value.
   */
  getProperty(property) {
    switch (property) {
      case "model":
        return this.model;
      case "voice":
        return this.voiceId;
      case "outputFormat":
        return this.outputFormat;
      default:
        return super.getProperty(property);
    }
  }

  /**
   * Sets `model`, `voice` or `outputFormat` on this client; any other property
   * name is delegated to the base class.
   *
   * @param {string} property - Property name.
   * @param {*} value - New value. `outputFormat` is only accepted when it is a
   *   non-null object; anything else is ignored.
   */
  setProperty(property, value) {
    switch (property) {
      case "model":
        this.setModel(value);
        break;
      case "voice":
        this.setVoice(value);
        break;
      case "outputFormat":
        // `typeof null === "object"`, so null must be excluded explicitly or
        // it would end up as `output_format: null` in the request body.
        if (value !== null && typeof value === "object") {
          this.outputFormat = value;
        }
        break;
      default:
        super.setProperty(property, value);
        break;
    }
  }

  /**
   * Checks the API key by requesting the voice list.
   *
   * @returns {Promise<boolean>} `true` when the request succeeds with an OK
   *   status; `false` when no key is set, the status is not OK, or the request
   *   throws.
   */
  async checkCredentials() {
    if (!this.apiKey) {
      return false;
    }
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: this._cartesiaHeaders(),
      });
      return response.ok;
    } catch {
      return false;
    }
  }

  /**
   * @returns {string[]} Names of the credential fields this client requires.
   */
  getRequiredCredentials() {
    return ["apiKey"];
  }

  /**
   * Fetches the raw voice list.
   *
   * Accepts both a bare JSON array and an object whose `data` field holds the
   * array, so the result is always an array that can be mapped.
   * NOTE(review): only the first response is read; if the endpoint paginates,
   * later pages are not fetched — confirm against the Cartesia API reference.
   *
   * @returns {Promise<object[]>} Raw voice objects, or `[]` on any failure
   *   (deliberate best effort: listing voices never throws).
   */
  async _getVoices() {
    try {
      const response = await fetch(`${this.baseUrl}/voices`, {
        method: "GET",
        headers: this._cartesiaHeaders(),
      });
      if (!response.ok) {
        return [];
      }
      const payload = await response.json();
      if (Array.isArray(payload)) {
        return payload;
      }
      return Array.isArray(payload?.data) ? payload.data : [];
    } catch {
      return [];
    }
  }

  /**
   * Converts raw voice objects into the unified voice shape
   * (`id`, `name`, `gender`, `languageCodes`, `provider`).
   *
   * Gender is guessed from the voice description; voices without a language
   * fall back to `en-US`.
   *
   * @param {object[]} rawVoices - Voices as returned by `_getVoices`.
   * @returns {Promise<object[]>} Unified voices; `[]` when the input is not an array.
   */
  async _mapVoicesToUnified(rawVoices) {
    if (!Array.isArray(rawVoices)) {
      return [];
    }
    return rawVoices.map((voice) => {
      const description = voice.description?.toLowerCase();
      let gender = "Unknown";
      // "female" contains "male", so it has to be tested first.
      if (description?.includes("female")) {
        gender = "Female";
      } else if (description?.includes("male")) {
        gender = "Male";
      }
      const languageCodes = voice.language
        ? [
            {
              bcp47: voice.language,
              iso639_3: toIso639_3(voice.language),
              display: toLanguageDisplay(voice.language),
            },
          ]
        : [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }];
      return {
        id: voice.id,
        name: voice.name,
        gender,
        languageCodes,
        provider: "cartesia",
      };
    });
  }

  /**
   * Synthesizes `text` and returns the complete audio payload.
   *
   * @param {string} text - Text to speak.
   * @param {object} [options] - `voice`, `model`, `useSpeechMarkdown`, `providerOptions`.
   * @returns {Promise<Uint8Array>} The response body bytes.
   * @throws {Error} When the API answers with a non-OK status.
   */
  async synthToBytes(text, options = {}) {
    const { response, preparedText } = await this._cartesiaSynthesize(text, options);
    const audioBytes = new Uint8Array(await response.arrayBuffer());
    // NOTE(review): the transcript passed here may still contain `<emotion/>`
    // and passthrough tags — confirm how the base-class estimator treats them.
    this._createEstimatedWordTimings(preparedText);
    return audioBytes;
  }

  /**
   * Synthesizes `text` and returns the audio as a stream.
   *
   * @param {string} text - Text to speak.
   * @param {object} [options] - `voice`, `model`, `useSpeechMarkdown`, `providerOptions`.
   * @returns {Promise<{audioStream: ReadableStream, wordBoundaries: Array}>}
   *   The response body stream (or a single-chunk stream built from the buffered
   *   body when the response exposes no stream) and an empty boundary list.
   * @throws {Error} When the API answers with a non-OK status.
   */
  async synthToBytestream(text, options = {}) {
    const { response } = await this._cartesiaSynthesize(text, options);
    if (response.body) {
      return { audioStream: response.body, wordBoundaries: [] };
    }
    // Some fetch implementations expose no body stream; wrap the buffered
    // bytes in a one-chunk stream so callers always receive a ReadableStream.
    const audioData = new Uint8Array(await response.arrayBuffer());
    const audioStream = new ReadableStream({
      start(controller) {
        controller.enqueue(audioData);
        controller.close();
      },
    });
    return { audioStream, wordBoundaries: [] };
  }

  /**
   * Builds the headers shared by every request.
   *
   * @param {boolean} [withJsonBody=false] - Add the JSON `Content-Type` header.
   * @returns {object} Header map.
   */
  _cartesiaHeaders(withJsonBody = false) {
    const headers = {
      "X-API-Key": this.apiKey,
      "Cartesia-Version": CARTESIA_API_VERSION,
    };
    return withJsonBody ? { "Content-Type": "application/json", ...headers } : headers;
  }

  /**
   * Prepares the transcript and posts it to `/tts/bytes`; shared by
   * `synthToBytes` and `synthToBytestream`.
   *
   * @param {string} text - Raw input text.
   * @param {object} options - Synthesis options.
   * @returns {Promise<{response: Response, preparedText: string}>} The OK
   *   response and the transcript that was sent.
   * @throws {Error} When the API answers with a non-OK status.
   */
  async _cartesiaSynthesize(text, options) {
    const preparedText = await this.prepareText(text, options);
    const voiceId = options.voice || this.voiceId || CARTESIA_DEFAULT_VOICE_ID;
    const body = {
      output_format: this.outputFormat,
      // providerOptions may add fields or override output_format, but the keys
      // listed after the spread always win.
      ...options.providerOptions,
      model_id: options.model || this.model,
      transcript: preparedText,
      voice: { mode: "id", id: voiceId },
    };
    const response = await fetch(`${this.baseUrl}/tts/bytes`, {
      method: "POST",
      headers: this._cartesiaHeaders(true),
      body: JSON.stringify(body),
    });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Cartesia API error: ${response.status} ${response.statusText} - ${errorText}`);
    }
    return { response, preparedText };
  }
}