js-tts-wrapper
Version:
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
267 lines (266 loc) • 9.24 kB
JavaScript
import { AbstractTTSClient } from "../core/abstract-tts.js";
import * as SSMLUtils from "../core/ssml-utils.js";
import * as SpeechMarkdown from "../markdown/converter.js";
import { getFetch } from "../utils/fetch-utils.js";
/**
 * Built-in ModelsLab voice catalogue.
 *
 * The catalogue is written as compact `[id, name, gender]` rows and expanded
 * into full voice records below. Every record shares the same provider and a
 * single US-English language entry; each record gets its own `languageCodes`
 * array so records never share nested objects.
 */
const MODELSLAB_VOICES = [
    // Emotion-capable female voices
    ["madison", "Madison", "Female"],
    ["tara", "Tara", "Female"],
    ["leah", "Leah", "Female"],
    ["jess", "Jess", "Female"],
    ["mia", "Mia", "Female"],
    ["zoe", "Zoe", "Female"],
    // Emotion-capable male voices
    ["leo", "Leo", "Male"],
    ["dan", "Dan", "Male"],
    ["zac", "Zac", "Male"],
].map(([id, name, gender]) => ({
    id,
    name,
    gender,
    provider: "modelslab",
    languageCodes: [{ bcp47: "en-US", iso639_3: "eng", display: "English (US)" }],
}));

/** Text-to-speech endpoint that synthesis requests are POSTed to. */
const API_URL = "https://modelslab.com/api/v6/voice/text_to_speech";

/** Voice id used when neither the call options nor the client specify one. */
const DEFAULT_VOICE = "madison";

/** Language value sent with a request when the caller does not pick one. */
const DEFAULT_LANGUAGE = "american english";

/** Delay, in milliseconds, before each poll of a pending generation. */
const POLL_INTERVAL_MS = 2000;

/** Maximum number of polls before a pending generation is abandoned. */
const MAX_POLL_ATTEMPTS = 20;
/**
 * ModelsLab TTS Client
 *
 * Provides text-to-speech via the ModelsLab Voice API.
 * API docs: https://docs.modelslab.com/voice-cloning/text-to-speech
 *
 * A synthesis request either returns an audio URL straight away or reports a
 * "processing" status together with a URL to poll; in both cases the audio is
 * then downloaded and handed back as bytes.
 *
 * @example
 * ```ts
 * const client = new ModelsLabTTSClient({ apiKey: "your-api-key" });
 * await client.synthToFile("Hello world!", "output.mp3");
 * ```
 */
export class ModelsLabTTSClient extends AbstractTTSClient {
    /**
     * @param {object} [credentials] - Credentials forwarded to the base client.
     * @param {string} [credentials.apiKey] - ModelsLab API key. When absent or
     *   empty, `process.env.MODELSLAB_API_KEY` is used if it is available;
     *   otherwise the key is the empty string.
     */
    constructor(credentials = {}) {
        super(credentials);
        // Own, writable properties are declared explicitly before being assigned.
        Object.defineProperty(this, "apiKey", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "defaultLanguage", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "defaultSpeed", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "sampleRate", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 24000
        });
        this._models = [{ id: "modelslab", features: [] }];
        // `process` may be missing (browsers) or shimmed without `env`; neither
        // case should make construction throw.
        const envKey = typeof process !== "undefined" ? process.env?.MODELSLAB_API_KEY : undefined;
        this.apiKey = credentials.apiKey || envKey || "";
        this.defaultLanguage = DEFAULT_LANGUAGE;
        this.defaultSpeed = 1.0;
        // Only fall back to the default voice when none is set at this point.
        if (!this.voiceId) {
            this.voiceId = DEFAULT_VOICE;
        }
    }

    /**
     * Check if credentials are present.
     *
     * Only verifies that an API key is set; no request is made.
     *
     * @returns {Promise<boolean>} `true` when an API key is set, otherwise
     *   `false` (an explanation is written to `console.error`).
     */
    async checkCredentials() {
        if (!this.apiKey) {
            console.error("ModelsLab API key is required. Set MODELSLAB_API_KEY or pass apiKey.");
            return false;
        }
        return true;
    }

    /**
     * @returns {string[]} Names of the credential fields this client needs.
     */
    getRequiredCredentials() {
        return ["apiKey"];
    }

    /**
     * Return the static voice catalogue.
     *
     * Copies are returned so that callers mutating the result cannot alter the
     * module-level list shared by every client instance.
     *
     * @returns {Promise<object[]>} One record per built-in voice.
     */
    async _getVoices() {
        return MODELSLAB_VOICES.map((voice) => ({
            ...voice,
            languageCodes: voice.languageCodes.map((code) => ({ ...code })),
        }));
    }

    /**
     * Synthesize text to audio bytes (Uint8Array).
     * Handles async generation — polls until audio is ready.
     *
     * Implemented on top of {@link synthToBytestream}: the stream is drained
     * and its chunks are concatenated into one buffer.
     *
     * @param {string} text - Text to speak.
     * @param {object} [options] - Passed through to `synthToBytestream`.
     * @returns {Promise<Uint8Array>} The complete audio payload.
     */
    async synthToBytes(text, options = {}) {
        const { audioStream } = await this.synthToBytestream(text, options);
        const reader = audioStream.getReader();
        const chunks = [];
        while (true) {
            const { done, value } = await reader.read();
            if (done) {
                break;
            }
            chunks.push(value);
        }
        const totalLen = chunks.reduce((n, c) => n + c.length, 0);
        const out = new Uint8Array(totalLen);
        let offset = 0;
        for (const chunk of chunks) {
            out.set(chunk, offset);
            offset += chunk.length;
        }
        return out;
    }

    /**
     * Synthesize text to a ReadableStream of audio chunks.
     *
     * The audio is fully generated and downloaded first, then emitted as a
     * single chunk. No word timing is produced, so `wordBoundaries` is always
     * an empty array.
     *
     * @param {string} text - Plain text, SSML, or (with
     *   `options.useSpeechMarkdown`) Speech Markdown.
     * @param {object} [options]
     * @param {boolean} [options.useSpeechMarkdown] - Convert Speech Markdown
     *   input before synthesis.
     * @param {string} [options.voice] - Voice id for this call. Note that the
     *   resolved voice is also stored on the client as `voiceId`.
     * @param {number} [options.speed] - Overrides the client's default speed.
     * @param {string} [options.language] - Overrides the client's default language.
     * @param {boolean} [options.emotion] - Forwarded to the API; defaults to `false`.
     * @returns {Promise<{audioStream: ReadableStream, wordBoundaries: Array}>}
     */
    async synthToBytestream(text, options = {}) {
        let processedText = text;
        // Convert SpeechMarkdown → SSML → plain text if needed
        if (options.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
            const ssml = await SpeechMarkdown.toSSML(processedText);
            processedText = SSMLUtils.stripSSML(ssml);
        }
        else if (SSMLUtils.isSSML(processedText)) {
            // ModelsLab doesn't support SSML — strip tags
            processedText = SSMLUtils.stripSSML(processedText);
        }
        const voiceId = options.voice || this.voiceId || DEFAULT_VOICE;
        // The voice used for this call becomes the client's current voice.
        this.voiceId = voiceId;
        const speed = options.speed ?? this.defaultSpeed;
        const language = options.language ?? this.defaultLanguage;
        const audioBytes = await this._synthesize(processedText, voiceId, language, speed, options.emotion ?? false);
        const audioStream = new ReadableStream({
            start(controller) {
                controller.enqueue(audioBytes);
                controller.close();
            },
        });
        return { audioStream, wordBoundaries: [] };
    }

    /**
     * Internal: call ModelsLab API and return audio bytes.
     *
     * The API key travels in the JSON body (`key`), not in a header.
     *
     * @param {string} text - Text sent as `prompt`.
     * @param {string} voiceId - Sent as `voice_id`.
     * @param {string} language - Sent as `language`.
     * @param {number} speed - Sent as `speed`.
     * @param {boolean} emotion - Sent as `emotion`.
     * @returns {Promise<Uint8Array>} Downloaded audio bytes.
     * @throws {Error} On a non-OK HTTP response, an `"error"` status, an
     *   unrecognised status, a `"processing"` status without a URL to poll,
     *   or a result that carries no audio URL.
     */
    async _synthesize(text, voiceId, language, speed, emotion) {
        const fetch = getFetch();
        const resp = await fetch(API_URL, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({
                key: this.apiKey,
                prompt: text,
                language,
                voice_id: voiceId,
                speed,
                emotion,
            }),
        });
        if (!resp.ok) {
            throw new Error(`ModelsLab API error: ${resp.status} ${resp.statusText}`);
        }
        const data = (await resp.json());
        if (data.status === "error") {
            throw new Error(`ModelsLab TTS error: ${data.message ?? JSON.stringify(data)}`);
        }
        let audioUrl;
        if (data.status === "success") {
            // An empty or missing `output` is reported below as "no audio URL"
            // rather than being mislabelled as an unexpected status.
            audioUrl = data.output?.[0];
        }
        else if (data.status === "processing") {
            const fetchUrl = data.fetch_result ?? data.link;
            if (!fetchUrl) {
                throw new Error("ModelsLab returned processing status with no fetch URL");
            }
            audioUrl = await this._poll(fetchUrl, fetch);
        }
        else {
            throw new Error(`Unexpected ModelsLab status: ${data.status}`);
        }
        if (!audioUrl) {
            throw new Error("ModelsLab returned no audio URL");
        }
        return this._downloadAudio(audioUrl, fetch);
    }

    /**
     * Poll the fetch_result URL until audio is ready.
     *
     * Waits `POLL_INTERVAL_MS` before every attempt and gives up after
     * `MAX_POLL_ATTEMPTS` attempts. Non-OK HTTP responses are retried.
     *
     * @param {string} fetchUrl - URL to POST the API key to.
     * @param {Function} fetch - Fetch implementation to use.
     * @returns {Promise<string>} URL of the generated audio.
     * @throws {Error} When the API reports an error or all attempts are used up.
     */
    async _poll(fetchUrl, fetch) {
        let lastHttpFailure = "";
        for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
            await this._sleep(POLL_INTERVAL_MS);
            const resp = await fetch(fetchUrl, {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ key: this.apiKey }),
            });
            if (!resp.ok) {
                // Keep retrying, but remember the failure so a timeout can
                // explain what actually went wrong.
                lastHttpFailure = `${resp.status} ${resp.statusText}`;
                continue;
            }
            const data = (await resp.json());
            if (data.status === "success" && data.output?.length) {
                return data.output[0];
            }
            if (data.status === "error") {
                throw new Error(`ModelsLab poll error: ${data.message ?? JSON.stringify(data)}`);
            }
        }
        const detail = lastHttpFailure ? ` (last HTTP error: ${lastHttpFailure})` : "";
        throw new Error(`ModelsLab audio generation timed out after ${MAX_POLL_ATTEMPTS} attempts${detail}`);
    }

    /**
     * Download audio from URL and return as Uint8Array.
     *
     * @param {string} url - Location of the generated audio.
     * @param {Function} fetch - Fetch implementation to use.
     * @returns {Promise<Uint8Array>} The response body as bytes.
     * @throws {Error} When the response is not OK.
     */
    async _downloadAudio(url, fetch) {
        const resp = await fetch(url);
        if (!resp.ok) {
            throw new Error(`Failed to download audio: ${resp.status} ${resp.statusText}`);
        }
        const buf = await resp.arrayBuffer();
        return new Uint8Array(buf);
    }

    /**
     * Resolve after the given delay.
     *
     * @param {number} ms - Delay in milliseconds.
     * @returns {Promise<void>}
     */
    _sleep(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }
}
export default ModelsLabTTSClient;