js-tts-wrapper
A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.WitAITTSClient = void 0;
const abstract_tts_1 = require("../core/abstract-tts");
const SSMLUtils = __importStar(require("../core/ssml-utils"));
const SpeechMarkdown = __importStar(require("../markdown/converter"));
/**
* WitAI TTS Client
*/
class WitAITTSClient extends abstract_tts_1.AbstractTTSClient {
/**
* Create a new WitAI TTS client
* @param credentials WitAI credentials object with token
*/
constructor(credentials) {
super(credentials);
Object.defineProperty(this, "token", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "baseUrl", {
enumerable: true,
configurable: true,
writable: true,
value: "https://api.wit.ai"
});
Object.defineProperty(this, "apiVersion", {
enumerable: true,
configurable: true,
writable: true,
value: "20240601"
});
Object.defineProperty(this, "headers", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "sampleRate", {
enumerable: true,
configurable: true,
writable: true,
value: 24000
}); // Default sample rate for WitAI
if (!credentials.token) {
throw new Error("An API token for Wit.ai must be provided");
}
this.token = credentials.token;
this.headers = {
Authorization: `Bearer ${this.token}`,
"Content-Type": "application/json",
};
}
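    // Illustrative usage (the token value is a placeholder, not a real credential):
    //   const client = new WitAITTSClient({ token: "YOUR_WIT_AI_TOKEN" });
    //   // The constructor throws if `token` is missing.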
/**
* Get raw voices from WitAI
* @returns Promise resolving to an array of unified voice objects
*/
async _getVoices() {
try {
const response = await fetch(`${this.baseUrl}/voices?v=${this.apiVersion}`, {
method: "GET",
headers: this.headers,
});
if (!response.ok) {
throw new Error(`Failed to fetch voices: ${response.statusText}`);
}
const voices = await response.json();
console.log("WitAI Raw Voices Response:", JSON.stringify(voices, null, 2));
const standardizedVoices = [];
for (const localeKey in voices) {
                // Convert Wit.ai locale keys (e.g., "en_US") to BCP-47 form (e.g., "en-US")
const locale = localeKey.replace("_", "-");
for (const voice of voices[localeKey]) {
const standardizedVoice = {
id: voice.name,
languageCodes: [locale],
                        name: voice.name.split("$")[1] || voice.name, // Names appear prefixed up to "$"; keep the readable part
gender: voice.gender,
styles: voice.styles || [],
};
standardizedVoices.push(standardizedVoice);
console.log("WitAI Standardized Voice:", standardizedVoice);
}
}
return standardizedVoices;
}
catch (error) {
console.error("Error fetching WitAI voices:", error);
return [];
}
}
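    // The loop above assumes the /voices endpoint returns a map keyed by locale,
    // roughly (field values illustrative, not from a live response):
    //   { "en_US": [{ name: "wit$Rebecca", gender: "female", styles: ["default"] }, ...] }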
/**
* Map WitAI voice objects to unified format
* @param rawVoices Array of WitAI voice objects
* @returns Promise resolving to an array of unified voice objects
*/
async _mapVoicesToUnified(rawVoices) {
// Transform WitAI voices to unified format
return rawVoices.map((voice) => ({
id: voice.id,
name: voice.name,
gender: voice.gender === "female" ? "Female" : voice.gender === "male" ? "Male" : "Unknown",
provider: "witai",
languageCodes: voice.languageCodes.map((locale) => {
const [language, region] = locale.split("-");
return {
bcp47: locale,
                    iso639_3: language, // Approximation: the primary subtag is typically ISO 639-1 (two letters), not ISO 639-3
display: `${language.toUpperCase()} (${region || language})`,
};
}),
}));
}
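    // Illustrative mapping (following the transformation above, not live data):
    //   { id: "wit$Rebecca", languageCodes: ["en-US"], name: "Rebecca", gender: "female" }
    // becomes:
    //   { id: "wit$Rebecca", name: "Rebecca", gender: "Female", provider: "witai",
    //     languageCodes: [{ bcp47: "en-US", iso639_3: "en", display: "EN (US)" }] }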
/**
* Prepare text for synthesis
* @param text Text to prepare
* @param options Synthesis options
* @returns Prepared text
*/
async prepareText(text, options) {
let processedText = text;
        // If Speech Markdown support is enabled and the input is Speech Markdown, convert it to plain text
if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) {
// Convert SpeechMarkdown to SSML first, then strip SSML tags
const ssml = await SpeechMarkdown.toSSML(processedText);
processedText = SSMLUtils.stripSSML(ssml);
}
// If the input is SSML, convert it to plain text
if (SSMLUtils.isSSML(processedText)) {
processedText = SSMLUtils.stripSSML(processedText);
}
return processedText;
}
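    // For example (illustrative), prepareText('<speak>Hello <break time="1s"/> world</speak>', {})
    // resolves to plain text such as "Hello world", since this client strips SSML before sending.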
/**
* Get the appropriate Accept header based on the format option
* @param format Format option from WitAITTSOptions
* @returns MIME type string
*/
getAcceptHeader(format) {
const formats = {
pcm: "audio/raw",
mp3: "audio/mpeg",
wav: "audio/wav",
};
return formats[format || ""] || "audio/raw"; // Default to PCM if unspecified
}
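    // e.g., getAcceptHeader("mp3") === "audio/mpeg"; unknown or missing formats
    // fall back to "audio/raw" (PCM).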
/**
* Synthesize text to audio bytes
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to audio bytes
*/
async synthToBytes(text, options) {
try {
// Prepare text for synthesis (strip SSML/Markdown if present)
const preparedText = await this.prepareText(text, options);
// Use provided voice or the one set with setVoice
let voice = options?.voice || this.voiceId;
if (!voice) {
// Use a default voice if none is set
const voices = await this._getVoices();
if (voices.length === 0) {
throw new Error("No voice ID provided and no default voice available");
}
voice = voices[0].id;
this.voiceId = voice;
console.log(`Using default voice: ${voice}`);
}
// Get format from options if available
const format = options?.format;
// Set headers for audio format
const headers = {
...this.headers,
Accept: this.getAcceptHeader(format),
};
const data = {
q: preparedText,
voice: voice,
                style: "default", // Request the default speaking style
};
console.log("WitAI TTS Request:", {
url: `${this.baseUrl}/synthesize?v=${this.apiVersion}`,
headers: headers,
data: data,
});
const response = await fetch(`${this.baseUrl}/synthesize?v=${this.apiVersion}`, {
method: "POST",
headers,
body: JSON.stringify(data),
});
if (!response.ok) {
// Try to get more detailed error information
let errorMessage = `Failed to synthesize speech: ${response.statusText}`;
try {
const errorData = await response.text();
console.error("WitAI TTS Error Response:", errorData);
errorMessage += ` - ${errorData}`;
}
catch (_e) {
                    // Ignore failures while reading the error body
}
throw new Error(errorMessage);
}
const arrayBuffer = await response.arrayBuffer();
return new Uint8Array(arrayBuffer);
}
catch (error) {
console.error("Error synthesizing speech with WitAI:", error);
throw error;
}
}
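    // Usage sketch (Node-only; the voice ID and file name are illustrative):
    //   const audio = await client.synthToBytes("Hello world", { voice: "wit$Rebecca", format: "mp3" });
    //   require("node:fs").writeFileSync("hello.mp3", Buffer.from(audio));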
/**
* Synthesize text to a byte stream with word boundary information
* @param text Text to synthesize
* @param options Synthesis options
* @returns Promise resolving to an object containing the audio stream and word boundary information
*/
async synthToBytestream(text, options) {
try {
// Prepare text for synthesis
const preparedText = await this.prepareText(text, options);
// Use provided voice or the one set with setVoice
let voice = options?.voice || this.voiceId;
if (!voice) {
// Use a default voice if none is set
const voices = await this._getVoices();
if (voices.length === 0) {
throw new Error("No voice ID provided and no default voice available");
}
voice = voices[0].id;
this.voiceId = voice;
console.log(`Using default voice for bytestream: ${voice}`);
}
// Get format from options if available
const format = options?.format;
// Set headers for audio format
const headers = {
...this.headers,
Accept: this.getAcceptHeader(format),
};
const data = {
q: preparedText,
voice: voice,
                style: "default", // Request the default speaking style
};
console.log("WitAI TTS Bytestream Request:", {
url: `${this.baseUrl}/synthesize?v=${this.apiVersion}`,
headers: headers,
data: data,
});
const response = await fetch(`${this.baseUrl}/synthesize?v=${this.apiVersion}`, {
method: "POST",
headers,
body: JSON.stringify(data),
});
if (!response.ok) {
// Try to get more detailed error information
let errorMessage = `Failed to synthesize speech: ${response.statusText}`;
try {
const errorData = await response.text();
console.error("WitAI TTS Bytestream Error Response:", errorData);
errorMessage += ` - ${errorData}`;
}
catch (_e) {
                    // Ignore failures while reading the error body
}
throw new Error(errorMessage);
}
            // Wit.ai does not provide word timings here, so estimate boundaries from the word count
const words = preparedText.split(/\s+/);
const estimatedDuration = 0.3; // Estimated duration per word in seconds
const wordBoundaries = [];
let currentTime = 0;
for (const word of words) {
if (word.trim()) {
wordBoundaries.push({
text: word,
offset: currentTime * 1000, // Convert to milliseconds
duration: estimatedDuration * 1000, // Convert to milliseconds
});
currentTime += estimatedDuration;
}
}
return {
audioStream: response.body,
wordBoundaries,
};
}
catch (error) {
console.error("Error synthesizing speech with WitAI:", error);
throw error;
}
}
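    // Consumption sketch (illustrative; fetch in Node 18+ and browsers returns a WHATWG ReadableStream):
    //   const { audioStream, wordBoundaries } = await client.synthToBytestream("Hello world");
    //   const reader = audioStream.getReader();
    //   for (let chunk = await reader.read(); !chunk.done; chunk = await reader.read()) {
    //     // chunk.value is a Uint8Array of audio bytes
    //   }
    //   // wordBoundaries holds estimated { text, offset, duration } entries in milliseconds.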
/**
* Set the voice to use for synthesis
* @param voiceId Voice ID to use
     * @param lang Optional language code (stored locally; not sent to Wit.ai)
*/
setVoice(voiceId, lang) {
console.log(`Setting WitAI voice to: ${voiceId}`);
this.voiceId = voiceId;
if (lang) {
this.lang = lang;
}
}
}
exports.WitAITTSClient = WitAITTSClient;
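// End-to-end sketch (hypothetical consumer code; assumes the package re-exports this client
// and that this runs inside an async function):
//   const { WitAITTSClient } = require("js-tts-wrapper");
//   const client = new WitAITTSClient({ token: process.env.WIT_AI_TOKEN });
//   client.setVoice("wit$Rebecca");
//   const bytes = await client.synthToBytes("Hello from Wit.ai");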