edge-tts-client
Version:
Client-side (web browser) implementation of the Edge TTS package — calls the Microsoft Edge Read Aloud API to generate free text-to-speech
217 lines (216 loc) • 9.41 kB
JavaScript
"use strict";
// Helper that runs a generator function as if it were an async function.
// (It has the shape of the helper TypeScript emits when down-levelling
// async/await — presumably generated rather than hand-written.)
//
// Reuses an existing `this.__awaiter` when one is present. Otherwise it
// returns a promise (constructed with P, defaulting to the global Promise)
// that:
//   - starts the generator with `thisArg` and `_arguments`,
//   - feeds every yielded value back in once it settles (`fulfilled` resumes
//     the generator with the value, `rejected` throws the reason into it),
//   - resolves with the generator's return value, and
//   - rejects if the generator throws.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap plain values in a P so that `.then` can always be called on them.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done; otherwise wait for the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.EdgeTTSClient = exports.ProsodyOptions = void 0;
const buffer_1 = require("buffer");
const constants_1 = require("./constants");
// Polyfill: when the runtime does not already define a global Buffer (e.g. a
// web browser), expose the one from the imported `buffer` module as
// globalThis.Buffer.
if (typeof globalThis.Buffer === 'undefined') {
globalThis.Buffer = buffer_1.Buffer;
}
// Generates a random hex string of the specified length
function generateRandomHex(length) {
const randomValues = new Uint8Array(length);
window.crypto.getRandomValues(randomValues);
return Array.from(randomValues, (byte) => `0${byte.toString(16)}`.slice(-2)).join("");
}
/**
 * Minimal event emitter used to deliver per-request events to callers.
 *
 * The "data", "close" and "end" events are pre-registered. Listeners for any
 * other event name are accepted too, and emitting an event that has no
 * listeners is a no-op instead of a TypeError.
 */
class EventEmitter {
    constructor() {
        this.eventListeners = { data: [], close: [], end: [] };
    }
    /**
     * Registers a listener for an event.
     *
     * @param {string} event - Event name.
     * @param {Function} callback - Invoked with the emitted payload.
     */
    on(event, callback) {
        // Create the listener list on demand so unknown event names do not throw.
        if (!Object.prototype.hasOwnProperty.call(this.eventListeners, event)) {
            this.eventListeners[event] = [];
        }
        this.eventListeners[event].push(callback);
    }
    /**
     * Calls every listener registered for `event`, in registration order.
     *
     * @param {string} event - Event name.
     * @param {*} data - Payload passed to each listener.
     */
    emit(event, data) {
        if (!Object.prototype.hasOwnProperty.call(this.eventListeners, event)) {
            return;
        }
        this.eventListeners[event].forEach((callback) => callback(data));
    }
}
/**
 * Prosody settings interpolated into the <prosody> element of a request.
 * A fresh instance carries the defaults: pitch "+0Hz", rate 1.0, volume 100.0.
 */
class ProsodyOptions {
    constructor() {
        // Assigned in one step; key order matches pitch, rate, volume.
        Object.assign(this, { pitch: "+0Hz", rate: 1.0, volume: 100.0 });
    }
}
exports.ProsodyOptions = ProsodyOptions;
/**
 * Client for the Edge Read Aloud text-to-speech WebSocket endpoint.
 *
 * Usage: call `setMetadata(voice, outputFormat)` to pick the voice and open
 * the socket, then `toStream(text)` to obtain an emitter that fires:
 *   - "data"  with each audio chunk (a Buffer),
 *   - "end"   with the array of metadata entries collected for the turn,
 *   - "close" with `null` when the socket closes.
 */
class EdgeTTSClient {
    /**
     * @param {boolean} [enableLogging=false] - When true, diagnostics are
     *   written with console.log.
     */
    constructor(enableLogging = false) {
        this.ws = null;
        this.voice = null;
        this.voiceLocale = null;
        this.outputFormat = null;
        // Maps request id -> emitter. NOTE(review): entries are never removed,
        // so every past request also receives "close" when the socket closes.
        this.requestQueue = {};
        this.connectionStartTime = 0;
        this.enableLogging = enableLogging;
        this.isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
    }
    /** Forwards its arguments to console.log when logging is enabled. */
    log(...args) {
        if (this.enableLogging) {
            console.log(...args);
        }
    }
    /**
     * Sends a message over the socket, (re)connecting first when the socket
     * is not open. Up to three connection attempts are made while the socket
     * stays non-open; a connection attempt that rejects is not retried and
     * rejects the returned promise.
     *
     * @param {string} message - Raw protocol message to send.
     * @returns {Promise<void>}
     */
    sendMessage(message) {
        return __awaiter(this, void 0, void 0, function* () {
            for (let attempt = 1; attempt <= 3 && (!this.ws || this.ws.readyState !== WebSocket.OPEN); attempt++) {
                if (attempt === 1) {
                    this.connectionStartTime = Date.now();
                }
                this.log(`Connecting... attempt ${attempt}`);
                yield this.initWebSocket();
            }
            if (this.ws) {
                this.ws.send(message);
            }
        });
    }
    /**
     * Opens a new WebSocket to SYNTH_URL and wires up its handlers.
     *
     * @returns {Promise<void>} Resolves once the socket is open and the
     *   speech.config message has been sent; rejects on a socket error or
     *   when sending the config message fails.
     */
    initWebSocket() {
        this.ws = new WebSocket(EdgeTTSClient.SYNTH_URL);
        this.ws.binaryType = "arraybuffer";
        // Metadata entries for the current turn; shared by all messages on this socket.
        const metadataBuffer = [];
        return new Promise((resolve, reject) => {
            this.ws.onopen = () => {
                this.log("Connected in", (Date.now() - this.connectionStartTime) / 1000, "seconds");
                // Propagate a failed config send instead of leaving the promise pending forever.
                this.sendMessage(this.getConfigMessage()).then(resolve, reject);
            };
            this.ws.onmessage = (event) => this.handleMessage(event, metadataBuffer);
            this.ws.onclose = () => this.handleClose();
            // Kept as a string rejection so existing callers see the same value.
            this.ws.onerror = (error) => reject(`Connection Error: ${error}`);
        });
    }
    /**
     * Routes one incoming socket message according to its `Path:` header.
     *
     * @param {{data: (string|ArrayBuffer)}} event - WebSocket message event.
     * @param {Array} metadataBuffer - Accumulator for the current turn's metadata.
     */
    handleMessage(event, metadataBuffer) {
        const buffer = buffer_1.Buffer.from(event.data);
        const message = buffer.toString();
        const requestIdMatch = /X-RequestId:(.*?)\r\n/.exec(message);
        const requestId = requestIdMatch ? requestIdMatch[1] : "";
        const emitter = this.requestQueue[requestId];
        if (message.includes("Path:turn.start")) {
            metadataBuffer.length = 0;
        }
        else if (message.includes("Path:turn.end")) {
            if (emitter) {
                emitter.emit("end", metadataBuffer);
            }
        }
        else if (message.includes("Path:audio.metadata")) {
            // Must be tested before "Path:audio": that string is a prefix of
            // "Path:audio.metadata", so the reverse order would route metadata
            // messages to the audio branch and never collect them.
            const startIndex = message.indexOf("{");
            metadataBuffer.push(JSON.parse(message.slice(startIndex)).Metadata[0]);
        }
        else if (message.includes("Path:audio")) {
            this.cacheAudioData(buffer, requestId);
        }
        else {
            this.log("Unknown Message", message);
        }
    }
    /** Emits "close" (with `null`) on every emitter in the request queue. */
    handleClose() {
        this.log("Disconnected after:", (Date.now() - this.connectionStartTime) / 1000, "seconds");
        for (const requestId in this.requestQueue) {
            this.requestQueue[requestId].emit("close", null);
        }
    }
    /**
     * Extracts the audio payload that follows BINARY_DELIM in a binary
     * message and emits it as a "data" event for the matching request.
     *
     * @param {Uint8Array} buffer - Full message bytes (headers + audio).
     * @param {string} requestId - Id parsed from the message headers.
     */
    cacheAudioData(buffer, requestId) {
        // Compare bytes, not characters: the payload after the headers is binary.
        const binaryDelimBytes = new TextEncoder().encode(EdgeTTSClient.BINARY_DELIM);
        const delimiterIndex = this.findDelimiterIndex(buffer, binaryDelimBytes);
        if (delimiterIndex === -1) {
            this.log('Delimiter not found in the buffer.');
            return;
        }
        const audioDataStart = delimiterIndex + binaryDelimBytes.length;
        const audioData = buffer.slice(audioDataStart);
        const emitter = this.requestQueue[requestId];
        if (emitter) {
            emitter.emit("data", audioData);
        }
        this.log("Received audio chunk of size:", audioData.length);
    }
    /**
     * Finds the first occurrence of a byte sequence inside another.
     *
     * @param {ArrayLike<number>} buffer - Bytes to search in.
     * @param {ArrayLike<number>} delimiter - Bytes to search for.
     * @returns {number} Start index of the first match, or -1 when absent.
     */
    findDelimiterIndex(buffer, delimiter) {
        for (let i = 0; i <= buffer.length - delimiter.length; i++) {
            let match = true;
            for (let j = 0; j < delimiter.length; j++) {
                if (buffer[i + j] !== delimiter[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                return i;
            }
        }
        return -1;
    }
    /**
     * Builds the speech.config message that enables sentence/word boundary
     * metadata and selects the current output format.
     *
     * @returns {string} Headers followed by the JSON configuration body.
     */
    getConfigMessage() {
        return `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{
"context": {
"synthesis": {
"audio": {
"metadataoptions": {
"sentenceBoundaryEnabled": "true",
"wordBoundaryEnabled": "true"
},
"outputFormat": "${this.outputFormat}"
}
}
}
}`;
    }
    /**
     * Fetches the voice list from VOICES_URL.
     *
     * @returns {Promise<*>} The parsed JSON response; rejects when the fetch
     *   or the JSON parsing fails.
     */
    getVoices() {
        return fetch(EdgeTTSClient.VOICES_URL).then((response) => response.json());
    }
    /**
     * Selects the voice and output format and makes sure a socket is open.
     *
     * @param {string} voiceName - Voice to synthesize with.
     * @param {string} outputFormat - Output format sent in speech.config.
     * @param {string} [voiceLocale] - Locale for `xml:lang`; inferred from
     *   `voiceName` when omitted.
     * @returns {Promise<void>}
     * @throws {Error} (as a rejection) when no locale is given and none can be inferred.
     */
    setMetadata(voiceName, outputFormat, voiceLocale) {
        return __awaiter(this, void 0, void 0, function* () {
            this.voice = voiceName;
            this.outputFormat = outputFormat;
            this.voiceLocale = voiceLocale || this.inferLocaleFromVoiceName(voiceName);
            if (!this.voiceLocale) {
                throw new Error("Could not infer voiceLocale from voiceName!");
            }
            if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
                this.connectionStartTime = Date.now();
                yield this.initWebSocket();
            }
        });
    }
    /**
     * Extracts a locale from a voice name using VOICE_LANG_REGEX.
     *
     * @param {string} voiceName
     * @returns {(string|null)} The matched locale, or null when nothing matches.
     */
    inferLocaleFromVoiceName(voiceName) {
        const match = EdgeTTSClient.VOICE_LANG_REGEX.exec(voiceName);
        return match ? match[0] : null;
    }
    /** Closes the socket if one has been created. */
    close() {
        if (this.ws) {
            this.ws.close();
        }
    }
    /**
     * Requests synthesis of `text` and returns the emitter for that request.
     *
     * @param {string} text - Text placed inside the SSML <prosody> element.
     * @param {ProsodyOptions} [options] - Pitch, rate and volume.
     * @returns {EventEmitter} Emits "data", "end" and "close".
     * @throws {Error} When no socket has been created yet.
     */
    toStream(text, options = new ProsodyOptions()) {
        return this.sendSSMLRequest(this.buildSSML(text, options));
    }
    /**
     * Builds the SSML document for a request from the current voice settings.
     *
     * NOTE(review): `text` is interpolated verbatim, so characters such as
     * "&" or "<" yield malformed SSML. It is left unescaped because callers
     * may be passing SSML markup on purpose — confirm before changing.
     *
     * @param {string} text
     * @param {ProsodyOptions} options
     * @returns {string} SSML document.
     */
    buildSSML(text, options) {
        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this.voiceLocale}">
<voice name="${this.voice}">
<prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
${text}
</prosody>
</voice>
</speak>`;
    }
    /**
     * Registers a new request and sends its SSML over the socket.
     *
     * @param {string} ssml - SSML document to synthesize.
     * @returns {EventEmitter} Emitter registered under the new request id.
     * @throws {Error} When no socket has been created yet.
     */
    sendSSMLRequest(ssml) {
        if (!this.ws) {
            throw new Error("WebSocket not initialized. Call setMetadata first.");
        }
        const requestId = generateRandomHex(16);
        const requestMessage = `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n${ssml.trim()}`;
        const eventEmitter = new EventEmitter();
        this.requestQueue[requestId] = eventEmitter;
        // The emitter is returned synchronously, so the send runs in the
        // background; report failures instead of leaving an unhandled rejection.
        this.sendMessage(requestMessage).catch((error) => {
            console.error("Failed to send SSML request:", error);
        });
        return eventEmitter;
    }
}
exports.EdgeTTSClient = EdgeTTSClient;
// Output format constants re-exported from ./constants.
EdgeTTSClient.OUTPUT_FORMAT = constants_1.OUTPUT_FORMAT;
// Client token sent as a query parameter on both service URLs below.
EdgeTTSClient.CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
EdgeTTSClient.VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${EdgeTTSClient.CLIENT_TOKEN}`;
EdgeTTSClient.SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${EdgeTTSClient.CLIENT_TOKEN}`;
// Header line after which the audio bytes begin in a binary message.
EdgeTTSClient.BINARY_DELIM = "Path:audio\r\n";
// Matches the locale inside a voice name: a 2-3 letter language, an optional
// 4-letter script and a 2-letter region (e.g. "en-US", "fil-PH", "iu-Latn-CA").
// It starts on a word boundary so that a 3-letter language is not truncated
// ("fil-PH" must not be read as "il-PH").
EdgeTTSClient.VOICE_LANG_REGEX = /\b[A-Za-z]{2,3}(?:-[A-Za-z]{4})?-[A-Za-z]{2}\b/;